Commit 698618cf by 曾国盛

更新包名

parent 70551d64
package com.zhiwei.crawler.monitor;
package com.zhiwei.crawler.compare;
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.JSONPath;
......
package com.zhiwei.crawler.monitor;
package com.zhiwei.crawler.compare;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.client.MongoCollection;
......
package com.zhiwei.crawler.compare;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.transfertest.mongo.MongoUtils;
import org.apache.commons.lang3.time.FastDateFormat;
import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.MediaType;
import okhttp3.RequestBody;
import okhttp3.Response;
import org.apache.logging.log4j.util.Strings;
import org.bson.Document;
import org.jsoup.Jsoup;
import org.seimicrawler.xpath.JXDocument;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.*;
/**
* 搜狗百科信息监控
* @author 朝花夕誓
*/
public class SouGouBaiKeMonitor {
public static void start() {
com.zhiwei.crawler.monitor.SouGouBaiKeMonitor souGouBaiKeMonitor = new com.zhiwei.crawler.monitor.SouGouBaiKeMonitor();
ScheduledExecutorService scheduledExecutorService = Executors.newScheduledThreadPool(1);
/**
 * Collects the monitoring targets stored in MongoDB.
 *
 * <p>Walks every document returned by {@code MongoUtils.findAllData()} and keeps only
 * those whose {@code keyword}, {@code push_address} and {@code sougoubaike_url} fields
 * are all non-blank strings.
 *
 * @return one map per usable document, keyed by {@code keyword}, {@code pushAddress}
 *         and {@code souGouBaiKeUrl}; an empty list when no document qualifies
 */
public List<Map<String, Object>> findKeyWordAddress(){
    List<Map<String, Object>> dataList = new ArrayList<>();
    MongoUtils mongoUtils = new MongoUtils();
    // MongoCursor is Closeable: close it deterministically so the cursor is released
    // even when iteration stops early because of an exception.
    try (MongoCursor<Document> allData = mongoUtils.findAllData()) {
        while (allData.hasNext()) {
            Document next = allData.next();
            String keyword = null;
            String pushAddress = null;
            String souGouBaiKeUrl = null;
            try {
                keyword = next.getString("keyword");
                pushAddress = next.getString("push_address");
                souGouBaiKeUrl = next.getString("sougoubaike_url");
            } catch (Exception e) {
                // A field that is not stored as a string makes getString fail; the
                // document is then filtered out by the blank check below.
                e.printStackTrace();
            }
            if (Strings.isNotBlank(keyword) && Strings.isNotBlank(pushAddress) && Strings.isNotBlank(souGouBaiKeUrl)) {
                Map<String, Object> dataMap = new HashMap<>();
                dataMap.put("keyword", keyword);
                dataMap.put("pushAddress", pushAddress);
                dataMap.put("souGouBaiKeUrl", souGouBaiKeUrl);
                dataList.add(dataMap);
            }
        }
    }
    return dataList;
}
/**
 * Fetches the newest stored snapshot for a keyword from the {@code souGouBaiKe}
 * collection.
 *
 * @param keyword value matched against the {@code keyword} field
 * @return the matching document with the highest {@code create_time}, or {@code null}
 *         when the query yields no document
 */
public Document findInDataBaseData(String keyword){
    MongoCollection<Document> snapshots = new MongoUtils().getMongoDataBase().getCollection("souGouBaiKe");
    Document byKeyword = new Document("keyword", keyword);
    // -1 sorts descending, so first() is the most recently created snapshot.
    Document newestFirst = new Document("create_time", -1);
    return snapshots.find(byKeyword).sort(newestFirst).first();
}
/**
 * Inserts one snapshot document into the {@code souGouBaiKe} collection.
 *
 * <p>The document carries the current time in epoch seconds as {@code create_time},
 * plus {@code keyword}, {@code creator}, {@code edit_number} and {@code recent_update}
 * copied from the map entries {@code keyword}, {@code creator}, {@code editNumber}
 * and {@code recentUpdate}.
 *
 * @param dataMap parsed page values; entries that are missing are stored as null
 */
public void saveData(Map<String, Object> dataMap){
    MongoCollection<Document> snapshots = new MongoUtils().getMongoDataBase().getCollection("souGouBaiKe");
    int createdAtSeconds = (int)(System.currentTimeMillis()/1000);
    Document snapshot = new Document("create_time", createdAtSeconds);
    snapshot.append("keyword", dataMap.get("keyword"));
    snapshot.append("creator", dataMap.get("creator"));
    snapshot.append("edit_number", dataMap.get("editNumber"));
    snapshot.append("recent_update", dataMap.get("recentUpdate"));
    snapshots.insertOne(snapshot);
}
/**
 * Fetches a monitored page, extracts its metadata and compares the edit count with
 * the newest stored snapshot for the same keyword.
 *
 * <ul>
 *   <li>No stored snapshot: the parsed values are saved as the first snapshot.</li>
 *   <li>Edit count differs from the stored one: a notification is pushed and a new
 *       snapshot is saved.</li>
 *   <li>Edit count unchanged: only a log line is printed.</li>
 * </ul>
 *
 * <p>If the page cannot be fetched or parsed the method returns without pushing or
 * saving, so a transient failure never produces a notification full of nulls or an
 * empty snapshot in the database.
 *
 * @param keyword     keyword identifying the monitored entry
 * @param monitorUrl  URL of the page to fetch
 * @param pushAddress webhook URL that receives the notification
 */
public void dataCompare(String keyword, String monitorUrl, String pushAddress){
    String htmlBody = getHtmlBody(monitorUrl);
    if (htmlBody == null) {
        // NOTE(review): getHtmlBody appears to return null once its retries are
        // exhausted; parsing null would throw and abort the caller's loop.
        System.out.println(keyword + " 页面获取失败, 跳过本次对比.");
        return;
    }
    Map<String, Object> dataMap = new HashMap<>();
    try {
        org.jsoup.nodes.Document parse = Jsoup.parse(htmlBody);
        JXDocument jxDocument = JXDocument.create(parse);
        // Creator
        String creator = jxDocument.selNOne("//ul[@class='lemma_data']/li[1]").asElement().text();
        // Edit count
        String editNumber = jxDocument.selNOne("//ul[@class='lemma_data']/li[3]").asElement().text();
        // Most recent update
        String recentUpdate = jxDocument.selNOne("//ul[@class='lemma_data']/li[5]").asElement().text();
        // Link taken from the edit-count item, used as the notification URL
        String compareUrl = "https://baike.sogou.com" + jxDocument.selNOne("//ul[@class='lemma_data']/li[3]//a").asElement().attr("href");
        dataMap.put("keyword", keyword);
        dataMap.put("creator", creator);
        dataMap.put("editNumber", editNumber);
        dataMap.put("recentUpdate", recentUpdate);
        dataMap.put("compareUrl", compareUrl);
        dataMap.put("pushAddress", pushAddress);
    } catch (Exception e) {
        // Keep the real failure as the cause instead of discarding it, and stop here:
        // continuing with an empty map would look like a changed edit count.
        new Throwable("数据解析错误", e).printStackTrace();
        return;
    }
    Document inDataBaseData = findInDataBaseData(keyword);
    if (Objects.isNull(inDataBaseData)) {
        System.out.println("第一次访问:" + keyword);
        // Store the first snapshot
        saveData(dataMap);
        return;
    }
    String storedEditNumber = inDataBaseData.getString("edit_number");
    // Objects.equals tolerates a stored snapshot whose edit_number is null.
    if (!Objects.equals(storedEditNumber, dataMap.get("editNumber"))) {
        System.out.println(keyword + " \t数据有更新");
        System.out.println("\n\n开始推送......\n\n");
        // Push the notification
        setHotSearchDataAndPushContent(dataMap);
        // Store the new snapshot
        saveData(dataMap);
    } else {
        System.out.println(keyword + " 在mongo中的数据: " + inDataBaseData.toString());
        System.out.println(keyword + " 无数据更新.");
    }
}
/**
 * Posts a message with {@code msgtype} "news" to the given webhook URL and prints
 * whether the push succeeded.
 *
 * <p>Despite the method name, the payload built here is a news message whose
 * {@code articles} list is {@code content}. The push counts as successful when the
 * response body contains the text {@code ok}. Any exception is printed, not rethrown.
 *
 * @param content article maps placed under {@code news.articles}
 * @param sendUrl webhook URL the JSON payload is posted to
 */
private static void sendWorkWechatByMarkdown(List<Map<String, String>> content, String sendUrl) {
    HttpBoot client = new HttpBoot.Builder().retryTimes(3).build();
    Map<String, Object> news = new HashMap<>();
    news.put("articles", content);
    Map<String, Object> message = new HashMap<>();
    message.put("msgtype", "news");
    message.put("news", news);
    String json = JSONObject.toJSONString(message);
    try {
        RequestBody payload = RequestBody.create(MediaType.parse("application/json"), json);
        String reply = client.syncCall(RequestUtils.wrapPost(sendUrl, payload)).body().string();
        String outcome = reply.contains("ok")
                ? "----------企业微信账号数据推送成功-----------"
                : "----------企业微信账号数据推送失败-----------";
        System.out.println(outcome);
    } catch (Exception e) {
        System.out.println("----------企业微信账号数据推送失败,出现错误-----------");
        e.printStackTrace();
    }
}
/**
 * Builds a single news article from the parsed page values and pushes it to the
 * webhook stored under {@code pushAddress}.
 *
 * <p>The article title is the keyword, the description shows the latest update text
 * and the edit count, and the link is {@code compareUrl}.
 *
 * @param dataMap parsed values; reads {@code keyword}, {@code recentUpdate},
 *                {@code editNumber}, {@code compareUrl} and {@code pushAddress}
 */
public static void setHotSearchDataAndPushContent(Map<String, Object> dataMap) {
    String title = (String) dataMap.get("keyword");
    String description = "数据有更新\n" + dataMap.get("recentUpdate") + " " + dataMap.get("editNumber") + "\n";
    String link = (String) dataMap.get("compareUrl");

    Map<String, String> article = new HashMap<>(4);
    article.put("title", title);
    article.put("description", description);
    article.put("url", link);
    article.put("picurl", "https://login.zhiweidata.com/plogin/img/cat.8de03170.png");

    List<Map<String, String>> articles = new ArrayList<>();
    articles.add(article);

    String webhook = (String) dataMap.get("pushAddress");
    sendWorkWechatByMarkdown(articles, webhook);
}
scheduledExecutorService.scheduleAtFixedRate(()->{
MongoCursor<Document> allData = mongoUtils.findAllData();
String timeStamp = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss").format(System.currentTimeMillis());
System.out.println("\n\n正在访问... " + timeStamp + "\n\n");
while (allData.hasNext()){
Document next = allData.next();
String url = next.getString("sougoubaike_url");
String keyword = next.getString("keyword");
String pushAddress = next.getString("push_address");
if(Strings.isNotBlank(url) && Strings.isNotBlank(keyword) && Strings.isNotBlank(pushAddress)){
souGouBaiKeMonitor.dataCompare(keyword, url, pushAddress);
try {
Thread.sleep(3000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
public String getHtmlBody(String url){
HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
for (int i = 0; i < 3; i++) {
try(Response response = httpBoot.syncCall(RequestUtils.wrapGet(url))){
return response.body().string();
}catch(Exception e){
e.printStackTrace();
}
}, 0, 1, TimeUnit.HOURS);
}
return null;
}
}
package com.zhiwei.crawler.main;
import com.zhiwei.crawler.compare.BaiDuBaiKeMonitor;
import com.zhiwei.crawler.compare.SouGouBaiKeMonitor;
import com.zhiwei.crawler.monitor.BaiDuBaiKeMonitor;
import com.zhiwei.crawler.monitor.SouGouBaiKeMonitor;
/**
* @Author: 朝花夕誓
......
package com.zhiwei.crawler.compare;
package com.zhiwei.crawler.monitor;
import com.zhiwei.crawler.transfertest.mongo.MongoUtils;
import org.apache.commons.lang3.time.FastDateFormat;
......
package com.zhiwei.crawler.compare;
package com.zhiwei.crawler.monitor;
import com.mongodb.client.MongoCursor;
import com.zhiwei.crawler.monitor.Baike360;
import com.zhiwei.crawler.compare.Baike360;
import com.zhiwei.crawler.transfertest.mongo.MongoUtils;
import org.apache.commons.lang3.time.FastDateFormat;
import org.apache.logging.log4j.util.Strings;
......
package com.zhiwei.crawler.monitor;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.transfertest.mongo.MongoUtils;
import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.MediaType;
import okhttp3.RequestBody;
import okhttp3.Response;
import org.apache.commons.lang3.time.FastDateFormat;
import org.apache.logging.log4j.util.Strings;
import org.bson.Document;
import org.jsoup.Jsoup;
import org.seimicrawler.xpath.JXDocument;
import java.util.*;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
/**
* 搜狗百科信息监控
* @author 朝花夕誓
*/
public class SouGouBaiKeMonitor {
public static void start() {
com.zhiwei.crawler.compare.SouGouBaiKeMonitor souGouBaiKeMonitor = new com.zhiwei.crawler.compare.SouGouBaiKeMonitor();
ScheduledExecutorService scheduledExecutorService = Executors.newScheduledThreadPool(1);
/**
 * Collects the monitoring targets stored in MongoDB.
 *
 * <p>Walks every document returned by {@code MongoUtils.findAllData()} and keeps only
 * those whose {@code keyword}, {@code push_address} and {@code sougoubaike_url} fields
 * are all non-blank strings.
 *
 * @return one map per usable document, keyed by {@code keyword}, {@code pushAddress}
 *         and {@code souGouBaiKeUrl}; an empty list when no document qualifies
 */
public List<Map<String, Object>> findKeyWordAddress(){
    List<Map<String, Object>> dataList = new ArrayList<>();
    MongoUtils mongoUtils = new MongoUtils();
    // MongoCursor is Closeable: close it deterministically so the cursor is released
    // even when iteration stops early because of an exception.
    try (MongoCursor<Document> allData = mongoUtils.findAllData()) {
        while (allData.hasNext()) {
            Document next = allData.next();
            String keyword = null;
            String pushAddress = null;
            String souGouBaiKeUrl = null;
            try {
                keyword = next.getString("keyword");
                pushAddress = next.getString("push_address");
                souGouBaiKeUrl = next.getString("sougoubaike_url");
            } catch (Exception e) {
                // A field that is not stored as a string makes getString fail; the
                // document is then filtered out by the blank check below.
                e.printStackTrace();
            }
            if (Strings.isNotBlank(keyword) && Strings.isNotBlank(pushAddress) && Strings.isNotBlank(souGouBaiKeUrl)) {
                Map<String, Object> dataMap = new HashMap<>();
                dataMap.put("keyword", keyword);
                dataMap.put("pushAddress", pushAddress);
                dataMap.put("souGouBaiKeUrl", souGouBaiKeUrl);
                dataList.add(dataMap);
            }
        }
    }
    return dataList;
}
/**
 * Fetches the newest stored snapshot for a keyword from the {@code souGouBaiKe}
 * collection.
 *
 * @param keyword value matched against the {@code keyword} field
 * @return the matching document with the highest {@code create_time}, or {@code null}
 *         when the query yields no document
 */
public Document findInDataBaseData(String keyword){
    MongoCollection<Document> snapshots = new MongoUtils().getMongoDataBase().getCollection("souGouBaiKe");
    Document byKeyword = new Document("keyword", keyword);
    // -1 sorts descending, so first() is the most recently created snapshot.
    Document newestFirst = new Document("create_time", -1);
    return snapshots.find(byKeyword).sort(newestFirst).first();
}
/**
 * Inserts one snapshot document into the {@code souGouBaiKe} collection.
 *
 * <p>The document carries the current time in epoch seconds as {@code create_time},
 * plus {@code keyword}, {@code creator}, {@code edit_number} and {@code recent_update}
 * copied from the map entries {@code keyword}, {@code creator}, {@code editNumber}
 * and {@code recentUpdate}.
 *
 * @param dataMap parsed page values; entries that are missing are stored as null
 */
public void saveData(Map<String, Object> dataMap){
    MongoCollection<Document> snapshots = new MongoUtils().getMongoDataBase().getCollection("souGouBaiKe");
    int createdAtSeconds = (int)(System.currentTimeMillis()/1000);
    Document snapshot = new Document("create_time", createdAtSeconds);
    snapshot.append("keyword", dataMap.get("keyword"));
    snapshot.append("creator", dataMap.get("creator"));
    snapshot.append("edit_number", dataMap.get("editNumber"));
    snapshot.append("recent_update", dataMap.get("recentUpdate"));
    snapshots.insertOne(snapshot);
}
/**
 * Fetches a monitored page, extracts its metadata and compares the edit count with
 * the newest stored snapshot for the same keyword.
 *
 * <ul>
 *   <li>No stored snapshot: the parsed values are saved as the first snapshot.</li>
 *   <li>Edit count differs from the stored one: a notification is pushed and a new
 *       snapshot is saved.</li>
 *   <li>Edit count unchanged: only a log line is printed.</li>
 * </ul>
 *
 * <p>If the page cannot be fetched or parsed the method returns without pushing or
 * saving, so a transient failure never produces a notification full of nulls or an
 * empty snapshot in the database.
 *
 * @param keyword     keyword identifying the monitored entry
 * @param monitorUrl  URL of the page to fetch
 * @param pushAddress webhook URL that receives the notification
 */
public void dataCompare(String keyword, String monitorUrl, String pushAddress){
    String htmlBody = getHtmlBody(monitorUrl);
    if (htmlBody == null) {
        // NOTE(review): getHtmlBody appears to return null once its retries are
        // exhausted; parsing null would throw and abort the caller's loop.
        System.out.println(keyword + " 页面获取失败, 跳过本次对比.");
        return;
    }
    Map<String, Object> dataMap = new HashMap<>();
    try {
        org.jsoup.nodes.Document parse = Jsoup.parse(htmlBody);
        JXDocument jxDocument = JXDocument.create(parse);
        // Creator
        String creator = jxDocument.selNOne("//ul[@class='lemma_data']/li[1]").asElement().text();
        // Edit count
        String editNumber = jxDocument.selNOne("//ul[@class='lemma_data']/li[3]").asElement().text();
        // Most recent update
        String recentUpdate = jxDocument.selNOne("//ul[@class='lemma_data']/li[5]").asElement().text();
        // Link taken from the edit-count item, used as the notification URL
        String compareUrl = "https://baike.sogou.com" + jxDocument.selNOne("//ul[@class='lemma_data']/li[3]//a").asElement().attr("href");
        dataMap.put("keyword", keyword);
        dataMap.put("creator", creator);
        dataMap.put("editNumber", editNumber);
        dataMap.put("recentUpdate", recentUpdate);
        dataMap.put("compareUrl", compareUrl);
        dataMap.put("pushAddress", pushAddress);
    } catch (Exception e) {
        // Keep the real failure as the cause instead of discarding it, and stop here:
        // continuing with an empty map would look like a changed edit count.
        new Throwable("数据解析错误", e).printStackTrace();
        return;
    }
    Document inDataBaseData = findInDataBaseData(keyword);
    if (Objects.isNull(inDataBaseData)) {
        System.out.println("第一次访问:" + keyword);
        // Store the first snapshot
        saveData(dataMap);
        return;
    }
    String storedEditNumber = inDataBaseData.getString("edit_number");
    // Objects.equals tolerates a stored snapshot whose edit_number is null.
    if (!Objects.equals(storedEditNumber, dataMap.get("editNumber"))) {
        System.out.println(keyword + " \t数据有更新");
        System.out.println("\n\n开始推送......\n\n");
        // Push the notification
        setHotSearchDataAndPushContent(dataMap);
        // Store the new snapshot
        saveData(dataMap);
    } else {
        System.out.println(keyword + " 在mongo中的数据: " + inDataBaseData.toString());
        System.out.println(keyword + " 无数据更新.");
    }
}
/**
 * Posts a message with {@code msgtype} "news" to the given webhook URL and prints
 * whether the push succeeded.
 *
 * <p>Despite the method name, the payload built here is a news message whose
 * {@code articles} list is {@code content}. The push counts as successful when the
 * response body contains the text {@code ok}. Any exception is printed, not rethrown.
 *
 * @param content article maps placed under {@code news.articles}
 * @param sendUrl webhook URL the JSON payload is posted to
 */
private static void sendWorkWechatByMarkdown(List<Map<String, String>> content, String sendUrl) {
    HttpBoot client = new HttpBoot.Builder().retryTimes(3).build();
    Map<String, Object> news = new HashMap<>();
    news.put("articles", content);
    Map<String, Object> message = new HashMap<>();
    message.put("msgtype", "news");
    message.put("news", news);
    String json = JSONObject.toJSONString(message);
    try {
        RequestBody payload = RequestBody.create(MediaType.parse("application/json"), json);
        String reply = client.syncCall(RequestUtils.wrapPost(sendUrl, payload)).body().string();
        String outcome = reply.contains("ok")
                ? "----------企业微信账号数据推送成功-----------"
                : "----------企业微信账号数据推送失败-----------";
        System.out.println(outcome);
    } catch (Exception e) {
        System.out.println("----------企业微信账号数据推送失败,出现错误-----------");
        e.printStackTrace();
    }
}
/**
 * Builds a single news article from the parsed page values and pushes it to the
 * webhook stored under {@code pushAddress}.
 *
 * <p>The article title is the keyword, the description shows the latest update text
 * and the edit count, and the link is {@code compareUrl}.
 *
 * @param dataMap parsed values; reads {@code keyword}, {@code recentUpdate},
 *                {@code editNumber}, {@code compareUrl} and {@code pushAddress}
 */
public static void setHotSearchDataAndPushContent(Map<String, Object> dataMap) {
    String title = (String) dataMap.get("keyword");
    String description = "数据有更新\n" + dataMap.get("recentUpdate") + " " + dataMap.get("editNumber") + "\n";
    String link = (String) dataMap.get("compareUrl");

    Map<String, String> article = new HashMap<>(4);
    article.put("title", title);
    article.put("description", description);
    article.put("url", link);
    article.put("picurl", "https://login.zhiweidata.com/plogin/img/cat.8de03170.png");

    List<Map<String, String>> articles = new ArrayList<>();
    articles.add(article);

    String webhook = (String) dataMap.get("pushAddress");
    sendWorkWechatByMarkdown(articles, webhook);
}
public String getHtmlBody(String url){
HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
for (int i = 0; i < 3; i++) {
try(Response response = httpBoot.syncCall(RequestUtils.wrapGet(url))){
return response.body().string();
}catch(Exception e){
e.printStackTrace();
scheduledExecutorService.scheduleAtFixedRate(()->{
MongoCursor<Document> allData = mongoUtils.findAllData();
String timeStamp = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss").format(System.currentTimeMillis());
System.out.println("\n\n正在访问... " + timeStamp + "\n\n");
while (allData.hasNext()){
Document next = allData.next();
String url = next.getString("sougoubaike_url");
String keyword = next.getString("keyword");
String pushAddress = next.getString("push_address");
if(Strings.isNotBlank(url) && Strings.isNotBlank(keyword) && Strings.isNotBlank(pushAddress)){
souGouBaiKeMonitor.dataCompare(keyword, url, pushAddress);
try {
Thread.sleep(3000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
return null;
}, 0, 1, TimeUnit.HOURS);
}
}
package com.zhiwei.crawler;
import com.mongodb.client.MongoCursor;
import com.zhiwei.crawler.monitor.Baike360;
import com.zhiwei.crawler.compare.Baike360;
import com.zhiwei.crawler.transfertest.mongo.MongoUtils;
import com.zhiwei.crawler.utils.RequestUtils;
import org.apache.commons.lang3.time.FastDateFormat;
import org.apache.logging.log4j.util.Strings;
import org.bson.Document;
......
package com.zhiwei.crawler;
import com.zhiwei.crawler.compare.BaiDuBaiKeMonitor;
import com.zhiwei.crawler.compare.SouGouBaiKeMonitor;
import com.zhiwei.crawler.monitor.BaiDuBaiKeMonitor;
import com.zhiwei.crawler.monitor.SouGouBaiKeMonitor;
import org.junit.Test;
/**
......
package com.zhiwei.crawler;
import com.mongodb.client.MongoCursor;
import com.zhiwei.crawler.monitor.SouGouBaiKeMonitor;
import com.zhiwei.crawler.compare.SouGouBaiKeMonitor;
import com.zhiwei.crawler.transfertest.mongo.MongoUtils;
import org.apache.commons.lang3.time.FastDateFormat;
import org.apache.logging.log4j.util.Strings;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment