Commit ab604261 by 曾国盛

upload code

parents
.idea
target
*.iml
## 百度百科数据异常推送程序
#### 程序入口: src/main/java/com/zhiwei/crawler/main/Main.java
#### 监控词条数据文件: db/baiKeKeyword.json,程序启动前需要将 baiKeKeyword.json 导入数据库中。
> 库名:baidubaike 表名: baiKeKeyword
#### 默认每隔一小时进行一次判断,可根据具体需求自行调整
{
"RECORDS": [
{
"_id": "5f5b2083a9100000a9006463",
"360baike_url": "",
"commitor": "MT组",
"keyword": "金星",
"monitor_mean": "新氧科技信息监控",
"monitor_url": "https://baike.baidu.com/item/%E9%87%91%E6%98%9F/24122994",
"push_address": "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=36475302-c7c0-4d87-a336-7fa7c4681e94",
"sougoubaike_url": "https://baike.sogou.com/v191261245.htm?fromTitle=%E9%87%91%E6%98%9F%EF%BC%88%E6%96%B0%E6%B0%A7%E7%A7%91%E6%8A%80%E8%91%A3%E4%BA%8B%E9%95%BF%E5%85%BCCEO%EF%BC%89"
},
{
"_id": "5f5b2476571a000040007102",
"360baike_url": "",
"commitor": "MT组",
"keyword": "新氧",
"monitor_mean": "新氧科技信息监控",
"monitor_url": "https://baike.baidu.com/item/%E6%96%B0%E6%B0%A7/23751923",
"push_address": "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=36475302-c7c0-4d87-a336-7fa7c4681e94",
"sougoubaike_url": "https://baike.sogou.com/v183770635.htm"
},
{
"_id": "5f5b25e8571a000040007105",
"360baike_url": "",
"commitor": "MT组",
"keyword": "北京新氧科技有限公司",
"monitor_mean": "新氧科技信息监控",
"monitor_url": "https://baike.baidu.com/item/%E5%8C%97%E4%BA%AC%E6%96%B0%E6%B0%A7%E7%A7%91%E6%8A%80%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8",
"push_address": "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=36475302-c7c0-4d87-a336-7fa7c4681e94",
"sougoubaike_url": "https://baike.sogou.com/v101108006.htm?fromTitle=%E5%8C%97%E4%BA%AC%E6%96%B0%E6%B0%A7%E7%A7%91%E6%8A%80%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8"
},
{
"_id": "5f5b42ae64500000be001812",
"360baike_url": "",
"commitor": "LP组",
"keyword": "上海商汤智能科技有限公司",
"monitor_mean": "商汤科技信息监控",
"monitor_url": "https://baike.baidu.com/item/%E4%B8%8A%E6%B5%B7%E5%95%86%E6%B1%A4%E6%99%BA%E8%83%BD%E7%A7%91%E6%8A%80%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8",
"push_address": "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=34bda8fa-4d29-4a88-b77d-2e8ac81ad945",
"sougoubaike_url": ""
},
{
"_id": "5f5b42e764500000be001813",
"360baike_url": "",
"commitor": "LP组",
"keyword": "北京市商汤科技开发有限公司",
"monitor_mean": "商汤科技信息监控",
"monitor_url": "https://baike.baidu.com/item/%E5%8C%97%E4%BA%AC%E5%B8%82%E5%95%86%E6%B1%A4%E7%A7%91%E6%8A%80%E5%BC%80%E5%8F%91%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8",
"push_address": "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=34bda8fa-4d29-4a88-b77d-2e8ac81ad945",
"sougoubaike_url": ""
},
{
"_id": "5f5b42f564500000be001814",
"360baike_url": "",
"commitor": "LP组",
"keyword": "深圳市商汤科技有限公司",
"monitor_mean": "商汤科技信息监控",
"monitor_url": "https://baike.baidu.com/item/%E6%B7%B1%E5%9C%B3%E5%B8%82%E5%95%86%E6%B1%A4%E7%A7%91%E6%8A%80%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8",
"push_address": "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=34bda8fa-4d29-4a88-b77d-2e8ac81ad945",
"sougoubaike_url": ""
},
{
"_id": "5f5b430d64500000be001815",
"360baike_url": "",
"commitor": "LP组",
"keyword": "徐立",
"monitor_mean": "商汤科技信息监控",
"monitor_url": "https://baike.baidu.com/item/%E5%BE%90%E7%AB%8B/22031892",
"push_address": "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=34bda8fa-4d29-4a88-b77d-2e8ac81ad945",
"sougoubaike_url": ""
},
{
"_id": "5f5b430f64500000be001816",
"360baike_url": "",
"commitor": "LP组",
"keyword": "汤晓鸥",
"monitor_mean": "商汤科技信息监控",
"monitor_url": "https://baike.baidu.com/item/%E6%B1%A4%E6%99%93%E9%B8%A5",
"push_address": "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=34bda8fa-4d29-4a88-b77d-2e8ac81ad945",
"sougoubaike_url": ""
},
{
"_id": "5fa3561f7650000058007383",
"360baike_url": "https://baike.so.com/Asyncdata/entryStat?eid=1038695&sid=1098608&ename=%E8%85%BE%E8%AE%AF",
"commitor": "admin",
"keyword": "腾讯",
"monitor_mean": "test",
"monitor_url": "",
"push_address": "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=6ba4e800-e84b-4b61-88c4-e6169e440a77",
"sougoubaike_url": ""
},
{
"_id": "5fa367397650000058007384",
"360baike_url": "https://baike.so.com/Asyncdata/entryStat?eid=3755114&sid=3944874&ename=%E5%8D%8E%E4%B8%BA%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8",
"commitor": "",
"keyword": "华为",
"monitor_mean": "test",
"monitor_url": "",
"push_address": "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=6ba4e800-e84b-4b61-88c4-e6169e440a77",
"sougoubaike_url": ""
},
{
"_id": "5fe5a8df232d0000b7006ce2",
"360baike_url": "",
"commitor": "MT组-张娜娜",
"keyword": "绿宝石医生榜单",
"monitor_mean": "新氧科技信息监控",
"monitor_url": "https://baike.baidu.com/item/%E7%BB%BF%E5%AE%9D%E7%9F%B3%E5%8C%BB%E7%94%9F%E6%A6%9C%E5%8D%95/55590616?fr=aladdin",
"push_address": "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=36475302-c7c0-4d87-a336-7fa7c4681e94",
"sougoubaike_url": "https://baike.sogou.com/m/v200627685.htm?fromTitle="
}
]
}
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>zhiwei-baidu</artifactId>
<version>1.0-SNAPSHOT</version>
<!--放置的都是项目运行所需要的jar包-->
<dependencies>
<!--lombok-->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.12</version>
</dependency>
<!--log4j,由于大多数的maven依赖都自带log4j,所以需要时启用-->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.13.3</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>com.zhiwei</groupId>-->
<!-- <artifactId>common-crawler-parent</artifactId>-->
<!-- <version>0.0.4-SNAPSHOT</version>-->
<!-- </dependency>-->
<!-- Crawler core package -->
<!-- NOTE(review): crawler-core is declared twice below with different versions
     (0.6.6.8-SNAPSHOT and 0.6.2.1-RELEASE). Maven reports duplicate declarations as a
     warning and presumably keeps only the later one; confirm which version is intended
     and delete the other declaration. -->
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.6.6.8-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.6.2.1-RELEASE</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.68</version>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>3.7.0</version>
</dependency>
<dependency>
<groupId>com.googlecode.java-diff-utils</groupId>
<artifactId>diffutils</artifactId>
<version>1.2.1</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13</version>
<scope>compile</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/io.github.pityka/fileutils -->
<dependency>
<groupId>io.github.pityka</groupId>
<artifactId>fileutils_2.12</artifactId>
<version>1.2.2</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.7</version>
</dependency>
<dependency>
<groupId>org.python</groupId>
<artifactId>jython-standalone</artifactId>
<version>2.7.0</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package com.zhiwei.crawler.compare;
import com.zhiwei.crawler.transfertest.mongo.MongoUtils;
import org.apache.commons.lang3.time.FastDateFormat;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
/**
 * Schedules the periodic Baidu Baike check for every keyword record returned by
 * {@code MongoUtils.findKeywordData()}.
 */
public class BaiDuBaiKeMonitor {
    /**
     * Starts a single-threaded scheduler that runs one check round immediately and then
     * once per hour.
     *
     * <p>Each record is expected to provide the keys {@code keyword}, {@code monitorUrl} and
     * {@code pushAddress}. A 3 second pause is inserted between entries.
     */
    public static void start() {
        com.zhiwei.crawler.transfertest.BaiDuBaiKeDiffPush baiDuBaiKe = new com.zhiwei.crawler.transfertest.BaiDuBaiKeDiffPush();
        MongoUtils mongoUtils = new MongoUtils();
        ScheduledExecutorService scheduledExecutorService = Executors.newScheduledThreadPool(1);
        scheduledExecutorService.scheduleAtFixedRate(() -> {
            try {
                List<Map<String, String>> keywordData = mongoUtils.findKeywordData();
                if (keywordData == null || keywordData.isEmpty()) {
                    return;
                }
                String timeStamp = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss").format(System.currentTimeMillis());
                System.out.println("\n\n正在访问... " + timeStamp + "\n\n");
                for (Map<String, String> item : keywordData) {
                    String keyword = item.get("keyword");
                    String monitorUrl = item.get("monitorUrl");
                    String pushAddress = item.get("pushAddress");
                    baiDuBaiKe.requestUrl(keyword, monitorUrl, pushAddress);
                    try {
                        Thread.sleep(3000);
                    } catch (InterruptedException e) {
                        // Restore the interrupt flag and end this round so a shutdown request is honoured.
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
            } catch (RuntimeException e) {
                // scheduleAtFixedRate suppresses all later executions once a run throws,
                // so a failing round must never escape the task.
                System.out.println("百度百科监控本轮执行出错");
                e.printStackTrace();
            }
        }, 0, 1, TimeUnit.HOURS);
    }
}
package com.zhiwei.crawler.compare;
import com.mongodb.client.MongoCursor;
import com.zhiwei.crawler.monitor.Baike360;
import com.zhiwei.crawler.transfertest.mongo.MongoUtils;
import org.apache.commons.lang3.time.FastDateFormat;
import org.apache.logging.log4j.util.Strings;
import org.bson.Document;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
/**
 * Schedules the periodic 360 Baike check for every record returned by
 * {@code MongoUtils.findAllData()}.
 */
public class BaiKe360Monitor {
    /**
     * Starts a single-threaded scheduler that runs one check round immediately and then
     * once per hour.
     *
     * <p>Only records whose {@code 360baike_url}, {@code keyword} and {@code push_address}
     * are all non-blank are checked; a 3 second pause follows each checked record.
     */
    public static void start() {
        Baike360 baike360 = new Baike360();
        ScheduledExecutorService scheduledExecutorService = Executors.newScheduledThreadPool(1);
        MongoUtils mongoUtils = new MongoUtils();
        scheduledExecutorService.scheduleAtFixedRate(() -> {
            // try-with-resources releases the server-side cursor even when a round fails midway.
            try (MongoCursor<Document> allData = mongoUtils.findAllData()) {
                String timeStamp = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss").format(System.currentTimeMillis());
                System.out.println("\n\n正在访问... " + timeStamp + "\n\n");
                while (allData.hasNext()) {
                    Document next = allData.next();
                    String url = next.getString("360baike_url");
                    String keyword = next.getString("keyword");
                    String pushAddress = next.getString("push_address");
                    if (Strings.isNotBlank(url) && Strings.isNotBlank(keyword) && Strings.isNotBlank(pushAddress)) {
                        baike360.dataCompare(keyword, url, pushAddress);
                        try {
                            Thread.sleep(3000);
                        } catch (InterruptedException e) {
                            // Restore the interrupt flag and end this round so a shutdown request is honoured.
                            Thread.currentThread().interrupt();
                            return;
                        }
                    }
                }
            } catch (RuntimeException e) {
                // scheduleAtFixedRate suppresses all later executions once a run throws,
                // so a failing round must never escape the task.
                System.out.println("360百科监控本轮执行出错");
                e.printStackTrace();
            }
        }, 0, 1, TimeUnit.HOURS);
    }
}
package com.zhiwei.crawler.compare;
import com.mongodb.client.MongoCursor;
import com.zhiwei.crawler.transfertest.mongo.MongoUtils;
import org.apache.commons.lang3.time.FastDateFormat;
import org.apache.logging.log4j.util.Strings;
import org.bson.Document;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
/**
 * Schedules the periodic Sogou Baike check for every record returned by
 * {@code MongoUtils.findAllData()}.
 */
public class SouGouBaiKeMonitor {
    /**
     * Starts a single-threaded scheduler that runs one check round immediately and then
     * once per hour.
     *
     * <p>Only records whose {@code sougoubaike_url}, {@code keyword} and {@code push_address}
     * are all non-blank are checked; a 3 second pause follows each checked record.
     */
    public static void start() {
        com.zhiwei.crawler.monitor.SouGouBaiKeMonitor souGouBaiKeMonitor = new com.zhiwei.crawler.monitor.SouGouBaiKeMonitor();
        ScheduledExecutorService scheduledExecutorService = Executors.newScheduledThreadPool(1);
        MongoUtils mongoUtils = new MongoUtils();
        scheduledExecutorService.scheduleAtFixedRate(() -> {
            // try-with-resources releases the server-side cursor even when a round fails midway.
            try (MongoCursor<Document> allData = mongoUtils.findAllData()) {
                String timeStamp = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss").format(System.currentTimeMillis());
                System.out.println("\n\n正在访问... " + timeStamp + "\n\n");
                while (allData.hasNext()) {
                    Document next = allData.next();
                    String url = next.getString("sougoubaike_url");
                    String keyword = next.getString("keyword");
                    String pushAddress = next.getString("push_address");
                    if (Strings.isNotBlank(url) && Strings.isNotBlank(keyword) && Strings.isNotBlank(pushAddress)) {
                        souGouBaiKeMonitor.dataCompare(keyword, url, pushAddress);
                        try {
                            Thread.sleep(3000);
                        } catch (InterruptedException e) {
                            // Restore the interrupt flag and end this round so a shutdown request is honoured.
                            Thread.currentThread().interrupt();
                            return;
                        }
                    }
                }
            } catch (RuntimeException e) {
                // scheduleAtFixedRate suppresses all later executions once a run throws,
                // so a failing round must never escape the task.
                System.out.println("搜狗百科监控本轮执行出错");
                e.printStackTrace();
            }
        }, 0, 1, TimeUnit.HOURS);
    }
}
package com.zhiwei.crawler.main;
import com.zhiwei.crawler.compare.BaiDuBaiKeMonitor;
import com.zhiwei.crawler.compare.SouGouBaiKeMonitor;
/**
 * Application entry point that launches the Baike entry monitors.
 *
 * @Author: 朝花夕誓
 * @Date: 2020/11/10 11:46
 * @Version 1.0
 */
public class Main {
    /**
     * Starts the Baidu Baike monitor followed by the Sogou Baike monitor.
     *
     * @param arg command-line arguments; not read
     */
    public static void main(String[] arg) {
        // Baidu Baike entries
        BaiDuBaiKeMonitor.start();

        // Sogou Baike entries
        SouGouBaiKeMonitor.start();

        // The 360 Baike monitor is currently switched off:
        // BaiKe360Monitor.start();
    }
}
package com.zhiwei.crawler.monitor;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.transfertest.mongo.MongoUtils;
import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.MediaType;
import okhttp3.RequestBody;
import okhttp3.Response;
import org.apache.logging.log4j.util.Strings;
import org.bson.Document;
import java.util.*;
/**
 * 360 Baike entry monitor: fetches the entry statistics JSON for a keyword, compares the edit
 * count with the latest snapshot stored in the {@code 360Baike} Mongo collection, and pushes a
 * WeCom (enterprise WeChat) webhook message when it differs.
 *
 * @Author: 朝花夕誓
 * @Date: 2020/11/5 9:34
 * @Version 1.0
 */
public class Baike360 {
    /**
     * Collects the monitor records that have a non-blank keyword, push address and 360 Baike URL.
     *
     * @return one map per usable record with the keys {@code keyword}, {@code pushAddress} and
     *         {@code souGouBaiKeUrl}; the last key holds the {@code 360baike_url} value and keeps
     *         its historical name so existing callers continue to work
     */
    public List<Map<String, Object>> findKeyWordAddress(){
        List<Map<String, Object>> dataList = new ArrayList<>();
        MongoUtils mongoUtils = new MongoUtils();
        // try-with-resources so the cursor is released even if iteration fails
        try (MongoCursor<Document> allData = mongoUtils.findAllData()) {
            while (allData.hasNext()) {
                Document next = allData.next();
                String keyword;
                String pushAddress;
                String baike360Url;
                try {
                    keyword = next.getString("keyword");
                    pushAddress = next.getString("push_address");
                    baike360Url = next.getString("360baike_url");
                } catch (ClassCastException e) {
                    // A field that is not stored as a string makes the record unusable; skip it.
                    e.printStackTrace();
                    continue;
                }
                if (Strings.isNotBlank(keyword) && Strings.isNotBlank(pushAddress) && Strings.isNotBlank(baike360Url)) {
                    Map<String, Object> dataMap = new HashMap<>();
                    dataMap.put("keyword", keyword);
                    dataMap.put("pushAddress", pushAddress);
                    dataMap.put("souGouBaiKeUrl", baike360Url);
                    dataList.add(dataMap);
                }
            }
        }
        return dataList;
    }

    /**
     * Looks up the most recent snapshot stored for a keyword.
     *
     * @param keyword keyword to query
     * @return the document with the highest {@code create_time} for the keyword, or
     *         {@code null} when none exists
     */
    public Document findInDataBaseData(String keyword){
        MongoUtils mongoUtils = new MongoUtils();
        MongoDatabase mongoDataBase = mongoUtils.getMongoDataBase();
        MongoCollection<Document> collection = mongoDataBase.getCollection("360Baike");
        Document query = new Document();
        query.put("keyword", keyword);
        // -1 sorts newest first
        Document createTime = new Document();
        createTime.put("create_time", -1);
        return collection.find(query).sort(createTime).first();
    }

    /**
     * Inserts a snapshot into the {@code 360Baike} collection.
     *
     * @param dataMap snapshot values; the keys {@code keyword}, {@code creator},
     *                {@code editNumber} and {@code recentUpdate} are read
     */
    public void saveData(Map<String, Object> dataMap){
        MongoUtils mongoUtils = new MongoUtils();
        MongoDatabase mongoDataBase = mongoUtils.getMongoDataBase();
        MongoCollection<Document> collection = mongoDataBase.getCollection("360Baike");
        Document document = new Document();
        // creation time in epoch seconds
        document.put("create_time", (int)(System.currentTimeMillis()/1000));
        document.append("keyword", dataMap.get("keyword"))
                .append("creator", dataMap.get("creator"))
                .append("edit_number", dataMap.get("editNumber"))
                .append("recent_update", dataMap.get("recentUpdate"));
        collection.insertOne(document);
    }

    /**
     * Fetches the current statistics of an entry and compares them with the stored snapshot.
     *
     * <p>When no snapshot exists the current values are stored. When the stored edit count
     * differs, a notification is pushed and a new snapshot is stored. If the page cannot be
     * fetched or parsed, nothing is pushed or stored, so a transient failure cannot produce a
     * false "updated" alert or a snapshot full of nulls.
     *
     * @param keyword     entry keyword
     * @param monitorUrl  statistics URL; its first two query parameters are reused to build
     *                    the history link
     * @param pushAddress webhook URL that receives the notification
     */
    public void dataCompare(String keyword, String monitorUrl, String pushAddress){
        String htmlBody = getHtmlBody(monitorUrl);
        Map<String, Object> dataMap = new HashMap<>();
        try {
            JSONObject jsonObject = JSONObject.parseObject(htmlBody);
            JSONObject data = jsonObject == null ? null : jsonObject.getJSONObject("data");
            if (data == null) {
                throw new IllegalStateException("响应中缺少 data 字段: " + monitorUrl);
            }
            // Stored under "creator", but the value is the user who last updated the entry
            String creator = "最近更新者:" + data.getString("entry_last_update_user");
            // Edit count
            String editNumber = "编辑次数:" + data.get("entry_edit_times");
            // Last update time
            String recentUpdate = "更新时间:" + data.get("entry_update_time");
            // History link: first query parameter, then the one following it
            String eid = monitorUrl.replaceAll(".*(?<=\\?)|(?=\\&).*", "");
            // Quote eid so characters in the URL are never interpreted as regex syntax
            String sid = monitorUrl.replaceAll(".*" + java.util.regex.Pattern.quote(eid) + "&" + "|\\&.*", "");
            String compareUrl = "https://baike.so.com/history?" + eid + "&" + sid;
            dataMap.put("keyword", keyword);
            dataMap.put("creator", creator);
            dataMap.put("editNumber", editNumber);
            dataMap.put("recentUpdate", recentUpdate);
            dataMap.put("compareUrl", compareUrl);
            dataMap.put("pushAddress", pushAddress);
        } catch (Exception e) {
            // Keep the cause and stop: continuing with an empty map would push and store nulls.
            new Throwable("数据解析错误", e).printStackTrace();
            return;
        }
        Document inDataBaseData = findInDataBaseData(keyword);
        if (Objects.nonNull(inDataBaseData)) {
            String storedEditNumber = inDataBaseData.getString("edit_number");
            // Objects.equals tolerates a snapshot whose edit_number was stored as null
            if (!Objects.equals(storedEditNumber, dataMap.get("editNumber"))) {
                System.out.println(keyword + " \t数据有更新");
                System.out.println("\n\n开始推送......\n\n");
                // Push the notification
                setHotSearchDataAndPushContent(dataMap);
                // Store the new snapshot
                saveData(dataMap);
            } else {
                System.out.println(keyword + " 在mongo中的数据: " + inDataBaseData.toString());
                System.out.println(keyword + " 无数据更新.");
            }
        } else {
            System.out.println("第一次访问:" + keyword);
            // Store the first snapshot
            saveData(dataMap);
        }
    }

    /**
     * Posts a {@code news} message with the given articles to a webhook.
     * Despite the method name, the message type sent is {@code news}.
     *
     * @param content article maps placed under {@code news.articles}
     * @param sendUrl webhook URL
     */
    private static void sendWorkWechatByMarkdown(List<Map<String, String>> content, String sendUrl) {
        HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
        Map<String, Object> newsMap = new HashMap<>();
        newsMap.put("articles", content);
        Map<String, Object> params = new HashMap<>();
        params.put("msgtype", "news");
        params.put("news", newsMap);
        String data = JSONObject.toJSONString(params);
        try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(sendUrl, RequestBody.create(MediaType.parse("application/json"), data)))) {
            String htmlBody = response.body().string();
            if (htmlBody.contains("ok")) {
                System.out.println("----------企业微信账号数据推送成功-----------");
            } else {
                System.out.println("----------企业微信账号数据推送失败-----------");
            }
        } catch (Exception e) {
            System.out.println("----------企业微信账号数据推送失败,出现错误-----------");
            e.printStackTrace();
        }
    }

    /**
     * Builds the single-article notification for a changed entry and pushes it.
     *
     * @param dataMap snapshot values; the keys {@code keyword}, {@code recentUpdate},
     *                {@code editNumber}, {@code compareUrl} and {@code pushAddress} are read
     */
    public static void setHotSearchDataAndPushContent(Map<String, Object> dataMap) {
        List<Map<String, String>> listContent = new ArrayList<>();
        Map<String, String> map = new HashMap<>(4);
        map.put("title", (String) dataMap.get("keyword"));
        map.put("description", "数据有更新\n"
                + dataMap.get("recentUpdate") + " " + dataMap.get("editNumber")
                + "\n");
        map.put("url", (String) dataMap.get("compareUrl"));
        map.put("picurl", "https://login.zhiweidata.com/plogin/img/cat.8de03170.png");
        listContent.add(map);
        sendWorkWechatByMarkdown(listContent, (String) dataMap.get("pushAddress"));
    }

    /**
     * Downloads a URL, trying up to three times.
     *
     * @param url address to fetch
     * @return the response body, or {@code null} when every attempt failed
     */
    public String getHtmlBody(String url){
        HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
        for (int i = 0; i < 3; i++) {
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url))) {
                return response.body().string();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return null;
    }
}
package com.zhiwei.crawler.monitor;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.transfertest.mongo.MongoUtils;
import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.MediaType;
import okhttp3.RequestBody;
import okhttp3.Response;
import org.apache.logging.log4j.util.Strings;
import org.bson.Document;
import org.jsoup.Jsoup;
import org.seimicrawler.xpath.JXDocument;
import java.util.*;
/**
 * Sogou Baike entry monitor: scrapes the entry statistics from the page, compares the edit
 * count text with the latest snapshot stored in the {@code souGouBaiKe} Mongo collection, and
 * pushes a WeCom (enterprise WeChat) webhook message when it differs.
 *
 * @author 朝花夕誓
 */
public class SouGouBaiKeMonitor {
    /**
     * Collects the monitor records that have a non-blank keyword, push address and Sogou Baike URL.
     *
     * @return one map per usable record with the keys {@code keyword}, {@code pushAddress} and
     *         {@code souGouBaiKeUrl}
     */
    public List<Map<String, Object>> findKeyWordAddress(){
        List<Map<String, Object>> dataList = new ArrayList<>();
        MongoUtils mongoUtils = new MongoUtils();
        // try-with-resources so the cursor is released even if iteration fails
        try (MongoCursor<Document> allData = mongoUtils.findAllData()) {
            while (allData.hasNext()) {
                Document next = allData.next();
                String keyword;
                String pushAddress;
                String souGouBaiKeUrl;
                try {
                    keyword = next.getString("keyword");
                    pushAddress = next.getString("push_address");
                    souGouBaiKeUrl = next.getString("sougoubaike_url");
                } catch (ClassCastException e) {
                    // A field that is not stored as a string makes the record unusable; skip it.
                    e.printStackTrace();
                    continue;
                }
                if (Strings.isNotBlank(keyword) && Strings.isNotBlank(pushAddress) && Strings.isNotBlank(souGouBaiKeUrl)) {
                    Map<String, Object> dataMap = new HashMap<>();
                    dataMap.put("keyword", keyword);
                    dataMap.put("pushAddress", pushAddress);
                    dataMap.put("souGouBaiKeUrl", souGouBaiKeUrl);
                    dataList.add(dataMap);
                }
            }
        }
        return dataList;
    }

    /**
     * Looks up the most recent snapshot stored for a keyword.
     *
     * @param keyword keyword to query
     * @return the document with the highest {@code create_time} for the keyword, or
     *         {@code null} when none exists
     */
    public Document findInDataBaseData(String keyword){
        MongoUtils mongoUtils = new MongoUtils();
        MongoDatabase mongoDataBase = mongoUtils.getMongoDataBase();
        MongoCollection<Document> souGouBaiKe = mongoDataBase.getCollection("souGouBaiKe");
        Document query = new Document();
        query.put("keyword", keyword);
        // -1 sorts newest first
        Document createTime = new Document();
        createTime.put("create_time", -1);
        return souGouBaiKe.find(query).sort(createTime).first();
    }

    /**
     * Inserts a snapshot into the {@code souGouBaiKe} collection.
     *
     * @param dataMap snapshot values; the keys {@code keyword}, {@code creator},
     *                {@code editNumber} and {@code recentUpdate} are read
     */
    public void saveData(Map<String, Object> dataMap){
        MongoUtils mongoUtils = new MongoUtils();
        MongoDatabase mongoDataBase = mongoUtils.getMongoDataBase();
        MongoCollection<Document> souGouBaiKe = mongoDataBase.getCollection("souGouBaiKe");
        Document document = new Document();
        // creation time in epoch seconds
        document.put("create_time", (int)(System.currentTimeMillis()/1000));
        document.append("keyword", dataMap.get("keyword"))
                .append("creator", dataMap.get("creator"))
                .append("edit_number", dataMap.get("editNumber"))
                .append("recent_update", dataMap.get("recentUpdate"));
        souGouBaiKe.insertOne(document);
    }

    /**
     * Fetches the entry page and compares its edit count text with the stored snapshot.
     *
     * <p>When no snapshot exists the current values are stored. When the stored edit count
     * differs, a notification is pushed and a new snapshot is stored. If the page cannot be
     * fetched or does not contain the expected nodes, nothing is pushed or stored, so a
     * transient failure or layout change cannot produce a false "updated" alert.
     *
     * @param keyword     entry keyword
     * @param monitorUrl  entry page URL
     * @param pushAddress webhook URL that receives the notification
     */
    public void dataCompare(String keyword, String monitorUrl, String pushAddress){
        String htmlBody = getHtmlBody(monitorUrl);
        Map<String, Object> dataMap = new HashMap<>();
        try {
            if (htmlBody == null) {
                throw new IllegalStateException("页面获取失败: " + monitorUrl);
            }
            // Parsing happens inside the try so a malformed page cannot escape to the caller
            org.jsoup.nodes.Document parse = Jsoup.parse(htmlBody);
            JXDocument jxDocument = JXDocument.create(parse);
            // Creator
            String creator = jxDocument.selNOne("//ul[@class='lemma_data']/li[1]").asElement().text();
            // Edit count
            String editNumber = jxDocument.selNOne("//ul[@class='lemma_data']/li[3]").asElement().text();
            // Last update
            String recentUpdate = jxDocument.selNOne("//ul[@class='lemma_data']/li[5]").asElement().text();
            // History link used in the notification
            String compareUrl = "https://baike.sogou.com" + jxDocument.selNOne("//ul[@class='lemma_data']/li[3]//a").asElement().attr("href");
            dataMap.put("keyword", keyword);
            dataMap.put("creator", creator);
            dataMap.put("editNumber", editNumber);
            dataMap.put("recentUpdate", recentUpdate);
            dataMap.put("compareUrl", compareUrl);
            dataMap.put("pushAddress", pushAddress);
        } catch (Exception e) {
            // Keep the cause and stop: continuing with an empty map would push and store nulls.
            new Throwable("数据解析错误", e).printStackTrace();
            return;
        }
        Document inDataBaseData = findInDataBaseData(keyword);
        if (Objects.nonNull(inDataBaseData)) {
            String storedEditNumber = inDataBaseData.getString("edit_number");
            // Objects.equals tolerates a snapshot whose edit_number was stored as null
            if (!Objects.equals(storedEditNumber, dataMap.get("editNumber"))) {
                System.out.println(keyword + " \t数据有更新");
                System.out.println("\n\n开始推送......\n\n");
                // Push the notification
                setHotSearchDataAndPushContent(dataMap);
                // Store the new snapshot
                saveData(dataMap);
            } else {
                System.out.println(keyword + " 在mongo中的数据: " + inDataBaseData.toString());
                System.out.println(keyword + " 无数据更新.");
            }
        } else {
            System.out.println("第一次访问:" + keyword);
            // Store the first snapshot
            saveData(dataMap);
        }
    }

    /**
     * Posts a {@code news} message with the given articles to a webhook.
     * Despite the method name, the message type sent is {@code news}.
     *
     * @param content article maps placed under {@code news.articles}
     * @param sendUrl webhook URL
     */
    private static void sendWorkWechatByMarkdown(List<Map<String, String>> content, String sendUrl) {
        HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
        Map<String, Object> newsMap = new HashMap<>();
        newsMap.put("articles", content);
        Map<String, Object> params = new HashMap<>();
        params.put("msgtype", "news");
        params.put("news", newsMap);
        String data = JSONObject.toJSONString(params);
        try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(sendUrl, RequestBody.create(MediaType.parse("application/json"), data)))) {
            String htmlBody = response.body().string();
            if (htmlBody.contains("ok")) {
                System.out.println("----------企业微信账号数据推送成功-----------");
            } else {
                System.out.println("----------企业微信账号数据推送失败-----------");
            }
        } catch (Exception e) {
            System.out.println("----------企业微信账号数据推送失败,出现错误-----------");
            e.printStackTrace();
        }
    }

    /**
     * Builds the single-article notification for a changed entry and pushes it.
     *
     * @param dataMap snapshot values; the keys {@code keyword}, {@code recentUpdate},
     *                {@code editNumber}, {@code compareUrl} and {@code pushAddress} are read
     */
    public static void setHotSearchDataAndPushContent(Map<String, Object> dataMap) {
        List<Map<String, String>> listContent = new ArrayList<>();
        Map<String, String> map = new HashMap<>(4);
        map.put("title", (String) dataMap.get("keyword"));
        map.put("description", "数据有更新\n"
                + dataMap.get("recentUpdate") + " " + dataMap.get("editNumber")
                + "\n");
        map.put("url", (String) dataMap.get("compareUrl"));
        map.put("picurl", "https://login.zhiweidata.com/plogin/img/cat.8de03170.png");
        listContent.add(map);
        sendWorkWechatByMarkdown(listContent, (String) dataMap.get("pushAddress"));
    }

    /**
     * Downloads a URL, trying up to three times.
     *
     * @param url address to fetch
     * @return the response body, or {@code null} when every attempt failed
     */
    public String getHtmlBody(String url){
        HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
        for (int i = 0; i < 3; i++) {
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url))) {
                return response.body().string();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return null;
    }
}
package com.zhiwei.crawler.transfertest;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.transfertest.mongo.MongoUtils;
import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.seimicrawler.xpath.JXDocument;
import org.seimicrawler.xpath.JXNode;
import org.slf4j.LoggerFactory;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Objects;
/**
 * Baidu Baike change notifier.
 * Decides whether an entry changed by comparing the "history version" text scraped from the
 * page with the value stored for the keyword in the database.
 * time 2020/06/08
 * @author zgs
 *
 */
public class BaiDuBaiKeDiffPush {
    // Shared HTTP client with three retries
    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
    // URL of the entry currently being processed; written by requestUrl, read by parseHtml
    private static String savaUrl;
    // Un-encoded keyword currently being processed; written by requestUrl, read by parseHtml
    private static String searchKeyWord;

    /**
     * Fetches the entry page and hands the body to {@link #parseHtml(String, String)}.
     *
     * <p>Every failure (blank or invalid URL, network error, unexpected page layout) is
     * reported on the console and swallowed here, so one broken entry cannot abort the
     * caller's round.
     *
     * @param keyWord            keyword of the entry, used for the database lookup
     * @param url                entry page URL
     * @param pushWeiChatAddress webhook URL that receives the notification
     */
    public void requestUrl(String keyWord, String url, String pushWeiChatAddress){
        // parseHtml reads these two values
        searchKeyWord = keyWord;
        savaUrl = url;
        if (url == null || url.trim().isEmpty()) {
            System.out.println("监控地址为空,跳过: " + keyWord);
            return;
        }
        // Request creation is inside the try: an invalid URL must not propagate to the caller.
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url))) {
            String htmlBody = response.body().string();
            parseHtml(htmlBody, pushWeiChatAddress);
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("页面请求出错 :");
        }
    }

    /**
     * Extracts the entry metadata from the page, then stores it on first sight or pushes a
     * notification and stores it when the history version text changed.
     *
     * <p>Relies on the keyword and URL remembered by the preceding
     * {@link #requestUrl(String, String, String)} call.
     *
     * @param htmlBody           HTML of the entry page
     * @param pushWeiChatAddress webhook URL that receives the notification
     * @throws IllegalStateException if an expected node is missing from the page
     */
    public void parseHtml(String htmlBody, String pushWeiChatAddress){
        Document document = Jsoup.parse(htmlBody);
        JXDocument jxDocument = JXDocument.create(document);
        // History version text
        String historyVersion = requireNode(jxDocument, "//dd[@class='description split-line']/ul/li[2]").asElement().text();
        // Editor
        String editor = requireNode(jxDocument, "//a[@class='show-userCard']").asElement().text();
        // Link included in the notification
        String pushUrl = "https://baike.baidu.com" + requireNode(jxDocument, "//a[@class='nslog:1021']").asElement().attr("href");
        // Modification time
        String recentUpdate = requireNode(jxDocument, "//span[@class='j-modified-time']").asElement().text();
        // Article text
        String content = requireNode(jxDocument, "//div[@class='main-content']").asElement().text();
        // No QiXinBao data is collected here; an empty value is stored
        String qiXinBaoData = "";
        org.bson.Document dbResultDocument = new MongoUtils().findDoucment(searchKeyWord);
        if (Objects.isNull(dbResultDocument)){
            // Nothing stored yet: first visit, store directly
            System.out.println("第一次访问..." + searchKeyWord);
            BaiDuBaiKeHelp.saveData(savaUrl, recentUpdate, qiXinBaoData, editor, content, searchKeyWord, historyVersion, pushUrl);
            return;
        }
        String storedHistoryVersion = dbResultDocument.getString("history_version");
        // Objects.equals tolerates a stored record without history_version
        if (!Objects.equals(storedHistoryVersion, historyVersion)){
            System.out.println(searchKeyWord + "\t数据有更新...");
            // Push the notification
            PushToWorkWeiXin.setHotSearchDataAndPushContent(searchKeyWord, pushUrl, pushWeiChatAddress, editor, recentUpdate, historyVersion);
            System.out.println("\n\n开始推送......\n\n");
            // Store the new state
            BaiDuBaiKeHelp.saveData(savaUrl, recentUpdate, qiXinBaoData, editor, content, searchKeyWord, historyVersion, pushUrl);
        }else {
            System.out.println("无数据更新....");
        }
    }

    /** Returns the first node matching the XPath, or fails with a message naming the XPath. */
    private static JXNode requireNode(JXDocument jxDocument, String xpath) {
        JXNode node = jxDocument.selNOne(xpath);
        if (node == null) {
            throw new IllegalStateException("页面结构不符合预期,未找到节点: " + xpath);
        }
        return node;
    }
}
package com.zhiwei.crawler.transfertest;
import com.zhiwei.crawler.transfertest.mongo.MongoUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.seimicrawler.xpath.JXDocument;
import java.util.HashMap;
import java.util.Map;
/**
* 百度百科的工具类
*/
public class BaiDuBaiKeHelp {
/**
 * Parses an HTML string into an XPath-queryable document.
 *
 * @param htmlBody HTML text to parse
 * @return the {@code JXDocument} created from the parsed Jsoup document
 */
public JXDocument documentParse(String htmlBody){
    return JXDocument.create(Jsoup.parse(htmlBody));
}
/**
 * Builds the browser-like request headers used for Baidu Baike pages.
 *
 * @return a new map of header name to header value
 */
public static HashMap requestHeader(){
    // Header name/value pairs, inserted in this order
    String[][] pairs = {
        {"Host", "baike.baidu.com"},
        {"User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"},
        {"Sec-Fetch-User", "?1"},
        {"Upgrade-Insecure-Requests", "1"},
        {"Sec-Fetch-Mode", "navigate"},
        {"Sec-Fetch-Dest", "document"},
    };
    HashMap<String, Object> headers = new HashMap<>(7);
    for (String[] pair : pairs) {
        headers.put(pair[0], pair[1]);
    }
    return headers;
}
/**
 * Stores one Baidu Baike snapshot through {@code MongoUtils.insertDocument}.
 *
 * <p>The document id is the URL followed by {@code _} and the creation time in epoch
 * seconds. The same timestamp is used for {@code create_time} and the id so the two can
 * never disagree.
 *
 * @param url            entry page URL
 * @param recentUpdate   most recent modification time shown on the page
 * @param qiXinBaoData   QiXinBao data; may be blank
 * @param editor         editor shown on the page
 * @param content        article text
 * @param searchKeyWord  keyword the entry is monitored under
 * @param historyVersion history version text used for change detection
 * @param pushUrl        link included in notifications
 */
public static void saveData(String url, String recentUpdate, String qiXinBaoData, String editor, String content, String searchKeyWord, String historyVersion, String pushUrl){
    Map<String, Object> data = new HashMap<>(16);
    // Read the clock once: two separate reads could straddle a second boundary.
    int createTime = (int)(System.currentTimeMillis()/1000);
    data.put("create_time", createTime);
    data.put("_id", url.concat("_").concat(String.valueOf(createTime)));
    data.put("push_url", pushUrl);
    data.put("history_version", historyVersion);
    data.put("recent_update", recentUpdate);
    data.put("qi_xin_bao_data", qiXinBaoData);
    data.put("editor", editor);
    // The combined text is stored only when QiXinBao data is present
    if (StringUtils.isNotBlank(qiXinBaoData)){
        data.put("all_content", content.concat(qiXinBaoData));
    }
    data.put("search_key_word", searchKeyWord);
    data.put("article_content", content);
    data.put("url", url);
    // Insert into the database
    MongoUtils mongoUtils = new MongoUtils();
    mongoUtils.insertDocument(data);
}
// public static void saveData2(Map dataMap){
// // 存放数据
// Map<String, Object> data = new HashMap<>(16);
// // 入库时间
// data.put("create_time", (int)(System.currentTimeMillis()/1000));
// // 入库的ID
//// try {
//// data.put("_id", url.concat(recentUpdate));
//// }catch (Exception e){
//// System.out.println("数据库中_id重复。\t 选用当前URL加上时间戳的形式存储。");
//// e.printStackTrace();
// data.put("_id", url.concat("_").concat(String.valueOf((int)(System.currentTimeMillis()/1000))));
//// }
// // 存放推送地址
// data.put("push_url", pushUrl);
// // 存放历史版本
// data.put("history_version", historyVersion);
// // 1.存放最新更新的时间
// data.put("recent_update", recentUpdate);
// // 2.存放启信宝数据
// data.put("qi_xin_bao_data", qiXinBaoData);
// // 3.存放编辑人的信息
// data.put("editor", editor);
// // 4.存放全部信息(启信宝加上文章内容)
// if (StringUtils.isNotBlank(qiXinBaoData)){
// data.put("all_content", content.concat(qiXinBaoData));
// }
// // 5.存放搜索的关键词
// data.put("search_key_word", searchKeyWord);
// // 6.存放文章内容
// data.put("article_content", content);
// // 7.存放Url
// data.put("url", url);
//
// // 入库操作
// MongoUtils mongoUtils = new MongoUtils();
// mongoUtils.insertDocument(data);
// }
}
package com.zhiwei.crawler.transfertest;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.MediaType;
import okhttp3.RequestBody;
/**
 * Pushes Baidu Baike change notifications to a WeCom (enterprise WeChat) robot webhook.
 *
 * @author 朝花夕誓
 */
public class PushToWorkWeiXin {
    private static Logger logger = LogManager.getLogger(PushToWorkWeiXin.class.getName());
    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /**
     * Posts a {@code news} message carrying the given articles to the webhook and logs
     * whether the response body contains {@code ok}.
     *
     * @param content article maps placed under {@code news.articles}
     * @param sendUrl webhook URL
     */
    private static void sendWorkWechatByMarkdown(List<Map<String, String>> content, String sendUrl) {
        Map<String, Object> news = new HashMap<>();
        news.put("articles", content);

        Map<String, Object> payload = new HashMap<>();
        payload.put("msgtype", "news");
        payload.put("news", news);

        String json = JSONObject.toJSONString(payload);
        try {
            RequestBody requestBody = RequestBody.create(MediaType.parse("application/json"), json);
            String reply = httpBoot.syncCall(RequestUtils.wrapPost(sendUrl, requestBody)).body().string();
            boolean accepted = reply.contains("ok");
            if (!accepted) {
                logger.info("----------企业微信账号数据推送失败-----------");
            } else {
                logger.info("----------企业微信账号数据推送成功-----------");
            }
        } catch (Exception e) {
            logger.error("----------企业微信账号数据推送失败,出现错误-----------",e);
        }
    }

    /**
     * Builds the single-article notification for a changed entry and pushes it.
     *
     * @param companyName    article title
     * @param url            link opened from the article
     * @param pushAddress    webhook URL
     * @param editor         editor shown in the description
     * @param recentUpdate   edit date text; parentheses are stripped before display
     * @param historyVersion history version text appended to the description
     */
    public static void setHotSearchDataAndPushContent(String companyName, String url, String pushAddress, String editor, String recentUpdate, String historyVersion) {
        String editDate = recentUpdate.replaceAll("\\(|\\)|\\(|\\)", "");
        String description = "数据有更新\n"
                + "编辑日期:" + editDate + "\t 编辑:" + editor
                + "\n" + historyVersion;

        Map<String, String> article = new HashMap<>(4);
        article.put("title", companyName);
        article.put("description", description);
        article.put("url", url);
        article.put("picurl", "https://login.zhiweidata.com/plogin/img/cat.8de03170.png");

        List<Map<String, String>> articles = new ArrayList<>();
        articles.add(article);
        sendWorkWechatByMarkdown(articles, pushAddress);
    }
}
package com.zhiwei.crawler.transfertest;
import lombok.extern.log4j.Log4j;
import org.python.core.*;
import org.python.util.PythonInterpreter;
/**
* @author 朝花夕誓
*/
@Log4j
public class StrCompare {
public static String compare(Object compareStr1, Object compareStr2){
PythonInterpreter interpreter = new PythonInterpreter();
interpreter.execfile("D:\\PythonProjects\\com\\zhiwei\\difflibTest\\difflibTest01.py");
// 第一个参数为期望获得的函数(变量)的名字,第二个参数为期望返回的对象类型
PyFunction pyFunction = interpreter.get("compareTest", PyFunction.class);
//调用函数,如果函数需要参数,在Java中必须先将参数转化为对应的“Python类型”
PyObject pyobj = null;
try{
pyobj = pyFunction.__call__((PyType) compareStr1, (PyType) compareStr2);
}catch (ClassCastException e){
log.error("Java 转 Python 类型出错", e);
}
return String.valueOf(pyobj);
}
}
package com.zhiwei.crawler.transfertest;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import java.io.*;
import java.util.HashMap;
import java.util.Map;
/**
 * Download test using buffered byte streams.
 *
 * @author 朝花夕誓
 */
public class Test {
    /**
     * Fetches a fixed image URL with jsoup and copies the response body to
     * {@code 1.jpg} in the working directory.
     *
     * @param args unused
     * @throws IOException if the request or the file copy fails
     */
    public static void main(String[] args) throws IOException {
        Connection.Response response = Jsoup
                .connect("https://tpc.googlesyndication.com/simgad/6378499198176166429/2076313506083323656")
                .ignoreContentType(true)
                .method(Connection.Method.GET)
                .execute();

        // try-with-resources closes both streams even when the copy fails;
        // closing the buffered output stream also flushes it.
        try (BufferedInputStream in = response.bodyStream();
             BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(new File("1.jpg")))) {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = in.read(buffer, 0, buffer.length)) != -1) {
                out.write(buffer, 0, len);
            }
        }
    }
}
package com.zhiwei.crawler.transfertest.mongo;
import com.mongodb.client.*;
import org.apache.logging.log4j.util.Strings;
import org.bson.Document;
import org.junit.Test;
import java.util.*;
/**
 * MongoDB helper for the {@code baidubaike} database: wraps an insert method
 * and several find methods. The keyword lookup sorts by the
 * {@code create_time} field in descending order.
 *
 * @author 朝花夕誓
 */
public class MongoUtils {
    static com.mongodb.client.MongoClient mongoClient;
    // Connection string
    static String connectionUrl = "mongodb://localhost:27017/?readPreference=primary&appname=MongoDB%20Compass%20Community&ssl=false";
    static {
        // Create the shared client once when the class is loaded
        mongoClient = MongoClients.create(connectionUrl);
    }

    /**
     * Inserts one document into the {@code baiKeData} collection.
     *
     * @param data field name to value mapping; every key must be a {@code String}
     */
    public void insertDocument(Map data){
        MongoDatabase mongoDB = mongoClient.getDatabase("baidubaike");
        MongoCollection<Document> collection = mongoDB.getCollection("baiKeData");
        // Copy every entry of the map into a new document, then insert it
        Document canvas = new Document();
        for (Object key : data.keySet()){
            canvas.append((String) key, data.get(key));
        }
        collection.insertOne(canvas);
    }

    /**
     * Finds the document in {@code baiKeData} whose {@code search_key_word}
     * equals the given keyword, taking the first one when sorted by
     * {@code create_time} descending.
     *
     * @param searchKeyWord keyword to look up
     * @return the matching document, or {@code null} when none exists
     */
    public Document findDoucment(String searchKeyWord){
        MongoDatabase mongoDB = mongoClient.getDatabase("baidubaike");
        MongoCollection<Document> collection = mongoDB.getCollection("baiKeData");
        Document query = new Document();
        query.put("search_key_word", searchKeyWord);
        Document createTime = new Document();
        createTime.put("create_time", -1);
        Document resultDocument = collection.find(query).sort(createTime).first();
        if(Objects.nonNull(resultDocument)){
            System.out.println(searchKeyWord + "在MongoDB中的数据:" + resultDocument);
            return resultDocument;
        }
        return null;
    }

    /**
     * Reads every document of {@code baiKeKeyword} and returns those that have
     * non-blank {@code keyword}, {@code monitor_url} and {@code push_address}
     * values. Each returned map uses the keys {@code keyword},
     * {@code monitorUrl} and {@code pushAddress}.
     *
     * @return the usable keyword entries; empty when there are none
     */
    // NOTE(review): JUnit 4 only accepts void @Test methods, so this annotation
    // looks accidental on a utility method; kept to avoid changing the class surface.
    @Test
    public List<Map<String, String>> findKeywordData(){
        List<Map<String, String>> dataList = new ArrayList<>();
        MongoDatabase baiDubaiKe = mongoClient.getDatabase("baidubaike");
        MongoCollection<Document> collection = baiDubaiKe.getCollection("baiKeKeyword");
        // The cursor is closed when the block exits, including on exceptions.
        try (MongoCursor<Document> cursor = collection.find().iterator()) {
            while (cursor.hasNext()){
                Document next = cursor.next();
                String keyword = null;
                String monitorUrl = null;
                String pushAddress = null;
                try {
                    keyword = next.getString("keyword");
                    monitorUrl = next.getString("monitor_url");
                    pushAddress = next.getString("push_address");
                } catch (Exception e) {
                    // A field of an unexpected type leaves the remaining values null,
                    // so the record is skipped by the blank check below.
                    e.printStackTrace();
                }
                if (Strings.isNotBlank(keyword) && Strings.isNotBlank(monitorUrl) && Strings.isNotBlank(pushAddress)){
                    Map<String, String> dataMap = new HashMap<>();
                    dataMap.put("keyword", keyword);
                    dataMap.put("monitorUrl", monitorUrl);
                    dataMap.put("pushAddress", pushAddress);
                    dataList.add(dataMap);
                }
            }
        }
        return dataList;
    }

    /**
     * Opens a cursor over every document of {@code baiKeKeyword}.
     * The caller owns the returned cursor and should close it when done.
     *
     * @return cursor over all keyword documents
     */
    public MongoCursor<Document> findAllData(){
        MongoDatabase database = mongoClient.getDatabase("baidubaike");
        MongoCollection<Document> baiKeKeyword = database.getCollection("baiKeKeyword");
        MongoCursor<Document> iterator = baiKeKeyword.find().iterator();
        return iterator;
    }

    /**
     * @return handle to the {@code baidubaike} database
     */
    public MongoDatabase getMongoDataBase(){
        MongoDatabase baidubaike = mongoClient.getDatabase("baidubaike");
        return baidubaike;
    }
}
package com.zhiwei.crawler.utils;
import com.zhiwei.crawler.core.HttpBoot;
import okhttp3.Request;
import okhttp3.Response;
import java.io.IOException;
import java.util.Map;
import java.util.Objects;
/**
 * Downloads the source of a web page for a given URL.
 */
public class GetHtmlBody {
    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /**
     * Issues a GET request and returns the response body as text.
     *
     * <p>When {@code headerMap} is non-empty the request is built with those
     * headers. Previously the header-carrying request was built but discarded,
     * so the headers were never sent.
     *
     * @param url       address to fetch
     * @param headerMap optional request headers; may be {@code null} or empty
     * @return the response body, or {@code null} when an {@link IOException}
     *         occurred (the stack trace is printed)
     * @throws IOException declared for compatibility with existing callers
     */
    public static String parserHtml(String url, Map<String, Object> headerMap) throws IOException {
        // Build the request, with custom headers only when some were supplied
        Request request;
        if (Objects.nonNull(headerMap) && !headerMap.isEmpty()) {
            request = RequestUtils.wrapGet(url, headerMap);
        } else {
            request = RequestUtils.wrapGet(url);
        }
        // Send the request and read the page; the response is closed on every path
        try (Response response = httpBoot.syncCall(request)) {
            return response.body().string();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }
}
package com.zhiwei.crawler.utils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Element;
import org.seimicrawler.xpath.JXNode;
import java.util.Objects;
public class ManyJsonParse{
    /**
     * Resolves the first matching element among several alternative xpaths.
     *
     * @param jxNode   node the xpaths are evaluated against
     * @param xpathStr one or more xpath expressions separated by {@code |};
     *                 they are tried in order
     * @return the element of the first expression that selects a node, or
     *         {@code null} when {@code xpathStr} is blank or nothing matches
     */
    public Element getTextByXapth(JXNode jxNode, String xpathStr){
        if(StringUtils.isNotBlank(xpathStr)){
            for (String xpath : xpathStr.split("\\|")) {
                JXNode node = jxNode.selOne(xpath);
                if(Objects.nonNull(node)){
                    return node.asElement();
                }
            }
        }
        return null;
    }

    /**
     * Concatenates the text of the nodes selected by {@code xpathStr[j]} for
     * every index {@code j} from {@code starWith} to {@code endWith} inclusive.
     * Indexes that select no node are skipped instead of raising a
     * {@link NullPointerException}.
     *
     * @param jxNode   node the xpath is evaluated against
     * @param xpathStr xpath prefix to which the positional predicate is appended
     * @param starWith first index (inclusive)
     * @param endWith  last index (inclusive)
     * @return the concatenated text, or {@code null} when {@code xpathStr} is
     *         blank or {@code starWith} is greater than {@code endWith}
     */
    public String getAllTextByXapth(JXNode jxNode, String xpathStr, int starWith, int endWith){
        if (!StringUtils.isNotBlank(xpathStr) || starWith > endWith) {
            return null;
        }
        StringBuilder allText = new StringBuilder();
        for (int j = starWith; j <= endWith; j++) {
            JXNode node = jxNode.selOne(xpathStr + "[" + j + "]");
            if (Objects.nonNull(node)) {
                allText.append(node.asElement().text());
            }
        }
        return allText.toString();
    }
}
\ No newline at end of file
package com.zhiwei.crawler.utils;
import difflib.*;
import difflib.DiffRow.Tag;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.builder.Diff;
import org.junit.Test;
import java.io.File;
import java.io.IOException;
import java.util.List;
public class StrCompare {
    /**
     * Diffs two fixed text files line by line: first prints every revised
     * line of each delta, then prints each delta row labelled by its tag.
     *
     * @throws IOException if either file cannot be read
     */
    @Test
    public void testCompare() throws IOException {
        File originalFile = new File("C:\\Users\\Administrator\\Desktop\\1.txt");
        File revisedFile = new File("C:\\Users\\Administrator\\Desktop\\2.txt");
        List<String> original = FileUtils.readLines(originalFile);
        List<String> revised = FileUtils.readLines(revisedFile);
        Patch patch = DiffUtils.diff(original, revised);

        // Pass 1: dump the revised side of every delta.
        for (Delta delta : patch.getDeltas()) {
            List<?> revisedLines = delta.getRevised().getLines();
            revisedLines.forEach(line -> System.out.println(line));
        }

        // Pass 2: per-row report without inline diff markers.
        DiffRowGenerator generator = new DiffRowGenerator.Builder().showInlineDiffs(false).build();
        for (Delta delta : patch.getDeltas()) {
            List<String> before = (List<String>) delta.getOriginal().getLines();
            List<String> after = (List<String>) delta.getRevised().getLines();
            for (DiffRow row : generator.generateDiffRows(before, after)) {
                printRow(row);
            }
        }
    }

    /** Prints one diff row under a heading that depends on its tag. */
    private static void printRow(DiffRow row) {
        Tag tag = row.getTag();
        boolean showOld;
        boolean showNew;
        if (tag == Tag.INSERT) {
            System.out.println("Insert: ");
            showOld = false;
            showNew = true;
        } else if (tag == Tag.CHANGE) {
            System.out.println("change: ");
            showOld = true;
            showNew = true;
        } else if (tag == Tag.DELETE) {
            System.out.println("delete: ");
            showOld = true;
            showNew = false;
        } else if (tag == Tag.EQUAL) {
            System.out.println("equal: ");
            showOld = true;
            showNew = true;
        } else {
            throw new IllegalStateException("Unknown pattern tag: " + tag);
        }
        if (showOld) {
            System.out.println("old-> " + row.getOldLine());
        }
        if (showNew) {
            System.out.println("new-> " + row.getNewLine());
        }
        System.out.println();
    }
}
package com.zhiwei.crawler.utils;
import org.python.modules.itertools.count;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class test {
    /**
     * Prints every character of a sample string that falls in the range
     * U+4E00..U+9FA5, one per line, followed by a line with the total count.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String sample = "AQWASD我们都是好孩子AAAA11222 ";
        Matcher hits = Pattern.compile("[\\u4e00-\\u9fa5]").matcher(sample);

        int total = 0;
        for (; hits.find(); total++) {
            System.out.println(hits.group());
        }
        System.out.println("共有 " + total + "个 ");
    }
}
/**
 * Counting task: increments a counter and prints it until the counter
 * exceeds 10,000,000,000.
 *
 * <p>The counter is a {@code long}. As an {@code int} it could never exceed
 * the limit (it would overflow and wrap), so the loop never ended.
 * The increment is a plain, non-atomic update of an ordinary field.
 */
class MyThread implements Runnable{
    /** Current counter value. */
    long starData;
    /** Loop flag; cleared once the counter passes the limit. */
    boolean coursor = true;

    /**
     * @param data initial counter value
     */
    public MyThread(int data) {
        this.starData = data;
    }

    /** Increments and prints the counter until it is greater than 10000000000. */
    @Override
    public void run() {
        while (coursor){
            starData++;
            System.out.println(Thread.currentThread().getName() + "is run, number is : " + starData);
            if (starData>10000000000L){
                coursor = false;
            }
        }
    }
}
package com.zhiwei.crawler;
import com.zhiwei.crawler.transfertest.BaiDuBaiKeDiffPush;
import com.zhiwei.crawler.transfertest.mongo.MongoUtils;
import org.apache.commons.lang3.time.FastDateFormat;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
public class BaiDuBaiKeDiffPushTest {
    /**
     * Schedules an hourly job (starting immediately) that loads the keyword
     * entries from MongoDB and calls {@code requestUrl} for each, pausing
     * three seconds between entries.
     *
     * <p>Failures are caught inside the task: an exception escaping a
     * {@code scheduleAtFixedRate} task would suppress all later runs.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        BaiDuBaiKeDiffPush baiDuBaiKe = new BaiDuBaiKeDiffPush();
        MongoUtils mongoUtils = new MongoUtils();
        ScheduledExecutorService scheduledExecutorService = Executors.newScheduledThreadPool(1);
        scheduledExecutorService.scheduleAtFixedRate(() -> {
            try {
                List<Map<String, String>> keywordData = mongoUtils.findKeywordData();
                if (keywordData.isEmpty()) {
                    return;
                }
                String timeStamp = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss").format(System.currentTimeMillis());
                System.out.println("\n\n正在访问... " + timeStamp + "\n\n");
                for (Map<String, String> item : keywordData) {
                    try {
                        baiDuBaiKe.requestUrl(item.get("keyword"), item.get("monitorUrl"), item.get("pushAddress"));
                    } catch (RuntimeException e) {
                        // One failing entry must not stop the remaining entries.
                        e.printStackTrace();
                    }
                    try {
                        Thread.sleep(3000);
                    } catch (InterruptedException e) {
                        // Restore the interrupt flag and end this round.
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
            } catch (RuntimeException e) {
                // Keep the schedule alive when loading the keywords fails.
                e.printStackTrace();
            }
        },0,1, TimeUnit.HOURS);
    }
}
package com.zhiwei.crawler;
import com.mongodb.client.MongoCursor;
import com.zhiwei.crawler.monitor.Baike360;
import com.zhiwei.crawler.transfertest.mongo.MongoUtils;
import com.zhiwei.crawler.utils.RequestUtils;
import org.apache.commons.lang3.time.FastDateFormat;
import org.apache.logging.log4j.util.Strings;
import org.bson.Document;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
public class BaiKe360MonitorTest {
    /**
     * Schedules an hourly job (starting immediately) that walks every keyword
     * document and calls {@code dataCompare} for those with non-blank
     * {@code 360baike_url}, {@code keyword} and {@code push_address} values,
     * pausing three seconds after each call.
     *
     * <p>The cursor is closed after every round. Failures are caught inside
     * the task: an exception escaping a {@code scheduleAtFixedRate} task would
     * suppress all later runs.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Baike360 baike360 = new Baike360();
        ScheduledExecutorService scheduledExecutorService = Executors.newScheduledThreadPool(1);
        MongoUtils mongoUtils = new MongoUtils();
        scheduledExecutorService.scheduleAtFixedRate(()->{
            try (MongoCursor<Document> allData = mongoUtils.findAllData()) {
                String timeStamp = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss").format(System.currentTimeMillis());
                System.out.println("\n\n正在访问... " + timeStamp + "\n\n");
                while (allData.hasNext()){
                    Document next = allData.next();
                    boolean handled = false;
                    try {
                        String url = next.getString("360baike_url");
                        String keyword = next.getString("keyword");
                        String pushAddress = next.getString("push_address");
                        if(Strings.isNotBlank(url) && Strings.isNotBlank(keyword) && Strings.isNotBlank(pushAddress)){
                            handled = true;
                            baike360.dataCompare(keyword, url, pushAddress);
                        }
                    } catch (RuntimeException e) {
                        // One failing entry must not stop the remaining entries.
                        e.printStackTrace();
                    }
                    if (handled) {
                        try {
                            Thread.sleep(3000);
                        } catch (InterruptedException e) {
                            // Restore the interrupt flag and end this round.
                            Thread.currentThread().interrupt();
                            return;
                        }
                    }
                }
            } catch (RuntimeException e) {
                // Keep the schedule alive when the database read fails.
                e.printStackTrace();
            }
        }, 0, 1, TimeUnit.HOURS);
    }
}
package com.zhiwei.crawler;
import com.zhiwei.crawler.compare.BaiDuBaiKeMonitor;
import com.zhiwei.crawler.compare.SouGouBaiKeMonitor;
import org.junit.Test;
/**
 * Manual entry point that launches both monitors from a JUnit run.
 *
 * @Author: 朝花夕誓
 * @Date: 2020/11/10 11:46
 * @Version 1.0
 */
public class MainTest {

    /** Invokes {@code start()} on the Baidu monitor, then on the Sogou monitor. */
    @Test
    public void test() {
        BaiDuBaiKeMonitor.start();
        SouGouBaiKeMonitor.start();
    }
}
package com.zhiwei.crawler;
import com.mongodb.client.MongoCursor;
import com.zhiwei.crawler.monitor.SouGouBaiKeMonitor;
import com.zhiwei.crawler.transfertest.mongo.MongoUtils;
import org.apache.commons.lang3.time.FastDateFormat;
import org.apache.logging.log4j.util.Strings;
import org.bson.Document;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
public class SouGouBaiKeMonitorTest {
    /**
     * Schedules an hourly job (starting immediately) that walks every keyword
     * document and calls {@code dataCompare} for those with non-blank
     * {@code sougoubaike_url}, {@code keyword} and {@code push_address}
     * values, pausing three seconds after each call.
     *
     * <p>The cursor is closed after every round. Failures are caught inside
     * the task: an exception escaping a {@code scheduleAtFixedRate} task would
     * suppress all later runs.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SouGouBaiKeMonitor souGouBaiKeMonitor = new SouGouBaiKeMonitor();
        ScheduledExecutorService scheduledExecutorService = Executors.newScheduledThreadPool(1);
        MongoUtils mongoUtils = new MongoUtils();
        scheduledExecutorService.scheduleAtFixedRate(()->{
            try (MongoCursor<Document> allData = mongoUtils.findAllData()) {
                String timeStamp = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss").format(System.currentTimeMillis());
                System.out.println("\n\n正在访问... " + timeStamp + "\n\n");
                while (allData.hasNext()){
                    Document next = allData.next();
                    boolean handled = false;
                    try {
                        String url = next.getString("sougoubaike_url");
                        String keyword = next.getString("keyword");
                        String pushAddress = next.getString("push_address");
                        if(Strings.isNotBlank(url) && Strings.isNotBlank(keyword) && Strings.isNotBlank(pushAddress)){
                            handled = true;
                            souGouBaiKeMonitor.dataCompare(keyword, url, pushAddress);
                        }
                    } catch (RuntimeException e) {
                        // One failing entry must not stop the remaining entries.
                        e.printStackTrace();
                    }
                    if (handled) {
                        try {
                            Thread.sleep(3000);
                        } catch (InterruptedException e) {
                            // Restore the interrupt flag and end this round.
                            Thread.currentThread().interrupt();
                            return;
                        }
                    }
                }
            } catch (RuntimeException e) {
                // Keep the schedule alive when the database read fails.
                e.printStackTrace();
            }
        }, 0, 1, TimeUnit.HOURS);
    }
}
package com.zhiwei.crawler;
//import org.apache.commons.lang3.time.FastDateFormat;
//
//import java.util.Calendar;
import jdk.nashorn.tools.Shell;
import org.apache.commons.lang3.StringUtils;
import javax.swing.*;
import javax.swing.filechooser.FileFilter;
import java.awt.*;
import java.io.File;
import java.io.InputStream;
import java.lang.annotation.Annotation;
import java.util.Arrays;
/**
 * Scratch class: a few throwaway JUnit experiments plus a demonstration of
 * static and instance initializer ordering.
 *
 * @Author: 朝花夕誓
 * @Date: 2020/11/19 12:00
 * @Version 1.0
 */
public class Test {
    private static int a;
    public static int b;

    // Instance initializer: runs on every construction and copies a into b.
    {
        b = a;
    }

    // Static initialization, in textual order: assign b then a, print both,
    // construct an instance (which runs the instance initializer above),
    // then print a again.
    static {
        b = 1;
        a = 2;
        System.out.println("a is ->" + a);
        System.out.println("b is -> " + b);
        new Test();
        System.out.println(Test.a);
    }

    /** Prints the literal {@code 0001}. */
    @org.junit.Test
    public void test02(){
        String value = "0001";
        System.out.println(value.toString());
    }

    /**
     * Opens a file chooser rooted at the current directory with the
     * "all files" filter removed, then prints either the absolute path of the
     * selected file or a cancellation message.
     */
    @org.junit.Test
    public void test() throws Exception {
        JFileChooser chooser = new JFileChooser(".");
        chooser.setAcceptAllFileFilterUsed(false);

        // showOpenDialog returns APPROVE_OPTION only when a file was picked.
        int outcome = chooser.showOpenDialog(null);
        if (outcome != JFileChooser.APPROVE_OPTION) {
            System.out.println("你已取消并关闭了窗口!");
            return;
        }
        String chosenPath = chooser.getSelectedFile().getAbsolutePath();
        System.out.println(chosenPath);
    }

    /** Joins the strings 1, 2 and 3 with commas and prints the result. */
    @org.junit.Test
    public void test2(){
        System.out.println(StringUtils.join(Arrays.asList("1", "2", "3"), ","));
    }
}
package com.zhiwei.crawler;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Experiment: extracts the first two query parameters of a 360 Baike URL
 * using regular-expression replacement.
 *
 * @Author: 朝花夕誓
 * @Date: 2020/11/5 10:07
 * @Version 1.0
 */
public class UrlMatchTest {
    private static Logger log = LoggerFactory.getLogger(UrlMatchTest.class);

    /**
     * Strips everything up to the {@code ?} and everything from the first
     * {@code &} to isolate the first parameter, then removes that parameter
     * and everything after the following {@code &} to isolate the second.
     * Both values are logged and printed.
     */
    @Test
    public void test(){
        final String url = "https://baike.so.com/Asyncdata/entryStat?eid=1038695&sid=1098608&ename=%E8%85%BE%E8%AE%AF";

        final String stripAroundFirstParam = ".*(?<=\\?)|(?=\\&).*";
        String eid = url.replaceAll(stripAroundFirstParam, "");

        // The first parameter is embedded verbatim in the second pattern.
        String stripAroundSecondParam = ".*" + eid + "&|\\&.*";
        String sid = url.replaceAll(stripAroundSecondParam, "");

        log.info("eid is : {}", eid);
        log.info("sid is : {}", sid);
        System.out.println("eid is :" + eid);
        System.out.println("sid is :" + sid);
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment