Commit f0983d8c by chenweiyang

Merge branch 'source-forward-chen' of

http://git.zhiweidata.top/zhangzhiwei/source_forward.git into
source-forward-chen

Conflicts:
	src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
parents b3fce9ac 4860f41e
...@@ -30,12 +30,11 @@ import com.zhiwei.tools.tools.ZhiWeiTools; ...@@ -30,12 +30,11 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
/** /**
* * @author byte-zbs
* @version 1.0.0
* @ClassName UrlLiveCrawler * @ClassName UrlLiveCrawler
* @Description 判断页面是否存在 * @Description 判断页面是否存在
* @author byte-zbs
* @Date 2018年8月20日 下午3:34:57 * @Date 2018年8月20日 下午3:34:57
* @version 1.0.0
*/ */
public class UrlLiveCrawler { public class UrlLiveCrawler {
...@@ -48,7 +47,7 @@ public class UrlLiveCrawler { ...@@ -48,7 +47,7 @@ public class UrlLiveCrawler {
return counter; return counter;
} }
private void start(GroupSync counter,UrlLiveDataCallback callback, String... urls) { private void start(GroupSync counter, UrlLiveDataCallback callback, String... urls) {
if (nonNull(urls) && urls.length > 0) { if (nonNull(urls) && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
counter.add(); counter.add();
...@@ -68,7 +67,6 @@ public class UrlLiveCrawler { ...@@ -68,7 +67,6 @@ public class UrlLiveCrawler {
private GroupSync search(GroupSync counter, String url, private GroupSync search(GroupSync counter, String url,
Attribution attr, UrlLiveDataCallback callback) { Attribution attr, UrlLiveDataCallback callback) {
// System.out.println(url);
url = dealUrl(url); url = dealUrl(url);
logger.info("当前处理 URL: {}", url); logger.info("当前处理 URL: {}", url);
Map<String,String> headers = new HashMap<>(); Map<String,String> headers = new HashMap<>();
...@@ -81,11 +79,12 @@ public class UrlLiveCrawler { ...@@ -81,11 +79,12 @@ public class UrlLiveCrawler {
} }
try { try {
Request request = RequestUtils.wrapGet(url, headers); Request request = RequestUtils.wrapGet(url, headers);
if(Objects.nonNull(request)) { if (Objects.nonNull(request)) {
counter.add(); counter.add();
// , new Proxy(Type.HTTP, new InetSocketAddress("119.3.38.9", 31128)) // , new Proxy(Type.HTTP, new InetSocketAddress("119.3.38.9", 31128))
httpBoot.asyncCall(request, ph).whenComplete((rs,ex) -> { httpBoot.asyncCall(request, ph).whenComplete((rs, ex) -> {
try { try {
System.out.println(rs.code());
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
if(rs.isSuccessful()) { if(rs.isSuccessful()) {
parseHtml(rs.bodyString(), attr, callback); parseHtml(rs.bodyString(), attr, callback);
...@@ -99,24 +98,24 @@ public class UrlLiveCrawler { ...@@ -99,24 +98,24 @@ public class UrlLiveCrawler {
callBack(callback, attr, -1, "程序无法判断"); callBack(callback, attr, -1, "程序无法判断");
} }
} catch (Exception e) { } catch (Exception e) {
logger.error(" 数据是否删除 采集出错 {} ",e); logger.error(" 数据是否删除 采集出错 {} ", e);
}finally { } finally {
counter.done(); counter.done();
} }
}); });
return counter; return counter;
} }
} catch (Exception e2) { } catch (Exception e2) {
logger.error("数据出错 {}" ,e2); logger.error("数据出错 {}", e2);
} }
return counter; return counter;
} }
private void callBack(UrlLiveDataCallback callback,Attribution attr,int i,String title) { private void callBack(UrlLiveDataCallback callback, Attribution attr, int i, String title) {
UrlLiveBean ulb = null; UrlLiveBean ulb = null;
if(i == 1) { if (i == 1) {
ulb = new UrlLiveBean(attr.getAttr().toString(), true, title); ulb = new UrlLiveBean(attr.getAttr().toString(), true, title);
}else { } else {
ulb = new UrlLiveBean(attr.getAttr().toString(), i, title); ulb = new UrlLiveBean(attr.getAttr().toString(), i, title);
} }
if (callback == null) { if (callback == null) {
...@@ -128,17 +127,17 @@ public class UrlLiveCrawler { ...@@ -128,17 +127,17 @@ public class UrlLiveCrawler {
private String dealUrl(String url) { private String dealUrl(String url) {
try { try {
if(url.contains("toutiao.com")) { if (url.contains("toutiao.com")) {
return dealToutiaoUrl(url); return dealToutiaoUrl(url);
}else if(url.contains("mp.weixin.qq.com")) { } else if (url.contains("mp.weixin.qq.com")) {
if(url.contains("https")) { if (url.contains("https")) {
}else { } else {
url = url.replace("http", "https"); url = url.replace("http", "https");
} }
}else if(url.contains("a.mp.uc.cn") && url.contains("wm_aid")) { } else if (url.contains("a.mp.uc.cn") && url.contains("wm_aid")) {
url = "http://ff.dayu.com/contents/origin/"+url.split("wm_aid=")[1].split("!!wm_id")[0]+"?biz_id=1002&_fetch_author=1&_fetch_incrs=1"; url = "http://ff.dayu.com/contents/origin/" + url.split("wm_aid=")[1].split("!!wm_id")[0] + "?biz_id=1002&_fetch_author=1&_fetch_incrs=1";
}else if(url.contains("tznew.58.com/view") && url.contains("infoid=")) { } else if (url.contains("tznew.58.com/view") && url.contains("infoid=")) {
// https://tznew.58.com/view/c/sharingDetailNew?infoid=117073473 // https://tznew.58.com/view/c/sharingDetailNew?infoid=117073473
url = "https://tznew.58.com/tznew/c/info-detail?infoid=" + url.split("infoid=")[1].split("&")[0]; url = "https://tznew.58.com/tznew/c/info-detail?infoid=" + url.split("infoid=")[1].split("&")[0];
} }
...@@ -165,11 +164,10 @@ public class UrlLiveCrawler { ...@@ -165,11 +164,10 @@ public class UrlLiveCrawler {
} }
/** /**
*
* @Description 判断是否删除
* @param html * @param html
* @param attr * @param attr
* @param callback * @param callback
* @Description 判断是否删除
*/ */
private void parseHtml(String html, Attribution attr, private void parseHtml(String html, Attribution attr,
UrlLiveDataCallback callback) { UrlLiveDataCallback callback) {
...@@ -177,9 +175,9 @@ public class UrlLiveCrawler { ...@@ -177,9 +175,9 @@ public class UrlLiveCrawler {
logger.warn("DataCallback 对象为 null,无法保存数据"); logger.warn("DataCallback 对象为 null,无法保存数据");
} else { } else {
UrlLiveBean ulb = matchDel(html, attr, attr.getAttr().toString()); UrlLiveBean ulb = matchDel(html, attr, attr.getAttr().toString());
if(Objects.nonNull(ulb)) { if (Objects.nonNull(ulb)) {
callback.onData(ulb, attr); callback.onData(ulb, attr);
}else { } else {
callBack(callback, attr, -1, "程序无法判断"); callBack(callback, attr, -1, "程序无法判断");
} }
} }
...@@ -193,31 +191,31 @@ public class UrlLiveCrawler { ...@@ -193,31 +191,31 @@ public class UrlLiveCrawler {
* @param @return 设定文件 * @param @return 设定文件
* @return boolean 返回类型 * @return boolean 返回类型
*/ */
public UrlLiveBean matchDel(String result,Attribution attr,String url){ public UrlLiveBean matchDel(String result, Attribution attr, String url) {
try { try {
Document doc = Jsoup.parse(result); Document doc = Jsoup.parse(result);
String title = null; String title = null;
if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com") || url.contains("weixin.sogou.com")){ if (url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com") || url.contains("weixin.sogou.com")) {
title = doc.select("h2.rich_media_title").text().replaceAll(" ", ""); title = doc.select("h2.rich_media_title").text().replaceAll(" ", "");
if(Objects.isNull(title) || title.isEmpty()) { if (Objects.isNull(title) || title.isEmpty()) {
title = doc.select("p.title").text(); title = doc.select("p.title").text();
} }
if(Objects.isNull(title) || title.isEmpty()) { if (Objects.isNull(title) || title.isEmpty()) {
title = doc.select("h3.msg-title").text(); title = doc.select("h3.msg-title").text();
} }
if(Objects.isNull(title) || title.isEmpty()) { if (Objects.isNull(title) || title.isEmpty()) {
title = doc.select("div.global_error_msg.warn").text(); title = doc.select("div.global_error_msg.warn").text();
} }
if(Objects.isNull(title) || title.isEmpty()) { if (Objects.isNull(title) || title.isEmpty()) {
title = doc.select("div.warn").text(); title = doc.select("div.warn").text();
} }
if(Objects.isNull(title) || title.isEmpty()) { if (Objects.isNull(title) || title.isEmpty()) {
title = doc.select("p.tips").text(); title = doc.select("p.tips").text();
} }
if(Objects.isNull(title) || title.isEmpty()) { if (Objects.isNull(title) || title.isEmpty()) {
title = doc.select("h2").text(); title = doc.select("h2").text();
} }
if(Objects.isNull(title) || title.isEmpty()) { if (Objects.isNull(title) || title.isEmpty()) {
title = doc.select("div.weui-msg__text-area > h3").text(); title = doc.select("div.weui-msg__text-area > h3").text();
} }
// 获取title // 获取title
...@@ -226,73 +224,75 @@ public class UrlLiveCrawler { ...@@ -226,73 +224,75 @@ public class UrlLiveCrawler {
if (ma5.find()) { if (ma5.find()) {
title = ma5.group(1).replaceAll(" ", " ").trim(); title = ma5.group(1).replaceAll(" ", " ").trim();
} }
if(Objects.isNull(title) || title.isEmpty()) { if (Objects.isNull(title) || title.isEmpty()) {
if(result.contains("此帐号已被屏蔽, 内容无法查看") || result.contains("该公众号已迁移") || result.contains("此帐号已自主注销,内容无法查看") if (result.contains("此帐号已被屏蔽, 内容无法查看") || result.contains("该公众号已迁移") || result.contains("此帐号已自主注销,内容无法查看")
|| result.contains("此帐号处于帐号迁移流程中") || result.contains("该内容已被发布者删除") || result.contains("此内容被投诉且经审核涉嫌侵权")) { || result.contains("此帐号处于帐号迁移流程中") || result.contains("该内容已被发布者删除") || result.contains("此内容被投诉且经审核涉嫌侵权")) {
title = "网页已删除"; title = "网页已删除";
} }
} }
}else if(url.contains("kuaibao")){ } else if (url.contains("kuaibao")) {
title = doc.select("p.title").text().replaceAll(" ", ""); title = doc.select("p.title").text().replaceAll(" ", "");
}else if(url.contains("chinadaily.com.cn")){ } else if (url.contains("chinadaily.com.cn")) {
title = doc.select("p.style1").text().replaceAll(" ", ""); title = doc.select("p.style1").text().replaceAll(" ", "");
}else if(url.contains("baidu.com") || url.contains("hao123.com")) { } else if (url.contains("baidu.com") || url.contains("hao123.com")) {
title = doc.select("p#contaniner").text(); title = doc.select("p#contaniner").text();
}else if(url.contains("kanfanews.com")) { } else if (url.contains("kanfanews.com")) {
title = doc.select("p#tit").text(); title = doc.select("p#tit").text();
}else if(url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) { } else if (url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) {
title = "网页已删除"; title = "网页已删除";
}else if(url.contains("a.mp.uc.cn")) { } else if (url.contains("a.mp.uc.cn")) {
try { try {
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
title = json.getJSONObject("data").getString("title"); title = json.getJSONObject("data").getString("title");
} catch (Exception e) { } catch (Exception e) {
logger.error(" uc 数据 json 转换失败", e); logger.error(" uc 数据 json 转换失败", e);
} }
}else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) { } else if (url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) {
title = "网页已删除"; title = "网页已删除";
}else if(url.contains("zhihu.com")) { } else if (url.contains("zhihu.com")) {
JSONObject resultJson = JSONObject.parseObject(result); JSONObject resultJson = JSONObject.parseObject(result);
if(url.contains("/answer/")) { if (url.contains("/answer/")) {
title = resultJson.getJSONObject("question").getString("title"); title = resultJson.getJSONObject("question").getString("title");
}else if(url.contains("/question/") && !url.contains("/answer/") || url.contains("/p/")) { } else if (url.contains("/question/") && !url.contains("/answer/") || url.contains("/p/")) {
title = resultJson.getString("title"); title = resultJson.getString("title");
} }
}else if(url.contains("360kuai.com") && result.contains("您访问的文章走失了")) { } else if (url.contains("360kuai.com") && result.contains("您访问的文章走失了")) {
title = String.valueOf("404"); title = String.valueOf("404");
}else if(result.contains("文章没有找到哦") && url.contains("yidianzixun.com")) { } else if (result.contains("文章没有找到哦") && url.contains("yidianzixun.com")) {
title = "文章未找到"; title = "文章未找到";
}else if(url.contains("tznew.58.com/view")) { } else if (url.contains("tznew.58.com/view")) {
try { try {
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
title = json.getJSONObject("result").getString("title"); title = json.getJSONObject("result").getString("title");
} catch (Exception e) { } catch (Exception e) {
logger.error(" uc 数据 json 转换失败", e); logger.error(" uc 数据 json 转换失败", e);
} }
}else if(attr.getAttr().toString().contains("toutiao.com")) { } else if (attr.getAttr().toString().contains("toutiao.com")) {
if(result.contains("\"success\":false")) { if (result.contains("\"success\":false")) {
title = "网页已删除"; title = "网页已删除";
}else { } else {
title = String.valueOf(JSONPath.read(result, "$..title")); title = String.valueOf(JSONPath.read(result, "$..title"));
} }
}else if(url.contains("page.om.qq.com")) { } else if (url.contains("page.om.qq.com")) {
if(result.contains("内容被删除")) { if (result.contains("内容被删除")) {
title = "网页已删除"; title = "网页已删除";
} }
} else if (url.contains("m.dcdapp.com") && !result.contains("title")) {
title = "网页已删除";
} }
//若title 为拿到 用 此方法 //若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) { if (Objects.isNull(title) || title.length() < 1) {
title = doc.select("div.adiv > p > span").text().replaceAll(" ", ""); title = doc.select("div.adiv > p > span").text().replaceAll(" ", "");
} }
//若title 为拿到 用 此方法 //若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) { if (Objects.isNull(title) || title.length() < 1) {
title = doc.select("title").text().replaceAll(" ", ""); title = doc.select("title").text().replaceAll(" ", "");
} }
//若title 为拿到 用 此方法 //若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) { if (Objects.isNull(title) || title.length() < 1) {
title = doc.select("h1").text().replaceAll(" ", ""); title = doc.select("h1").text().replaceAll(" ", "");
} }
...@@ -301,8 +301,8 @@ public class UrlLiveCrawler { ...@@ -301,8 +301,8 @@ public class UrlLiveCrawler {
// title = "网页已删除"; // title = "网页已删除";
// } // }
if(Objects.nonNull(title) && title.length() > 1){ if (Objects.nonNull(title) && title.length() > 1) {
return new UrlLiveBean(attr.getAttr().toString(), isDelete(title),title); return new UrlLiveBean(attr.getAttr().toString(), isDelete(title), title);
} else { } else {
return null; return null;
} }
...@@ -312,28 +312,27 @@ public class UrlLiveCrawler { ...@@ -312,28 +312,27 @@ public class UrlLiveCrawler {
} }
/** /**
*
* @Description 标题判断
* @param title * @param title
* @return * @return
* @Description 标题判断
*/ */
private boolean isDelete(String title) { private boolean isDelete(String title) {
List<String> eList = Arrays.asList("系统出错","该内容已被发布者删除","网页已删除" List<String> eList = Arrays.asList("系统出错", "该内容已被发布者删除", "网页已删除"
,"此帐号已自主注销,内容无法查看","页面提示","正在维护中" , "此帐号已自主注销,内容无法查看", "页面提示", "正在维护中"
,"此文章被第三方评估为不实信息","财经头条","知识100题","502BadGateway" , "此文章被第三方评估为不实信息", "财经头条", "知识100题", "502BadGateway"
,"提示信息","跳转页","跳转中...","此帐号在冻结期,内容无法查看","东北新闻网" , "提示信息", "跳转页", "跳转中...", "此帐号在冻结期,内容无法查看", "东北新闻网"
,"百度一下,你就知道","帐号已迁移","手机百度","内容被删除","亚博国际|首页" , "百度一下,你就知道", "帐号已迁移", "手机百度", "内容被删除", "亚博国际|首页"
,"中国软件网","云广网","新浪首页","文章暂时找不到了","-法易网" , "中国软件网", "云广网", "新浪首页", "文章暂时找不到了", "-法易网"
,"【一点资讯】www.yidianzixun.com","错误页面","网站暂停通知","【快资讯】你的专属资讯平台" , "【一点资讯】www.yidianzixun.com", "错误页面", "网站暂停通知", "【快资讯】你的专属资讯平台"
,"百度新闻——全球最大的中文新闻平台","以上文章由以下机构判定为不实信息","该公众号已迁移" , "百度新闻——全球最大的中文新闻平台", "以上文章由以下机构判定为不实信息", "该公众号已迁移"
,"财经网-CAIJING.COM.CN","蚂蚁资讯","参数错误","时尚头条_YOKA时尚网","该文章已经被删除" , "财经网-CAIJING.COM.CN", "蚂蚁资讯", "参数错误", "时尚头条_YOKA时尚网", "该文章已经被删除"
,"网易","链接已过期","找不到页面","今晚网","该文章已被删除", "该回答已被删除-知乎", "资源不存在","文章未找到" , "网易", "链接已过期", "找不到页面", "今晚网", "该文章已被删除", "该回答已被删除-知乎", "资源不存在", "文章未找到"
, "UC头条", "该内容暂无法显示", "手机搜狐网", "此内容被投诉且经审核涉嫌侵权,无法查看。"); , "UC头条", "该内容暂无法显示", "手机搜狐网", "此内容被投诉且经审核涉嫌侵权,无法查看。");
List<String> cList = Arrays.asList("提示信息-","此内容因违规无法查看","微信公众号不存在" List<String> cList = Arrays.asList("提示信息-", "此内容因违规无法查看", "微信公众号不存在"
,"此内容被投诉且经审核涉嫌侵权,无法查看","thepageyourequestedwasnotfound","未知错误" , "此内容被投诉且经审核涉嫌侵权,无法查看", "thepageyourequestedwasnotfound", "未知错误"
,"Objectmoved","404","页面没有找到","页面未找到","301MovedPermanently","加载异常", , "Objectmoved", "404", "页面没有找到", "页面未找到", "301MovedPermanently", "加载异常",
"此帐号已被屏蔽, 内容无法查看","链接不存在", "新闻已删除", "视频去哪了呢"); "此帐号已被屏蔽, 内容无法查看", "链接不存在", "新闻已删除", "视频去哪了呢");
return cList.stream().anyMatch(title::contains) || eList.stream().anyMatch(title::equals); return cList.stream().anyMatch(title::contains) || eList.stream().anyMatch(title::equals);
} }
...@@ -341,14 +340,13 @@ public class UrlLiveCrawler { ...@@ -341,14 +340,13 @@ public class UrlLiveCrawler {
/** /**
* 处理知乎链接 * 处理知乎链接
* */
* */
private static String treatZhihuUrl(String url) { private static String treatZhihuUrl(String url) {
if(url.contains("/answer/")) { if (url.contains("/answer/")) {
url = "https://api.zhihu.com/answers/" + url.replaceAll(".*/answer/", ""); url = "https://api.zhihu.com/answers/" + url.replaceAll(".*/answer/", "");
}else if(url.contains("/question/") && !url.contains("/answer/")) { } else if (url.contains("/question/") && !url.contains("/answer/")) {
url = "https://api.zhihu.com/questions/" + url.replaceAll(".*/question/", ""); url = "https://api.zhihu.com/questions/" + url.replaceAll(".*/question/", "");
}else if(url.contains("/p/")) { } else if (url.contains("/p/")) {
url = "https://api.zhihu.com/articles/" + url.replaceAll(".*/p/", ""); url = "https://api.zhihu.com/articles/" + url.replaceAll(".*/p/", "");
} }
return url; return url;
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment