Commit 39b30f08 by yangchen

无效链接传入处理

parent 554dd201
...@@ -22,6 +22,8 @@ import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution; ...@@ -22,6 +22,8 @@ import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.UrlLiveDataCallback; import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.tools.httpclient.HeaderTool; import com.zhiwei.tools.httpclient.HeaderTool;
import okhttp3.Request;
/** /**
* *
* @ClassName UrlLiveCrawler * @ClassName UrlLiveCrawler
...@@ -65,35 +67,32 @@ public class UrlLiveCrawler { ...@@ -65,35 +67,32 @@ public class UrlLiveCrawler {
if(url.contains("www.toutiao.com")){ if(url.contains("www.toutiao.com")){
headers.put("referer", url); headers.put("referer", url);
} }
counter.add(); try {
httpBoot.asyncCall(RequestUtils.wrapGet(url, headers), ProxyHolder.NAT_PROXY).whenComplete((rs,ex) -> { Request request = RequestUtils.wrapGet(url, headers);
try { if(Objects.nonNull(request)) {
if (Objects.isNull(ex)) { counter.add();
if(rs.code() == 200) { httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY).whenComplete((rs,ex) -> {
parseHtml(rs.body().string(), attr, callback,counter); try {
}else { if (Objects.isNull(ex)) {
if(attr.getCount() > 2) { if(rs.code() == 200) {
parseHtml(rs.body().string(), attr, callback);
}else {
callBack(callback, attr, 1,String.valueOf(rs.code()));
}
} else {
callBack(callback, attr, 1,String.valueOf(rs.code())); callBack(callback, attr, 1,String.valueOf(rs.code()));
}else {
attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback);
} }
} catch (Exception e) {
logger.error(" 数据是否删除 采集出错 {} ",e);
}finally {
counter.done();
} }
} else { });
if(attr.getCount() > 3) { return counter;
callBack(callback, attr, -1,null);
logger.info("搜索结果访问失败: {}", ex);
}else {
attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback);
}
}
} catch (Exception e) {
logger.error(" 数据是否删除 采集出错 {} ",e);
}finally {
counter.done();
} }
}); } catch (Exception e2) {
logger.error("数据出错 {}" ,e2);
}
return counter; return counter;
} }
...@@ -150,7 +149,7 @@ public class UrlLiveCrawler { ...@@ -150,7 +149,7 @@ public class UrlLiveCrawler {
* @param callback * @param callback
*/ */
private void parseHtml(String html, Attribution attr, private void parseHtml(String html, Attribution attr,
UrlLiveDataCallback callback,GroupSync counter) { UrlLiveDataCallback callback) {
if (callback == null) { if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据"); logger.warn("DataCallback 对象为 null,无法保存数据");
} else { } else {
...@@ -158,12 +157,7 @@ public class UrlLiveCrawler { ...@@ -158,12 +157,7 @@ public class UrlLiveCrawler {
if(Objects.nonNull(ulb)) { if(Objects.nonNull(ulb)) {
callback.onData(ulb, attr); callback.onData(ulb, attr);
}else { }else {
if(attr.getCount() > 3) { callBack(callback, attr, -1,null);
callBack(callback, attr, -1,null);
}else {
attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback);
}
} }
} }
} }
...@@ -177,72 +171,76 @@ public class UrlLiveCrawler { ...@@ -177,72 +171,76 @@ public class UrlLiveCrawler {
* @return boolean 返回类型 * @return boolean 返回类型
*/ */
public UrlLiveBean matchDel(String result,Attribution attr,String url){ public UrlLiveBean matchDel(String result,Attribution attr,String url){
Document doc = Jsoup.parse(result); try {
String title = null; Document doc = Jsoup.parse(result);
if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com")){ String title = null;
title = doc.select("h2.rich_media_title").text().replaceAll(" ", ""); if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com")){
if(Objects.isNull(title) || title.isEmpty()) { title = doc.select("h2.rich_media_title").text().replaceAll(" ", "");
title = doc.select("p.title").text(); if(Objects.isNull(title) || title.isEmpty()) {
} title = doc.select("p.title").text();
if(Objects.isNull(title) || title.isEmpty()) { }
title = doc.select("h3.msg-title").text(); if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("h3.msg-title").text();
}
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("div.global_error_msg.warn").text();
}
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("p.tips").text();
}
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("h2").text();
}
}else if(url.contains("kuaibao")){
title = doc.select("p.title").text().replaceAll(" ", "");
}else if(url.contains("chinadaily.com.cn")){
title = doc.select("p.style1").text().replaceAll(" ", "");
}else if(url.contains("baidu.com") || url.contains("hao123.com")) {
title = doc.select("p#contaniner").text();
}else if(url.contains("kanfanews.com")) {
title = doc.select("p#tit").text();
}else if(url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) {
title = "网页已删除";
}else if(url.contains("a.mp.uc.cn")) {
try {
JSONObject json = JSONObject.parseObject(result);
title = json.getJSONObject("data").getString("title");
if(Objects.isNull(title) || title.length() < 1) {
title = "网页已删除";
}
} catch (Exception e) {
logger.error(" uc 数据 json 转换失败", e);
}
}else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) {
title = "网页已删除";
} }
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("div.global_error_msg.warn").text(); //若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("div.adiv > p > span").text().replaceAll(" ", "");
} }
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("p.tips").text(); //若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("title").text().replaceAll(" ", "");
} }
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("h2").text(); //若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("h1").text().replaceAll(" ", "");
} }
}else if(url.contains("kuaibao")){
title = doc.select("p.title").text().replaceAll(" ", ""); //若title 为拿到 用 此方法
}else if(url.contains("chinadaily.com.cn")){ if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
title = doc.select("p.style1").text().replaceAll(" ", ""); title = "网页已删除";
}else if(url.contains("baidu.com") || url.contains("hao123.com")) {
title = doc.select("p#contaniner").text();
}else if(url.contains("kanfanews.com")) {
title = doc.select("p#tit").text();
}else if(url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) {
title = "网页已删除";
}else if(url.contains("a.mp.uc.cn")) {
try {
JSONObject json = JSONObject.parseObject(result);
title = json.getJSONObject("data").getString("title");
if(Objects.isNull(title) || title.length() < 1) {
title = "网页已删除";
}
} catch (Exception e) {
logger.error(" uc 数据 json 转换失败", e);
} }
}else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) {
title = "网页已删除";
}
//若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("div.adiv > p > span").text().replaceAll(" ", "");
}
//若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("title").text().replaceAll(" ", "");
}
//若title 为拿到 用 此方法 if(Objects.nonNull(title) && title.length() > 1){
if(Objects.isNull(title) || title.length() < 1) { return new UrlLiveBean(attr.getAttr().toString(), isDelete(title),title);
title = doc.select("h1").text().replaceAll(" ", ""); } else {
} return null;
}
//若title 为拿到 用 此方法 } catch (Exception e) {
if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
title = "网页已删除";
}
if(Objects.nonNull(title) && title.length() > 1){
return new UrlLiveBean(attr.getAttr().toString(), isDelete(title),title);
} else {
return null; return null;
} }
} }
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment