Commit 39b30f08 by yangchen

无效链接传入处理

parent 554dd201
......@@ -22,6 +22,8 @@ import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.tools.httpclient.HeaderTool;
import okhttp3.Request;
/**
*
* @ClassName UrlLiveCrawler
......@@ -65,35 +67,32 @@ public class UrlLiveCrawler {
if(url.contains("www.toutiao.com")){
headers.put("referer", url);
}
counter.add();
httpBoot.asyncCall(RequestUtils.wrapGet(url, headers), ProxyHolder.NAT_PROXY).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
if(rs.code() == 200) {
parseHtml(rs.body().string(), attr, callback,counter);
}else {
if(attr.getCount() > 2) {
try {
Request request = RequestUtils.wrapGet(url, headers);
if(Objects.nonNull(request)) {
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
if(rs.code() == 200) {
parseHtml(rs.body().string(), attr, callback);
}else {
callBack(callback, attr, 1,String.valueOf(rs.code()));
}
} else {
callBack(callback, attr, 1,String.valueOf(rs.code()));
}else {
attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback);
}
} catch (Exception e) {
logger.error(" 数据是否删除 采集出错 {} ",e);
}finally {
counter.done();
}
} else {
if(attr.getCount() > 3) {
callBack(callback, attr, -1,null);
logger.info("搜索结果访问失败: {}", ex);
}else {
attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback);
}
}
} catch (Exception e) {
logger.error(" 数据是否删除 采集出错 {} ",e);
}finally {
counter.done();
});
return counter;
}
});
} catch (Exception e2) {
logger.error("数据出错 {}" ,e2);
}
return counter;
}
......@@ -150,7 +149,7 @@ public class UrlLiveCrawler {
* @param callback
*/
private void parseHtml(String html, Attribution attr,
UrlLiveDataCallback callback,GroupSync counter) {
UrlLiveDataCallback callback) {
if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据");
} else {
......@@ -158,12 +157,7 @@ public class UrlLiveCrawler {
if(Objects.nonNull(ulb)) {
callback.onData(ulb, attr);
}else {
if(attr.getCount() > 3) {
callBack(callback, attr, -1,null);
}else {
attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback);
}
callBack(callback, attr, -1,null);
}
}
}
......@@ -177,72 +171,76 @@ public class UrlLiveCrawler {
* @return boolean 返回类型
*/
public UrlLiveBean matchDel(String result,Attribution attr,String url){
Document doc = Jsoup.parse(result);
String title = null;
if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com")){
title = doc.select("h2.rich_media_title").text().replaceAll(" ", "");
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("p.title").text();
}
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("h3.msg-title").text();
try {
Document doc = Jsoup.parse(result);
String title = null;
if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com")){
title = doc.select("h2.rich_media_title").text().replaceAll(" ", "");
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("p.title").text();
}
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("h3.msg-title").text();
}
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("div.global_error_msg.warn").text();
}
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("p.tips").text();
}
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("h2").text();
}
}else if(url.contains("kuaibao")){
title = doc.select("p.title").text().replaceAll(" ", "");
}else if(url.contains("chinadaily.com.cn")){
title = doc.select("p.style1").text().replaceAll(" ", "");
}else if(url.contains("baidu.com") || url.contains("hao123.com")) {
title = doc.select("p#contaniner").text();
}else if(url.contains("kanfanews.com")) {
title = doc.select("p#tit").text();
}else if(url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) {
title = "网页已删除";
}else if(url.contains("a.mp.uc.cn")) {
try {
JSONObject json = JSONObject.parseObject(result);
title = json.getJSONObject("data").getString("title");
if(Objects.isNull(title) || title.length() < 1) {
title = "网页已删除";
}
} catch (Exception e) {
logger.error(" uc 数据 json 转换失败", e);
}
}else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) {
title = "网页已删除";
}
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("div.global_error_msg.warn").text();
//若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("div.adiv > p > span").text().replaceAll(" ", "");
}
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("p.tips").text();
//若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("title").text().replaceAll(" ", "");
}
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("h2").text();
//若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("h1").text().replaceAll(" ", "");
}
}else if(url.contains("kuaibao")){
title = doc.select("p.title").text().replaceAll(" ", "");
}else if(url.contains("chinadaily.com.cn")){
title = doc.select("p.style1").text().replaceAll(" ", "");
}else if(url.contains("baidu.com") || url.contains("hao123.com")) {
title = doc.select("p#contaniner").text();
}else if(url.contains("kanfanews.com")) {
title = doc.select("p#tit").text();
}else if(url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) {
title = "网页已删除";
}else if(url.contains("a.mp.uc.cn")) {
try {
JSONObject json = JSONObject.parseObject(result);
title = json.getJSONObject("data").getString("title");
if(Objects.isNull(title) || title.length() < 1) {
title = "网页已删除";
}
} catch (Exception e) {
logger.error(" uc 数据 json 转换失败", e);
//若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
title = "网页已删除";
}
}else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) {
title = "网页已删除";
}
//若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("div.adiv > p > span").text().replaceAll(" ", "");
}
//若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("title").text().replaceAll(" ", "");
}
//若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("h1").text().replaceAll(" ", "");
}
//若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
title = "网页已删除";
}
if(Objects.nonNull(title) && title.length() > 1){
return new UrlLiveBean(attr.getAttr().toString(), isDelete(title),title);
} else {
if(Objects.nonNull(title) && title.length() > 1){
return new UrlLiveBean(attr.getAttr().toString(), isDelete(title),title);
} else {
return null;
}
} catch (Exception e) {
return null;
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment