Commit a94682af by chenweiyang

删除误导性判断

parent 7f7e4a1c
......@@ -2,6 +2,9 @@ package com.zhiwei.source_forward.crawler;
import static java.util.Objects.nonNull;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.Proxy.Type;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
......@@ -52,6 +55,7 @@ public class UrlLiveCrawler {
counter.add();
if (nonNull(url)) {
try {
// ZhiWeiTools.sleep(3000);
search(counter, url, Attribution.of(url, 1), callback);
} catch (Exception e) {
logger.error("搜索创建出错:", e);
......@@ -71,7 +75,7 @@ public class UrlLiveCrawler {
ProxyHolder ph = null;
if(url.contains("toutiao.com")){
headers.put("referer", url);
// headers.put("cookie", "__ac_nonce=05ed0c7bb00bc34aa36be; __ac_signature=0fFbMAAgEBBBDtmbXG3W-tHxWiAAI8q; ttcid=cfbee5ddf00b4013b5236b534c8cf36c19; tt_webid=6832180195202909704; s_v_web_id=verify_kary2om5_954yc9QS_twaQ_42XG_9Sei_dsAVEudiEodo; __tasessionId=4bmcvzruo1590740924839; tt_webid=6832180195202909704; SLARDAR_WEB_ID=fb4d8abf-bdd7-4e9e-ba38-8c00f0c13846; csrftoken=6430b380cc664479dfa0b0e5061b2db9; tt_scid=kRdSxPldqsXGPvYrxh3K4HZ5ayX0isXRzk08ZTjlIGmNW3HaSLrhBfHJ.CRjNom.b0fe");
// headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; tt_webid=6833273737980659213; tt_webid=6833273737980659213; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_scid=KdPOCLtoSVDQTnptuiejH4SkyYa7RodIcBHFpAGwf17X9rUWJJadFYALAeJ5C8xI71e5; __ac_nonce=05ee037380054152ddc38; __ac_signature=6C1-YAAgEBB40vzLiGE95-gsf3AALbYjxEHG0FQERCcxB-9tebz.fovM7gew-AHObLDUegpmF7k8G57XzXokCbi72klNkdvS.ukzrfuuFk3UL836QudGNHE6IJQ47kFRkiT; __tasessionId=nz5ags6bk1591752505915");
headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
headers.put("accept-encoding", "gzip, deflate, br");
headers.put("accept-language", "zh-CN,zh;q=0.9");
......@@ -91,19 +95,20 @@ public class UrlLiveCrawler {
Request request = RequestUtils.wrapGet(url, headers);
if(Objects.nonNull(request)) {
counter.add();
httpBoot.asyncCall(request, ph).whenComplete((rs,ex) -> {
// , new Proxy(Type.HTTP, new InetSocketAddress("119.3.38.9", 31128))
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
if(rs.isSuccessful()) {
parseHtml(rs.body().string(), attr, callback);
}else if(rs.code() == 403){
callBack(callback, attr, -1, String.valueOf(rs.code()));
}else {
}else if(rs.code() == 404){
callBack(callback, attr, 1, String.valueOf(rs.code()));
}else {
callBack(callback, attr, -1, "程序无法判断");
}
} else {
logger.error("e", ex);
callBack(callback, attr, 1,"未访问成功");
callBack(callback, attr, -1, "程序无法判断");
}
} catch (Exception e) {
logger.error(" 数据是否删除 采集出错 {} ",e);
......@@ -157,6 +162,9 @@ public class UrlLiveCrawler {
}
}else if(url.contains("a.mp.uc.cn") && url.contains("wm_aid")) {
url = "http://ff.dayu.com/contents/origin/"+url.split("wm_aid=")[1].split("!!wm_id")[0]+"?biz_id=1002&_fetch_author=1&_fetch_incrs=1";
}else if(url.contains("tznew.58.com/view") && url.contains("infoid=")) {
// https://tznew.58.com/view/c/sharingDetailNew?infoid=117073473
url = "https://tznew.58.com/tznew/c/info-detail?infoid=" + url.split("infoid=")[1].split("&")[0];
}
return url;
} catch (Exception e) {
......@@ -180,7 +188,7 @@ public class UrlLiveCrawler {
if(Objects.nonNull(ulb)) {
callback.onData(ulb, attr);
}else {
callBack(callback, attr, -1,null);
callBack(callback, attr, -1, "程序无法判断");
}
}
}
......@@ -234,9 +242,6 @@ public class UrlLiveCrawler {
try {
JSONObject json = JSONObject.parseObject(result);
title = json.getJSONObject("data").getString("title");
if(Objects.isNull(title) || title.length() < 1) {
title = "网页已删除";
}
} catch (Exception e) {
logger.error(" uc 数据 json 转换失败", e);
}
......@@ -253,6 +258,13 @@ public class UrlLiveCrawler {
title = String.valueOf("404");
}else if(result.contains("文章没有找到哦") && url.contains("yidianzixun.com")) {
title = "文章未找到";
}else if(url.contains("tznew.58.com/view")) {
try {
JSONObject json = JSONObject.parseObject(result);
title = json.getJSONObject("result").getString("title");
} catch (Exception e) {
logger.error(" uc 数据 json 转换失败", e);
}
}
//若title 为拿到 用 此方法
......@@ -270,10 +282,10 @@ public class UrlLiveCrawler {
title = doc.select("h1").text().replaceAll(" ", "");
}
//若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
title = "网页已删除";
}
//若title 为拿到 用 此方法 无法获取标题不进行程序迷惑性判断
// if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
// title = "网页已删除";
// }
if(Objects.nonNull(title) && title.length() > 1){
return new UrlLiveBean(attr.getAttr().toString(), isDelete(title),title);
......@@ -301,12 +313,13 @@ public class UrlLiveCrawler {
,"【一点资讯】www.yidianzixun.com","错误页面","网站暂停通知","【快资讯】你的专属资讯平台"
,"百度新闻——全球最大的中文新闻平台","以上文章由以下机构判定为不实信息","该公众号已迁移"
,"财经网-CAIJING.COM.CN","蚂蚁资讯","参数错误","时尚头条_YOKA时尚网","该文章已经被删除"
,"网易","链接已过期","找不到页面","今晚网","该文章已被删除", "该回答已被删除-知乎", "资源不存在","文章未找到");
,"网易","链接已过期","找不到页面","今晚网","该文章已被删除", "该回答已被删除-知乎", "资源不存在","文章未找到"
, "UC头条");
List<String> cList = Arrays.asList("提示信息-","此内容因违规无法查看","微信公众号不存在"
,"此内容被投诉且经审核涉嫌侵权,无法查看","thepageyourequestedwasnotfound","未知错误"
,"Objectmoved","404","页面没有找到","页面未找到","301MovedPermanently","加载异常",
"此帐号已被屏蔽, 内容无法查看","链接不存在");
"此帐号已被屏蔽, 内容无法查看","链接不存在", "新闻已删除");
return cList.stream().anyMatch(title::contains) || eList.stream().anyMatch(title::equals);
}
......
......@@ -72,8 +72,8 @@ public class URLLive {
public static void main(String[] args) {
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
urlList.add("http://www.toutiao.com/a1665677841741827");
// urlList.add("https://mp.weixin.qq.com/s?__biz=MzA3NjgyNTU5Nw==&mid=2247486586&idx=2&sn=419218b3c831b17d2b9bd9a5281ea842&scene=6#wechat_redirect");
// urlList.add("http://www.toutiao.com/a1665677841741827");
urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh");
List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
for(UrlLiveBean b : u) {
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment