Commit b8ed38f4 by chenweiyang

链接是否删除部分修改

parents bd0353ac 7003572f
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId> <artifactId>source-forward</artifactId>
<version>0.2.7-SNAPSHOT</version> <version>0.2.8-SNAPSHOT</version>
<name>source-forward</name> <name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description> <description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import java.util.Objects; import java.util.Objects;
import org.apache.logging.log4j.LogManager; import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.source_forward.bean.ContentBean; import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.ContentBean.Attribution; import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.util.ContentDataCallback; import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.MatchContent; import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent;
import okhttp3.Request;
import okhttp3.Request;
public class ContentCrawler {
public class ContentCrawler {
private static Logger logger = LogManager.getLogger(ContentCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build(); private static Logger logger = LogManager.getLogger(ContentCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/**
* /**
* @Description 链接传入 并 返回采集完信号 *
* @param callback * @Description 链接传入 并 返回采集完信号
* @param urls * @param callback
* @return * @param urls
* @throws Exception * @return
*/ * @throws Exception
public GroupSync submitTask(ContentDataCallback callback, */
String... urls) { public GroupSync submitTask(ContentDataCallback callback,
GroupSync counter = new GroupSync(); String... urls) {
start(counter, callback, urls); GroupSync counter = new GroupSync();
return counter; start(counter, callback, urls);
} return counter;
}
/**
* /**
* @Description 提交链接 *
* @param counter * @Description 提交链接
* @param callback * @param counter
* @param urls * @param callback
*/ * @param urls
private void start(GroupSync counter, */
ContentDataCallback callback, String... urls) { private void start(GroupSync counter,
if (urls != null && urls.length > 0) { ContentDataCallback callback, String... urls) {
for (String url : urls) { if (urls != null && urls.length > 0) {
if (url != null) { for (String url : urls) {
try { ZhiWeiTools.sleep(100);
search(counter, url, Attribution.of(url), callback); if (url != null) {
} catch (Exception e) { try {
logger.error("搜索创建出错", e); search(counter, url, Attribution.of(url), callback);
} } catch (Exception e) {
} logger.error("搜索创建出错", e);
} }
} }
} }
}
/** }
*
* @Description 链接获取文章信息 /**
* @param counter *
* @param url * @Description 链接获取文章信息
* @param attr * @param counter
* @param callback * @param url
* @return * @param attr
*/ * @param callback
private GroupSync search(GroupSync counter, * @return
String url, Attribution attr, ContentDataCallback callback) { */
logger.info("当前处理 URL: {}", url); private GroupSync search(GroupSync counter,
Request request = RequestUtils.wrapGet(url); String url, Attribution attr, ContentDataCallback callback) {
counter.add(); logger.info("当前处理 URL: {}", url);
Request request = RequestUtils.wrapGet(url);
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> { counter.add();
try {
if (Objects.isNull(ex)) { httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
parseHtml(rs.body().string(), attr, callback); try {
} else { if (Objects.isNull(ex)) {
logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex); parseHtml(rs.body().string(), attr, callback);
} } else {
} catch (Exception e) { logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex);
logger.info("搜索结果访问失败: {}", ex); }
} finally { } catch (Exception e) {
counter.done(); logger.info("搜索结果访问失败: {}", ex);
} } finally {
counter.done();
}); }
return counter; });
}
return counter;
/** }
*
* /**
* @Description 获取正文解析 *
* @param response *
* @param attr * @Description 获取正文解析
* @param callback * @param response
*/ * @param attr
private void parseHtml(String result, Attribution attr, * @param callback
ContentDataCallback callback) { */
try { private void parseHtml(String result, Attribution attr,
String content = MatchContent.matchContent(attr.get().toString(), ContentDataCallback callback) {
result); try {
ContentBean cb = new ContentBean(attr.get().toString(), content); String content = MatchContent.matchContent(attr.get().toString(),
if (callback == null) { result);
logger.warn("DataCallback 对象为 null,无法保存数据"); ContentBean cb = new ContentBean(attr.get().toString(), content);
} else { if (callback == null) {
callback.onData(cb, attr); logger.warn("DataCallback 对象为 null,无法保存数据");
} } else {
} catch (Exception e) { callback.onData(cb, attr);
logger.error("网页链接失效", e); }
} } catch (Exception e) {
logger.error("网页链接失效", e);
} }
} }
}
...@@ -5,6 +5,7 @@ import java.util.List; ...@@ -5,6 +5,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
...@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler { ...@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler {
private void start(GroupSync counter,MediaSelfSourceDataCallBack callback, String... urls) { private void start(GroupSync counter,MediaSelfSourceDataCallBack callback, String... urls) {
if (urls != null && urls.length > 0) { if (urls != null && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
ZhiWeiTools.sleep(100);
counter.add(); counter.add();
if (url != null) { if (url != null) {
try { try {
...@@ -89,12 +91,9 @@ public class MediaSelfSourceCrawler { ...@@ -89,12 +91,9 @@ public class MediaSelfSourceCrawler {
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) { private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", attr.get()); logger.info("当前处理 URL: {}", attr.get());
Map<String,Object> map = new HashMap<>(); Map<String,Object> map = new HashMap<>();
ProxyHolder ph = null; ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY;
if(url.contains("toutiao.com")) { if(url.contains("toutiao.com")) {
map.put("referer", url); map.put("referer", url);
ph = ProxyHolder.SOUGOU_OUTER_PROXY;
}else {
ph = ProxyHolder.NAT_HEAVY_PROXY;
} }
url = dealUrl(url); url = dealUrl(url);
if(Objects.nonNull(url)) { if(Objects.nonNull(url)) {
...@@ -168,7 +167,6 @@ public class MediaSelfSourceCrawler { ...@@ -168,7 +167,6 @@ public class MediaSelfSourceCrawler {
String url = attr.get().toString(); String url = attr.get().toString();
try { try {
source = MatchSource.matchMediaSelfSource(url + eUrl,result); source = MatchSource.matchMediaSelfSource(url + eUrl,result);
logger.info(url+"=======" + source);
channel = MatchChannel.verifyChannel(url); channel = MatchChannel.verifyChannel(url);
if(channel==null){ if(channel==null){
List<Node> nodeList = Jsoup.parse(result).head().childNodes(); List<Node> nodeList = Jsoup.parse(result).head().childNodes();
......
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync; import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.SourceForwardBean; import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution; import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel; import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource; import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.SourceData; import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack; import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
import okhttp3.Request;
public class SourceForwardCrawler {
public class SourceForwardCrawler {
private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class);
private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static List<String> sourceList = SourceData.getSourceList(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static List<String> sourceList = SourceData.getSourceList();
public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) {
try { public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) {
GroupSync counter = new GroupSync(); try {
start(counter, callback, urls); GroupSync counter = new GroupSync();
return counter; start(counter, callback, urls);
} catch (Exception e) { return counter;
logger.error(" exception ", e); } catch (Exception e) {
return null; logger.error(" exception ", e);
} return null;
} }
}
private void start(GroupSync counter,SourceForwardDataCallBack callback, String... urls) {
if (urls != null && urls.length > 0) { private void start(GroupSync counter,SourceForwardDataCallBack callback, String... urls) {
for (String url : urls) { if (urls != null && urls.length > 0) {
counter.add(); for (String url : urls) {
if (url != null) { counter.add();
try { ZhiWeiTools.sleep(100);
search(counter, url, Attribution.of(url), callback); if (url != null) {
} catch (Exception e) { try {
logger.error("搜索创建出错", e); search(counter, url, Attribution.of(url), callback);
} } catch (Exception e) {
} logger.error("搜索创建出错", e);
counter.done(); }
} }
} counter.done();
} }
}
private GroupSync search(GroupSync counter, String url,Attribution attr, SourceForwardDataCallBack callback) { }
logger.info("当前处理 URL: {}", url);
Map<String,String> headers = new HashMap<>(); private GroupSync search(GroupSync counter, String url,Attribution attr, SourceForwardDataCallBack callback) {
// Map<String,String> headers = HeaderTool.getCommonHead(); logger.info("当前处理 URL: {}", url);
if(url.contains("www.toutiao.com")){ Map<String,String> headers = new HashMap<>();
headers.put("referer", url); // Map<String,String> headers = HeaderTool.getCommonHead();
} if(url.contains("www.toutiao.com")){
if(url.contains("china.prcfe.com")) { headers.put("referer", url);
url = "http://china.prcfe.com/e/extend/ShowSource/?id=" + url.split("/")[url.split("/").length-1].split("\\.")[0]; }
} if(url.contains("china.prcfe.com")) {
Request request = RequestUtils.wrapGet(url, headers); url = "http://china.prcfe.com/e/extend/ShowSource/?id=" + url.split("/")[url.split("/").length-1].split("\\.")[0];
counter.add(); }
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> { if(url.contains("gu.qq.com")) {
try { String id = url.split("\\?id=")[1];
if (Objects.isNull(ex)) { url = "https://snp.tenpay.com/cgi-bin/snpgw_unified_newsinfo.fcgi?&filter=0&zappid=zxg_h5&sign=b2aceeb8a8ef093862608d806c1d6ab8&nonce=8464&reserve=1572995&&channel=zxg&user_openid=undefined&user_skey=undefined&&news_id=" + id;
parseHtml(rs.body().string(), attr, callback); headers.put("referer", "https://gu.qq.com/resources/shy/news/detail-v2/index.html");
} else { }
logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex); Request request = RequestUtils.wrapGet(url, headers);
} counter.add();
} catch (Exception e1) { httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
logger.error("解析出错",e1); try {
} finally { if (Objects.isNull(ex)) {
counter.done(); parseHtml(rs.body().string(), attr, callback);
} } else {
}); logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
return counter; }
} } catch (Exception e1) {
logger.error("解析出错",e1);
private void parseHtml(String body, Attribution attr, } finally {
SourceForwardDataCallBack callback) { counter.done();
String source = null; }
String channel = "新闻"; });
String isforward = "未知"; return counter;
try { }
Document document = Jsoup.parse(body);
if(attr.get().toString().contains("mp.weixin.qq.com")){ private void parseHtml(String body, Attribution attr,
isforward = document.select("div#meta_content").select("span#copyright_logo").text(); SourceForwardDataCallBack callback) {
if(isforward.contains("原创")){ String source = null;
isforward = "原创"; String channel = "新闻";
}else { String isforward = "未知";
isforward = "未知"; try {
} if(attr.get().toString().contains("mp.weixin.qq.com")){
}else if(attr.get().toString().contains("www.toutiao.com")){ Document document = Jsoup.parse(body);
if(body.contains("isOriginal") && body.contains("isOriginal: true")){ isforward = document.select("div#meta_content").select("span#copyright_logo").text();
isforward = "原创"; if(isforward.contains("原创")){
} isforward = "原创";
}else{ }else {
channel = MatchChannel.verifyChannel(attr.get().toString()); isforward = "未知";
if(channel==null){ }
List<Node> nodeList = document.head().childNodes(); }else if(attr.get().toString().contains("www.toutiao.com")){
channel = MatchChannel.matchChannel(nodeList); if(body.contains("isOriginal") && body.contains("isOriginal: true")){
} isforward = "原创";
source = MatchSource.matchSource(attr.get().toString(),document.toString(), sourceList); }
} }else if(attr.get().toString().contains("snp.tenpay.com") || attr.get().toString().contains("gu.qq.com")){
} catch (Exception e) { if(body.contains("source")){
source = null; source = body.split("\"source\":\"")[1].split("\"")[0];
channel = "新闻"; }
} }else{
logger.info(attr.get().toString()+"======="+channel+"================="+source); Document document = Jsoup.parse(body);
SourceForwardBean sfb = new SourceForwardBean(attr.get().toString(), channel, source,isforward); source = MatchSource.matchSource(attr.get().toString(),document.toString(), sourceList);
if (callback == null) { channel = MatchChannel.verifyChannel(attr.get().toString());
logger.warn("DataCallback 对象为 null,无法保存数据"); if(channel==null){
} else { List<Node> nodeList = document.head().childNodes();
callback.onData(sfb, attr); channel = MatchChannel.matchChannel(nodeList);
} }
} }
} catch (Exception e) {
} e.printStackTrace();
source = null;
channel = "新闻";
}
logger.info(attr.get().toString()+"======="+channel+"================="+source);
SourceForwardBean sfb = new SourceForwardBean(attr.get().toString(), channel, source,isforward);
if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据");
} else {
callback.onData(sfb, attr);
}
}
}
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import static java.util.Objects.nonNull; import static java.util.Objects.nonNull;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.async.GroupSync; import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean; import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution; import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.UrlLiveDataCallback; import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
/** /**
* *
* @ClassName UrlLiveCrawler * @ClassName UrlLiveCrawler
* @Description 判断页面是否存在 * @Description 判断页面是否存在
* @author byte-zbs * @author byte-zbs
* @Date 2018年8月20日 下午3:34:57 * @Date 2018年8月20日 下午3:34:57
* @version 1.0.0 * @version 1.0.0
*/ */
public class UrlLiveCrawler { public class UrlLiveCrawler {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class); private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(true).build(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(true).build();
public GroupSync submitTask(UrlLiveDataCallback callback,String... urls) { public GroupSync submitTask(UrlLiveDataCallback callback,String... urls) {
GroupSync counter = new GroupSync(); GroupSync counter = new GroupSync();
start(counter, callback, urls); start(counter, callback, urls);
return counter; return counter;
} }
private void start(GroupSync counter,UrlLiveDataCallback callback, String... urls) { private void start(GroupSync counter,UrlLiveDataCallback callback, String... urls) {
if (nonNull(urls) && urls.length > 0) { if (nonNull(urls) && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
counter.add(); counter.add();
if (nonNull(url)) { ZhiWeiTools.sleep(100);
try { if (nonNull(url)) {
search(counter, url, Attribution.of(url, 1), callback); try {
} catch (Exception e) { // ZhiWeiTools.sleep(3000);
logger.error("搜索创建出错:", e); search(counter, url, Attribution.of(url, 1), callback);
} } catch (Exception e) {
} logger.error("搜索创建出错:", e);
counter.done(); }
} }
} counter.done();
} }
}
private GroupSync search(GroupSync counter, String url, }
Attribution attr, UrlLiveDataCallback callback) {
// System.out.println(url); private GroupSync search(GroupSync counter, String url,
url = dealUrl(url); Attribution attr, UrlLiveDataCallback callback) {
logger.info("当前处理 URL: {}", url); // System.out.println(url);
Map<String,String> headers = new HashMap<>(); url = dealUrl(url);
ProxyHolder ph = null; logger.info("当前处理 URL: {}", url);
if(url.contains("toutiao.com")){ Map<String,String> headers = new HashMap<>();
// headers.put("referer", url); ProxyHolder ph = null;
// headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_webid=6837283963338622477; tt_webid=6837283963338622477; __tasessionId=dsei2aty41591951911851; tt_scid=M--AJ-FYwZ0qcYTzQCLyMeS5MlykLS6ktMkvqKKJmq-ghRxX4waEBhJ3YbheuNmi2b8a"); if(url.contains("toutiao.com")){
// headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"); // headers.put("referer", url);
// headers.put("accept-encoding", "gzip, deflate, br"); // headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_webid=6837283963338622477; tt_webid=6837283963338622477; __tasessionId=dsei2aty41591951911851; tt_scid=M--AJ-FYwZ0qcYTzQCLyMeS5MlykLS6ktMkvqKKJmq-ghRxX4waEBhJ3YbheuNmi2b8a");
// headers.put("accept-language", "zh-CN,zh;q=0.9"); // headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
// headers.put("cache-control", "no-cache"); // headers.put("accept-encoding", "gzip, deflate, br");
// headers.put("sec-fetch-dest", "document"); // headers.put("accept-language", "zh-CN,zh;q=0.9");
// headers.put("sec-fetch-mode", "navigate"); // headers.put("cache-control", "no-cache");
// headers.put("sec-fetch-site", "same-origin"); // headers.put("sec-fetch-dest", "document");
// headers.put("sec-fetch-user", "?1"); // headers.put("sec-fetch-mode", "navigate");
// headers.put("upgrade-insecure-requests", "1"); // headers.put("sec-fetch-site", "same-origin");
// headers.put("user-agent", "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Mobile Safari/537.36"); // headers.put("sec-fetch-user", "?1");
ph = ProxyHolder.NAT_HEAVY_PROXY; // headers.put("upgrade-insecure-requests", "1");
}else if(url.contains("zhihu.com")) { // headers.put("user-agent", "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Mobile Safari/537.36");
url = treatZhihuUrl(url); ph = ProxyHolder.NAT_HEAVY_PROXY;
ph = ProxyHolder.NAT_HEAVY_PROXY; }else if(url.contains("zhihu.com")) {
} url = treatZhihuUrl(url);
try { ph = ProxyHolder.NAT_HEAVY_PROXY;
Request request = RequestUtils.wrapGet(url, headers); }
if(Objects.nonNull(request)) { try {
counter.add(); Request request = RequestUtils.wrapGet(url, headers);
// , new Proxy(Type.HTTP, new InetSocketAddress("119.3.38.9", 31128)) if(Objects.nonNull(request)) {
httpBoot.asyncCall(request, ph).whenComplete((rs,ex) -> { counter.add();
try { // , new Proxy(Type.HTTP, new InetSocketAddress("119.3.38.9", 31128))
if (Objects.isNull(ex)) { httpBoot.asyncCall(request, ph).whenComplete((rs,ex) -> {
if(rs.isSuccessful()) { try {
parseHtml(rs.body().string(), attr, callback); if (Objects.isNull(ex)) {
}else if(rs.code() == 404){ if(rs.isSuccessful()) {
callBack(callback, attr, 1, String.valueOf(rs.code())); parseHtml(rs.body().string(), attr, callback);
}else { }else if(rs.code() == 404){
callBack(callback, attr, -1, "程序无法判断"); callBack(callback, attr, 1, String.valueOf(rs.code()));
} }else {
} else { callBack(callback, attr, -1, "程序无法判断");
logger.error("e", ex); }
callBack(callback, attr, -1, "程序无法判断"); } else {
} logger.error("e", ex);
} catch (Exception e) { callBack(callback, attr, -1, "程序无法判断");
logger.error(" 数据是否删除 采集出错 {} ",e); }
}finally { } catch (Exception e) {
counter.done(); logger.error(" 数据是否删除 采集出错 {} ",e);
} }finally {
}); counter.done();
return counter; }
} });
} catch (Exception e2) { return counter;
logger.error("数据出错 {}" ,e2); }
} } catch (Exception e2) {
return counter; logger.error("数据出错 {}" ,e2);
} }
return counter;
private void callBack(UrlLiveDataCallback callback,Attribution attr,int i,String title) { }
UrlLiveBean ulb = null;
if(i == 1) { private void callBack(UrlLiveDataCallback callback,Attribution attr,int i,String title) {
ulb = new UrlLiveBean(attr.getAttr().toString(), true, title); UrlLiveBean ulb = null;
}else { if(i == 1) {
ulb = new UrlLiveBean(attr.getAttr().toString(), i, title); ulb = new UrlLiveBean(attr.getAttr().toString(), true, title);
} }else {
if (callback == null) { ulb = new UrlLiveBean(attr.getAttr().toString(), i, title);
logger.warn("DataCallback 对象为 null,无法保存数据"); }
} else { if (callback == null) {
callback.onData(ulb, attr); logger.warn("DataCallback 对象为 null,无法保存数据");
} } else {
} callback.onData(ulb, attr);
}
private String dealUrl(String url) { }
try {
if(url.contains("www.toutiao.com")) { private String dealUrl(String url) {
if(url.contains("www.toutiao.com")) { try {
if(url.contains("www.toutiao.com")) {
}else { if(url.contains("www.toutiao.com")) {
url = url.replace("toutiao.com", "www.toutiao.com");
} }else {
if(url.contains("https")) { url = url.replace("toutiao.com", "www.toutiao.com");
}
}else { if(url.contains("https")) {
url = url.replace("http", "https");
} }else {
if(url.contains("group")) { url = url.replace("http", "https");
url = "https://www.toutiao.com/a" + url.split("/")[4] + "/"; }
} if(url.contains("group")) {
}else if(url.contains("mp.weixin.qq.com")) { url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
if(url.contains("https")) { }
}else if(url.contains("mp.weixin.qq.com")) {
}else { if(url.contains("https")) {
url = url.replace("http", "https");
} }else {
}else if(url.contains("a.mp.uc.cn") && url.contains("wm_aid")) { url = url.replace("http", "https");
url = "http://ff.dayu.com/contents/origin/"+url.split("wm_aid=")[1].split("!!wm_id")[0]+"?biz_id=1002&_fetch_author=1&_fetch_incrs=1"; }
}else if(url.contains("tznew.58.com/view") && url.contains("infoid=")) { }else if(url.contains("a.mp.uc.cn") && url.contains("wm_aid")) {
// https://tznew.58.com/view/c/sharingDetailNew?infoid=117073473 url = "http://ff.dayu.com/contents/origin/"+url.split("wm_aid=")[1].split("!!wm_id")[0]+"?biz_id=1002&_fetch_author=1&_fetch_incrs=1";
url = "https://tznew.58.com/tznew/c/info-detail?infoid=" + url.split("infoid=")[1].split("&")[0]; }else if(url.contains("tznew.58.com/view") && url.contains("infoid=")) {
} // https://tznew.58.com/view/c/sharingDetailNew?infoid=117073473
return url; url = "https://tznew.58.com/tznew/c/info-detail?infoid=" + url.split("infoid=")[1].split("&")[0];
} catch (Exception e) { }
return url; return url;
} } catch (Exception e) {
} return url;
}
/** }
*
* @Description 判断是否删除 /**
* @param html *
* @param attr * @Description 判断是否删除
* @param callback * @param html
*/ * @param attr
private void parseHtml(String html, Attribution attr, * @param callback
UrlLiveDataCallback callback) { */
if (callback == null) { private void parseHtml(String html, Attribution attr,
logger.warn("DataCallback 对象为 null,无法保存数据"); UrlLiveDataCallback callback) {
} else { if (callback == null) {
UrlLiveBean ulb = matchDel(html, attr, attr.getAttr().toString()); logger.warn("DataCallback 对象为 null,无法保存数据");
if(Objects.nonNull(ulb)) { } else {
callback.onData(ulb, attr); UrlLiveBean ulb = matchDel(html, attr, attr.getAttr().toString());
}else { if(Objects.nonNull(ulb)) {
callBack(callback, attr, -1, "程序无法判断"); callback.onData(ulb, attr);
} }else {
} callBack(callback, attr, -1, "程序无法判断");
} }
}
/*** }
* @Title: matchDel
* @author hero /***
* @Description: 验证链接是否有效 * @Title: matchDel
* @param @param page * @author hero
* @param @return 设定文件 * @Description: 验证链接是否有效
* @return boolean 返回类型 * @param @param page
*/ * @param @return 设定文件
public UrlLiveBean matchDel(String result,Attribution attr,String url){ * @return boolean 返回类型
try { */
Document doc = Jsoup.parse(result); public UrlLiveBean matchDel(String result,Attribution attr,String url){
String title = null; try {
if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com") || url.contains("weixin.sogou.com")){ Document doc = Jsoup.parse(result);
title = doc.select("h2.rich_media_title").text().replaceAll(" ", ""); String title = null;
if(Objects.isNull(title) || title.isEmpty()) { if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com") || url.contains("weixin.sogou.com")){
title = doc.select("p.title").text(); title = doc.select("h2.rich_media_title").text().replaceAll(" ", "");
} if(Objects.isNull(title) || title.isEmpty()) {
if(Objects.isNull(title) || title.isEmpty()) { title = doc.select("p.title").text();
title = doc.select("h3.msg-title").text(); }
} if(Objects.isNull(title) || title.isEmpty()) {
if(Objects.isNull(title) || title.isEmpty()) { title = doc.select("h3.msg-title").text();
title = doc.select("div.global_error_msg.warn").text(); }
} if(Objects.isNull(title) || title.isEmpty()) {
if(Objects.isNull(title) || title.isEmpty()) { title = doc.select("div.global_error_msg.warn").text();
title = doc.select("p.tips").text(); }
} if(Objects.isNull(title) || title.isEmpty()) {
if(Objects.isNull(title) || title.isEmpty()) { title = doc.select("div.warn").text();
title = doc.select("h2").text(); }
} if(Objects.isNull(title) || title.isEmpty()) {
// 获取title title = doc.select("p.tips").text();
Matcher ma5 = Pattern.compile("var msg_title = \'(.*)\'") }
.matcher(result); if(Objects.isNull(title) || title.isEmpty()) {
if (ma5.find()) { title = doc.select("h2").text();
title = ma5.group(1).replaceAll(" ", " ").trim(); }
} // 获取title
}else if(url.contains("kuaibao")){ Matcher ma5 = Pattern.compile("var msg_title = \'(.*)\'")
title = doc.select("p.title").text().replaceAll(" ", ""); .matcher(result);
}else if(url.contains("chinadaily.com.cn")){ if (ma5.find()) {
title = doc.select("p.style1").text().replaceAll(" ", ""); title = ma5.group(1).replaceAll(" ", " ").trim();
}else if(url.contains("baidu.com") || url.contains("hao123.com")) { }
title = doc.select("p#contaniner").text(); }else if(url.contains("kuaibao")){
}else if(url.contains("kanfanews.com")) { title = doc.select("p.title").text().replaceAll(" ", "");
title = doc.select("p#tit").text(); }else if(url.contains("chinadaily.com.cn")){
}else if(url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) { title = doc.select("p.style1").text().replaceAll(" ", "");
title = "网页已删除"; }else if(url.contains("baidu.com") || url.contains("hao123.com")) {
}else if(url.contains("a.mp.uc.cn")) { title = doc.select("p#contaniner").text();
try { }else if(url.contains("kanfanews.com")) {
JSONObject json = JSONObject.parseObject(result); title = doc.select("p#tit").text();
title = json.getJSONObject("data").getString("title"); }else if(url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) {
} catch (Exception e) { title = "网页已删除";
logger.error(" uc 数据 json 转换失败", e); }else if(url.contains("a.mp.uc.cn")) {
} try {
}else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) { JSONObject json = JSONObject.parseObject(result);
title = "网页已删除"; title = json.getJSONObject("data").getString("title");
}else if(url.contains("zhihu.com")) { } catch (Exception e) {
JSONObject resultJson = JSONObject.parseObject(result); logger.error(" uc 数据 json 转换失败", e);
if(url.contains("/answer/")) { }
title = resultJson.getJSONObject("question").getString("title"); }else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) {
}else if(url.contains("/question/") && !url.contains("/answer/") || url.contains("/p/")) { title = "网页已删除";
title = resultJson.getString("title"); }else if(url.contains("zhihu.com")) {
} JSONObject resultJson = JSONObject.parseObject(result);
}else if(url.contains("360kuai.com") && result.contains("您访问的文章走失了")) { if(url.contains("/answer/")) {
title = String.valueOf("404"); title = resultJson.getJSONObject("question").getString("title");
}else if(result.contains("文章没有找到哦") && url.contains("yidianzixun.com")) { }else if(url.contains("/question/") && !url.contains("/answer/") || url.contains("/p/")) {
title = "文章未找到"; title = resultJson.getString("title");
}else if(url.contains("tznew.58.com/view")) { }
try { }else if(url.contains("360kuai.com") && result.contains("您访问的文章走失了")) {
JSONObject json = JSONObject.parseObject(result); title = String.valueOf("404");
title = json.getJSONObject("result").getString("title"); }else if(result.contains("文章没有找到哦") && url.contains("yidianzixun.com")) {
} catch (Exception e) { title = "文章未找到";
logger.error(" uc 数据 json 转换失败", e); }else if(url.contains("tznew.58.com/view")) {
} try {
} JSONObject json = JSONObject.parseObject(result);
title = json.getJSONObject("result").getString("title");
//若title 为拿到 用 此方法 } catch (Exception e) {
if(Objects.isNull(title) || title.length() < 1) { logger.error(" uc 数据 json 转换失败", e);
title = doc.select("div.adiv > p > span").text().replaceAll(" ", ""); }
} }
//若title 为拿到 用 此方法 //若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) { if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("title").text().replaceAll(" ", ""); title = doc.select("div.adiv > p > span").text().replaceAll(" ", "");
} }
//若title 为拿到 用 此方法 //若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) { if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("h1").text().replaceAll(" ", ""); title = doc.select("title").text().replaceAll(" ", "");
} }
//若title 为拿到 用 此方法 无法获取标题不进行程序迷惑性判断 //若title 为拿到 用 此方法
// if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) { if(Objects.isNull(title) || title.length() < 1) {
// title = "网页已删除"; title = doc.select("h1").text().replaceAll(" ", "");
// } }
if(Objects.nonNull(title) && title.length() > 1){ //若title 为拿到 用 此方法 无法获取标题不进行程序迷惑性判断
return new UrlLiveBean(attr.getAttr().toString(), isDelete(title),title); // if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
} else { // title = "网页已删除";
return null; // }
}
} catch (Exception e) { if(Objects.nonNull(title) && title.length() > 1){
return null; return new UrlLiveBean(attr.getAttr().toString(), isDelete(title),title);
} } else {
} return null;
}
/** } catch (Exception e) {
* return null;
* @Description 标题判断 }
* @param title }
* @return
*/ /**
private boolean isDelete(String title) { *
List<String> eList = Arrays.asList("系统出错","该内容已被发布者删除","网页已删除" * @Description 标题判断
,"此帐号已自主注销,内容无法查看","页面提示","正在维护中" * @param title
,"此文章被第三方评估为不实信息","财经头条","知识100题","502BadGateway" * @return
,"提示信息","跳转页","跳转中...","此帐号在冻结期,内容无法查看","东北新闻网" */
,"百度一下,你就知道","帐号已迁移","手机百度","内容被删除","亚博国际|首页" private boolean isDelete(String title) {
,"中国软件网","云广网","新浪首页","文章暂时找不到了","-法易网" List<String> eList = Arrays.asList("系统出错","该内容已被发布者删除","网页已删除"
,"【一点资讯】www.yidianzixun.com","错误页面","网站暂停通知","【快资讯】你的专属资讯平台" ,"此帐号已自主注销,内容无法查看","页面提示","正在维护中"
,"百度新闻——全球最大的中文新闻平台","以上文章由以下机构判定为不实信息","该公众号已迁移" ,"此文章被第三方评估为不实信息","财经头条","知识100题","502BadGateway"
,"财经网-CAIJING.COM.CN","蚂蚁资讯","参数错误","时尚头条_YOKA时尚网","该文章已经被删除" ,"提示信息","跳转页","跳转中...","此帐号在冻结期,内容无法查看","东北新闻网"
,"网易","链接已过期","找不到页面","今晚网","该文章已被删除", "该回答已被删除-知乎", "资源不存在","文章未找到" ,"百度一下,你就知道","帐号已迁移","手机百度","内容被删除","亚博国际|首页"
, "UC头条"); ,"中国软件网","云广网","新浪首页","文章暂时找不到了","-法易网"
,"【一点资讯】www.yidianzixun.com","错误页面","网站暂停通知","【快资讯】你的专属资讯平台"
List<String> cList = Arrays.asList("提示信息-","此内容因违规无法查看","微信公众号不存在" ,"百度新闻——全球最大的中文新闻平台","以上文章由以下机构判定为不实信息","该公众号已迁移"
,"此内容被投诉且经审核涉嫌侵权,无法查看","thepageyourequestedwasnotfound","未知错误" ,"财经网-CAIJING.COM.CN","蚂蚁资讯","参数错误","时尚头条_YOKA时尚网","该文章已经被删除"
,"Objectmoved","404","页面没有找到","页面未找到","301MovedPermanently","加载异常", ,"网易","链接已过期","找不到页面","今晚网","该文章已被删除", "该回答已被删除-知乎", "资源不存在","文章未找到"
"此帐号已被屏蔽, 内容无法查看","链接不存在", "新闻已删除"); , "UC头条");
return cList.stream().anyMatch(title::contains) || eList.stream().anyMatch(title::equals); List<String> cList = Arrays.asList("提示信息-","此内容因违规无法查看","微信公众号不存在"
} ,"此内容被投诉且经审核涉嫌侵权,无法查看","thepageyourequestedwasnotfound","未知错误"
,"Objectmoved","404","页面没有找到","页面未找到","301MovedPermanently","加载异常",
"此帐号已被屏蔽, 内容无法查看","链接不存在", "新闻已删除");
/**
* 处理知乎链接 return cList.stream().anyMatch(title::contains) || eList.stream().anyMatch(title::equals);
* }
* */
private static String treatZhihuUrl(String url) {
if(url.contains("/answer/")) { /**
url = "https://api.zhihu.com/answers/" + url.replaceAll(".*/answer/", ""); * 处理知乎链接
}else if(url.contains("/question/") && !url.contains("/answer/")) { *
url = "https://api.zhihu.com/questions/" + url.replaceAll(".*/question/", ""); * */
}else if(url.contains("/p/")) { private static String treatZhihuUrl(String url) {
url = "https://api.zhihu.com/articles/" + url.replaceAll(".*/p/", ""); if(url.contains("/answer/")) {
} url = "https://api.zhihu.com/answers/" + url.replaceAll(".*/answer/", "");
return url; }else if(url.contains("/question/") && !url.contains("/answer/")) {
} url = "https://api.zhihu.com/questions/" + url.replaceAll(".*/question/", "");
}else if(url.contains("/p/")) {
} url = "https://api.zhihu.com/articles/" + url.replaceAll(".*/p/", "");
}
return url;
}
}
\ No newline at end of file
...@@ -32,7 +32,7 @@ public class MediaSelfSource { ...@@ -32,7 +32,7 @@ public class MediaSelfSource {
ProxyInit.initProxy(); ProxyInit.initProxy();
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("https://new.qq.com/omn/20200507/20200507A0Q9JV00.html"); urlList.add("https://k.sina.com.cn/article_1060093724_3f2fbf1c00100vsqd.html?from=mood");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList); List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) { for(MediaSelfSourceBean b : u) {
......
...@@ -80,10 +80,10 @@ public class SourceForward { ...@@ -80,10 +80,10 @@ public class SourceForward {
public static void main(String[] args) { public static void main(String[] args) {
ProxyInit.initProxy(); ProxyInit.initProxy();
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("http://www.wangjiaozixun.com/html/zx20/2020/0730/1396388.html"); urlList.add("http://gu.qq.com/resources/shy/news/detail-v2/index.html?#/index?id=SN202006091653447945411f");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList); List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
for(SourceForwardBean sfb : da) { for(SourceForwardBean sfb : da) {
System.out.println(sfb.toString()); System.out.println("=============="+sfb.toString());
} }
} }
...@@ -94,7 +94,6 @@ public class SourceForward { ...@@ -94,7 +94,6 @@ public class SourceForward {
try{ try{
SourceForwardCrawler crawler = new SourceForwardCrawler(); SourceForwardCrawler crawler = new SourceForwardCrawler();
SourceForwardDataCallBack callback = new SourceForwardDataCallBack() { SourceForwardDataCallBack callback = new SourceForwardDataCallBack() {
@Override @Override
public void onData(SourceForwardBean data, Attribution attr) { public void onData(SourceForwardBean data, Attribution attr) {
list.add(data); list.add(data);
......
...@@ -72,13 +72,13 @@ public class URLLive { ...@@ -72,13 +72,13 @@ public class URLLive {
public static void main(String[] args) { public static void main(String[] args) {
ProxyInit.initProxy(); ProxyInit.initProxy();
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("http://www.toutiao.com/item/1668646006370318/"); urlList.add("http://mp.weixin.qq.com/s?__biz=Mzg3MDMzNTc5Mg==&mid=2247485220&idx=1&sn=9118543ca120489cccbdc102be58f881");
// urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh"); // urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh");
List<UrlLiveBean> u = URLLive.verificationURLLive(urlList); List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
for(UrlLiveBean b : u) { for(UrlLiveBean b : u) {
System.out.println(b.toString()); System.out.println(b.toString());
} }
} }
static class UrlLiveCrawlerThread extends Thread{ static class UrlLiveCrawlerThread extends Thread{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment