Commit 72bdcd09 by chenweiyang

头条是否删除做特殊处理

parent e9aa812f
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId> <artifactId>source-forward</artifactId>
<version>0.2.9-SNAPSHOT</version> <version>0.3.0-SNAPSHOT</version>
<name>source-forward</name> <name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description> <description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.2.0-SNAPSHOT</version> <version>0.2.4-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
......
...@@ -4,6 +4,9 @@ public class UrlLiveBean { ...@@ -4,6 +4,9 @@ public class UrlLiveBean {
private String url; private String url;
/**
* 1 已删除
*/
private Integer isLive; private Integer isLive;
private String title; private String title;
......
...@@ -16,6 +16,7 @@ import org.jsoup.Jsoup; ...@@ -16,6 +16,7 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.JSONPath;
import com.zhiwei.async.GroupSync; import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.crawler.core.proxy.ProxyHolder;
...@@ -70,20 +71,8 @@ public class UrlLiveCrawler { ...@@ -70,20 +71,8 @@ public class UrlLiveCrawler {
url = dealUrl(url); url = dealUrl(url);
logger.info("当前处理 URL: {}", url); logger.info("当前处理 URL: {}", url);
Map<String,String> headers = new HashMap<>(); Map<String,String> headers = new HashMap<>();
ProxyHolder ph = null; ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY;
if(url.contains("toutiao.com")){ if(url.contains("toutiao.com")){
// headers.put("referer", url);
// headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_webid=6837283963338622477; tt_webid=6837283963338622477; __tasessionId=dsei2aty41591951911851; tt_scid=M--AJ-FYwZ0qcYTzQCLyMeS5MlykLS6ktMkvqKKJmq-ghRxX4waEBhJ3YbheuNmi2b8a");
// headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
// headers.put("accept-encoding", "gzip, deflate, br");
// headers.put("accept-language", "zh-CN,zh;q=0.9");
// headers.put("cache-control", "no-cache");
// headers.put("sec-fetch-dest", "document");
// headers.put("sec-fetch-mode", "navigate");
// headers.put("sec-fetch-site", "same-origin");
// headers.put("sec-fetch-user", "?1");
// headers.put("upgrade-insecure-requests", "1");
// headers.put("user-agent", "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Mobile Safari/537.36");
ph = ProxyHolder.NAT_HEAVY_PROXY; ph = ProxyHolder.NAT_HEAVY_PROXY;
}else if(url.contains("zhihu.com")) { }else if(url.contains("zhihu.com")) {
url = treatZhihuUrl(url); url = treatZhihuUrl(url);
...@@ -138,20 +127,8 @@ public class UrlLiveCrawler { ...@@ -138,20 +127,8 @@ public class UrlLiveCrawler {
private String dealUrl(String url) { private String dealUrl(String url) {
try { try {
if(url.contains("www.toutiao.com")) { if(url.contains("toutiao.com")) {
if(url.contains("www.toutiao.com")) { return dealToutiaoUrl(url);
}else {
url = url.replace("toutiao.com", "www.toutiao.com");
}
if(url.contains("https")) {
}else {
url = url.replace("http", "https");
}
if(url.contains("group")) {
url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
}
}else if(url.contains("mp.weixin.qq.com")) { }else if(url.contains("mp.weixin.qq.com")) {
if(url.contains("https")) { if(url.contains("https")) {
...@@ -170,6 +147,22 @@ public class UrlLiveCrawler { ...@@ -170,6 +147,22 @@ public class UrlLiveCrawler {
} }
} }
// Matches one or more consecutive decimal digits; used to pull the article id out of a URL.
private static Pattern pa = Pattern.compile("\\d+");

/**
 * Rewrites a Toutiao URL into the {@code m.toutiao.com} "info" endpoint form.
 *
 * <p>The query string (everything from the first {@code ?}) is dropped, and the first
 * run of digits in the remaining text is taken as the article id.
 *
 * @param url the URL to rewrite
 * @return {@code https://m.toutiao.com/i<id>/info/?_signature=&i=<id>} when a digit run
 *         is found; otherwise the {@code url} argument unchanged. The argument is also
 *         returned unchanged when any exception is raised while extracting the id
 *         (the exception is logged).
 */
private String dealToutiaoUrl(String url) {
    try {
        // Only look at the part before the query string.
        String withoutQuery = url.split("\\?")[0];
        Matcher idMatcher = pa.matcher(withoutQuery);
        if (!idMatcher.find()) {
            // No digits at all: leave the URL as it is.
            return url;
        }
        String articleId = idMatcher.group();
        StringBuilder infoUrl = new StringBuilder("https://m.toutiao.com/i");
        infoUrl.append(articleId);
        infoUrl.append("/info/?_signature=&i=");
        infoUrl.append(articleId);
        return infoUrl.toString();
    } catch (Exception e) {
        logger.error("数据获取id出错", e);
        return url;
    }
}
/** /**
* *
* @Description 判断是否删除 * @Description 判断是否删除
...@@ -229,6 +222,10 @@ public class UrlLiveCrawler { ...@@ -229,6 +222,10 @@ public class UrlLiveCrawler {
if (ma5.find()) { if (ma5.find()) {
title = ma5.group(1).replaceAll(" ", " ").trim(); title = ma5.group(1).replaceAll(" ", " ").trim();
} }
if(result.contains("此帐号已被屏蔽, 内容无法查看") || result.contains("该公众号已迁移") || result.contains("此帐号已自主注销,内容无法查看")
|| result.contains("此帐号处于帐号迁移流程中") || result.contains("该内容已被发布者删除")) {
title = "网页已删除";
}
}else if(url.contains("kuaibao")){ }else if(url.contains("kuaibao")){
title = doc.select("p.title").text().replaceAll(" ", ""); title = doc.select("p.title").text().replaceAll(" ", "");
}else if(url.contains("chinadaily.com.cn")){ }else if(url.contains("chinadaily.com.cn")){
...@@ -283,6 +280,11 @@ public class UrlLiveCrawler { ...@@ -283,6 +280,11 @@ public class UrlLiveCrawler {
title = doc.select("h1").text().replaceAll(" ", ""); title = doc.select("h1").text().replaceAll(" ", "");
} }
if(result.contains("\"success\":false") && attr.getAttr().toString().contains("toutiao.com")) {
title = "网页已删除";
}else {
title = String.valueOf(JSONPath.read(result, "$..title"));
}
//若title 为拿到 用 此方法 无法获取标题不进行程序迷惑性判断 //若title 为拿到 用 此方法 无法获取标题不进行程序迷惑性判断
// if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) { // if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
// title = "网页已删除"; // title = "网页已删除";
......
...@@ -32,7 +32,7 @@ public class MediaSelfSource { ...@@ -32,7 +32,7 @@ public class MediaSelfSource {
ProxyInit.initProxy(); ProxyInit.initProxy();
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("https://k.sina.com.cn/article_1060093724_3f2fbf1c00100vsqd.html?from=mood"); urlList.add("http://baijiahao.baidu.com/s?id=1665770738503315058&wfr=spider&for=pc");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList); List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) { for(MediaSelfSourceBean b : u) {
......
...@@ -72,7 +72,7 @@ public class URLLive { ...@@ -72,7 +72,7 @@ public class URLLive {
public static void main(String[] args) { public static void main(String[] args) {
ProxyInit.initProxy(); ProxyInit.initProxy();
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("http://mp.weixin.qq.com/s?__biz=Mzg3MDMzNTc5Mg==&mid=2247485220&idx=1&sn=9118543ca120489cccbdc102be58f881"); urlList.add("https://www.toutiao.com/a6982350814614405670/");
// urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh"); // urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh");
List<UrlLiveBean> u = URLLive.verificationURLLive(urlList); List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
......
...@@ -508,6 +508,8 @@ public class MatchSource { ...@@ -508,6 +508,8 @@ public class MatchSource {
source = document.select("p.author-name:nth-child(1)").text(); source = document.select("p.author-name:nth-child(1)").text();
}else if(StringUtils.isNotBlank(document.select("a.authorName").text())){ }else if(StringUtils.isNotBlank(document.select("a.authorName").text())){
source = document.select("a.authorName").text(); source = document.select("a.authorName").text();
}else if(StringUtils.isNotBlank(document.select("div.author-name > a").text())){
source = document.select("div.author-name > a").text();
} }
if(StringUtils.isNotBlank(source)){ if(StringUtils.isNotBlank(source)){
source = "百度百家-" + source; source = "百度百家-" + source;
......
...@@ -42,11 +42,4 @@ ...@@ -42,11 +42,4 @@
// poi.exportExcel(path ,"匹配后数据", headList, bodyList); // poi.exportExcel(path ,"匹配后数据", headList, bodyList);
// } // }
// //
//
//
//
//
//
//
//
//} //}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment