Commit 72bdcd09 by chenweiyang

头条是否删除做特殊处理

parent e9aa812f
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.2.9-SNAPSHOT</version>
<version>0.3.0-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......@@ -30,7 +30,7 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.2.0-SNAPSHOT</version>
<version>0.2.4-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
......
......@@ -4,6 +4,9 @@ public class UrlLiveBean {
private String url;
/**
* 1 已删除
*/
private Integer isLive;
private String title;
......
......@@ -16,6 +16,7 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.JSONPath;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
......@@ -70,20 +71,8 @@ public class UrlLiveCrawler {
url = dealUrl(url);
logger.info("当前处理 URL: {}", url);
Map<String,String> headers = new HashMap<>();
ProxyHolder ph = null;
ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY;
if(url.contains("toutiao.com")){
// headers.put("referer", url);
// headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_webid=6837283963338622477; tt_webid=6837283963338622477; __tasessionId=dsei2aty41591951911851; tt_scid=M--AJ-FYwZ0qcYTzQCLyMeS5MlykLS6ktMkvqKKJmq-ghRxX4waEBhJ3YbheuNmi2b8a");
// headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
// headers.put("accept-encoding", "gzip, deflate, br");
// headers.put("accept-language", "zh-CN,zh;q=0.9");
// headers.put("cache-control", "no-cache");
// headers.put("sec-fetch-dest", "document");
// headers.put("sec-fetch-mode", "navigate");
// headers.put("sec-fetch-site", "same-origin");
// headers.put("sec-fetch-user", "?1");
// headers.put("upgrade-insecure-requests", "1");
// headers.put("user-agent", "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Mobile Safari/537.36");
ph = ProxyHolder.NAT_HEAVY_PROXY;
}else if(url.contains("zhihu.com")) {
url = treatZhihuUrl(url);
......@@ -138,20 +127,8 @@ public class UrlLiveCrawler {
private String dealUrl(String url) {
try {
if(url.contains("www.toutiao.com")) {
if(url.contains("www.toutiao.com")) {
}else {
url = url.replace("toutiao.com", "www.toutiao.com");
}
if(url.contains("https")) {
}else {
url = url.replace("http", "https");
}
if(url.contains("group")) {
url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
}
if(url.contains("toutiao.com")) {
return dealToutiaoUrl(url);
}else if(url.contains("mp.weixin.qq.com")) {
if(url.contains("https")) {
......@@ -170,6 +147,22 @@ public class UrlLiveCrawler {
}
}
/** Matches one run of decimal digits; used to pull the numeric id out of a Toutiao URL. */
private static final Pattern pa = Pattern.compile("\\d+");

/**
 * Rewrites a Toutiao URL to the mobile "info" endpoint for the same id.
 *
 * <p>The id is the first run of digits found in the part of the URL that
 * precedes the first {@code '?'}. When an id is found the result is
 * {@code https://m.toutiao.com/i<id>/info/?_signature=&i=<id>}.
 *
 * <p>NOTE(review): the first digit run is taken as the id without further
 * validation, so a URL with an earlier number in its host or path would
 * yield that number instead. Confirm callers only pass article-style URLs.
 *
 * @param url the URL to rewrite; may be {@code null}
 * @return the rewritten info URL, or {@code url} itself (including
 *         {@code null}) when no digit run is present before the query string
 */
private String dealToutiaoUrl(String url) {
    if (url == null) {
        // Explicit guard instead of relying on a caught NullPointerException.
        return null;
    }
    // Ignore the query string so digits in parameters are never taken as the id.
    int queryStart = url.indexOf('?');
    String data = queryStart >= 0 ? url.substring(0, queryStart) : url;
    Matcher m = pa.matcher(data);
    if (m.find()) {
        String aid = m.group();
        return "https://m.toutiao.com/i" + aid + "/info/?_signature=&i=" + aid;
    }
    return url;
}
/**
*
* @Description 判断是否删除
......@@ -229,6 +222,10 @@ public class UrlLiveCrawler {
if (ma5.find()) {
title = ma5.group(1).replaceAll(" ", " ").trim();
}
if(result.contains("此帐号已被屏蔽, 内容无法查看") || result.contains("该公众号已迁移") || result.contains("此帐号已自主注销,内容无法查看")
|| result.contains("此帐号处于帐号迁移流程中") || result.contains("该内容已被发布者删除")) {
title = "网页已删除";
}
}else if(url.contains("kuaibao")){
title = doc.select("p.title").text().replaceAll(" ", "");
}else if(url.contains("chinadaily.com.cn")){
......@@ -283,6 +280,11 @@ public class UrlLiveCrawler {
title = doc.select("h1").text().replaceAll(" ", "");
}
if(result.contains("\"success\":false") && attr.getAttr().toString().contains("toutiao.com")) {
title = "网页已删除";
}else {
title = String.valueOf(JSONPath.read(result, "$..title"));
}
//若title 为拿到 用 此方法 无法获取标题不进行程序迷惑性判断
// if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
// title = "网页已删除";
......
......@@ -32,7 +32,7 @@ public class MediaSelfSource {
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
urlList.add("https://k.sina.com.cn/article_1060093724_3f2fbf1c00100vsqd.html?from=mood");
urlList.add("http://baijiahao.baidu.com/s?id=1665770738503315058&wfr=spider&for=pc");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) {
......
......@@ -72,7 +72,7 @@ public class URLLive {
public static void main(String[] args) {
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
urlList.add("http://mp.weixin.qq.com/s?__biz=Mzg3MDMzNTc5Mg==&mid=2247485220&idx=1&sn=9118543ca120489cccbdc102be58f881");
urlList.add("https://www.toutiao.com/a6982350814614405670/");
// urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh");
List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
......
......@@ -508,6 +508,8 @@ public class MatchSource {
source = document.select("p.author-name:nth-child(1)").text();
}else if(StringUtils.isNotBlank(document.select("a.authorName").text())){
source = document.select("a.authorName").text();
}else if(StringUtils.isNotBlank(document.select("div.author-name > a").text())){
source = document.select("div.author-name > a").text();
}
if(StringUtils.isNotBlank(source)){
source = "百度百家-" + source;
......
......@@ -42,11 +42,4 @@
// poi.exportExcel(path ,"匹配后数据", headList, bodyList);
// }
//
//
//
//
//
//
//
//
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment