Commit 0c98f43b by cwy

升级版本 和修复快资讯自媒体号获取

parent 7c541080
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.1.9-SNAPSHOT</version>
<version>0.2.1-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......@@ -29,7 +29,7 @@
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.5.2-RELEASE</version>
<version>0.5.5.6-SNAPSHOT</version>
</dependency>
</dependencies>
......
......@@ -13,7 +13,6 @@ import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
......@@ -50,7 +49,6 @@ public class ContentCrawler {
for (String url : urls) {
if (url != null) {
try {
ZhiWeiTools.sleep(10);
search(counter, url, Attribution.of(url), callback);
} catch (Exception e) {
logger.error("搜索创建出错", e);
......
......@@ -19,7 +19,6 @@ import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
......@@ -68,7 +67,6 @@ public class MediaSelfSourceCrawler {
counter.add();
if (url != null) {
try {
ZhiWeiTools.sleep(10);
search(counter, url, Attribution.of(url), callback);
} catch (Exception e) {
logger.error("搜索创建出错", e);
......
package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
......@@ -20,8 +21,6 @@ import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
......@@ -50,7 +49,6 @@ public class SourceForwardCrawler {
if (url != null) {
try {
search(counter, url, Attribution.of(url), callback);
ZhiWeiTools.sleep(10);
} catch (Exception e) {
logger.error("搜索创建出错", e);
}
......@@ -62,7 +60,8 @@ public class SourceForwardCrawler {
private GroupSync search(GroupSync counter, String url,Attribution attr, SourceForwardDataCallBack callback) {
logger.info("当前处理 URL: {}", url);
Map<String,String> headers = HeaderTool.getCommonHead();
Map<String,String> headers = new HashMap<>();
// Map<String,String> headers = HeaderTool.getCommonHead();
if(url.contains("www.toutiao.com")){
headers.put("referer", url);
}
......
......@@ -36,14 +36,14 @@ import okhttp3.Request;
public class UrlLiveCrawler {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(true).build();
public GroupSync submitTask(UrlLiveDataCallback callback,String... urls) {
GroupSync counter = new GroupSync();
start(counter, callback, urls);
return counter;
}
private void start(GroupSync counter,UrlLiveDataCallback callback, String... urls) {
if (nonNull(urls) && urls.length > 0) {
for (String url : urls) {
......@@ -60,11 +60,13 @@ public class UrlLiveCrawler {
}
}
}
private GroupSync search(GroupSync counter, String url,
Attribution attr, UrlLiveDataCallback callback) {
System.out.println(url);
url = dealUrl(url);
logger.info("当前处理 URL: {}", url);
// Map<String,String> headers = new HashMap<>();
Map<String,String> headers = HeaderTool.getCommonHead();
if(url.contains("www.toutiao.com")){
headers.put("referer", url);
......@@ -75,7 +77,7 @@ public class UrlLiveCrawler {
Request request = RequestUtils.wrapGet(url, headers);
if(Objects.nonNull(request)) {
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> {
httpBoot.asyncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
if(rs.isSuccessful()) {
......@@ -86,6 +88,7 @@ public class UrlLiveCrawler {
callBack(callback, attr, 1, String.valueOf(rs.code()));
}
} else {
logger.error("e", ex);
callBack(callback, attr, 1,"未访问成功");
}
} catch (Exception e) {
......@@ -118,7 +121,7 @@ public class UrlLiveCrawler {
private String dealUrl(String url) {
try {
if(url.contains("toutiao.com")) {
if(url.contains("www.toutiao.com")) {
if(url.contains("www.toutiao.com")) {
}else {
......@@ -223,6 +226,10 @@ public class UrlLiveCrawler {
JSONObject resultJson = JSONObject.parseObject(result);
title = resultJson.getString("title")!=null?resultJson.getString("title"):resultJson.getString("message");
}else if(url.contains("360kuai.com") && result.contains("您访问的文章走失了")) {
title = String.valueOf("404");
}else if(result.contains("文章没有找到哦") && url.contains("yidianzixun.com")) {
title = "文章未找到";
}
//若title 为拿到 用 此方法
......@@ -271,7 +278,7 @@ public class UrlLiveCrawler {
,"【一点资讯】www.yidianzixun.com","错误页面","网站暂停通知","【快资讯】你的专属资讯平台"
,"百度新闻——全球最大的中文新闻平台","以上文章由以下机构判定为不实信息","该公众号已迁移"
,"财经网-CAIJING.COM.CN","蚂蚁资讯","参数错误","时尚头条_YOKA时尚网","该文章已经被删除"
,"网易","链接已过期","找不到页面","今晚网","该文章已被删除", "该回答已被删除-知乎", "资源不存在");
,"网易","链接已过期","找不到页面","今晚网","该文章已被删除", "该回答已被删除-知乎", "资源不存在","文章未找到");
List<String> cList = Arrays.asList("提示信息-","此内容因违规无法查看","微信公众号不存在"
,"此内容被投诉且经审核涉嫌侵权,无法查看","thepageyourequestedwasnotfound","未知错误"
......
package com.zhiwei.source_forward.crawler;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.async.GroupSync;
import com.zhiwei.async.TaskBoot;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import okhttp3.Request;
import okhttp3.Response;
/**
 * Checks whether article URLs are still alive or have been deleted.
 *
 * <p>Each URL is normalised, fetched synchronously (one URL per worker task),
 * and the page title (or a site-specific substitute) is compared against
 * known "deleted / error page" markers.
 */
public class UrlLiveCrawlerNew {

    private static final Logger logger = LogManager.getLogger(UrlLiveCrawlerNew.class);

    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(false).build();

    /** Number of request attempts made by this class for one URL. */
    private static final int MAX_ATTEMPTS = 2;

    /** Title assigned when a page is judged to be removed. */
    private static final String DELETED_TITLE = "网页已删除";

    /** WeChat-style pages: selectors tried in order after {@code h2.rich_media_title} came back empty. */
    private static final String[] WEIXIN_FALLBACK_SELECTORS = {
        "p.title", "h3.msg-title", "div.global_error_msg.warn", "p.tips", "h2"
    };

    /** Generic selectors tried in order when no site-specific title was found. */
    private static final String[] GENERIC_FALLBACK_SELECTORS = {
        "div.adiv > p > span", "title", "h1"
    };

    /** Titles that mark a dead page when they match exactly. */
    private static final List<String> EXACT_DELETE_TITLES = Arrays.asList("系统出错","该内容已被发布者删除","网页已删除"
            ,"此帐号已自主注销,内容无法查看","页面提示","正在维护中"
            ,"此文章被第三方评估为不实信息","财经头条","知识100题","502BadGateway"
            ,"提示信息","跳转页","跳转中...","此帐号在冻结期,内容无法查看","东北新闻网"
            ,"百度一下,你就知道","帐号已迁移","手机百度","内容被删除","亚博国际|首页"
            ,"中国软件网","云广网","新浪首页","文章暂时找不到了","-法易网"
            ,"【一点资讯】www.yidianzixun.com","错误页面","网站暂停通知","【快资讯】你的专属资讯平台"
            ,"百度新闻——全球最大的中文新闻平台","以上文章由以下机构判定为不实信息","该公众号已迁移"
            ,"财经网-CAIJING.COM.CN","蚂蚁资讯","参数错误","时尚头条_YOKA时尚网","该文章已经被删除"
            ,"网易","链接已过期","找不到页面","今晚网","该文章已被删除", "该回答已被删除-知乎", "资源不存在","文章未找到");

    /** Fragments that mark a dead page when the title contains them. */
    private static final List<String> PARTIAL_DELETE_TITLES = Arrays.asList("提示信息-","此内容因违规无法查看","微信公众号不存在"
            ,"此内容被投诉且经审核涉嫌侵权,无法查看","thepageyourequestedwasnotfound","未知错误"
            ,"Objectmoved","404","页面没有找到","页面未找到","301MovedPermanently","加载异常",
            "此帐号已被屏蔽, 内容无法查看","链接不存在");

    /**
     * Checks every URL in the list and reports whether it looks deleted.
     *
     * @param urlList URLs to verify; {@code null} is treated as an empty list
     * @return one bean per URL that could be evaluated (order is not guaranteed);
     *         URLs whose evaluation failed are omitted
     */
    public List<UrlLiveBean> judgeIsDelete(List<String> urlList) {
        final List<UrlLiveBean> ulbList = new ArrayList<>();
        if (Objects.isNull(urlList) || urlList.isEmpty()) {
            return ulbList;
        }
        GroupSync counter = new GroupSync();
        // Guard registration: keeps the counter above zero until every task has
        // been submitted, so await() cannot be released while we are still looping.
        counter.add();
        try {
            for (String url : urlList) {
                // Register on behalf of the task BEFORE it is submitted; registering
                // inside the task could happen after the submitter already moved on.
                counter.add();
                try {
                    TaskBoot.blockingAsync(() -> {
                        try {
                            UrlLiveBean ulb = dealUrlLive(url);
                            if (Objects.nonNull(ulb)) {
                                // Several worker tasks write here; ArrayList is not thread-safe.
                                synchronized (ulbList) {
                                    ulbList.add(ulb);
                                }
                            }
                        } catch (Exception e) {
                            logger.error("链接是否删除新", e);
                        } finally {
                            counter.done();
                        }
                    });
                } catch (Exception e2) {
                    // Submission failed, so the task's finally block will never run.
                    logger.error("数据出错 {}", url, e2);
                    counter.done();
                }
            }
        } finally {
            counter.done(); // release the guard registration
        }
        try {
            counter.await();
        } catch (InterruptedException e) {
            logger.error("interrupted while waiting for url-live checks", e);
            Thread.currentThread().interrupt();
        }
        // Snapshot under the same lock: tasks may still be running if we were interrupted.
        synchronized (ulbList) {
            return new ArrayList<>(ulbList);
        }
    }

    /**
     * Fetches one URL and evaluates it.
     *
     * @param url raw URL
     * @return the evaluation result, or {@code null} when the URL is {@code null}
     *         or processing failed unexpectedly
     */
    private UrlLiveBean dealUrlLive(String url) {
        if (Objects.isNull(url)) {
            return null;
        }
        try {
            url = dealUrl(url);
            logger.info("当前处理 URL: {}", url);
            Map<String,String> headers = new HashMap<>();
            if (url.contains("www.toutiao.com")) {
                headers.put("referer", url);
            } else if (url.contains("zhihu.com")) {
                url = treatZhihuUrl(url);
            }
            Request request = RequestUtils.wrapGet(url, headers);
            // Stays 404 when no attempt produced a response at all.
            int code = 404;
            for (int i = 0; i < MAX_ATTEMPTS; i++) {
                try (Response response = httpBoot.syncCall(request)) {
                    if (response.isSuccessful()) {
                        return matchDel(response.body().string(), url);
                    }
                    code = response.code();
                } catch (Exception e) {
                    logger.error("解析", e);
                }
            }
            // 403 is reported with state -1; every other failure counts as deleted.
            int state = (code == 403) ? -1 : 1;
            return callBack(url, state, String.valueOf(code));
        } catch (Exception e) {
            logger.error("dealUrlLive failed, url: {}", url, e);
            return null;
        }
    }

    /**
     * Builds the result bean for a URL that was not fetched successfully.
     *
     * @param url   the URL
     * @param i     state flag; {@code 1} builds a bean via the boolean constructor
     *              with {@code true}, any other value is passed through as an int
     * @param title text stored in the bean (the HTTP status code for the calls in this class)
     */
    private UrlLiveBean callBack(String url,int i,String title) {
        if (i == 1) {
            return new UrlLiveBean(url, true, title);
        }
        return new UrlLiveBean(url, i, title);
    }

    /**
     * Normalises site-specific URL shapes (Toutiao, WeChat, UC) before fetching.
     *
     * @param url raw URL
     * @return the normalised URL; if normalisation throws, the URL as far as it
     *         had been rewritten at that point
     */
    private String dealUrl(String url) {
        try {
            if (url.contains("toutiao.com")) {
                if (!url.contains("www.toutiao.com")) {
                    url = url.replace("toutiao.com", "www.toutiao.com");
                }
                url = forceHttps(url);
                if (url.contains("group")) {
                    // Fifth "/"-separated segment is used as the article id.
                    url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
                }
            } else if (url.contains("mp.weixin.qq.com")) {
                url = forceHttps(url);
            } else if (url.contains("a.mp.uc.cn") && url.contains("wm_aid")) {
                url = "http://ff.dayu.com/contents/origin/"+url.split("wm_aid=")[1].split("!!wm_id")[0]+"?biz_id=1002&_fetch_author=1&_fetch_incrs=1";
            }
            return url;
        } catch (Exception e) {
            // Best effort: an unexpected URL shape falls back to what we have so far.
            return url;
        }
    }

    /**
     * Upgrades an {@code http://} URL to {@code https://}. Only the scheme is
     * touched, so an "http" that appears later in the URL (e.g. inside a query
     * parameter) is left intact.
     */
    private static String forceHttps(String url) {
        String plainScheme = "http://";
        if (url.startsWith(plainScheme)) {
            return "https://" + url.substring(plainScheme.length());
        }
        return url;
    }

    /**
     * Extracts a title from the fetched page and decides whether the link is dead.
     *
     * @param result response body (HTML, or JSON for the UC and Zhihu API URLs)
     * @param url    the URL that was fetched
     * @return a bean carrying the URL, the delete verdict and the title, or
     *         {@code null} when the title is a single character or evaluation failed
     */
    public UrlLiveBean matchDel(String result,String url){
        try {
            Document doc = Jsoup.parse(result);
            String title = null;
            if (url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com") || url.contains("weixin.sogou.com")) {
                title = doc.select("h2.rich_media_title").text().replace(" ", "");
                for (String selector : WEIXIN_FALLBACK_SELECTORS) {
                    if (!title.isEmpty()) {
                        break;
                    }
                    title = doc.select(selector).text();
                }
            } else if (url.contains("kuaibao")) {
                title = doc.select("p.title").text().replace(" ", "");
            } else if (url.contains("chinadaily.com.cn")) {
                title = doc.select("p.style1").text().replace(" ", "");
            } else if (url.contains("baidu.com") || url.contains("hao123.com")) {
                title = doc.select("p#contaniner").text();
            } else if (url.contains("kanfanews.com")) {
                title = doc.select("p#tit").text();
            } else if (url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) {
                title = DELETED_TITLE;
            } else if (url.contains("a.mp.uc.cn")) {
                try {
                    JSONObject json = JSONObject.parseObject(result);
                    title = json.getJSONObject("data").getString("title");
                    if (Objects.isNull(title) || title.length() < 1) {
                        title = DELETED_TITLE;
                    }
                } catch (Exception e) {
                    logger.error(" uc 数据 json 转换失败", e);
                }
            } else if (url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) {
                title = DELETED_TITLE;
            } else if (url.contains("zhihu.com")) {
                JSONObject resultJson = JSONObject.parseObject(result);
                title = resultJson.getString("title") != null ? resultJson.getString("title") : resultJson.getString("message");
            } else if (url.contains("360kuai.com") && result.contains("您访问的文章走失了")) {
                // Same handling as UrlLiveCrawler: "404" is matched by the partial markers.
                title = "404";
            } else if (result.contains("文章没有找到哦") && url.contains("yidianzixun.com")) {
                // Same handling as UrlLiveCrawler.
                title = "文章未找到";
            }
            // No site-specific title: try the generic selectors in order.
            for (String selector : GENERIC_FALLBACK_SELECTORS) {
                if (Objects.nonNull(title) && title.length() >= 1) {
                    break;
                }
                title = doc.select(selector).text().replace(" ", "");
            }
            // Still nothing, or the body is under 200 characters: treat as removed.
            if (Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
                title = DELETED_TITLE;
            }
            if (title.length() > 1) {
                return new UrlLiveBean(url, isDelete(title), title);
            }
            return null;
        } catch (Exception e) {
            logger.error("matchDel failed, url: {}", url, e);
            return null;
        }
    }

    /**
     * Decides from a page title whether the page is gone.
     *
     * @param title extracted title
     * @return {@code true} when the title contains one of the partial markers or
     *         equals one of the exact markers
     */
    private boolean isDelete(String title) {
        return PARTIAL_DELETE_TITLES.stream().anyMatch(title::contains)
                || EXACT_DELETE_TITLES.stream().anyMatch(title::equals);
    }

    /**
     * Rewrites a Zhihu page URL to the matching api.zhihu.com URL
     * (answers, questions, articles). Other URLs are returned unchanged.
     */
    private static String treatZhihuUrl(String url) {
        if (url.contains("/answer/")) {
            return "https://api.zhihu.com/answers/" + url.replaceAll(".*/answer/", "");
        }
        if (url.contains("/question/")) {
            return "https://api.zhihu.com/questions/" + url.replaceAll(".*/question/", "");
        }
        if (url.contains("/p/")) {
            return "https://api.zhihu.com/articles/" + url.replaceAll(".*/p/", "");
        }
        return url;
    }
}
......@@ -14,6 +14,13 @@ import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
/**
*
* @ClassName: MediaSelfSource
* @Description: 自媒体号匹配
* @author 0xff
* @date 2019年12月5日 下午4:05:08
*/
public class MediaSelfSource {
private static Logger logger = LogManager.getLogger(MediaSelfSource.class);
......@@ -23,9 +30,9 @@ public class MediaSelfSource {
}
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER, 10000002L);
List<String> urlList = new ArrayList<>();
urlList.add("https://v.qq.com/x/page/g0904sm9wti.html");
urlList.add("https://www.360kuai.com/pc/922e4596800e5ef0a?cota=3&kuai_so=1&sign=360_e39369d1&refer_scene=so_3");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) {
System.out.println(b.toString());
......
......@@ -79,7 +79,7 @@ public class SourceForward {
}
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER, 10000002);
List<String> urlList = new ArrayList<>();
urlList.add("http://software.it168.com/a2019/0621/6005/000006005693.shtml");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
......
package com.zhiwei.source_forward.run;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
......@@ -9,14 +11,21 @@ import java.util.Map.Entry;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.async.TaskBoot;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.crawler.UrlLiveCrawler;
import com.zhiwei.source_forward.crawler.UrlLiveCrawlerNew;
import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import okhttp3.Request;
import okhttp3.Response;
/**
* @ClassName: URLLive
* @Description: 验证链接是否已删除
......@@ -27,6 +36,8 @@ public class URLLive {
private static Logger logger = LogManager.getLogger(URLLive.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().build();
/**
* @Title: verificationURLLive
* @author hero
......@@ -73,9 +84,9 @@ public class URLLive {
}
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER, 10000002);
List<String> urlList = new ArrayList<>();
urlList.add("http://weixin.sogou.com/api/share?timestamp=1569677503&signature=qIbwY*nI6KU9tBso4VCd8lYSesxOYgLcHX5tlbqlMR8N6flDHs4LLcFgRw7FjTAOm-VL1HR*9bkHkS0mWu-ZWc0ngS8ZsOYF7bq3mJCtAXbdMD8klA3ZAVBmYq2GVTJu2*fqwGdiiXgkPsBKht7mUN0o-rO8uYoVU6yfvrHHg29Hj1YBH4TG2Jtkz-zMRkQYKDOXTQgexDeAYfmgWMyar1GxXsDGbOjibPJZpqlwY-A=");
urlList.add("http://a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt#!wm_aid=038b8207b444418c845f43e4d2d3a754");
List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
for(UrlLiveBean b : u) {
......
......@@ -395,7 +395,7 @@ public class MatchSource {
source = document.select("p.article-info").select("a").text().trim();
}
if(source.length() < 1 && html.contains("window.__INITIAL_DATA__ =")) {
Matcher ma = Pattern.compile("window.__INITIAL_DATA__ =[\\s\\S]+?\\</script\\>").matcher(html);
Matcher ma = Pattern.compile("window.__INITIAL_DATA__ =[\\s\\S]+?}};").matcher(html);
if(ma.find()) {
String result = ma.group().replaceAll("window.__INITIAL_DATA__ =|\\</script\\>|", "").trim();
if(result.contains("window.autohomePVDDWhiteList")) {
......@@ -404,7 +404,7 @@ public class MatchSource {
JSONObject json = JSONObject.parseObject(result.trim().substring(0,result.lastIndexOf(";")));
source = json.getJSONObject("detail").getString("sec_src");
if(Objects.isNull(source) || source.length() < 1) {
source = json.getJSONObject("detail").getString("src");
source = json.getJSONObject("detail").getString("src");
}
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment