Commit ac11f629 by [zhangzhiwei]

Merge branch 'source-forward-chen' of…

Merge branch 'source-forward-chen' of http://git.zhiweidata.top/zhangzhiwei/source_forward.git into source-forward-chen
parents e7f2d78c b0fffd29
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId> <artifactId>source-forward</artifactId>
<version>0.0.7-SNAPSHOT</version> <version>0.0.9-SNAPSHOT</version>
<name>source-forward</name> <name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description> <description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......
...@@ -6,6 +6,8 @@ public class UrlLiveBean { ...@@ -6,6 +6,8 @@ public class UrlLiveBean {
private Integer isLive; private Integer isLive;
private String title;
public UrlLiveBean() { public UrlLiveBean() {
super(); super();
} }
...@@ -16,15 +18,31 @@ public class UrlLiveBean { ...@@ -16,15 +18,31 @@ public class UrlLiveBean {
this.isLive = isLive; this.isLive = isLive;
} }
public UrlLiveBean(String url, boolean isLive) { public UrlLiveBean(String url, Integer isLive,String title) {
super();
this.url = url;
this.isLive = isLive;
this.title = title;
}
public UrlLiveBean(String url, boolean isLive,String title) {
super(); super();
this.url = url; this.url = url;
this.title = title;
if(isLive) { if(isLive) {
this.isLive = 1; //已删除 this.isLive = 1; //已删除
}else { }else {
this.isLive = 0; this.isLive = 0;
} }
} }
/**
 * Returns the title stored on this bean.
 *
 * @return the current value of the {@code title} field; may be {@code null}
 *         when no title was ever assigned
 */
public String getTitle() {
return title;
}
/**
 * Replaces the title stored on this bean.
 *
 * @param title the new value for the {@code title} field
 */
public void setTitle(String title) {
this.title = title;
}
public String getUrl() { public String getUrl() {
return url; return url;
...@@ -44,7 +62,8 @@ public class UrlLiveBean { ...@@ -44,7 +62,8 @@ public class UrlLiveBean {
@Override @Override
public String toString() { public String toString() {
return "UrlLiveBean [url=" + url + ", isLive=" + isLive + "]"; return "UrlLiveBean [url=" + url + ", isLive=" + isLive + ", title="
+ title + "]";
} }
/** /**
......
...@@ -32,9 +32,14 @@ public class ContentCrawler { ...@@ -32,9 +32,14 @@ public class ContentCrawler {
*/ */
public MultiThreadingCounter submitTask(ContentDataCallback callback, public MultiThreadingCounter submitTask(ContentDataCallback callback,
String... urls) { String... urls) {
MultiThreadingCounter counter = new MultiThreadingCounter(15,TimeUnit.MINUTES,false); try {
start(counter, callback, urls); MultiThreadingCounter counter = new MultiThreadingCounter(15,TimeUnit.MINUTES,false);
return counter; start(counter, callback, urls);
return counter;
} catch (Exception e) {
logger.error(" exception {}",e);
return null;
}
} }
/** /**
......
...@@ -44,10 +44,15 @@ public class MediaSelfSourceCrawler { ...@@ -44,10 +44,15 @@ public class MediaSelfSourceCrawler {
* @return * @return
* @throws Exception * @throws Exception
*/ */
public MultiThreadingCounter submitTask(MediaSelfSourceDataCallBack callback,String... urls) throws Exception { public MultiThreadingCounter submitTask(MediaSelfSourceDataCallBack callback,String... urls) {
MultiThreadingCounter counter = new MultiThreadingCounter("任务======= ", 15,TimeUnit.SECONDS,true); try {
start(counter, callback, urls); MultiThreadingCounter counter = new MultiThreadingCounter("任务======= ", 15,TimeUnit.MINUTES,true);
return counter; start(counter, callback, urls);
return counter;
} catch (Exception e) {
logger.error(" exception {}",e);
return null;
}
} }
/** /**
...@@ -60,16 +65,15 @@ public class MediaSelfSourceCrawler { ...@@ -60,16 +65,15 @@ public class MediaSelfSourceCrawler {
private void start(MultiThreadingCounter counter,MediaSelfSourceDataCallBack callback, String... urls) { private void start(MultiThreadingCounter counter,MediaSelfSourceDataCallBack callback, String... urls) {
if (urls != null && urls.length > 0) { if (urls != null && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
counter.increase();
if (url != null) { if (url != null) {
try { try {
counter.increase();
search(counter, url, Attribution.of(url), callback); search(counter, url, Attribution.of(url), callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("搜索创建出错", e); logger.error("搜索创建出错", e);
} finally {
counter.reduce();
} }
} }
counter.reduce();
} }
} }
} }
......
...@@ -32,25 +32,29 @@ public class SourceForwardCrawler { ...@@ -32,25 +32,29 @@ public class SourceForwardCrawler {
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot();
private static List<String> sourceList = SourceData.getSourceList(); private static List<String> sourceList = SourceData.getSourceList();
public MultiThreadingCounter submitTask(SourceForwardDataCallBack callback,String... urls) throws Exception { public MultiThreadingCounter submitTask(SourceForwardDataCallBack callback,String... urls) {
MultiThreadingCounter counter = new MultiThreadingCounter(15,TimeUnit.MINUTES,false); try {
start(counter, callback, urls); MultiThreadingCounter counter = new MultiThreadingCounter(15,TimeUnit.MINUTES,false);
return counter; start(counter, callback, urls);
return counter;
} catch (Exception e) {
logger.error(" exception ", e);
return null;
}
} }
private void start(MultiThreadingCounter counter,SourceForwardDataCallBack callback, String... urls) { private void start(MultiThreadingCounter counter,SourceForwardDataCallBack callback, String... urls) {
if (urls != null && urls.length > 0) { if (urls != null && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
counter.increase();
if (url != null) { if (url != null) {
try { try {
counter.increase();
search(counter, url, Attribution.of(url), callback); search(counter, url, Attribution.of(url), callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("搜索创建出错", e); logger.error("搜索创建出错", e);
} finally { }
counter.reduce();
}
} }
counter.reduce();
} }
} }
} }
...@@ -100,10 +104,8 @@ public class SourceForwardCrawler { ...@@ -100,10 +104,8 @@ public class SourceForwardCrawler {
isforward = "未知"; isforward = "未知";
} }
}else if(attr.get().toString().contains("www.toutiao.com")){ }else if(attr.get().toString().contains("www.toutiao.com")){
if(body.contains("isOriginal")){ if(body.contains("isOriginal") && body.contains("isOriginal: true")){
if(body.contains("isOriginal: true")){ isforward = "原创";
isforward = "原创";
}
} }
}else{ }else{
channel = MatchChannel.verifyChannel(attr.get().toString()); channel = MatchChannel.verifyChannel(attr.get().toString());
......
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import static java.util.Objects.nonNull;
import java.util.Arrays;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
...@@ -8,6 +13,7 @@ import org.apache.logging.log4j.Logger; ...@@ -8,6 +13,7 @@ import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.async.MultiThreadingCounter; import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.core.RequestUtils;
...@@ -33,22 +39,29 @@ public class UrlLiveCrawler { ...@@ -33,22 +39,29 @@ public class UrlLiveCrawler {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class); private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot();
public MultiThreadingCounter submitTask(UrlLiveDataCallback callback,String... urls) throws Exception { public MultiThreadingCounter submitTask(UrlLiveDataCallback callback,String... urls) {
MultiThreadingCounter counter = new MultiThreadingCounter(20,TimeUnit.MINUTES,false); try {
start(counter, callback, urls); MultiThreadingCounter counter = new MultiThreadingCounter(10,TimeUnit.MINUTES,false);
return counter; start(counter, callback, urls);
return counter;
} catch (Exception e) {
logger.error(" 判断链接是否删除 {} ",e);
return null;
}
} }
private void start(MultiThreadingCounter counter,UrlLiveDataCallback callback, String... urls) { private void start(MultiThreadingCounter counter,UrlLiveDataCallback callback, String... urls) {
if (urls != null && urls.length > 0) { if (nonNull(urls) && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
if (url != null) { counter.increase();
if (nonNull(url)) {
try { try {
search(counter, url, Attribution.of(url,1), callback); search(counter, url, Attribution.of(url,1), callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("搜索创建出错:", e); logger.error("搜索创建出错:", e);
} }
} }
counter.reduce();
} }
} }
} }
...@@ -63,18 +76,23 @@ public class UrlLiveCrawler { ...@@ -63,18 +76,23 @@ public class UrlLiveCrawler {
} }
Request request = RequestUtils.wrapGet(url, headers); Request request = RequestUtils.wrapGet(url, headers);
counter.increase(); counter.increase();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, false).addListener(future -> { httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY).addListener(future -> {
try { try {
if (future.isSuccess()) { if (future.isSuccess()) {
Response response = future.result(); Response response = future.result();
try { try {
if(response.code() == 200) { if(response.code() == 200) {
parseHtml(response.body().string(), attr, callback); parseHtml(response.body().string(), attr, callback,counter);
}else { }else {
callBack(callback, attr, 1); if(attr.getCount() > 2) {
callBack(callback, attr, 1,String.valueOf(response.code()));
}else {
attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback);
}
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("解析出错", e); logger.error("解析出错 {}", e);
}finally { }finally {
if(response != null) { if(response != null) {
response.close(); response.close();
...@@ -82,10 +100,10 @@ public class UrlLiveCrawler { ...@@ -82,10 +100,10 @@ public class UrlLiveCrawler {
} }
} else { } else {
if(future.cause().getMessage().contains("status code: ")) { if(future.cause().getMessage().contains("status code: ")) {
callBack(callback, attr, 1); callBack(callback, attr, 1,null);
}else { }else {
if(attr.getCount() > 3) { if(attr.getCount() > 3) {
callBack(callback, attr, -1); callBack(callback, attr, -1,null);
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage()); logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
}else { }else {
attr.AddCount(); attr.AddCount();
...@@ -94,7 +112,7 @@ public class UrlLiveCrawler { ...@@ -94,7 +112,7 @@ public class UrlLiveCrawler {
} }
} }
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); logger.error(" 数据是否删除 采集出错 {} ",e);
}finally { }finally {
counter.reduce(); counter.reduce();
} }
...@@ -102,8 +120,13 @@ public class UrlLiveCrawler { ...@@ -102,8 +120,13 @@ public class UrlLiveCrawler {
return counter; return counter;
} }
private void callBack(UrlLiveDataCallback callback,Attribution attr,int i) { private void callBack(UrlLiveDataCallback callback,Attribution attr,int i,String title) {
UrlLiveBean ulb = new UrlLiveBean(attr.getAttr().toString(), i); UrlLiveBean ulb = null;
if(i == 1) {
ulb = new UrlLiveBean(attr.getAttr().toString(), true, title);
}else {
ulb = new UrlLiveBean(attr.getAttr().toString(), i, title);
}
if (callback == null) { if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据"); logger.warn("DataCallback 对象为 null,无法保存数据");
} else { } else {
...@@ -112,33 +135,39 @@ public class UrlLiveCrawler { ...@@ -112,33 +135,39 @@ public class UrlLiveCrawler {
} }
private String dealUrl(String url) { private String dealUrl(String url) {
if(url.contains("toutiao.com")) { try {
try { if(url.contains("toutiao.com")) {
if(url.contains("www.toutiao.com")) { try {
if(url.contains("www.toutiao.com")) {
}else {
url = url.replace("toutiao.com", "www.toutiao.com"); }else {
url = url.replace("toutiao.com", "www.toutiao.com");
}
if(url.contains("https")) {
}else {
url = url.replace("http", "https");
}
if(url.contains("group")) {
url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
}
} catch (Exception e) {
logger.error("url 解析出错 ",e);
return url;
} }
}else if(url.contains("mp.weixin.qq.com")) {
if(url.contains("https")) { if(url.contains("https")) {
}else { }else {
url = url.replace("http", "https"); url = url.replace("http", "https");
} }
if(url.contains("group")) { }else if(url.contains("a.mp.uc.cn") && url.contains("wm_aid")) {
url = "https://www.toutiao.com/a" + url.split("/")[4] + "/"; url = "http://ff.dayu.com/contents/origin/"+url.split("wm_aid=")[1].split("!!wm_id")[0]+"?biz_id=1002&_fetch_author=1&_fetch_incrs=1";
}
} catch (Exception e) {
logger.error("url 解析出错 ",e);
return url;
}
}else if(url.contains("mp.weixin.qq.com")) {
if(url.contains("https")) {
}else {
url = url.replace("http", "https");
} }
return url;
} catch (Exception e) {
return url;
} }
return url;
} }
/** /**
...@@ -149,19 +178,21 @@ public class UrlLiveCrawler { ...@@ -149,19 +178,21 @@ public class UrlLiveCrawler {
* @param callback * @param callback
*/ */
private void parseHtml(String html, Attribution attr, private void parseHtml(String html, Attribution attr,
UrlLiveDataCallback callback) { UrlLiveDataCallback callback,MultiThreadingCounter counter) {
/***验证网页是否能够连通*/
boolean f = true;
try {
f = matchDel(html,attr.getAttr().toString());
} catch (Exception e) {
logger.error("数据判断出错 ",e);
}
UrlLiveBean ulb = new UrlLiveBean(attr.getAttr().toString(), f);
if (callback == null) { if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据"); logger.warn("DataCallback 对象为 null,无法保存数据");
} else { } else {
callback.onData(ulb, attr); UrlLiveBean ulb = matchDel(html,attr,attr.getAttr().toString());
if(Objects.nonNull(ulb)) {
callback.onData(ulb, attr);
}else {
if(attr.getCount() > 3) {
callBack(callback, attr, -1,null);
}else {
attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback);
}
}
} }
} }
...@@ -173,22 +204,101 @@ public class UrlLiveCrawler { ...@@ -173,22 +204,101 @@ public class UrlLiveCrawler {
* @param @return 设定文件 * @param @return 设定文件
* @return boolean 返回类型 * @return boolean 返回类型
*/ */
public boolean matchDel(String result,String url){ public UrlLiveBean matchDel(String result,Attribution attr,String url){
Document doc = Jsoup.parse(result); Document doc = Jsoup.parse(result);
String title = null; String title = null;
if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com")){ if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com")){
title = doc.select("h2.rich_media_title").text().replaceAll(" ", ""); title = doc.select("h2.rich_media_title").text().replaceAll(" ", "");
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("p.title").text();
}
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("h3.msg-title").text();
}
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("div.global_error_msg.warn").text();
}
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("p.tips").text();
}
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("h2").text();
}
}else if(url.contains("kuaibao")){ }else if(url.contains("kuaibao")){
title = doc.select("p.title").text().replaceAll(" ", "");; title = doc.select("p.title").text().replaceAll(" ", "");
}else { }else if(url.contains("chinadaily.com.cn")){
title = doc.select("title").text().replaceAll(" ", "");; title = doc.select("p.style1").text().replaceAll(" ", "");
} }else if(url.contains("baidu.com") || url.contains("hao123.com")) {
if(title!=null && !title.equals("")){ title = doc.select("p#contaniner").text();
if(title.contains("未知错误") || title.contains("Object moved") || title.contains("404") || title.contains("页面没有找到") || title.contains("页面未找到") || title.contains("301 Moved Permanently")){ }else if(url.contains("kanfanews.com")) {
return true; title = doc.select("p#tit").text();
}else if(url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) {
title = "网页已删除";
}else if(url.contains("a.mp.uc.cn")) {
try {
JSONObject json = JSONObject.parseObject(result);
title = json.getJSONObject("data").getString("title");
if(Objects.isNull(title) || title.length() < 1) {
title = "网页已删除";
}
} catch (Exception e) {
logger.error(" uc 数据 json 转换失败", e);
} }
}else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) {
title = "网页已删除";
}
//若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("div.adiv > p > span").text().replaceAll(" ", "");
}
//若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("title").text().replaceAll(" ", "");
}
//若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("h1").text().replaceAll(" ", "");
}
//若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
title = "网页已删除";
}
if(Objects.nonNull(title) && title.length() > 1){
return new UrlLiveBean(attr.getAttr().toString(), isDelete(title),title);
} else {
return null;
} }
return false; }
/**
*
* @Description 标题判断
* @param title
* @return
*/
private boolean isDelete(String title) {
List<String> eList = Arrays.asList("系统出错","该内容已被发布者删除","网页已删除"
,"此帐号已自主注销,内容无法查看","页面提示","正在维护中"
,"此文章被第三方评估为不实信息","财经头条","知识100题","502BadGateway"
,"提示信息","跳转页","跳转中...","此帐号在冻结期,内容无法查看","东北新闻网"
,"百度一下,你就知道","帐号已迁移","手机百度","内容被删除","亚博国际|首页"
,"中国软件网","云广网","新浪首页","文章暂时找不到了","-法易网"
,"【一点资讯】www.yidianzixun.com","错误页面","网站暂停通知","【快资讯】你的专属资讯平台"
,"百度新闻——全球最大的中文新闻平台","以上文章由以下机构判定为不实信息","该公众号已迁移"
,"财经网-CAIJING.COM.CN","蚂蚁资讯","参数错误","时尚头条_YOKA时尚网","该文章已经被删除"
,"网易","链接已过期","找不到页面","今晚网","该文章已被删除");
List<String> cList = Arrays.asList("提示信息-","此内容因违规无法查看","微信公众号不存在"
,"此内容被投诉且经审核涉嫌侵权,无法查看","thepageyourequestedwasnotfound","未知错误"
,"Objectmoved","404","页面没有找到","页面未找到","301MovedPermanently","加载异常",
"此帐号已被屏蔽, 内容无法查看","链接不存在");
return cList.stream().anyMatch(title::contains) || eList.stream().anyMatch(title::equals);
} }
// /** // /**
...@@ -200,8 +310,7 @@ public class UrlLiveCrawler { ...@@ -200,8 +310,7 @@ public class UrlLiveCrawler {
// * @time 2016年6月3日上午9:54:00 // * @time 2016年6月3日上午9:54:00
// * @return boolean // * @return boolean
// */ // */
// private boolean rulerYaoyan(Document doc) // private boolean rulerYaoyan(Document doc){
// {
// boolean flg = false; // boolean flg = false;
// if ("谣言".equals(doc.select(".pic_rumor").text())) // if ("谣言".equals(doc.select(".pic_rumor").text()))
// { // {
...@@ -219,24 +328,16 @@ public class UrlLiveCrawler { ...@@ -219,24 +328,16 @@ public class UrlLiveCrawler {
// * @time 2016年6月3日上午9:59:54 // * @time 2016年6月3日上午9:59:54
// * @return boolean // * @return boolean
// */ // */
// private boolean rulerWeigui(Document doc) // private boolean rulerWechat(Document doc)
// { // {
// boolean flg = false; // boolean flg = false;
// if ((doc.select("p.title").text()).contains("此内容因违规无法查看") || doc.select("p.title").text().contains("此帐号在冻结期,内容无法查看")) // if ((doc.select("h3.msg-title").text()).contains("此内容被投诉且经审核涉嫌侵权,无法查看") || (doc.select("p.title").text()).contains("此内容因违规无法查看") || doc.select("p.title").text().contains("此帐号在冻结期,内容无法查看"))
// { // {
// flg = true; // flg = true;
// } // }
// return flg; // return flg;
// } // }
// //
// private boolean rulerWechatWeigui(Document doc) {
// boolean flg = false;
// if ((doc.select("h3.msg-title").text()).contains("此内容被投诉且经审核涉嫌侵权,无法查看"))
// {
// flg = true;
// }
// return flg;
// }
// //
// /** // /**
// * // *
......
...@@ -7,8 +7,6 @@ import java.util.List; ...@@ -7,8 +7,6 @@ import java.util.List;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean; import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution; import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler; import com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler;
...@@ -19,18 +17,17 @@ public class MediaSelfSource { ...@@ -19,18 +17,17 @@ public class MediaSelfSource {
private static Logger logger = LogManager.getLogger(MediaSelfSource.class); private static Logger logger = LogManager.getLogger(MediaSelfSource.class);
public static List<MediaSelfSourceBean> getMediaSelfSource(List<String> urlList) { public static List<MediaSelfSourceBean> getMediaSelfSource(List<String> urlList) {
List<MediaSelfSourceBean> list = MediaSelfSourceCrawlerThread.getMediaSelfSource(urlList); return MediaSelfSourceCrawlerThread.getMediaSelfSource(urlList);
return list;
} }
public static void main(String[] args) { public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER); // ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List<String> urlList = new ArrayList<>(); // List<String> urlList = new ArrayList<>();
urlList.add("http://sh.qihoo.com/pc/91d1d565fe552fa1e?sign=360_e39369d1"); // urlList.add("http://sh.qihoo.com/pc/91d1d565fe552fa1e?sign=360_e39369d1");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList); // List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) { // for(MediaSelfSourceBean b : u) {
System.out.println(b.toString()); // System.out.println(b.toString());
} // }
} }
static class MediaSelfSourceCrawlerThread extends Thread{ static class MediaSelfSourceCrawlerThread extends Thread{
...@@ -50,7 +47,7 @@ public class MediaSelfSource { ...@@ -50,7 +47,7 @@ public class MediaSelfSource {
}; };
crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await(); crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
}catch (Exception e){ }catch (Exception e){
e.printStackTrace(); logger.error(" 网媒自媒体号 判断 {} ",e);
} }
return list; return list;
} }
......
...@@ -2,7 +2,6 @@ package com.zhiwei.source_forward.run; ...@@ -2,7 +2,6 @@ package com.zhiwei.source_forward.run;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
...@@ -10,11 +9,9 @@ import java.util.Map.Entry; ...@@ -10,11 +9,9 @@ import java.util.Map.Entry;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.SourceForwardBean; import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution; import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.crawler.SourceForwardCrawler; import com.zhiwei.source_forward.crawler.SourceForwardCrawler;
import com.zhiwei.source_forward.run.MediaSelfSource.MediaSelfSourceCrawlerThread;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack; import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
/** /**
...@@ -28,82 +25,6 @@ public class SourceForward { ...@@ -28,82 +25,6 @@ public class SourceForward {
private static Logger logger = LogManager.getLogger(SourceForward.class); private static Logger logger = LogManager.getLogger(SourceForward.class);
/** /**
* @Title: getMediaSelfSource
* @author hero
* @Description: 根据链接匹配自媒体号名称
* @param @param dataMap
* @param @return 设定文件
* @return Map<String,Map<String,Object>> 返回类型
*/
public static Map<String,Map<String,Object>> getMediaSelfSource(Map<String,Map<String,Object>> dataMap){
    // The keys of dataMap are the URLs handed to the crawler.
    List<String> urls = new ArrayList<>(dataMap.keySet());
    List<MediaSelfSourceBean> crawled = MediaSelfSourceCrawlerThread.getMediaSelfSource(urls);
    for (MediaSelfSourceBean bean : crawled) {
        String key = bean.getUrl();
        if (!dataMap.containsKey(key)) {
            // The bean's URL is not a key of dataMap: nothing to merge into.
            continue;
        }
        // Copy the crawled account name and channel into that URL's row.
        Map<String,Object> row = dataMap.get(key);
        row.put("自媒体号", bean.getMediaself());
        row.put("频道", bean.getChannel());
        dataMap.put(key, row);
    }
    return dataMap;
}
/**
 * Maps each given URL to the self-media account name found for it.
 *
 * <p>Every URL in {@code urlList} is first registered with a {@code null}
 * value; the URLs are then passed to
 * {@code MediaSelfSourceCrawlerThread.getMediaSelfSource}, and for each
 * returned bean whose URL is a registered key the value is overwritten with
 * {@code getMediaself()}.
 *
 * @param urlList the URLs to look up
 * @return a map keyed by URL; the value stays {@code null} for URLs that no
 *         returned bean refers to
 */
public static Map<String,String> getMediaSelfSource(List<String> urlList){
    Map<String,String> accountByUrl = new HashMap<>();
    // Register every requested URL up front so unmatched ones map to null.
    urlList.forEach(link -> accountByUrl.put(link, null));
    List<MediaSelfSourceBean> crawled = MediaSelfSourceCrawlerThread.getMediaSelfSource(urlList);
    for (MediaSelfSourceBean bean : crawled) {
        String key = bean.getUrl();
        if (!accountByUrl.containsKey(key)) {
            continue;
        }
        accountByUrl.put(key, bean.getMediaself());
    }
    return accountByUrl;
}
/**
 * Looks up the self-media account name for a single URL.
 *
 * <p>The URL is wrapped in a one-element list and passed to
 * {@code MediaSelfSourceCrawlerThread.getMediaSelfSource}; only the first
 * returned bean is consulted.
 *
 * @param url the URL to look up
 * @return {@code getMediaself()} of the first returned bean, or {@code null}
 *         when the returned list is empty
 */
public static String getMediaSelfSource(String url){
    List<String> singleUrl = new ArrayList<>();
    singleUrl.add(url);
    List<MediaSelfSourceBean> crawled = MediaSelfSourceCrawlerThread.getMediaSelfSource(singleUrl);
    if (crawled.isEmpty()) {
        return null;
    }
    return crawled.get(0).getMediaself();
}
/**
* @Title: getSourceForward * @Title: getSourceForward
* @author hero * @author hero
* @Description: 验证文章是否转发 * @Description: 验证文章是否转发
...@@ -117,9 +38,7 @@ public class SourceForward { ...@@ -117,9 +38,7 @@ public class SourceForward {
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){ for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
urlList.add(entry.getKey()); urlList.add(entry.getKey());
} }
System.out.println(urlList.size());
List<SourceForwardBean> dataList = SourceForwardCrawlerThread.getSourceForward(urlList); List<SourceForwardBean> dataList = SourceForwardCrawlerThread.getSourceForward(urlList);
System.out.println(dataList.size());
for(SourceForwardBean sfb : dataList){ for(SourceForwardBean sfb : dataList){
String url = sfb.getUrl(); String url = sfb.getUrl();
String root_source = sfb.getRoot_source(); String root_source = sfb.getRoot_source();
...@@ -143,7 +62,6 @@ public class SourceForward { ...@@ -143,7 +62,6 @@ public class SourceForward {
dataMap.put(url, data); dataMap.put(url, data);
} }
} }
System.out.println("success");
return dataMap; return dataMap;
} }
...@@ -161,7 +79,7 @@ public class SourceForward { ...@@ -161,7 +79,7 @@ public class SourceForward {
public static void main(String[] args) { public static void main(String[] args) {
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER); // ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// List<String> urlList = new ArrayList<>(); // List<String> urlList = new ArrayList<>();
// urlList.add("http://www.toutiao.com/a6452936157751968013/"); // urlList.add("https://www.toutiao.com/a6634320415839748621");
// List<SourceForwardBean> da = SourceForward.getSourceForward(urlList); // List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
// for(SourceForwardBean sfb : da) { // for(SourceForwardBean sfb : da) {
// System.out.println(sfb.toString()); // System.out.println(sfb.toString());
...@@ -185,7 +103,7 @@ public class SourceForward { ...@@ -185,7 +103,7 @@ public class SourceForward {
}; };
crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await(); crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
}catch (Exception e){ }catch (Exception e){
e.printStackTrace(); logger.error(" 来源判断 出错 {} ",e);
} }
return list; return list;
} }
......
...@@ -6,6 +6,9 @@ import java.util.List; ...@@ -6,6 +6,9 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.source_forward.bean.UrlLiveBean; import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution; import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.crawler.UrlLiveCrawler; import com.zhiwei.source_forward.crawler.UrlLiveCrawler;
...@@ -19,6 +22,8 @@ import com.zhiwei.source_forward.util.UrlLiveDataCallback; ...@@ -19,6 +22,8 @@ import com.zhiwei.source_forward.util.UrlLiveDataCallback;
*/ */
public class URLLive { public class URLLive {
private static Logger logger = LogManager.getLogger(URLLive.class);
/** /**
* @Title: verificationURLLive * @Title: verificationURLLive
* @author hero * @author hero
...@@ -46,6 +51,7 @@ public class URLLive { ...@@ -46,6 +51,7 @@ public class URLLive {
}else if(i == 0) { }else if(i == 0) {
map.put("是否删除", false); map.put("是否删除", false);
} }
map.put("title", ub.getTitle());
dataMap.put(url, map); dataMap.put(url, map);
} }
} }
...@@ -60,18 +66,20 @@ public class URLLive { ...@@ -60,18 +66,20 @@ public class URLLive {
*/ */
public static List<UrlLiveBean> verificationURLLive(List<String> urlList){ public static List<UrlLiveBean> verificationURLLive(List<String> urlList){
//启动验证链接是否有效程序程序 //启动验证链接是否有效程序程序
List<UrlLiveBean> dataList = UrlLiveCrawlerThread.getUrlLiveCrawle(urlList); return UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
return dataList;
} }
public static void main(String[] args) { // public static void main(String[] args) {
List<String> urlList = new ArrayList<>(); // ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
urlList.add("http://www.teso.cc/html/zixun/201606/233848.html"); // List<String> urlList = new ArrayList<>();
List<UrlLiveBean> u = URLLive.verificationURLLive(urlList); // urlList.add("http://sh.qihoo.com/mob/transcoding?sign=360_e39369d1&n=10&pg=41&u=84c80ad777cd9a41152b4fd9c44f96e2&gzh=3093075895&news_sdk_version=&sqid=&_=1545026725607&callback=jsonp75&url=http%3A%2F%2Fzm.news.so.com%2F708e22872ce43ca08eec2a1fc57c6897&check=e0fae47326e7916f&ucheck=75e961d9583cfebe81a39e2dd972b0aa&uid=84c80ad777cd9a41152b4fd9c44f96e2&360newsdetail=1&c=detail&apiflag=detail&articlety=zmt");
for(UrlLiveBean b : u) { //
System.out.println(b.toString()); //
} // List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
} // for(UrlLiveBean b : u) {
// System.out.println(b.toString());
// }
// }
static class UrlLiveCrawlerThread extends Thread{ static class UrlLiveCrawlerThread extends Thread{
...@@ -90,7 +98,7 @@ public class URLLive { ...@@ -90,7 +98,7 @@ public class URLLive {
}; };
crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await(); crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
}catch (Exception e){ }catch (Exception e){
e.printStackTrace(); logger.error(" 数据采集运行有问题 {} ", e);
} }
return list; return list;
} }
......
...@@ -130,22 +130,23 @@ public class MatchSource { ...@@ -130,22 +130,23 @@ public class MatchSource {
}else if(html.contains("screen_name:")){ }else if(html.contains("screen_name:")){
source = html.split("screen_name:'")[1].split("',")[0].trim(); source = html.split("screen_name:'")[1].split("',")[0].trim();
} }
if(source!=null && !source.equals("")){ if(source!=null && source.length()>1){
source = "今日头条-" + source; source = "今日头条-" + source;
} }
}else if(url.contains("sohu.com")){ }else if(url.contains("sohu.com")){
//搜狐自媒体号 //搜狐自媒体号
if(html.contains("<meta name=\"mediaid\"")){ if(html.contains("<meta name=\"mediaid\"")){
source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim(); source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
if(source!=null && !source.equals("")){ if(source!=null && source.length()>1){
source = "搜狐-" + source; source = "搜狐-" + source;
} }
} }
}else if(url.contains("a.mini.eastday.com")){ }else if(url.contains("a.mini.eastday.com")){
//处理东方头条网-自媒体号匹配 //处理东方头条网-自媒体号匹配
source = document.select("[class=\"share_cnt_p clearfix\"]").select("div.fl").select("i").get(1).text().trim(); source = document.select("[class=\"share_cnt_p clearfix\"]").select("div.fl").select("i").get(1).text().trim();
if(source!=null && !source.equals("")){ if(source!=null && source.length()>1){
source = "东方头条-" + source; source = "东方头条-" + source;
} }
}else if(url.contains("sh.qihoo.com")){ }else if(url.contains("sh.qihoo.com")){
//今日报点解析 //今日报点解析
...@@ -153,45 +154,83 @@ public class MatchSource { ...@@ -153,45 +154,83 @@ public class MatchSource {
if(source.length() < 2) { if(source.length() < 2) {
source = document.select("p.article-info").select("a").text().trim(); source = document.select("p.article-info").select("a").text().trim();
} }
if(source!=null && !source.equals("")){ if(source!=null && source.length()>1){
source = "快资讯-" + source; source = "快资讯-" + source;
} }
}else if(url.contains("cj.sina.com.cn")){ }else if(url.contains("cj.sina.com.cn")){
//新浪财经头条号 //新浪财经头条号
if(html.contains("<meta name=\"mediaid\"")){ if(html.contains("<meta name=\"mediaid\"")){
source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim(); source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
if(source!=null && !source.equals("")){ if(source!=null && source.length()>1){
source = "财经头条-" + source; source = "财经头条-" + source;
} }
} }
}else if(url.contains("baijiahao.baidu.com")){ }else if(url.contains("baijiahao.baidu.com")){
//百度百家 //百度百家
source = document.select("div.author").select("div.author-txt").select("p.author-name").text().trim(); source = document.select("p.author-name").first().text().trim();
if(source!=null && !source.equals("")){ if(source!=null && source.length()>1){
source = "百度百家-" + source; source = "百度百家-" + source;
} }
}else if(url.contains("yidianzixun.com")){ }else if(url.contains("yidianzixun.com")){
//一点资讯 //一点资讯
if(html.contains("related_wemedia")){ if(html.contains("related_wemedia")){
source = html.split("media_name\":\"")[1].split("\",\"")[0].trim(); source = html.split("media_name\":\"")[1].split("\",\"")[0].trim();
if(source!=null && !source.equals("")){ if(source!=null && source.length()>1){
source = "一点资讯-" + source; source = "一点资讯-" + source;
} }
}else{ }else{
source = html.split("source\":\"")[1].split("\",\"")[0]; source = html.split("source\":\"")[1].split("\",\"")[0];
} }
}else if(url.contains("news.bitauto.com")){ }else if(url.contains("news.bitauto.com")){
//易车网
source = document.select("[class=\"gz-box clearfix\"]").select("div.txt-box") source = document.select("[class=\"gz-box clearfix\"]").select("div.txt-box")
.select("p.p-n").select("a").text(); .select("p.p-n").select("a").text();
if(source!=null && !source.equals("")){ if(source!=null && source.length()>1){
source = "易车网-" + source; source = "易车网-" + source;
} }
}else if(url.contains("chejiahao.autohome.com.cn")){ }else if(url.contains("chejiahao.autohome.com.cn")){
//汽车之家
source = document.select("div.authorMes").select("[class=\"name text-overflow\"]") source = document.select("div.authorMes").select("[class=\"name text-overflow\"]")
.select("a").text(); .select("a").text();
if(source!=null && !source.equals("")){ if(source!=null && source.length()>1){
source = "汽车之家-" + source; source = "汽车之家-" + source;
} }
}else if(url.contains("item.btime.com")){
//北京时间
source = document.select("a.author").text();
if(source!=null && source.length()>1){
source = "北京时间-" + source;
}
}else if(url.contains("new.qq.com/omn/")){
//腾讯网-企鹅号
source = html.split("media\":\"")[1].split("\",\"")[0];
if(source!=null && source.length()>1){
source = "企鹅号-" + source;
}
}else if(url.contains("feng.ifeng.com")){
//凤凰网-大风号
source = html.split("source\":\"")[1].split("\",\"")[0];
if(source!=null && source.length()>1){
source = "大风号-" + source;
}
}else if(url.contains("dy.163.com")){
//网易订阅-网易号
source = document.select("div.colum_info>h4").text();
if(source!=null && source.length()>1){
source = "网易号-" + source;
}
}else if(url.contains("qctt.cn")){
//汽车头条
source = document.select("div.part2>a").text();
if(source!=null && source.length()>1){
source = "汽车头条-" + source;
}
}else if(url.contains("maiche.com")){
//买车网
source = document.select("div.info-left > div:nth-child(2) > span > a").text();
if(source!=null && source.length()>1){
source = "买车网-" + source;
}
}else if(url.contains("3g.163.com")){ }else if(url.contains("3g.163.com")){
source = document.select("div.info").select("[class=\"source js-source\"]") source = document.select("div.info").select("[class=\"source js-source\"]")
.text(); .text();
...@@ -201,7 +240,6 @@ public class MatchSource { ...@@ -201,7 +240,6 @@ public class MatchSource {
} }
return source; return source;
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace();
return null; return null;
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment