Commit 4a1a7343 by yangchen

提交本地修改

parent e18492f6
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.0.3-SNAPSHOT</version>
<version>0.0.5-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......@@ -24,12 +24,7 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.0.5-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>proxy-client</artifactId>
<version>0.0.2-RELEASE</version>
<version>0.0.9-SNAPSHOT</version>
</dependency>
</dependencies>
......
......@@ -45,7 +45,8 @@ public class SourceForwardBean {
@Override
public String toString() {
return "SourceForwardBean [url=" + url + ", channel=" + channel
+ ", root_source=" + root_source + "]";
+ ", root_source=" + root_source + ", isforward=" + isforward
+ "]";
}
public SourceForwardBean(String url, String channel, String root_source,
......
......@@ -164,17 +164,17 @@ public class ContentExtractor {
news.setUrl(doc.baseUri());
}
// try {
// news.setTime(getTime(contentElement));
// } catch (Exception ex) {
// LOG.info("news title extraction failed", ex);
// }
try {
news.setTime(getTime(contentElement));
} catch (Exception ex) {
LOG.info("news title extraction failed", ex);
}
// try {
// news.setTitle(getTitle(contentElement));
// } catch (Exception ex) {
// LOG.info("title extraction failed", ex);
// }
try {
news.setTitle(getTitle(contentElement));
} catch (Exception ex) {
LOG.info("title extraction failed", ex);
}
return news;
}
......
package com.zhiwei.source_forward.crawler;
import java.util.concurrent.TimeUnit;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpRequestBuilder;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent;
import com.zhiwei.source_forward.util.ProxyClientUtil;
import okhttp3.Request;
import okhttp3.Response;
......@@ -18,7 +20,8 @@ import okhttp3.Response;
public class ContentCrawler {
private static Logger logger = LogManager.getLogger(ContentCrawler.class);
private static HttpBoot httpBoot = new HttpBoot();
/**
*
* @Description 链接传入 并 返回采集完信号
......@@ -29,7 +32,7 @@ public class ContentCrawler {
*/
public MultiThreadingCounter submitTask(ContentDataCallback callback,
String... urls) throws Exception {
MultiThreadingCounter counter = new MultiThreadingCounter();
MultiThreadingCounter counter = new MultiThreadingCounter(20,TimeUnit.MINUTES,false);
start(counter, callback, urls);
return counter;
}
......@@ -70,7 +73,7 @@ public class ContentCrawler {
logger.info("当前处理 URL: {}", url);
Request request = HttpRequestBuilder.newGetRequest(url, null);
counter.increase();
HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false).addListener(future -> {
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).addListener(future -> {
if (future.isSuccess()) {
Response response = future.result();
try {
......@@ -101,6 +104,7 @@ public class ContentCrawler {
try {
if (response.isSuccessful()) {
String html = response.body().string();
System.out.println(html);
content = MatchContent.matchContent(attr.get().toString(),
html);
}
......
package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
......@@ -9,13 +12,13 @@ import org.jsoup.nodes.Node;
import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpRequestBuilder;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import com.zhiwei.source_forward.util.ProxyClientUtil;
import okhttp3.Request;
import okhttp3.Response;
......@@ -31,6 +34,7 @@ import okhttp3.Response;
public class MediaSelfSourceCrawler {
private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class);
private static HttpBoot httpBoot = new HttpBoot();
/**
*
......@@ -41,7 +45,7 @@ public class MediaSelfSourceCrawler {
* @throws Exception
*/
public MultiThreadingCounter submitTask(MediaSelfSourceDataCallBack callback,String... urls) throws Exception {
MultiThreadingCounter counter = new MultiThreadingCounter();
MultiThreadingCounter counter = new MultiThreadingCounter("任务======= ", 10,TimeUnit.SECONDS,true);
start(counter, callback, urls);
return counter;
}
......@@ -81,20 +85,27 @@ public class MediaSelfSourceCrawler {
*/
private MultiThreadingCounter search(MultiThreadingCounter counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", url);
Request request = HttpRequestBuilder.newGetRequest(url, null);
Map<String,Object> map = new HashMap<>();
if(url.contains("toutiao.com")) {
map.put("referer", url);
}
Request request = RequestUtils.wrapGet(url, map);
counter.increase();
HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false).addListener(future -> {
if (future.isSuccess()) {
Response response = future.result();
try {
parseHtml(response, attr, callback);
} catch (Exception e) {
logger.error("解析出错", e);
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).addListener(future -> {
try {
if (future.isSuccess()) {
Response response = future.result();
try {
parseHtml(response, attr, callback);
} catch (Exception e) {
logger.error("解析出错", e);
}
} else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
}
} else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
} finally {
counter.reduce();
}
counter.reduce();
});
return counter;
}
......
package com.zhiwei.source_forward.crawler;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
......@@ -11,11 +12,11 @@ import org.jsoup.nodes.Node;
import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpRequestBuilder;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.ProxyClientUtil;
import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
......@@ -26,10 +27,11 @@ public class SourceForwardCrawler {
private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class);
private static HttpBoot httpBoot = new HttpBoot();
private static List<String> sourceList = SourceData.getSourceList();
public MultiThreadingCounter submitTask(SourceForwardDataCallBack callback,String... urls) throws Exception {
MultiThreadingCounter counter = new MultiThreadingCounter();
MultiThreadingCounter counter = new MultiThreadingCounter(20,TimeUnit.MINUTES,false);
start(counter, callback, urls);
return counter;
}
......@@ -55,7 +57,7 @@ public class SourceForwardCrawler {
logger.info("当前处理 URL: {}", url);
Request request = HttpRequestBuilder.newGetRequest(url, null);
counter.increase();
HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false).addListener(future -> {
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).addListener(future -> {
if (future.isSuccess()) {
Response response = future.result();
try {
......@@ -81,7 +83,9 @@ public class SourceForwardCrawler {
Document document = Jsoup.parse(response.body().string());
if(attr.get().toString().contains("mp.weixin.qq.com")){
isforward = document.select("div#meta_content").select("span#copyright_logo").text();
if(!"原创".equals(isforward)){
if(isforward.contains("原创")){
isforward = "原创";
}else {
isforward = "未知";
}
}else{
......
package com.zhiwei.source_forward.crawler;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
......@@ -11,9 +12,9 @@ import org.jsoup.nodes.Node;
import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpRequestBuilder;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.ProxyClientUtil;
import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import okhttp3.Request;
......@@ -30,9 +31,10 @@ import okhttp3.Response;
public class UrlLiveCrawler {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
private static HttpBoot httpBoot = new HttpBoot();
public MultiThreadingCounter submitTask(UrlLiveDataCallback callback,String... urls) throws Exception {
MultiThreadingCounter counter = new MultiThreadingCounter();
MultiThreadingCounter counter = new MultiThreadingCounter(20,TimeUnit.MINUTES,false);
start(counter, callback, urls);
return counter;
}
......@@ -57,36 +59,41 @@ public class UrlLiveCrawler {
logger.info("当前处理 URL: {}", url);
Request request = HttpRequestBuilder.newGetRequest(url, null);
counter.increase();
HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false,false).addListener(future -> {
if (future.isSuccess()) {
Response response = future.result();
try {
if(response.code() == 200) {
parseHtml(response.body().string(), attr, callback);
}else {
callBack(callback, attr, 1);
}
} catch (Exception e) {
logger.error("解析出错", e);
}finally {
if(response != null) {
response.close();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, false).addListener(future -> {
try {
if (future.isSuccess()) {
Response response = future.result();
try {
if(response.code() == 200) {
parseHtml(response.body().string(), attr, callback);
}else {
callBack(callback, attr, 1);
}
} catch (Exception e) {
logger.error("解析出错", e);
}finally {
if(response != null) {
response.close();
}
}
}
} else {
if(future.cause().getMessage().contains("status code: 301")) {
callBack(callback, attr, 1);
}else {
if(attr.getCount() > 3) {
callBack(callback, attr, -1);
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
} else {
if(future.cause().getMessage().contains("status code: ")) {
callBack(callback, attr, 1);
}else {
attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback);
if(attr.getCount() > 3) {
callBack(callback, attr, -1);
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
}else {
attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback);
}
}
}
} catch (Exception e) {
e.printStackTrace();
}finally {
counter.reduce();
}
counter.reduce();
});
return counter;
}
......
......@@ -57,7 +57,7 @@ public class ContentMatch {
public static void main(String[] args) {
List<String> urlList = new ArrayList<>();
urlList.add("https://mp.weixin.qq.com/s?src=11&timestamp=1535697915&ver=1093&signature=HNXpB8owyjfkyX-p2UDMga5R-qEpgjEpRQAjVmy7xqdrfsjZNdW0xa56dgCWMD9I*eo**yak46juxNEzryhKVLRT48DG0g9SUJSVrKSaPrhHEuJ1JOA86mSaY7TrHMMT&new=1");
urlList.add("https://mp.weixin.qq.com/s?src=11&timestamp=1539828001&ver=1189&signature=SAyiGuX8VfwlPsIlq*V7I8epXKcMc9Zr6RptkDT34vDk7tSYQCwix6qJxMm25JK9gxo0t9HKAeqm70V2J1FhcDiSlf1eMhSSMz8EiCk*Hu50B7sJFkoH46HHo1uiC4f7&new=1");
List<ContentBean> l = getContentMatch(urlList);
for(ContentBean cb : l) {
System.out.println(cb.getContent());
......
......@@ -7,6 +7,8 @@ import java.util.List;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler;
......@@ -21,12 +23,22 @@ public class MediaSelfSource {
return list;
}
/**
 * Manual entry point for trying out {@code getMediaSelfSource} by hand.
 *
 * <p>Calls {@code ProxyFactory.init} with a hard-coded registry address,
 * group name and {@code GroupType.PROVIDER}, submits a single Toutiao
 * article URL, and prints the {@code toString()} of every returned bean
 * to standard output.
 *
 * @param args command-line arguments; not read
 */
public static void main(String[] args) {
    // Proxy setup comes first, exactly as in the original ordering.
    ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

    List<String> sampleUrls = new ArrayList<>();
    sampleUrls.add("https://www.toutiao.com/a6452936157751968013/");

    List<MediaSelfSourceBean> beans = MediaSelfSource.getMediaSelfSource(sampleUrls);
    for (MediaSelfSourceBean bean : beans) {
        String rendered = bean.toString();
        System.out.println(rendered);
    }
}
static class MediaSelfSourceCrawlerThread extends Thread{
static List<MediaSelfSourceBean> getMediaSelfSource(List<String> urlList){
List<MediaSelfSourceBean> list = Collections.synchronizedList(new ArrayList<MediaSelfSourceBean>());
try{
MediaSelfSourceCrawler crawler = new MediaSelfSourceCrawler();
List<MediaSelfSourceBean> list = Collections.synchronizedList(new ArrayList<MediaSelfSourceBean>());
MediaSelfSourceDataCallBack callback = new MediaSelfSourceDataCallBack() {
@Override
......@@ -41,7 +53,7 @@ public class MediaSelfSource {
}catch (Exception e){
e.fillInStackTrace();
}
return null;
return list;
}
}
......
......@@ -10,6 +10,8 @@ import java.util.Map.Entry;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
......@@ -158,26 +160,15 @@ public class SourceForward {
return dataList;
}
// public static void main(String[] args) {
// List<String> urlList = new ArrayList<>();
// urlList.add("http://sh.qihoo.com/pc/99493b3bf136d8e20?sign=360_e39369d1");
// urlList.add("http://news.ctocio.com.cn/383/14543883.shtml");
// urlList.add("http://www.jn001.com/news/2018-07/05/content_561091.htm");
// urlList.add("http://www.ca800.com/fFa8D/bOTUBC1QfF/40944.aspx");
// urlList.add("http://sh.qihoo.com/pc/988470164f6c5ca14?sign=360_e39369d1");
// urlList.add("http://news.jstv.com/a/20180705/1530731642686.shtml?jsbcApp=1");
// urlList.add("https://tech.sina.cn/i/gn/2018-07-05/detail-ihexfcvi8155439.d.html?pos=18");
// urlList.add("http://sh.qihoo.com/pc/983b3d157f91af18b?sign=360_e39369d1");
// urlList.add("http://china.rednet.cn/c/2018/07/05/4671927.htm");
// urlList.add("http://news.enorth.com.cn/system/2018/07/05/035782857.shtml");
// urlList.add("https://www.toutiao.com/i6573922350037729796/");
// urlList.add("http://news.cnhubei.com/xw/sh/201807/t4132048.shtml");
// urlList.add("https://www.toutiao.com/a6573774143949373956/");
// List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
// for(SourceForwardBean sfb : da) {
// System.out.println(sfb.toString());
// }
// }
/**
 * Manual entry point for trying out {@code getSourceForward} by hand.
 *
 * <p>Calls {@code ProxyFactory.init} with a hard-coded registry address,
 * group name and {@code GroupType.PROVIDER}, submits a single Toutiao
 * article URL, and prints the {@code toString()} of every returned bean
 * to standard output.
 *
 * @param args command-line arguments; not read
 */
public static void main(String[] args) {
    // Proxy setup comes first, exactly as in the original ordering.
    ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

    List<String> sampleUrls = new ArrayList<>();
    sampleUrls.add("http://www.toutiao.com/a6452936157751968013/");

    List<SourceForwardBean> results = SourceForward.getSourceForward(sampleUrls);
    for (SourceForwardBean result : results) {
        String rendered = result.toString();
        System.out.println(rendered);
    }
}
static class SourceForwardCrawlerThread extends Thread{
......
......@@ -66,7 +66,7 @@ public class URLLive {
public static void main(String[] args) {
List<String> urlList = new ArrayList<>();
urlList.add("http://mp.weixin.qq.com/s?__biz=MzA3MzY1NjMxMw==&mid=2652054872&idx=1&sn=d67630a6b55d0eebd353cc90242fd784&3rd=MzA3MDU4NTYzMw==&scene=6#rd");
urlList.add("http://www.teso.cc/html/zixun/201606/233848.html");
List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
for(UrlLiveBean b : u) {
System.out.println(b.toString());
......
package com.zhiwei.source_forward.util;
import java.net.Proxy;
import com.zhiwei.proxy.common.Definition.GroupType;
import com.zhiwei.proxy.core.ProxyClient;
import com.zhiwei.proxy.core.ProxyClientFactory;
import com.zhiwei.source_forward.config.ProxyConfig;
/**
 * Static access point to a single, lazily built {@link ProxyClient}.
 */
public class ProxyClientUtil {

    /**
     * Shared client instance; {@code null} until the first call to
     * {@link #getClient()}. Declared {@code volatile} because it is read
     * outside the class lock.
     */
    private static volatile ProxyClient client;

    /**
     * Returns the NAT proxy reported by the shared client.
     *
     * @author hero
     * @return the result of {@code getClient().getNATProxy()}
     */
    public static Proxy getNATProxy() {
        ProxyClient shared = getClient();
        return shared.getNATProxy();
    }

    /**
     * Returns the shared {@link ProxyClient}, building it on first use.
     *
     * <p>The instance is created with
     * {@code ProxyClientFactory.build(ProxyConfig.registry, ProxyConfig.group, GroupType.PROVIDER)}.
     * Creation happens while holding the lock on {@code ProxyClientUtil.class},
     * and the field is re-checked inside the lock so that only one build
     * call is made even when several threads arrive at once.
     *
     * @return the shared client instance
     */
    public static ProxyClient getClient() {
        // Fast path: already initialised, no locking needed.
        ProxyClient current = client;
        if (current != null) {
            return current;
        }
        synchronized (ProxyClientUtil.class) {
            // Re-read under the lock: another thread may have built it meanwhile.
            current = client;
            if (current == null) {
                current = ProxyClientFactory.build(ProxyConfig.registry, ProxyConfig.group, GroupType.PROVIDER);
                client = current;
            }
            return current;
        }
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment