Commit 7f4a87a2 by yangchen

修改 crawler-core 提升版本

parent 1b20782c
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId> <artifactId>source-forward</artifactId>
<version>0.1.1-SNAPSHOT</version> <version>0.1.3-SNAPSHOT</version>
<name>source-forward</name> <name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description> <description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
...@@ -24,12 +24,12 @@ ...@@ -24,12 +24,12 @@
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.1.1-SNAPSHOT</version> <version>0.1.2-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.1.1-RELEASE</version> <version>0.3.0-RELEASE</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
</dependencies> </dependencies>
......
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import java.util.concurrent.TimeUnit; import java.util.Objects;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import com.zhiwei.crawler.async.MultiThreadingCounter; import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.ContentBean; import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution; import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback; import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent; import com.zhiwei.source_forward.util.MatchContent;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
public class ContentCrawler { public class ContentCrawler {
...@@ -30,10 +29,10 @@ public class ContentCrawler { ...@@ -30,10 +29,10 @@ public class ContentCrawler {
* @return * @return
* @throws Exception * @throws Exception
*/ */
public MultiThreadingCounter submitTask(ContentDataCallback callback, public GroupSync submitTask(ContentDataCallback callback,
String... urls) { String... urls) {
try { try {
MultiThreadingCounter counter = new MultiThreadingCounter(15,TimeUnit.MINUTES,false); GroupSync counter = new GroupSync();
start(counter, callback, urls); start(counter, callback, urls);
return counter; return counter;
} catch (Exception e) { } catch (Exception e) {
...@@ -49,7 +48,7 @@ public class ContentCrawler { ...@@ -49,7 +48,7 @@ public class ContentCrawler {
* @param callback * @param callback
* @param urls * @param urls
*/ */
private void start(MultiThreadingCounter counter, private void start(GroupSync counter,
ContentDataCallback callback, String... urls) { ContentDataCallback callback, String... urls) {
if (urls != null && urls.length > 0) { if (urls != null && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
...@@ -73,23 +72,25 @@ public class ContentCrawler { ...@@ -73,23 +72,25 @@ public class ContentCrawler {
* @param callback * @param callback
* @return * @return
*/ */
private MultiThreadingCounter search(MultiThreadingCounter counter, private GroupSync search(GroupSync counter,
String url, Attribution attr, ContentDataCallback callback) { String url, Attribution attr, ContentDataCallback callback) {
logger.info("当前处理 URL: {}", url); logger.info("当前处理 URL: {}", url);
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
counter.increase(); counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).addListener(future -> {
if (future.isSuccess()) { httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).whenComplete((rs,ex) -> {
Response response = future.result();
try { try {
parseHtml(response, attr, callback); if (Objects.isNull(ex)) {
} catch (Exception e) { parseHtml(rs.body().string(), attr, callback);
logger.error("解析出错", e);
}
} else { } else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(),future.cause().getMessage()); logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex);
}
} catch (Exception e) {
logger.info("搜索结果访问失败: {}", ex);
} finally {
counter.done();
} }
counter.reduce();
}); });
return counter; return counter;
...@@ -103,28 +104,20 @@ public class ContentCrawler { ...@@ -103,28 +104,20 @@ public class ContentCrawler {
* @param attr * @param attr
* @param callback * @param callback
*/ */
private void parseHtml(Response response, Attribution attr, private void parseHtml(String result, Attribution attr,
ContentDataCallback callback) { ContentDataCallback callback) {
String content = null;
try { try {
if (response.isSuccessful()) { String content = MatchContent.matchContent(attr.get().toString(),
String html = response.body().string(); result);
content = MatchContent.matchContent(attr.get().toString(),
html);
}
} catch (Exception e) {
logger.error("网页链接失效", e);
} finally {
if (response != null) {
response.close();
}
}
ContentBean cb = new ContentBean(attr.get().toString(), content); ContentBean cb = new ContentBean(attr.get().toString(), content);
if (callback == null) { if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据"); logger.warn("DataCallback 对象为 null,无法保存数据");
} else { } else {
callback.onData(cb, attr); callback.onData(cb, attr);
} }
} catch (Exception e) {
logger.error("网页链接失效", e);
}
} }
......
...@@ -3,17 +3,17 @@ package com.zhiwei.source_forward.crawler; ...@@ -3,17 +3,17 @@ package com.zhiwei.source_forward.crawler;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.TimeUnit; import java.util.Objects;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import com.zhiwei.crawler.async.MultiThreadingCounter; import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean; import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution; import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel; import com.zhiwei.source_forward.util.MatchChannel;
...@@ -21,7 +21,6 @@ import com.zhiwei.source_forward.util.MatchSource; ...@@ -21,7 +21,6 @@ import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack; import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
/** /**
* *
...@@ -44,9 +43,9 @@ public class MediaSelfSourceCrawler { ...@@ -44,9 +43,9 @@ public class MediaSelfSourceCrawler {
* @return * @return
* @throws Exception * @throws Exception
*/ */
public MultiThreadingCounter submitTask(MediaSelfSourceDataCallBack callback,String... urls) { public GroupSync submitTask(MediaSelfSourceDataCallBack callback,String... urls) {
try { try {
MultiThreadingCounter counter = new MultiThreadingCounter("任务======= ", 15,TimeUnit.MINUTES,true); GroupSync counter = new GroupSync();
start(counter, callback, urls); start(counter, callback, urls);
return counter; return counter;
} catch (Exception e) { } catch (Exception e) {
...@@ -62,10 +61,10 @@ public class MediaSelfSourceCrawler { ...@@ -62,10 +61,10 @@ public class MediaSelfSourceCrawler {
* @param callback * @param callback
* @param urls * @param urls
*/ */
private void start(MultiThreadingCounter counter,MediaSelfSourceDataCallBack callback, String... urls) { private void start(GroupSync counter,MediaSelfSourceDataCallBack callback, String... urls) {
if (urls != null && urls.length > 0) { if (urls != null && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
counter.increase(); counter.add();
if (url != null) { if (url != null) {
try { try {
search(counter, url, Attribution.of(url), callback); search(counter, url, Attribution.of(url), callback);
...@@ -73,7 +72,7 @@ public class MediaSelfSourceCrawler { ...@@ -73,7 +72,7 @@ public class MediaSelfSourceCrawler {
logger.error("搜索创建出错", e); logger.error("搜索创建出错", e);
} }
} }
counter.reduce(); counter.done();
} }
} }
} }
...@@ -87,28 +86,28 @@ public class MediaSelfSourceCrawler { ...@@ -87,28 +86,28 @@ public class MediaSelfSourceCrawler {
* @param callback * @param callback
* @return * @return
*/ */
private MultiThreadingCounter search(MultiThreadingCounter counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) { private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", url); logger.info("当前处理 URL: {}", url);
Map<String,Object> map = new HashMap<>(); Map<String,Object> map = new HashMap<>();
if(url.contains("toutiao.com")) { if(url.contains("toutiao.com")) {
map.put("referer", url); map.put("referer", url);
} }
Request request = RequestUtils.wrapGet(url, map); Request request = RequestUtils.wrapGet(url, map);
counter.increase(); counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).addListener(future -> {
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY).whenComplete((rs,ex) -> {
try { try {
if (future.isSuccess()) { if (Objects.isNull(ex)) {
Response response = future.result();
try { try {
parseHtml(response, attr, callback); parseHtml(rs.body().string(), attr, callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("解析出错", e); logger.error("解析出错", e);
} }
} else { } else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage()); logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex);
} }
} finally { } finally {
counter.reduce(); counter.done();
} }
}); });
return counter; return counter;
...@@ -121,30 +120,23 @@ public class MediaSelfSourceCrawler { ...@@ -121,30 +120,23 @@ public class MediaSelfSourceCrawler {
* @param attr * @param attr
* @param callback * @param callback
*/ */
private void parseHtml(Response response, Attribution attr, private void parseHtml(String result, Attribution attr,
MediaSelfSourceDataCallBack callback) { MediaSelfSourceDataCallBack callback) {
String source = null; String source = null;
String channel = null; String channel = null;
try { try {
if(response.isSuccessful()){ source = MatchSource.matchMediaSelfSource(attr.get().toString(),result);
String html = response.body().string();
source = MatchSource.matchMediaSelfSource(attr.get().toString(),html);
if(source==null || source.equals("")){ if(source==null || source.equals("")){
source = null; source = null;
} }
channel = MatchChannel.verifyChannel(attr.get().toString()); channel = MatchChannel.verifyChannel(attr.get().toString());
if(channel==null){ if(channel==null){
List<Node> nodeList = Jsoup.parse(html).head().childNodes(); List<Node> nodeList = Jsoup.parse(result).head().childNodes();
channel = MatchChannel.matchChannel(nodeList); channel = MatchChannel.matchChannel(nodeList);
} }
}
} catch (Exception e) { } catch (Exception e) {
logger.error("exception ",e); logger.error("exception ",e);
source = null; source = null;
}finally {
if(response != null) {
response.close();
}
} }
logger.info(attr.get()+"================="+source); logger.info(attr.get()+"================="+source);
MediaSelfSourceBean msfb = new MediaSelfSourceBean(attr.get().toString(), source, channel); MediaSelfSourceBean msfb = new MediaSelfSourceBean(attr.get().toString(), source, channel);
......
...@@ -2,7 +2,7 @@ package com.zhiwei.source_forward.crawler; ...@@ -2,7 +2,7 @@ package com.zhiwei.source_forward.crawler;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.TimeUnit; import java.util.Objects;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
...@@ -10,10 +10,10 @@ import org.jsoup.Jsoup; ...@@ -10,10 +10,10 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import com.zhiwei.crawler.async.MultiThreadingCounter; import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.SourceForwardBean; import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution; import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel; import com.zhiwei.source_forward.util.MatchChannel;
...@@ -23,7 +23,6 @@ import com.zhiwei.source_forward.util.SourceForwardDataCallBack; ...@@ -23,7 +23,6 @@ import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.tools.httpclient.HeaderTool; import com.zhiwei.tools.httpclient.HeaderTool;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
public class SourceForwardCrawler { public class SourceForwardCrawler {
...@@ -32,9 +31,9 @@ public class SourceForwardCrawler { ...@@ -32,9 +31,9 @@ public class SourceForwardCrawler {
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot();
private static List<String> sourceList = SourceData.getSourceList(); private static List<String> sourceList = SourceData.getSourceList();
public MultiThreadingCounter submitTask(SourceForwardDataCallBack callback,String... urls) { public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) {
try { try {
MultiThreadingCounter counter = new MultiThreadingCounter(15,TimeUnit.MINUTES,false); GroupSync counter = new GroupSync();
start(counter, callback, urls); start(counter, callback, urls);
return counter; return counter;
} catch (Exception e) { } catch (Exception e) {
...@@ -43,10 +42,10 @@ public class SourceForwardCrawler { ...@@ -43,10 +42,10 @@ public class SourceForwardCrawler {
} }
} }
private void start(MultiThreadingCounter counter,SourceForwardDataCallBack callback, String... urls) { private void start(GroupSync counter,SourceForwardDataCallBack callback, String... urls) {
if (urls != null && urls.length > 0) { if (urls != null && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
counter.increase(); counter.add();
if (url != null) { if (url != null) {
try { try {
search(counter, url, Attribution.of(url), callback); search(counter, url, Attribution.of(url), callback);
...@@ -54,12 +53,12 @@ public class SourceForwardCrawler { ...@@ -54,12 +53,12 @@ public class SourceForwardCrawler {
logger.error("搜索创建出错", e); logger.error("搜索创建出错", e);
} }
} }
counter.reduce(); counter.done();
} }
} }
} }
private MultiThreadingCounter search(MultiThreadingCounter counter, String url,Attribution attr, SourceForwardDataCallBack callback) { private GroupSync search(GroupSync counter, String url,Attribution attr, SourceForwardDataCallBack callback) {
logger.info("当前处理 URL: {}", url); logger.info("当前处理 URL: {}", url);
Map<String,String> headers = HeaderTool.getCommonHead(); Map<String,String> headers = HeaderTool.getCommonHead();
if(url.contains("www.toutiao.com")){ if(url.contains("www.toutiao.com")){
...@@ -67,34 +66,29 @@ public class SourceForwardCrawler { ...@@ -67,34 +66,29 @@ public class SourceForwardCrawler {
} }
Request request = RequestUtils.wrapGet(url, headers); Request request = RequestUtils.wrapGet(url, headers);
counter.increase(); counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).addListener(future -> { httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).whenComplete((rs,ex) -> {
try { try {
if (future.isSuccess()) { if (Objects.isNull(ex)) {
Response response = future.result(); parseHtml(rs.body().string(), attr, callback);
try {
parseHtml(response, attr, callback);
} catch (Exception e) {
logger.error("解析出错", e);
}
} else { } else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage()); logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
} }
} catch (Exception e1) {
logger.error("解析出错",e1);
} finally { } finally {
counter.reduce(); counter.done();
} }
}); });
return counter; return counter;
} }
private void parseHtml(Response response, Attribution attr, private void parseHtml(String body, Attribution attr,
SourceForwardDataCallBack callback) { SourceForwardDataCallBack callback) {
String source = null; String source = null;
String channel = "新闻"; String channel = "新闻";
String isforward = "未知"; String isforward = "未知";
try { try {
if(response.isSuccessful()){
String body = response.body().string();
Document document = Jsoup.parse(body); Document document = Jsoup.parse(body);
if(attr.get().toString().contains("mp.weixin.qq.com")){ if(attr.get().toString().contains("mp.weixin.qq.com")){
isforward = document.select("div#meta_content").select("span#copyright_logo").text(); isforward = document.select("div#meta_content").select("span#copyright_logo").text();
...@@ -115,15 +109,9 @@ public class SourceForwardCrawler { ...@@ -115,15 +109,9 @@ public class SourceForwardCrawler {
} }
source = MatchSource.matchSource(attr.get().toString(),document.toString(), sourceList); source = MatchSource.matchSource(attr.get().toString(),document.toString(), sourceList);
} }
}
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace();
source = null; source = null;
channel = "新闻"; channel = "新闻";
}finally {
if(response != null) {
response.close();
}
} }
logger.info(attr.get().toString()+"======="+channel+"================="+source); logger.info(attr.get().toString()+"======="+channel+"================="+source);
SourceForwardBean sfb = new SourceForwardBean(attr.get().toString(), channel, source,isforward); SourceForwardBean sfb = new SourceForwardBean(attr.get().toString(), channel, source,isforward);
......
...@@ -6,7 +6,6 @@ import java.util.Arrays; ...@@ -6,7 +6,6 @@ import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import java.util.concurrent.TimeUnit;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
...@@ -14,17 +13,16 @@ import org.jsoup.Jsoup; ...@@ -14,17 +13,16 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.async.MultiThreadingCounter; import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean; import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution; import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.UrlLiveDataCallback; import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.tools.httpclient.HeaderTool; import com.zhiwei.tools.httpclient.HeaderTool;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
/** /**
* *
...@@ -39,9 +37,9 @@ public class UrlLiveCrawler { ...@@ -39,9 +37,9 @@ public class UrlLiveCrawler {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class); private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot();
public MultiThreadingCounter submitTask(UrlLiveDataCallback callback,String... urls) { public GroupSync submitTask(UrlLiveDataCallback callback,String... urls) {
try { try {
MultiThreadingCounter counter = new MultiThreadingCounter(10,TimeUnit.MINUTES,false); GroupSync counter = new GroupSync();
start(counter, callback, urls); start(counter, callback, urls);
return counter; return counter;
} catch (Exception e) { } catch (Exception e) {
...@@ -50,10 +48,10 @@ public class UrlLiveCrawler { ...@@ -50,10 +48,10 @@ public class UrlLiveCrawler {
} }
} }
private void start(MultiThreadingCounter counter,UrlLiveDataCallback callback, String... urls) { private void start(GroupSync counter,UrlLiveDataCallback callback, String... urls) {
if (nonNull(urls) && urls.length > 0) { if (nonNull(urls) && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
counter.increase(); counter.add();
if (nonNull(url)) { if (nonNull(url)) {
try { try {
search(counter, url, Attribution.of(url,1), callback); search(counter, url, Attribution.of(url,1), callback);
...@@ -61,12 +59,12 @@ public class UrlLiveCrawler { ...@@ -61,12 +59,12 @@ public class UrlLiveCrawler {
logger.error("搜索创建出错:", e); logger.error("搜索创建出错:", e);
} }
} }
counter.reduce(); counter.done();
} }
} }
} }
private MultiThreadingCounter search(MultiThreadingCounter counter, String url, private GroupSync search(GroupSync counter, String url,
Attribution attr, UrlLiveDataCallback callback) { Attribution attr, UrlLiveDataCallback callback) {
url = dealUrl(url); url = dealUrl(url);
logger.info("当前处理 URL: {}", url); logger.info("当前处理 URL: {}", url);
...@@ -75,46 +73,33 @@ public class UrlLiveCrawler { ...@@ -75,46 +73,33 @@ public class UrlLiveCrawler {
headers.put("referer", url); headers.put("referer", url);
} }
Request request = RequestUtils.wrapGet(url, headers); Request request = RequestUtils.wrapGet(url, headers);
counter.increase(); counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY).addListener(future -> { httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY).whenComplete((rs,ex) -> {
try { try {
if (future.isSuccess()) { if (Objects.isNull(ex)) {
Response response = future.result(); if(rs.code() == 200) {
try { parseHtml(rs.body().string(), attr, callback,counter);
if(response.code() == 200) {
parseHtml(response.body().string(), attr, callback,counter);
}else { }else {
if(attr.getCount() > 2) { if(attr.getCount() > 2) {
callBack(callback, attr, 1,String.valueOf(response.code())); callBack(callback, attr, 1,String.valueOf(rs.code()));
}else { }else {
attr.AddCount(); attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback); search(counter, attr.getAttr().toString(), attr, callback);
} }
} }
} catch (Exception e) {
logger.error("解析出错 {}", e);
}finally {
if(response != null) {
response.close();
}
}
} else { } else {
if(future.cause().getMessage().contains("status code: ")) {
callBack(callback, attr, 1,null);
}else {
if(attr.getCount() > 3) { if(attr.getCount() > 3) {
callBack(callback, attr, -1,null); callBack(callback, attr, -1,null);
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage()); logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
}else { }else {
attr.AddCount(); attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback); search(counter, attr.getAttr().toString(), attr, callback);
} }
} }
}
} catch (Exception e) { } catch (Exception e) {
logger.error(" 数据是否删除 采集出错 {} ",e); logger.error(" 数据是否删除 采集出错 {} ",e);
}finally { }finally {
counter.reduce(); counter.done();
} }
}); });
return counter; return counter;
...@@ -178,7 +163,7 @@ public class UrlLiveCrawler { ...@@ -178,7 +163,7 @@ public class UrlLiveCrawler {
* @param callback * @param callback
*/ */
private void parseHtml(String html, Attribution attr, private void parseHtml(String html, Attribution attr,
UrlLiveDataCallback callback,MultiThreadingCounter counter) { UrlLiveDataCallback callback,GroupSync counter) {
if (callback == null) { if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据"); logger.warn("DataCallback 对象为 null,无法保存数据");
} else { } else {
......
...@@ -25,7 +25,7 @@ public class MediaSelfSource { ...@@ -25,7 +25,7 @@ public class MediaSelfSource {
public static void main(String[] args) { public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER); ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("https://sports.qq.com/a/20190227/001177.htm"); urlList.add("https://www.toutiao.com/a6669697912458445059/");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList); List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) { for(MediaSelfSourceBean b : u) {
System.out.println(b.toString()); System.out.println(b.toString());
......
package com.zhiwei.source_forward.util; package com.zhiwei.source_forward.util;
import java.util.List; import java.util.List;
import java.util.Objects;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
...@@ -148,6 +149,16 @@ public class MatchSource { ...@@ -148,6 +149,16 @@ public class MatchSource {
source = "东方头条-" + source; source = "东方头条-" + source;
} }
}else if(url.contains("fashion.eastday.com")){
//处理东方头条网-自媒体号匹配
source = document.select("div.J-title_detail.title_detail > div > div.fl > i:nth-child(2)").text().trim();
if(Objects.isNull(source) || source.length() < 1) {
source = document.select("div.J-title_detail.title_detail > div > div.fl > a").text().trim();
}
if(source!=null && source.length()>1){
source = "东方看点-" + source;
}
}else if(url.contains("sh.qihoo.com")){ }else if(url.contains("sh.qihoo.com")){
//今日报点解析 //今日报点解析
source = document.select("span.source").text().trim(); source = document.select("span.source").text().trim();
...@@ -237,6 +248,18 @@ public class MatchSource { ...@@ -237,6 +248,18 @@ public class MatchSource {
if(source!=null && !source.equals("")){ if(source!=null && !source.equals("")){
source = "网易号-" + source; source = "网易号-" + source;
} }
}else if(url.contains("myzaker.com")){
source = document.select("div.article_header > div > a > span.auther")
.text();
if(source!=null && !source.equals("")){
source = "zaker-" + source;
}
}else if(url.contains("edushi.com")){
source = document.select("div.eds-name-box > div.eds-name > a > div.name")
.text();
if(source!=null && !source.equals("")){
source = "今日潮闻-" + source;
}
} }
return source; return source;
} catch (Exception e) { } catch (Exception e) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment