Commit 196e523d by yangchen

crawler-core 版本提升

parent 39b30f08
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId> <artifactId>source-forward</artifactId>
<version>0.1.3-SNAPSHOT</version> <version>0.1.5-SNAPSHOT</version>
<name>source-forward</name> <name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description> <description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
...@@ -24,12 +24,12 @@ ...@@ -24,12 +24,12 @@
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.1.2-SNAPSHOT</version> <version>0.1.3-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.3.1-RELEASE</version> <version>0.3.6-RELEASE</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
</dependencies> </dependencies>
......
...@@ -13,13 +13,14 @@ import com.zhiwei.source_forward.bean.ContentBean; ...@@ -13,13 +13,14 @@ import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution; import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback; import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent; import com.zhiwei.source_forward.util.MatchContent;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
public class ContentCrawler { public class ContentCrawler {
private static Logger logger = LogManager.getLogger(ContentCrawler.class); private static Logger logger = LogManager.getLogger(ContentCrawler.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/** /**
* *
...@@ -31,14 +32,9 @@ public class ContentCrawler { ...@@ -31,14 +32,9 @@ public class ContentCrawler {
*/ */
public GroupSync submitTask(ContentDataCallback callback, public GroupSync submitTask(ContentDataCallback callback,
String... urls) { String... urls) {
try {
GroupSync counter = new GroupSync(); GroupSync counter = new GroupSync();
start(counter, callback, urls); start(counter, callback, urls);
return counter; return counter;
} catch (Exception e) {
logger.error(" exception {}",e);
return null;
}
} }
/** /**
...@@ -54,6 +50,7 @@ public class ContentCrawler { ...@@ -54,6 +50,7 @@ public class ContentCrawler {
for (String url : urls) { for (String url : urls) {
if (url != null) { if (url != null) {
try { try {
ZhiWeiTools.sleep(10);
search(counter, url, Attribution.of(url), callback); search(counter, url, Attribution.of(url), callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("搜索创建出错", e); logger.error("搜索创建出错", e);
...@@ -78,7 +75,7 @@ public class ContentCrawler { ...@@ -78,7 +75,7 @@ public class ContentCrawler {
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
counter.add(); counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).whenComplete((rs,ex) -> { httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
try { try {
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback); parseHtml(rs.body().string(), attr, callback);
......
...@@ -19,6 +19,7 @@ import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution; ...@@ -19,6 +19,7 @@ import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel; import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource; import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack; import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
...@@ -33,7 +34,7 @@ import okhttp3.Request; ...@@ -33,7 +34,7 @@ import okhttp3.Request;
public class MediaSelfSourceCrawler { public class MediaSelfSourceCrawler {
private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class); private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/** /**
* *
...@@ -67,6 +68,7 @@ public class MediaSelfSourceCrawler { ...@@ -67,6 +68,7 @@ public class MediaSelfSourceCrawler {
counter.add(); counter.add();
if (url != null) { if (url != null) {
try { try {
ZhiWeiTools.sleep(10);
search(counter, url, Attribution.of(url), callback); search(counter, url, Attribution.of(url), callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("搜索创建出错", e); logger.error("搜索创建出错", e);
...@@ -96,7 +98,7 @@ public class MediaSelfSourceCrawler { ...@@ -96,7 +98,7 @@ public class MediaSelfSourceCrawler {
Request request = RequestUtils.wrapGet(url, map); Request request = RequestUtils.wrapGet(url, map);
counter.add(); counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY).whenComplete((rs,ex) -> { httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> {
try { try {
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
try { try {
......
...@@ -21,6 +21,7 @@ import com.zhiwei.source_forward.util.MatchSource; ...@@ -21,6 +21,7 @@ import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.SourceData; import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack; import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.tools.httpclient.HeaderTool; import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
...@@ -28,7 +29,7 @@ public class SourceForwardCrawler { ...@@ -28,7 +29,7 @@ public class SourceForwardCrawler {
private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class); private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static List<String> sourceList = SourceData.getSourceList(); private static List<String> sourceList = SourceData.getSourceList();
public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) { public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) {
...@@ -49,6 +50,7 @@ public class SourceForwardCrawler { ...@@ -49,6 +50,7 @@ public class SourceForwardCrawler {
if (url != null) { if (url != null) {
try { try {
search(counter, url, Attribution.of(url), callback); search(counter, url, Attribution.of(url), callback);
ZhiWeiTools.sleep(10);
} catch (Exception e) { } catch (Exception e) {
logger.error("搜索创建出错", e); logger.error("搜索创建出错", e);
} }
...@@ -67,7 +69,7 @@ public class SourceForwardCrawler { ...@@ -67,7 +69,7 @@ public class SourceForwardCrawler {
Request request = RequestUtils.wrapGet(url, headers); Request request = RequestUtils.wrapGet(url, headers);
counter.add(); counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).whenComplete((rs,ex) -> { httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
try { try {
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback); parseHtml(rs.body().string(), attr, callback);
......
...@@ -21,6 +21,7 @@ import com.zhiwei.source_forward.bean.UrlLiveBean; ...@@ -21,6 +21,7 @@ import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution; import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.UrlLiveDataCallback; import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.tools.httpclient.HeaderTool; import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
...@@ -35,7 +36,7 @@ import okhttp3.Request; ...@@ -35,7 +36,7 @@ import okhttp3.Request;
public class UrlLiveCrawler { public class UrlLiveCrawler {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class); private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
private static HttpBoot httpBoot = new HttpBoot(false,2); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public GroupSync submitTask(UrlLiveDataCallback callback,String... urls) { public GroupSync submitTask(UrlLiveDataCallback callback,String... urls) {
GroupSync counter = new GroupSync(); GroupSync counter = new GroupSync();
...@@ -49,6 +50,7 @@ public class UrlLiveCrawler { ...@@ -49,6 +50,7 @@ public class UrlLiveCrawler {
counter.add(); counter.add();
if (nonNull(url)) { if (nonNull(url)) {
try { try {
ZhiWeiTools.sleep(10);
search(counter, url, Attribution.of(url,1), callback); search(counter, url, Attribution.of(url,1), callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("搜索创建出错:", e); logger.error("搜索创建出错:", e);
...@@ -71,7 +73,7 @@ public class UrlLiveCrawler { ...@@ -71,7 +73,7 @@ public class UrlLiveCrawler {
Request request = RequestUtils.wrapGet(url, headers); Request request = RequestUtils.wrapGet(url, headers);
if(Objects.nonNull(request)) { if(Objects.nonNull(request)) {
counter.add(); counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY).whenComplete((rs,ex) -> { httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> {
try { try {
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
if(rs.code() == 200) { if(rs.code() == 200) {
......
...@@ -35,7 +35,7 @@ public class MatchContent { ...@@ -35,7 +35,7 @@ public class MatchContent {
try { try {
Document document = Jsoup.parse(html); Document document = Jsoup.parse(html);
if(url.contains("weixin.qq.com")) { if(url.contains("weixin.qq.com")) {
content = matchContentWeixin(document); content = matchContentWeixin(html);
}else if(url.contains("toutiao.com")) { }else if(url.contains("toutiao.com")) {
content = matchContentToutiao(html); content = matchContentToutiao(html);
} }
...@@ -71,22 +71,22 @@ public class MatchContent { ...@@ -71,22 +71,22 @@ public class MatchContent {
* @param html * @param html
* @return * @return
*/ */
private static String matchContentWeixin(Document document) { private static String matchContentWeixin(String contentHtml) {
try { try {
String content = document.select("div.rich_media_content").text(); Document document = Jsoup.parse(contentHtml);
if(document.toString().contains("<script id=\"content_tpl\"")) { if (contentHtml.contains("js_article")) {
Pattern pa = Pattern.compile("\\<script id=\"content_tpl(.*?)\\</script\\>"); return document.select("div#js_article").text();
Matcher ma = pa.matcher(document.toString()); } else if (contentHtml.contains("js_share_content")) {
while(ma.find()) { return document.select("div#js_share_content").text();
return ma.group(0).replaceAll("<script id=\"content_tpl\" type=\"text/html\">", "").replaceAll("</script>", "");
} }
return content; if (contentHtml.contains("content_tpl")) {
String text = document.select("script#content_tpl").html();
return Jsoup.parse(text).text();
} }
return content;
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); logger.error("微信全文解析出错 {}", e);
return "";
} }
return "";
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment