Commit 196e523d by yangchen

crawler-core 版本提升

parent 39b30f08
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.1.3-SNAPSHOT</version>
<version>0.1.5-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......@@ -24,12 +24,12 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.2-SNAPSHOT</version>
<version>0.1.3-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.3.1-RELEASE</version>
<version>0.3.6-RELEASE</version>
<scope>provided</scope>
</dependency>
</dependencies>
......
......@@ -13,13 +13,14 @@ import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
public class ContentCrawler {
private static Logger logger = LogManager.getLogger(ContentCrawler.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/**
*
......@@ -31,14 +32,9 @@ public class ContentCrawler {
*/
public GroupSync submitTask(ContentDataCallback callback,
String... urls) {
try {
GroupSync counter = new GroupSync();
start(counter, callback, urls);
return counter;
} catch (Exception e) {
logger.error(" exception {}",e);
return null;
}
GroupSync counter = new GroupSync();
start(counter, callback, urls);
return counter;
}
/**
......@@ -54,6 +50,7 @@ public class ContentCrawler {
for (String url : urls) {
if (url != null) {
try {
ZhiWeiTools.sleep(10);
search(counter, url, Attribution.of(url), callback);
} catch (Exception e) {
logger.error("搜索创建出错", e);
......@@ -78,7 +75,7 @@ public class ContentCrawler {
Request request = RequestUtils.wrapGet(url);
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).whenComplete((rs,ex) -> {
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback);
......
......@@ -19,6 +19,7 @@ import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
......@@ -33,7 +34,7 @@ import okhttp3.Request;
public class MediaSelfSourceCrawler {
private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/**
*
......@@ -67,6 +68,7 @@ public class MediaSelfSourceCrawler {
counter.add();
if (url != null) {
try {
ZhiWeiTools.sleep(10);
search(counter, url, Attribution.of(url), callback);
} catch (Exception e) {
logger.error("搜索创建出错", e);
......@@ -96,7 +98,7 @@ public class MediaSelfSourceCrawler {
Request request = RequestUtils.wrapGet(url, map);
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY).whenComplete((rs,ex) -> {
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
try {
......
......@@ -21,6 +21,7 @@ import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
......@@ -28,7 +29,7 @@ public class SourceForwardCrawler {
private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static List<String> sourceList = SourceData.getSourceList();
public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) {
......@@ -49,6 +50,7 @@ public class SourceForwardCrawler {
if (url != null) {
try {
search(counter, url, Attribution.of(url), callback);
ZhiWeiTools.sleep(10);
} catch (Exception e) {
logger.error("搜索创建出错", e);
}
......@@ -67,7 +69,7 @@ public class SourceForwardCrawler {
Request request = RequestUtils.wrapGet(url, headers);
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).whenComplete((rs,ex) -> {
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback);
......
......@@ -21,6 +21,7 @@ import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
......@@ -35,7 +36,7 @@ import okhttp3.Request;
public class UrlLiveCrawler {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
private static HttpBoot httpBoot = new HttpBoot(false,2);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public GroupSync submitTask(UrlLiveDataCallback callback,String... urls) {
GroupSync counter = new GroupSync();
......@@ -49,6 +50,7 @@ public class UrlLiveCrawler {
counter.add();
if (nonNull(url)) {
try {
ZhiWeiTools.sleep(10);
search(counter, url, Attribution.of(url,1), callback);
} catch (Exception e) {
logger.error("搜索创建出错:", e);
......@@ -71,7 +73,7 @@ public class UrlLiveCrawler {
Request request = RequestUtils.wrapGet(url, headers);
if(Objects.nonNull(request)) {
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY).whenComplete((rs,ex) -> {
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
if(rs.code() == 200) {
......
......@@ -35,7 +35,7 @@ public class MatchContent {
try {
Document document = Jsoup.parse(html);
if(url.contains("weixin.qq.com")) {
content = matchContentWeixin(document);
content = matchContentWeixin(html);
}else if(url.contains("toutiao.com")) {
content = matchContentToutiao(html);
}
......@@ -71,22 +71,22 @@ public class MatchContent {
* @param html
* @return
*/
private static String matchContentWeixin(Document document) {
try {
String content = document.select("div.rich_media_content").text();
if(document.toString().contains("<script id=\"content_tpl\"")) {
Pattern pa = Pattern.compile("\\<script id=\"content_tpl(.*?)\\</script\\>");
Matcher ma = pa.matcher(document.toString());
while(ma.find()) {
return ma.group(0).replaceAll("<script id=\"content_tpl\" type=\"text/html\">", "").replaceAll("</script>", "");
}
return content;
}
return content;
} catch (Exception e) {
e.printStackTrace();
return "";
}
private static String matchContentWeixin(String contentHtml) {
    // Extracts the plain text of a WeChat article page from its raw HTML.
    // Lookup order: div#js_article, then div#js_share_content; only when neither
    // marker string occurs in the raw HTML is the script#content_tpl template used.
    // Returns "" when no marker is present or when any exception is thrown.
    try {
        Document document = Jsoup.parse(contentHtml);
        if (contentHtml.contains("js_article")) {
            return document.select("div#js_article").text();
        }
        if (contentHtml.contains("js_share_content")) {
            return document.select("div#js_share_content").text();
        }
        if (contentHtml.contains("content_tpl")) {
            // Re-parse the template's inner HTML so that text() yields it without tags.
            String templateHtml = document.select("script#content_tpl").html();
            return Jsoup.parse(templateHtml).text();
        }
    } catch (Exception e) {
        // The exception is passed as the trailing Throwable argument, so the message
        // must not contain a "{}" placeholder (it would be printed literally).
        logger.error("微信全文解析出错", e);
    }
    return "";
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment