Commit b3fce9ac by chenweiyang

基础包升级版

parent d705de1f
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.3.0-SNAPSHOT</version>
<version>0.4.0-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......@@ -30,13 +30,30 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.2.4-SNAPSHOT</version>
<version>0.3.2-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.6.6.8-SNAPSHOT</version>
<groupId>com.zhiwei.http</groupId>
<artifactId>http-boot</artifactId>
<version>0.0.4.3-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.async</groupId>
<artifactId>task-boot</artifactId>
<version>0.20.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.kohlschutter.boilerpipe</groupId>
<artifactId>boilerpipe-extractor</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>proxy-client</artifactId>
<version>1.1.5-SNAPSHOT</version>
</dependency>
</dependencies>
<!-- 打包管理 -->
......
package com.zhiwei.source_forward.crawler;
import java.time.Duration;
import java.util.Objects;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
public class ContentCrawler {
private static Logger logger = LogManager.getLogger(ContentCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().connectTimeout(Duration.ofSeconds(10)).retryTimes(3).detectChineseCharset(true).build();
/**
*
......@@ -75,15 +76,15 @@ public class ContentCrawler {
Request request = RequestUtils.wrapGet(url);
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
httpBoot.asyncCall(request, ProxySupplier.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback);
parseHtml(rs.bodyString(), attr, callback);
} else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex);
}
} catch (Exception e) {
logger.info("搜索结果访问失败: {}", ex);
logger.info("搜索结果访问失败: ", ex);
} finally {
counter.done();
}
......
package com.zhiwei.source_forward.crawler;
import java.time.Duration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
......@@ -34,7 +34,7 @@ import okhttp3.Request;
public class MediaSelfSourceCrawler {
private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().connectTimeout(Duration.ofSeconds(10)).retryTimes(3).detectChineseCharset(true).build();
/**
*
......@@ -91,7 +91,7 @@ public class MediaSelfSourceCrawler {
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", attr.get());
Map<String,Object> map = new HashMap<>();
ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY;
ProxySupplier ph = ProxySupplier.NAT_HEAVY_PROXY;
if(url.contains("toutiao.com")) {
map.put("referer", url);
}
......@@ -104,7 +104,7 @@ public class MediaSelfSourceCrawler {
try {
if (Objects.isNull(ex)) {
try {
parseHtml(rs.body().string(), attr, callback, rs.request().url().uri().toString());
parseHtml(rs.bodyString(), attr, callback, rs.request().url().uri().toString());
} catch (Exception e) {
logger.error("解析出错", e);
}
......@@ -169,7 +169,7 @@ public class MediaSelfSourceCrawler {
source = MatchSource.matchMediaSelfSource(url + eUrl,result);
channel = MatchChannel.verifyChannel(url);
if(channel==null){
List<Node> nodeList = Jsoup.parse(result).head().childNodes();
List<org.jsoup.nodes.Node> nodeList = Jsoup.parse(result).head().childNodes();
channel = MatchChannel.matchChannel(nodeList);
}
} catch (Exception e) {
......
package com.zhiwei.source_forward.crawler;
import java.time.Duration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
......@@ -11,16 +12,16 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
......@@ -29,7 +30,7 @@ public class SourceForwardCrawler {
private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().connectTimeout(Duration.ofSeconds(10)).retryTimes(3).detectChineseCharset(true).build();
private static List<String> sourceList = SourceData.getSourceList();
public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) {
......@@ -77,10 +78,10 @@ public class SourceForwardCrawler {
}
Request request = RequestUtils.wrapGet(url, headers);
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
httpBoot.asyncCall(request, ProxySupplier.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback);
parseHtml(rs.bodyString(), attr, callback);
} else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
}
......
......@@ -2,6 +2,7 @@ package com.zhiwei.source_forward.crawler;
import static java.util.Objects.nonNull;
import java.time.Duration;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
......@@ -17,13 +18,13 @@ import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.JSONPath;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
......@@ -39,7 +40,7 @@ import okhttp3.Request;
public class UrlLiveCrawler {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(true).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().connectTimeout(Duration.ofSeconds(10)).retryTimes(3).detectChineseCharset(true).build();
public GroupSync submitTask(UrlLiveDataCallback callback,String... urls) {
GroupSync counter = new GroupSync();
......@@ -71,12 +72,12 @@ public class UrlLiveCrawler {
url = dealUrl(url);
logger.info("当前处理 URL: {}", url);
Map<String,String> headers = new HashMap<>();
ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY;
ProxySupplier ph = ProxySupplier.NAT_HEAVY_PROXY;
if(url.contains("toutiao.com")){
ph = ProxyHolder.NAT_HEAVY_PROXY;
ph = ProxySupplier.NAT_HEAVY_PROXY;
}else if(url.contains("zhihu.com")) {
url = treatZhihuUrl(url);
ph = ProxyHolder.NAT_HEAVY_PROXY;
ph = ProxySupplier.NAT_HEAVY_PROXY;
}
try {
Request request = RequestUtils.wrapGet(url, headers);
......@@ -87,7 +88,7 @@ public class UrlLiveCrawler {
try {
if (Objects.isNull(ex)) {
if(rs.isSuccessful()) {
parseHtml(rs.body().string(), attr, callback);
parseHtml(rs.bodyString(), attr, callback);
}else if(rs.code() == 404){
callBack(callback, attr, 1, String.valueOf(rs.code()));
}else {
......
package com.zhiwei.source_forward.crawler;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
......@@ -13,19 +14,18 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.async.GroupSync;
import com.zhiwei.async.TaskBoot;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.task.async.TaskBoot;
import com.zhiwei.task.sync.GroupSync;
import okhttp3.Request;
import okhttp3.Response;
public class UrlLiveCrawlerNew {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawlerNew.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(false).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().connectTimeout(Duration.ofSeconds(10)).retryTimes(3).detectChineseCharset(true).build();
public List<UrlLiveBean> judgeIsDelete(List<String> urlList) {
GroupSync counter = new GroupSync();
......@@ -74,14 +74,11 @@ public class UrlLiveCrawlerNew {
Request request = RequestUtils.wrapGet(url, headers);
int code = 404;
for(int i = 0; i < 2; i++) {
try (Response response = httpBoot.syncCall(request)){
if(response.isSuccessful()) {
return matchDel(response.body().string(), url);
}else {
code = response.code();
}
} catch (Exception e) {
logger.error("解析", e);
com.zhiwei.http.boot.Response response = httpBoot.syncCall(request);
if(response.isSuccessful()) {
return matchDel(response.bodyString(), url);
}else {
code = response.code();
}
}
if(code == 403){
......
package com.zhiwei.source_forward.util;
import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.source_forward.config.ProxyConfig;
......@@ -19,7 +18,7 @@ public class ProxyInit {
String address = ProxyConfig.registry;
String appName = "xumiaoxin";
long appId = ProxyConfig.proxyid;
ProxyFactory.init(SimpleConfig.builder().registry(address).appName(appName).appId(appId).group(ProxyConfig.group).build());
com.zhiwei.http.proxy.ProxyFactory.init(SimpleConfig.builder().registry(address).appName(appName).appId(appId).group(ProxyConfig.group).build());
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment