Commit b3fce9ac by chenweiyang

基础包升级版

parent d705de1f
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId> <artifactId>source-forward</artifactId>
<version>0.3.0-SNAPSHOT</version> <version>0.4.0-SNAPSHOT</version>
<name>source-forward</name> <name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description> <description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
...@@ -30,13 +30,30 @@ ...@@ -30,13 +30,30 @@
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.2.4-SNAPSHOT</version> <version>0.3.2-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.http</groupId>
<artifactId>http-boot</artifactId>
<version>0.0.4.3-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.async</groupId>
<artifactId>task-boot</artifactId>
<version>0.20.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.kohlschutter.boilerpipe</groupId>
<artifactId>boilerpipe-extractor</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>proxy-client</artifactId>
<version>0.6.6.8-SNAPSHOT</version> <version>1.1.5-SNAPSHOT</version>
</dependency> </dependency>
</dependencies> </dependencies>
<!-- 打包管理 --> <!-- 打包管理 -->
......
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import java.time.Duration;
import java.util.Objects; import java.util.Objects;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import com.zhiwei.async.GroupSync; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.ContentBean; import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution; import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback; import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent; import com.zhiwei.source_forward.util.MatchContent;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
public class ContentCrawler { public class ContentCrawler {
private static Logger logger = LogManager.getLogger(ContentCrawler.class); private static Logger logger = LogManager.getLogger(ContentCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().connectTimeout(Duration.ofSeconds(10)).retryTimes(3).detectChineseCharset(true).build();
/** /**
* *
...@@ -75,15 +76,15 @@ public class ContentCrawler { ...@@ -75,15 +76,15 @@ public class ContentCrawler {
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
counter.add(); counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> { httpBoot.asyncCall(request, ProxySupplier.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
try { try {
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback); parseHtml(rs.bodyString(), attr, callback);
} else { } else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex); logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex);
} }
} catch (Exception e) { } catch (Exception e) {
logger.info("搜索结果访问失败: {}", ex); logger.info("搜索结果访问失败: ", ex);
} finally { } finally {
counter.done(); counter.done();
} }
......
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import java.time.Duration;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean; import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution; import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel; import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource; import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack; import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
...@@ -34,7 +34,7 @@ import okhttp3.Request; ...@@ -34,7 +34,7 @@ import okhttp3.Request;
public class MediaSelfSourceCrawler { public class MediaSelfSourceCrawler {
private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class); private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().connectTimeout(Duration.ofSeconds(10)).retryTimes(3).detectChineseCharset(true).build();
/** /**
* *
...@@ -91,7 +91,7 @@ public class MediaSelfSourceCrawler { ...@@ -91,7 +91,7 @@ public class MediaSelfSourceCrawler {
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) { private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", attr.get()); logger.info("当前处理 URL: {}", attr.get());
Map<String,Object> map = new HashMap<>(); Map<String,Object> map = new HashMap<>();
ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY; ProxySupplier ph = ProxySupplier.NAT_HEAVY_PROXY;
if(url.contains("toutiao.com")) { if(url.contains("toutiao.com")) {
map.put("referer", url); map.put("referer", url);
} }
...@@ -104,7 +104,7 @@ public class MediaSelfSourceCrawler { ...@@ -104,7 +104,7 @@ public class MediaSelfSourceCrawler {
try { try {
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
try { try {
parseHtml(rs.body().string(), attr, callback, rs.request().url().uri().toString()); parseHtml(rs.bodyString(), attr, callback, rs.request().url().uri().toString());
} catch (Exception e) { } catch (Exception e) {
logger.error("解析出错", e); logger.error("解析出错", e);
} }
...@@ -169,7 +169,7 @@ public class MediaSelfSourceCrawler { ...@@ -169,7 +169,7 @@ public class MediaSelfSourceCrawler {
source = MatchSource.matchMediaSelfSource(url + eUrl,result); source = MatchSource.matchMediaSelfSource(url + eUrl,result);
channel = MatchChannel.verifyChannel(url); channel = MatchChannel.verifyChannel(url);
if(channel==null){ if(channel==null){
List<Node> nodeList = Jsoup.parse(result).head().childNodes(); List<org.jsoup.nodes.Node> nodeList = Jsoup.parse(result).head().childNodes();
channel = MatchChannel.matchChannel(nodeList); channel = MatchChannel.matchChannel(nodeList);
} }
} catch (Exception e) { } catch (Exception e) {
......
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import java.time.Duration;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
...@@ -11,16 +12,16 @@ import org.jsoup.Jsoup; ...@@ -11,16 +12,16 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.SourceForwardBean; import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution; import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel; import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource; import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.SourceData; import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack; import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
...@@ -29,7 +30,7 @@ public class SourceForwardCrawler { ...@@ -29,7 +30,7 @@ public class SourceForwardCrawler {
private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class); private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().connectTimeout(Duration.ofSeconds(10)).retryTimes(3).detectChineseCharset(true).build();
private static List<String> sourceList = SourceData.getSourceList(); private static List<String> sourceList = SourceData.getSourceList();
public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) { public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) {
...@@ -77,10 +78,10 @@ public class SourceForwardCrawler { ...@@ -77,10 +78,10 @@ public class SourceForwardCrawler {
} }
Request request = RequestUtils.wrapGet(url, headers); Request request = RequestUtils.wrapGet(url, headers);
counter.add(); counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> { httpBoot.asyncCall(request, ProxySupplier.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
try { try {
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback); parseHtml(rs.bodyString(), attr, callback);
} else { } else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex); logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
} }
......
...@@ -2,6 +2,7 @@ package com.zhiwei.source_forward.crawler; ...@@ -2,6 +2,7 @@ package com.zhiwei.source_forward.crawler;
import static java.util.Objects.nonNull; import static java.util.Objects.nonNull;
import java.time.Duration;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
...@@ -17,13 +18,13 @@ import org.jsoup.nodes.Document; ...@@ -17,13 +18,13 @@ import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.JSONPath; import com.alibaba.fastjson.JSONPath;
import com.zhiwei.async.GroupSync; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean; import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution; import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.UrlLiveDataCallback; import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
...@@ -39,7 +40,7 @@ import okhttp3.Request; ...@@ -39,7 +40,7 @@ import okhttp3.Request;
public class UrlLiveCrawler { public class UrlLiveCrawler {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class); private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(true).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().connectTimeout(Duration.ofSeconds(10)).retryTimes(3).detectChineseCharset(true).build();
public GroupSync submitTask(UrlLiveDataCallback callback,String... urls) { public GroupSync submitTask(UrlLiveDataCallback callback,String... urls) {
GroupSync counter = new GroupSync(); GroupSync counter = new GroupSync();
...@@ -71,12 +72,12 @@ public class UrlLiveCrawler { ...@@ -71,12 +72,12 @@ public class UrlLiveCrawler {
url = dealUrl(url); url = dealUrl(url);
logger.info("当前处理 URL: {}", url); logger.info("当前处理 URL: {}", url);
Map<String,String> headers = new HashMap<>(); Map<String,String> headers = new HashMap<>();
ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY; ProxySupplier ph = ProxySupplier.NAT_HEAVY_PROXY;
if(url.contains("toutiao.com")){ if(url.contains("toutiao.com")){
ph = ProxyHolder.NAT_HEAVY_PROXY; ph = ProxySupplier.NAT_HEAVY_PROXY;
}else if(url.contains("zhihu.com")) { }else if(url.contains("zhihu.com")) {
url = treatZhihuUrl(url); url = treatZhihuUrl(url);
ph = ProxyHolder.NAT_HEAVY_PROXY; ph = ProxySupplier.NAT_HEAVY_PROXY;
} }
try { try {
Request request = RequestUtils.wrapGet(url, headers); Request request = RequestUtils.wrapGet(url, headers);
...@@ -87,7 +88,7 @@ public class UrlLiveCrawler { ...@@ -87,7 +88,7 @@ public class UrlLiveCrawler {
try { try {
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
if(rs.isSuccessful()) { if(rs.isSuccessful()) {
parseHtml(rs.body().string(), attr, callback); parseHtml(rs.bodyString(), attr, callback);
}else if(rs.code() == 404){ }else if(rs.code() == 404){
callBack(callback, attr, 1, String.valueOf(rs.code())); callBack(callback, attr, 1, String.valueOf(rs.code()));
}else { }else {
......
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import java.time.Duration;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
...@@ -13,19 +14,18 @@ import org.jsoup.Jsoup; ...@@ -13,19 +14,18 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.async.GroupSync; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.async.TaskBoot; import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean; import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.task.async.TaskBoot;
import com.zhiwei.task.sync.GroupSync;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
public class UrlLiveCrawlerNew { public class UrlLiveCrawlerNew {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawlerNew.class); private static final Logger logger = LogManager.getLogger(UrlLiveCrawlerNew.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(false).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().connectTimeout(Duration.ofSeconds(10)).retryTimes(3).detectChineseCharset(true).build();
public List<UrlLiveBean> judgeIsDelete(List<String> urlList) { public List<UrlLiveBean> judgeIsDelete(List<String> urlList) {
GroupSync counter = new GroupSync(); GroupSync counter = new GroupSync();
...@@ -74,15 +74,12 @@ public class UrlLiveCrawlerNew { ...@@ -74,15 +74,12 @@ public class UrlLiveCrawlerNew {
Request request = RequestUtils.wrapGet(url, headers); Request request = RequestUtils.wrapGet(url, headers);
int code = 404; int code = 404;
for(int i = 0; i < 2; i++) { for(int i = 0; i < 2; i++) {
try (Response response = httpBoot.syncCall(request)){ com.zhiwei.http.boot.Response response = httpBoot.syncCall(request);
if(response.isSuccessful()) { if(response.isSuccessful()) {
return matchDel(response.body().string(), url); return matchDel(response.bodyString(), url);
}else { }else {
code = response.code(); code = response.code();
} }
} catch (Exception e) {
logger.error("解析", e);
}
} }
if(code == 403){ if(code == 403){
return callBack(url, -1, String.valueOf(code)); return callBack(url, -1, String.valueOf(code));
......
package com.zhiwei.source_forward.util; package com.zhiwei.source_forward.util;
import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig; import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.source_forward.config.ProxyConfig; import com.zhiwei.source_forward.config.ProxyConfig;
...@@ -19,7 +18,7 @@ public class ProxyInit { ...@@ -19,7 +18,7 @@ public class ProxyInit {
String address = ProxyConfig.registry; String address = ProxyConfig.registry;
String appName = "xumiaoxin"; String appName = "xumiaoxin";
long appId = ProxyConfig.proxyid; long appId = ProxyConfig.proxyid;
ProxyFactory.init(SimpleConfig.builder().registry(address).appName(appName).appId(appId).group(ProxyConfig.group).build()); com.zhiwei.http.proxy.ProxyFactory.init(SimpleConfig.builder().registry(address).appName(appName).appId(appId).group(ProxyConfig.group).build());
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment