Commit 2c9d4fa2 by chenweiyang

代理升级版本

parent 4860f41e
...@@ -3,13 +3,17 @@ ...@@ -3,13 +3,17 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId> <artifactId>source-forward</artifactId>
<version>0.3.0-SNAPSHOT</version> <version>0.3.1-SNAPSHOT</version>
<name>source-forward</name> <name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description> <description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<http-boot.version>0.1.0.8-SNAPSHOT</http-boot.version>
<task-boot.version>1.1.2-SNAPSHOT</task-boot.version>
<boilerpipe.version>0.0.1-SNAPSHOT</boilerpipe.version>
<conomys-consumer.version>0.0.3-SNAPSHOT</conomys-consumer.version>
</properties> </properties>
<developers> <developers>
...@@ -30,12 +34,30 @@ ...@@ -30,12 +34,30 @@
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.2.4-SNAPSHOT</version> <version>0.4.5-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.kohlschutter.boilerpipe</groupId>
<artifactId>crawler-core</artifactId> <artifactId>boilerpipe-extractor</artifactId>
<version>0.6.6.8-SNAPSHOT</version> <version>${boilerpipe.version}</version>
</dependency>
<dependency>
<groupId>com.zhiwei.http</groupId>
<artifactId>http-boot</artifactId>
<version>${http-boot.version}</version>
</dependency>
<dependency>
<groupId>com.zhiwei.async</groupId>
<artifactId>task-boot</artifactId>
<version>${task-boot.version}</version>
</dependency>
<dependency>
<groupId>com.zhiwei.network</groupId>
<artifactId>cynomys-consumer</artifactId>
<version>${conomys-consumer.version}</version>
</dependency> </dependency>
</dependencies> </dependencies>
......
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import java.util.Objects; import java.util.Objects;
import java.util.concurrent.Semaphore;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import com.zhiwei.async.GroupSync; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.proxy.ProxyServerSupplier;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.ContentBean; import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution; import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback; import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent; import com.zhiwei.source_forward.util.MatchContent;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
public class ContentCrawler { public class ContentCrawler {
private static Logger logger = LogManager.getLogger(ContentCrawler.class); private static Logger logger = LogManager.getLogger(ContentCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
private static Semaphore semaphore = new Semaphore(5);
/** /**
* *
...@@ -51,9 +53,12 @@ public class ContentCrawler { ...@@ -51,9 +53,12 @@ public class ContentCrawler {
ZhiWeiTools.sleep(100); ZhiWeiTools.sleep(100);
if (url != null) { if (url != null) {
try { try {
semaphore.acquire();
search(counter, url, Attribution.of(url), callback); search(counter, url, Attribution.of(url), callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("搜索创建出错", e); logger.error("搜索创建出错", e);
} finally {
semaphore.release();
} }
} }
} }
...@@ -75,15 +80,15 @@ public class ContentCrawler { ...@@ -75,15 +80,15 @@ public class ContentCrawler {
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
counter.add(); counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> { httpBoot.asyncCall(request, ProxyServerSupplier.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
try { try {
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback); parseHtml(rs.bodyString(), attr, callback);
} else { } else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex); logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex);
} }
} catch (Exception e) { } catch (Exception e) {
logger.info("搜索结果访问失败: {}", ex); logger.info("搜索结果访问失败: ", ex);
} finally { } finally {
counter.done(); counter.done();
} }
......
...@@ -4,22 +4,23 @@ import java.util.HashMap; ...@@ -4,22 +4,23 @@ import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import java.util.concurrent.Semaphore;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.proxy.ProxyServerSupplier;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean; import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution; import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel; import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource; import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack; import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
...@@ -34,7 +35,8 @@ import okhttp3.Request; ...@@ -34,7 +35,8 @@ import okhttp3.Request;
public class MediaSelfSourceCrawler { public class MediaSelfSourceCrawler {
private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class); private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
private static Semaphore semaphore = new Semaphore(5);
/** /**
* *
...@@ -69,9 +71,12 @@ public class MediaSelfSourceCrawler { ...@@ -69,9 +71,12 @@ public class MediaSelfSourceCrawler {
counter.add(); counter.add();
if (url != null) { if (url != null) {
try { try {
semaphore.acquire();
search(counter, url, Attribution.of(url), callback); search(counter, url, Attribution.of(url), callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("搜索创建出错", e); logger.error("搜索创建出错", e);
} finally {
semaphore.release();
} }
} }
counter.done(); counter.done();
...@@ -91,7 +96,7 @@ public class MediaSelfSourceCrawler { ...@@ -91,7 +96,7 @@ public class MediaSelfSourceCrawler {
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) { private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", attr.get()); logger.info("当前处理 URL: {}", attr.get());
Map<String,Object> map = new HashMap<>(); Map<String,Object> map = new HashMap<>();
ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY; ProxyServerSupplier ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
if(url.contains("toutiao.com")) { if(url.contains("toutiao.com")) {
map.put("referer", url); map.put("referer", url);
} }
...@@ -104,7 +109,7 @@ public class MediaSelfSourceCrawler { ...@@ -104,7 +109,7 @@ public class MediaSelfSourceCrawler {
try { try {
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
try { try {
parseHtml(rs.body().string(), attr, callback, rs.request().url().uri().toString()); parseHtml(rs.bodyString(), attr, callback, rs.bootRequest().url().uri().toString());
} catch (Exception e) { } catch (Exception e) {
logger.error("解析出错", e); logger.error("解析出错", e);
} }
......
...@@ -4,6 +4,7 @@ import java.util.HashMap; ...@@ -4,6 +4,7 @@ import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import java.util.concurrent.Semaphore;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
...@@ -11,16 +12,16 @@ import org.jsoup.Jsoup; ...@@ -11,16 +12,16 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.proxy.ProxyServerSupplier;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.SourceForwardBean; import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution; import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel; import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource; import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.SourceData; import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack; import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
...@@ -29,8 +30,9 @@ public class SourceForwardCrawler { ...@@ -29,8 +30,9 @@ public class SourceForwardCrawler {
private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class); private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
private static List<String> sourceList = SourceData.getSourceList(); private static List<String> sourceList = SourceData.getSourceList();
private static Semaphore semaphore = new Semaphore(5);
public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) { public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) {
try { try {
...@@ -50,9 +52,12 @@ public class SourceForwardCrawler { ...@@ -50,9 +52,12 @@ public class SourceForwardCrawler {
ZhiWeiTools.sleep(100); ZhiWeiTools.sleep(100);
if (url != null) { if (url != null) {
try { try {
semaphore.acquire();
search(counter, url, Attribution.of(url), callback); search(counter, url, Attribution.of(url), callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("搜索创建出错", e); logger.error("搜索创建出错", e);
} finally {
semaphore.release();
} }
} }
counter.done(); counter.done();
...@@ -77,10 +82,10 @@ public class SourceForwardCrawler { ...@@ -77,10 +82,10 @@ public class SourceForwardCrawler {
} }
Request request = RequestUtils.wrapGet(url, headers); Request request = RequestUtils.wrapGet(url, headers);
counter.add(); counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> { httpBoot.asyncCall(request, ProxyServerSupplier.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
try { try {
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback); parseHtml(rs.bodyString(), attr, callback);
} else { } else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex); logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
} }
......
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import static java.util.Objects.nonNull;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.Semaphore;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.JSONPath; import com.alibaba.fastjson.JSONPath;
import com.zhiwei.async.GroupSync; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.proxy.ProxyServerSupplier;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean; import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution; import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.UrlLiveDataCallback; import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.util.Objects.nonNull; import okhttp3.Request;
/** /**
* @author byte-zbs * @author byte-zbs
...@@ -32,7 +39,8 @@ import static java.util.Objects.nonNull; ...@@ -32,7 +39,8 @@ import static java.util.Objects.nonNull;
public class UrlLiveCrawler { public class UrlLiveCrawler {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class); private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(true).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
private static Semaphore semaphore = new Semaphore(5);
public GroupSync submitTask(UrlLiveDataCallback callback, String... urls) { public GroupSync submitTask(UrlLiveDataCallback callback, String... urls) {
GroupSync counter = new GroupSync(); GroupSync counter = new GroupSync();
...@@ -43,17 +51,20 @@ public class UrlLiveCrawler { ...@@ -43,17 +51,20 @@ public class UrlLiveCrawler {
private void start(GroupSync counter, UrlLiveDataCallback callback, String... urls) { private void start(GroupSync counter, UrlLiveDataCallback callback, String... urls) {
if (nonNull(urls) && urls.length > 0) { if (nonNull(urls) && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
try {
counter.add(); counter.add();
ZhiWeiTools.sleep(100); semaphore.acquire();
ZhiWeiTools.sleep(200);
if (nonNull(url)) { if (nonNull(url)) {
try {
// ZhiWeiTools.sleep(3000); // ZhiWeiTools.sleep(3000);
search(counter, url, Attribution.of(url, 1), callback); search(counter, url, Attribution.of(url, 1), callback);
}
} catch (Exception e) { } catch (Exception e) {
logger.error("搜索创建出错:", e); logger.error("搜索创建出错:", e);
} } finally {
}
counter.done(); counter.done();
semaphore.release();
}
} }
} }
} }
...@@ -63,12 +74,12 @@ public class UrlLiveCrawler { ...@@ -63,12 +74,12 @@ public class UrlLiveCrawler {
url = dealUrl(url); url = dealUrl(url);
logger.info("当前处理 URL: {}", url); logger.info("当前处理 URL: {}", url);
Map<String, String> headers = new HashMap<>(); Map<String, String> headers = new HashMap<>();
ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY; ProxyServerSupplier ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
if (url.contains("toutiao.com")) { if (url.contains("toutiao.com")) {
ph = ProxyHolder.NAT_HEAVY_PROXY; ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
} else if (url.contains("zhihu.com")) { } else if (url.contains("zhihu.com")) {
url = treatZhihuUrl(url); url = treatZhihuUrl(url);
ph = ProxyHolder.NAT_HEAVY_PROXY; ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
} }
try { try {
Request request = RequestUtils.wrapGet(url, headers); Request request = RequestUtils.wrapGet(url, headers);
...@@ -80,7 +91,7 @@ public class UrlLiveCrawler { ...@@ -80,7 +91,7 @@ public class UrlLiveCrawler {
System.out.println(rs.code()); System.out.println(rs.code());
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
if (rs.isSuccessful()) { if (rs.isSuccessful()) {
parseHtml(rs.body().string(), attr, callback); parseHtml(rs.bodyString(), attr, callback);
} else if (rs.code() == 404) { } else if (rs.code() == 404) {
callBack(callback, attr, 1, String.valueOf(rs.code())); callBack(callback, attr, 1, String.valueOf(rs.code()));
} else { } else {
...@@ -91,7 +102,7 @@ public class UrlLiveCrawler { ...@@ -91,7 +102,7 @@ public class UrlLiveCrawler {
callBack(callback, attr, -1, "程序无法判断"); callBack(callback, attr, -1, "程序无法判断");
} }
} catch (Exception e) { } catch (Exception e) {
logger.error(" 数据是否删除 采集出错 {} ", e); logger.error(" 数据是否删除 采集出错 ", e);
} finally { } finally {
counter.done(); counter.done();
} }
...@@ -99,7 +110,7 @@ public class UrlLiveCrawler { ...@@ -99,7 +110,7 @@ public class UrlLiveCrawler {
return counter; return counter;
} }
} catch (Exception e2) { } catch (Exception e2) {
logger.error("数据出错 {}", e2); logger.error("数据出错 ", e2);
} }
return counter; return counter;
} }
......
package com.zhiwei.source_forward.crawler; //package com.zhiwei.source_forward.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.Arrays; //import java.util.Arrays;
import java.util.HashMap; //import java.util.HashMap;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
import java.util.Objects; //import java.util.Objects;
//import java.util.concurrent.Semaphore;
import org.apache.logging.log4j.LogManager; //
import org.apache.logging.log4j.Logger; //import org.apache.logging.log4j.LogManager;
import org.jsoup.Jsoup; //import org.apache.logging.log4j.Logger;
import org.jsoup.nodes.Document; //import org.jsoup.Jsoup;
//import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject; //
import com.zhiwei.async.GroupSync; //import com.alibaba.fastjson.JSONObject;
import com.zhiwei.async.TaskBoot; //import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.HttpBoot; //import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils; //import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean; //import com.zhiwei.task.async.TaskBoot;
//import com.zhiwei.task.sync.GroupSync;
import okhttp3.Request; //
import okhttp3.Response; //import okhttp3.Request;
//import okhttp3.Response;
public class UrlLiveCrawlerNew { //
//public class UrlLiveCrawlerNew {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawlerNew.class); //
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(false).build(); // private static final Logger logger = LogManager.getLogger(UrlLiveCrawlerNew.class);
// private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).throwException(false).build();
public List<UrlLiveBean> judgeIsDelete(List<String> urlList) { // private static Semaphore semaphore = new Semaphore(5);
GroupSync counter = new GroupSync(); //
List<UrlLiveBean> ulbList = new ArrayList<>(); //
urlList.forEach(url -> { // public List<UrlLiveBean> judgeIsDelete(List<String> urlList) {
try { // GroupSync counter = new GroupSync();
counter.add(); // List<UrlLiveBean> ulbList = new ArrayList<>();
TaskBoot.blockingAsync(() -> { // urlList.forEach(url -> {
try { // try {
counter.add(); // counter.add();
UrlLiveBean ulb = dealUrlLive(url); // TaskBoot.blockingAsync(() -> {
if(Objects.nonNull(ulb)) { // try {
ulbList.add(ulb); // counter.add();
} // UrlLiveBean ulb = dealUrlLive(url);
} catch (Exception e) { // if(Objects.nonNull(ulb)) {
logger.error("链接是否删除新", e); // ulbList.add(ulb);
} finally { // }
counter.done(); // } catch (Exception e) {
} // logger.error("链接是否删除新", e);
}); // } finally {
} catch (Exception e2) { // counter.done();
logger.error("数据出错 {}" ,e2); // }
} finally { // });
counter.done(); // } catch (Exception e2) {
} // logger.error("数据出错 {}" ,e2);
}); // } finally {
try { // counter.done();
counter.await(); // }
} catch (InterruptedException e) { // });
e.printStackTrace(); // try {
} // counter.await();
return ulbList; // } catch (InterruptedException e) {
} // e.printStackTrace();
// }
private UrlLiveBean dealUrlLive(String url) { // return ulbList;
try { // }
url = dealUrl(url); //
logger.info("当前处理 URL: {}", url); // private UrlLiveBean dealUrlLive(String url) {
Map<String,String> headers = new HashMap<>(); // try {
// Map<String,String> headers = HeaderTool.getCommonHead(); // url = dealUrl(url);
if(url.contains("www.toutiao.com")){ // logger.info("当前处理 URL: {}", url);
headers.put("referer", url); // Map<String,String> headers = new HashMap<>();
}else if(url.contains("zhihu.com")) { //// Map<String,String> headers = HeaderTool.getCommonHead();
url = treatZhihuUrl(url); // if(url.contains("www.toutiao.com")){
} // headers.put("referer", url);
Request request = RequestUtils.wrapGet(url, headers); // }else if(url.contains("zhihu.com")) {
int code = 404; // url = treatZhihuUrl(url);
for(int i = 0; i < 2; i++) { // }
try (Response response = httpBoot.syncCall(request)){ // Request request = RequestUtils.wrapGet(url, headers);
if(response.isSuccessful()) { // int code = 404;
return matchDel(response.body().string(), url); // for(int i = 0; i < 2; i++) {
}else { // try (Response response = httpBoot.syncCall(request)){
code = response.code(); // if(response.isSuccessful()) {
} // return matchDel(response.body().string(), url);
} catch (Exception e) { // }else {
logger.error("解析", e); // code = response.code();
} // }
} // } catch (Exception e) {
if(code == 403){ // logger.error("解析", e);
return callBack(url, -1, String.valueOf(code)); // }
}else { // }
return callBack(url, 1, String.valueOf(code)); // if(code == 403){
} // return callBack(url, -1, String.valueOf(code));
} catch (Exception e) { // }else {
e.printStackTrace(); // return callBack(url, 1, String.valueOf(code));
return null; // }
} // } catch (Exception e) {
} // e.printStackTrace();
// return null;
private UrlLiveBean callBack(String url,int i,String title) { // }
if(i == 1) { // }
return new UrlLiveBean(url, true, title); //
}else { // private UrlLiveBean callBack(String url,int i,String title) {
return new UrlLiveBean(url, i, title); // if(i == 1) {
} // return new UrlLiveBean(url, true, title);
} // }else {
// return new UrlLiveBean(url, i, title);
private String dealUrl(String url) { // }
try { // }
if(url.contains("toutiao.com")) { //
if(url.contains("www.toutiao.com")) { // private String dealUrl(String url) {
// try {
}else { // if(url.contains("toutiao.com")) {
url = url.replace("toutiao.com", "www.toutiao.com"); // if(url.contains("www.toutiao.com")) {
} //
if(url.contains("https")) { // }else {
// url = url.replace("toutiao.com", "www.toutiao.com");
}else { // }
url = url.replace("http", "https"); // if(url.contains("https")) {
} //
if(url.contains("group")) { // }else {
url = "https://www.toutiao.com/a" + url.split("/")[4] + "/"; // url = url.replace("http", "https");
} // }
}else if(url.contains("mp.weixin.qq.com")) { // if(url.contains("group")) {
if(url.contains("https")) { // url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
// }
}else { // }else if(url.contains("mp.weixin.qq.com")) {
url = url.replace("http", "https"); // if(url.contains("https")) {
} //
}else if(url.contains("a.mp.uc.cn") && url.contains("wm_aid")) { // }else {
url = "http://ff.dayu.com/contents/origin/"+url.split("wm_aid=")[1].split("!!wm_id")[0]+"?biz_id=1002&_fetch_author=1&_fetch_incrs=1"; // url = url.replace("http", "https");
} // }
return url; // }else if(url.contains("a.mp.uc.cn") && url.contains("wm_aid")) {
} catch (Exception e) { // url = "http://ff.dayu.com/contents/origin/"+url.split("wm_aid=")[1].split("!!wm_id")[0]+"?biz_id=1002&_fetch_author=1&_fetch_incrs=1";
return url; // }
} // return url;
} // } catch (Exception e) {
// return url;
/*** // }
* @Title: matchDel // }
* @author hero //
* @Description: 验证链接是否有效 // /***
* @param @param page // * @Title: matchDel
* @param @return 设定文件 // * @author hero
* @return boolean 返回类型 // * @Description: 验证链接是否有效
*/ // * @param @param page
public UrlLiveBean matchDel(String result,String url){ // * @param @return 设定文件
try { // * @return boolean 返回类型
Document doc = Jsoup.parse(result); // */
String title = null; // public UrlLiveBean matchDel(String result,String url){
if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com") || url.contains("weixin.sogou.com")){ // try {
title = doc.select("h2.rich_media_title").text().replaceAll(" ", ""); // Document doc = Jsoup.parse(result);
if(Objects.isNull(title) || title.isEmpty()) { // String title = null;
title = doc.select("p.title").text(); // if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com") || url.contains("weixin.sogou.com")){
} // title = doc.select("h2.rich_media_title").text().replaceAll(" ", "");
if(Objects.isNull(title) || title.isEmpty()) { // if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("h3.msg-title").text(); // title = doc.select("p.title").text();
} // }
if(Objects.isNull(title) || title.isEmpty()) { // if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("div.global_error_msg.warn").text(); // title = doc.select("h3.msg-title").text();
} // }
if(Objects.isNull(title) || title.isEmpty()) { // if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("p.tips").text(); // title = doc.select("div.global_error_msg.warn").text();
} // }
if(Objects.isNull(title) || title.isEmpty()) { // if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("h2").text(); // title = doc.select("p.tips").text();
} // }
}else if(url.contains("kuaibao")){ // if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("p.title").text().replaceAll(" ", ""); // title = doc.select("h2").text();
}else if(url.contains("chinadaily.com.cn")){ // }
title = doc.select("p.style1").text().replaceAll(" ", ""); // }else if(url.contains("kuaibao")){
}else if(url.contains("baidu.com") || url.contains("hao123.com")) { // title = doc.select("p.title").text().replaceAll(" ", "");
title = doc.select("p#contaniner").text(); // }else if(url.contains("chinadaily.com.cn")){
}else if(url.contains("kanfanews.com")) { // title = doc.select("p.style1").text().replaceAll(" ", "");
title = doc.select("p#tit").text(); // }else if(url.contains("baidu.com") || url.contains("hao123.com")) {
}else if(url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) { // title = doc.select("p#contaniner").text();
title = "网页已删除"; // }else if(url.contains("kanfanews.com")) {
}else if(url.contains("a.mp.uc.cn")) { // title = doc.select("p#tit").text();
try { // }else if(url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) {
JSONObject json = JSONObject.parseObject(result); // title = "网页已删除";
title = json.getJSONObject("data").getString("title"); // }else if(url.contains("a.mp.uc.cn")) {
if(Objects.isNull(title) || title.length() < 1) { // try {
title = "网页已删除"; // JSONObject json = JSONObject.parseObject(result);
} // title = json.getJSONObject("data").getString("title");
} catch (Exception e) { // if(Objects.isNull(title) || title.length() < 1) {
logger.error(" uc 数据 json 转换失败", e); // title = "网页已删除";
} // }
}else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) { // } catch (Exception e) {
title = "网页已删除"; // logger.error(" uc 数据 json 转换失败", e);
}else if(url.contains("zhihu.com")) { // }
JSONObject resultJson = JSONObject.parseObject(result); // }else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) {
// title = "网页已删除";
title = resultJson.getString("title")!=null?resultJson.getString("title"):resultJson.getString("message"); // }else if(url.contains("zhihu.com")) {
} // JSONObject resultJson = JSONObject.parseObject(result);
//
//若title 为拿到 用 此方法 // title = resultJson.getString("title")!=null?resultJson.getString("title"):resultJson.getString("message");
if(Objects.isNull(title) || title.length() < 1) { // }
title = doc.select("div.adiv > p > span").text().replaceAll(" ", ""); //
} // //若title 为拿到 用 此方法
// if(Objects.isNull(title) || title.length() < 1) {
//若title 为拿到 用 此方法 // title = doc.select("div.adiv > p > span").text().replaceAll(" ", "");
if(Objects.isNull(title) || title.length() < 1) { // }
title = doc.select("title").text().replaceAll(" ", ""); //
} // //若title 为拿到 用 此方法
// if(Objects.isNull(title) || title.length() < 1) {
//若title 为拿到 用 此方法 // title = doc.select("title").text().replaceAll(" ", "");
if(Objects.isNull(title) || title.length() < 1) { // }
title = doc.select("h1").text().replaceAll(" ", ""); //
} // //若title 为拿到 用 此方法
// if(Objects.isNull(title) || title.length() < 1) {
//若title 为拿到 用 此方法 // title = doc.select("h1").text().replaceAll(" ", "");
if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) { // }
title = "网页已删除"; //
} // //若title 为拿到 用 此方法
// if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
if(Objects.nonNull(title) && title.length() > 1){ // title = "网页已删除";
return new UrlLiveBean(url, isDelete(title),title); // }
} else { //
return null; // if(Objects.nonNull(title) && title.length() > 1){
} // return new UrlLiveBean(url, isDelete(title),title);
} catch (Exception e) { // } else {
return null; // return null;
} // }
} // } catch (Exception e) {
// return null;
/**
 * Decides whether a page title indicates that the page has been deleted or is
 * otherwise unavailable.
 *
 * <p>A title is reported as deleted when it contains any of the marker fragments,
 * or when it is exactly equal to one of the known placeholder titles, both of
 * which are listed in the method body.
 *
 * @param title page title to classify; must not be {@code null}. The title
 *              extraction visible in this file removes spaces before calling.
 * @return {@code true} if the title matches a marker, {@code false} otherwise
 */
private boolean isDelete(String title) {
    // Placeholder titles: the whole title has to equal one of these.
    List<String> exactTitles = Arrays.asList(
            "系统出错", "该内容已被发布者删除", "网页已删除",
            "此帐号已自主注销,内容无法查看", "页面提示", "正在维护中",
            "此文章被第三方评估为不实信息", "财经头条", "知识100题", "502BadGateway",
            "提示信息", "跳转页", "跳转中...", "此帐号在冻结期,内容无法查看", "东北新闻网",
            "百度一下,你就知道", "帐号已迁移", "手机百度", "内容被删除", "亚博国际|首页",
            "中国软件网", "云广网", "新浪首页", "文章暂时找不到了", "-法易网",
            "【一点资讯】www.yidianzixun.com", "错误页面", "网站暂停通知", "【快资讯】你的专属资讯平台",
            "百度新闻——全球最大的中文新闻平台", "以上文章由以下机构判定为不实信息", "该公众号已迁移",
            "财经网-CAIJING.COM.CN", "蚂蚁资讯", "参数错误", "时尚头条_YOKA时尚网", "该文章已经被删除",
            "网易", "链接已过期", "找不到页面", "今晚网", "该文章已被删除", "该回答已被删除-知乎", "资源不存在");

    // Marker fragments: it is enough for the title to contain one of these.
    List<String> fragments = Arrays.asList(
            "提示信息-", "此内容因违规无法查看", "微信公众号不存在",
            "此内容被投诉且经审核涉嫌侵权,无法查看", "thepageyourequestedwasnotfound", "未知错误",
            "Objectmoved", "404", "页面没有找到", "页面未找到", "301MovedPermanently", "加载异常",
            "此帐号已被屏蔽, 内容无法查看",
            // Titles reach this method with spaces already removed, so the spaced
            // marker above can never match them; the space-free form is needed too.
            "此帐号已被屏蔽,内容无法查看",
            "链接不存在");

    boolean hasFragment = fragments.stream().anyMatch(title::contains);
    return hasFragment || exactTitles.stream().anyMatch(title::equals);
}
//
// return cList.stream().anyMatch(title::contains) || eList.stream().anyMatch(title::equals);
/**
 * Rewrites a Zhihu page URL to the matching api.zhihu.com endpoint.
 *
 * <p>URLs containing "/answer/" map to the answers endpoint, URLs containing
 * "/question/" (and no "/answer/") map to the questions endpoint, and URLs
 * containing "/p/" map to the articles endpoint. In each case everything up to
 * and including the path marker is dropped and the remainder is appended to the
 * endpoint. Any other URL is returned unchanged.
 *
 * @param url the URL to rewrite
 * @return the API URL, or {@code url} itself when no known path marker is present
 */
private static String treatZhihuUrl(String url) {
    if (url.contains("/answer/")) {
        String answerId = url.replaceAll(".*/answer/", "");
        return "https://api.zhihu.com/answers/" + answerId;
    }
    // Past this point the URL is known not to contain "/answer/".
    if (url.contains("/question/")) {
        String questionId = url.replaceAll(".*/question/", "");
        return "https://api.zhihu.com/questions/" + questionId;
    }
    if (url.contains("/p/")) {
        String articleId = url.replaceAll(".*/p/", "");
        return "https://api.zhihu.com/articles/" + articleId;
    }
    return url;
}
// }
// return url;
} // }
//
//
//}
...@@ -80,7 +80,7 @@ public class SourceForward { ...@@ -80,7 +80,7 @@ public class SourceForward {
public static void main(String[] args) { public static void main(String[] args) {
ProxyInit.initProxy(); ProxyInit.initProxy();
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("http://gu.qq.com/resources/shy/news/detail-v2/index.html?#/index?id=SN202006091653447945411f"); urlList.add("https://ypstatic.cnnb.com.cn/yppage-share/news/share/news_detail?newsId=627223d9e4b042b45e211c5a");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList); List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
for(SourceForwardBean sfb : da) { for(SourceForwardBean sfb : da) {
System.out.println("=============="+sfb.toString()); System.out.println("=============="+sfb.toString());
......
...@@ -98,7 +98,7 @@ public class URLLive { ...@@ -98,7 +98,7 @@ public class URLLive {
}; };
crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await(); crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
}catch (Exception e){ }catch (Exception e){
logger.error(" 数据采集运行有问题 {} ", e); logger.error(" 数据采集运行有问题 ", e);
} }
return list; return list;
} }
......
package com.zhiwei.source_forward.util; package com.zhiwei.source_forward.util;
import com.zhiwei.crawler.core.proxy.ProxyFactory; import org.apache.dubbo.config.ApplicationConfig;
import com.zhiwei.proxy.config.SimpleConfig; import org.apache.dubbo.config.ConsumerConfig;
import com.zhiwei.source_forward.config.ProxyConfig; import org.apache.dubbo.config.RegistryConfig;
import com.zhiwei.http.proxy.CynomysFactory;
import com.zhiwei.network.cynomys.consumer.CynomysConsumer;
import com.zhiwei.network.cynomys.consumer.CynomysConsumerFactory;
/** /**
* 初始化代理 * 初始化代理
...@@ -16,10 +20,18 @@ public class ProxyInit { ...@@ -16,10 +20,18 @@ public class ProxyInit {
* void * void
*/ */
public static void initProxy() { public static void initProxy() {
String address = ProxyConfig.registry; ApplicationConfig applicationConfig = new ApplicationConfig();
String appName = "xumiaoxin"; applicationConfig.setName("actool");
long appId = ProxyConfig.proxyid; RegistryConfig registryConfig = new RegistryConfig();
ProxyFactory.init(SimpleConfig.builder().registry(address).appName(appName).appId(appId).group(ProxyConfig.group).build()); registryConfig.setAddress("zookeeper://192.168.0.30:2181");
ConsumerConfig consumerConfig = new ConsumerConfig();
// 设置分组
consumerConfig.setGroup("local");
String username = "18271694195";
String password = "Zhiwei289";
// 创建 consumer,applicationConfig 非必需参数
CynomysConsumer consumer = CynomysConsumerFactory.create(applicationConfig, registryConfig, consumerConfig, username, password);
CynomysFactory.init(consumer);
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment