Commit 2c9d4fa2 by chenweiyang

代理升级版本

parent 4860f41e
...@@ -3,13 +3,17 @@ ...@@ -3,13 +3,17 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId> <artifactId>source-forward</artifactId>
<version>0.3.0-SNAPSHOT</version> <version>0.3.1-SNAPSHOT</version>
<name>source-forward</name> <name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description> <description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<http-boot.version>0.1.0.8-SNAPSHOT</http-boot.version>
<task-boot.version>1.1.2-SNAPSHOT</task-boot.version>
<boilerpipe.version>0.0.1-SNAPSHOT</boilerpipe.version>
<conomys-consumer.version>0.0.3-SNAPSHOT</conomys-consumer.version>
</properties> </properties>
<developers> <developers>
...@@ -30,12 +34,30 @@ ...@@ -30,12 +34,30 @@
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.2.4-SNAPSHOT</version> <version>0.4.5-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.kohlschutter.boilerpipe</groupId>
<artifactId>crawler-core</artifactId> <artifactId>boilerpipe-extractor</artifactId>
<version>0.6.6.8-SNAPSHOT</version> <version>${boilerpipe.version}</version>
</dependency>
<dependency>
<groupId>com.zhiwei.http</groupId>
<artifactId>http-boot</artifactId>
<version>${http-boot.version}</version>
</dependency>
<dependency>
<groupId>com.zhiwei.async</groupId>
<artifactId>task-boot</artifactId>
<version>${task-boot.version}</version>
</dependency>
<dependency>
<groupId>com.zhiwei.network</groupId>
<artifactId>cynomys-consumer</artifactId>
<version>${conomys-consumer.version}</version>
</dependency> </dependency>
</dependencies> </dependencies>
......
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import java.util.Objects; import java.util.Objects;
import java.util.concurrent.Semaphore;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import com.zhiwei.async.GroupSync; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.proxy.ProxyServerSupplier;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.ContentBean; import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution; import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback; import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent; import com.zhiwei.source_forward.util.MatchContent;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
public class ContentCrawler { public class ContentCrawler {
private static Logger logger = LogManager.getLogger(ContentCrawler.class); private static Logger logger = LogManager.getLogger(ContentCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
private static Semaphore semaphore = new Semaphore(5);
/** /**
* *
* @Description 链接传入 并 返回采集完信号 * @Description 链接传入 并 返回采集完信号
...@@ -51,9 +53,12 @@ public class ContentCrawler { ...@@ -51,9 +53,12 @@ public class ContentCrawler {
ZhiWeiTools.sleep(100); ZhiWeiTools.sleep(100);
if (url != null) { if (url != null) {
try { try {
semaphore.acquire();
search(counter, url, Attribution.of(url), callback); search(counter, url, Attribution.of(url), callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("搜索创建出错", e); logger.error("搜索创建出错", e);
} finally {
semaphore.release();
} }
} }
} }
...@@ -75,15 +80,15 @@ public class ContentCrawler { ...@@ -75,15 +80,15 @@ public class ContentCrawler {
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
counter.add(); counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> { httpBoot.asyncCall(request, ProxyServerSupplier.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
try { try {
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback); parseHtml(rs.bodyString(), attr, callback);
} else { } else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex); logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex);
} }
} catch (Exception e) { } catch (Exception e) {
logger.info("搜索结果访问失败: {}", ex); logger.info("搜索结果访问失败: ", ex);
} finally { } finally {
counter.done(); counter.done();
} }
......
...@@ -4,22 +4,23 @@ import java.util.HashMap; ...@@ -4,22 +4,23 @@ import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import java.util.concurrent.Semaphore;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.proxy.ProxyServerSupplier;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean; import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution; import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel; import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource; import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack; import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
...@@ -34,8 +35,9 @@ import okhttp3.Request; ...@@ -34,8 +35,9 @@ import okhttp3.Request;
public class MediaSelfSourceCrawler { public class MediaSelfSourceCrawler {
private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class); private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
private static Semaphore semaphore = new Semaphore(5);
/** /**
* *
* @Description 链接传入 并 返回采集完信号 * @Description 链接传入 并 返回采集完信号
...@@ -69,9 +71,12 @@ public class MediaSelfSourceCrawler { ...@@ -69,9 +71,12 @@ public class MediaSelfSourceCrawler {
counter.add(); counter.add();
if (url != null) { if (url != null) {
try { try {
semaphore.acquire();
search(counter, url, Attribution.of(url), callback); search(counter, url, Attribution.of(url), callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("搜索创建出错", e); logger.error("搜索创建出错", e);
} finally {
semaphore.release();
} }
} }
counter.done(); counter.done();
...@@ -91,7 +96,7 @@ public class MediaSelfSourceCrawler { ...@@ -91,7 +96,7 @@ public class MediaSelfSourceCrawler {
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) { private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", attr.get()); logger.info("当前处理 URL: {}", attr.get());
Map<String,Object> map = new HashMap<>(); Map<String,Object> map = new HashMap<>();
ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY; ProxyServerSupplier ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
if(url.contains("toutiao.com")) { if(url.contains("toutiao.com")) {
map.put("referer", url); map.put("referer", url);
} }
...@@ -104,7 +109,7 @@ public class MediaSelfSourceCrawler { ...@@ -104,7 +109,7 @@ public class MediaSelfSourceCrawler {
try { try {
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
try { try {
parseHtml(rs.body().string(), attr, callback, rs.request().url().uri().toString()); parseHtml(rs.bodyString(), attr, callback, rs.bootRequest().url().uri().toString());
} catch (Exception e) { } catch (Exception e) {
logger.error("解析出错", e); logger.error("解析出错", e);
} }
......
...@@ -4,6 +4,7 @@ import java.util.HashMap; ...@@ -4,6 +4,7 @@ import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import java.util.concurrent.Semaphore;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
...@@ -11,16 +12,16 @@ import org.jsoup.Jsoup; ...@@ -11,16 +12,16 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.proxy.ProxyServerSupplier;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.SourceForwardBean; import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution; import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel; import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource; import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.SourceData; import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack; import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
...@@ -29,9 +30,10 @@ public class SourceForwardCrawler { ...@@ -29,9 +30,10 @@ public class SourceForwardCrawler {
private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class); private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
private static List<String> sourceList = SourceData.getSourceList(); private static List<String> sourceList = SourceData.getSourceList();
private static Semaphore semaphore = new Semaphore(5);
public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) { public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) {
try { try {
GroupSync counter = new GroupSync(); GroupSync counter = new GroupSync();
...@@ -50,10 +52,13 @@ public class SourceForwardCrawler { ...@@ -50,10 +52,13 @@ public class SourceForwardCrawler {
ZhiWeiTools.sleep(100); ZhiWeiTools.sleep(100);
if (url != null) { if (url != null) {
try { try {
semaphore.acquire();
search(counter, url, Attribution.of(url), callback); search(counter, url, Attribution.of(url), callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("搜索创建出错", e); logger.error("搜索创建出错", e);
} } finally {
semaphore.release();
}
} }
counter.done(); counter.done();
} }
...@@ -77,10 +82,10 @@ public class SourceForwardCrawler { ...@@ -77,10 +82,10 @@ public class SourceForwardCrawler {
} }
Request request = RequestUtils.wrapGet(url, headers); Request request = RequestUtils.wrapGet(url, headers);
counter.add(); counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> { httpBoot.asyncCall(request, ProxyServerSupplier.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
try { try {
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback); parseHtml(rs.bodyString(), attr, callback);
} else { } else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex); logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
} }
......
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import static java.util.Objects.nonNull;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.Semaphore;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.JSONPath; import com.alibaba.fastjson.JSONPath;
import com.zhiwei.async.GroupSync; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.proxy.ProxyServerSupplier;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean; import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution; import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.UrlLiveDataCallback; import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.util.*; import okhttp3.Request;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.util.Objects.nonNull;
/** /**
* @author byte-zbs * @author byte-zbs
...@@ -32,8 +39,9 @@ import static java.util.Objects.nonNull; ...@@ -32,8 +39,9 @@ import static java.util.Objects.nonNull;
public class UrlLiveCrawler { public class UrlLiveCrawler {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class); private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(true).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
private static Semaphore semaphore = new Semaphore(5);
public GroupSync submitTask(UrlLiveDataCallback callback, String... urls) { public GroupSync submitTask(UrlLiveDataCallback callback, String... urls) {
GroupSync counter = new GroupSync(); GroupSync counter = new GroupSync();
start(counter, callback, urls); start(counter, callback, urls);
...@@ -43,17 +51,20 @@ public class UrlLiveCrawler { ...@@ -43,17 +51,20 @@ public class UrlLiveCrawler {
private void start(GroupSync counter, UrlLiveDataCallback callback, String... urls) { private void start(GroupSync counter, UrlLiveDataCallback callback, String... urls) {
if (nonNull(urls) && urls.length > 0) { if (nonNull(urls) && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
counter.add(); try {
ZhiWeiTools.sleep(100); counter.add();
if (nonNull(url)) { semaphore.acquire();
try { ZhiWeiTools.sleep(200);
if (nonNull(url)) {
// ZhiWeiTools.sleep(3000); // ZhiWeiTools.sleep(3000);
search(counter, url, Attribution.of(url, 1), callback); search(counter, url, Attribution.of(url, 1), callback);
} catch (Exception e) {
logger.error("搜索创建出错:", e);
} }
} catch (Exception e) {
logger.error("搜索创建出错:", e);
} finally {
counter.done();
semaphore.release();
} }
counter.done();
} }
} }
} }
...@@ -63,12 +74,12 @@ public class UrlLiveCrawler { ...@@ -63,12 +74,12 @@ public class UrlLiveCrawler {
url = dealUrl(url); url = dealUrl(url);
logger.info("当前处理 URL: {}", url); logger.info("当前处理 URL: {}", url);
Map<String, String> headers = new HashMap<>(); Map<String, String> headers = new HashMap<>();
ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY; ProxyServerSupplier ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
if (url.contains("toutiao.com")) { if (url.contains("toutiao.com")) {
ph = ProxyHolder.NAT_HEAVY_PROXY; ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
} else if (url.contains("zhihu.com")) { } else if (url.contains("zhihu.com")) {
url = treatZhihuUrl(url); url = treatZhihuUrl(url);
ph = ProxyHolder.NAT_HEAVY_PROXY; ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
} }
try { try {
Request request = RequestUtils.wrapGet(url, headers); Request request = RequestUtils.wrapGet(url, headers);
...@@ -80,7 +91,7 @@ public class UrlLiveCrawler { ...@@ -80,7 +91,7 @@ public class UrlLiveCrawler {
System.out.println(rs.code()); System.out.println(rs.code());
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
if (rs.isSuccessful()) { if (rs.isSuccessful()) {
parseHtml(rs.body().string(), attr, callback); parseHtml(rs.bodyString(), attr, callback);
} else if (rs.code() == 404) { } else if (rs.code() == 404) {
callBack(callback, attr, 1, String.valueOf(rs.code())); callBack(callback, attr, 1, String.valueOf(rs.code()));
} else { } else {
...@@ -91,7 +102,7 @@ public class UrlLiveCrawler { ...@@ -91,7 +102,7 @@ public class UrlLiveCrawler {
callBack(callback, attr, -1, "程序无法判断"); callBack(callback, attr, -1, "程序无法判断");
} }
} catch (Exception e) { } catch (Exception e) {
logger.error(" 数据是否删除 采集出错 {} ", e); logger.error(" 数据是否删除 采集出错 ", e);
} finally { } finally {
counter.done(); counter.done();
} }
...@@ -99,7 +110,7 @@ public class UrlLiveCrawler { ...@@ -99,7 +110,7 @@ public class UrlLiveCrawler {
return counter; return counter;
} }
} catch (Exception e2) { } catch (Exception e2) {
logger.error("数据出错 {}", e2); logger.error("数据出错 ", e2);
} }
return counter; return counter;
} }
......
...@@ -80,7 +80,7 @@ public class SourceForward { ...@@ -80,7 +80,7 @@ public class SourceForward {
public static void main(String[] args) { public static void main(String[] args) {
ProxyInit.initProxy(); ProxyInit.initProxy();
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("http://gu.qq.com/resources/shy/news/detail-v2/index.html?#/index?id=SN202006091653447945411f"); urlList.add("https://ypstatic.cnnb.com.cn/yppage-share/news/share/news_detail?newsId=627223d9e4b042b45e211c5a");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList); List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
for(SourceForwardBean sfb : da) { for(SourceForwardBean sfb : da) {
System.out.println("=============="+sfb.toString()); System.out.println("=============="+sfb.toString());
......
...@@ -98,7 +98,7 @@ public class URLLive { ...@@ -98,7 +98,7 @@ public class URLLive {
}; };
crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await(); crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
}catch (Exception e){ }catch (Exception e){
logger.error(" 数据采集运行有问题 {} ", e); logger.error(" 数据采集运行有问题 ", e);
} }
return list; return list;
} }
......
package com.zhiwei.source_forward.util; package com.zhiwei.source_forward.util;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.kohlschutter.boilerpipe.extractors.ArticleExtractor; import com.kohlschutter.boilerpipe.extractors.ArticleExtractor;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
/** /**
* @ClassName: MatchChannel * @ClassName: MatchChannel
* @Description: 匹配频道 * @Description: 匹配频道
* @author hero * @author hero
* @date 2018年6月30日 上午10:27:58 * @date 2018年6月30日 上午10:27:58
*/ */
public class MatchContent { public class MatchContent {
private static Logger logger = LoggerFactory.getLogger(MatchContent.class); private static Logger logger = LoggerFactory.getLogger(MatchContent.class);
/** /**
* @Title: matchContent * @Title: matchContent
* @author hero * @author hero
* @Description: 匹配文章正文 * @Description: 匹配文章正文
* @param @param url * @param @param url
* @param @param html * @param @param html
* @param @return 设定文件 * @param @return 设定文件
* @return String 返回类型 * @return String 返回类型
*/ */
public static String matchContent(String url,String html) { public static String matchContent(String url,String html) {
String content = null; String content = null;
try { try {
Document document = Jsoup.parse(html); Document document = Jsoup.parse(html);
if(url.contains("weixin.qq.com")) { if(url.contains("weixin.qq.com")) {
content = matchContentWeixin(html); content = matchContentWeixin(html);
}else if(url.contains("toutiao.com")) { }else if(url.contains("toutiao.com")) {
content = matchContentToutiao(html); content = matchContentToutiao(html);
} }
if(content == null || content.length() < 10) { if(content == null || content.length() < 10) {
content = mathchContent(html, document); content = mathchContent(html, document);
} }
return ZhiWeiTools.delHTMLTag(content); return ZhiWeiTools.delHTMLTag(content);
} catch (Exception e) { } catch (Exception e) {
logger.error("获取全文失败",e); logger.error("获取全文失败",e);
content = null; content = null;
} }
return content; return content;
} }
/** /**
* *
* @Description 头条正文获取 * @Description 头条正文获取
* @param html * @param html
* @return * @return
*/ */
private static String matchContentToutiao(String html) { private static String matchContentToutiao(String html) {
Pattern pa = Pattern.compile("content:(.*?)',"); Pattern pa = Pattern.compile("content:(.*?)',");
Matcher ma = pa.matcher(html); Matcher ma = pa.matcher(html);
while(ma.find()) { while(ma.find()) {
return ma.group(1); return ma.group(1);
} }
return null; return null;
} }
/** /**
* *
* @Description 微信文本获取 * @Description 微信文本获取
* @param html * @param html
* @return * @return
*/ */
private static String matchContentWeixin(String contentHtml) { private static String matchContentWeixin(String contentHtml) {
try { try {
Document document = Jsoup.parse(contentHtml); Document document = Jsoup.parse(contentHtml);
if (contentHtml.contains("js_article")) { if (contentHtml.contains("js_article")) {
return document.select("div#js_article").text(); return document.select("div#js_article").text();
} else if (contentHtml.contains("js_share_content")) { } else if (contentHtml.contains("js_share_content")) {
return document.select("div#js_share_content").text(); return document.select("div#js_share_content").text();
} }
if (contentHtml.contains("content_tpl")) { if (contentHtml.contains("content_tpl")) {
String text = document.select("script#content_tpl").html(); String text = document.select("script#content_tpl").html();
return Jsoup.parse(text).text(); return Jsoup.parse(text).text();
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("微信全文解析出错 {}", e); logger.error("微信全文解析出错 {}", e);
} }
return ""; return "";
} }
/** /**
* @Title: mathchContent * @Title: mathchContent
* @author hero * @author hero
* @Description: 匹配正文数据 * @Description: 匹配正文数据
* @param @param html * @param @param html
* @param @param document * @param @param document
* @param @return 设定文件 * @param @return 设定文件
* @return String 返回类型 * @return String 返回类型
*/ */
private static String mathchContent(String html,Document document){ private static String mathchContent(String html,Document document){
/** 正文抽取,目的是避免正文中存在相应匹配规则 **/ /** 正文抽取,目的是避免正文中存在相应匹配规则 **/
String content = null; String content = null;
try { try {
content = ArticleExtractor.getInstance().getText(html); content = ArticleExtractor.getInstance().getText(html);
} catch (Exception e) { } catch (Exception e) {
logger.error("正文抽取失败,获取全文文本:",e); logger.error("正文抽取失败,获取全文文本:",e);
content = document.text(); content = document.text();
} }
// String content = null; // String content = null;
// try { // try {
// News news = ContentExtractor.getNewsByHtml(html); // News news = ContentExtractor.getNewsByHtml(html);
// content = TreateData.filterSpecialCharacter(news.getContent()); // content = TreateData.filterSpecialCharacter(news.getContent());
// } catch (Exception e) { // } catch (Exception e) {
// logger.error("正文抽取失败,获取全文文本:",e); // logger.error("正文抽取失败,获取全文文本:",e);
// content = document.text(); // content = document.text();
// } // }
return content; return content;
} }
} }
package com.zhiwei.source_forward.util; package com.zhiwei.source_forward.util;
import com.zhiwei.crawler.core.proxy.ProxyFactory; import org.apache.dubbo.config.ApplicationConfig;
import com.zhiwei.proxy.config.SimpleConfig; import org.apache.dubbo.config.ConsumerConfig;
import com.zhiwei.source_forward.config.ProxyConfig; import org.apache.dubbo.config.RegistryConfig;
import com.zhiwei.http.proxy.CynomysFactory;
import com.zhiwei.network.cynomys.consumer.CynomysConsumer;
import com.zhiwei.network.cynomys.consumer.CynomysConsumerFactory;
/** /**
* 初始化代理 * 初始化代理
...@@ -16,10 +20,18 @@ public class ProxyInit { ...@@ -16,10 +20,18 @@ public class ProxyInit {
* void * void
*/ */
public static void initProxy() { public static void initProxy() {
String address = ProxyConfig.registry; ApplicationConfig applicationConfig = new ApplicationConfig();
String appName = "xumiaoxin"; applicationConfig.setName("actool");
long appId = ProxyConfig.proxyid; RegistryConfig registryConfig = new RegistryConfig();
ProxyFactory.init(SimpleConfig.builder().registry(address).appName(appName).appId(appId).group(ProxyConfig.group).build()); registryConfig.setAddress("zookeeper://192.168.0.30:2181");
ConsumerConfig consumerConfig = new ConsumerConfig();
// 设置分组
consumerConfig.setGroup("local");
String username = "18271694195";
String password = "Zhiwei289";
// 创建 consumer,applicationConfig 非必需参数
CynomysConsumer consumer = CynomysConsumerFactory.create(applicationConfig, registryConfig, consumerConfig, username, password);
CynomysFactory.init(consumer);
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment