Commit 2c9d4fa2 by chenweiyang

代理升级版本

parent 4860f41e
......@@ -3,13 +3,17 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.3.0-SNAPSHOT</version>
<version>0.3.1-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<http-boot.version>0.1.0.8-SNAPSHOT</http-boot.version>
<task-boot.version>1.1.2-SNAPSHOT</task-boot.version>
<boilerpipe.version>0.0.1-SNAPSHOT</boilerpipe.version>
<conomys-consumer.version>0.0.3-SNAPSHOT</conomys-consumer.version>
</properties>
<developers>
......@@ -30,12 +34,30 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.2.4-SNAPSHOT</version>
<version>0.4.5-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.6.6.8-SNAPSHOT</version>
<groupId>com.kohlschutter.boilerpipe</groupId>
<artifactId>boilerpipe-extractor</artifactId>
<version>${boilerpipe.version}</version>
</dependency>
<dependency>
<groupId>com.zhiwei.http</groupId>
<artifactId>http-boot</artifactId>
<version>${http-boot.version}</version>
</dependency>
<dependency>
<groupId>com.zhiwei.async</groupId>
<artifactId>task-boot</artifactId>
<version>${task-boot.version}</version>
</dependency>
<dependency>
<groupId>com.zhiwei.network</groupId>
<artifactId>cynomys-consumer</artifactId>
<version>${conomys-consumer.version}</version>
</dependency>
</dependencies>
......
package com.zhiwei.source_forward.crawler;
import java.util.Objects;
import java.util.concurrent.Semaphore;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.proxy.ProxyServerSupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
public class ContentCrawler {
private static Logger logger = LogManager.getLogger(ContentCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
private static Semaphore semaphore = new Semaphore(5);
/**
*
* @Description 链接传入 并 返回采集完信号
......@@ -51,9 +53,12 @@ public class ContentCrawler {
ZhiWeiTools.sleep(100);
if (url != null) {
try {
semaphore.acquire();
search(counter, url, Attribution.of(url), callback);
} catch (Exception e) {
logger.error("搜索创建出错", e);
} finally {
semaphore.release();
}
}
}
......@@ -75,15 +80,15 @@ public class ContentCrawler {
Request request = RequestUtils.wrapGet(url);
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
httpBoot.asyncCall(request, ProxyServerSupplier.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback);
parseHtml(rs.bodyString(), attr, callback);
} else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex);
}
} catch (Exception e) {
logger.info("搜索结果访问失败: {}", ex);
logger.info("搜索结果访问失败: ", ex);
} finally {
counter.done();
}
......
......@@ -4,22 +4,23 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.Semaphore;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.proxy.ProxyServerSupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
......@@ -34,8 +35,9 @@ import okhttp3.Request;
public class MediaSelfSourceCrawler {
private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
private static Semaphore semaphore = new Semaphore(5);
/**
*
* @Description 链接传入 并 返回采集完信号
......@@ -69,9 +71,12 @@ public class MediaSelfSourceCrawler {
counter.add();
if (url != null) {
try {
semaphore.acquire();
search(counter, url, Attribution.of(url), callback);
} catch (Exception e) {
logger.error("搜索创建出错", e);
} finally {
semaphore.release();
}
}
counter.done();
......@@ -91,7 +96,7 @@ public class MediaSelfSourceCrawler {
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", attr.get());
Map<String,Object> map = new HashMap<>();
ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY;
ProxyServerSupplier ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
if(url.contains("toutiao.com")) {
map.put("referer", url);
}
......@@ -104,7 +109,7 @@ public class MediaSelfSourceCrawler {
try {
if (Objects.isNull(ex)) {
try {
parseHtml(rs.body().string(), attr, callback, rs.request().url().uri().toString());
parseHtml(rs.bodyString(), attr, callback, rs.bootRequest().url().uri().toString());
} catch (Exception e) {
logger.error("解析出错", e);
}
......
......@@ -4,6 +4,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.Semaphore;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
......@@ -11,16 +12,16 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.proxy.ProxyServerSupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
......@@ -29,9 +30,10 @@ public class SourceForwardCrawler {
private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
private static List<String> sourceList = SourceData.getSourceList();
private static Semaphore semaphore = new Semaphore(5);
public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) {
try {
GroupSync counter = new GroupSync();
......@@ -50,10 +52,13 @@ public class SourceForwardCrawler {
ZhiWeiTools.sleep(100);
if (url != null) {
try {
semaphore.acquire();
search(counter, url, Attribution.of(url), callback);
} catch (Exception e) {
logger.error("搜索创建出错", e);
}
} finally {
semaphore.release();
}
}
counter.done();
}
......@@ -77,10 +82,10 @@ public class SourceForwardCrawler {
}
Request request = RequestUtils.wrapGet(url, headers);
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
httpBoot.asyncCall(request, ProxyServerSupplier.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback);
parseHtml(rs.bodyString(), attr, callback);
} else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
}
......
package com.zhiwei.source_forward.crawler;
import static java.util.Objects.nonNull;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.Semaphore;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.JSONPath;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.proxy.ProxyServerSupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.util.Objects.nonNull;
import okhttp3.Request;
/**
* @author byte-zbs
......@@ -32,8 +39,9 @@ import static java.util.Objects.nonNull;
public class UrlLiveCrawler {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(true).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
private static Semaphore semaphore = new Semaphore(5);
public GroupSync submitTask(UrlLiveDataCallback callback, String... urls) {
GroupSync counter = new GroupSync();
start(counter, callback, urls);
......@@ -43,17 +51,20 @@ public class UrlLiveCrawler {
private void start(GroupSync counter, UrlLiveDataCallback callback, String... urls) {
if (nonNull(urls) && urls.length > 0) {
for (String url : urls) {
counter.add();
ZhiWeiTools.sleep(100);
if (nonNull(url)) {
try {
try {
counter.add();
semaphore.acquire();
ZhiWeiTools.sleep(200);
if (nonNull(url)) {
// ZhiWeiTools.sleep(3000);
search(counter, url, Attribution.of(url, 1), callback);
} catch (Exception e) {
logger.error("搜索创建出错:", e);
}
} catch (Exception e) {
logger.error("搜索创建出错:", e);
} finally {
counter.done();
semaphore.release();
}
counter.done();
}
}
}
......@@ -63,12 +74,12 @@ public class UrlLiveCrawler {
url = dealUrl(url);
logger.info("当前处理 URL: {}", url);
Map<String, String> headers = new HashMap<>();
ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY;
ProxyServerSupplier ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
if (url.contains("toutiao.com")) {
ph = ProxyHolder.NAT_HEAVY_PROXY;
ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
} else if (url.contains("zhihu.com")) {
url = treatZhihuUrl(url);
ph = ProxyHolder.NAT_HEAVY_PROXY;
ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
}
try {
Request request = RequestUtils.wrapGet(url, headers);
......@@ -80,7 +91,7 @@ public class UrlLiveCrawler {
System.out.println(rs.code());
if (Objects.isNull(ex)) {
if (rs.isSuccessful()) {
parseHtml(rs.body().string(), attr, callback);
parseHtml(rs.bodyString(), attr, callback);
} else if (rs.code() == 404) {
callBack(callback, attr, 1, String.valueOf(rs.code()));
} else {
......@@ -91,7 +102,7 @@ public class UrlLiveCrawler {
callBack(callback, attr, -1, "程序无法判断");
}
} catch (Exception e) {
logger.error(" 数据是否删除 采集出错 {} ", e);
logger.error(" 数据是否删除 采集出错 ", e);
} finally {
counter.done();
}
......@@ -99,7 +110,7 @@ public class UrlLiveCrawler {
return counter;
}
} catch (Exception e2) {
logger.error("数据出错 {}", e2);
logger.error("数据出错 ", e2);
}
return counter;
}
......
package com.zhiwei.source_forward.crawler;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.async.GroupSync;
import com.zhiwei.async.TaskBoot;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import okhttp3.Request;
import okhttp3.Response;
/**
 * Checks whether article URLs are still reachable ("live") or have been deleted.
 *
 * <p>Each URL is normalized, fetched synchronously through {@code HttpBoot}, and the page title
 * (or a site-specific JSON field) is compared against lists of known "deleted / error page"
 * markers.
 */
public class UrlLiveCrawlerNew {

    private static final Logger logger = LogManager.getLogger(UrlLiveCrawlerNew.class);

    private static final HttpBoot httpBoot =
            new HttpBoot.Builder().retryTimes(2).throwException(false).build();

    /** Title substituted when a page yields no usable title or is too short to be an article. */
    private static final String DELETED_TITLE = "网页已删除";

    /** Titles that mark a page as deleted only when the whole title equals the entry. */
    private static final List<String> EXACT_DELETED_TITLES = Arrays.asList("系统出错","该内容已被发布者删除","网页已删除"
            ,"此帐号已自主注销,内容无法查看","页面提示","正在维护中"
            ,"此文章被第三方评估为不实信息","财经头条","知识100题","502BadGateway"
            ,"提示信息","跳转页","跳转中...","此帐号在冻结期,内容无法查看","东北新闻网"
            ,"百度一下,你就知道","帐号已迁移","手机百度","内容被删除","亚博国际|首页"
            ,"中国软件网","云广网","新浪首页","文章暂时找不到了","-法易网"
            ,"【一点资讯】www.yidianzixun.com","错误页面","网站暂停通知","【快资讯】你的专属资讯平台"
            ,"百度新闻——全球最大的中文新闻平台","以上文章由以下机构判定为不实信息","该公众号已迁移"
            ,"财经网-CAIJING.COM.CN","蚂蚁资讯","参数错误","时尚头条_YOKA时尚网","该文章已经被删除"
            ,"网易","链接已过期","找不到页面","今晚网","该文章已被删除", "该回答已被删除-知乎", "资源不存在");

    /** Fragments that mark a page as deleted when they appear anywhere in the title. */
    private static final List<String> DELETED_TITLE_FRAGMENTS = Arrays.asList("提示信息-","此内容因违规无法查看","微信公众号不存在"
            ,"此内容被投诉且经审核涉嫌侵权,无法查看","thepageyourequestedwasnotfound","未知错误"
            ,"Objectmoved","404","页面没有找到","页面未找到","301MovedPermanently","加载异常",
            "此帐号已被屏蔽, 内容无法查看","链接不存在");

    /**
     * Checks every URL in {@code urlList} and collects one result per URL that could be judged.
     *
     * <p>Each URL is handed to {@code TaskBoot.blockingAsync}; this method waits on the shared
     * {@code GroupSync} counter until all submitted tasks have finished.
     *
     * @param urlList URLs to check; {@code null} or empty yields an empty list
     * @return a new mutable list of results; URLs whose check failed internally are omitted
     */
    public List<UrlLiveBean> judgeIsDelete(List<String> urlList) {
        if (urlList == null || urlList.isEmpty()) {
            return new ArrayList<>();
        }
        GroupSync counter = new GroupSync();
        // Worker tasks append concurrently, so the shared list must be thread-safe.
        List<UrlLiveBean> results = java.util.Collections.synchronizedList(new ArrayList<>());
        urlList.forEach(url -> {
            // Register the task on the submitting thread BEFORE it is scheduled. Registering
            // inside the task would let the counter reach zero (and await() return) while
            // tasks are still queued.
            // NOTE(review): assumes GroupSync is an add/done/await latch - confirm.
            counter.add();
            try {
                TaskBoot.blockingAsync(() -> {
                    try {
                        UrlLiveBean ulb = dealUrlLive(url);
                        if (Objects.nonNull(ulb)) {
                            results.add(ulb);
                        }
                    } catch (Exception e) {
                        logger.error("链接是否删除新", e);
                    } finally {
                        counter.done();
                    }
                });
            } catch (Exception e2) {
                // Submission failed, so the task will never run: release its registration here.
                // No "{}" placeholder: the throwable must stay the trailing argument so the
                // stack trace is logged.
                logger.error("数据出错 ", e2);
                counter.done();
            }
        });
        try {
            counter.await();
        } catch (InterruptedException e) {
            // Preserve the interrupt for callers instead of printing and forgetting it.
            Thread.currentThread().interrupt();
            logger.error("Interrupted while waiting for URL liveness checks", e);
        }
        // Snapshot copy: the caller gets a plain list that no worker can still be mutating.
        return new ArrayList<>(results);
    }

    /**
     * Fetches one URL (at most two attempts) and judges whether it is deleted.
     *
     * @param url raw URL to check
     * @return the verdict, or {@code null} when the check failed unexpectedly
     */
    private UrlLiveBean dealUrlLive(String url) {
        try {
            url = dealUrl(url);
            logger.info("当前处理 URL: {}", url);
            Map<String, String> headers = new HashMap<>();
            if (url.contains("www.toutiao.com")) {
                headers.put("referer", url);
            } else if (url.contains("zhihu.com")) {
                url = treatZhihuUrl(url);
            }
            Request request = RequestUtils.wrapGet(url, headers);
            // NOTE(review): if both attempts throw, the code stays 404 and the URL is reported
            // as deleted - confirm this is the intended treatment of network failures.
            int code = 404;
            for (int attempt = 0; attempt < 2; attempt++) {
                try (Response response = httpBoot.syncCall(request)) {
                    if (response.isSuccessful()) {
                        return matchDel(response.body().string(), url);
                    }
                    code = response.code();
                } catch (Exception e) {
                    logger.error("解析", e);
                }
            }
            // 403 is passed to callBack as status -1; every other failure code as 1.
            return callBack(url, code == 403 ? -1 : 1, String.valueOf(code));
        } catch (Exception e) {
            logger.error("Failed to check URL liveness: " + url, e);
            return null;
        }
    }

    /**
     * Builds a result bean from a status flag.
     *
     * @param url   the checked URL
     * @param i     status flag; {@code 1} selects the boolean constructor with {@code true}
     * @param title title or HTTP status code text to attach
     * @return the result bean
     */
    private UrlLiveBean callBack(String url, int i, String title) {
        if (i == 1) {
            return new UrlLiveBean(url, true, title);
        }
        return new UrlLiveBean(url, i, title);
    }

    /**
     * Normalizes URLs of sites that need a canonical form before fetching
     * (Toutiao, WeChat, UC). Best effort: on any failure the URL as rewritten so far is returned.
     *
     * @param url raw URL
     * @return the normalized URL
     */
    private String dealUrl(String url) {
        try {
            if (url.contains("toutiao.com")) {
                if (!url.contains("www.toutiao.com")) {
                    url = url.replace("toutiao.com", "www.toutiao.com");
                }
                url = forceHttps(url);
                if (url.contains("group")) {
                    // e.g. https://www.toutiao.com/group/<id>/ -> path segment 4 is the id
                    url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
                }
            } else if (url.contains("mp.weixin.qq.com")) {
                url = forceHttps(url);
            } else if (url.contains("a.mp.uc.cn") && url.contains("wm_aid")) {
                url = "http://ff.dayu.com/contents/origin/" + url.split("wm_aid=")[1].split("!!wm_id")[0] + "?biz_id=1002&_fetch_author=1&_fetch_incrs=1";
            }
            return url;
        } catch (Exception e) {
            logger.warn("Failed to normalize URL, using it as-is: " + url, e);
            return url;
        }
    }

    /**
     * Upgrades a leading {@code http://} scheme to {@code https://}.
     *
     * <p>Only the scheme prefix is touched, so "http" occurring later in the URL (for example
     * inside a query parameter) is left intact.
     */
    private static String forceHttps(String url) {
        final String plainScheme = "http://";
        if (url.startsWith(plainScheme)) {
            return "https://" + url.substring(plainScheme.length());
        }
        return url;
    }

    /**
     * Extracts a title from the fetched page and judges whether the page counts as deleted.
     *
     * @param result response body (HTML, or JSON for the UC and Zhihu API endpoints)
     * @param url    the URL that was fetched; selects the site-specific extraction rule
     * @return the verdict with the extracted title, or {@code null} when the title is a single
     *         character or extraction failed
     */
    public UrlLiveBean matchDel(String result, String url) {
        try {
            Document doc = Jsoup.parse(result);
            String title = extractSiteSpecificTitle(doc, result, url);

            // Generic fallbacks, tried in order until one yields a title.
            if (isEmpty(title)) {
                title = compactText(doc, "div.adiv > p > span");
            }
            if (isEmpty(title)) {
                title = compactText(doc, "title");
            }
            if (isEmpty(title)) {
                title = compactText(doc, "h1");
            }
            // No title at all, or a body too short to be a real article, counts as deleted.
            if (isEmpty(title) || result.length() < 200) {
                title = DELETED_TITLE;
            }

            if (title.length() > 1) {
                return new UrlLiveBean(url, isDelete(title), title);
            }
            return null;
        } catch (Exception e) {
            logger.error("Failed to extract title for " + url, e);
            return null;
        }
    }

    /**
     * Applies the extraction rule of the site the URL belongs to.
     *
     * @return the extracted title; may be {@code null} or empty when no rule applies or matches
     */
    private String extractSiteSpecificTitle(Document doc, String result, String url) {
        if (url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com") || url.contains("weixin.sogou.com")) {
            String title = compactText(doc, "h2.rich_media_title");
            if (isEmpty(title)) {
                title = firstNonEmptyText(doc, "p.title", "h3.msg-title",
                        "div.global_error_msg.warn", "p.tips", "h2");
            }
            return title;
        }
        if (url.contains("kuaibao")) {
            return compactText(doc, "p.title");
        }
        if (url.contains("chinadaily.com.cn")) {
            return compactText(doc, "p.style1");
        }
        if (url.contains("baidu.com") || url.contains("hao123.com")) {
            return doc.select("p#contaniner").text();
        }
        if (url.contains("kanfanews.com")) {
            return doc.select("p#tit").text();
        }
        if (url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) {
            return DELETED_TITLE;
        }
        if (url.contains("a.mp.uc.cn")) {
            try {
                JSONObject json = JSONObject.parseObject(result);
                String title = json.getJSONObject("data").getString("title");
                return isEmpty(title) ? DELETED_TITLE : title;
            } catch (Exception e) {
                logger.error(" uc 数据 json 转换失败", e);
                return null;
            }
        }
        if (url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) {
            return DELETED_TITLE;
        }
        if (url.contains("zhihu.com")) {
            JSONObject resultJson = JSONObject.parseObject(result);
            String title = resultJson.getString("title");
            return title != null ? title : resultJson.getString("message");
        }
        return null;
    }

    /** Returns the text of the first selector that yields any, or the last text tried. */
    private static String firstNonEmptyText(Document doc, String... selectors) {
        String text = "";
        for (String selector : selectors) {
            text = doc.select(selector).text();
            if (!text.isEmpty()) {
                return text;
            }
        }
        return text;
    }

    /** Returns the text matched by {@code selector} with all space characters removed. */
    private static String compactText(Document doc, String selector) {
        return doc.select(selector).text().replace(" ", "");
    }

    private static boolean isEmpty(String value) {
        return value == null || value.isEmpty();
    }

    /**
     * Decides from the title whether the page is a deleted/error page.
     *
     * @param title extracted page title
     * @return {@code true} when the title contains a known fragment or equals a known marker
     */
    private boolean isDelete(String title) {
        return DELETED_TITLE_FRAGMENTS.stream().anyMatch(title::contains)
                || EXACT_DELETED_TITLES.contains(title);
    }

    /**
     * Rewrites a Zhihu page URL to the matching {@code api.zhihu.com} endpoint.
     *
     * @param url Zhihu answer, question or article URL
     * @return the API URL, or the input unchanged when no known path segment is present
     */
    private static String treatZhihuUrl(String url) {
        if (url.contains("/answer/")) {
            return "https://api.zhihu.com/answers/" + url.replaceAll(".*/answer/", "");
        }
        if (url.contains("/question/")) {
            return "https://api.zhihu.com/questions/" + url.replaceAll(".*/question/", "");
        }
        if (url.contains("/p/")) {
            return "https://api.zhihu.com/articles/" + url.replaceAll(".*/p/", "");
        }
        return url;
    }
}
//package com.zhiwei.source_forward.crawler;
//
//import java.util.ArrayList;
//import java.util.Arrays;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//import java.util.Objects;
//import java.util.concurrent.Semaphore;
//
//import org.apache.logging.log4j.LogManager;
//import org.apache.logging.log4j.Logger;
//import org.jsoup.Jsoup;
//import org.jsoup.nodes.Document;
//
//import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.http.boot.HttpBoot;
//import com.zhiwei.http.util.RequestUtils;
//import com.zhiwei.source_forward.bean.UrlLiveBean;
//import com.zhiwei.task.async.TaskBoot;
//import com.zhiwei.task.sync.GroupSync;
//
//import okhttp3.Request;
//import okhttp3.Response;
//
//public class UrlLiveCrawlerNew {
//
// private static final Logger logger = LogManager.getLogger(UrlLiveCrawlerNew.class);
// private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).throwException(false).build();
// private static Semaphore semaphore = new Semaphore(5);
//
//
// public List<UrlLiveBean> judgeIsDelete(List<String> urlList) {
// GroupSync counter = new GroupSync();
// List<UrlLiveBean> ulbList = new ArrayList<>();
// urlList.forEach(url -> {
// try {
// counter.add();
// TaskBoot.blockingAsync(() -> {
// try {
// counter.add();
// UrlLiveBean ulb = dealUrlLive(url);
// if(Objects.nonNull(ulb)) {
// ulbList.add(ulb);
// }
// } catch (Exception e) {
// logger.error("链接是否删除新", e);
// } finally {
// counter.done();
// }
// });
// } catch (Exception e2) {
// logger.error("数据出错 {}" ,e2);
// } finally {
// counter.done();
// }
// });
// try {
// counter.await();
// } catch (InterruptedException e) {
// e.printStackTrace();
// }
// return ulbList;
// }
//
// private UrlLiveBean dealUrlLive(String url) {
// try {
// url = dealUrl(url);
// logger.info("当前处理 URL: {}", url);
// Map<String,String> headers = new HashMap<>();
//// Map<String,String> headers = HeaderTool.getCommonHead();
// if(url.contains("www.toutiao.com")){
// headers.put("referer", url);
// }else if(url.contains("zhihu.com")) {
// url = treatZhihuUrl(url);
// }
// Request request = RequestUtils.wrapGet(url, headers);
// int code = 404;
// for(int i = 0; i < 2; i++) {
// try (Response response = httpBoot.syncCall(request)){
// if(response.isSuccessful()) {
// return matchDel(response.body().string(), url);
// }else {
// code = response.code();
// }
// } catch (Exception e) {
// logger.error("解析", e);
// }
// }
// if(code == 403){
// return callBack(url, -1, String.valueOf(code));
// }else {
// return callBack(url, 1, String.valueOf(code));
// }
// } catch (Exception e) {
// e.printStackTrace();
// return null;
// }
// }
//
// private UrlLiveBean callBack(String url,int i,String title) {
// if(i == 1) {
// return new UrlLiveBean(url, true, title);
// }else {
// return new UrlLiveBean(url, i, title);
// }
// }
//
// private String dealUrl(String url) {
// try {
// if(url.contains("toutiao.com")) {
// if(url.contains("www.toutiao.com")) {
//
// }else {
// url = url.replace("toutiao.com", "www.toutiao.com");
// }
// if(url.contains("https")) {
//
// }else {
// url = url.replace("http", "https");
// }
// if(url.contains("group")) {
// url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
// }
// }else if(url.contains("mp.weixin.qq.com")) {
// if(url.contains("https")) {
//
// }else {
// url = url.replace("http", "https");
// }
// }else if(url.contains("a.mp.uc.cn") && url.contains("wm_aid")) {
// url = "http://ff.dayu.com/contents/origin/"+url.split("wm_aid=")[1].split("!!wm_id")[0]+"?biz_id=1002&_fetch_author=1&_fetch_incrs=1";
// }
// return url;
// } catch (Exception e) {
// return url;
// }
// }
//
// /***
// * @Title: matchDel
// * @author hero
// * @Description: 验证链接是否有效
// * @param @param page
// * @param @return 设定文件
// * @return boolean 返回类型
// */
// public UrlLiveBean matchDel(String result,String url){
// try {
// Document doc = Jsoup.parse(result);
// String title = null;
// if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com") || url.contains("weixin.sogou.com")){
// title = doc.select("h2.rich_media_title").text().replaceAll(" ", "");
// if(Objects.isNull(title) || title.isEmpty()) {
// title = doc.select("p.title").text();
// }
// if(Objects.isNull(title) || title.isEmpty()) {
// title = doc.select("h3.msg-title").text();
// }
// if(Objects.isNull(title) || title.isEmpty()) {
// title = doc.select("div.global_error_msg.warn").text();
// }
// if(Objects.isNull(title) || title.isEmpty()) {
// title = doc.select("p.tips").text();
// }
// if(Objects.isNull(title) || title.isEmpty()) {
// title = doc.select("h2").text();
// }
// }else if(url.contains("kuaibao")){
// title = doc.select("p.title").text().replaceAll(" ", "");
// }else if(url.contains("chinadaily.com.cn")){
// title = doc.select("p.style1").text().replaceAll(" ", "");
// }else if(url.contains("baidu.com") || url.contains("hao123.com")) {
// title = doc.select("p#contaniner").text();
// }else if(url.contains("kanfanews.com")) {
// title = doc.select("p#tit").text();
// }else if(url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) {
// title = "网页已删除";
// }else if(url.contains("a.mp.uc.cn")) {
// try {
// JSONObject json = JSONObject.parseObject(result);
// title = json.getJSONObject("data").getString("title");
// if(Objects.isNull(title) || title.length() < 1) {
// title = "网页已删除";
// }
// } catch (Exception e) {
// logger.error(" uc 数据 json 转换失败", e);
// }
// }else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) {
// title = "网页已删除";
// }else if(url.contains("zhihu.com")) {
// JSONObject resultJson = JSONObject.parseObject(result);
//
// title = resultJson.getString("title")!=null?resultJson.getString("title"):resultJson.getString("message");
// }
//
// //若title 为拿到 用 此方法
// if(Objects.isNull(title) || title.length() < 1) {
// title = doc.select("div.adiv > p > span").text().replaceAll(" ", "");
// }
//
// //若title 为拿到 用 此方法
// if(Objects.isNull(title) || title.length() < 1) {
// title = doc.select("title").text().replaceAll(" ", "");
// }
//
// //若title 为拿到 用 此方法
// if(Objects.isNull(title) || title.length() < 1) {
// title = doc.select("h1").text().replaceAll(" ", "");
// }
//
// //若title 为拿到 用 此方法
// if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
// title = "网页已删除";
// }
//
// if(Objects.nonNull(title) && title.length() > 1){
// return new UrlLiveBean(url, isDelete(title),title);
// } else {
// return null;
// }
// } catch (Exception e) {
// return null;
// }
// }
//
// /**
// *
// * @Description 标题判断
// * @param title
// * @return
// */
// private boolean isDelete(String title) {
// List<String> eList = Arrays.asList("系统出错","该内容已被发布者删除","网页已删除"
// ,"此帐号已自主注销,内容无法查看","页面提示","正在维护中"
// ,"此文章被第三方评估为不实信息","财经头条","知识100题","502BadGateway"
// ,"提示信息","跳转页","跳转中...","此帐号在冻结期,内容无法查看","东北新闻网"
// ,"百度一下,你就知道","帐号已迁移","手机百度","内容被删除","亚博国际|首页"
// ,"中国软件网","云广网","新浪首页","文章暂时找不到了","-法易网"
// ,"【一点资讯】www.yidianzixun.com","错误页面","网站暂停通知","【快资讯】你的专属资讯平台"
// ,"百度新闻——全球最大的中文新闻平台","以上文章由以下机构判定为不实信息","该公众号已迁移"
// ,"财经网-CAIJING.COM.CN","蚂蚁资讯","参数错误","时尚头条_YOKA时尚网","该文章已经被删除"
// ,"网易","链接已过期","找不到页面","今晚网","该文章已被删除", "该回答已被删除-知乎", "资源不存在");
//
// List<String> cList = Arrays.asList("提示信息-","此内容因违规无法查看","微信公众号不存在"
// ,"此内容被投诉且经审核涉嫌侵权,无法查看","thepageyourequestedwasnotfound","未知错误"
// ,"Objectmoved","404","页面没有找到","页面未找到","301MovedPermanently","加载异常",
// "此帐号已被屏蔽, 内容无法查看","链接不存在");
//
// return cList.stream().anyMatch(title::contains) || eList.stream().anyMatch(title::equals);
// }
//
//
// /**
// * 处理知乎链接
// *
// * */
// private static String treatZhihuUrl(String url) {
// if(url.contains("/answer/")) {
// url = "https://api.zhihu.com/answers/" + url.replaceAll(".*/answer/", "");
// }else if(url.contains("/question/") && !url.contains("/answer/")) {
// url = "https://api.zhihu.com/questions/" + url.replaceAll(".*/question/", "");
// }else if(url.contains("/p/")) {
// url = "https://api.zhihu.com/articles/" + url.replaceAll(".*/p/", "");
// }
// return url;
// }
//
//
//}
......@@ -80,7 +80,7 @@ public class SourceForward {
public static void main(String[] args) {
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
urlList.add("http://gu.qq.com/resources/shy/news/detail-v2/index.html?#/index?id=SN202006091653447945411f");
urlList.add("https://ypstatic.cnnb.com.cn/yppage-share/news/share/news_detail?newsId=627223d9e4b042b45e211c5a");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
for(SourceForwardBean sfb : da) {
System.out.println("=============="+sfb.toString());
......
......@@ -98,7 +98,7 @@ public class URLLive {
};
crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
}catch (Exception e){
logger.error(" 数据采集运行有问题 {} ", e);
logger.error(" 数据采集运行有问题 ", e);
}
return list;
}
......
package com.zhiwei.source_forward.util;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.kohlschutter.boilerpipe.extractors.ArticleExtractor;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Extracts the main article text from a fetched HTML page.
 *
 * <p>Site-specific rules are tried first for WeChat and Toutiao URLs; when they yield nothing
 * usable, the boilerpipe {@code ArticleExtractor} is used, with the plain document text as a
 * last resort.
 *
 * @author hero
 */
public class MatchContent {

    private static final Logger logger = LoggerFactory.getLogger(MatchContent.class);

    /** Captures the inline {@code content: '...'} value of a Toutiao page; compiled once. */
    private static final Pattern TOUTIAO_CONTENT = Pattern.compile("content:(.*?)',");

    /**
     * Extracts the article body for the given page.
     *
     * @param url  page URL; selects the site-specific extraction rule
     * @param html raw HTML of the page
     * @return the extracted text after {@code ZhiWeiTools.delHTMLTag}, or {@code null} when
     *         extraction failed
     */
    public static String matchContent(String url, String html) {
        try {
            String content = null;
            if (url.contains("weixin.qq.com")) {
                content = matchContentWeixin(html);
            } else if (url.contains("toutiao.com")) {
                content = matchContentToutiao(html);
            }
            // Fewer than 10 characters is treated as "site-specific rule found nothing".
            if (content == null || content.length() < 10) {
                content = extractArticleText(html);
            }
            return ZhiWeiTools.delHTMLTag(content);
        } catch (Exception e) {
            logger.error("获取全文失败", e);
            return null;
        }
    }

    /**
     * Extracts the Toutiao article body from the inline script of the page.
     *
     * @param html raw HTML of the page
     * @return the first captured content value, or {@code null} when the page has none
     */
    private static String matchContentToutiao(String html) {
        Matcher matcher = TOUTIAO_CONTENT.matcher(html);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Extracts the WeChat article text from one of the known containers.
     *
     * @param contentHtml raw HTML of the page
     * @return the container text, or an empty string when no container is present or parsing
     *         fails
     */
    private static String matchContentWeixin(String contentHtml) {
        try {
            Document document = Jsoup.parse(contentHtml);
            if (contentHtml.contains("js_article")) {
                return document.select("div#js_article").text();
            }
            if (contentHtml.contains("js_share_content")) {
                return document.select("div#js_share_content").text();
            }
            if (contentHtml.contains("content_tpl")) {
                // The body lives in a <script> template that must be parsed again.
                String template = document.select("script#content_tpl").html();
                return Jsoup.parse(template).text();
            }
        } catch (Exception e) {
            // No "{}" placeholder: the throwable is the trailing argument and is logged with
            // its stack trace; a placeholder would only be printed literally.
            logger.error("微信全文解析出错 ", e);
        }
        return "";
    }

    /**
     * Generic extraction: boilerpipe first, whole-document text if boilerpipe fails.
     *
     * @param html raw HTML of the page
     * @return the extracted article text
     */
    private static String extractArticleText(String html) {
        try {
            return ArticleExtractor.getInstance().getText(html);
        } catch (Exception e) {
            logger.error("正文抽取失败,获取全文文本:", e);
            // Parse lazily: the DOM is only needed on this fallback path.
            return Jsoup.parse(html).text();
        }
    }
}
package com.zhiwei.source_forward.util;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.kohlschutter.boilerpipe.extractors.ArticleExtractor;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Extracts the main article text from a fetched HTML page.
 *
 * <p>Site-specific rules are tried first for WeChat and Toutiao URLs; when they yield nothing
 * usable, the boilerpipe {@code ArticleExtractor} is used, with the plain document text as a
 * last resort.
 *
 * @author hero
 */
public class MatchContent {

    private static final Logger logger = LoggerFactory.getLogger(MatchContent.class);

    /** Captures the inline {@code content: '...'} value of a Toutiao page; compiled once. */
    private static final Pattern TOUTIAO_CONTENT = Pattern.compile("content:(.*?)',");

    /**
     * Extracts the article body for the given page.
     *
     * @param url  page URL; selects the site-specific extraction rule
     * @param html raw HTML of the page
     * @return the extracted text after {@code ZhiWeiTools.delHTMLTag}, or {@code null} when
     *         extraction failed
     */
    public static String matchContent(String url, String html) {
        try {
            String content = null;
            if (url.contains("weixin.qq.com")) {
                content = matchContentWeixin(html);
            } else if (url.contains("toutiao.com")) {
                content = matchContentToutiao(html);
            }
            // Fewer than 10 characters is treated as "site-specific rule found nothing".
            if (content == null || content.length() < 10) {
                content = extractArticleText(html);
            }
            return ZhiWeiTools.delHTMLTag(content);
        } catch (Exception e) {
            logger.error("获取全文失败", e);
            return null;
        }
    }

    /**
     * Extracts the Toutiao article body from the inline script of the page.
     *
     * @param html raw HTML of the page
     * @return the first captured content value, or {@code null} when the page has none
     */
    private static String matchContentToutiao(String html) {
        Matcher matcher = TOUTIAO_CONTENT.matcher(html);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Extracts the WeChat article text from one of the known containers.
     *
     * @param contentHtml raw HTML of the page
     * @return the container text, or an empty string when no container is present or parsing
     *         fails
     */
    private static String matchContentWeixin(String contentHtml) {
        try {
            Document document = Jsoup.parse(contentHtml);
            if (contentHtml.contains("js_article")) {
                return document.select("div#js_article").text();
            }
            if (contentHtml.contains("js_share_content")) {
                return document.select("div#js_share_content").text();
            }
            if (contentHtml.contains("content_tpl")) {
                // The body lives in a <script> template that must be parsed again.
                String template = document.select("script#content_tpl").html();
                return Jsoup.parse(template).text();
            }
        } catch (Exception e) {
            // No "{}" placeholder: the throwable is the trailing argument and is logged with
            // its stack trace; a placeholder would only be printed literally.
            logger.error("微信全文解析出错 ", e);
        }
        return "";
    }

    /**
     * Generic extraction: boilerpipe first, whole-document text if boilerpipe fails.
     *
     * @param html raw HTML of the page
     * @return the extracted article text
     */
    private static String extractArticleText(String html) {
        try {
            return ArticleExtractor.getInstance().getText(html);
        } catch (Exception e) {
            logger.error("正文抽取失败,获取全文文本:", e);
            // Parse lazily: the DOM is only needed on this fallback path.
            return Jsoup.parse(html).text();
        }
    }
}
package com.zhiwei.source_forward.util;
import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.source_forward.config.ProxyConfig;
import org.apache.dubbo.config.ApplicationConfig;
import org.apache.dubbo.config.ConsumerConfig;
import org.apache.dubbo.config.RegistryConfig;
import com.zhiwei.http.proxy.CynomysFactory;
import com.zhiwei.network.cynomys.consumer.CynomysConsumer;
import com.zhiwei.network.cynomys.consumer.CynomysConsumerFactory;
/**
* 初始化代理
......@@ -16,10 +20,18 @@ public class ProxyInit {
* void
*/
public static void initProxy() {
String address = ProxyConfig.registry;
String appName = "xumiaoxin";
long appId = ProxyConfig.proxyid;
ProxyFactory.init(SimpleConfig.builder().registry(address).appName(appName).appId(appId).group(ProxyConfig.group).build());
ApplicationConfig applicationConfig = new ApplicationConfig();
applicationConfig.setName("actool");
RegistryConfig registryConfig = new RegistryConfig();
registryConfig.setAddress("zookeeper://192.168.0.30:2181");
ConsumerConfig consumerConfig = new ConsumerConfig();
// 设置分组
consumerConfig.setGroup("local");
String username = "18271694195";
String password = "Zhiwei289";
// 创建 consumer,applicationConfig 非必需参数
CynomysConsumer consumer = CynomysConsumerFactory.create(applicationConfig, registryConfig, consumerConfig, username, password);
CynomysFactory.init(consumer);
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment