Commit 2c9d4fa2 by chenweiyang

代理升级版本

parent 4860f41e
......@@ -3,13 +3,17 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.3.0-SNAPSHOT</version>
<version>0.3.1-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<http-boot.version>0.1.0.8-SNAPSHOT</http-boot.version>
<task-boot.version>1.1.2-SNAPSHOT</task-boot.version>
<boilerpipe.version>0.0.1-SNAPSHOT</boilerpipe.version>
<conomys-consumer.version>0.0.3-SNAPSHOT</conomys-consumer.version>
</properties>
<developers>
......@@ -30,12 +34,30 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.2.4-SNAPSHOT</version>
<version>0.4.5-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.6.6.8-SNAPSHOT</version>
<groupId>com.kohlschutter.boilerpipe</groupId>
<artifactId>boilerpipe-extractor</artifactId>
<version>${boilerpipe.version}</version>
</dependency>
<dependency>
<groupId>com.zhiwei.http</groupId>
<artifactId>http-boot</artifactId>
<version>${http-boot.version}</version>
</dependency>
<dependency>
<groupId>com.zhiwei.async</groupId>
<artifactId>task-boot</artifactId>
<version>${task-boot.version}</version>
</dependency>
<dependency>
<groupId>com.zhiwei.network</groupId>
<artifactId>cynomys-consumer</artifactId>
<version>${conomys-consumer.version}</version>
</dependency>
</dependencies>
......
package com.zhiwei.source_forward.crawler;
import java.util.Objects;
import java.util.concurrent.Semaphore;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.proxy.ProxyServerSupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
public class ContentCrawler {
private static Logger logger = LogManager.getLogger(ContentCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
private static Semaphore semaphore = new Semaphore(5);
/**
*
......@@ -51,9 +53,12 @@ public class ContentCrawler {
ZhiWeiTools.sleep(100);
if (url != null) {
try {
semaphore.acquire();
search(counter, url, Attribution.of(url), callback);
} catch (Exception e) {
logger.error("搜索创建出错", e);
} finally {
semaphore.release();
}
}
}
......@@ -75,15 +80,15 @@ public class ContentCrawler {
Request request = RequestUtils.wrapGet(url);
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
httpBoot.asyncCall(request, ProxyServerSupplier.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback);
parseHtml(rs.bodyString(), attr, callback);
} else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex);
}
} catch (Exception e) {
logger.info("搜索结果访问失败: {}", ex);
logger.info("搜索结果访问失败: ", ex);
} finally {
counter.done();
}
......
......@@ -4,22 +4,23 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.Semaphore;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.proxy.ProxyServerSupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
......@@ -34,7 +35,8 @@ import okhttp3.Request;
public class MediaSelfSourceCrawler {
private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
private static Semaphore semaphore = new Semaphore(5);
/**
*
......@@ -69,9 +71,12 @@ public class MediaSelfSourceCrawler {
counter.add();
if (url != null) {
try {
semaphore.acquire();
search(counter, url, Attribution.of(url), callback);
} catch (Exception e) {
logger.error("搜索创建出错", e);
} finally {
semaphore.release();
}
}
counter.done();
......@@ -91,7 +96,7 @@ public class MediaSelfSourceCrawler {
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", attr.get());
Map<String,Object> map = new HashMap<>();
ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY;
ProxyServerSupplier ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
if(url.contains("toutiao.com")) {
map.put("referer", url);
}
......@@ -104,7 +109,7 @@ public class MediaSelfSourceCrawler {
try {
if (Objects.isNull(ex)) {
try {
parseHtml(rs.body().string(), attr, callback, rs.request().url().uri().toString());
parseHtml(rs.bodyString(), attr, callback, rs.bootRequest().url().uri().toString());
} catch (Exception e) {
logger.error("解析出错", e);
}
......
......@@ -4,6 +4,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.Semaphore;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
......@@ -11,16 +12,16 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.proxy.ProxyServerSupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
......@@ -29,8 +30,9 @@ public class SourceForwardCrawler {
private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
private static List<String> sourceList = SourceData.getSourceList();
private static Semaphore semaphore = new Semaphore(5);
public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) {
try {
......@@ -50,9 +52,12 @@ public class SourceForwardCrawler {
ZhiWeiTools.sleep(100);
if (url != null) {
try {
semaphore.acquire();
search(counter, url, Attribution.of(url), callback);
} catch (Exception e) {
logger.error("搜索创建出错", e);
} finally {
semaphore.release();
}
}
counter.done();
......@@ -77,10 +82,10 @@ public class SourceForwardCrawler {
}
Request request = RequestUtils.wrapGet(url, headers);
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
httpBoot.asyncCall(request, ProxyServerSupplier.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback);
parseHtml(rs.bodyString(), attr, callback);
} else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
}
......
package com.zhiwei.source_forward.crawler;
import static java.util.Objects.nonNull;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.Semaphore;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.JSONPath;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.proxy.ProxyServerSupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.task.sync.GroupSync;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.util.Objects.nonNull;
import okhttp3.Request;
/**
* @author byte-zbs
......@@ -32,7 +39,8 @@ import static java.util.Objects.nonNull;
public class UrlLiveCrawler {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(true).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
private static Semaphore semaphore = new Semaphore(5);
public GroupSync submitTask(UrlLiveDataCallback callback, String... urls) {
GroupSync counter = new GroupSync();
......@@ -43,17 +51,20 @@ public class UrlLiveCrawler {
/**
 * Walks the given URLs and calls {@code search(...)} for every non-null entry,
 * registering each iteration on {@code counter} via {@code add()} / {@code done()}.
 * Does nothing when {@code urls} is null or empty.
 *
 * NOTE(review): this span appears to interleave two variants of the loop body
 * (sleep(100) vs sleep(200), an outer and an inner try) without diff markers;
 * confirm the actual statement order against the checked-in file.
 *
 * @param counter  group counter incremented before and decremented after each URL
 * @param callback callback passed through unchanged to {@code search(...)}
 * @param urls     URLs to process; null entries are skipped
 */
private void start(GroupSync counter, UrlLiveDataCallback callback, String... urls) {
if (nonNull(urls) && urls.length > 0) {
for (String url : urls) {
try {
counter.add();
ZhiWeiTools.sleep(100);
// NOTE(review): acquire() sits inside the try whose finally calls release();
// if acquire() throws, release() would still run — verify permit accounting.
semaphore.acquire();
ZhiWeiTools.sleep(200);
if (nonNull(url)) {
try {
// ZhiWeiTools.sleep(3000);
// Each URL is wrapped in an Attribution built with the constant second argument 1.
search(counter, url, Attribution.of(url, 1), callback);
}
} catch (Exception e) {
logger.error("搜索创建出错:", e);
}
}
} finally {
// Runs once per URL, whether or not search(...) threw.
// NOTE(review): the permit is handed back as soon as search(...) returns; if search only
// schedules asynchronous work, the semaphore would not bound in-flight requests — confirm.
counter.done();
semaphore.release();
}
}
}
}
......@@ -63,12 +74,12 @@ public class UrlLiveCrawler {
url = dealUrl(url);
logger.info("当前处理 URL: {}", url);
Map<String, String> headers = new HashMap<>();
ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY;
ProxyServerSupplier ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
if (url.contains("toutiao.com")) {
ph = ProxyHolder.NAT_HEAVY_PROXY;
ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
} else if (url.contains("zhihu.com")) {
url = treatZhihuUrl(url);
ph = ProxyHolder.NAT_HEAVY_PROXY;
ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
}
try {
Request request = RequestUtils.wrapGet(url, headers);
......@@ -80,7 +91,7 @@ public class UrlLiveCrawler {
System.out.println(rs.code());
if (Objects.isNull(ex)) {
if (rs.isSuccessful()) {
parseHtml(rs.body().string(), attr, callback);
parseHtml(rs.bodyString(), attr, callback);
} else if (rs.code() == 404) {
callBack(callback, attr, 1, String.valueOf(rs.code()));
} else {
......@@ -91,7 +102,7 @@ public class UrlLiveCrawler {
callBack(callback, attr, -1, "程序无法判断");
}
} catch (Exception e) {
logger.error(" 数据是否删除 采集出错 {} ", e);
logger.error(" 数据是否删除 采集出错 ", e);
} finally {
counter.done();
}
......@@ -99,7 +110,7 @@ public class UrlLiveCrawler {
return counter;
}
} catch (Exception e2) {
logger.error("数据出错 {}", e2);
logger.error("数据出错 ", e2);
}
return counter;
}
......
......@@ -80,7 +80,7 @@ public class SourceForward {
public static void main(String[] args) {
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
urlList.add("http://gu.qq.com/resources/shy/news/detail-v2/index.html?#/index?id=SN202006091653447945411f");
urlList.add("https://ypstatic.cnnb.com.cn/yppage-share/news/share/news_detail?newsId=627223d9e4b042b45e211c5a");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
for(SourceForwardBean sfb : da) {
System.out.println("=============="+sfb.toString());
......
......@@ -98,7 +98,7 @@ public class URLLive {
};
crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
}catch (Exception e){
logger.error(" 数据采集运行有问题 {} ", e);
logger.error(" 数据采集运行有问题 ", e);
}
return list;
}
......
package com.zhiwei.source_forward.util;
import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.source_forward.config.ProxyConfig;
import org.apache.dubbo.config.ApplicationConfig;
import org.apache.dubbo.config.ConsumerConfig;
import org.apache.dubbo.config.RegistryConfig;
import com.zhiwei.http.proxy.CynomysFactory;
import com.zhiwei.network.cynomys.consumer.CynomysConsumer;
import com.zhiwei.network.cynomys.consumer.CynomysConsumerFactory;
/**
* 初始化代理
......@@ -16,10 +20,18 @@ public class ProxyInit {
* void
*/
public static void initProxy() {
// ProxyConfig-driven initialisation: registry address, app id and group are read from
// ProxyConfig and passed to ProxyFactory through a SimpleConfig builder.
// NOTE(review): these four lines use the com.zhiwei.crawler.core.proxy API and look like the
// pre-upgrade path shown next to its replacement — confirm whether both are meant to run.
String address = ProxyConfig.registry;
String appName = "xumiaoxin";
long appId = ProxyConfig.proxyid;
ProxyFactory.init(SimpleConfig.builder().registry(address).appName(appName).appId(appId).group(ProxyConfig.group).build());
// Dubbo-config-based initialisation: build the application, registry and consumer configs
// that are handed to CynomysConsumerFactory below.
ApplicationConfig applicationConfig = new ApplicationConfig();
applicationConfig.setName("actool");
RegistryConfig registryConfig = new RegistryConfig();
// NOTE(review): the registry address is a hard-coded literal here, whereas the lines above
// read it from ProxyConfig.registry — consider keeping it in configuration.
registryConfig.setAddress("zookeeper://192.168.0.30:2181");
ConsumerConfig consumerConfig = new ConsumerConfig();
// Set the consumer group.
consumerConfig.setGroup("local");
// NOTE(review): the username and password are embedded in source as plain-text literals;
// they should presumably come from external configuration or a secret store.
String username = "18271694195";
String password = "Zhiwei289";
// Create the consumer; applicationConfig is not a required parameter.
CynomysConsumer consumer = CynomysConsumerFactory.create(applicationConfig, registryConfig, consumerConfig, username, password);
// Initialise CynomysFactory with the consumer created above.
CynomysFactory.init(consumer);
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment