Commit 4a661c59 by chenweiyang

升级爬虫核心包 升级版本 0.2.6

parent a94682af
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.2.5-SNAPSHOT</version>
<version>0.2.6-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......@@ -29,7 +29,7 @@
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.6.3.1-SNAPSHOT</version>
<version>0.6.6.3-SNAPSHOT</version>
</dependency>
</dependencies>
......
......@@ -7,8 +7,8 @@ import org.apache.logging.log4j.Logger;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback;
......
......@@ -12,8 +12,8 @@ import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
......
......@@ -13,8 +13,8 @@ import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
......
......@@ -2,9 +2,6 @@ package com.zhiwei.source_forward.crawler;
import static java.util.Objects.nonNull;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.Proxy.Type;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
......@@ -21,8 +18,8 @@ import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.UrlLiveDataCallback;
......@@ -75,7 +72,7 @@ public class UrlLiveCrawler {
ProxyHolder ph = null;
if(url.contains("toutiao.com")){
headers.put("referer", url);
// headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; tt_webid=6833273737980659213; tt_webid=6833273737980659213; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_scid=KdPOCLtoSVDQTnptuiejH4SkyYa7RodIcBHFpAGwf17X9rUWJJadFYALAeJ5C8xI71e5; __ac_nonce=05ee037380054152ddc38; __ac_signature=6C1-YAAgEBB40vzLiGE95-gsf3AALbYjxEHG0FQERCcxB-9tebz.fovM7gew-AHObLDUegpmF7k8G57XzXokCbi72klNkdvS.ukzrfuuFk3UL836QudGNHE6IJQ47kFRkiT; __tasessionId=nz5ags6bk1591752505915");
// headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; tt_webid=6833273737980659213; tt_webid=6833273737980659213; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_scid=a56VD6ALatPbD63MlXw5skpZx9olxW6X.mRiDJBvVfZyQF2lfw8-lNeLPqqPPuCH4c68; __ac_nonce=05ee1cac300c6be6af0fe; __ac_signature=YvewuAAgEBDyCDITpF4SsmL2saAADwawXLDdrzlqO4hucxtXaZyI1l3ZReIsXb1OnF3koe7MdMwhnGPBA-mn5X5ERtMmQrb7RY5NqiBu.g3p0.oY6nNsvIT3NNbIsViZXz3; __tasessionId=48abvzgub1591855812394");
headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
headers.put("accept-encoding", "gzip, deflate, br");
headers.put("accept-language", "zh-CN,zh;q=0.9");
......@@ -86,7 +83,7 @@ public class UrlLiveCrawler {
headers.put("sec-fetch-user", "?1");
headers.put("upgrade-insecure-requests", "1");
headers.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36");
ph = ProxyHolder.SOUGOU_OUTER_PROXY;
ph = ProxyHolder.NAT_HEAVY_PROXY;
}else if(url.contains("zhihu.com")) {
url = treatZhihuUrl(url);
ph = ProxyHolder.NAT_HEAVY_PROXY;
......@@ -96,7 +93,7 @@ public class UrlLiveCrawler {
if(Objects.nonNull(request)) {
counter.add();
// , new Proxy(Type.HTTP, new InetSocketAddress("119.3.38.9", 31128))
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> {
httpBoot.asyncCall(request, ph).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
if(rs.isSuccessful()) {
......
......@@ -16,7 +16,7 @@ import com.alibaba.fastjson.JSONObject;
import com.zhiwei.async.GroupSync;
import com.zhiwei.async.TaskBoot;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import okhttp3.Request;
......
......@@ -72,8 +72,8 @@ public class URLLive {
public static void main(String[] args) {
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
// urlList.add("http://www.toutiao.com/a1665677841741827");
urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh");
urlList.add("http://www.toutiao.com/a1665677841741827");
// urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh");
List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
for(UrlLiveBean b : u) {
......
package com.zhiwei.source_forward.util;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.source_forward.config.ProxyConfig;
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment