Commit 7f7e4a1c by cwy

头条数据更新使用白名单

parent ea0833d3
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.2.4-SNAPSHOT</version>
<version>0.2.5-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......
......@@ -89,15 +89,19 @@ public class MediaSelfSourceCrawler {
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", attr.get());
Map<String,Object> map = new HashMap<>();
ProxyHolder ph = null;
if(url.contains("toutiao.com")) {
map.put("referer", url);
ph = ProxyHolder.SOUGOU_OUTER_PROXY;
}else {
ph = ProxyHolder.NAT_HEAVY_PROXY;
}
url = dealUrl(url);
if(Objects.nonNull(url)) {
Request request = RequestUtils.wrapGet(url, map);
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> {
httpBoot.asyncCall(request, ph).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
try {
......
......@@ -52,7 +52,6 @@ public class UrlLiveCrawler {
counter.add();
if (nonNull(url)) {
try {
// ZhiWeiTools.sleep(3000);
search(counter, url, Attribution.of(url, 1), callback);
} catch (Exception e) {
logger.error("搜索创建出错:", e);
......@@ -69,9 +68,10 @@ public class UrlLiveCrawler {
url = dealUrl(url);
logger.info("当前处理 URL: {}", url);
Map<String,String> headers = new HashMap<>();
ProxyHolder ph = null;
if(url.contains("toutiao.com")){
headers.put("referer", url);
headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; tt_webid=6763913092738418180; tt_webid=6763913092738418180; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; s_v_web_id=verify_k9wn4wvx_J8Tm9B3v_4KQj_4pYw_B3C5_Bz00jljwk2Ik; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; __utmc=24953151; __utma=24953151.1570444666.1576744435.1589341084.1589355043.4; CNZZDATA1259612802=2091325281-1587691681-%7C1589354688; __ac_nonce=05ec2023000312916dbf0; __ac_signature=YYVItAAgEBDxesof46KjamGESaAAD9LCPu9LY3i693yRwgjuLokObvXcXAHluuEslefdgz60kyPRc1WnihwB4acMsJgn1wYE8IuqB3toZpnIZRexNBULILeZxouOJAtnxO6; __tasessionId=402dor9vo1589772849201; tt_scid=yP.oipZ1w-SChWahT4a7rhJ2gsjG-rJO.4UkyTROzer4MBRJ4bAv7POpDKAcZwzc497f");
// headers.put("cookie", "__ac_nonce=05ed0c7bb00bc34aa36be; __ac_signature=0fFbMAAgEBBBDtmbXG3W-tHxWiAAI8q; ttcid=cfbee5ddf00b4013b5236b534c8cf36c19; tt_webid=6832180195202909704; s_v_web_id=verify_kary2om5_954yc9QS_twaQ_42XG_9Sei_dsAVEudiEodo; __tasessionId=4bmcvzruo1590740924839; tt_webid=6832180195202909704; SLARDAR_WEB_ID=fb4d8abf-bdd7-4e9e-ba38-8c00f0c13846; csrftoken=6430b380cc664479dfa0b0e5061b2db9; tt_scid=kRdSxPldqsXGPvYrxh3K4HZ5ayX0isXRzk08ZTjlIGmNW3HaSLrhBfHJ.CRjNom.b0fe");
headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
headers.put("accept-encoding", "gzip, deflate, br");
headers.put("accept-language", "zh-CN,zh;q=0.9");
......@@ -81,15 +81,17 @@ public class UrlLiveCrawler {
headers.put("sec-fetch-site", "same-origin");
headers.put("sec-fetch-user", "?1");
headers.put("upgrade-insecure-requests", "1");
headers.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36");
headers.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36");
ph = ProxyHolder.SOUGOU_OUTER_PROXY;
}else if(url.contains("zhihu.com")) {
url = treatZhihuUrl(url);
ph = ProxyHolder.NAT_HEAVY_PROXY;
}
try {
Request request = RequestUtils.wrapGet(url, headers);
if(Objects.nonNull(request)) {
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> {
httpBoot.asyncCall(request, ph).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
if(rs.isSuccessful()) {
......
......@@ -32,7 +32,7 @@ public class MediaSelfSource {
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
urlList.add("https://new.qq.com/rain/a/20200511A0LUU600");
urlList.add("https://new.qq.com/omn/20200507/20200507A0Q9JV00.html");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) {
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment