Commit b8ed38f4 by chenweiyang

链接是否删除部分修改

parents bd0353ac 7003572f
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId> <artifactId>source-forward</artifactId>
<version>0.2.7-SNAPSHOT</version> <version>0.2.8-SNAPSHOT</version>
<name>source-forward</name> <name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description> <description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......
...@@ -2,6 +2,7 @@ package com.zhiwei.source_forward.crawler; ...@@ -2,6 +2,7 @@ package com.zhiwei.source_forward.crawler;
import java.util.Objects; import java.util.Objects;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
...@@ -47,6 +48,7 @@ public class ContentCrawler { ...@@ -47,6 +48,7 @@ public class ContentCrawler {
ContentDataCallback callback, String... urls) { ContentDataCallback callback, String... urls) {
if (urls != null && urls.length > 0) { if (urls != null && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
ZhiWeiTools.sleep(100);
if (url != null) { if (url != null) {
try { try {
search(counter, url, Attribution.of(url), callback); search(counter, url, Attribution.of(url), callback);
......
...@@ -5,6 +5,7 @@ import java.util.List; ...@@ -5,6 +5,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
...@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler { ...@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler {
private void start(GroupSync counter,MediaSelfSourceDataCallBack callback, String... urls) { private void start(GroupSync counter,MediaSelfSourceDataCallBack callback, String... urls) {
if (urls != null && urls.length > 0) { if (urls != null && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
ZhiWeiTools.sleep(100);
counter.add(); counter.add();
if (url != null) { if (url != null) {
try { try {
...@@ -89,12 +91,9 @@ public class MediaSelfSourceCrawler { ...@@ -89,12 +91,9 @@ public class MediaSelfSourceCrawler {
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) { private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", attr.get()); logger.info("当前处理 URL: {}", attr.get());
Map<String,Object> map = new HashMap<>(); Map<String,Object> map = new HashMap<>();
ProxyHolder ph = null; ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY;
if(url.contains("toutiao.com")) { if(url.contains("toutiao.com")) {
map.put("referer", url); map.put("referer", url);
ph = ProxyHolder.SOUGOU_OUTER_PROXY;
}else {
ph = ProxyHolder.NAT_HEAVY_PROXY;
} }
url = dealUrl(url); url = dealUrl(url);
if(Objects.nonNull(url)) { if(Objects.nonNull(url)) {
...@@ -168,7 +167,6 @@ public class MediaSelfSourceCrawler { ...@@ -168,7 +167,6 @@ public class MediaSelfSourceCrawler {
String url = attr.get().toString(); String url = attr.get().toString();
try { try {
source = MatchSource.matchMediaSelfSource(url + eUrl,result); source = MatchSource.matchMediaSelfSource(url + eUrl,result);
logger.info(url+"=======" + source);
channel = MatchChannel.verifyChannel(url); channel = MatchChannel.verifyChannel(url);
if(channel==null){ if(channel==null){
List<Node> nodeList = Jsoup.parse(result).head().childNodes(); List<Node> nodeList = Jsoup.parse(result).head().childNodes();
......
...@@ -21,6 +21,7 @@ import com.zhiwei.source_forward.util.MatchChannel; ...@@ -21,6 +21,7 @@ import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource; import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.SourceData; import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack; import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
...@@ -46,6 +47,7 @@ public class SourceForwardCrawler { ...@@ -46,6 +47,7 @@ public class SourceForwardCrawler {
if (urls != null && urls.length > 0) { if (urls != null && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
counter.add(); counter.add();
ZhiWeiTools.sleep(100);
if (url != null) { if (url != null) {
try { try {
search(counter, url, Attribution.of(url), callback); search(counter, url, Attribution.of(url), callback);
...@@ -68,6 +70,11 @@ public class SourceForwardCrawler { ...@@ -68,6 +70,11 @@ public class SourceForwardCrawler {
if(url.contains("china.prcfe.com")) { if(url.contains("china.prcfe.com")) {
url = "http://china.prcfe.com/e/extend/ShowSource/?id=" + url.split("/")[url.split("/").length-1].split("\\.")[0]; url = "http://china.prcfe.com/e/extend/ShowSource/?id=" + url.split("/")[url.split("/").length-1].split("\\.")[0];
} }
if(url.contains("gu.qq.com")) {
String id = url.split("\\?id=")[1];
url = "https://snp.tenpay.com/cgi-bin/snpgw_unified_newsinfo.fcgi?&filter=0&zappid=zxg_h5&sign=b2aceeb8a8ef093862608d806c1d6ab8&nonce=8464&reserve=1572995&&channel=zxg&user_openid=undefined&user_skey=undefined&&news_id=" + id;
headers.put("referer", "https://gu.qq.com/resources/shy/news/detail-v2/index.html");
}
Request request = RequestUtils.wrapGet(url, headers); Request request = RequestUtils.wrapGet(url, headers);
counter.add(); counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> { httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
...@@ -92,8 +99,8 @@ public class SourceForwardCrawler { ...@@ -92,8 +99,8 @@ public class SourceForwardCrawler {
String channel = "新闻"; String channel = "新闻";
String isforward = "未知"; String isforward = "未知";
try { try {
Document document = Jsoup.parse(body);
if(attr.get().toString().contains("mp.weixin.qq.com")){ if(attr.get().toString().contains("mp.weixin.qq.com")){
Document document = Jsoup.parse(body);
isforward = document.select("div#meta_content").select("span#copyright_logo").text(); isforward = document.select("div#meta_content").select("span#copyright_logo").text();
if(isforward.contains("原创")){ if(isforward.contains("原创")){
isforward = "原创"; isforward = "原创";
...@@ -104,15 +111,21 @@ public class SourceForwardCrawler { ...@@ -104,15 +111,21 @@ public class SourceForwardCrawler {
if(body.contains("isOriginal") && body.contains("isOriginal: true")){ if(body.contains("isOriginal") && body.contains("isOriginal: true")){
isforward = "原创"; isforward = "原创";
} }
}else if(attr.get().toString().contains("snp.tenpay.com") || attr.get().toString().contains("gu.qq.com")){
if(body.contains("source")){
source = body.split("\"source\":\"")[1].split("\"")[0];
}
}else{ }else{
Document document = Jsoup.parse(body);
source = MatchSource.matchSource(attr.get().toString(),document.toString(), sourceList);
channel = MatchChannel.verifyChannel(attr.get().toString()); channel = MatchChannel.verifyChannel(attr.get().toString());
if(channel==null){ if(channel==null){
List<Node> nodeList = document.head().childNodes(); List<Node> nodeList = document.head().childNodes();
channel = MatchChannel.matchChannel(nodeList); channel = MatchChannel.matchChannel(nodeList);
} }
source = MatchSource.matchSource(attr.get().toString(),document.toString(), sourceList);
} }
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace();
source = null; source = null;
channel = "新闻"; channel = "新闻";
} }
......
...@@ -50,8 +50,10 @@ public class UrlLiveCrawler { ...@@ -50,8 +50,10 @@ public class UrlLiveCrawler {
if (nonNull(urls) && urls.length > 0) { if (nonNull(urls) && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
counter.add(); counter.add();
ZhiWeiTools.sleep(100);
if (nonNull(url)) { if (nonNull(url)) {
try { try {
// ZhiWeiTools.sleep(3000);
search(counter, url, Attribution.of(url, 1), callback); search(counter, url, Attribution.of(url, 1), callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("搜索创建出错:", e); logger.error("搜索创建出错:", e);
...@@ -213,6 +215,9 @@ public class UrlLiveCrawler { ...@@ -213,6 +215,9 @@ public class UrlLiveCrawler {
title = doc.select("div.global_error_msg.warn").text(); title = doc.select("div.global_error_msg.warn").text();
} }
if(Objects.isNull(title) || title.isEmpty()) { if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("div.warn").text();
}
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("p.tips").text(); title = doc.select("p.tips").text();
} }
if(Objects.isNull(title) || title.isEmpty()) { if(Objects.isNull(title) || title.isEmpty()) {
......
...@@ -32,7 +32,7 @@ public class MediaSelfSource { ...@@ -32,7 +32,7 @@ public class MediaSelfSource {
ProxyInit.initProxy(); ProxyInit.initProxy();
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("https://new.qq.com/omn/20200507/20200507A0Q9JV00.html"); urlList.add("https://k.sina.com.cn/article_1060093724_3f2fbf1c00100vsqd.html?from=mood");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList); List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) { for(MediaSelfSourceBean b : u) {
......
...@@ -80,10 +80,10 @@ public class SourceForward { ...@@ -80,10 +80,10 @@ public class SourceForward {
public static void main(String[] args) { public static void main(String[] args) {
ProxyInit.initProxy(); ProxyInit.initProxy();
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("http://www.wangjiaozixun.com/html/zx20/2020/0730/1396388.html"); urlList.add("http://gu.qq.com/resources/shy/news/detail-v2/index.html?#/index?id=SN202006091653447945411f");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList); List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
for(SourceForwardBean sfb : da) { for(SourceForwardBean sfb : da) {
System.out.println(sfb.toString()); System.out.println("=============="+sfb.toString());
} }
} }
...@@ -94,7 +94,6 @@ public class SourceForward { ...@@ -94,7 +94,6 @@ public class SourceForward {
try{ try{
SourceForwardCrawler crawler = new SourceForwardCrawler(); SourceForwardCrawler crawler = new SourceForwardCrawler();
SourceForwardDataCallBack callback = new SourceForwardDataCallBack() { SourceForwardDataCallBack callback = new SourceForwardDataCallBack() {
@Override @Override
public void onData(SourceForwardBean data, Attribution attr) { public void onData(SourceForwardBean data, Attribution attr) {
list.add(data); list.add(data);
......
...@@ -72,7 +72,7 @@ public class URLLive { ...@@ -72,7 +72,7 @@ public class URLLive {
public static void main(String[] args) { public static void main(String[] args) {
ProxyInit.initProxy(); ProxyInit.initProxy();
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("http://www.toutiao.com/item/1668646006370318/"); urlList.add("http://mp.weixin.qq.com/s?__biz=Mzg3MDMzNTc5Mg==&mid=2247485220&idx=1&sn=9118543ca120489cccbdc102be58f881");
// urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh"); // urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh");
List<UrlLiveBean> u = URLLive.verificationURLLive(urlList); List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment