Commit 29320d28 by yangchen

提升版本 增加匹配来源

parent ab587b25
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.1.8-SNAPSHOT</version>
<version>0.1.9-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......
......@@ -66,7 +66,9 @@ public class SourceForwardCrawler {
if(url.contains("www.toutiao.com")){
headers.put("referer", url);
}
if(url.contains("china.prcfe.com")) {
url = "http://china.prcfe.com/e/extend/ShowSource/?id=" + url.split("/")[url.split("/").length-1].split("\\.")[0];
}
Request request = RequestUtils.wrapGet(url, headers);
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
......
......@@ -81,7 +81,7 @@ public class SourceForward {
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List<String> urlList = new ArrayList<>();
urlList.add("http://industry.caijing.com.cn/20190423/4582310.shtml");
urlList.add("http://stock.10jqka.com.cn/usstock/20190621/c612094454.shtml");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
for(SourceForwardBean sfb : da) {
System.out.println(sfb.toString());
......
......@@ -184,6 +184,9 @@ public class MatchSource {
}else if(url.contains("finance.ifeng.com")){
//单独处理凤凰网
source = document.select("p.p_time").select("span").select("span").select("a").text();
if(Objects.isNull(source) || source.length() < 1) {
source = html.split("source\":\"")[1].split("\"")[0];
}
}else if(url.contains("iphone.265g.com")){
//单独处理265G网
source = document.select("div.article_info").select("span").text().replaceAll(".*来源:|QQ群号.*", "");
......@@ -299,6 +302,14 @@ public class MatchSource {
}else if(url.contains("finance.youth.cn")){
//单独处理中国青年网
source = document.select("span#source_baidu").text().replaceAll("来源:|作者.*", "");
}else if(url.contains("china.com")) {
//中国金融商报
source = document.select("#chan_newsInfo > a").text();
}else if(url.contains("xw.qq.com")) {
//腾讯网客户端
source = document.select("div.tpl_header_author").text();
}else if(url.contains("china.prcfe.com")) {
source = html.split("\"")[1];
}
if(Objects.nonNull(source) && source.length() != 0) {
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment