Commit dd6b6b30 by zhiwei

自媒体匹配百家号添加相应规则

来源转发添加腾讯自选股及中青看点
parent 9fcfba2d
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.2.7-SNAPSHOT</version>
<version>0.2.8-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......
......@@ -91,12 +91,9 @@ public class MediaSelfSourceCrawler {
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", attr.get());
Map<String,Object> map = new HashMap<>();
ProxyHolder ph = null;
ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY;
if(url.contains("toutiao.com")) {
map.put("referer", url);
ph = ProxyHolder.SOUGOU_OUTER_PROXY;
}else {
ph = ProxyHolder.NAT_HEAVY_PROXY;
}
url = dealUrl(url);
if(Objects.nonNull(url)) {
......@@ -170,7 +167,6 @@ public class MediaSelfSourceCrawler {
String url = attr.get().toString();
try {
source = MatchSource.matchMediaSelfSource(url + eUrl,result);
logger.info(url+"=======" + source);
channel = MatchChannel.verifyChannel(url);
if(channel==null){
List<Node> nodeList = Jsoup.parse(result).head().childNodes();
......
package com.zhiwei.source_forward.util;
import java.util.List;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.source_forward.content.ContentExtractor;
import com.zhiwei.source_forward.content.News;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.source_forward.content.ContentExtractor;
import com.zhiwei.source_forward.content.News;
import java.util.List;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @ClassName: MatchSource
......@@ -324,7 +323,19 @@ public class MatchSource {
if(StringUtils.isNotBlank(source)){
source = source.replaceAll(".*来源:|)", "");
}
}else if(url.contains("gu.qq.com")){
source = document.select("span#news_source").text();
if(StringUtils.isNotBlank(source)){
return source;
}
}else if(url.contains("kandian.youth.cn")){
source = document.select("div.fl > a").text();
if(StringUtils.isNotBlank(source)){
return source;
}
}
if(Objects.nonNull(source) && source.length() != 0) {
return source;
}
......@@ -487,9 +498,12 @@ public class MatchSource {
}
}else if(url.contains("baijiahao.baidu.com") || url.contains("mbd.baidu.com")){
//百度百家
if(StringUtils.isNotBlank(document.select("span.userNameSpan").text())){
source = document.select("span.userNameSpan").text();
if(StringUtils.isBlank(source)){
}else if(StringUtils.isNotBlank(document.select("p.author-name:nth-child(1)").text())){
source = document.select("p.author-name:nth-child(1)").text();
}else if(StringUtils.isNotBlank(document.select("a.authorName").text())){
source = document.select("a.authorName").text();
}
if(StringUtils.isNotBlank(source)){
source = "百度百家-" + source;
......@@ -768,8 +782,7 @@ public class MatchSource {
if(StringUtils.isNotBlank(source)){
source = "推酷-" + source;
}
}
if(url.contains("36kr.com")){
}else if(url.contains("36kr.com")){
source = document.select("div.info-header-text > a.author-name").text();
if(StringUtils.isNotBlank(source)){
return "36氪-" + source;
......@@ -783,6 +796,7 @@ public class MatchSource {
return "36氪-" + source;
}
}
return source;
} catch (Exception e) {
e.printStackTrace();
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment