Commit dd6b6b30 by zhiwei

自媒体匹配百家号添加相应规则

来源转发添加腾讯自选股及中青看点
parent 9fcfba2d
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId> <artifactId>source-forward</artifactId>
<version>0.2.7-SNAPSHOT</version> <version>0.2.8-SNAPSHOT</version>
<name>source-forward</name> <name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description> <description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......
...@@ -91,12 +91,9 @@ public class MediaSelfSourceCrawler { ...@@ -91,12 +91,9 @@ public class MediaSelfSourceCrawler {
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) { private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", attr.get()); logger.info("当前处理 URL: {}", attr.get());
Map<String,Object> map = new HashMap<>(); Map<String,Object> map = new HashMap<>();
ProxyHolder ph = null; ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY;
if(url.contains("toutiao.com")) { if(url.contains("toutiao.com")) {
map.put("referer", url); map.put("referer", url);
ph = ProxyHolder.SOUGOU_OUTER_PROXY;
}else {
ph = ProxyHolder.NAT_HEAVY_PROXY;
} }
url = dealUrl(url); url = dealUrl(url);
if(Objects.nonNull(url)) { if(Objects.nonNull(url)) {
...@@ -170,7 +167,6 @@ public class MediaSelfSourceCrawler { ...@@ -170,7 +167,6 @@ public class MediaSelfSourceCrawler {
String url = attr.get().toString(); String url = attr.get().toString();
try { try {
source = MatchSource.matchMediaSelfSource(url + eUrl,result); source = MatchSource.matchMediaSelfSource(url + eUrl,result);
logger.info(url+"=======" + source);
channel = MatchChannel.verifyChannel(url); channel = MatchChannel.verifyChannel(url);
if(channel==null){ if(channel==null){
List<Node> nodeList = Jsoup.parse(result).head().childNodes(); List<Node> nodeList = Jsoup.parse(result).head().childNodes();
......
package com.zhiwei.source_forward.util; package com.zhiwei.source_forward.util;
import java.util.List; import com.alibaba.fastjson.JSONObject;
import java.util.Objects; import com.zhiwei.source_forward.content.ContentExtractor;
import java.util.regex.Matcher; import com.zhiwei.source_forward.content.News;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject; import java.util.List;
import com.zhiwei.source_forward.content.ContentExtractor; import java.util.Objects;
import com.zhiwei.source_forward.content.News; import java.util.regex.Matcher;
import java.util.regex.Pattern;
/** /**
* @ClassName: MatchSource * @ClassName: MatchSource
...@@ -324,7 +323,19 @@ public class MatchSource { ...@@ -324,7 +323,19 @@ public class MatchSource {
if(StringUtils.isNotBlank(source)){ if(StringUtils.isNotBlank(source)){
source = source.replaceAll(".*来源:|)", ""); source = source.replaceAll(".*来源:|)", "");
} }
}else if(url.contains("gu.qq.com")){
source = document.select("span#news_source").text();
if(StringUtils.isNotBlank(source)){
return source;
}
}else if(url.contains("kandian.youth.cn")){
source = document.select("div.fl > a").text();
if(StringUtils.isNotBlank(source)){
return source;
} }
}
if(Objects.nonNull(source) && source.length() != 0) { if(Objects.nonNull(source) && source.length() != 0) {
return source; return source;
} }
...@@ -487,9 +498,12 @@ public class MatchSource { ...@@ -487,9 +498,12 @@ public class MatchSource {
} }
}else if(url.contains("baijiahao.baidu.com") || url.contains("mbd.baidu.com")){ }else if(url.contains("baijiahao.baidu.com") || url.contains("mbd.baidu.com")){
//百度百家 //百度百家
if(StringUtils.isNotBlank(document.select("span.userNameSpan").text())){
source = document.select("span.userNameSpan").text(); source = document.select("span.userNameSpan").text();
if(StringUtils.isBlank(source)){ }else if(StringUtils.isNotBlank(document.select("p.author-name:nth-child(1)").text())){
source = document.select("p.author-name:nth-child(1)").text(); source = document.select("p.author-name:nth-child(1)").text();
}else if(StringUtils.isNotBlank(document.select("a.authorName").text())){
source = document.select("a.authorName").text();
} }
if(StringUtils.isNotBlank(source)){ if(StringUtils.isNotBlank(source)){
source = "百度百家-" + source; source = "百度百家-" + source;
...@@ -768,8 +782,7 @@ public class MatchSource { ...@@ -768,8 +782,7 @@ public class MatchSource {
if(StringUtils.isNotBlank(source)){ if(StringUtils.isNotBlank(source)){
source = "推酷-" + source; source = "推酷-" + source;
} }
} }else if(url.contains("36kr.com")){
if(url.contains("36kr.com")){
source = document.select("div.info-header-text > a.author-name").text(); source = document.select("div.info-header-text > a.author-name").text();
if(StringUtils.isNotBlank(source)){ if(StringUtils.isNotBlank(source)){
return "36氪-" + source; return "36氪-" + source;
...@@ -783,6 +796,7 @@ public class MatchSource { ...@@ -783,6 +796,7 @@ public class MatchSource {
return "36氪-" + source; return "36氪-" + source;
} }
} }
return source; return source;
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment