Commit eff378d9 by chenweiyang

Merge branch 'source-forward-chen' of…

Merge branch 'source-forward-chen' of http://git.zhiweidata.top/zhangzhiwei/source_forward.git into source-forward-chen
parents 8c8442e6 aa059934
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId> <artifactId>source-forward</artifactId>
<version>0.2.6-SNAPSHOT</version> <version>0.2.7-SNAPSHOT</version>
<name>source-forward</name> <name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description> <description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
...@@ -24,18 +24,18 @@ ...@@ -24,18 +24,18 @@
<dependency> <dependency>
<groupId>com.alibaba</groupId> <groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId> <artifactId>fastjson</artifactId>
<version>1.2.62</version> <version>1.2.71</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.1.9-SNAPSHOT</version> <version>0.2.0-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.6.6.3-SNAPSHOT</version> <version>0.6.6.8-SNAPSHOT</version>
</dependency> </dependency>
</dependencies> </dependencies>
......
...@@ -80,7 +80,7 @@ public class SourceForward { ...@@ -80,7 +80,7 @@ public class SourceForward {
public static void main(String[] args) { public static void main(String[] args) {
ProxyInit.initProxy(); ProxyInit.initProxy();
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("http://software.it168.com/a2019/0621/6005/000006005693.shtml"); urlList.add("http://www.wangjiaozixun.com/html/zx20/2020/0730/1396388.html");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList); List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
for(SourceForwardBean sfb : da) { for(SourceForwardBean sfb : da) {
System.out.println(sfb.toString()); System.out.println(sfb.toString());
......
...@@ -39,6 +39,7 @@ public class MatchSource { ...@@ -39,6 +39,7 @@ public class MatchSource {
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})" + "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})" + "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2})" + "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2})"
+ "(\\d+[^\\d]{1,2})+\\d+"
; ;
/** /**
* @Title: findURLs * @Title: findURLs
...@@ -60,9 +61,13 @@ public class MatchSource { ...@@ -60,9 +61,13 @@ public class MatchSource {
/***特定网站单独处理**/ /***特定网站单独处理**/
if(url.contains("thepaper.cn")){ if(url.contains("thepaper.cn")){
//单独处理澎湃数据 //单独处理澎湃数据
if(StringUtils.isNotBlank(document.select("div.name").text())){
source = document.select("div.name").text();
}else{
source = document.select("div.news_about").select("p").select("span").text().replaceAll(".*来源:", ""); source = document.select("div.news_about").select("p").select("span").text().replaceAll(".*来源:", "");
if(StringUtils.isNotBlank(source)) { if(StringUtils.isNotBlank(source)) {
source = document.select("div.news_about").text().replaceAll(" \\d{4}.*|.*/", ""); source = document.select("div.news_about").text().replaceAll("\\d{4}.*|.*/", "");
}
} }
}else if(url.contains("sports.eastday.com")){ }else if(url.contains("sports.eastday.com")){
//单独处理东方体育网 //单独处理东方体育网
...@@ -314,6 +319,11 @@ public class MatchSource { ...@@ -314,6 +319,11 @@ public class MatchSource {
source = document.select("div.tpl_header_author").text(); source = document.select("div.tpl_header_author").text();
}else if(url.contains("china.prcfe.com")) { }else if(url.contains("china.prcfe.com")) {
source = html.split("\"")[1]; source = html.split("\"")[1];
}else if(url.contains("wangjiaozixun.com")) {
source = document.select("p.em_media").text();
if(StringUtils.isNotBlank(source)){
source = source.replaceAll(".*来源:|)", "");
}
} }
if(Objects.nonNull(source) && source.length() != 0) { if(Objects.nonNull(source) && source.length() != 0) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment