Commit aa059934 by zhiwei

提高采集jar版本及添加wangjiaozixun网站匹配规则及时间匹配规则

parent bb108a86
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.2.6-SNAPSHOT</version>
<version>0.2.7-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......@@ -24,18 +24,18 @@
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.62</version>
<version>1.2.71</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.9-SNAPSHOT</version>
<version>0.2.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.6.6.3-SNAPSHOT</version>
<version>0.6.6.8-SNAPSHOT</version>
</dependency>
</dependencies>
......
......@@ -80,7 +80,7 @@ public class SourceForward {
public static void main(String[] args) {
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
urlList.add("http://software.it168.com/a2019/0621/6005/000006005693.shtml");
urlList.add("http://www.wangjiaozixun.com/html/zx20/2020/0730/1396388.html");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
for(SourceForwardBean sfb : da) {
System.out.println(sfb.toString());
......
......@@ -39,6 +39,7 @@ public class MatchSource {
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2})"
+ "(\\d+[^\\d]{1,2})+\\d+"
;
/**
* @Title: findURLs
......@@ -60,9 +61,13 @@ public class MatchSource {
/***特定网站单独处理**/
if(url.contains("thepaper.cn")){
//单独处理澎湃数据
if(StringUtils.isNotBlank(document.select("div.name").text())){
source = document.select("div.name").text();
}else{
source = document.select("div.news_about").select("p").select("span").text().replaceAll(".*来源:", "");
if(StringUtils.isNotBlank(source)) {
source = document.select("div.news_about").text().replaceAll(" \\d{4}.*|.*/", "");
source = document.select("div.news_about").text().replaceAll("\\d{4}.*|.*/", "");
}
}
}else if(url.contains("sports.eastday.com")){
//单独处理东方体育网
......@@ -314,6 +319,11 @@ public class MatchSource {
source = document.select("div.tpl_header_author").text();
}else if(url.contains("china.prcfe.com")) {
source = html.split("\"")[1];
}else if(url.contains("wangjiaozixun.com")) {
source = document.select("p.em_media").text();
if(StringUtils.isNotBlank(source)){
source = source.replaceAll(".*来源:|)", "");
}
}
if(Objects.nonNull(source) && source.length() != 0) {
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment