Commit 0e4b6f49 by win 10

新增界面新闻、亿欧网、蓝鲸、蓝鲸财经、虎嗅、连线家六个自媒体来源的匹配

parent 364e507d
...@@ -174,7 +174,7 @@ public class MatchSource { ...@@ -174,7 +174,7 @@ public class MatchSource {
source = document.select("div.new-content-info.clearfix").select("span").text().replaceAll(".*作者:", ""); source = document.select("div.new-content-info.clearfix").select("span").text().replaceAll(".*作者:", "");
}else if(url.contains("finance.eastmoney.com")){ }else if(url.contains("finance.eastmoney.com")){
//单独处理东方财富网 //单独处理东方财富网
source = document.select("div.source.data-source").attr("data-source").toString(); source = document.select("div.source.data-source").attr("data-source");
}else if(url.contains("emwap.eastmoney.com")){ }else if(url.contains("emwap.eastmoney.com")){
//单独处理东方财富网客户端 //单独处理东方财富网客户端
source = document.select("div.where").select("span.source").attr("title"); source = document.select("div.where").select("span.source").attr("title");
...@@ -298,7 +298,7 @@ public class MatchSource { ...@@ -298,7 +298,7 @@ public class MatchSource {
}else if(url.contains("stock.10jqka.com.cn")){ }else if(url.contains("stock.10jqka.com.cn")){
//单独处理重庆晨报 //单独处理重庆晨报
source = document.select("span.label_nr").text(); source = document.select("span.label_nr").text();
}else if(url.contains("jiemian.com")){ }else if(url.contains("jiemian.com") ){
//单独处理界面新闻 //单独处理界面新闻
// source = document.select("div.article-info").select("span").text().replaceAll(".*来源:| 字体[\\w\\W]*", ""); // source = document.select("div.article-info").select("span").text().replaceAll(".*来源:| 字体[\\w\\W]*", "");
return "界面新闻"; return "界面新闻";
...@@ -331,7 +331,7 @@ public class MatchSource { ...@@ -331,7 +331,7 @@ public class MatchSource {
} }
} }
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.toString();
} }
return null; return null;
} }
...@@ -411,11 +411,9 @@ public class MatchSource { ...@@ -411,11 +411,9 @@ public class MatchSource {
}else if(url.contains("cj.sina.com.cn") || url.contains("finance.sina.cn") || }else if(url.contains("cj.sina.com.cn") || url.contains("finance.sina.cn") ||
url.contains("tech.sina.cn") || url.contains("news.sina.cn")){ url.contains("tech.sina.cn") || url.contains("news.sina.cn")){
source = document.select("h2.weibo_user").text(); source = document.select("h2.weibo_user").text();
if(Objects.isNull(source) || source.length() < 1){ if((Objects.isNull(source) || source.length() < 1) && html.contains("<meta name=\"mediaid\"")){
//新浪科技头条号 //新浪科技头条号
if(html.contains("<meta name=\"mediaid\"")){ source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
}
} }
if(Objects.isNull(source) || source.length() < 1){ if(Objects.isNull(source) || source.length() < 1){
//新浪财经头条号 //新浪财经头条号
...@@ -471,9 +469,6 @@ public class MatchSource { ...@@ -471,9 +469,6 @@ public class MatchSource {
if(source!=null && source.length()>1){ if(source!=null && source.length()>1){
source = "北京时间-" + source; source = "北京时间-" + source;
} }
}else if(url.contains("item.btime.com")){
//北京时间
source = document.select("span.col cite").text();
}else if(url.contains("mp.qq.com")){ }else if(url.contains("mp.qq.com")){
source = document.select("div#account_top > div.puin_text > div.pname").text(); source = document.select("div#account_top > div.puin_text > div.pname").text();
if(source!=null && !source.equals("")){ if(source!=null && !source.equals("")){
...@@ -588,6 +583,31 @@ public class MatchSource { ...@@ -588,6 +583,31 @@ public class MatchSource {
if(source!=null && !source.equals("")){ if(source!=null && !source.equals("")){
source = "优酷-" + source; source = "优酷-" + source;
} }
}else if(url.contains("jiemian.com")) {
source = document.select("div.article-info > p > span.author > a").text();
if(source!=null && !source.equals("")){
source = "界面新闻-" + source;
}
}else if (url.contains("iyiou.com")) {
source = document.select("div#post_author > a").text();
if(source!=null && !source.equals("")) {
source = "亿欧网-" + source;
}
}else if (url.contains("lanjingtmt.com")) {
source = document.select("div.scd-title > a:nth-child(2)").text();
if(source!=null && !source.equals("")) {
source = "蓝鲸-" + source;
}
}else if (url.contains("lanjinger.com")) {
source = document.select("div.article_info > span.info.author_name").text().replaceAll(".*编辑| ", "");
if(source!=null && !source.equals("")) {
source = "蓝鲸财经-" + source;
}
}else if (url.contains("huxiu.com")) {
source = document.select("div.article__author-info-box > a.article-author-info > span.author-info__username").text();
if(source!=null && !source.equals("")) {
source = "虎嗅-" + source;
}
} }
return source; return source;
} catch (Exception e) { } catch (Exception e) {
...@@ -712,7 +732,7 @@ public class MatchSource { ...@@ -712,7 +732,7 @@ public class MatchSource {
} }
} catch (Exception e) { } catch (Exception e) {
System.out.println("正文抽取失败处理........"); System.out.println("正文抽取失败处理........");
e.printStackTrace(); e.toString();
/*** /***
* 匹配正文失败 * 匹配正文失败
* 匹配命中包含来源等规则的数据 * 匹配命中包含来源等规则的数据
...@@ -758,7 +778,7 @@ public class MatchSource { ...@@ -758,7 +778,7 @@ public class MatchSource {
* 主要匹配 YYYY-MM-dd xx日报 * 主要匹配 YYYY-MM-dd xx日报
* 或 xx日报 YYYY-MM-dd * 或 xx日报 YYYY-MM-dd
***/ ***/
String times[] = htmlBody.split(timeSource); String[] times = htmlBody.split(timeSource);
for (int j = 0; j < times.length; j++) { for (int j = 0; j < times.length; j++) {
String timecontent = times[j]; String timecontent = times[j];
if (j == 0) { if (j == 0) {
...@@ -783,7 +803,7 @@ public class MatchSource { ...@@ -783,7 +803,7 @@ public class MatchSource {
} }
return null; return null;
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.toString();
return null; return null;
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment