Commit 0e4b6f49 by win 10

新增界面新闻、亿欧网、蓝鲸、蓝鲸财经、虎嗅、连线家六个自媒体来源的匹配

parent 364e507d
......@@ -174,7 +174,7 @@ public class MatchSource {
source = document.select("div.new-content-info.clearfix").select("span").text().replaceAll(".*作者:", "");
}else if(url.contains("finance.eastmoney.com")){
//单独处理东方财富网
source = document.select("div.source.data-source").attr("data-source").toString();
source = document.select("div.source.data-source").attr("data-source");
}else if(url.contains("emwap.eastmoney.com")){
//单独处理东方财富网客户端
source = document.select("div.where").select("span.source").attr("title");
......@@ -298,7 +298,7 @@ public class MatchSource {
}else if(url.contains("stock.10jqka.com.cn")){
// Special-case handling for 10jqka (同花顺) stock pages
source = document.select("span.label_nr").text();
}else if(url.contains("jiemian.com")){
}else if(url.contains("jiemian.com") ){
//单独处理界面新闻
// source = document.select("div.article-info").select("span").text().replaceAll(".*来源:| 字体[\\w\\W]*", "");
return "界面新闻";
......@@ -331,7 +331,7 @@ public class MatchSource {
}
}
} catch (Exception e) {
e.printStackTrace();
e.toString();
}
return null;
}
......@@ -411,12 +411,10 @@ public class MatchSource {
}else if(url.contains("cj.sina.com.cn") || url.contains("finance.sina.cn") ||
url.contains("tech.sina.cn") || url.contains("news.sina.cn")){
source = document.select("h2.weibo_user").text();
if(Objects.isNull(source) || source.length() < 1){
if((Objects.isNull(source) || source.length() < 1) && html.contains("<meta name=\"mediaid\"")){
//新浪科技头条号
if(html.contains("<meta name=\"mediaid\"")){
source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
}
}
if(Objects.isNull(source) || source.length() < 1){
//新浪财经头条号
source = document.select("body > main > section.j_main_art > section > article > time > cite").text();
......@@ -471,9 +469,6 @@ public class MatchSource {
if(source!=null && source.length()>1){
source = "北京时间-" + source;
}
}else if(url.contains("item.btime.com")){
//北京时间
source = document.select("span.col cite").text();
}else if(url.contains("mp.qq.com")){
source = document.select("div#account_top > div.puin_text > div.pname").text();
if(source!=null && !source.equals("")){
......@@ -588,6 +583,31 @@ public class MatchSource {
if(source!=null && !source.equals("")){
source = "优酷-" + source;
}
}else if(url.contains("jiemian.com")) {
source = document.select("div.article-info > p > span.author > a").text();
if(source!=null && !source.equals("")){
source = "界面新闻-" + source;
}
}else if (url.contains("iyiou.com")) {
source = document.select("div#post_author > a").text();
if(source!=null && !source.equals("")) {
source = "亿欧网-" + source;
}
}else if (url.contains("lanjingtmt.com")) {
source = document.select("div.scd-title > a:nth-child(2)").text();
if(source!=null && !source.equals("")) {
source = "蓝鲸-" + source;
}
}else if (url.contains("lanjinger.com")) {
source = document.select("div.article_info > span.info.author_name").text().replaceAll(".*编辑| ", "");
if(source!=null && !source.equals("")) {
source = "蓝鲸财经-" + source;
}
}else if (url.contains("huxiu.com")) {
source = document.select("div.article__author-info-box > a.article-author-info > span.author-info__username").text();
if(source!=null && !source.equals("")) {
source = "虎嗅-" + source;
}
}
return source;
} catch (Exception e) {
......@@ -712,7 +732,7 @@ public class MatchSource {
}
} catch (Exception e) {
System.out.println("正文抽取失败处理........");
e.printStackTrace();
e.toString();
/***
* 匹配正文失败
* 匹配命中包含来源等规则的数据
......@@ -758,7 +778,7 @@ public class MatchSource {
* 主要匹配 YYYY-MM-dd xx日报
* 或 xx日报 YYYY-MM-dd
***/
String times[] = htmlBody.split(timeSource);
String[] times = htmlBody.split(timeSource);
for (int j = 0; j < times.length; j++) {
String timecontent = times[j];
if (j == 0) {
......@@ -783,7 +803,7 @@ public class MatchSource {
}
return null;
} catch (Exception e) {
e.printStackTrace();
e.toString();
return null;
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment