Commit 210aee67 by yangchen

增加部分新浪自媒体的采集

parent a2736a3f
...@@ -25,7 +25,7 @@ public class MediaSelfSource { ...@@ -25,7 +25,7 @@ public class MediaSelfSource {
public static void main(String[] args) { public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER); ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("https://finance.sina.cn/stock/relnews/hk/2019-07-22/detail-ihytcitm3847530.d.html"); urlList.add("http://auto.sina.com.cn/j_kandian.d.html?docid=hytcerm4907505&subch=bauto&hpid=00032");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList); List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) { for(MediaSelfSourceBean b : u) {
System.out.println(b.toString()); System.out.println(b.toString());
......
...@@ -396,12 +396,22 @@ public class MatchSource { ...@@ -396,12 +396,22 @@ public class MatchSource {
} }
}else if(url.contains("cj.sina.com.cn") || url.contains("finance.sina.cn")){ }else if(url.contains("cj.sina.com.cn") || url.contains("finance.sina.cn")){
source = document.select("h2.weibo_user").text(); source = document.select("h2.weibo_user").text();
if(Objects.isNull(source) || source.length() < 1){
//新浪财经头条号 //新浪财经头条号
if(html.contains("<meta name=\"mediaid\"")){ if(html.contains("<meta name=\"mediaid\"")){
source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim(); source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
}
}
if(source!=null && source.length()>1){ if(source!=null && source.length()>1){
source = "财经头条-" + source; source = "财经头条-" + source;
} }
}else if(url.contains("auto.sina.cn") || url.contains("auto.sina.com.cn")){
source = document.select("div.art_title > div > span:nth-child(1)").text();
if(Objects.isNull(source) || source.length() < 1){
source = document.select("#top_bar > div > div.date-source > a").text();
}
if(source!=null && source.length()>1){
source = "新浪-" + source;
} }
}else if(url.contains("baijiahao.baidu.com")){ }else if(url.contains("baijiahao.baidu.com")){
//百度百家 //百度百家
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment