Commit fc02384b by win 10

Merge branch 'source-forward-chen' of http://git.zhiweidata.top/zhangzhiwei/source_forward.git into source-forward-chen

Conflicts:
	src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
parents 3342069b 2d3871ad
......@@ -82,7 +82,7 @@ public class UrlLiveCrawler {
callBack(callback, attr, 1,String.valueOf(rs.code()));
}
} else {
callBack(callback, attr, 1,String.valueOf(rs.code()));
callBack(callback, attr, 1,"未访问成功");
}
} catch (Exception e) {
logger.error(" 数据是否删除 采集出错 {} ",e);
......
......@@ -26,6 +26,7 @@ public class MediaSelfSource {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List<String> urlList = new ArrayList<>();
urlList.add("http://yugang.blogchina.com/713055888.html");
urlList.add("https://item.btime.com/m_9bf5d805a257ddc87");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) {
System.out.println(b.toString());
......
......@@ -81,7 +81,7 @@ public class SourceForward {
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List<String> urlList = new ArrayList<>();
urlList.add("https://www.jiemian.com/article/2782869.html");
urlList.add("http://industry.caijing.com.cn/20190423/4582310.shtml");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
for(SourceForwardBean sfb : da) {
System.out.println(sfb.toString());
......
......@@ -3,7 +3,6 @@ package com.zhiwei.source_forward.util;
import java.util.List;
import java.util.Objects;
import org.checkerframework.checker.units.qual.s;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
......@@ -101,10 +100,7 @@ public class MatchSource {
}else if(url.contains("caijing.com.cn")){
//财经网产经
source = document.select("#source_baidu").text();
}
else if(url.contains("news.eastday.com")){
}else if(url.contains("news.eastday.com")){
//单独处理东方网
source = document.select("div#sectionleft").select("div").select("p").select("a").text();
}else if(url.contains("ny.chinacenn.com")){
......@@ -217,7 +213,7 @@ public class MatchSource {
source = document.select("div.tip.fl").select("a").text();
}else if(url.contains("finance.jrj.com.cn")){
//单独处理金融界
source = document.select("p.inftop").select("span").select("a").text().replaceAll("价值.*| ", "");
source = document.select("p.inftop").select("span").get(1).select("a").text().replaceAll("价值.*| ", "");
}else if(url.contains("tech.china.com.cn")){
//单独处理中国网
source = document.select("span.fl.time2").select("a").text();
......@@ -298,8 +294,12 @@ public class MatchSource {
source = document.select("span.label_nr").text();
}else if(url.contains("jiemian.com")){
//单独处理界面新闻
source = document.select("div.article-info").select("span").text().replaceAll(".*来源:| 字体[\\w\\W]*", "");
}
// source = document.select("div.article-info").select("span").text().replaceAll(".*来源:| 字体[\\w\\W]*", "");
return "界面新闻";
}else if(url.contains("finance.youth.cn")){
//单独处理中国青年网
source = document.select("span#source_baidu").text().replaceAll("来源:|作者.*", "");
}
if(Objects.nonNull(source) && source.length() != 0) {
return source;
......@@ -339,9 +339,11 @@ public class MatchSource {
/***特定网站单独处理**/
if(url.contains("toutiao.com")){
//今日头条帐号匹配
if(html.contains("name: '")){
if(html.contains("name: '") && html.contains("mediaInfo")){
source = html.split("mediaInfo:")[1].split("name: '")[1].split("',")[0].trim();
}else if(html.contains("screen_name:")){
}else if(html.contains("name: '") && html.contains("ugcInfo")){
source = html.split("ugcInfo:")[1].split("name: '")[1].split("',")[0].trim();
}else if(html.contains("screen_name:")){
source = html.split("screen_name:'")[1].split("',")[0].trim();
}
if(source!=null && source.length()>1){
......@@ -422,6 +424,9 @@ public class MatchSource {
}else if(url.contains("item.btime.com")){
//北京时间
source = document.select("a.author").text();
if(Objects.isNull(source) || source.length() < 1){
source = document.select("div.content-info > span.col.cite").text();
}
if(source!=null && source.length()>1){
source = "北京时间-" + source;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment