Commit fc02384b by win 10

Merge branch 'source-forward-chen' of http://git.zhiweidata.top/zhangzhiwei/source_forward.git into source-forward-chen

Conflicts:
	src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
parents 3342069b 2d3871ad
...@@ -82,7 +82,7 @@ public class UrlLiveCrawler { ...@@ -82,7 +82,7 @@ public class UrlLiveCrawler {
callBack(callback, attr, 1,String.valueOf(rs.code())); callBack(callback, attr, 1,String.valueOf(rs.code()));
} }
} else { } else {
callBack(callback, attr, 1,String.valueOf(rs.code())); callBack(callback, attr, 1,"未访问成功");
} }
} catch (Exception e) { } catch (Exception e) {
logger.error(" 数据是否删除 采集出错 {} ",e); logger.error(" 数据是否删除 采集出错 {} ",e);
......
...@@ -26,6 +26,7 @@ public class MediaSelfSource { ...@@ -26,6 +26,7 @@ public class MediaSelfSource {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER); ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("http://yugang.blogchina.com/713055888.html"); urlList.add("http://yugang.blogchina.com/713055888.html");
urlList.add("https://item.btime.com/m_9bf5d805a257ddc87");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList); List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) { for(MediaSelfSourceBean b : u) {
System.out.println(b.toString()); System.out.println(b.toString());
......
...@@ -81,7 +81,7 @@ public class SourceForward { ...@@ -81,7 +81,7 @@ public class SourceForward {
public static void main(String[] args) { public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER); ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("https://www.jiemian.com/article/2782869.html"); urlList.add("http://industry.caijing.com.cn/20190423/4582310.shtml");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList); List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
for(SourceForwardBean sfb : da) { for(SourceForwardBean sfb : da) {
System.out.println(sfb.toString()); System.out.println(sfb.toString());
......
...@@ -3,7 +3,6 @@ package com.zhiwei.source_forward.util; ...@@ -3,7 +3,6 @@ package com.zhiwei.source_forward.util;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import org.checkerframework.checker.units.qual.s;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
...@@ -101,10 +100,7 @@ public class MatchSource { ...@@ -101,10 +100,7 @@ public class MatchSource {
}else if(url.contains("caijing.com.cn")){ }else if(url.contains("caijing.com.cn")){
//财经网产经 //财经网产经
source = document.select("#source_baidu").text(); source = document.select("#source_baidu").text();
} }else if(url.contains("news.eastday.com")){
else if(url.contains("news.eastday.com")){
//单独处理东方网 //单独处理东方网
source = document.select("div#sectionleft").select("div").select("p").select("a").text(); source = document.select("div#sectionleft").select("div").select("p").select("a").text();
}else if(url.contains("ny.chinacenn.com")){ }else if(url.contains("ny.chinacenn.com")){
...@@ -217,7 +213,7 @@ public class MatchSource { ...@@ -217,7 +213,7 @@ public class MatchSource {
source = document.select("div.tip.fl").select("a").text(); source = document.select("div.tip.fl").select("a").text();
}else if(url.contains("finance.jrj.com.cn")){ }else if(url.contains("finance.jrj.com.cn")){
//单独处理金融界 //单独处理金融界
source = document.select("p.inftop").select("span").select("a").text().replaceAll("价值.*| ", ""); source = document.select("p.inftop").select("span").get(1).select("a").text().replaceAll("价值.*| ", "");
}else if(url.contains("tech.china.com.cn")){ }else if(url.contains("tech.china.com.cn")){
//单独处理中国网 //单独处理中国网
source = document.select("span.fl.time2").select("a").text(); source = document.select("span.fl.time2").select("a").text();
...@@ -298,8 +294,12 @@ public class MatchSource { ...@@ -298,8 +294,12 @@ public class MatchSource {
source = document.select("span.label_nr").text(); source = document.select("span.label_nr").text();
}else if(url.contains("jiemian.com")){ }else if(url.contains("jiemian.com")){
//单独处理界面新闻 //单独处理界面新闻
source = document.select("div.article-info").select("span").text().replaceAll(".*来源:| 字体[\\w\\W]*", ""); // source = document.select("div.article-info").select("span").text().replaceAll(".*来源:| 字体[\\w\\W]*", "");
} return "界面新闻";
}else if(url.contains("finance.youth.cn")){
//单独处理中国青年网
source = document.select("span#source_baidu").text().replaceAll("来源:|作者.*", "");
}
if(Objects.nonNull(source) && source.length() != 0) { if(Objects.nonNull(source) && source.length() != 0) {
return source; return source;
...@@ -339,9 +339,11 @@ public class MatchSource { ...@@ -339,9 +339,11 @@ public class MatchSource {
/***特定网站单独处理**/ /***特定网站单独处理**/
if(url.contains("toutiao.com")){ if(url.contains("toutiao.com")){
//今日头条帐号匹配 //今日头条帐号匹配
if(html.contains("name: '")){ if(html.contains("name: '") && html.contains("mediaInfo")){
source = html.split("mediaInfo:")[1].split("name: '")[1].split("',")[0].trim(); source = html.split("mediaInfo:")[1].split("name: '")[1].split("',")[0].trim();
}else if(html.contains("screen_name:")){ }else if(html.contains("name: '") && html.contains("ugcInfo")){
source = html.split("ugcInfo:")[1].split("name: '")[1].split("',")[0].trim();
}else if(html.contains("screen_name:")){
source = html.split("screen_name:'")[1].split("',")[0].trim(); source = html.split("screen_name:'")[1].split("',")[0].trim();
} }
if(source!=null && source.length()>1){ if(source!=null && source.length()>1){
...@@ -422,6 +424,9 @@ public class MatchSource { ...@@ -422,6 +424,9 @@ public class MatchSource {
}else if(url.contains("item.btime.com")){ }else if(url.contains("item.btime.com")){
//北京时间 //北京时间
source = document.select("a.author").text(); source = document.select("a.author").text();
if(Objects.isNull(source) || source.length() < 1){
source = document.select("div.content-info > span.col.cite").text();
}
if(source!=null && source.length()>1){ if(source!=null && source.length()>1){
source = "北京时间-" + source; source = "北京时间-" + source;
} }
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment