Commit 9557316d by win 10

新增了txt文件的来源和大部分平台来源解析

parent 6e7f47cf
......@@ -25,7 +25,7 @@ public class MediaSelfSource {
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List<String> urlList = new ArrayList<>();
urlList.add("https://www.360kuai.com/pc/9277f65f68bba0265?cota=3&kuai_so=1&sign=360_e39369d1&refer_scene=so_3");
urlList.add("http://sh.qihoo.com/pc/9dcfa48989d33df34?cota=1&sign=360_e39369d1&refer_scene=so_3");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) {
System.out.println(b.toString());
......
......@@ -81,7 +81,7 @@ public class SourceForward {
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List<String> urlList = new ArrayList<>();
urlList.add("http://www.northnews.cn/2019/0419/3080909.shtml");
urlList.add("https://www.jiemian.com/article/2782869.html");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
for(SourceForwardBean sfb : da) {
System.out.println(sfb.toString());
......
......@@ -9,6 +9,8 @@ import java.util.Map.Entry;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.crawler.UrlLiveCrawler;
......@@ -69,17 +71,17 @@ public class URLLive {
return UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
}
// public static void main(String[] args) {
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// List<String> urlList = new ArrayList<>();
// urlList.add("https://www.toutiao.com/a6680674354260345355");
//
//
// List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
// for(UrlLiveBean b : u) {
// System.out.println(b.toString());
// }
// }
/**
 * Manual driver for a quick local check: initialises {@code ProxyFactory},
 * passes a single sample URL to {@code URLLive.verificationURLLive} and
 * prints every returned {@code UrlLiveBean} to standard output.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);

    // One hard-coded sample page to verify.
    List<String> sampleUrls = new ArrayList<>();
    sampleUrls.add("http://www.ebrun.com/ebrungo/zb/316384.shtml");

    List<UrlLiveBean> checked = URLLive.verificationURLLive(sampleUrls);
    for (UrlLiveBean bean : checked) {
        String line = bean.toString();
        System.out.println(line);
    }
}
static class UrlLiveCrawlerThread extends Thread{
......
......@@ -3,6 +3,7 @@ package com.zhiwei.source_forward.util;
import java.util.List;
import java.util.Objects;
import org.checkerframework.checker.units.qual.s;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
......@@ -56,7 +57,10 @@ public class MatchSource {
/***特定网站单独处理**/
if(url.contains("thepaper.cn")){
//单独处理澎湃数据
source = document.select("div.news_about").text();
source = document.select("div.news_about").select("p").select("span").text().replaceAll(".*来源:", "");
if(source.length() == 0) {
source = document.select("div.news_about").text().replaceAll(" \\d{4}.*|.*/", "");
}
}else if(url.contains("sports.eastday.com")){
//单独处理东方体育网
source = document.select("div.article").select("span").text();
......@@ -97,18 +101,220 @@ public class MatchSource {
}else if(url.contains("caijing.com.cn")){
//财经网产经
source = document.select("#source_baidu").text();
}else{
}
else if(url.contains("news.eastday.com")){
//单独处理东方网
source = document.select("div#sectionleft").select("div").select("p").select("a").text();
}else if(url.contains("ny.chinacenn.com")){
//单独处理中企网
source = document.select("td").select("span.ltutext3").text().replaceAll(" \\d{4}.*", "");
}else if(url.contains("ebrun.com")){
//单独处理亿邦动力网
source = document.select("div.post-header").select("p.source").select("span.f-left").text().replaceAll(".*来源: ", "");
}else if(url.contains("www.mnw.cn")){
//单独处理闽南网
source = document.select("div.il").select("span").text().replaceAll("来源:|\\d{4}.*", "");
}else if(url.contains("sn.cri.cn")){
//单独处理国际在线
source = document.select("span.asource").select("a").text();
}else if(url.contains("sh.sina.com.cn")){
//单独处理新浪上海
source = document.select("p.source-time").select("span").get(1).select("a").text();
}else if(url.contains("kaixian.tv")){
//单独处理汉丰网
source = document.select("div.content").select("h2.font_gray").text().replaceAll(".*来源:", "");
}else if(url.contains("lanjingtmt.com")){
//单独处理蓝鲸TMT
source = "蓝鲸TMT网";
}else if(url.contains("tech.huanqiu.com")){
//单独处理环球网
source = document.select("span.la_t_b").select("a").text();
}else if(url.contains("china.qianlong.com")){
//单独处理千龙网
source = document.select("span.source").select("a").text();
}else if(url.contains("m.mnw.cn")){
//单独处理手机闽南网
source = document.select("article.info").select("header").select("div").select("span").text().replaceAll("\\d{4}.*| ", "");
}else if(url.contains("mydrivers.com")){
//单独处理快科技
source = document.select("div.news_bt1_left").text().replaceAll(".*出处:| 作者:[\\w\\W]*", "");
}else if(url.contains("3dmgame.com")){
//单独处理3DMGAME
source = document.select("ul.intem").select("li").select("span.weibo").text();
}else if(url.contains("99it.com.cn")){
//单独处理99科技
source = document.select("div.mate").select("span").text().replaceAll(".*来源:|编辑.*", "");
}else if(url.contains("ciotimes.com")){
//单独处理CIO时代网
source = document.select("p.ly.visible-xs.text-left").text().replaceAll(".*来源:", "");
}else if(url.contains("ithome.com")){
//单独处理IT之家
source = document.select("span#source_baidu").select("a").text();
}else if(url.contains("techweb.com.cn")){
//单独处理TechWeb
source = document.select("span.from").select("a").text();
}else if(url.contains("cniteyes.com")){
//单独处理T客帮
source = document.select("div.item-date").select("span").text();
}else if(url.contains("enorth.com.cn")){
//单独处理北方网
source = document.select("p.col-sm-8.info").select("span").text().replaceAll(".*来源:|编辑.*", "");
}else if(url.contains("btime.com")){
//单独处理北京时间
source = document.select("span.col.cite").text();
}else if(url.contains("bianews.com")){
//单独处理鞭牛士
source = document.select("span.name.fl").text();
}else if(url.contains("dzwww.com")){
//单独处理大众网
source = document.select("div.layout").select("div.left").text().replaceAll(".*来源: |作者.*", "");
}else if(url.contains("dsb.cn")){
//单独处理电商报
source = document.select("div.new-content-info.clearfix").select("span").text().replaceAll(".*作者:", "");
}else if(url.contains("finance.eastmoney.com")){
//单独处理东方财富网
source = document.select("div.source.data-source").attr("data-source").toString();
}else if(url.contains("emwap.eastmoney.com")){
//单独处理东方财富网客户端
source = document.select("div.where").select("span.source").attr("title");
}else if(url.contains("mini.eastday.com")){
//单独处理东方头条
source = document.select("div.article-src-time").select("span").text().replaceAll(".*来源:", "");
}else if(url.contains("tech.ifeng.com")){
//单独处理凤凰科技
source = document.select("p.p_time").select("span").select("span.ss03").text();
}else if(url.contains("finance.ifeng.com")){
//单独处理凤凰网
source = document.select("p.p_time").select("span").select("span").select("a").text();
}else if(url.contains("iphone.265g.com")){
//单独处理265G网
source = document.select("div.article_info").select("span").text().replaceAll(".*来源:|QQ群号.*", "");
}else if(url.contains("yicai.com")){
//单独处理第一财经
source = document.select("div.title.f-pr").select("p").select("span").text();
}else if(url.contains("cnblogs.com")){
//单独处理博客园
source = document.select("div#come_from").text().replaceAll(".*来自:", "");
}else if(url.contains("chinaxiaokang.com")){
//单独处理中国小康网
source = document.select("span#arturl").select("a").text();
}else if(url.contains("chinabaogao.com")) {
//单独处理中国报告网
source = document.select("p.cbg-a-d-info").select("a").text().replaceAll("大 中 小 | ", "");
}else if(url.contains("anyv.net")) {
//单独处理爱妮微
source = document.select("span.cor666").select("a").text();
}else if(url.contains("yingxiao360.com")){
//单独处理第一赢销网
source = "第一赢销网";
}else if(url.contains("cctime.com")){
//单独处理飞象网
source = document.select("td.dateAndSource").text().replaceAll(".*\\d{2}|作 者.*| ", "");
}else if(url.contains("news.hexun.com")){
//单独处理和讯网
source = document.select("div.tip.fl").select("a").text();
}else if(url.contains("finance.jrj.com.cn")){
//单独处理金融界
source = document.select("p.inftop").select("span").select("a").text().replaceAll("价值.*| ", "");
}else if(url.contains("tech.china.com.cn")){
//单独处理中国网
source = document.select("span.fl.time2").select("a").text();
}else if(url.contains("news.china.com.cn")){
//单独处理中国网
source = document.select("div.pub_date").select("span#source_baidu").text().replaceAll(".*来源:", "");
}else if(url.contains("admin5.com")){
//单独处理站长网
source = document.select("div.source").select("span").text().replaceAll(".*来源:| ", "");
}else if(url.contains("stock.qq.com")){
//单独处理腾讯证券
source = document.select("div.a_Info").select("span.a_source").text();
}else if(url.contains("n.cztv.com")){
//单独处理新蓝网
source = document.select("div.publish").select("ul").select("li").text().replaceAll("\\d{4}.*", "");
}else if(url.contains("news.paidai.com")){
//单独处理派代网
source = document.select("p.t_info").select("span").select("a").text();
}else if(url.contains("news.mydrivers.com")){
//单独处理快科技
source = document.select("div.news_bt1_left").text().replaceAll(".*出处:| 作者.*", "");
}else if(url.contains("www.chinaz.com")){
//单独处理站长之家
source = document.select("div.meta").select("span.source").select("a").text();
}else if(url.contains("yuncaijing.com")){
//单独处理云财经
source = document.select("section.news-wrap").select("header").select("div").text().replaceAll(".*消息来源: |\\[阅读原文.*| ", "");
}else if(url.contains("itmsc.cn")){
//单独处理科技传媒网
source = document.select("div.arc_sc").select("p").select("a").text();
}else if(url.contains("nbd.com.cn")){
//单独处理每日经济新闻
source = document.select("span.source").text();
}else if(url.contains("pintu360.com")){
//单独处理品途商业评论
source = "品途商业评论";
}else if(url.contains("news.qudong.com")){
//单独处理驱动中国
source = document.select("div.news_right").select("dd").select("li").select("span").select("a").text().replaceAll(" .*", "");
}else if(url.contains("shobserver.com")){
//单独处理上海观察
source = document.select("span.max-words").get(0).text();
}else if(url.contains("g.pconline.com.cn")){
//单独处理太平洋电脑网
source = document.select("div.art-info").text().replaceAll("手机|\\d{4}.*| ", "");
}else if(url.contains("news.xtol.cn")){
//单独处理湘潭在线
source = document.select("span.date").text().replaceAll(".*来源:", "");
}else if(url.contains("bjnews.com.cn")){
//单独处理新京报网
source = document.select("span.author").text().replaceAll(" 记者.*", "");
}else if(url.contains("telworld.com.cn")){
//单独处理运营商世界
source = document.select("div.news_xiang_tit_2_left").select("a").text();
}else if(url.contains("thehour.cn")){
//单独处理浙江24小时
source = document.select("div.newsInfo").select("span").select("a").text();
}else if(url.contains("sh.zol.com.cn")){
//单独处理中关村在线
source = document.select("div.article-aboute").select("span.source_baidu").text();
}else if(url.contains("ec.com.cn")){
//单独处理中国国际电子商务网
source = document.select("span.article_resource").text().replaceAll(".*来源:", "");
}else if(url.contains("cqn.com.cn")){
//单独处理中国质量新闻网
source = document.select("span.from").text().replaceAll("-.*", "");
}else if(url.contains("sc.stock.cnfol.com")){
//单独处理中金在线
source = document.select("div.artDes").select("span").select("a").text();
}else if(url.contains("zczj.com")){
//单独处理众筹之家
source = document.select("div.news-info").select("span").text().replaceAll("来源:|作者.*", "");
}else if(url.contains("cqcb.com")){
//单独处理重庆晨报
source = document.select("span.label_nr").text();
}else if(url.contains("stock.10jqka.com.cn")){
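//Handle stock.10jqka.com.cn separately. NOTE(review): the selector below is identical to the cqcb.com (Chongqing Morning Post) branch - confirm it matches this site.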
source = document.select("span.label_nr").text();
}else if(url.contains("jiemian.com")){
//单独处理界面新闻
source = document.select("div.article-info").select("span").text().replaceAll(".*来源:| 字体[\\w\\W]*", "");
}
if(Objects.nonNull(source) && source.length() != 0) {
return source;
}
else{
//其他网站处理
source = mathchOtherSource(html, htmlBody, sourceList);
}
if(source!=null){
//验证来源
// for (String sourceMatch : sourceList) {
// if (source.contains(sourceMatch)) {
// return sourceMatch;
// }
// }
return source;
for (String sourceMatch : sourceList) {
if (source.contains(sourceMatch)) {
return sourceMatch;
}
}
}
}
} catch (Exception e) {
e.printStackTrace();
......@@ -213,12 +419,17 @@ public class MatchSource {
if(source!=null && source.length()>1){
source = "汽车之家-" + source;
}
}else if(url.contains("item.btime.com")){
}
else if(url.contains("item.btime.com")){
//北京时间
source = document.select("a.author").text();
if(source!=null && source.length()>1){
source = "北京时间-" + source;
}
}
else if(url.contains("item.btime.com")){
//北京时间
source = document.select("span.col cite").text();
}else if(url.contains("qq.com/")){
//腾讯网-企鹅号
source = html.split("media\": \"")[1].split("\",")[0];
......
......@@ -893,6 +893,7 @@ ZOL中关村在线
华东理工大学
华东在线
华尔街见闻
华尔街见闻网
华股财经
华龙网
华龙网法律频道
......@@ -2488,6 +2489,7 @@ ZOL中关村在线
智慧长沙
智慧长沙资讯
智能派
智通财经
智通财经网
置家网
中安在线
......@@ -3053,3 +3055,26 @@ ZOL中关村在线
最高人民法院网
最高人民检察院
今日湖北
中国经营报
三言财经
TechWeb.com.cn
中企网
央视新闻移动网
新浪财经-自媒体综合
T媒体
《法人》
国是直通车
科技小肆
雷帝触网
铅笔道
三秦都市报
新浪财经综合
央视财经
第一财经
第一赢销网
国际金融报
A5创业网
运营商世界网讯
中外管理杂志
上游新闻综合
新蓝网·浙江网络广播电视台
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment