Commit b13567ee by yangchen

添加来源判断

parent eba74706
...@@ -9,6 +9,8 @@ import java.util.Map.Entry; ...@@ -9,6 +9,8 @@ import java.util.Map.Entry;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.source_forward.bean.SourceForwardBean; import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution; import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.crawler.SourceForwardCrawler; import com.zhiwei.source_forward.crawler.SourceForwardCrawler;
...@@ -77,13 +79,13 @@ public class SourceForward { ...@@ -77,13 +79,13 @@ public class SourceForward {
} }
public static void main(String[] args) { public static void main(String[] args) {
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER); ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
// urlList.add("https://www.toutiao.com/a6634320415839748621"); urlList.add("http://www.northnews.cn/2019/0419/3080909.shtml");
// List<SourceForwardBean> da = SourceForward.getSourceForward(urlList); List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
// for(SourceForwardBean sfb : da) { for(SourceForwardBean sfb : da) {
// System.out.println(sfb.toString()); System.out.println(sfb.toString());
// } }
} }
static class SourceForwardCrawlerThread extends Thread{ static class SourceForwardCrawlerThread extends Thread{
......
...@@ -39,7 +39,7 @@ public class MatchSource { ...@@ -39,7 +39,7 @@ public class MatchSource {
/** /**
* @Title: findURLs * @Title: findURLs
* @author hero * @author hero
* @Description: TODO(验证并匹配数据) * @Description: (验证并匹配数据)
* @param @param * @param @param
* s * s
* @param @param * @param @param
...@@ -91,17 +91,24 @@ public class MatchSource { ...@@ -91,17 +91,24 @@ public class MatchSource {
}else{ }else{
source = html.split("source\":\"")[1].split("\",\"")[0]; source = html.split("source\":\"")[1].split("\",\"")[0];
} }
}else{ }else if(url.contains("tech.china.com")){
//中华网科技
source = document.select("#chan_newsInfo").text().split("来源:")[1];
}else if(url.contains("caijing.com.cn")){
//财经网产经
source = document.select("#source_baidu").text();
}else{
//其他网站处理 //其他网站处理
source = mathchOtherSource(html, htmlBody, sourceList); source = mathchOtherSource(html, htmlBody, sourceList);
} }
if(source!=null){ if(source!=null){
//验证来源 //验证来源
for (String sourceMatch : sourceList) { // for (String sourceMatch : sourceList) {
if (source.contains(sourceMatch)) { // if (source.contains(sourceMatch)) {
return sourceMatch; // return sourceMatch;
} // }
} // }
return source;
} }
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
......
...@@ -3051,4 +3051,5 @@ ZOL中关村在线 ...@@ -3051,4 +3051,5 @@ ZOL中关村在线
邹城政务网 邹城政务网
走进中关村 走进中关村
最高人民法院网 最高人民法院网
最高人民检察院 最高人民检察院
\ No newline at end of file 今日湖北
\ No newline at end of file
Markdown is supported
Attaching a file: 0%. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment