Commit b13567ee by yangchen

添加来源判断

parent eba74706
......@@ -9,6 +9,8 @@ import java.util.Map.Entry;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.crawler.SourceForwardCrawler;
......@@ -77,13 +79,13 @@ public class SourceForward {
}
/**
 * Manual entry point: initialises {@code ProxyFactory}, passes a single hard-coded
 * article URL to {@code SourceForward.getSourceForward} and prints every returned
 * {@code SourceForwardBean} to standard output.
 *
 * NOTE(review): the ZooKeeper address and the article URL are hard-coded here, so
 * this looks like a developer-only debugging aid rather than production code — confirm.
 *
 * @param args command-line arguments; not read by this method
 */
public static void main(String[] args) {
// Disabled variant of the same steps, using a toutiao.com article URL instead.
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// List<String> urlList = new ArrayList<>();
// urlList.add("https://www.toutiao.com/a6634320415839748621");
// List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
// for(SourceForwardBean sfb : da) {
// System.out.println(sfb.toString());
// }
// Active run. ProxyFactory is initialised first; presumably the lookup below
// depends on it — verify against getSourceForward.
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// Build a one-element URL list holding the article to inspect.
List<String> urlList = new ArrayList<>();
urlList.add("http://www.northnews.cn/2019/0419/3080909.shtml");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
// Dump each returned bean via its toString() for visual inspection.
for(SourceForwardBean sfb : da) {
System.out.println(sfb.toString());
}
}
static class SourceForwardCrawlerThread extends Thread{
......
......@@ -39,7 +39,7 @@ public class MatchSource {
/**
* @Title: findURLs
* @author hero
* @Description: TODO(验证并匹配数据)
* @Description: (验证并匹配数据)
* @param @param
* s
* @param @param
......@@ -91,17 +91,24 @@ public class MatchSource {
}else{
source = html.split("source\":\"")[1].split("\",\"")[0];
}
}else{
}else if(url.contains("tech.china.com")){
//中华网科技
source = document.select("#chan_newsInfo").text().split("来源:")[1];
}else if(url.contains("caijing.com.cn")){
//财经网产经
source = document.select("#source_baidu").text();
}else{
//其他网站处理
source = mathchOtherSource(html, htmlBody, sourceList);
}
if(source!=null){
//验证来源
for (String sourceMatch : sourceList) {
if (source.contains(sourceMatch)) {
return sourceMatch;
}
}
// for (String sourceMatch : sourceList) {
// if (source.contains(sourceMatch)) {
// return sourceMatch;
// }
// }
return source;
}
} catch (Exception e) {
e.printStackTrace();
......
......@@ -3051,4 +3051,5 @@ ZOL中关村在线
邹城政务网
走进中关村
最高人民法院网
最高人民检察院
\ No newline at end of file
最高人民检察院
今日湖北
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment