Commit cd456869 by zhiwei

1.来源转发匹配精确化搜狐及新浪

2.来源转发匹配将东方头条、今日爆点、千寻生活、触电新闻自媒体匹配为原创
3.添加自媒体号媒体:今日头条、搜狐、东方头条、今日爆点、财经头条、百家号
parent 6ce658e0
package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.nodes.Node;
import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.TreateData;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic page processor that resolves the self-media account name of a
 * downloaded page and publishes it under the result field {@code "data"}.
 *
 * <p>The published value is a map with two entries: {@code "url"} (the page
 * URL) and {@code "mediaself"} (the matched account name, or {@code null}
 * when the page returned 404 or matching failed).
 */
public class MediaSelfSourcePageProcessor implements PageProcessor {

    /** Crawl configuration: retry count, sleep time, timeout and browser-like request headers. */
    private Site site = Site.me()
            .setCycleRetryTimes(3)
            .setSleepTime(1500)
            .setTimeOut(10000)
            .addHeader("User-Agent",
                    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .addHeader("Accept",
                    "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");

    /** Returns the crawl configuration used for every request of this processor. */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Matches the self-media account for {@code page} and stores the result.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        final String pageUrl = page.getUrl().get();

        String mediaSelf = null;
        try {
            // 404 pages carry no article, so matching is skipped for them.
            if (page.getStatusCode() != 404) {
                mediaSelf = TreateData.matchMediaSelfSource(pageUrl, page.getHtml().toString());
            }
        } catch (Exception e) {
            // Best effort: a page that cannot be matched is reported with a null account.
            mediaSelf = null;
        }

        System.out.println(pageUrl+"================="+mediaSelf);

        Map<String,String> record = new HashMap<String,String>();
        record.put("url", pageUrl);
        record.put("mediaself", mediaSelf);
        page.putField("data", record);
    }
}
...@@ -36,9 +36,8 @@ public class SourceForwardPageProcessor implements PageProcessor { ...@@ -36,9 +36,8 @@ public class SourceForwardPageProcessor implements PageProcessor {
if(page.getStatusCode()!=404){ if(page.getStatusCode()!=404){
channel = verifyChannel(page.getUrl().get()); channel = verifyChannel(page.getUrl().get());
if(channel==null){ if(channel==null){
List<Node> nodeList = page.getHtml().getDocument().head().childNodes();
List<Node> nodeList = page.getHtml().getDocument().head().childNodes(); channel = TreateData.matchChannel(nodeList);
channel = TreateData.matchChannel(nodeList);
} }
source = TreateData.matchSource(page.getUrl().get(),page.getHtml().toString(), sourceList); source = TreateData.matchSource(page.getUrl().get(),page.getHtml().toString(), sourceList);
} }
......
package com.zhiwei.source_forward.pipeline;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Pipeline that collects the {@code "data"} result field of every processed
 * page into an in-memory list.
 *
 * <p>The spider that uses this pipeline is started with several worker
 * threads, so {@link #process(ResultItems, Task)} can be entered
 * concurrently. All access to the backing list made through this class is
 * therefore guarded by the pipeline's own monitor; the default backing list
 * is a plain {@link ArrayList}, which is not safe for concurrent appends on
 * its own.
 */
public class MediaSelfSourceDataPipeline implements Pipeline {

    /** Collected result maps, in the order the worker threads delivered them. */
    private List<Map<String, Object>> dataList;

    /**
     * Creates a pipeline that appends to a caller-supplied list.
     *
     * @param dataList list receiving the collected result maps
     */
    public MediaSelfSourceDataPipeline(List<Map<String, Object>> dataList) {
        super();
        this.dataList = dataList;
    }

    /** Creates a pipeline backed by a new, empty list. */
    public MediaSelfSourceDataPipeline() {
        super();
        this.dataList = new ArrayList<>();
    }

    /**
     * Returns the backing list itself (not a copy).
     *
     * @return the list holding every collected result map
     */
    public synchronized List<Map<String, Object>> getDataList() {
        return dataList;
    }

    /**
     * Replaces the backing list.
     *
     * @param dataList list that receives subsequently collected result maps
     */
    public synchronized void setDataList(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }

    /**
     * Appends the page's {@code "data"} field to the list; pages without that
     * field are ignored.
     *
     * @param resultItems fields published by the page processor
     * @param task        the running spider task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> data = resultItems.get("data");
        if (data == null) {
            return;
        }
        // Serialize appends: concurrent ArrayList.add calls can drop or corrupt entries.
        synchronized (this) {
            dataList.add(data);
        }
    }
}
...@@ -4,8 +4,10 @@ import java.util.List; ...@@ -4,8 +4,10 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import com.zhiwei.source_forward.crawler.MediaSelfSourcePageProcessor;
import com.zhiwei.source_forward.crawler.SourceForwardPageProcessor; import com.zhiwei.source_forward.crawler.SourceForwardPageProcessor;
import com.zhiwei.source_forward.downloader.MyDownLoader; import com.zhiwei.source_forward.downloader.MyDownLoader;
import com.zhiwei.source_forward.pipeline.MediaSelfSourceDataPipeline;
import com.zhiwei.source_forward.pipeline.SourceForwardDataPipeline; import com.zhiwei.source_forward.pipeline.SourceForwardDataPipeline;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
...@@ -66,8 +68,37 @@ public class SourceForward { ...@@ -66,8 +68,37 @@ public class SourceForward {
/**
 * Looks up the self-media account name for every link in {@code dataMap}.
 *
 * <p>Each key of {@code dataMap} is crawled as a URL. For every crawled page
 * whose URL is a key of the map, the matched account name is stored in that
 * key's value map under {@code "自媒体号"}.
 *
 * @param dataMap rows keyed by article URL; the value maps are modified in place
 * @return the same {@code dataMap} instance, enriched with the account names
 */
public static Map<String,Map<String,Object>> getMediaSelfSource(Map<String,Map<String,Object>> dataMap){
    // Set up the crawl: one request per key of the input map.
    MediaSelfSourceDataPipeline collector = new MediaSelfSourceDataPipeline();
    Spider crawler = Spider.create(new MediaSelfSourcePageProcessor());
    for (String link : dataMap.keySet()) {
        crawler.addUrl(link);
    }
    crawler.setDownloader(new MyDownLoader());
    crawler.addPipeline(collector);
    crawler.thread(5).run();

    // Merge the crawl results back into the caller's rows.
    List<Map<String,Object>> crawledRows = collector.getDataList();
    for (Map<String,Object> crawled : crawledRows) {
        String link = crawled.get("url") + "";
        if (!dataMap.containsKey(link)) {
            continue;
        }
        Map<String,Object> row = dataMap.get(link);
        row.put("自媒体号", crawled.get("mediaself"));
        dataMap.put(link, row);
    }
    return dataMap;
}
......
...@@ -70,15 +70,24 @@ public class TreateData { ...@@ -70,15 +70,24 @@ public class TreateData {
}else if(url.contains("myzaker.com")){ }else if(url.contains("myzaker.com")){
//单独处理扎克网数据 //单独处理扎克网数据
source = document.select("div#article").select("span.auther").text(); source = document.select("div#article").select("span.auther").text();
}else if(url.contains("tech.sina.com.cn")){ }else if(url.contains("sina.com.cn") || url.contains("sohu.com")){
//单独处理新浪网-科技频道数据 //单独处理新浪网
source = document.select("span.source").text(); if(html.contains("<meta name=\"mediaid\"")){
}else if(url.contains("finance.sina.com.cn") || url.contains("news.sina.com.cn")){ source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0];
//单独处理新浪网-财经及新闻数据 }
source = document.select("div.page-info").select("span[data-sudaclick=\"media_name\"]").text(); }else if(url.contains("a.mini.eastday.com")){
}else if(url.contains("ent.sina.com.cn")){ //处理东方头条网-自媒体号匹配
//单独处理新浪网-娱乐 // source = document.select("[class=\"share_cnt_p clearfix\"]").select("div.fl").select("i").get(1).text();
source = document.select("div#top_bar").select("div.date-source").select("a").text(); source = "东方头条";
}else if(url.contains("orz520.com")){
//千寻生活网解析
source = "千寻生活";
}else if(url.contains("sh.qihoo.com")){
//今日报点解析
source = "今日爆点";
}else if(url.contains("itouchtv.cn")){
//触电新闻解析
source = "触电新闻";
}else{ }else{
//其他网站处理 //其他网站处理
source = mathchOtherSource(html, htmlBody, sourceList); source = mathchOtherSource(html, htmlBody, sourceList);
...@@ -92,11 +101,60 @@ public class TreateData { ...@@ -92,11 +101,60 @@ public class TreateData {
} }
} }
} catch (Exception e) { } catch (Exception e) {
System.out.println("+++++++++++++++++");
e.printStackTrace(); e.printStackTrace();
} }
return null; return null;
} }
/**
 * Extracts the self-media account name from a crawled article page.
 *
 * <p>The platform is chosen by substring match on {@code url}, checked in
 * this order: {@code toutiao.com}, {@code sohu.com},
 * {@code a.mini.eastday.com}, {@code sh.qihoo.com}, {@code cj.sina.com.cn},
 * {@code baijia.baidu.com}. The result is the platform label, a hyphen and
 * the account name, e.g. {@code "搜狐-<name>"}.
 *
 * <p>Matching is best effort: any failure (null arguments, unexpected page
 * layout) yields {@code null} instead of an exception. A page on which the
 * account name is missing or blank also yields {@code null}, never a bare
 * platform prefix.
 *
 * @param url  URL of the article page
 * @param html raw HTML of the article page
 * @return the prefixed account name, or {@code null} when the URL belongs to
 *         no supported platform or no account name could be extracted
 */
public static String matchMediaSelfSource(String url,String html) {
    try {
        if(url.contains("toutiao.com")){
            // Toutiao: the account is embedded in an inline script as  source: '<name>',
            return withPlatformPrefix("今日头条-", sliceBetween(html, " source: '", "',"));
        }
        if(url.contains("sohu.com")){
            // Sohu: the account is read from the "mediaid" meta tag.
            return withPlatformPrefix("搜狐-",
                    sliceBetween(html, "<meta name=\"mediaid\" content=\"", "\""));
        }
        if(url.contains("a.mini.eastday.com")){
            // Eastday headlines: second <i> inside the share bar's left block.
            // The DOM is parsed only in branches that need selectors.
            Document document = Jsoup.parse(html);
            String name = document.select("[class=\"share_cnt_p clearfix\"]")
                    .select("div.fl").select("i").get(1).text();
            return withPlatformPrefix("东方头条-", name);
        }
        if(url.contains("sh.qihoo.com")){
            Document document = Jsoup.parse(html);
            String name = document.select("p.info").select("span.source").text();
            return withPlatformPrefix("今日爆点-", name);
        }
        if(url.contains("cj.sina.com.cn")){
            // Sina finance headlines: same "mediaid" meta tag as Sohu.
            return withPlatformPrefix("财经头条-",
                    sliceBetween(html, "<meta name=\"mediaid\" content=\"", "\""));
        }
        if(url.contains("baijia.baidu.com")){
            Document document = Jsoup.parse(html);
            String name = document.select("section.info").select("span.author").text();
            return withPlatformPrefix("百家号-", name);
        }
        return null;
    } catch (Exception e) {
        // Deliberate best effort: an unparseable page simply has no account name.
        return null;
    }
}

/** Returns the text between the first {@code startMarker} and the next {@code endMarker}, or null if either is missing. */
private static String sliceBetween(String text, String startMarker, String endMarker) {
    int markerAt = text.indexOf(startMarker);
    if (markerAt < 0) {
        return null;
    }
    int from = markerAt + startMarker.length();
    int to = text.indexOf(endMarker, from);
    return to < 0 ? null : text.substring(from, to);
}

/** Returns {@code prefix + name}, or null when {@code name} is null or blank. */
private static String withPlatformPrefix(String prefix, String name) {
    if (name == null || name.trim().isEmpty()) {
        return null;
    }
    return prefix + name;
}
/** /**
* @Title: matchChannel * @Title: matchChannel
...@@ -154,7 +212,6 @@ public class TreateData { ...@@ -154,7 +212,6 @@ public class TreateData {
/**分割正文**/ /**分割正文**/
String[] matchTextArr = text.split("@@@@@@@@@@"); String[] matchTextArr = text.split("@@@@@@@@@@");
if(regex(fromRegex, matchTextArr[0]) != null || regex(fromRegex, matchTextArr[1])!=null){ if(regex(fromRegex, matchTextArr[0]) != null || regex(fromRegex, matchTextArr[1])!=null){
if(regex(fromRegex, matchTextArr[0])!=null){ if(regex(fromRegex, matchTextArr[0])!=null){
source = regex(fromRegex, matchTextArr[0]); source = regex(fromRegex, matchTextArr[0]);
for (String sourceMatch : sourceList) { for (String sourceMatch : sourceList) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment