Commit 7a6d49e2 by zhiwei

添加内容匹配

parent 82632f70
package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.nodes.Node;
import com.zhiwei.source_forward.util.MatchContent;
import com.zhiwei.source_forward.util.TreateData;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic page processor that extracts the main article text of a page.
 *
 * <p>For each downloaded page it emits one result field named {@code "content"}
 * holding a map with the keys {@code "url"} and {@code "content"}.
 *
 * @author hero
 */
public class ContentPageProcessor implements PageProcessor {

    /** Crawl settings: 3 cycle retries, 1.5 s sleep between requests, 10 s timeout. */
    private final Site site = Site.me().setCycleRetryTimes(3).setSleepTime(1500)
            .setTimeOut(10000)
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .addHeader("Accept-Encoding", "gzip, deflate, br");

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts the article body and publishes it under the {@code "content"} field.
     *
     * <p>The {@code "content"} value is {@code null} when the page answered 404 or
     * when extraction failed.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String url = page.getUrl().get();
        String content = null;
        try {
            if (page.getStatusCode() != 404) {
                // Keep the extraction result; it was previously computed and thrown away,
                // which left "content" permanently null.
                content = MatchContent.matchContent(url, page.getHtml().toString());
            }
        } catch (Exception e) {
            // Best effort: a page that cannot be parsed is reported with null content.
            content = null;
        }
        Map<String, String> data = new HashMap<String, String>();
        data.put("url", url);
        data.put("content", content);
        page.putField("content", data);
    }
}
......@@ -6,7 +6,8 @@ import java.util.Map;
import org.jsoup.nodes.Node;
import com.zhiwei.source_forward.util.TreateData;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
......@@ -31,14 +32,14 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
String channel = null;
try {
if(page.getStatusCode()!=404){
source = TreateData.matchMediaSelfSource(page.getUrl().get(),page.getHtml().toString());
source = MatchSource.matchMediaSelfSource(page.getUrl().get(),page.getHtml().toString());
if(source==null || source.equals("")){
source = null;
}
channel = TreateData.verifyChannel(page.getUrl().get());
channel = MatchChannel.verifyChannel(page.getUrl().get());
if(channel==null){
List<Node> nodeList = page.getHtml().getDocument().head().childNodes();
channel = TreateData.matchChannel(nodeList);
channel = MatchChannel.matchChannel(nodeList);
}
}
} catch (Exception e) {
......@@ -49,7 +50,7 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
data.put("mediaself", source);
data.put("channel", channel);
page.putField("data", data);
page.putField("mediaSelf", data);
}
}
......@@ -6,8 +6,10 @@ import java.util.Map;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.TreateData;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
......@@ -45,12 +47,12 @@ public class SourceForwardPageProcessor implements PageProcessor {
}
data.put("isforward", isforward);
}else{
channel = TreateData.verifyChannel(page.getUrl().get());
channel = MatchChannel.verifyChannel(page.getUrl().get());
if(channel==null){
List<Node> nodeList = page.getHtml().getDocument().head().childNodes();
channel = TreateData.matchChannel(nodeList);
channel = MatchChannel.matchChannel(nodeList);
}
source = TreateData.matchSource(page.getUrl().get(),page.getHtml().toString(), sourceList);
source = MatchSource.matchSource(page.getUrl().get(),page.getHtml().toString(), sourceList);
}
}
} catch (Exception e) {
......@@ -62,7 +64,7 @@ public class SourceForwardPageProcessor implements PageProcessor {
data.put("channel", channel);
data.put("root_source", source);
page.putField("data", data);
page.putField("sourceForward", data);
}
}
......@@ -35,7 +35,7 @@ public class UrlLivePageProcessor implements PageProcessor{
Map<String,Object> data = new HashMap<String,Object>();
data.put("url", page.getUrl().get());
data.put("live", f);
page.putField("data", data);
page.putField("urlLive", data);
}
@Override
......
package com.zhiwei.source_forward.pipeline;
import java.util.List;
import java.util.Map;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Pipeline that collects crawl results in memory, one list per result kind.
 *
 * <p>It reads the result fields {@code "content"}, {@code "mediaSelf"},
 * {@code "sourceForward"} and {@code "urlLive"} and appends each non-null value
 * to the matching list.
 *
 * @author hero
 */
public class DataPipeline implements Pipeline {

    private List<Map<String, Object>> contentDataList;
    private List<Map<String, Object>> mediaSelfDataList;
    private List<Map<String, Object>> sourceForwardDataList;
    private List<Map<String, Object>> urlLivedataList;

    /**
     * Creates a pipeline that appends to caller-supplied lists.
     *
     * @param dataList              ignored; kept only so existing callers keep compiling
     * @param contentDataList       receives {@code "content"} results
     * @param mediaSelfDataList     receives {@code "mediaSelf"} results
     * @param sourceForwardDataList receives {@code "sourceForward"} results
     * @param urlLivedataList       receives {@code "urlLive"} results
     */
    public DataPipeline(List<Map<String, Object>> dataList, List<Map<String, Object>> contentDataList,
            List<Map<String, Object>> mediaSelfDataList, List<Map<String, Object>> sourceForwardDataList,
            List<Map<String, Object>> urlLivedataList) {
        super();
        this.contentDataList = contentDataList;
        this.mediaSelfDataList = mediaSelfDataList;
        this.sourceForwardDataList = sourceForwardDataList;
        this.urlLivedataList = urlLivedataList;
    }

    /**
     * Creates a pipeline with its own empty result lists.
     *
     * <p>The lists must exist before {@link #process} runs; leaving them null made
     * the first collected result fail with a NullPointerException.
     */
    public DataPipeline() {
        super();
        this.contentDataList = newResultList();
        this.mediaSelfDataList = newResultList();
        this.sourceForwardDataList = newResultList();
        this.urlLivedataList = newResultList();
    }

    /**
     * Builds an empty result list.
     *
     * <p>Synchronized because callers start the spider with several threads, so
     * {@link #process} can presumably be invoked concurrently.
     */
    private static List<Map<String, Object>> newResultList() {
        return java.util.Collections.synchronizedList(new java.util.ArrayList<Map<String, Object>>());
    }

    /** Appends the map stored under {@code key}, if any, to {@code target}. */
    private static void collect(ResultItems resultItems, String key, List<Map<String, Object>> target) {
        Map<String, Object> data = resultItems.get(key);
        if (data != null) {
            target.add(data);
        }
    }

    /**
     * Routes each result field of one page to its list.
     *
     * @param resultItems fields emitted by the page processor
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        collect(resultItems, "content", contentDataList);
        collect(resultItems, "mediaSelf", mediaSelfDataList);
        collect(resultItems, "sourceForward", sourceForwardDataList);
        collect(resultItems, "urlLive", urlLivedataList);
    }

    public List<Map<String, Object>> getContentDataList() {
        return contentDataList;
    }

    public void setContentDataList(List<Map<String, Object>> contentDataList) {
        this.contentDataList = contentDataList;
    }

    public List<Map<String, Object>> getMediaSelfDataList() {
        return mediaSelfDataList;
    }

    public void setMediaSelfDataList(List<Map<String, Object>> mediaSelfDataList) {
        this.mediaSelfDataList = mediaSelfDataList;
    }

    public List<Map<String, Object>> getSourceForwardDataList() {
        return sourceForwardDataList;
    }

    public void setSourceForwardDataList(List<Map<String, Object>> sourceForwardDataList) {
        this.sourceForwardDataList = sourceForwardDataList;
    }

    public List<Map<String, Object>> getUrlLivedataList() {
        return urlLivedataList;
    }

    public void setUrlLivedataList(List<Map<String, Object>> urlLivedataList) {
        this.urlLivedataList = urlLivedataList;
    }
}
package com.zhiwei.source_forward.pipeline;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Pipeline that accumulates the {@code "data"} result field of every page in a list.
 */
public class MediaSelfSourceDataPipeline implements Pipeline {

    /** Collected result maps, in the order they were received. */
    private List<Map<String, Object>> dataList;

    /** Creates a pipeline backed by a fresh, empty list. */
    public MediaSelfSourceDataPipeline() {
        this(new ArrayList<Map<String, Object>>());
    }

    /**
     * Creates a pipeline that appends to the given list.
     *
     * @param dataList list that receives the collected result maps
     */
    public MediaSelfSourceDataPipeline(List<Map<String, Object>> dataList) {
        super();
        this.dataList = dataList;
    }

    /**
     * Appends the page's {@code "data"} map to the list; pages without one are skipped.
     *
     * @param resultItems fields emitted by the page processor
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> collected = resultItems.get("data");
        if (collected == null) {
            return;
        }
        dataList.add(collected);
    }

    public List<Map<String, Object>> getDataList() {
        return dataList;
    }

    public void setDataList(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }
}
package com.zhiwei.source_forward.pipeline;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Pipeline that accumulates the {@code "data"} result field of every page in a list.
 */
public class SourceForwardDataPipeline implements Pipeline {

    /** Collected result maps, in the order they were received. */
    private List<Map<String, Object>> dataList;

    /** Creates a pipeline backed by a fresh, empty list. */
    public SourceForwardDataPipeline() {
        this(new ArrayList<Map<String, Object>>());
    }

    /**
     * Creates a pipeline that appends to the given list.
     *
     * @param dataList list that receives the collected result maps
     */
    public SourceForwardDataPipeline(List<Map<String, Object>> dataList) {
        super();
        this.dataList = dataList;
    }

    /**
     * Appends the page's {@code "data"} map to the list; pages without one are skipped.
     *
     * @param resultItems fields emitted by the page processor
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> collected = resultItems.get("data");
        if (collected == null) {
            return;
        }
        dataList.add(collected);
    }

    public List<Map<String, Object>> getDataList() {
        return dataList;
    }

    public void setDataList(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }
}
package com.zhiwei.source_forward.pipeline;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Pipeline that accumulates the {@code "data"} result field of every page in a list.
 */
public class UrlLivePipeline implements Pipeline {

    /** Collected result maps, in the order they were received. */
    private List<Map<String, Object>> dataList;

    /** Creates a pipeline backed by a fresh, empty list. */
    public UrlLivePipeline() {
        this(new ArrayList<Map<String, Object>>());
    }

    /**
     * Creates a pipeline that appends to the given list.
     *
     * @param dataList list that receives the collected result maps
     */
    public UrlLivePipeline(List<Map<String, Object>> dataList) {
        super();
        this.dataList = dataList;
    }

    /**
     * Appends the page's {@code "data"} map to the list; pages without one are skipped.
     *
     * @param resultItems fields emitted by the page processor
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> collected = resultItems.get("data");
        if (collected == null) {
            return;
        }
        dataList.add(collected);
    }

    public List<Map<String, Object>> getDataList() {
        return dataList;
    }

    public void setDataList(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }
}
package com.zhiwei.source_forward.run;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.source_forward.crawler.ContentPageProcessor;
import com.zhiwei.source_forward.downloader.MyDownLoader;
import com.zhiwei.source_forward.pipeline.DataPipeline;
import us.codecraft.webmagic.Spider;
public class ContentMatch {

    /**
     * Crawls every URL used as a key of {@code dataMap} and stores the extracted
     * article text in that URL's record under the key {@code "content"}.
     *
     * <p>Records whose URL produced no crawl result are left untouched. The stored
     * value is {@code null} when the page was crawled but no text could be extracted.
     *
     * @author hero
     * @param dataMap records keyed by article URL; updated in place
     * @return the same {@code dataMap} instance
     */
    public static Map<String, Map<String, Object>> getContent(Map<String, Map<String, Object>> dataMap) {
        if (dataMap == null || dataMap.isEmpty()) {
            return dataMap;
        }

        // Run the content-extraction crawl over all URLs.
        DataPipeline pipeline = new DataPipeline();
        Spider spider = Spider.create(new ContentPageProcessor());
        for (String url : dataMap.keySet()) {
            spider.addUrl(url);
        }
        spider.setDownloader(new MyDownLoader());
        spider.addPipeline(pipeline);
        spider.thread(5).run();

        List<Map<String, Object>> contentList = pipeline.getContentDataList();
        if (contentList == null) {
            // Nothing was collected; return the input unchanged instead of failing.
            return dataMap;
        }

        // Merge the crawled text back into the caller's records.
        for (Map<String, Object> crawled : contentList) {
            String url = crawled.get("url") + "";
            Map<String, Object> data = dataMap.get(url);
            if (data == null) {
                continue;
            }
            // Read from the crawl result, not from the record being updated, and keep
            // a missing value as null rather than the literal string "null".
            Object content = crawled.get("content");
            data.put("content", content != null ? content.toString() : null);
        }
        return dataMap;
    }
}
......@@ -8,8 +8,7 @@ import java.util.Map.Entry;
import com.zhiwei.source_forward.crawler.MediaSelfSourcePageProcessor;
import com.zhiwei.source_forward.crawler.SourceForwardPageProcessor;
import com.zhiwei.source_forward.downloader.MyDownLoader;
import com.zhiwei.source_forward.pipeline.MediaSelfSourceDataPipeline;
import com.zhiwei.source_forward.pipeline.SourceForwardDataPipeline;
import com.zhiwei.source_forward.pipeline.DataPipeline;
import us.codecraft.webmagic.Spider;
......@@ -31,7 +30,7 @@ public class SourceForward {
*/
public static Map<String,Map<String,Object>> getSourceForward(Map<String,Map<String,Object>> dataMap){
//启动验证来源程序
SourceForwardDataPipeline pipeline = new SourceForwardDataPipeline();
DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new SourceForwardPageProcessor());
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
spider.addUrl(entry.getKey());
......@@ -40,7 +39,7 @@ public class SourceForward {
spider.addPipeline(pipeline);
spider.thread(5).run();
List<Map<String,Object>> sourceForwardList = pipeline.getDataList();
List<Map<String,Object>> sourceForwardList = pipeline.getSourceForwardDataList();
for(Map<String,Object> sourceMap : sourceForwardList){
String url = sourceMap.get("url")+"";
String root_source = sourceMap.get("root_source")!=null?sourceMap.get("root_source").toString():null;
......@@ -85,7 +84,7 @@ public class SourceForward {
*/
public static Map<String,Map<String,Object>> getMediaSelfSource(Map<String,Map<String,Object>> dataMap){
//启动验证来源程序
MediaSelfSourceDataPipeline pipeline = new MediaSelfSourceDataPipeline();
DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new MediaSelfSourcePageProcessor());
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
spider.addUrl(entry.getKey());
......@@ -94,7 +93,7 @@ public class SourceForward {
spider.addPipeline(pipeline);
spider.thread(5).run();
List<Map<String,Object>> sourceForwardList = pipeline.getDataList();
List<Map<String,Object>> sourceForwardList = pipeline.getSourceForwardDataList();
for(Map<String,Object> sourceMap : sourceForwardList){
String url = sourceMap.get("url")+"";
//整合数据及验证转发原创
......@@ -119,7 +118,7 @@ public class SourceForward {
public static Map<String,String> getMediaSelfSource(List<String> urlList){
//启动验证来源程序
Map<String,String> dataMap = new HashMap<String,String>();
MediaSelfSourceDataPipeline pipeline = new MediaSelfSourceDataPipeline();
DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new MediaSelfSourcePageProcessor());
for(String url : urlList){
spider.addUrl(url);
......@@ -129,7 +128,7 @@ public class SourceForward {
spider.addPipeline(pipeline);
spider.thread(5).run();
List<Map<String,Object>> sourceForwardList = pipeline.getDataList();
List<Map<String,Object>> sourceForwardList = pipeline.getMediaSelfDataList();
for(Map<String,Object> sourceMap : sourceForwardList){
String url = sourceMap.get("url")+"";
//整合数据及验证转发原创
......@@ -152,14 +151,14 @@ public class SourceForward {
*/
public static String getMediaSelfSource(String url){
//启动验证来源程序
MediaSelfSourceDataPipeline pipeline = new MediaSelfSourceDataPipeline();
DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new MediaSelfSourcePageProcessor());
spider.addUrl(url);
spider.setDownloader(new MyDownLoader());
spider.addPipeline(pipeline);
spider.thread(1).run();
List<Map<String,Object>> sourceForwardList = pipeline.getDataList();
List<Map<String,Object>> sourceForwardList = pipeline.getMediaSelfDataList();
for(Map<String,Object> sourceMap : sourceForwardList){
return sourceMap.get("mediaself").toString();
}
......
......@@ -5,7 +5,7 @@ import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.source_forward.crawler.UrlLivePageProcessor;
import com.zhiwei.source_forward.pipeline.UrlLivePipeline;
import com.zhiwei.source_forward.pipeline.DataPipeline;
import us.codecraft.webmagic.Spider;
......@@ -28,7 +28,7 @@ public class URLLive {
*/
public static Map<String,Map<String,Object>> verificationURLLive(Map<String,Map<String,Object>> dataMap){
//启动验证链接是否有效程序程序
UrlLivePipeline pipeline = new UrlLivePipeline();
DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new UrlLivePageProcessor());
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
spider.addUrl(entry.getKey());
......@@ -37,7 +37,7 @@ public class URLLive {
spider.thread(5).run();
//验证数据是否已删除
List<Map<String,Object>> dataList = pipeline.getDataList();
List<Map<String,Object>> dataList = pipeline.getUrlLivedataList();
for(Map<String,Object> data : dataList){
String url = data.get("url")+"";
if(!url.contains("http")){
......
package com.zhiwei.source_forward.util;
import java.util.List;
import org.jsoup.nodes.Node;
/**
 * Heuristics that classify an article into a channel (news, finance, tech, ...)
 * from its URL or from its HTML title.
 *
 * @author hero
 */
public class MatchChannel {

    /** Channel returned when no title keyword matches. */
    private static final String DEFAULT_CHANNEL = "新闻";

    /**
     * URL rules in priority order. Element 0 is the channel; the remaining elements
     * are URL fragments, any of which selects that channel. The first matching row wins.
     */
    private static final String[][] URL_RULES = {
        {"新闻", "news.", "cj.sina.com.cn", "wemedia.ifeng.com"},
        {"财经", "finance.", "business.", "money.", "stock.", "10jqka.com.cn"},
        {"科技", "tech.", "it.", "pcedu.", "mobile.", "vr."},
        {"体育", "sports."},
        {"娱乐", "ent.", "yule."},
        {"汽车", "auto."},
        {"时尚", "fashion."},
        {"教育", "learning.", "edu."},
        {"母婴", "baobao."},
        {"房产", "house.", "leju.", "focus."},
        {"游戏", "games."},
        {"国际", "intl."},
        {"科学", "science."},
        {"城市", "city."},
        {"市场", "sc."},
    };

    /**
     * Title keywords in priority order. Each keyword is also the channel it selects.
     */
    private static final String[] TITLE_KEYWORDS = {
        "财经", "金融", "经济", "科技", "时尚", "互联网", "数码", "科学",
        "TMT", "通讯", "社会", "IT", "房产", "母婴", "3C",
    };

    /**
     * Derives the channel from the page title found among the given nodes.
     *
     * <p>Only the part of the title after its first {@code "_"} is examined; the
     * first node whose HTML contains {@code <title>} decides the result.
     *
     * @author hero
     * @param list nodes to search (the processors pass the children of {@code <head>})
     * @return the matched channel, or {@code "新闻"} when nothing matches or parsing fails
     */
    public static String matchChannel(List<Node> list) {
        if (list == null) {
            return DEFAULT_CHANNEL;
        }
        try {
            for (Node node : list) {
                String html = node.outerHtml();
                if (!html.contains("<title>")) {
                    continue;
                }
                String[] parts = html.split("<title>")[1].split("</title>")[0].split("_");
                // Re-join everything after the first "_" segment, each followed by "_".
                StringBuilder suffix = new StringBuilder();
                for (int i = 1; i < parts.length; i++) {
                    suffix.append(parts[i]).append('_');
                }
                return getChannel(suffix.toString());
            }
        } catch (Exception ignored) {
            // Best effort: an unparsable title falls back to the default channel.
        }
        return DEFAULT_CHANNEL;
    }

    /**
     * Derives the channel from fragments of the article URL.
     *
     * @author hero
     * @param url article URL; may be {@code null}
     * @return the channel of the first matching rule, or {@code null} when no rule
     *         matches or {@code url} is {@code null}
     */
    public static String verifyChannel(String url) {
        if (url == null) {
            return null;
        }
        for (String[] rule : URL_RULES) {
            for (int i = 1; i < rule.length; i++) {
                if (url.contains(rule[i])) {
                    return rule[0];
                }
            }
        }
        return null;
    }

    /**
     * Maps free text (typically a title suffix) to a channel by keyword.
     *
     * @author hero
     * @param source text to inspect; may be {@code null}
     * @return the first keyword contained in {@code source}, or {@code "新闻"} when
     *         none is contained or {@code source} is {@code null}
     */
    public static String getChannel(String source) {
        if (source == null) {
            return DEFAULT_CHANNEL;
        }
        for (String keyword : TITLE_KEYWORDS) {
            if (source.contains(keyword)) {
                return keyword;
            }
        }
        return DEFAULT_CHANNEL;
    }
}
package com.zhiwei.source_forward.util;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News;
/**
 * Extracts the main article text from a page's HTML.
 *
 * @author hero
 */
public class MatchContent {

    /**
     * Returns the article text of the given HTML.
     *
     * @author hero
     * @param url  page URL (currently unused; kept for interface compatibility)
     * @param html raw page HTML; may be {@code null}
     * @return the extracted text, or {@code null} when {@code html} is {@code null}
     *         or extraction fails entirely
     */
    public static String matchContent(String url, String html) {
        if (html == null) {
            return null;
        }
        try {
            return extractContent(html);
        } catch (Exception e) {
            // Both the extractor and the plain-text fallback failed.
            return null;
        }
    }

    /**
     * Runs the content extractor and, if it fails, falls back to the plain text
     * of the whole document.
     *
     * @param html raw page HTML
     * @return extracted article text, or the full document text on extractor failure
     */
    private static String extractContent(String html) {
        try {
            News news = ContentExtractor.getNewsByHtml(html);
            return TreateData.filterSpecialCharacter(news.getContent());
        } catch (Exception e) {
            System.out.println("正文抽取失败处理........");
            e.printStackTrace();
            // Parse only on this path; the successful path never needs the document.
            return Jsoup.parse(html).text();
        }
    }
}
package com.zhiwei.source_forward.util;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News;
/**
 * Heuristics that determine the original source (publisher) or the self-media
 * account of an article from its URL and HTML.
 *
 * @author hero
 */
public class MatchSource {

    // NOTE(review): several alternatives below look identical as rendered here; they may
    // originally have differed in full-width vs. ASCII colons - confirm against the repo.
    private static final String fromRegex = "(来源:(\\S)*(\\s)*[\\S]*)|(来源:(\\S)*(\\s)*[\\S]*)|(本文来源于(\\S)*(\\s)*[\\S]*)"
            + "|(源:(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)"
            + "|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)"
            + "|(出自:(\\S)*(\\s)*[\\S]*)|(出自:(\\S)*(\\s)*[\\S]*)" + "|(转自:(\\S)*(\\s)*[\\S]*)|(转自:(\\S)*(\\s)*[\\S]*)"
            + "|(出处\\/作者:(\\S)*(\\s)*[\\S]*)|(出处\\/作者:(\\S)*(\\s)*[\\S]*)"
            + "|(出处:(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)|(稿源:(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)";

    private static final String timeRegex = ""
            + "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})"
            + "|([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})"
            + "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})"
            + "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
            + "|(\\d{4}年\\d{0,2}月\\d{0,2}日)"
            + "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})"
            + "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
            + "|(\\d{0,2}月\\d{0,2}日)"
            + "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
            + "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
            + "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2})";

    /** Placeholder substituted for the article body so the surrounding text can be split off. */
    private static final String CONTENT_MARKER = "@@@@@@@@@@";

    /** Number of characters next to a date that are searched for a source name. */
    private static final int TIME_WINDOW = 30;

    /**
     * Determines which known source an article originates from.
     *
     * <p>A few sites are handled by dedicated selectors or fixed names; every other
     * site goes through the generic text heuristics. The candidate text is then
     * checked against {@code sourceList}.
     *
     * @author hero
     * @param url        article URL, used to pick the site-specific rule
     * @param html       raw page HTML
     * @param sourceList known source names, in priority order
     * @return the first entry of {@code sourceList} contained in the detected source
     *         text, or {@code null} when nothing matches or an error occurs
     */
    public static String matchSource(String url, String html, List<String> sourceList) {
        String source = null;
        Document document = Jsoup.parse(html);
        String htmlBody = TreateData.filterSpecialCharacter(document.select("body").text().toUpperCase());
        try {
            // Site-specific handling.
            if (url.contains("thepaper.cn")) {
                source = document.select("div.news_about").text();
            } else if (url.contains("sports.eastday.com")) {
                source = document.select("div.article").select("span").text();
            } else if (url.contains("lesports.com")) {
                source = document.select("div.article-source").select("strong").text();
            } else if (url.contains("myzaker.com")) {
                source = document.select("div#article").select("span.auther").text();
            } else if (url.contains("sina.com.cn") || url.contains("sohu.com")) {
                if (html.contains("<meta name=\"mediaid\"")) {
                    source = mediaIdMeta(html);
                }
            } else if (url.contains("a.mini.eastday.com")) {
                source = "东方头条";
            } else if (url.contains("orz520.com")) {
                source = "千寻生活";
            } else if (url.contains("sh.qihoo.com")) {
                source = "今日爆点";
            } else if (url.contains("itouchtv.cn")) {
                source = "触电新闻";
            } else if (url.contains("yidianzixun.com")) {
                if (html.contains("related_wemedia")) {
                    source = "一点资讯";
                } else {
                    source = html.split("source\":\"")[1].split("\",\"")[0];
                }
            } else {
                // Generic handling for all other sites.
                source = matchGenericSource(html, htmlBody, sourceList);
            }
            // Validate the candidate against the known sources.
            return firstContained(source, sourceList);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Determines the self-media account that published an article on a supported platform.
     *
     * @author hero
     * @param url  article URL, used to pick the platform rule
     * @param html raw page HTML
     * @return the account name, usually prefixed with {@code "<platform>-"}; {@code null}
     *         when the platform is not supported or parsing fails; may be an empty
     *         string when the page element exists but is empty
     */
    public static String matchMediaSelfSource(String url, String html) {
        String source = null;
        Document document = Jsoup.parse(html);
        try {
            if (url.contains("toutiao.com")) {
                if (html.contains("name: '")) {
                    source = html.split("mediaInfo:")[1].split("name: '")[1].split("',")[0].trim();
                } else if (html.contains("screen_name:")) {
                    source = html.split("screen_name:'")[1].split("',")[0].trim();
                }
                source = withPrefix("今日头条-", source);
            } else if (url.contains("sohu.com")) {
                if (html.contains("<meta name=\"mediaid\"")) {
                    source = withPrefix("搜狐-", mediaIdMeta(html).trim());
                }
            } else if (url.contains("a.mini.eastday.com")) {
                source = document.select("[class=\"share_cnt_p clearfix\"]").select("div.fl").select("i").get(1).text().trim();
                source = withPrefix("东方头条-", source);
            } else if (url.contains("sh.qihoo.com")) {
                source = document.select("p.info").select("span.source").text().trim();
                source = withPrefix("快资讯-", source);
            } else if (url.contains("cj.sina.com.cn")) {
                if (html.contains("<meta name=\"mediaid\"")) {
                    source = withPrefix("财经头条-", mediaIdMeta(html).trim());
                }
            } else if (url.contains("baijia.baidu.com")) {
                source = document.select("section.info").select("span.author").text().trim();
                source = withPrefix("百度百家-", source);
            } else if (url.contains("yidianzixun.com")) {
                if (html.contains("related_wemedia")) {
                    source = html.split("media_name\":\"")[1].split("\",\"")[0].trim();
                    source = withPrefix("一点资讯-", source);
                } else {
                    // Not a self-media article: the raw source name is returned without prefix.
                    source = html.split("source\":\"")[1].split("\",\"")[0];
                }
            } else if (url.contains("news.bitauto.com")) {
                source = document.select("[class=\"gz-box clearfix\"]").select("div.txt-box")
                        .select("p.p-n").select("a").text();
                source = withPrefix("易车网-", source);
            } else if (url.contains("chejiahao.autohome.com.cn")) {
                source = document.select("div.authorMes").select("[class=\"name text-overflow\"]")
                        .select("a").text();
                source = withPrefix("汽车之家-", source);
            }
            return source;
        } catch (Exception e) {
            return null;
        }
    }

    /** Reads the {@code content} attribute of the {@code mediaid} meta tag from raw HTML. */
    private static String mediaIdMeta(String html) {
        return html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0];
    }

    /** Prepends {@code prefix} to a non-empty name; null or empty names are returned unchanged. */
    private static String withPrefix(String prefix, String name) {
        if (name != null && !name.equals("")) {
            return prefix + name;
        }
        return name;
    }

    /** Returns the first entry of {@code sourceList} contained in {@code text}, or null. */
    private static String firstContained(String text, List<String> sourceList) {
        if (text == null) {
            return null;
        }
        for (String candidate : sourceList) {
            if (text.contains(candidate)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Generic source detection for sites without a dedicated rule.
     *
     * <p>The article body is located with the content extractor and cut out of the
     * page text, so that "source:" markers and dates are searched in the text around
     * the article first and in the article itself only afterwards. If extraction
     * fails, the whole page text is searched instead.
     *
     * @param html       raw page HTML
     * @param htmlBody   upper-cased, filtered text of the page body
     * @param sourceList known source names
     * @return a matching entry of {@code sourceList}, or {@code null}
     */
    private static String matchGenericSource(String html, String htmlBody, List<String> sourceList) {
        try {
            News news = ContentExtractor.getNewsByHtml(html);
            String content = TreateData.filterSpecialCharacter(news.getContent().toUpperCase());
            String title = TreateData.filterSpecialCharacter(news.getTitle().toUpperCase());

            // Cut the article body out and keep the text before and after it.
            String[] pieces = htmlBody.replace(content, CONTENT_MARKER).split(CONTENT_MARKER);
            // split() drops trailing empty strings, so either side may be missing
            // (body not found in the page text, or body at the very end of it).
            String before = pieces.length > 0 ? pieces[0] : "";
            String after = pieces.length > 1 ? pieces[1] : "";

            // 1. An explicit "source:" marker around the article decides the outcome.
            String marker = TreateData.regex(fromRegex, before);
            if (marker == null) {
                marker = TreateData.regex(fromRegex, after);
            }
            if (marker != null) {
                return firstContained(marker, sourceList);
            }

            // 2. Otherwise look for a source name next to a date near the title.
            String nearTitle = sourceNearTitleDate(before, title, sourceList);
            if (nearTitle == null) {
                nearTitle = sourceNearTitleDate(after, title, sourceList);
            }
            if (nearTitle != null) {
                return nearTitle;
            }

            // 3. Nothing around the article: search the article text itself.
            marker = TreateData.regex(fromRegex, content);
            if (marker != null) {
                return firstContained(marker, sourceList);
            }
            if (content.contains(title)) {
                return sourceNearTitleDate(content, title, sourceList);
            }
            return sourceNearFirstDate(content, sourceList);
        } catch (Exception e) {
            System.out.println("正文抽取失败处理........");
            e.printStackTrace();
            // Extraction failed: apply the same rules to the whole page text.
            String marker = TreateData.regex(fromRegex, htmlBody);
            if (marker != null) {
                return firstContained(marker, sourceList);
            }
            return sourceNearFirstDate(htmlBody, sourceList);
        }
    }

    /**
     * Splits {@code text} at every occurrence of {@code title} and looks for a source
     * name next to a date in each part (e.g. "YYYY-MM-dd xx日报" or "xx日报 YYYY-MM-dd").
     *
     * @return a matching source, or {@code null} when {@code text} does not contain
     *         the title or no part yields a match
     */
    private static String sourceNearTitleDate(String text, String title, List<String> sourceList) {
        if (!text.contains(title)) {
            return null;
        }
        // The title is literal text, not a pattern: quote it so characters such as
        // '?', '(' or '+' cannot break or distort the split.
        for (String part : text.split(java.util.regex.Pattern.quote(title))) {
            String found = sourceNearFirstDate(part, sourceList);
            if (found != null) {
                return found;
            }
        }
        return null;
    }

    /** Finds the first date in {@code text} and searches its surroundings for a source name. */
    private static String sourceNearFirstDate(String text, List<String> sourceList) {
        String timeSource = TreateData.regex(timeRegex, text);
        if (timeSource == null) {
            return null;
        }
        return getSourceByTime(timeSource, text, sourceList);
    }

    /**
     * Searches the text next to each occurrence of a date string for a known source.
     *
     * <p>The text is split at the date. For the part before the first occurrence the
     * last {@value #TIME_WINDOW} characters are inspected; for every later part, the
     * first {@value #TIME_WINDOW} characters.
     *
     * @author hero
     * @param timeSource the literal date text that was matched
     * @param htmlBody   text containing the date
     * @param sourceList known source names
     * @return a matching entry of {@code sourceList}, or {@code null}
     */
    private static String getSourceByTime(String timeSource, String htmlBody, List<String> sourceList) {
        // The matched date is literal text; quote it so it is not re-interpreted as a regex.
        String[] segments = htmlBody.split(java.util.regex.Pattern.quote(timeSource));
        for (int j = 0; j < segments.length; j++) {
            String segment = segments[j];
            String window;
            if (segment.length() < TIME_WINDOW) {
                window = segment;
            } else if (j == 0) {
                window = segment.substring(segment.length() - TIME_WINDOW);
            } else {
                window = segment.substring(0, TIME_WINDOW);
            }
            String found = firstContained(window, sourceList);
            if (found != null) {
                return found;
            }
        }
        return null;
    }
}
package com.zhiwei.source_forward.util;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News;
/**
* @ClassName: TreateData
* @Description: TODO(数据处理类)
......@@ -19,365 +10,6 @@ import cn.edu.hfut.dmic.contentextractor.News;
*/
public class TreateData {
private static String fromRegex = "(来源:(\\S)*(\\s)*[\\S]*)|(来源:(\\S)*(\\s)*[\\S]*)|(本文来源于(\\S)*(\\s)*[\\S]*)"
+ "|(源:(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)"
+ "|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)"
+ "|(出自:(\\S)*(\\s)*[\\S]*)|(出自:(\\S)*(\\s)*[\\S]*)" + "|(转自:(\\S)*(\\s)*[\\S]*)|(转自:(\\S)*(\\s)*[\\S]*)"
+ "|(出处\\/作者:(\\S)*(\\s)*[\\S]*)|(出处\\/作者:(\\S)*(\\s)*[\\S]*)"
+ "|(出处:(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)|(稿源:(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)";
private static String timeRegex = ""
+ "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})"
+ "|([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})"
+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日)"
+ "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
+ "|(\\d{0,2}月\\d{0,2}日)"
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2})"
;
/**
 * Resolves the publishing source of an article page against a whitelist of known names.
 *
 * <p>A raw attribution string is first obtained, either by a site-specific rule chosen from
 * the URL or, for every other site, by the generic matcher. The raw string is then checked
 * against {@code sourceList} and the first whitelist entry it contains is returned.
 *
 * @param url        article URL; used only to pick the site-specific rule
 * @param html       raw page HTML
 * @param sourceList whitelist of acceptable source names
 * @return the first entry of {@code sourceList} contained in the raw attribution, or
 *         {@code null} when nothing matches or any exception occurs while extracting
 */
public static String matchSource(String url,String html, List<String> sourceList) {
    // Parsing and body-text normalisation happen outside the try block, exactly as before,
    // so failures here still propagate to the caller.
    Document doc = Jsoup.parse(html);
    String bodyText = filterSpecialCharacter(doc.select("body").text().toUpperCase());
    try {
        String raw = locateRawSource(url, html, doc, bodyText, sourceList);
        if (raw == null) {
            return null;
        }
        // Only names present in the whitelist are accepted as a result.
        for (String candidate : sourceList) {
            if (raw.contains(candidate)) {
                return candidate;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}

/**
 * Picks the extraction rule for the given URL and returns the raw attribution text.
 * Returns {@code null} when the selected rule finds nothing.
 */
private static String locateRawSource(String url, String html, Document doc,
        String bodyText, List<String> sourceList) {
    if (url.contains("thepaper.cn")) {
        // The Paper
        return doc.select("div.news_about").text();
    }
    if (url.contains("sports.eastday.com")) {
        // Eastday Sports
        return doc.select("div.article").select("span").text();
    }
    if (url.contains("lesports.com")) {
        // LeSports
        return doc.select("div.article-source").select("strong").text();
    }
    if (url.contains("myzaker.com")) {
        // ZAKER
        return doc.select("div#article").select("span.auther").text();
    }
    if (url.contains("sina.com.cn") || url.contains("sohu.com")) {
        // Sina / Sohu: the media id is carried in a <meta name="mediaid"> tag
        if (!html.contains("<meta name=\"mediaid\"")) {
            return null;
        }
        return html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0];
    }
    if (url.contains("a.mini.eastday.com")) {
        // Eastday Toutiao: fixed platform name
        return "东方头条";
    }
    if (url.contains("orz520.com")) {
        return "千寻生活";
    }
    if (url.contains("sh.qihoo.com")) {
        return "今日爆点";
    }
    if (url.contains("itouchtv.cn")) {
        return "触电新闻";
    }
    if (url.contains("yidianzixun.com")) {
        // Yidian Zixun: self-media pages get the platform name, others the embedded "source" value
        if (html.contains("related_wemedia")) {
            return "一点资讯";
        }
        return html.split("source\":\"")[1].split("\",\"")[0];
    }
    // Every other site goes through the generic matcher.
    return mathchOtherSource(html, bodyText, sourceList);
}
/**
 * Extracts the self-media (platform account) name from an article page.
 *
 * <p>The rule is chosen from the URL. For most platforms a non-empty account name is
 * returned as {@code "<platform>-<account>"}; an empty extraction is returned unchanged.
 *
 * @param url  article URL; used only to pick the platform rule
 * @param html raw page HTML
 * @return the (possibly prefixed) account name, {@code null} when the URL matches no rule or
 *         the rule finds nothing, and {@code null} when extraction throws
 */
public static String matchMediaSelfSource(String url,String html) {
    Document doc = Jsoup.parse(html);
    try {
        String account = null;
        // Prefix to prepend once an account name has been found; null means "no prefix".
        String platform = null;
        if (url.contains("toutiao.com")) {
            // Toutiao account name
            platform = "今日头条-";
            if (html.contains("name: '")) {
                account = html.split("mediaInfo:")[1].split("name: '")[1].split("',")[0].trim();
            } else if (html.contains("screen_name:")) {
                account = html.split("screen_name:'")[1].split("',")[0].trim();
            }
        } else if (url.contains("sohu.com")) {
            // Sohu self-media id
            platform = "搜狐-";
            if (html.contains("<meta name=\"mediaid\"")) {
                account = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
            }
        } else if (url.contains("a.mini.eastday.com")) {
            // Eastday Toutiao self-media name
            platform = "东方头条-";
            account = doc.select("[class=\"share_cnt_p clearfix\"]").select("div.fl")
                    .select("i").get(1).text().trim();
        } else if (url.contains("sh.qihoo.com")) {
            platform = "快资讯-";
            account = doc.select("p.info").select("span.source").text().trim();
        } else if (url.contains("cj.sina.com.cn")) {
            // Sina Finance headline account
            platform = "财经头条-";
            if (html.contains("<meta name=\"mediaid\"")) {
                account = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
            }
        } else if (url.contains("baijia.baidu.com")) {
            // Baidu Baijia
            platform = "百度百家-";
            account = doc.select("section.info").select("span.author").text().trim();
        } else if (url.contains("yidianzixun.com")) {
            // Yidian Zixun: only self-media pages get the platform prefix
            if (html.contains("related_wemedia")) {
                platform = "一点资讯-";
                account = html.split("media_name\":\"")[1].split("\",\"")[0].trim();
            } else {
                account = html.split("source\":\"")[1].split("\",\"")[0];
            }
        } else if (url.contains("news.bitauto.com")) {
            platform = "易车网-";
            account = doc.select("[class=\"gz-box clearfix\"]").select("div.txt-box")
                    .select("p.p-n").select("a").text();
        } else if (url.contains("chejiahao.autohome.com.cn")) {
            platform = "汽车之家-";
            account = doc.select("div.authorMes").select("[class=\"name text-overflow\"]")
                    .select("a").text();
        }
        if (platform != null && account != null && !account.equals("")) {
            account = platform + account;
        }
        return account;
    } catch (Exception e) {
        // Any extraction failure (missing marker, missing element) means "no account found".
        return null;
    }
}
/**
 * Derives a channel name from the page title found in a list of nodes.
 *
 * <p>The first node whose HTML contains {@code <title>} is used: its title text is split on
 * {@code "_"}, everything after the first segment is re-joined (each segment followed by
 * {@code "_"}) and handed to {@link #getChannel(String)}.
 *
 * @param list nodes to search for a title element
 * @return the channel reported by {@code getChannel}, or {@code "新闻"} when no node carries a
 *         title or any exception occurs
 */
public static String matchChannel(List<Node> list) {
    String fallback = "新闻";
    try {
        for (Node node : list) {
            if (!node.outerHtml().contains("<title>")) {
                continue;
            }
            String titleText = node.toString().split("<title>")[1].split("</title>")[0];
            String[] segments = titleText.split("_");
            // Skip the leading segment; only the trailing segments are inspected.
            StringBuilder tail = new StringBuilder();
            for (int i = 1; i < segments.length; i++) {
                tail.append(segments[i]).append("_");
            }
            return getChannel(tail.toString());
        }
    } catch (Exception ignored) {
        // Malformed title markup: fall through to the default channel.
    }
    return fallback;
}
/**
 * Generic source lookup used for sites that have no dedicated rule.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Extract the article body and title with {@code ContentExtractor}, upper-case them and
 *       pass them through {@code filterSpecialCharacter}.</li>
 *   <li>Replace the body inside {@code htmlBody} with a placeholder and split on it, giving the
 *       page text before and after the article body.</li>
 *   <li>Look for an attribution phrase ({@code fromRegex}) in those two parts; otherwise look for
 *       a date ({@code timeRegex}) next to the title and inspect the text around it.</li>
 *   <li>If {@code source} is still {@code null}, repeat the same two checks on the article body.</li>
 *   <li>If anything above throws, repeat the two checks on the whole {@code htmlBody}.</li>
 * </ol>
 *
 * <p>NOTE(review): {@code regex(...)} is defined elsewhere in this class; it presumably returns
 * the matched text or {@code null} — confirm.
 *
 * @param html       raw page HTML
 * @param htmlBody   page body text as prepared by the caller
 * @param sourceList whitelist of acceptable source names
 * @return the first whitelist entry found, or {@code null} when nothing matches
 */
private static String mathchOtherSource(String html,String htmlBody,List<String> sourceList){
// Extract the article body first so that rules matching inside the body can be kept apart
// from rules matching in the surrounding page text.
String source = null;
try {
News news = ContentExtractor.getNewsByHtml(html);
String content = filterSpecialCharacter(news.getContent().toUpperCase());
String title = filterSpecialCharacter(news.getTitle().toUpperCase());
// Blank out the article body with a placeholder.
String text = htmlBody.replace(content, "@@@@@@@@@@");
// Split on the placeholder: [0] is the text before the body, [1] the text after it.
// If the split yields fewer than two parts, indexing [1] below throws and control moves to
// the catch block at the end of this method.
String[] matchTextArr = text.split("@@@@@@@@@@");
if(regex(fromRegex, matchTextArr[0]) != null || regex(fromRegex, matchTextArr[1])!=null){
if(regex(fromRegex, matchTextArr[0])!=null){
source = regex(fromRegex, matchTextArr[0]);
for (String sourceMatch : sourceList) {
if (source.contains(sourceMatch)) {
return sourceMatch;
}
}
// No whitelist hit: source stays non-null here, so the in-body fallback further down is
// skipped and the method returns null.
}else if(regex(fromRegex, matchTextArr[1])!=null){
source = regex(fromRegex, matchTextArr[1]);
for (String sourceMatch : sourceList) {
if (source.contains(sourceMatch)) {
return sourceMatch;
}
}
}
}else{
if(matchTextArr[0].contains(title)){
// The text before the body contains the title: split on the title and look for a date in
// each piece. Targets layouts such as "YYYY-MM-dd <newspaper>" or "<newspaper> YYYY-MM-dd".
// NOTE(review): String.split treats the title as a regular expression — confirm titles
// with regex metacharacters are handled as intended.
String[] titlesArr = matchTextArr[0].split(title);
for(int j = 0;j<titlesArr.length; j++){
String timeSource = regex(timeRegex, titlesArr[j]);
if(timeSource!=null){
source = getSourceByTime(timeSource, titlesArr[j], sourceList);
if(source != null){
return source;
}
}
}
}
if(matchTextArr[1].contains(title)){
// Same title/date check, applied to the text after the body.
String[] titlesArr = matchTextArr[1].split(title);
for(int j = 0;j<titlesArr.length; j++){
String timeSource = regex(timeRegex, titlesArr[j]);
if(timeSource!=null){
source = getSourceByTime(timeSource, titlesArr[j], sourceList);
if(source != null){
return source;
}
}
}
}
}
// Nothing found outside the article body: fall back to the body itself.
if(source == null ){
// First try the attribution-phrase rule on the body.
source = regex(fromRegex, content);
if(source!=null){
for (String sourceMatch : sourceList) {
if (source.contains(sourceMatch)) {
return sourceMatch;
}
}
}else {
// No attribution phrase: look for a date and inspect the text around it, splitting on the
// title first when the body contains it.
if(content.contains(title)){ // body contains the title
String[] titlesArr = content.split(title);
for(int j = 0;j<titlesArr.length; j++){
String timeSource = regex(timeRegex, titlesArr[j]);
if(timeSource!=null){
source = getSourceByTime(timeSource, titlesArr[j], sourceList);
if(source != null){
return source;
}
}
}
}else{ // body does not contain the title
String timeSource = regex(timeRegex, content);
if(timeSource!=null){
source = getSourceByTime(timeSource, content, sourceList);
if(source != null){
return source;
}
}
}
}
}
} catch (Exception e) {
System.out.println("正文抽取失败处理........");
e.printStackTrace();
// Body extraction (or any step above) failed: apply the attribution-phrase rule to the
// whole page text instead.
source = regex(fromRegex, htmlBody);
if (source != null) {
for (String sourceMatch : sourceList) {
if (source.contains(sourceMatch)) {
return sourceMatch;
}
}
} else {
// No attribution phrase in the page text: look for a date and inspect the text around it.
// An exception thrown here is not caught by this method.
String timeSource = regex(timeRegex, htmlBody);
if(timeSource!=null){
source = getSourceByTime(timeSource, htmlBody, sourceList);
if(source != null){
return source;
}
}
}
}
return null;
}
/***
*
......@@ -404,148 +36,6 @@ public class TreateData {
}
/**
 * Looks for a whitelisted source name in the text immediately surrounding a matched date.
 *
 * <p>{@code htmlBody} is split at every literal occurrence of {@code timeSource}. For the piece
 * before the first occurrence the last 30 characters are inspected; for every later piece the
 * first 30 characters are inspected. This targets layouts such as
 * "YYYY-MM-dd &lt;newspaper&gt;" or "&lt;newspaper&gt; YYYY-MM-dd".
 *
 * @param timeSource date/time text previously matched in {@code htmlBody}
 * @param htmlBody   text to search
 * @param sourceList whitelist of acceptable source names
 * @return the first whitelist entry found near the date, or {@code null} if none is found
 */
private static String getSourceByTime(String timeSource, String htmlBody,List<String> sourceList){
    final int window = 30;
    // timeSource is matched page text, not a pattern: quote it so characters such as
    // '(', '[' or '.' split on the literal date instead of being interpreted as regex syntax
    // (which previously missed the split or threw PatternSyntaxException).
    String[] pieces = htmlBody.split(Pattern.quote(timeSource));
    for (int j = 0; j < pieces.length; j++) {
        String piece = pieces[j];
        String nearby;
        if (piece.length() <= window) {
            nearby = piece;
        } else if (j == 0) {
            // Text before the date: keep the tail, which is adjacent to the date.
            nearby = piece.substring(piece.length() - window);
        } else {
            // Text after a date: keep the head, which is adjacent to the date.
            nearby = piece.substring(0, window);
        }
        for (String sourceMatch : sourceList) {
            if (nearby.contains(sourceMatch)) {
                return sourceMatch;
            }
        }
    }
    return null;
}
/**
 * Maps a piece of text to a channel name by keyword.
 *
 * <p>Keywords are tested in a fixed priority order; the first keyword contained in
 * {@code source} is returned as the channel.
 *
 * @param source text to inspect (must not be {@code null})
 * @return the first matching keyword, or {@code "新闻"} when none is contained
 */
public static String getChannel(String source) {
    // Order matters: earlier keywords win when several are present.
    String[] keywords = {
        "财经", "金融", "经济", "科技", "时尚", "互联网", "数码", "科学",
        "TMT", "通讯", "社会", "IT", "房产", "母婴", "3C"
    };
    for (String keyword : keywords) {
        if (source.contains(keyword)) {
            return keyword;
        }
    }
    return "新闻";
}
/**
 * Infers an article's channel from fragments of its URL.
 *
 * <p>Rules are evaluated in a fixed order and the first rule with any fragment contained in
 * {@code url} decides the channel.
 *
 * @param url article URL (must not be {@code null})
 * @return the channel name, or {@code null} when no rule matches
 */
public static String verifyChannel(String url){
    // Each row: channel name followed by the URL fragments that select it.
    // Row order is the match priority.
    String[][] rules = {
        {"新闻", "news.", "cj.sina.com.cn", "wemedia.ifeng.com"},
        {"财经", "finance.", "business.", "money.", "stock.", "10jqka.com.cn"},
        {"科技", "tech.", "it.", "pcedu.", "mobile.", "vr."},
        {"体育", "sports."},
        {"娱乐", "ent.", "yule."},
        {"汽车", "auto."},
        {"时尚", "fashion."},
        {"教育", "learning.", "edu."},
        {"母婴", "baobao."},
        {"房产", "house.", "leju.", "focus."},
        {"游戏", "games."},
        {"国际", "intl."},
        {"科学", "science."},
        {"城市", "city."},
        {"市场", "sc."}
    };
    for (String[] rule : rules) {
        for (int i = 1; i < rule.length; i++) {
            if (url.contains(rule[i])) {
                return rule[0];
            }
        }
    }
    return null;
}
public static String filterSpecialCharacter(String str) {
try {
String regEx = "【[`~!@#$%^&*()+=|{}';'//[//].<>/?~!@#%……&*——+|{}“”;‘’,。、·]】";
......
package com.zhiwei.source_forward.sourceforward.test;
import java.util.HashMap;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.source_forward.run.SourceForward;
/**
 * Manual smoke test for the self-media source lookup.
 *
 * <p>Hands {@code SourceForward.getMediaSelfSource} a request map holding a single Toutiao
 * article URL with an empty data map. The test makes no assertions; it only checks that the
 * call completes without throwing.
 *
 * @author hero
 */
public class MediaSelfSourceTest {
    @Test
    public void sourceForwardTest(){
        String articleUrl = "https://www.toutiao.com/a6549872248428167687/";
        Map<String,Map<String,Object>> requests = new HashMap<String,Map<String,Object>>();
        // The per-URL payload starts empty.
        requests.put(articleUrl, new HashMap<String,Object>());
        SourceForward.getMediaSelfSource(requests);
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment