Commit 7a6d49e2 by zhiwei

添加内容匹配

parent 82632f70
package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.nodes.Node;
import com.zhiwei.source_forward.util.MatchContent;
import com.zhiwei.source_forward.util.TreateData;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic page processor that extracts the article body of each crawled page.
 *
 * <p>For every page it publishes a {@code "content"} result field whose value is a
 * map with two entries: {@code "url"} (the page URL) and {@code "content"} (the
 * text returned by {@link MatchContent#matchContent(String, String)}, or
 * {@code null} when the page answered 404 or extraction threw).
 *
 * @author hero
 * @date 2018-06-30 09:54:02
 */
public class ContentPageProcessor implements PageProcessor {

    // NOTE(review): the Accept-Encoding header advertises "br"; confirm the
    // downloader in use can actually decode brotli responses.
    private final Site site = Site.me().setCycleRetryTimes(3).setSleepTime(1500)
            .setTimeOut(10000)
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .addHeader("Accept-Encoding", "gzip, deflate, br");

    /**
     * @return the crawl settings (retries, delay, timeout, headers) used by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts the article text of {@code page} and publishes it under the
     * {@code "content"} result field.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        String url = page.getUrl().get();
        String content = null;
        try {
            if (page.getStatusCode() != 404) {
                // Keep the extractor's return value: the previous code called
                // matchContent but dropped the result, so "content" was always null.
                content = MatchContent.matchContent(url, page.getHtml().toString());
            }
        } catch (Exception e) {
            // Best effort: a page that cannot be parsed is reported with null content
            // instead of aborting the crawl.
            content = null;
        }
        Map<String, String> data = new HashMap<String, String>();
        data.put("url", url);
        data.put("content", content);
        page.putField("content", data);
    }
}
...@@ -6,7 +6,8 @@ import java.util.Map; ...@@ -6,7 +6,8 @@ import java.util.Map;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import com.zhiwei.source_forward.util.TreateData; import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
...@@ -31,14 +32,14 @@ public class MediaSelfSourcePageProcessor implements PageProcessor { ...@@ -31,14 +32,14 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
String channel = null; String channel = null;
try { try {
if(page.getStatusCode()!=404){ if(page.getStatusCode()!=404){
source = TreateData.matchMediaSelfSource(page.getUrl().get(),page.getHtml().toString()); source = MatchSource.matchMediaSelfSource(page.getUrl().get(),page.getHtml().toString());
if(source==null || source.equals("")){ if(source==null || source.equals("")){
source = null; source = null;
} }
channel = TreateData.verifyChannel(page.getUrl().get()); channel = MatchChannel.verifyChannel(page.getUrl().get());
if(channel==null){ if(channel==null){
List<Node> nodeList = page.getHtml().getDocument().head().childNodes(); List<Node> nodeList = page.getHtml().getDocument().head().childNodes();
channel = TreateData.matchChannel(nodeList); channel = MatchChannel.matchChannel(nodeList);
} }
} }
} catch (Exception e) { } catch (Exception e) {
...@@ -49,7 +50,7 @@ public class MediaSelfSourcePageProcessor implements PageProcessor { ...@@ -49,7 +50,7 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
data.put("mediaself", source); data.put("mediaself", source);
data.put("channel", channel); data.put("channel", channel);
page.putField("data", data); page.putField("mediaSelf", data);
} }
} }
...@@ -6,8 +6,10 @@ import java.util.Map; ...@@ -6,8 +6,10 @@ import java.util.Map;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.SourceData; import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.TreateData;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
...@@ -45,12 +47,12 @@ public class SourceForwardPageProcessor implements PageProcessor { ...@@ -45,12 +47,12 @@ public class SourceForwardPageProcessor implements PageProcessor {
} }
data.put("isforward", isforward); data.put("isforward", isforward);
}else{ }else{
channel = TreateData.verifyChannel(page.getUrl().get()); channel = MatchChannel.verifyChannel(page.getUrl().get());
if(channel==null){ if(channel==null){
List<Node> nodeList = page.getHtml().getDocument().head().childNodes(); List<Node> nodeList = page.getHtml().getDocument().head().childNodes();
channel = TreateData.matchChannel(nodeList); channel = MatchChannel.matchChannel(nodeList);
} }
source = TreateData.matchSource(page.getUrl().get(),page.getHtml().toString(), sourceList); source = MatchSource.matchSource(page.getUrl().get(),page.getHtml().toString(), sourceList);
} }
} }
} catch (Exception e) { } catch (Exception e) {
...@@ -62,7 +64,7 @@ public class SourceForwardPageProcessor implements PageProcessor { ...@@ -62,7 +64,7 @@ public class SourceForwardPageProcessor implements PageProcessor {
data.put("channel", channel); data.put("channel", channel);
data.put("root_source", source); data.put("root_source", source);
page.putField("data", data); page.putField("sourceForward", data);
} }
} }
...@@ -35,7 +35,7 @@ public class UrlLivePageProcessor implements PageProcessor{ ...@@ -35,7 +35,7 @@ public class UrlLivePageProcessor implements PageProcessor{
Map<String,Object> data = new HashMap<String,Object>(); Map<String,Object> data = new HashMap<String,Object>();
data.put("url", page.getUrl().get()); data.put("url", page.getUrl().get());
data.put("live", f); data.put("live", f);
page.putField("data", data); page.putField("urlLive", data);
} }
@Override @Override
......
package com.zhiwei.source_forward.pipeline;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Pipeline that sorts crawl results into four in-memory lists, one per result key:
 * {@code "content"}, {@code "mediaSelf"}, {@code "sourceForward"} and {@code "urlLive"}.
 *
 * <p>Every list is always non-null: a missing or {@code null} list is replaced by a
 * fresh empty {@link ArrayList}, so {@link #process(ResultItems, Task)} can append and
 * callers can iterate the getters without null checks.
 *
 * @author hero
 * @date 2018-06-30 09:54:27
 */
public class DataPipeline implements Pipeline {

    private List<Map<String, Object>> contentDataList;
    private List<Map<String, Object>> mediaSelfDataList;
    private List<Map<String, Object>> sourceForwardDataList;
    private List<Map<String, Object>> urlLivedataList;

    /**
     * Creates a pipeline that appends into the supplied lists.
     *
     * @param dataList              ignored; kept only so the existing signature stays valid
     * @param contentDataList       target for {@code "content"} results, or {@code null} for a new list
     * @param mediaSelfDataList     target for {@code "mediaSelf"} results, or {@code null} for a new list
     * @param sourceForwardDataList target for {@code "sourceForward"} results, or {@code null} for a new list
     * @param urlLivedataList       target for {@code "urlLive"} results, or {@code null} for a new list
     */
    public DataPipeline(List<Map<String, Object>> dataList, List<Map<String, Object>> contentDataList,
            List<Map<String, Object>> mediaSelfDataList,
            List<Map<String, Object>> sourceForwardDataList, List<Map<String, Object>> urlLivedataList) {
        super();
        this.contentDataList = orEmpty(contentDataList);
        this.mediaSelfDataList = orEmpty(mediaSelfDataList);
        this.sourceForwardDataList = orEmpty(sourceForwardDataList);
        this.urlLivedataList = orEmpty(urlLivedataList);
    }

    /**
     * Creates a pipeline backed by four fresh, empty lists. Previously the lists were
     * left {@code null}, which made {@code process} and every caller loop fail.
     */
    public DataPipeline() {
        this(null, null, null, null, null);
    }

    /**
     * Appends each result map present in {@code resultItems} to its matching list.
     * Keys that are absent are skipped.
     *
     * <p>The method is synchronized because callers in this project start the spider
     * with several threads ({@code spider.thread(5)}) and the backing lists are plain
     * {@code ArrayList}s; presumably the spider invokes pipelines from those worker
     * threads - confirm against the WebMagic version in use.
     *
     * @param resultItems fields published by the page processor
     * @param task        the running spider task (unused)
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        collect(resultItems, "content", contentDataList);
        collect(resultItems, "mediaSelf", mediaSelfDataList);
        collect(resultItems, "sourceForward", sourceForwardDataList);
        collect(resultItems, "urlLive", urlLivedataList);
    }

    /** Appends the map stored under {@code key}, if any, to {@code target}. */
    private static void collect(ResultItems resultItems, String key, List<Map<String, Object>> target) {
        Map<String, Object> item = resultItems.get(key);
        if (item != null) {
            target.add(item);
        }
    }

    /** Returns {@code list} itself, or a new empty list when {@code list} is {@code null}. */
    private static List<Map<String, Object>> orEmpty(List<Map<String, Object>> list) {
        return list != null ? list : new ArrayList<Map<String, Object>>();
    }

    /** @return the collected {@code "content"} results, never {@code null} */
    public List<Map<String, Object>> getContentDataList() {
        return contentDataList;
    }

    /** @param contentDataList new target list; {@code null} is replaced by an empty list */
    public void setContentDataList(List<Map<String, Object>> contentDataList) {
        this.contentDataList = orEmpty(contentDataList);
    }

    /** @return the collected {@code "mediaSelf"} results, never {@code null} */
    public List<Map<String, Object>> getMediaSelfDataList() {
        return mediaSelfDataList;
    }

    /** @param mediaSelfDataList new target list; {@code null} is replaced by an empty list */
    public void setMediaSelfDataList(List<Map<String, Object>> mediaSelfDataList) {
        this.mediaSelfDataList = orEmpty(mediaSelfDataList);
    }

    /** @return the collected {@code "sourceForward"} results, never {@code null} */
    public List<Map<String, Object>> getSourceForwardDataList() {
        return sourceForwardDataList;
    }

    /** @param sourceForwardDataList new target list; {@code null} is replaced by an empty list */
    public void setSourceForwardDataList(List<Map<String, Object>> sourceForwardDataList) {
        this.sourceForwardDataList = orEmpty(sourceForwardDataList);
    }

    /** @return the collected {@code "urlLive"} results, never {@code null} */
    public List<Map<String, Object>> getUrlLivedataList() {
        return urlLivedataList;
    }

    /** @param urlLivedataList new target list; {@code null} is replaced by an empty list */
    public void setUrlLivedataList(List<Map<String, Object>> urlLivedataList) {
        this.urlLivedataList = orEmpty(urlLivedataList);
    }
}
package com.zhiwei.source_forward.pipeline;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Pipeline that appends the map published under the {@code "data"} result key to an
 * in-memory list.
 *
 * <p>NOTE(review): the page processors shown in this change publish under
 * {@code "mediaSelf"} rather than {@code "data"}; confirm something still emits
 * {@code "data"} before relying on this class.
 */
public class MediaSelfSourceDataPipeline implements Pipeline {

    /** Results collected so far, in arrival order. */
    private List<Map<String, Object>> dataList;

    /** Creates a pipeline that collects into a new, empty list. */
    public MediaSelfSourceDataPipeline() {
        this(new ArrayList<Map<String, Object>>());
    }

    /**
     * Creates a pipeline that collects into the given list.
     *
     * @param dataList list that receives every collected result map
     */
    public MediaSelfSourceDataPipeline(List<Map<String, Object>> dataList) {
        super();
        this.dataList = dataList;
    }

    /**
     * Stores the {@code "data"} field of {@code resultItems}, if present.
     *
     * @param resultItems fields published by the page processor
     * @param task        the running spider task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> item = resultItems.get("data");
        if (item == null) {
            return;
        }
        dataList.add(item);
    }

    /** @return the list this pipeline appends to */
    public List<Map<String, Object>> getDataList() {
        return dataList;
    }

    /** @param dataList replacement list to append to from now on */
    public void setDataList(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }
}
package com.zhiwei.source_forward.pipeline;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Pipeline that appends the map published under the {@code "data"} result key to an
 * in-memory list.
 *
 * <p>NOTE(review): the page processors shown in this change publish under
 * {@code "sourceForward"} rather than {@code "data"}; confirm something still emits
 * {@code "data"} before relying on this class.
 */
public class SourceForwardDataPipeline implements Pipeline {

    /** Results collected so far, in arrival order. */
    private List<Map<String, Object>> dataList;

    /** Creates a pipeline that collects into a new, empty list. */
    public SourceForwardDataPipeline() {
        this(new ArrayList<Map<String, Object>>());
    }

    /**
     * Creates a pipeline that collects into the given list.
     *
     * @param dataList list that receives every collected result map
     */
    public SourceForwardDataPipeline(List<Map<String, Object>> dataList) {
        super();
        this.dataList = dataList;
    }

    /**
     * Stores the {@code "data"} field of {@code resultItems}, if present.
     *
     * @param resultItems fields published by the page processor
     * @param task        the running spider task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> item = resultItems.get("data");
        if (item == null) {
            return;
        }
        dataList.add(item);
    }

    /** @return the list this pipeline appends to */
    public List<Map<String, Object>> getDataList() {
        return dataList;
    }

    /** @param dataList replacement list to append to from now on */
    public void setDataList(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }
}
package com.zhiwei.source_forward.pipeline;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Pipeline that appends the map published under the {@code "data"} result key to an
 * in-memory list.
 *
 * <p>NOTE(review): the page processors shown in this change publish under
 * {@code "urlLive"} rather than {@code "data"}; confirm something still emits
 * {@code "data"} before relying on this class.
 */
public class UrlLivePipeline implements Pipeline {

    /** Results collected so far, in arrival order. */
    private List<Map<String, Object>> dataList;

    /** Creates a pipeline that collects into a new, empty list. */
    public UrlLivePipeline() {
        this(new ArrayList<Map<String, Object>>());
    }

    /**
     * Creates a pipeline that collects into the given list.
     *
     * @param dataList list that receives every collected result map
     */
    public UrlLivePipeline(List<Map<String, Object>> dataList) {
        super();
        this.dataList = dataList;
    }

    /**
     * Stores the {@code "data"} field of {@code resultItems}, if present.
     *
     * @param resultItems fields published by the page processor
     * @param task        the running spider task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> item = resultItems.get("data");
        if (item == null) {
            return;
        }
        dataList.add(item);
    }

    /** @return the list this pipeline appends to */
    public List<Map<String, Object>> getDataList() {
        return dataList;
    }

    /** @param dataList replacement list to append to from now on */
    public void setDataList(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }
}
package com.zhiwei.source_forward.run;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.source_forward.crawler.ContentPageProcessor;
import com.zhiwei.source_forward.downloader.MyDownLoader;
import com.zhiwei.source_forward.pipeline.DataPipeline;
import us.codecraft.webmagic.Spider;
public class ContentMatch {

    /**
     * Crawls every URL used as a key of {@code dataMap} and stores the extracted
     * article text in the matching entry under the {@code "content"} key.
     *
     * <p>Entries whose page yielded no text (404, extraction failure, or no crawl
     * result at all) are left untouched.
     *
     * @param dataMap article records keyed by URL; modified in place
     * @return the same {@code dataMap} instance that was passed in
     */
    public static Map<String,Map<String,Object>> getContent(Map<String,Map<String,Object>> dataMap){
        if (dataMap == null || dataMap.isEmpty()) {
            // Nothing to crawl.
            return dataMap;
        }

        // Start the content-extraction crawl over all requested URLs.
        DataPipeline pipeline = new DataPipeline();
        Spider spider = Spider.create(new ContentPageProcessor());
        for (String url : dataMap.keySet()) {
            spider.addUrl(url);
        }
        spider.setDownloader(new MyDownLoader());
        spider.addPipeline(pipeline);
        spider.thread(5).run();

        List<Map<String,Object>> contentList = pipeline.getContentDataList();
        if (contentList == null) {
            // A pipeline built without backing lists exposes null here.
            return dataMap;
        }

        // Merge the crawled text back into the caller's records.
        for (Map<String,Object> crawled : contentList) {
            String url = crawled.get("url") + "";
            Map<String,Object> data = dataMap.get(url);
            if (data == null) {
                continue;
            }
            // Read from the crawl result, not from the caller's entry: the previous
            // code re-read data.get("content") and so never stored the crawled text.
            Object content = crawled.get("content");
            if (content != null) {
                data.put("content", content.toString());
            }
        }
        return dataMap;
    }
}
...@@ -8,8 +8,7 @@ import java.util.Map.Entry; ...@@ -8,8 +8,7 @@ import java.util.Map.Entry;
import com.zhiwei.source_forward.crawler.MediaSelfSourcePageProcessor; import com.zhiwei.source_forward.crawler.MediaSelfSourcePageProcessor;
import com.zhiwei.source_forward.crawler.SourceForwardPageProcessor; import com.zhiwei.source_forward.crawler.SourceForwardPageProcessor;
import com.zhiwei.source_forward.downloader.MyDownLoader; import com.zhiwei.source_forward.downloader.MyDownLoader;
import com.zhiwei.source_forward.pipeline.MediaSelfSourceDataPipeline; import com.zhiwei.source_forward.pipeline.DataPipeline;
import com.zhiwei.source_forward.pipeline.SourceForwardDataPipeline;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
...@@ -31,7 +30,7 @@ public class SourceForward { ...@@ -31,7 +30,7 @@ public class SourceForward {
*/ */
public static Map<String,Map<String,Object>> getSourceForward(Map<String,Map<String,Object>> dataMap){ public static Map<String,Map<String,Object>> getSourceForward(Map<String,Map<String,Object>> dataMap){
//启动验证来源程序 //启动验证来源程序
SourceForwardDataPipeline pipeline = new SourceForwardDataPipeline(); DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new SourceForwardPageProcessor()); Spider spider = Spider.create(new SourceForwardPageProcessor());
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){ for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
spider.addUrl(entry.getKey()); spider.addUrl(entry.getKey());
...@@ -40,7 +39,7 @@ public class SourceForward { ...@@ -40,7 +39,7 @@ public class SourceForward {
spider.addPipeline(pipeline); spider.addPipeline(pipeline);
spider.thread(5).run(); spider.thread(5).run();
List<Map<String,Object>> sourceForwardList = pipeline.getDataList(); List<Map<String,Object>> sourceForwardList = pipeline.getSourceForwardDataList();
for(Map<String,Object> sourceMap : sourceForwardList){ for(Map<String,Object> sourceMap : sourceForwardList){
String url = sourceMap.get("url")+""; String url = sourceMap.get("url")+"";
String root_source = sourceMap.get("root_source")!=null?sourceMap.get("root_source").toString():null; String root_source = sourceMap.get("root_source")!=null?sourceMap.get("root_source").toString():null;
...@@ -85,7 +84,7 @@ public class SourceForward { ...@@ -85,7 +84,7 @@ public class SourceForward {
*/ */
public static Map<String,Map<String,Object>> getMediaSelfSource(Map<String,Map<String,Object>> dataMap){ public static Map<String,Map<String,Object>> getMediaSelfSource(Map<String,Map<String,Object>> dataMap){
//启动验证来源程序 //启动验证来源程序
MediaSelfSourceDataPipeline pipeline = new MediaSelfSourceDataPipeline(); DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new MediaSelfSourcePageProcessor()); Spider spider = Spider.create(new MediaSelfSourcePageProcessor());
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){ for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
spider.addUrl(entry.getKey()); spider.addUrl(entry.getKey());
...@@ -94,7 +93,7 @@ public class SourceForward { ...@@ -94,7 +93,7 @@ public class SourceForward {
spider.addPipeline(pipeline); spider.addPipeline(pipeline);
spider.thread(5).run(); spider.thread(5).run();
List<Map<String,Object>> sourceForwardList = pipeline.getDataList(); List<Map<String,Object>> sourceForwardList = pipeline.getSourceForwardDataList();
for(Map<String,Object> sourceMap : sourceForwardList){ for(Map<String,Object> sourceMap : sourceForwardList){
String url = sourceMap.get("url")+""; String url = sourceMap.get("url")+"";
//整合数据及验证转发原创 //整合数据及验证转发原创
...@@ -119,7 +118,7 @@ public class SourceForward { ...@@ -119,7 +118,7 @@ public class SourceForward {
public static Map<String,String> getMediaSelfSource(List<String> urlList){ public static Map<String,String> getMediaSelfSource(List<String> urlList){
//启动验证来源程序 //启动验证来源程序
Map<String,String> dataMap = new HashMap<String,String>(); Map<String,String> dataMap = new HashMap<String,String>();
MediaSelfSourceDataPipeline pipeline = new MediaSelfSourceDataPipeline(); DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new MediaSelfSourcePageProcessor()); Spider spider = Spider.create(new MediaSelfSourcePageProcessor());
for(String url : urlList){ for(String url : urlList){
spider.addUrl(url); spider.addUrl(url);
...@@ -129,7 +128,7 @@ public class SourceForward { ...@@ -129,7 +128,7 @@ public class SourceForward {
spider.addPipeline(pipeline); spider.addPipeline(pipeline);
spider.thread(5).run(); spider.thread(5).run();
List<Map<String,Object>> sourceForwardList = pipeline.getDataList(); List<Map<String,Object>> sourceForwardList = pipeline.getMediaSelfDataList();
for(Map<String,Object> sourceMap : sourceForwardList){ for(Map<String,Object> sourceMap : sourceForwardList){
String url = sourceMap.get("url")+""; String url = sourceMap.get("url")+"";
//整合数据及验证转发原创 //整合数据及验证转发原创
...@@ -152,14 +151,14 @@ public class SourceForward { ...@@ -152,14 +151,14 @@ public class SourceForward {
*/ */
public static String getMediaSelfSource(String url){ public static String getMediaSelfSource(String url){
//启动验证来源程序 //启动验证来源程序
MediaSelfSourceDataPipeline pipeline = new MediaSelfSourceDataPipeline(); DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new MediaSelfSourcePageProcessor()); Spider spider = Spider.create(new MediaSelfSourcePageProcessor());
spider.addUrl(url); spider.addUrl(url);
spider.setDownloader(new MyDownLoader()); spider.setDownloader(new MyDownLoader());
spider.addPipeline(pipeline); spider.addPipeline(pipeline);
spider.thread(1).run(); spider.thread(1).run();
List<Map<String,Object>> sourceForwardList = pipeline.getDataList(); List<Map<String,Object>> sourceForwardList = pipeline.getMediaSelfDataList();
for(Map<String,Object> sourceMap : sourceForwardList){ for(Map<String,Object> sourceMap : sourceForwardList){
return sourceMap.get("mediaself").toString(); return sourceMap.get("mediaself").toString();
} }
......
...@@ -5,7 +5,7 @@ import java.util.Map; ...@@ -5,7 +5,7 @@ import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import com.zhiwei.source_forward.crawler.UrlLivePageProcessor; import com.zhiwei.source_forward.crawler.UrlLivePageProcessor;
import com.zhiwei.source_forward.pipeline.UrlLivePipeline; import com.zhiwei.source_forward.pipeline.DataPipeline;
import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Spider;
...@@ -28,7 +28,7 @@ public class URLLive { ...@@ -28,7 +28,7 @@ public class URLLive {
*/ */
public static Map<String,Map<String,Object>> verificationURLLive(Map<String,Map<String,Object>> dataMap){ public static Map<String,Map<String,Object>> verificationURLLive(Map<String,Map<String,Object>> dataMap){
//启动验证链接是否有效程序程序 //启动验证链接是否有效程序程序
UrlLivePipeline pipeline = new UrlLivePipeline(); DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new UrlLivePageProcessor()); Spider spider = Spider.create(new UrlLivePageProcessor());
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){ for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
spider.addUrl(entry.getKey()); spider.addUrl(entry.getKey());
...@@ -37,7 +37,7 @@ public class URLLive { ...@@ -37,7 +37,7 @@ public class URLLive {
spider.thread(5).run(); spider.thread(5).run();
//验证数据是否已删除 //验证数据是否已删除
List<Map<String,Object>> dataList = pipeline.getDataList(); List<Map<String,Object>> dataList = pipeline.getUrlLivedataList();
for(Map<String,Object> data : dataList){ for(Map<String,Object> data : dataList){
String url = data.get("url")+""; String url = data.get("url")+"";
if(!url.contains("http")){ if(!url.contains("http")){
......
package com.zhiwei.source_forward.util;
import java.util.List;
import org.jsoup.nodes.Node;
/**
 * Derives an article's channel (category label) either from its URL or from the
 * {@code <title>} element found among the page's head nodes.
 *
 * @author hero
 * @date 2018-06-30 10:27:58
 */
public class MatchChannel {

    /** Channel reported when no title keyword matches. */
    private static final String DEFAULT_CHANNEL = "新闻";

    private static final String TITLE_OPEN = "<title>";
    private static final String TITLE_CLOSE = "</title>";

    /**
     * URL rules, evaluated top to bottom; the first row with a matching fragment wins.
     * Element 0 of each row is the channel, the remaining elements are URL fragments.
     */
    private static final String[][] URL_RULES = {
        {"新闻", "news.", "cj.sina.com.cn", "wemedia.ifeng.com"},
        {"财经", "finance.", "business.", "money.", "stock.", "10jqka.com.cn"},
        {"科技", "tech.", "it.", "pcedu.", "mobile.", "vr."},
        {"体育", "sports."},
        {"娱乐", "ent.", "yule."},
        {"汽车", "auto."},
        {"时尚", "fashion."},
        {"教育", "learning.", "edu."},
        {"母婴", "baobao."},
        {"房产", "house.", "leju.", "focus."},
        {"游戏", "games."},
        {"国际", "intl."},
        {"科学", "science."},
        {"城市", "city."},
        {"市场", "sc."}
    };

    /**
     * Title keywords, checked in this order; the first keyword contained in the text
     * is returned as the channel.
     */
    private static final String[] TITLE_KEYWORDS = {
        "财经", "金融", "经济", "科技", "时尚", "互联网", "数码", "科学",
        "TMT", "通讯", "社会", "IT", "房产", "母婴", "3C"
    };

    /**
     * Determines the channel from the first node whose HTML contains {@code <title>}.
     * Only the part of the title after its first {@code "_"} is inspected.
     *
     * @param list nodes to scan (callers pass the child nodes of the page head); may be {@code null}
     * @return the matched channel, or {@code "新闻"} when there is no title, no keyword
     *         matches, or the nodes cannot be read
     */
    public static String matchChannel(List<Node> list) {
        if (list == null) {
            return DEFAULT_CHANNEL;
        }
        try {
            for (Node node : list) {
                if (node.outerHtml().contains(TITLE_OPEN)) {
                    // Only the first title-bearing node is considered.
                    return getChannel(titleSuffix(extractTitle(node.toString())));
                }
            }
        } catch (RuntimeException ignored) {
            // Deliberate best effort: a node that cannot be rendered falls back to the
            // default channel instead of failing the crawl.
        }
        return DEFAULT_CHANNEL;
    }

    /** Returns the text between the first {@code <title>} and the following {@code </title>}. */
    private static String extractTitle(String html) {
        int open = html.indexOf(TITLE_OPEN);
        if (open < 0) {
            return "";
        }
        String rest = html.substring(open + TITLE_OPEN.length());
        int nextOpen = rest.indexOf(TITLE_OPEN);
        if (nextOpen >= 0) {
            rest = rest.substring(0, nextOpen);
        }
        int close = rest.indexOf(TITLE_CLOSE);
        return close >= 0 ? rest.substring(0, close) : rest;
    }

    /** Joins every {@code "_"}-separated title segment except the first, each followed by {@code "_"}. */
    private static String titleSuffix(String title) {
        String[] segments = title.split("_");
        StringBuilder suffix = new StringBuilder();
        for (int i = 1; i < segments.length; i++) {
            suffix.append(segments[i]).append('_');
        }
        return suffix.toString();
    }

    /**
     * Determines the channel from fragments contained in the article URL.
     *
     * <p>NOTE(review): fragments are matched anywhere in the URL, not just in the
     * host's sub-domain, so e.g. {@code "ent."} also matches {@code "tencent.com"}
     * and {@code "it."} matches any host ending in "it". Tightening this would change
     * which URLs fall through to title matching, so it is left as is.
     *
     * @param url article URL; may be {@code null}
     * @return the channel of the first matching rule, or {@code null} when
     *         {@code url} is {@code null} or no rule matches
     */
    public static String verifyChannel(String url){
        if (url == null) {
            return null;
        }
        for (String[] rule : URL_RULES) {
            for (int i = 1; i < rule.length; i++) {
                if (url.contains(rule[i])) {
                    return rule[0];
                }
            }
        }
        return null;
    }

    /**
     * Maps free text (typically the tail of a page title) to a channel by keyword.
     *
     * @param source text to inspect; may be {@code null}
     * @return the first keyword contained in {@code source}, or {@code "新闻"} when
     *         {@code source} is {@code null} or contains none of the keywords
     */
    public static String getChannel(String source) {
        if (source == null) {
            return DEFAULT_CHANNEL;
        }
        for (String keyword : TITLE_KEYWORDS) {
            if (source.contains(keyword)) {
                return keyword;
            }
        }
        return DEFAULT_CHANNEL;
    }
}
package com.zhiwei.source_forward.util;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News;
/**
 * Extracts the main article text from a page's HTML.
 *
 * @author hero
 * @date 2018-06-30 10:27:58
 */
public class MatchContent {

    /**
     * Extracts the article body from {@code html}.
     *
     * @param url  URL of the page; currently not used by the extraction
     * @param html raw HTML of the page; may be {@code null}
     * @return the extracted text, or {@code null} when {@code html} is {@code null}
     *         or both the extractor and the plain-text fallback fail
     */
    public static String matchContent(String url,String html) {
        if (html == null) {
            // Previously this reached Jsoup.parse(null), which runs outside any try block.
            return null;
        }
        try {
            return extractContent(html);
        } catch (Exception e) {
            // Best effort: the fallback itself failed, report "no content".
            return null;
        }
    }

    /**
     * Runs the news content extractor and cleans its output with
     * {@code TreateData.filterSpecialCharacter}; when that throws, falls back to the
     * full visible text of the document.
     *
     * @param html raw HTML of the page, not {@code null}
     * @return the extracted or fallback text
     */
    private static String extractContent(String html){
        try {
            News news = ContentExtractor.getNewsByHtml(html);
            return TreateData.filterSpecialCharacter(news.getContent());
        } catch (Exception e) {
            System.out.println("正文抽取失败处理........");
            e.printStackTrace();
            // Parse only on this path so pages the extractor handles are not parsed twice.
            Document document = Jsoup.parse(html);
            return document.text();
        }
    }
}
package com.zhiwei.source_forward.sourceforward.test;
import java.util.HashMap;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.source_forward.run.SourceForward;
/**
 * Smoke test for media-self source detection via
 * {@code SourceForward.getMediaSelfSource(Map)}.
 *
 * <p>NOTE(review): the test crawls a fixed toutiao.com article URL and makes no
 * assertions, so it only fails if the call throws.
 *
 * @author hero
 * @date 2017-12-06 09:55:13
 */
public class MediaSelfSourceTest {

    /** Runs the media-self source lookup for a single article URL with an empty record. */
    @Test
    public void sourceForwardTest(){
        final String articleUrl = "https://www.toutiao.com/a6549872248428167687/";
        Map<String,Map<String,Object>> request = new HashMap<String,Map<String,Object>>();
        request.put(articleUrl, new HashMap<String,Object>());
        SourceForward.getMediaSelfSource(request);
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment