添加内容匹配

7a6d49e2 · zhiwei · 82632f70 · 7a6d49e2 · 7a6d49e2 · 7a6d49e2
Commit 7a6d49e2 authored Jun 30, 2018 by zhiwei
16 changed files
--- a/src/main/java/com/zhiwei/source_forward/crawler/ContentPageProcessor.java
+++ b/src/main/java/com/zhiwei/source_forward/crawler/ContentPageProcessor.java
+package com.zhiwei.source_forward.crawler;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.jsoup.nodes.Node;
+
+import com.zhiwei.source_forward.util.MatchContent;
+import com.zhiwei.source_forward.util.TreateData;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+/**
+ * WebMagic {@link PageProcessor} that extracts the article body of each crawled page.
+ * <p>
+ * For every page it publishes a result field named {@code "content"} whose value is a map
+ * with two entries: {@code "url"} (the page URL) and {@code "content"} (the extracted
+ * article text, or {@code null} when the page answered 404 or extraction failed).
+ *
+ * @author hero
+ */
+public class ContentPageProcessor implements PageProcessor {
+
+	// NOTE(review): "br" is advertised in Accept-Encoding; confirm the configured downloader can decode Brotli.
+	/** Crawl settings: 3 cycle retries, 1500 ms sleep time, 10000 ms timeout, desktop Firefox user agent. */
+	private Site site = Site.me().setCycleRetryTimes(3).setSleepTime(1500)
+			.setTimeOut(10000)
+			.setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
+			.addHeader("Accept-Encoding", "gzip, deflate, br")
+			;
+	
+	/**
+	 * @return the crawl settings used for every request of this processor
+	 */
+	@Override
+	public Site getSite() {
+		return site;
+	}
+	
+	/**
+	 * Extracts the article text of {@code page} and publishes it under the {@code "content"} field.
+	 *
+	 * @param page the downloaded page
+	 */
+	@Override
+	public void process(Page page) {
+		String url = page.getUrl().get();
+		String content = null;
+		try {
+			// Pages answered with 404 carry no article, so extraction is skipped for them.
+			if (page.getStatusCode() != 404) {
+				// The extractor's result has to be kept: this is the only place the text is captured.
+				content = MatchContent.matchContent(url, page.getHtml().toString());
+			}
+		} catch (Exception e) {
+			// Best effort: a page that cannot be processed is reported with null content.
+			content = null;
+		}
+		Map<String,String> data = new HashMap<String,String>();
+		data.put("url", url);
+		data.put("content", content);
+		
+		page.putField("content", data);
+	}
+	
+}
--- a/src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourcePageProcessor.java
+++ b/src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourcePageProcessor.java
@@ -6,7 +6,8 @@ import java.util.Map;

 import org.jsoup.nodes.Node;

-import com.zhiwei.source_forward.util.TreateData;
+import com.zhiwei.source_forward.util.MatchChannel;
+import com.zhiwei.source_forward.util.MatchSource;
 import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.Site;
 import us.codecraft.webmagic.processor.PageProcessor;
@@ -31,14 +32,14 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
 		String channel = null;
 		try {
 			if(page.getStatusCode()!=404){
-				source = TreateData.matchMediaSelfSource(page.getUrl().get(),page.getHtml().toString());
+				source = MatchSource.matchMediaSelfSource(page.getUrl().get(),page.getHtml().toString());
 				if(source==null || source.equals("")){
 					source = null;
 				}
-				channel = TreateData.verifyChannel(page.getUrl().get());
+				channel = MatchChannel.verifyChannel(page.getUrl().get());
 				if(channel==null){
 					List<Node> nodeList = page.getHtml().getDocument().head().childNodes();
-					channel = TreateData.matchChannel(nodeList);
+					channel = MatchChannel.matchChannel(nodeList);
 				}
 			}
 		} catch (Exception e) {
@@ -49,7 +50,7 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
 		data.put("mediaself", source);
 		data.put("channel", channel);
 		
-		page.putField("data", data);
+		page.putField("mediaSelf", data);
 	}
 	
 }
--- a/src/main/java/com/zhiwei/source_forward/crawler/SourceForwardPageProcessor.java
+++ b/src/main/java/com/zhiwei/source_forward/crawler/SourceForwardPageProcessor.java
@@ -6,8 +6,10 @@ import java.util.Map;

 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Node;
+
+import com.zhiwei.source_forward.util.MatchChannel;
+import com.zhiwei.source_forward.util.MatchSource;
 import com.zhiwei.source_forward.util.SourceData;
-import com.zhiwei.source_forward.util.TreateData;
 import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.Site;
 import us.codecraft.webmagic.processor.PageProcessor;
@@ -45,12 +47,12 @@ public class SourceForwardPageProcessor implements PageProcessor {
 					}
 					data.put("isforward", isforward);
 				}else{
-					channel = TreateData.verifyChannel(page.getUrl().get());
+					channel = MatchChannel.verifyChannel(page.getUrl().get());
 					if(channel==null){
 						List<Node> nodeList = page.getHtml().getDocument().head().childNodes();
-						channel = TreateData.matchChannel(nodeList);
+						channel = MatchChannel.matchChannel(nodeList);
 					}
-					source = TreateData.matchSource(page.getUrl().get(),page.getHtml().toString(), sourceList);
+					source = MatchSource.matchSource(page.getUrl().get(),page.getHtml().toString(), sourceList);
 				}
 			}
 		} catch (Exception e) {
@@ -62,7 +64,7 @@ public class SourceForwardPageProcessor implements PageProcessor {
 		data.put("channel", channel);
 		data.put("root_source", source);
 		
-		page.putField("data", data);
+		page.putField("sourceForward", data);
 	}
 	
 }
--- a/src/main/java/com/zhiwei/source_forward/crawler/UrlLivePageProcessor.java
+++ b/src/main/java/com/zhiwei/source_forward/crawler/UrlLivePageProcessor.java
@@ -35,7 +35,7 @@ public class UrlLivePageProcessor implements PageProcessor{
 		Map<String,Object> data = new HashMap<String,Object>();
 		data.put("url", page.getUrl().get());
 		data.put("live", f);
-		page.putField("data", data);
+		page.putField("urlLive", data);
 	}

 	@Override

--- a/src/main/java/com/zhiwei/source_forward/pipeline/DataPipeline.java
+++ b/src/main/java/com/zhiwei/source_forward/pipeline/DataPipeline.java
+package com.zhiwei.source_forward.pipeline;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.pipeline.Pipeline;
+/**
+ * Pipeline that collects crawl results in memory, one list per kind of result.
+ * <p>
+ * Each page processor publishes its result map under its own field name
+ * ({@code "content"}, {@code "mediaSelf"}, {@code "sourceForward"} or {@code "urlLive"});
+ * {@link #process(ResultItems, Task)} appends whichever of those fields is present to the
+ * matching list.
+ *
+ * @author hero
+ */
+public class DataPipeline implements Pipeline {
+	/** Results published under the "content" field. */
+	private List<Map<String, Object>> contentDataList;
+	/** Results published under the "mediaSelf" field. */
+	private List<Map<String, Object>> mediaSelfDataList;
+	/** Results published under the "sourceForward" field. */
+	private List<Map<String, Object>> sourceForwardDataList;
+	/** Results published under the "urlLive" field. */
+	private List<Map<String, Object>> urlLivedataList;
+
+	/**
+	 * Creates a pipeline that appends to caller-supplied lists.
+	 *
+	 * @param dataList              not used; kept so existing callers keep compiling
+	 * @param contentDataList       receives "content" results
+	 * @param mediaSelfDataList     receives "mediaSelf" results
+	 * @param sourceForwardDataList receives "sourceForward" results
+	 * @param urlLivedataList       receives "urlLive" results
+	 */
+	public DataPipeline(List<Map<String, Object>> dataList,List<Map<String, Object>> contentDataList,List<Map<String, Object>> mediaSelfDataList,
+			List<Map<String, Object>> sourceForwardDataList,List<Map<String, Object>> urlLivedataList) {
+		super();
+		this.contentDataList = contentDataList;
+		this.mediaSelfDataList = mediaSelfDataList;
+		this.sourceForwardDataList = sourceForwardDataList;
+		this.urlLivedataList = urlLivedataList;
+	}
+	
+	
+	/**
+	 * Creates a pipeline with four fresh, empty result lists.
+	 * Without this initialisation the first collected result would fail with a
+	 * NullPointerException and the getters would hand out {@code null}.
+	 */
+	public DataPipeline() {
+		this(null,
+				new ArrayList<Map<String, Object>>(),
+				new ArrayList<Map<String, Object>>(),
+				new ArrayList<Map<String, Object>>(),
+				new ArrayList<Map<String, Object>>());
+	}
+
+	
+	/**
+	 * Appends every known result field present in {@code resultItems} to its list.
+	 * <p>
+	 * Synchronized because the callers run their spiders with several worker threads
+	 * while the backing lists are plain, unsynchronized lists.
+	 *
+	 * @param resultItems fields published by the page processor for one page
+	 * @param task        the running spider; not used
+	 */
+	@Override
+	public synchronized void process(ResultItems resultItems, Task task) {
+		Map<String, Object> contentData = resultItems.get("content");
+		Map<String, Object> mediaSelfData = resultItems.get("mediaSelf");
+		Map<String, Object> sourceForwardData = resultItems.get("sourceForward");
+		Map<String, Object> urlLivedata = resultItems.get("urlLive");
+		if (contentData != null) {
+			contentDataList.add(contentData);
+		}
+		if (mediaSelfData != null) {
+			mediaSelfDataList.add(mediaSelfData);
+		}
+		if (sourceForwardData != null) {
+			sourceForwardDataList.add(sourceForwardData);
+		}
+		if (urlLivedata != null) {
+			urlLivedataList.add(urlLivedata);
+		}
+	}
+	
+	/** @return the list collecting "content" results */
+	public List<Map<String, Object>> getContentDataList() {
+		return contentDataList;
+	}
+
+	/** @param contentDataList list that will receive "content" results */
+	public void setContentDataList(List<Map<String, Object>> contentDataList) {
+		this.contentDataList = contentDataList;
+	}
+
+	/** @return the list collecting "mediaSelf" results */
+	public List<Map<String, Object>> getMediaSelfDataList() {
+		return mediaSelfDataList;
+	}
+
+	/** @param mediaSelfDataList list that will receive "mediaSelf" results */
+	public void setMediaSelfDataList(List<Map<String, Object>> mediaSelfDataList) {
+		this.mediaSelfDataList = mediaSelfDataList;
+	}
+
+	/** @return the list collecting "sourceForward" results */
+	public List<Map<String, Object>> getSourceForwardDataList() {
+		return sourceForwardDataList;
+	}
+
+	/** @param sourceForwardDataList list that will receive "sourceForward" results */
+	public void setSourceForwardDataList(List<Map<String, Object>> sourceForwardDataList) {
+		this.sourceForwardDataList = sourceForwardDataList;
+	}
+
+	/** @return the list collecting "urlLive" results */
+	public List<Map<String, Object>> getUrlLivedataList() {
+		return urlLivedataList;
+	}
+
+	/** @param urlLivedataList list that will receive "urlLive" results */
+	public void setUrlLivedataList(List<Map<String, Object>> urlLivedataList) {
+		this.urlLivedataList = urlLivedataList;
+	}
+
+}
--- a/src/main/java/com/zhiwei/source_forward/pipeline/MediaSelfSourceDataPipeline.java
+++ b/src/main/java/com/zhiwei/source_forward/pipeline/MediaSelfSourceDataPipeline.java
-package com.zhiwei.source_forward.pipeline;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
-import us.codecraft.webmagic.ResultItems;
-import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.pipeline.Pipeline;
-
-public class MediaSelfSourceDataPipeline implements Pipeline {
-    private List<Map<String, Object>> dataList;
-
-	public MediaSelfSourceDataPipeline(List<Map<String, Object>> dataList) {
-		super();
-		this.dataList = dataList;
-	}
-	
-	public MediaSelfSourceDataPipeline() {
-		super();
-		this.dataList = new ArrayList<>();
-	}
-
-	public List<Map<String, Object>> getDataList() {
-		return dataList;
-	}
-
-	public void setDataList(List<Map<String, Object>> dataList) {
-		this.dataList = dataList;
-	}
-
-	@Override
-	public void process(ResultItems resultItems, Task task) {
-		Map<String, Object> data = resultItems.get("data");
-		if (data != null) {
-			dataList.add(data);
-		}
-	}
-
-}
--- a/src/main/java/com/zhiwei/source_forward/pipeline/SourceForwardDataPipeline.java
+++ b/src/main/java/com/zhiwei/source_forward/pipeline/SourceForwardDataPipeline.java
-package com.zhiwei.source_forward.pipeline;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
-import us.codecraft.webmagic.ResultItems;
-import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.pipeline.Pipeline;
-
-public class SourceForwardDataPipeline implements Pipeline {
-    private List<Map<String, Object>> dataList;
-
-	public SourceForwardDataPipeline(List<Map<String, Object>> dataList) {
-		super();
-		this.dataList = dataList;
-	}
-	
-	public SourceForwardDataPipeline() {
-		super();
-		this.dataList = new ArrayList<>();
-	}
-
-	public List<Map<String, Object>> getDataList() {
-		return dataList;
-	}
-
-	public void setDataList(List<Map<String, Object>> dataList) {
-		this.dataList = dataList;
-	}
-
-	@Override
-	public void process(ResultItems resultItems, Task task) {
-		Map<String, Object> data = resultItems.get("data");
-		if (data != null) {
-			dataList.add(data);
-		}
-	}
-
-}
--- a/src/main/java/com/zhiwei/source_forward/pipeline/UrlLivePipeline.java
+++ b/src/main/java/com/zhiwei/source_forward/pipeline/UrlLivePipeline.java
-package com.zhiwei.source_forward.pipeline;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
-import us.codecraft.webmagic.ResultItems;
-import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.pipeline.Pipeline;
-
-public class UrlLivePipeline implements Pipeline{
-	
-	 private List<Map<String, Object>> dataList;
-
-		public UrlLivePipeline(List<Map<String, Object>> dataList) {
-			super();
-			this.dataList = dataList;
-		}
-		
-		public UrlLivePipeline() {
-			super();
-			this.dataList = new ArrayList<>();
-		}
-
-		public List<Map<String, Object>> getDataList() {
-			return dataList;
-		}
-
-		public void setDataList(List<Map<String, Object>> dataList) {
-			this.dataList = dataList;
-		}
-
-		@Override
-		public void process(ResultItems resultItems, Task task) {
-			Map<String, Object> data = resultItems.get("data");
-			if (data != null) {
-				dataList.add(data);
-			}
-		}
-}
--- a/src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
+++ b/src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
+package com.zhiwei.source_forward.run;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import com.zhiwei.source_forward.crawler.ContentPageProcessor;
+import com.zhiwei.source_forward.downloader.MyDownLoader;
+import com.zhiwei.source_forward.pipeline.DataPipeline;
+
+import us.codecraft.webmagic.Spider;
+
+public class ContentMatch {
+	/**
+	 * Crawls every URL used as a key of {@code dataMap} and stores the extracted article
+	 * text under the {@code "content"} key of the matching entry.
+	 * <p>
+	 * An entry keeps its previous {@code "content"} value when the crawl produced no text
+	 * for its URL.
+	 *
+	 * @author hero
+	 * @param dataMap per-URL data, keyed by article URL; updated in place
+	 * @return the same {@code dataMap} instance
+	 */
+	public static Map<String,Map<String,Object>> getContent(Map<String,Map<String,Object>> dataMap){
+		// Crawl all URLs with the content extractor and collect the results in memory.
+		DataPipeline pipeline = new DataPipeline();
+		Spider spider = Spider.create(new ContentPageProcessor());
+		for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
+			spider.addUrl(entry.getKey());
+		}
+		spider.setDownloader(new MyDownLoader());
+		spider.addPipeline(pipeline);
+		spider.thread(5).run();
+		
+		List<Map<String,Object>> contentList = pipeline.getContentDataList();
+		if(contentList == null){
+			// Nothing was collected; hand the input back untouched.
+			return dataMap;
+		}
+		for(Map<String,Object> sourceMap : contentList){
+			String url = sourceMap.get("url")+"";
+			Map<String,Object> data = dataMap.get(url);
+			if(data == null){
+				continue;
+			}
+			// Take the text from the crawl result (sourceMap), not from the caller's own entry.
+			Object content = sourceMap.get("content");
+			if(content != null){
+				data.put("content", content.toString());
+			}
+		}
+		return dataMap;
+	} 
+}
--- a/src/main/java/com/zhiwei/source_forward/run/SourceForward.java
+++ b/src/main/java/com/zhiwei/source_forward/run/SourceForward.java
@@ -8,8 +8,7 @@ import java.util.Map.Entry;
 import com.zhiwei.source_forward.crawler.MediaSelfSourcePageProcessor;
 import com.zhiwei.source_forward.crawler.SourceForwardPageProcessor;
 import com.zhiwei.source_forward.downloader.MyDownLoader;
-import com.zhiwei.source_forward.pipeline.MediaSelfSourceDataPipeline;
-import com.zhiwei.source_forward.pipeline.SourceForwardDataPipeline;
+import com.zhiwei.source_forward.pipeline.DataPipeline;

 import us.codecraft.webmagic.Spider;

@@ -31,7 +30,7 @@ public class SourceForward {
 	 */
 	public static Map<String,Map<String,Object>> getSourceForward(Map<String,Map<String,Object>> dataMap){
 		//启动验证来源程序
-		SourceForwardDataPipeline pipeline = new SourceForwardDataPipeline();
+		DataPipeline pipeline = new DataPipeline();
 		Spider spider = Spider.create(new SourceForwardPageProcessor());
 		for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
 			spider.addUrl(entry.getKey());
@@ -40,7 +39,7 @@ public class SourceForward {
 		spider.addPipeline(pipeline);
 		spider.thread(5).run();
 		
-		List<Map<String,Object>> sourceForwardList = pipeline.getDataList();
+		List<Map<String,Object>> sourceForwardList = pipeline.getSourceForwardDataList();
 		for(Map<String,Object> sourceMap : sourceForwardList){
 			String url = sourceMap.get("url")+"";
 			String root_source = sourceMap.get("root_source")!=null?sourceMap.get("root_source").toString():null;
@@ -85,7 +84,7 @@ public class SourceForward {
 	 */
 	public static Map<String,Map<String,Object>> getMediaSelfSource(Map<String,Map<String,Object>> dataMap){
 		//启动验证来源程序
-		MediaSelfSourceDataPipeline pipeline = new MediaSelfSourceDataPipeline();
+		DataPipeline pipeline = new DataPipeline();
 		Spider spider = Spider.create(new MediaSelfSourcePageProcessor());
 		for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
 			spider.addUrl(entry.getKey());
@@ -94,7 +93,7 @@ public class SourceForward {
 		spider.addPipeline(pipeline);
 		spider.thread(5).run();
 		
-		List<Map<String,Object>> sourceForwardList = pipeline.getDataList();
+		List<Map<String,Object>> sourceForwardList = pipeline.getSourceForwardDataList();
 		for(Map<String,Object> sourceMap : sourceForwardList){
 			String url = sourceMap.get("url")+"";
 			//整合数据及验证转发原创
@@ -119,7 +118,7 @@ public class SourceForward {
 	public static Map<String,String> getMediaSelfSource(List<String> urlList){
 		//启动验证来源程序
 		Map<String,String> dataMap = new HashMap<String,String>();
-		MediaSelfSourceDataPipeline pipeline = new MediaSelfSourceDataPipeline();
+		DataPipeline pipeline = new DataPipeline();
 		Spider spider = Spider.create(new MediaSelfSourcePageProcessor());
 		for(String url : urlList){
 			spider.addUrl(url);
@@ -129,7 +128,7 @@ public class SourceForward {
 		spider.addPipeline(pipeline);
 		spider.thread(5).run();
 		
-		List<Map<String,Object>> sourceForwardList = pipeline.getDataList();
+		List<Map<String,Object>> sourceForwardList = pipeline.getMediaSelfDataList();
 		for(Map<String,Object> sourceMap : sourceForwardList){
 			String url = sourceMap.get("url")+"";
 			//整合数据及验证转发原创
@@ -152,14 +151,14 @@ public class SourceForward {
 	 */
 	public static String getMediaSelfSource(String url){
 		//启动验证来源程序
-		MediaSelfSourceDataPipeline pipeline = new MediaSelfSourceDataPipeline();
+		DataPipeline pipeline = new DataPipeline();
 		Spider spider = Spider.create(new MediaSelfSourcePageProcessor());
 		spider.addUrl(url);
 		spider.setDownloader(new MyDownLoader());
 		spider.addPipeline(pipeline);
 		spider.thread(1).run();
 		
-		List<Map<String,Object>> sourceForwardList = pipeline.getDataList();
+		List<Map<String,Object>> sourceForwardList = pipeline.getMediaSelfDataList();
 		for(Map<String,Object> sourceMap : sourceForwardList){
 			return sourceMap.get("mediaself").toString();
 		}

--- a/src/main/java/com/zhiwei/source_forward/run/URLLive.java
+++ b/src/main/java/com/zhiwei/source_forward/run/URLLive.java
@@ -5,7 +5,7 @@ import java.util.Map;
 import java.util.Map.Entry;

 import com.zhiwei.source_forward.crawler.UrlLivePageProcessor;
-import com.zhiwei.source_forward.pipeline.UrlLivePipeline;
+import com.zhiwei.source_forward.pipeline.DataPipeline;

 import us.codecraft.webmagic.Spider;

@@ -28,7 +28,7 @@ public class URLLive {
 	 */
 	public static Map<String,Map<String,Object>> verificationURLLive(Map<String,Map<String,Object>> dataMap){
 		//启动验证链接是否有效程序程序
-		UrlLivePipeline pipeline = new UrlLivePipeline();
+		DataPipeline pipeline = new DataPipeline();
 		Spider spider = Spider.create(new UrlLivePageProcessor());
 		for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
 			spider.addUrl(entry.getKey());
@@ -37,7 +37,7 @@ public class URLLive {
 		spider.thread(5).run();
 		
 		//验证数据是否已删除
-		List<Map<String,Object>> dataList = pipeline.getDataList();
+		List<Map<String,Object>> dataList = pipeline.getUrlLivedataList();
 		for(Map<String,Object> data : dataList){
 			String url = data.get("url")+"";
 			if(!url.contains("http")){

--- a/src/main/java/com/zhiwei/source_forward/util/MatchChannel.java
+++ b/src/main/java/com/zhiwei/source_forward/util/MatchChannel.java
+package com.zhiwei.source_forward.util;
+
+import java.util.List;
+
+import org.jsoup.nodes.Node;
+
+/**
+ * @ClassName: MatchChannel 
+ * @Description: 匹配频道
+ * @author hero 
+ * @date 2018年6月30日 上午10:27:58
+ */
+public class MatchChannel {
+
+	/**
+	 * @Title: matchChannel
+	 * @author hero
+	 * @Description: TODO(匹配频道)
+	 * @param @param
+	 *            list
+	 * @param @return
+	 *            设定文件
+	 * @return String 返回类型
+	 */
+	public static String matchChannel(List<Node> list) {
+		/** 验证频道标签 **/
+		String channel = "新闻";
+		try {
+			for (Node node : list) {
+				if (node.outerHtml().contains("<title>")) {
+					String[] content = node.toString().split("<title>")[1].split("</title>")[0].split("_");
+					String channelMatch = "";
+					for (int i = 0; i < content.length; i++) {
+						if (i > 0) {
+							channelMatch += content[i] + "_";
+						}
+					}
+					channel = getChannel(channelMatch);
+					break;
+				}
+			}
+		} catch (Exception e) {
+			return channel;
+		}
+		return channel;
+	}
+	
+	/**
+	 * @Title: verifyChannel 
+	 * @author hero 
+	 * @Description: 根据链接验证文章频道
+	 * @param @param url
+	 * @param @return 设定文件 
+	 * @return String 返回类型
+	 */
+	public static String verifyChannel(String url){
+		String channel = null;
+		if(url.contains("news.") || url.contains("cj.sina.com.cn") 
+				|| url.contains("wemedia.ifeng.com")){
+			channel = "新闻";
+		}else if(url.contains("finance.") || url.contains("business.")
+				|| url.contains("money.") || url.contains("stock.")
+				|| url.contains("10jqka.com.cn")){
+			channel = "财经";
+		}else if(url.contains("tech.") || url.contains("it.") 
+				|| url.contains("pcedu.") || url.contains("mobile.")
+				|| url.contains("vr.")){
+			channel = "科技";
+		}else if(url.contains("sports.")){
+			channel = "体育";
+		}else if(url.contains("ent.") || url.contains("yule.")){
+			channel = "娱乐";
+		}else if(url.contains("auto.")){
+			channel = "汽车";
+		}else if(url.contains("fashion.")){
+			channel = "时尚";
+		}else if(url.contains("learning.") || url.contains("edu.")){
+			channel = "教育";
+		}else if(url.contains("baobao.")){
+			channel = "母婴";
+		}else if(url.contains("house.") ||url.contains("leju.")
+				|| url.contains("focus.")){
+			channel = "房产";
+		}else if(url.contains("games.")){
+			channel = "游戏";
+		}else if(url.contains("intl.")){
+			channel = "国际";
+		}else if(url.contains("science.")){
+			channel = "科学";
+		}else if(url.contains("city.")){
+			channel = "城市";
+		}else if(url.contains("sc.")){
+			channel = "市场";
+		}
+		return channel;
+	}
+	
+	/**
+	 * @Title: getChannel
+	 * @author hero
+	 * @Description: TODO(渠道验证)
+	 * @param @param
+	 *            source
+	 * @param @return
+	 *            设定文件
+	 * @return String 返回类型
+	 */
+	public static String getChannel(String source) {
+		String channel = "新闻";
+		if (source.contains("财经")) {
+			channel = "财经";
+		} else if (source.contains("金融")) {
+			channel = "金融";
+		} else if (source.contains("经济")) {
+			channel = "经济";
+		} else if (source.contains("科技")) {
+			channel = "科技";
+		} else if (source.contains("时尚")) {
+			channel = "时尚";
+		} else if (source.contains("互联网")) {
+			channel = "互联网";
+		} else if (source.contains("数码")) {
+			channel = "数码";
+		} else if (source.contains("科学")) {
+			channel = "科学";
+		} else if (source.contains("TMT")) {
+			channel = "TMT";
+		} else if (source.contains("通讯")) {
+			channel = "通讯";
+		} else if (source.contains("社会")) {
+			channel = "社会";
+		}else if (source.contains("IT")) {
+			channel = "IT";
+		}else if (source.contains("房产")) {
+			channel = "房产";
+		}else if (source.contains("母婴")) {
+			channel = "母婴";
+		}else if (source.contains("3C")) {
+			channel = "3C";
+		}
+		return channel;
+	}
+}
--- a/src/main/java/com/zhiwei/source_forward/util/MatchContent.java
+++ b/src/main/java/com/zhiwei/source_forward/util/MatchContent.java
+package com.zhiwei.source_forward.util;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
+import cn.edu.hfut.dmic.contentextractor.News;
+
+/**
+ * Utility that extracts the main article text from a page's HTML.
+ *
+ * @author hero
+ */
+public class MatchContent {
+
+	
+	/**
+	 * Extracts the article body from {@code html}.
+	 * <p>
+	 * The content extractor is tried first. If it throws, the failure is reported on
+	 * standard output and the plain text of the whole parsed document is used instead.
+	 * The document is parsed only in that fallback case.
+	 *
+	 * @author hero
+	 * @param url  page URL; not used by the extraction
+	 * @param html raw page HTML
+	 * @return the extracted text, or {@code null} when the fallback fails as well
+	 */
+	public static String matchContent(String url,String html) {
+		try {
+			News news = ContentExtractor.getNewsByHtml(html);
+			return TreateData.filterSpecialCharacter(news.getContent());
+		} catch (Exception e) {
+			System.out.println("正文抽取失败处理........");
+			e.printStackTrace();
+		}
+		return fallbackText(html);
+	}
+	
+	
+	/**
+	 * Returns the text of the whole document, used when article extraction fails.
+	 *
+	 * @param html raw page HTML
+	 * @return the document text, or {@code null} when the HTML cannot be parsed
+	 */
+	private static String fallbackText(String html){
+		try {
+			return Jsoup.parse(html).text();
+		} catch (Exception e) {
+			// Keeps the "null on failure" result for unparsable or null input.
+			return null;
+		}
+	}
+}
--- a/src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+++ b/src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+package com.zhiwei.source_forward.util;
+
+import java.util.List;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
+import cn.edu.hfut.dmic.contentextractor.News;
+
+/**
+ * @ClassName: MatchSource 
+ * @Description: 匹配来源
+ * @author hero 
+ * @date 2018年6月30日 上午10:27:29
+ */
+public class MatchSource {
+	private static String fromRegex = "(来源：(\\S)*(\\s)*[\\S]*)|(来源:(\\S)*(\\s)*[\\S]*)|(本文来源于(\\S)*(\\s)*[\\S]*)" // Attribution markers; each alternative is a marker followed by a non-whitespace run, optional whitespace and another non-whitespace run
+			+ "|(源:(\\S)*(\\s)*[\\S]*)|(来自：(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)" // most markers appear twice: once with a full-width colon, once with an ASCII colon
+			+ "|(\\[来源\\]：(\\S)*(\\s)*[\\S]*)|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)"
+			+ "|(出自：(\\S)*(\\s)*[\\S]*)|(出自:(\\S)*(\\s)*[\\S]*)" + "|(转自：(\\S)*(\\s)*[\\S]*)|(转自:(\\S)*(\\s)*[\\S]*)"
+			+ "|(出处\\/作者：(\\S)*(\\s)*[\\S]*)|(出处\\/作者:(\\S)*(\\s)*[\\S]*)"
+			+ "|(出处：(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)|(稿源：(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)"; // NOTE(review): the last alternative repeats the ASCII-colon 出处 form already on this line; presumably the ASCII-colon 稿源 form was intended - confirm
+
+	private static String timeRegex = "" // Date/time-like digit groups, tried in the order listed
+			+ "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})" // 4-digit year, month, day separated by 1-5 non-digits
+			+ "|([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})" // same, followed by three more numeric groups
+			+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})" // year/month/day written with 年 月 日, plus two more numeric groups
+			+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
+			+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日)"
+			+ "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})" // month/day forms without a year
+			+ "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
+			+ "|(\\d{0,2}月\\d{0,2}日)"
+			+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})" // 4 digits followed by numeric groups with arbitrary non-digit separators
+			+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
+			+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2})"
+			;
+	/**
+	 * Determines the source of an article and maps it onto one of the known source names.
+	 * <p>
+	 * A few sites are handled by dedicated rules (a CSS selector, the {@code mediaid}
+	 * meta tag, embedded JSON, or a fixed name); every other URL goes through the generic
+	 * matcher. The value found is then compared with {@code sourceList}.
+	 *
+	 * @author hero
+	 * @param url        article URL, used to choose the site-specific rule
+	 * @param html       raw page HTML
+	 * @param sourceList known source names, checked in list order
+	 * @return the first entry of {@code sourceList} contained in the value found, or
+	 *         {@code null} when nothing matches or the extraction throws
+	 */
+	public static String matchSource(String url,String html, List<String> sourceList) {
+		Document page = Jsoup.parse(html);
+		String upperBodyText = TreateData.filterSpecialCharacter(page.select("body").text().toUpperCase());
+		try {
+			String rawSource = rawSourceFor(url, html, page, upperBodyText, sourceList);
+			if (rawSource == null) {
+				return null;
+			}
+			// Validate the raw value against the known sources.
+			for (String knownSource : sourceList) {
+				if (rawSource.contains(knownSource)) {
+					return knownSource;
+				}
+			}
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+		return null;
+	}
+
+	/**
+	 * Picks the raw, not yet validated source text for a page, using a site-specific
+	 * rule where one exists and the generic matcher otherwise.
+	 */
+	private static String rawSourceFor(String url, String html, Document page, String upperBodyText,
+			List<String> sourceList) {
+		if (url.contains("thepaper.cn")) {
+			// thepaper.cn
+			return page.select("div.news_about").text();
+		}
+		if (url.contains("sports.eastday.com")) {
+			// Eastday sports
+			return page.select("div.article").select("span").text();
+		}
+		if (url.contains("lesports.com")) {
+			// Le Sports
+			return page.select("div.article-source").select("strong").text();
+		}
+		if (url.contains("myzaker.com")) {
+			// ZAKER
+			return page.select("div#article").select("span.auther").text();
+		}
+		if (url.contains("sina.com.cn") || url.contains("sohu.com")) {
+			// Sina / Sohu: the source is the content of the mediaid meta tag, when present.
+			if (!html.contains("<meta name=\"mediaid\"")) {
+				return null;
+			}
+			return html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0];
+		}
+		if (url.contains("a.mini.eastday.com")) {
+			// Eastday headlines: fixed name.
+			return "东方头条";
+		}
+		if (url.contains("orz520.com")) {
+			return "千寻生活";
+		}
+		if (url.contains("sh.qihoo.com")) {
+			return "今日爆点";
+		}
+		if (url.contains("itouchtv.cn")) {
+			return "触电新闻";
+		}
+		if (url.contains("yidianzixun.com")) {
+			// Yidian Zixun: fixed name for self-media pages, otherwise the embedded "source" value.
+			if (html.contains("related_wemedia")) {
+				return "一点资讯";
+			}
+			return html.split("source\":\"")[1].split("\",\"")[0];
+		}
+		// Every other site.
+		return mathchOtherSource(html, upperBodyText, sourceList);
+	}
+	
+	
+	/**
+	 * Extracts the self-media account name of an article for the supported platforms.
+	 * <p>
+	 * For most platforms a non-empty account name is returned with a platform prefix
+	 * (for example {@code "搜狐-"}); an empty name is returned as is.
+	 *
+	 * @author hero
+	 * @param url  article URL, used to choose the platform rule
+	 * @param html raw page HTML
+	 * @return the (usually prefixed) account name; {@code null} when the URL belongs to
+	 *         no supported platform, no name is found, or the extraction throws
+	 */
+	public static String matchMediaSelfSource(String url,String html) {
+		Document page = Jsoup.parse(html);
+		try {
+			if (url.contains("toutiao.com")) {
+				// Toutiao: account name embedded in the page script.
+				String account = null;
+				if (html.contains("name: '")) {
+					account = html.split("mediaInfo:")[1].split("name: '")[1].split("',")[0].trim();
+				} else if (html.contains("screen_name:")) {
+					account = html.split("screen_name:'")[1].split("',")[0].trim();
+				}
+				return withPlatformPrefix("今日头条-", account);
+			}
+			if (url.contains("sohu.com")) {
+				// Sohu: mediaid meta tag.
+				if (!html.contains("<meta name=\"mediaid\"")) {
+					return null;
+				}
+				String account = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
+				return withPlatformPrefix("搜狐-", account);
+			}
+			if (url.contains("a.mini.eastday.com")) {
+				// Eastday headlines: second <i> element of the share block.
+				String account = page.select("[class=\"share_cnt_p clearfix\"]").select("div.fl")
+						.select("i").get(1).text().trim();
+				return withPlatformPrefix("东方头条-", account);
+			}
+			if (url.contains("sh.qihoo.com")) {
+				String account = page.select("p.info").select("span.source").text().trim();
+				return withPlatformPrefix("快资讯-", account);
+			}
+			if (url.contains("cj.sina.com.cn")) {
+				// Sina finance headlines: mediaid meta tag.
+				if (!html.contains("<meta name=\"mediaid\"")) {
+					return null;
+				}
+				String account = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
+				return withPlatformPrefix("财经头条-", account);
+			}
+			if (url.contains("baijia.baidu.com")) {
+				String account = page.select("section.info").select("span.author").text().trim();
+				return withPlatformPrefix("百度百家-", account);
+			}
+			if (url.contains("yidianzixun.com")) {
+				// Yidian Zixun: media_name for self-media pages, otherwise the plain "source" value without prefix.
+				if (html.contains("related_wemedia")) {
+					String account = html.split("media_name\":\"")[1].split("\",\"")[0].trim();
+					return withPlatformPrefix("一点资讯-", account);
+				}
+				return html.split("source\":\"")[1].split("\",\"")[0];
+			}
+			if (url.contains("news.bitauto.com")) {
+				String account = page.select("[class=\"gz-box clearfix\"]").select("div.txt-box")
+						.select("p.p-n").select("a").text();
+				return withPlatformPrefix("易车网-", account);
+			}
+			if (url.contains("chejiahao.autohome.com.cn")) {
+				String account = page.select("div.authorMes").select("[class=\"name text-overflow\"]")
+						.select("a").text();
+				return withPlatformPrefix("汽车之家-", account);
+			}
+			return null;
+		} catch (Exception e) {
+			return null;
+		}
+	}
+
+	/**
+	 * Prepends {@code platformPrefix} to a non-empty account name; a {@code null} or
+	 * empty name is returned unchanged.
+	 */
+	private static String withPlatformPrefix(String platformPrefix, String account) {
+		if (account == null || account.equals("")) {
+			return account;
+		}
+		return platformPrefix + account;
+	}
+	
+	
+
+	/**
+	 * Generic source matcher: looks for one of the known source names in the page text.
+	 * 
+	 * The article body and title are extracted from the raw HTML, the body is cut out of
+	 * {@code htmlBody}, and the text before/after the body is searched first; the body
+	 * itself is only searched when that step leaves {@code source} as null. If anything in
+	 * that process throws, the whole {@code htmlBody} is searched instead.
+	 * 
+	 * @author hero 
+	 * @param html raw HTML handed to {@code ContentExtractor.getNewsByHtml}
+	 * @param htmlBody text that the extracted body is removed from and that is searched in the fallback path
+	 *        (presumably the upper-cased, filtered text of the page body; confirm against the caller)
+	 * @param sourceList candidate source names; the first one contained in a matched snippet is returned
+	 * @return the matching entry of {@code sourceList}, or null when nothing matches
+	 */
+	private static String mathchOtherSource(String html,String htmlBody,List<String> sourceList){
+		/** Extract the article body first, so that source-like phrases inside the body
+		 *  do not shadow the ones printed around it. **/
+		String source = null;
+		try {
+			News news = ContentExtractor.getNewsByHtml(html);
+			String content = TreateData.filterSpecialCharacter(news.getContent().toUpperCase());
+			String title = TreateData.filterSpecialCharacter(news.getTitle().toUpperCase());
+			/** Replace the article body inside htmlBody with a marker. **/
+			String text = htmlBody.replace(content, "@@@@@@@@@@");
+			/** Split on the marker: [0] is the text before the body, [1] the text after it.
+			 *  NOTE(review): matchTextArr[1] is read below without a length check; when the split
+			 *  yields a single element the resulting exception is handled by the catch block at the
+			 *  end of this method. Confirm that this fallback is intended. **/
+			String[] matchTextArr = text.split("@@@@@@@@@@");
+			if(TreateData.regex(fromRegex, matchTextArr[0]) != null ||  TreateData.regex(fromRegex, matchTextArr[1])!=null){
+				if(TreateData.regex(fromRegex, matchTextArr[0])!=null){
+					source = TreateData.regex(fromRegex, matchTextArr[0]);
+					for (String sourceMatch : sourceList) {
+						if (source.contains(sourceMatch)) {
+							return sourceMatch;
+						}
+					}
+				}else if(TreateData.regex(fromRegex, matchTextArr[1])!=null){
+					source = TreateData.regex(fromRegex, matchTextArr[1]);
+					for (String sourceMatch : sourceList) {
+						if (source.contains(sourceMatch)) {
+							return sourceMatch;
+						}
+					}
+				}
+			}else{
+				if(matchTextArr[0].contains(title)){    
+					/*** The text before the body contains the title: split on the title and
+					 * look for a timestamp in each piece. The target layouts are
+					 * "date followed by a publication name" and
+					 * "publication name followed by a date".
+					 * ***/
+					String[] titlesArr = matchTextArr[0].split(title);
+					for(int j = 0;j<titlesArr.length; j++){
+						String timeSource = TreateData.regex(timeRegex, titlesArr[j]);
+						if(timeSource!=null){
+							source = getSourceByTime(timeSource, titlesArr[j], sourceList);
+							if(source != null){
+								return source;
+							}
+						}
+					}
+				}
+				
+				if(matchTextArr[1].contains(title)){    
+					/*** Same title/timestamp search, applied to the text after the body.
+					 * The target layouts are "date followed by a publication name" and
+					 * "publication name followed by a date".
+					 * ***/
+					String[] titlesArr = matchTextArr[1].split(title);
+					for(int j = 0;j<titlesArr.length; j++){
+						String timeSource = TreateData.regex(timeRegex, titlesArr[j]);
+						if(timeSource!=null){
+							source = getSourceByTime(timeSource, titlesArr[j], sourceList);
+							if(source != null){
+								return source;
+							}
+						}
+					}
+				}
+			}
+			
+			/*** Nothing usable outside the body: search the body itself.
+			 *   This step is skipped when fromRegex matched above but no entry of sourceList
+			 *   was contained in the match, because source is non-null in that case. **/
+			if(source == null ){
+				/***
+				 * Look for a fromRegex hit (the "source:"-style rules) inside the body.
+				 */
+				source = TreateData.regex(fromRegex, content);
+				if(source!=null){
+					for (String sourceMatch : sourceList) {
+						if (source.contains(sourceMatch)) {
+							return sourceMatch;
+						}
+					}
+				}else {    
+					/*** No fromRegex hit in the body: fall back to the timestamp search,
+					 * splitting on the title first when the body contains it. The target layouts are
+					 * "date followed by a publication name" and
+					 * "publication name followed by a date".
+					 * ***/
+					if(content.contains(title)){  /** the body contains the title **/
+						String[] titlesArr = content.split(title);
+						for(int j = 0;j<titlesArr.length; j++){
+							String timeSource = TreateData.regex(timeRegex, titlesArr[j]);
+							if(timeSource!=null){
+								source = getSourceByTime(timeSource, titlesArr[j], sourceList);
+								if(source != null){
+									return source;
+								}
+							}
+						}
+					}else{  /** the body does not contain the title **/
+						String timeSource = TreateData.regex(timeRegex, content);
+						if(timeSource!=null){
+							source = getSourceByTime(timeSource, content, sourceList);
+							if(source != null){
+								return source;
+							}
+						}
+					}
+				}
+			}
+		} catch (Exception e) {
+			System.out.println("正文抽取失败处理........");
+			e.printStackTrace();
+			/***
+			 * Body extraction or matching failed:
+			 * look for a fromRegex hit (the "source:"-style rules) in the whole htmlBody instead.
+			 */
+			source = TreateData.regex(fromRegex, htmlBody);
+			if (source != null) {
+				for (String sourceMatch : sourceList) {
+					if (source.contains(sourceMatch)) {
+						return sourceMatch;
+					}
+				}
+			} else {
+				/*** No fromRegex hit: run the timestamp search over the whole htmlBody
+				 * (no title split is done in this path). The target layouts are
+				 * "date followed by a publication name" and
+				 * "publication name followed by a date".
+				 * ***/
+				String timeSource = TreateData.regex(timeRegex, htmlBody);
+				if(timeSource!=null){
+					source = getSourceByTime(timeSource, htmlBody, sourceList);
+					if(source != null){
+						return source;
+					}
+				}
+			}
+		}
+		return null;
+	}
+	
+	
+	/**
+	 * Looks for a known source name printed directly next to a timestamp.
+	 * 
+	 * The text is split on every literal occurrence of {@code timeSource}. For the piece in
+	 * front of the first timestamp only its last 30 characters are inspected; for every later
+	 * piece only its first 30 characters are inspected. This targets layouts such as
+	 * "date followed by a publication name" and "publication name followed by a date".
+	 * 
+	 * @author hero 
+	 * @param timeSource timestamp text previously matched in {@code htmlBody}; treated as a literal, not as a regex
+	 * @param htmlBody text to search
+	 * @param sourceList candidate source names, checked in list order
+	 * @return the first entry of {@code sourceList} found inside an inspected window,
+	 *         or null when none is found or any argument is null
+	 */
+	private static String getSourceByTime(String timeSource, String htmlBody,List<String> sourceList){
+		if (timeSource == null || htmlBody == null || sourceList == null) {
+			return null;
+		}
+		final int window = 30;
+		
+		/** timeSource is text taken from the page, so it may contain regex metacharacters
+		 *  such as '(' or '['. Quote it so that split() neither throws nor splits at the wrong place. **/
+		String[] times = htmlBody.split(java.util.regex.Pattern.quote(timeSource));
+		for (int j = 0; j < times.length; j++) {
+			String timecontent = times[j];
+			if (timecontent.length() >= window) {
+				/** Keep only the side that touches the timestamp:
+				 *  the tail of the leading piece, the head of every following piece. **/
+				timecontent = (j == 0)
+						? timecontent.substring(timecontent.length() - window)
+						: timecontent.substring(0, window);
+			}
+			
+			for (String sourceMatch : sourceList) {
+				if (timecontent.contains(sourceMatch)) {
+					return sourceMatch;
+				}
+			}
+		}
+		return null;
+	}
+}
--- a/src/main/java/com/zhiwei/source_forward/util/TreateData.java
+++ b/src/main/java/com/zhiwei/source_forward/util/TreateData.java
 package com.zhiwei.source_forward.util;

-import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Node;
-
-import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
-import cn.edu.hfut.dmic.contentextractor.News;
-
 /**
 * @ClassName: TreateData
 * @Description: TODO(数据处理类)
@@ -19,365 +10,6 @@ import cn.edu.hfut.dmic.contentextractor.News;
 */
 public class TreateData {

-	private static String fromRegex = "(来源：(\\S)*(\\s)*[\\S]*)|(来源:(\\S)*(\\s)*[\\S]*)|(本文来源于(\\S)*(\\s)*[\\S]*)"
-			+ "|(源:(\\S)*(\\s)*[\\S]*)|(来自：(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)"
-			+ "|(\\[来源\\]：(\\S)*(\\s)*[\\S]*)|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)"
-			+ "|(出自：(\\S)*(\\s)*[\\S]*)|(出自:(\\S)*(\\s)*[\\S]*)" + "|(转自：(\\S)*(\\s)*[\\S]*)|(转自:(\\S)*(\\s)*[\\S]*)"
-			+ "|(出处\\/作者：(\\S)*(\\s)*[\\S]*)|(出处\\/作者:(\\S)*(\\s)*[\\S]*)"
-			+ "|(出处：(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)|(稿源：(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)";
-
-	private static String timeRegex = ""
-			+ "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})"
-			+ "|([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})"
-			+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})"
-			+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
-			+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日)"
-			+ "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})"
-			+ "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
-			+ "|(\\d{0,2}月\\d{0,2}日)"
-			+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
-			+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
-			+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2})"
-			;
-
-	/**
-	 * @Title: findURLs
-	 * @author hero
-	 * @Description: TODO(验证并匹配数据)
-	 * @param @param
-	 *            s
-	 * @param @param
-	 *            regex
-	 * @param @return
-	 *            设定文件
-	 * @return String 返回类型
-	 */
-	public static String matchSource(String url,String html, List<String> sourceList) {
-		String source = null;
-		Document document = Jsoup.parse(html);
-		String htmlBody = filterSpecialCharacter(document.select("body").text().toUpperCase());
-		try {
-			/***特定网站单独处理**/
-			if(url.contains("thepaper.cn")){  
-				//单独处理澎湃数据
-				source = document.select("div.news_about").text();
-			}else if(url.contains("sports.eastday.com")){   
-				//单独处理东方体育网
-				source = document.select("div.article").select("span").text();
-			}else if(url.contains("lesports.com")){    
-				//单独处理乐视网数据
-				source = document.select("div.article-source").select("strong").text();
-			}else if(url.contains("myzaker.com")){   
-				//单独处理扎克网数据
-				source = document.select("div#article").select("span.auther").text();
-			}else if(url.contains("sina.com.cn") || url.contains("sohu.com")){ 
-				//单独处理新浪网
-				if(html.contains("<meta name=\"mediaid\"")){
-					source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0];
-				}
-			}else if(url.contains("a.mini.eastday.com")){  
-				//处理东方头条网-自媒体号匹配
-//				source = document.select("[class=\"share_cnt_p clearfix\"]").select("div.fl").select("i").get(1).text();
-				source = "东方头条";
-			}else if(url.contains("orz520.com")){
-				//千寻生活网解析
-				source = "千寻生活";
-			}else if(url.contains("sh.qihoo.com")){
-				//今日报点解析
-				source = "今日爆点";
-			}else if(url.contains("itouchtv.cn")){
-				//触电新闻解析
-				source = "触电新闻";
-			}else if(url.contains("yidianzixun.com")){
-				//一点资讯
-				if(html.contains("related_wemedia")){
-					source = "一点资讯";
-				}else{
-					source = html.split("source\":\"")[1].split("\",\"")[0];
-				}
-			}else{ 
-				//其他网站处理
-				source = mathchOtherSource(html, htmlBody, sourceList);
-			}
-			if(source!=null){
-				//验证来源
-				for (String sourceMatch : sourceList) {
-					if (source.contains(sourceMatch)) {
-						return sourceMatch;
-					}
-				}
-			}
-		} catch (Exception e) {
-			e.printStackTrace();
-		}
-		return null;
-	}
-	
-	/**
-	 * @Title: matchMediaSelfSource 
-	 * @author hero 
-	 * @Description: 验证及匹配自媒体号
-	 * @param @param url
-	 * @param @param html
-	 * @param @return 设定文件 
-	 * @return String 返回类型
-	 */
-	public static String matchMediaSelfSource(String url,String html) {
-		String source = null;
-		Document document = Jsoup.parse(html);
-		try {
-			/***特定网站单独处理**/
-			if(url.contains("toutiao.com")){  
-				//今日头条帐号匹配
-				if(html.contains("name: '")){
-					source = html.split("mediaInfo:")[1].split("name: '")[1].split("',")[0].trim();
-				}else if(html.contains("screen_name:")){
-					source = html.split("screen_name:'")[1].split("',")[0].trim();
-				}
-				if(source!=null && !source.equals("")){
-					source =  "今日头条-" + source;
-				}
-			}else if(url.contains("sohu.com")){ 
-				//搜狐自媒体号
-				if(html.contains("<meta name=\"mediaid\"")){
-					source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
-					if(source!=null && !source.equals("")){
-						source =  "搜狐-" + source;
-					}
-				}
-			}else if(url.contains("a.mini.eastday.com")){  
-				//处理东方头条网-自媒体号匹配
-				source = document.select("[class=\"share_cnt_p clearfix\"]").select("div.fl").select("i").get(1).text().trim();
-				if(source!=null && !source.equals("")){
-					source =  "东方头条-" + source;
-				}
-			}else if(url.contains("sh.qihoo.com")){
-				//今日报点解析
-				source = document.select("p.info").select("span.source").text().trim();
-				if(source!=null && !source.equals("")){
-					source =  "快资讯-" + source;
-				}
-			}else if(url.contains("cj.sina.com.cn")){
-				//新浪财经头条号
-				if(html.contains("<meta name=\"mediaid\"")){
-					source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
-					if(source!=null && !source.equals("")){
-						source = "财经头条-" + source;
-					}
-				}
-			}else if(url.contains("baijia.baidu.com")){
-				//百度百家
-				source = document.select("section.info").select("span.author").text().trim();
-				if(source!=null && !source.equals("")){
-					source = "百度百家-" + source;
-				}
-			}else if(url.contains("yidianzixun.com")){
-				//一点资讯
-				if(html.contains("related_wemedia")){
-					source = html.split("media_name\":\"")[1].split("\",\"")[0].trim();
-					if(source!=null && !source.equals("")){
-						source = "一点资讯-" + source;
-					}
-				}else{
-					source = html.split("source\":\"")[1].split("\",\"")[0];
-				}
-			}else if(url.contains("news.bitauto.com")){
-				source = document.select("[class=\"gz-box clearfix\"]").select("div.txt-box")
-						.select("p.p-n").select("a").text();
-				if(source!=null && !source.equals("")){
-					source = "易车网-" + source;
-				}
-			}else if(url.contains("chejiahao.autohome.com.cn")){
-				source = document.select("div.authorMes").select("[class=\"name text-overflow\"]")
-						.select("a").text();
-				if(source!=null && !source.equals("")){
-					source = "汽车之家-" + source;
-				}
-			}
-			return source;
-		} catch (Exception e) {
-			return null;
-		}
-	}
-	
-
-	/**
-	 * @Title: matchChannel
-	 * @author hero
-	 * @Description: TODO(匹配频道)
-	 * @param @param
-	 *            list
-	 * @param @return
-	 *            设定文件
-	 * @return String 返回类型
-	 */
-	public static String matchChannel(List<Node> list) {
-		/** 验证频道标签 **/
-		String channel = "新闻";
-		try {
-			for (Node node : list) {
-				if (node.outerHtml().contains("<title>")) {
-					String[] content = node.toString().split("<title>")[1].split("</title>")[0].split("_");
-					String channelMatch = "";
-					for (int i = 0; i < content.length; i++) {
-						if (i > 0) {
-							channelMatch += content[i] + "_";
-						}
-					}
-					channel = getChannel(channelMatch);
-					break;
-				}
-			}
-		} catch (Exception e) {
-			return channel;
-		}
-		return channel;
-	}
-	
-	/**
-	 * 
-	 * @Title: mathchOtherSource 
-	 * @author hero 
-	 * @Description: 匹配通用结果数据 
-	 * @param @param html
-	 * @param @param htmlBody
-	 * @param @param length
-	 * @param @return 设定文件 
-	 * @return String 返回类型
-	 */
-	private static String mathchOtherSource(String html,String htmlBody,List<String> sourceList){
-		/** 正文抽取,目的是避免正文中存在相应匹配规则 **/
-		String source = null;
-		try {
-			News news = ContentExtractor.getNewsByHtml(html);
-			String content = filterSpecialCharacter(news.getContent().toUpperCase());
-			String title = filterSpecialCharacter(news.getTitle().toUpperCase());
-			/**剔除正文**/
-			String text = htmlBody.replace(content, "@@@@@@@@@@");
-			/**分割正文**/
-			String[] matchTextArr = text.split("@@@@@@@@@@");
-			if(regex(fromRegex, matchTextArr[0]) != null ||  regex(fromRegex, matchTextArr[1])!=null){
-				if(regex(fromRegex, matchTextArr[0])!=null){
-					source = regex(fromRegex, matchTextArr[0]);
-					for (String sourceMatch : sourceList) {
-						if (source.contains(sourceMatch)) {
-							return sourceMatch;
-						}
-					}
-				}else if(regex(fromRegex, matchTextArr[1])!=null){
-					source = regex(fromRegex, matchTextArr[1]);
-					for (String sourceMatch : sourceList) {
-						if (source.contains(sourceMatch)) {
-							return sourceMatch;
-						}
-					}
-				}
-			}else{
-				if(matchTextArr[0].contains(title)){    
-					/***判断是否包含标题，如果包含标题则以标题截取数据
-					 * 验证数据为 主要匹配  YYYY-MM-dd  xx日报  
-					 *                或   xx日报   YYYY-MM-dd 
-					 * ***/
-					String[] titlesArr = matchTextArr[0].split(title);
-					for(int j = 0;j<titlesArr.length; j++){
-						String timeSource = regex(timeRegex, titlesArr[j]);
-						if(timeSource!=null){
-							source = getSourceByTime(timeSource, titlesArr[j], sourceList);
-							if(source != null){
-								return source;
-							}
-						}
-					}
-				}
-				
-				if(matchTextArr[1].contains(title)){    
-					/***判断是否包含标题，如果包含标题则以标题截取数据
-					 * 验证数据为 主要匹配  YYYY-MM-dd  xx日报  
-					 *                或   xx日报   YYYY-MM-dd 
-					 * ***/
-					String[] titlesArr = matchTextArr[1].split(title);
-					for(int j = 0;j<titlesArr.length; j++){
-						String timeSource = regex(timeRegex, titlesArr[j]);
-						if(timeSource!=null){
-							source = getSourceByTime(timeSource, titlesArr[j], sourceList);
-							if(source != null){
-								return source;
-							}
-						}
-					}
-				}
-			}
-			
-			/***正文外无相关数据，匹配正文**/
-			if(source == null ){
-				/***
-				 * 匹配命中包含来源等规则的数据
-				 */
-				source = regex(fromRegex, content);
-				if(source!=null){
-					for (String sourceMatch : sourceList) {
-						if (source.contains(sourceMatch)) {
-							return sourceMatch;
-						}
-					}
-				}else {    
-					/***判断是否包含标题，如果包含标题则以标题截取数据
-					 * 验证数据为 主要匹配  YYYY-MM-dd  xx日报  
-					 *                或   xx日报   YYYY-MM-dd 
-					 * ***/
-					if(content.contains(title)){  /**正文中包含标题**/
-						String[] titlesArr = content.split(title);
-						for(int j = 0;j<titlesArr.length; j++){
-							String timeSource = regex(timeRegex, titlesArr[j]);
-							if(timeSource!=null){
-								source = getSourceByTime(timeSource, titlesArr[j], sourceList);
-								if(source != null){
-									return source;
-								}
-							}
-						}
-					}else{  /**正文中不包含标题**/
-						String timeSource = regex(timeRegex, content);
-						if(timeSource!=null){
-							source = getSourceByTime(timeSource, content, sourceList);
-							if(source != null){
-								return source;
-							}
-						}
-					}
-				}
-			}
-		} catch (Exception e) {
-			System.out.println("正文抽取失败处理........");
-			e.printStackTrace();
-			/***
-			 * 匹配正文失败
-			 * 匹配命中包含来源等规则的数据
-			 */
-			source = regex(fromRegex, htmlBody);
-			if (source != null) {
-				for (String sourceMatch : sourceList) {
-					if (source.contains(sourceMatch)) {
-						return sourceMatch;
-					}
-				}
-			} else {
-				/***判断是否包含标题，如果包含标题则以标题截取数据
-				 * 验证数据为 主要匹配  YYYY-MM-dd  xx日报  
-				 *                或   xx日报   YYYY-MM-dd 
-				 * ***/
-				String timeSource = regex(timeRegex, htmlBody);
-				if(timeSource!=null){
-					source = getSourceByTime(timeSource, htmlBody, sourceList);
-					if(source != null){
-						return source;
-					}
-				}
-			}
-		}
-		return null;
-	}
-	

 	/***
 	 * 
@@ -404,148 +36,6 @@ public class TreateData {
 	}
 	
 	
-	/**
-	 * @Title: getSourceByTime 
-	 * @author hero 
-	 * @Description: TODO(根据匹配时间截取数据) 
-	 * @param @param htmlBody
-	 * @param @return 设定文件 
-	 * @return String 返回类型
-	 */
-	private static String getSourceByTime(String timeSource, String htmlBody,List<String> sourceList){
-		
-		/**以时间做分割，匹配来源信息。
-		 * 主要匹配  YYYY-MM-dd  xx日报  
-		 * 或  xx日报   YYYY-MM-dd 
-		 ***/
-		String times[] = htmlBody.split(timeSource);
-		for (int j = 0; j < times.length; j++) {
-			String timecontent = times[j];
-			if (j == 0) {
-				if (timecontent.length() >= 30) {
-					timecontent = timecontent.substring(timecontent.length() - 30, timecontent.length());
-				} else {
-					timecontent = timecontent.substring(0, timecontent.length());
-				}
-			} else {
-				if (timecontent.length() >= 30) {
-					timecontent = timecontent.substring(0, 30);
-				} else {
-					timecontent = timecontent.substring(0, timecontent.length());
-				}
-			}
-			
-			for (String sourceMatch : sourceList) {
-				if (timecontent.contains(sourceMatch)) {
-					return sourceMatch;
-				}
-			}
-		}
-		return null;
-	}
-	
-	
-	
-	
-	/**
-	 * @Title: getChannel
-	 * @author hero
-	 * @Description: TODO(渠道验证)
-	 * @param @param
-	 *            source
-	 * @param @return
-	 *            设定文件
-	 * @return String 返回类型
-	 */
-	public static String getChannel(String source) {
-		String channel = "新闻";
-		if (source.contains("财经")) {
-			channel = "财经";
-		} else if (source.contains("金融")) {
-			channel = "金融";
-		} else if (source.contains("经济")) {
-			channel = "经济";
-		} else if (source.contains("科技")) {
-			channel = "科技";
-		} else if (source.contains("时尚")) {
-			channel = "时尚";
-		} else if (source.contains("互联网")) {
-			channel = "互联网";
-		} else if (source.contains("数码")) {
-			channel = "数码";
-		} else if (source.contains("科学")) {
-			channel = "科学";
-		} else if (source.contains("TMT")) {
-			channel = "TMT";
-		} else if (source.contains("通讯")) {
-			channel = "通讯";
-		} else if (source.contains("社会")) {
-			channel = "社会";
-		}else if (source.contains("IT")) {
-			channel = "IT";
-		}else if (source.contains("房产")) {
-			channel = "房产";
-		}else if (source.contains("母婴")) {
-			channel = "母婴";
-		}else if (source.contains("3C")) {
-			channel = "3C";
-		}
-		return channel;
-	}
-	
-	
-	/**
-	 * @Title: verifyChannel 
-	 * @author hero 
-	 * @Description: 根据链接验证文章频道
-	 * @param @param url
-	 * @param @return 设定文件 
-	 * @return String 返回类型
-	 */
-	public static String verifyChannel(String url){
-		String channel = null;
-		if(url.contains("news.") || url.contains("cj.sina.com.cn") 
-				|| url.contains("wemedia.ifeng.com")){
-			channel = "新闻";
-		}else if(url.contains("finance.") || url.contains("business.")
-				|| url.contains("money.") || url.contains("stock.")
-				|| url.contains("10jqka.com.cn")){
-			channel = "财经";
-		}else if(url.contains("tech.") || url.contains("it.") 
-				|| url.contains("pcedu.") || url.contains("mobile.")
-				|| url.contains("vr.")){
-			channel = "科技";
-		}else if(url.contains("sports.")){
-			channel = "体育";
-		}else if(url.contains("ent.") || url.contains("yule.")){
-			channel = "娱乐";
-		}else if(url.contains("auto.")){
-			channel = "汽车";
-		}else if(url.contains("fashion.")){
-			channel = "时尚";
-		}else if(url.contains("learning.") || url.contains("edu.")){
-			channel = "教育";
-		}else if(url.contains("baobao.")){
-			channel = "母婴";
-		}else if(url.contains("house.") ||url.contains("leju.")
-				|| url.contains("focus.")){
-			channel = "房产";
-		}else if(url.contains("games.")){
-			channel = "游戏";
-		}else if(url.contains("intl.")){
-			channel = "国际";
-		}else if(url.contains("science.")){
-			channel = "科学";
-		}else if(url.contains("city.")){
-			channel = "城市";
-		}else if(url.contains("sc.")){
-			channel = "市场";
-		}
-		return channel;
-	}
-	
-	
-	
 	public static String filterSpecialCharacter(String str) {
 		try {
 			String regEx = "【[`~!@#$%^&*()+=|{}';'//[//].<>/?~！@#%……&*——+|｛｝“”；‘’，。、·]】";

--- a/src/test/java/com/zhiwei/source_forward/sourceforward/test/MediaSelfSourceTest.java
+++ b/src/test/java/com/zhiwei/source_forward/sourceforward/test/MediaSelfSourceTest.java
+package com.zhiwei.source_forward.sourceforward.test;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.junit.Test;
+
+import com.zhiwei.source_forward.run.SourceForward;
+
+/**
+ * Manual smoke test for {@code SourceForward.getMediaSelfSource}.
+ * 
+ * @author hero 
+ */
+public class MediaSelfSourceTest {
+	
+	/**
+	 * Calls {@code SourceForward.getMediaSelfSource} with a single Toutiao article URL
+	 * mapped to an empty data map. No assertion is made on the outcome.
+	 */
+	@Test
+	public void sourceForwardTest(){
+		String url = "https://www.toutiao.com/a6549872248428167687/";
+		Map<String,Map<String,Object>> dataMap = new HashMap<String,Map<String,Object>>();
+		dataMap.put(url, new HashMap<String,Object>());
+		
+		SourceForward.getMediaSelfSource(dataMap);
+	}
+
+}