搜索引擎采集项目初次提交，项目中主要包含

1.百度新闻采集 2.360新闻采集 3.搜狗新闻采集

搜索引擎采集项目初次提交，项目中主要包含
1.百度新闻采集 2.360新闻采集 3.搜狗新闻采集
41547bad · zhiwei · 41547bad · 41547bad · 41547bad · 41547bad
Commit 41547bad authored Feb 26, 2018 by zhiwei
6 changed files
--- a/pom.xml
+++ b/pom.xml
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>com.zhiwei</groupId>
+  <artifactId>media_data_crawler</artifactId>
+  <version>0.0.1-SNAPSHOT</version>
+  <name>media_data_crawler</name>
+  <description>网媒数据抓取，包含百度新闻、搜狗新闻、360新闻等</description>
+  <dependencies>
+  	<dependency>
+  		<groupId>com.zhiwei</groupId>
+  		<artifactId>zhiweiTools</artifactId>
+  		<version>0.0.6-SNAPSHOT</version>
+  	</dependency>
+  </dependencies>
+  
+  
+  <!-- Build and packaging configuration -->
+	<build>
+		<plugins>
+			<!-- Publish the sources: attach a sources jar to the build -->
+        	<plugin>
+				<artifactId>maven-source-plugin</artifactId>
+				<version>2.4</version>
+				<configuration>
+				<attach>true</attach>
+				</configuration>
+				<executions>
+					<execution>
+						<phase>compile</phase>
+						<goals>
+							<goal>jar</goal>
+						</goals>
+					</execution>
+				</executions>
+			</plugin>
+			<plugin>
+   				<groupId>org.apache.maven.plugins</groupId>
+   				<artifactId>maven-javadoc-plugin</artifactId>
+   				<version>2.10.4</version>
+			</plugin>
+
+			<!-- Pass -Dfile.encoding=UTF-8 to the test JVM so Chinese console output of "mvn test" is not garbled -->
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-surefire-plugin</artifactId>
+				<version>2.19.1</version>
+				<configuration>
+					<forkMode>once</forkMode>
+					<argLine>-Dfile.encoding=UTF-8</argLine>
+				</configuration>
+			</plugin>
+		</plugins>
+	</build>
+
+
+	<!-- Distribution management: repositories that snapshot and release artifacts are deployed to -->
+	<distributionManagement>
+		<!-- NOTE(review): this snapshot repository uses the same id ("nexus-releases") as the release repository below; confirm it matches the intended server entry in settings.xml -->
+		<snapshotRepository>
+			<id>nexus-releases</id>
+			<name>User Porject Snapshot</name>
+			<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
+			<uniqueVersion>true</uniqueVersion>
+		</snapshotRepository>
+		<repository>
+			<id>nexus-releases</id>
+			<name>User Porject Release</name>
+			<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
+		</repository>
+	</distributionManagement>
+  
+  
+</project>
\ No newline at end of file
--- a/src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
+package com.zhiwei.media_data_crawler.crawler;
+
+import java.io.IOException;
+import java.net.Proxy;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import com.zhiwei.media_data_crawler.entity.NewsData;
+import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
+import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
+import com.zhiwei.zhiweiTools.timeParse.TimeParse;
+import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
+
+/**
+ * Crawls Baidu News search result pages and turns every hit into a {@link NewsData}.
+ *
+ * <p>Pages are downloaded with the {@code get} helper made available by extending
+ * {@link HttpClientTemplateOK} and parsed with jsoup.
+ */
+public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
+
+	private static Logger logger = LoggerFactory.getLogger(BaiduNewsCrawlerParse.class);
+	/** Platform label stored in every record produced by this class. */
+	private static final String pt = "百度新闻";
+	/** Last page index a paging loop may request; paging starts at index 0. */
+	private static final int MAX_PAGE_INDEX = 20;
+	/** How many times a page download is attempted before giving up. */
+	private static final int DOWNLOAD_ATTEMPTS = 3;
+	/** Matches the "N identical news" marker that is removed from a summary; compiled once. */
+	private static final Pattern SAME_NEWS_PATTERN = Pattern.compile("\\d*条相同新闻");
+
+	/**
+	 * Collects Baidu News hits for a keyword using full-text matching ({@code tn=newsdy}).
+	 *
+	 * @param word      search keyword
+	 * @param startTime lower bound of the time range, in a format understood by {@code TimeParse}; may be null
+	 * @param endTime   upper bound of the time range, in a format understood by {@code TimeParse}; may be null
+	 * @param proxy     proxy handed to the HTTP helper
+	 * @return all records parsed from the visited pages; never null
+	 */
+	public static List<NewsData> getBaiduNewsData(String word, String startTime, String endTime, Proxy proxy){
+		return crawlSearchPages(word, startTime, endTime, proxy, "newsdy");
+	}
+
+	/**
+	 * Collects Baidu News hits for a keyword using title matching ({@code tn=newstitle}).
+	 *
+	 * @param word      search keyword
+	 * @param startTime lower bound of the time range; may be null
+	 * @param endTime   upper bound of the time range; may be null
+	 * @param proxy     proxy handed to the HTTP helper
+	 * @return all records parsed from the visited pages; never null
+	 */
+	public static List<NewsData> getBaiduNewsDataByTitle(String word, String startTime, String endTime, Proxy proxy){
+		return crawlSearchPages(word, startTime, endTime, proxy, "newstitle");
+	}
+
+	/**
+	 * Walks the "identical news" listing behind a search hit.
+	 *
+	 * @param url   absolute URL of the listing, without the {@code pn} paging parameter
+	 * @param word  keyword recorded on every produced record
+	 * @param proxy proxy handed to the HTTP helper
+	 * @return all records parsed from the visited pages; never null
+	 */
+	public static List<NewsData> getOherBaiduNewsData(String url, String word, Proxy proxy){
+		List<NewsData> list = new ArrayList<NewsData>();
+		// The bound lives in the loop header so a page reporting "more" cannot bypass the cap.
+		for (int page = 0; page <= MAX_PAGE_INDEX; page++) {
+			String htmlBody = downloadHtml(url, proxy, page);
+			if (htmlBody == null) {
+				break;
+			}
+			// Forward the caller's proxy so nested requests do not silently go out directly.
+			PageResult result = analysisData(htmlBody, proxy, word);
+			list.addAll(result.data);
+			if (!result.more) {
+				break;
+			}
+		}
+		return list;
+	}
+
+	/** Shared paging loop of the two keyword searches; {@code tn} selects the match mode. */
+	private static List<NewsData> crawlSearchPages(String word, String startTime, String endTime, Proxy proxy, String tn) {
+		List<NewsData> list = new ArrayList<NewsData>();
+		for (int page = 0; page <= MAX_PAGE_INDEX; page++) {
+			String htmlBody = downloadHtml(word, startTime, endTime, proxy, tn, page);
+			if (htmlBody == null) {
+				break;
+			}
+			PageResult result = analysisData(htmlBody, proxy, word);
+			list.addAll(result.data);
+			if (!result.more) {
+				break;
+			}
+		}
+		return list;
+	}
+
+	/**
+	 * Downloads one keyword search page.
+	 *
+	 * @param tn   {@code newsdy} for full-text matching, {@code newstitle} for title matching
+	 * @param page zero-based page index
+	 * @return the page body, or null when no URL could be built or every attempt failed
+	 */
+	private static String downloadHtml(String word, String startTime, String endTime, Proxy proxy, String tn,int page) {
+		return fetchWithRetry(getUrl(word, startTime, endTime, tn, page), proxy);
+	}
+
+	/** Downloads one page of an "identical news" listing; the listing is paged in steps of 30. */
+	private static String downloadHtml(String url, Proxy proxy, int page) {
+		return fetchWithRetry(url + "&pn=" + page * 30, proxy);
+	}
+
+	/** Requests {@code url} up to {@link #DOWNLOAD_ATTEMPTS} times; returns null when all attempts fail. */
+	private static String fetchWithRetry(String url, Proxy proxy) {
+		if (url == null) {
+			// getUrl yields null for a null keyword; there is nothing to request then.
+			return null;
+		}
+		Map<String,String> headerMap = HeaderTool.getCommonHead();
+		headerMap.put("Host", "news.baidu.com");
+		headerMap.put("Referer", url);
+		for (int attempt = 1; attempt <= DOWNLOAD_ATTEMPTS; attempt++) {
+			try {
+				return get(url, proxy, headerMap);
+			} catch (IOException e) {
+				// Pass the exception itself as last argument so the original stack trace is logged.
+				logger.error("获取数据时出现问题,问题为：{}", e.getMessage(), e);
+			}
+		}
+		return null;
+	}
+
+	/**
+	 * Parses one result page.
+	 *
+	 * @param htmlBody page body
+	 * @param proxy    proxy used when an "identical news" listing has to be fetched
+	 * @param word     keyword recorded on every produced record
+	 * @return the parsed records plus whether the pager offers a next page
+	 */
+	private static PageResult analysisData(String htmlBody, Proxy proxy, String word) {
+		List<NewsData> list = new ArrayList<NewsData>();
+		Document document = Jsoup.parse(htmlBody);
+		// jsoup's select() never returns null, so the pager text alone decides about a next page.
+		boolean more = document.select("p#page").text().contains("下一页");
+		for (Element element : document.select("div.result")) {
+			try {
+				list.add(parseResult(element, word));
+				// Only follow the "identical news" link when the hit actually carries one.
+				String similarHref = element.select("div.c-row").select("a.c-more_link").attr("href");
+				if (similarHref.length() > 0) {
+					list.addAll(getOherBaiduNewsData("http://news.baidu.com" + similarHref, word, proxy));
+				}
+			} catch (Exception e) {
+				// A malformed hit is logged and skipped; the remaining hits are still processed.
+				logger.error("百度新闻数据解析时出现问题，问题为:{}", e.getMessage(), e);
+			}
+		}
+		return new PageResult(list, more);
+	}
+
+	/** Builds the record for a single {@code div.result} element; all state is local to the element. */
+	private static NewsData parseResult(Element element, String word) {
+		Elements titleAnchor = element.select("h3.c-title").select("a");
+		String link = titleAnchor.attr("href");
+		String title = titleAnchor.text();
+
+		Elements row = element.select("div.c-row");
+		Elements author = row.select("p.c-author");
+		String authorText = author.text();
+
+		// The author line reads "source&nbsp;&nbsp;time"; without the separator it is treated as time only.
+		String source = null;
+		String time = authorText;
+		String[] parts = author.html().split("&nbsp;&nbsp;");
+		if (parts.length > 1) {
+			source = parts[0];
+			time = parts[1];
+		}
+		time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");
+
+		// The summary is the row text minus the leading author line and Baidu's trailing markers.
+		String content = row.text().substring(authorText.length());
+		content = SAME_NEWS_PATTERN.matcher(content).replaceAll("").replace("-", "").replace("百度快照", "");
+
+		return new NewsData(link, title, source, time, content, pt, word);
+	}
+
+	/**
+	 * Builds the search URL.
+	 *
+	 * @param word      search keyword; a null keyword yields a null URL
+	 * @param startTime lower bound of the time range; null is sent as 0
+	 * @param endTime   upper bound of the time range; null is sent as 0
+	 * @param tn        match mode ({@code newsdy} or {@code newstitle})
+	 * @param page      zero-based page index; the search is paged in steps of 50
+	 * @return the URL, or null when {@code word} is null
+	 */
+	private static String getUrl(String word, String startTime, String endTime, String tn, int page){
+		if (word == null) {
+			return null;
+		}
+		// Baidu takes the range bounds as epoch seconds.
+		long bt = startTime == null ? 0 : TimeParse.stringFormartDate(startTime).getTime() / 1000;
+		long et = endTime == null ? 0 : TimeParse.stringFormartDate(endTime).getTime() / 1000;
+		return "http://news.baidu.com/ns?from=news&cl=2&bt=" + bt
+				+ "&et=" + et + "&q1=" + URLCodeUtil.getURLEncode(word, "utf-8") + "&q3=&q4=&tn=" + tn
+				+ "&ct=0&rn=50&clk=sortbytime&q6=&pn=" + page * 50;
+	}
+
+	/** Parsed records of one page together with the "has next page" flag. */
+	private static final class PageResult {
+		private final List<NewsData> data;
+		private final boolean more;
+
+		private PageResult(List<NewsData> data, boolean more) {
+			this.data = data;
+			this.more = more;
+		}
+	}
+
+}
--- a/src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
+package com.zhiwei.media_data_crawler.crawler;
+
+import java.io.IOException;
+import java.net.Proxy;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import com.zhiwei.media_data_crawler.entity.NewsData;
+import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
+import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
+import com.zhiwei.zhiweiTools.timeParse.TimeParse;
+import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
+
+/**
+ * Crawls 360 News (news.so.com) search result pages and turns every hit into a {@link NewsData}.
+ *
+ * <p>Pages are downloaded with the {@code get} helper made available by extending
+ * {@link HttpClientTemplateOK} and parsed with jsoup.
+ */
+public class SoNewsCrawlerParse extends HttpClientTemplateOK {
+
+	private static Logger logger = LoggerFactory.getLogger(SoNewsCrawlerParse.class);
+	/** Platform label stored in every record produced by this class. */
+	private static final String pt = "360新闻";
+	/** Last page number a paging loop may request; paging starts at 1. */
+	private static final int MAX_PAGE = 50;
+	/** How many times a page download is attempted before giving up. */
+	private static final int DOWNLOAD_ATTEMPTS = 3;
+
+	/**
+	 * Collects 360 News hits for a keyword using full-text matching ({@code tn=news}).
+	 *
+	 * @param word  search keyword
+	 * @param proxy proxy handed to the HTTP helper
+	 * @return all records parsed from the visited pages; never null
+	 */
+	public static List<NewsData> getSoNewsData(String word, Proxy proxy) {
+		return crawl(word, "news", proxy, false);
+	}
+
+	/**
+	 * Collects 360 News hits for a keyword using title matching ({@code tn=newstitle}).
+	 *
+	 * @param word  search keyword
+	 * @param proxy proxy handed to the HTTP helper
+	 * @return all records parsed from the visited pages; never null
+	 */
+	public static List<NewsData> getSoNewsDataByTitle(String word, Proxy proxy) {
+		return crawl(word, "newstitle", proxy, true);
+	}
+
+	/** Shared paging loop; {@code titleLayout} selects the markup the result page is parsed with. */
+	private static List<NewsData> crawl(String word, String tn, Proxy proxy, boolean titleLayout) {
+		List<NewsData> list = new ArrayList<NewsData>();
+		// The bound lives in the loop header so a page reporting "more" cannot bypass the cap.
+		for (int page = 1; page <= MAX_PAGE; page++) {
+			String htmlBody = downloadHtml(word, tn, proxy, page);
+			if (htmlBody == null) {
+				break;
+			}
+			PageResult result = analysisPage(htmlBody, word, titleLayout);
+			list.addAll(result.data);
+			if (!result.more) {
+				break;
+			}
+		}
+		return list;
+	}
+
+	/**
+	 * Downloads one search page.
+	 *
+	 * @param word  search keyword
+	 * @param tn    {@code news} for full-text matching, {@code newstitle} for title matching
+	 * @param proxy proxy handed to the HTTP helper
+	 * @param page  one-based page number
+	 * @return the page body, or null when no URL could be built or every attempt failed
+	 */
+	private static String downloadHtml(String word, String tn, Proxy proxy, int page) {
+		String url = getUrl(word, tn, page);
+		if (url == null) {
+			// getUrl yields null for a null keyword; there is nothing to request then.
+			return null;
+		}
+		Map<String, String> headerMap = HeaderTool.getCommonHead();
+		// The Host header has to name the server that is actually contacted.
+		headerMap.put("Host", "news.so.com");
+		headerMap.put("Referer", url);
+		for (int attempt = 1; attempt <= DOWNLOAD_ATTEMPTS; attempt++) {
+			try {
+				return get(url, proxy, headerMap);
+			} catch (IOException e) {
+				// Pass the exception itself as last argument so the original stack trace is logged.
+				logger.error("获取360新闻数据时出现问题,问题为：{}", e.getMessage(), e);
+			}
+		}
+		return null;
+	}
+
+	/**
+	 * Parses one result page.
+	 *
+	 * @param htmlBody    page body
+	 * @param word        keyword recorded on every produced record
+	 * @param titleLayout true for the title-search markup, false for the full-text markup
+	 * @return the parsed records plus whether the pager offers a next page
+	 */
+	private static PageResult analysisPage(String htmlBody, String word, boolean titleLayout) {
+		List<NewsData> list = new ArrayList<NewsData>();
+		Document document = Jsoup.parse(htmlBody);
+		// jsoup's select() never returns null, so the pager text alone decides about a next page.
+		boolean more = document.select("div#page").text().contains("下一页");
+		for (Element element : document.select("ul#news").select("li")) {
+			try {
+				NewsData newsData = titleLayout ? parseTitleItem(element, word) : parseFullTextItem(element, word);
+				if (newsData != null) {
+					list.add(newsData);
+				}
+			} catch (Exception e) {
+				// A malformed hit is logged and skipped; the remaining hits are still processed.
+				logger.error("360新闻数据解析时出现问题，问题为:{}", e.getMessage(), e);
+			}
+		}
+		return new PageResult(list, more);
+	}
+
+	/**
+	 * Builds the record for one list item of a full-text result page.
+	 *
+	 * @return the record, or null for items whose class attribute is exactly
+	 *         {@code res-list hasimg hasmediav}, which are skipped
+	 */
+	private static NewsData parseFullTextItem(Element element, String word) {
+		if ("res-list hasimg hasmediav".equals(element.attr("class"))) {
+			return null;
+		}
+		Elements titleAnchor = element.select("h3").select("a");
+		Elements info = element.select("p.newsinfo");
+		String source = info.select("span.sitename").text();
+		String time = info.select("span.posttime").attr("title");
+		time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");
+		String content = element.select("p.content").text();
+		return new NewsData(titleAnchor.attr("href"), titleAnchor.text(), source, time, content, pt, word);
+	}
+
+	/** Builds the record for one list item of a title-search result page; no summary is extracted there. */
+	private static NewsData parseTitleItem(Element element, String word) {
+		Elements titleAnchor = element.select("a.news_title");
+		Elements info = element.select("div.ntinfo");
+		String source = info.select("span.stname").text();
+		String time = info.select("span.pdate").text();
+		time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");
+		return new NewsData(titleAnchor.attr("href"), titleAnchor.text(), source, time, null, pt, word);
+	}
+
+	/**
+	 * Builds the search URL.
+	 *
+	 * @param word search keyword; a null keyword yields a null URL
+	 * @param tn   {@code news} for full-text matching, {@code newstitle} for title matching
+	 * @param page one-based page number
+	 * @return the URL, or null when {@code word} is null
+	 */
+	private static String getUrl(String word, String tn, int page) {
+		if (word == null) {
+			return null;
+		}
+		return "https://news.so.com/ns?q=" + URLCodeUtil.getURLEncode(word, "utf-8") + "&tn=" + tn
+				+ "&rank=rank&j=0&nso=8&tp=10&nc=0&src=page&pn=" + page;
+	}
+
+	/** Parsed records of one page together with the "has next page" flag. */
+	private static final class PageResult {
+		private final List<NewsData> data;
+		private final boolean more;
+
+		private PageResult(List<NewsData> data, boolean more) {
+			this.data = data;
+			this.more = more;
+		}
+	}
+
+}
--- a/src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
+package com.zhiwei.media_data_crawler.crawler;
+
+import java.io.IOException;
+import java.net.Proxy;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import com.zhiwei.media_data_crawler.entity.NewsData;
+import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
+import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
+import com.zhiwei.zhiweiTools.timeParse.TimeParse;
+import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
+
+/**
+ * Crawls Sogou News search result pages and turns every hit into a {@link NewsData}.
+ *
+ * <p>Pages are downloaded with the {@code get} helper made available by extending
+ * {@link HttpClientTemplateOK} and parsed with jsoup.
+ */
+public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
+
+	private static Logger logger = LoggerFactory.getLogger(SougouNewsCrawlerParse.class);
+	/** Platform label stored in every record produced by this class. */
+	private static final String pt = "搜狗新闻";
+	/** Page number every paging loop starts at. */
+	private static final int FIRST_PAGE = 1;
+	/** Last page number requested by the full-text search. */
+	private static final int MAX_FULL_TEXT_PAGE = 100;
+	/** Last page number requested by the title search. */
+	private static final int MAX_TITLE_PAGE = 20;
+	/** Last page number requested for a "similar news" listing. */
+	private static final int MAX_SIMILAR_PAGE = 10;
+	/** How many times a page download is attempted before giving up. */
+	private static final int DOWNLOAD_ATTEMPTS = 3;
+
+	/**
+	 * Collects Sogou News hits for a keyword using full-text matching ({@code mode=1}).
+	 *
+	 * @param word  search keyword
+	 * @param proxy proxy handed to the HTTP helper
+	 * @return all records parsed from the visited pages; never null
+	 */
+	public static List<NewsData> getSougouNewsData(String word, Proxy proxy){
+		return crawlSearchPages(word, 1, proxy, MAX_FULL_TEXT_PAGE);
+	}
+
+	/**
+	 * Collects Sogou News hits for a keyword using title matching ({@code mode=2}).
+	 *
+	 * @param word  search keyword
+	 * @param proxy proxy handed to the HTTP helper
+	 * @return all records parsed from the visited pages; never null
+	 */
+	public static List<NewsData> getSougouNewsDataByTitle(String word, Proxy proxy){
+		return crawlSearchPages(word, 2, proxy, MAX_TITLE_PAGE);
+	}
+
+	/**
+	 * Walks the "similar news" listing behind a search hit.
+	 *
+	 * @param url   absolute URL of the listing, without the {@code page} parameter
+	 * @param word  keyword recorded on every produced record
+	 * @param proxy proxy handed to the HTTP helper
+	 * @return all records parsed from the visited pages; never null
+	 */
+	public static List<NewsData> getOherSougouNewsData(String url, String word, Proxy proxy){
+		List<NewsData> list = new ArrayList<NewsData>();
+		// The bound lives in the loop header so a page reporting "more" cannot bypass the cap.
+		for (int page = FIRST_PAGE; page <= MAX_SIMILAR_PAGE; page++) {
+			String htmlBody = downloadHtml(url, proxy, page);
+			if (htmlBody == null) {
+				break;
+			}
+			// Forward the caller's proxy so nested requests do not silently go out directly.
+			PageResult result = analysisData(htmlBody, proxy, word);
+			list.addAll(result.data);
+			if (!result.more) {
+				break;
+			}
+		}
+		return list;
+	}
+
+	/** Shared paging loop of the two keyword searches; both start at {@link #FIRST_PAGE}. */
+	private static List<NewsData> crawlSearchPages(String word, int mode, Proxy proxy, int lastPage) {
+		List<NewsData> list = new ArrayList<NewsData>();
+		for (int page = FIRST_PAGE; page <= lastPage; page++) {
+			String htmlBody = downloadHtml(word, mode, proxy, page);
+			if (htmlBody == null) {
+				break;
+			}
+			PageResult result = analysisData(htmlBody, proxy, word);
+			list.addAll(result.data);
+			if (!result.more) {
+				break;
+			}
+		}
+		return list;
+	}
+
+	/**
+	 * Downloads one keyword search page, announcing the previous page as referer.
+	 *
+	 * @param word  search keyword
+	 * @param mode  match rule: 1 for full-text matching, 2 for title matching
+	 * @param proxy proxy handed to the HTTP helper
+	 * @param page  page number
+	 * @return the page body, or null when no URL could be built or every attempt failed
+	 */
+	private static String downloadHtml(String word, int mode, Proxy proxy, int page) {
+		String url = getUrl(word, mode, page);
+		if (url == null) {
+			// getUrl yields null for a null keyword; there is nothing to request then.
+			return null;
+		}
+		return fetchWithRetry(url, getUrl(word, mode, page - 1), proxy);
+	}
+
+	/** Downloads one page of a "similar news" listing. */
+	private static String downloadHtml(String url, Proxy proxy, int page) {
+		// "page" needs its "=", otherwise the server receives an unknown parameter such as "page1".
+		String pageUrl = url + "&page=" + page;
+		return fetchWithRetry(pageUrl, pageUrl, proxy);
+	}
+
+	/** Requests {@code url} up to {@link #DOWNLOAD_ATTEMPTS} times; returns null when all attempts fail. */
+	private static String fetchWithRetry(String url, String referer, Proxy proxy) {
+		Map<String,String> headerMap = HeaderTool.getCommonHead();
+		headerMap.put("Host", "news.sogou.com");
+		headerMap.put("Referer", referer);
+		for (int attempt = 1; attempt <= DOWNLOAD_ATTEMPTS; attempt++) {
+			try {
+				return get(url, proxy, headerMap);
+			} catch (IOException e) {
+				// Pass the exception itself as last argument so the original stack trace is logged.
+				logger.error("获取搜狗新闻数据时出现问题,问题为：{}", e.getMessage(), e);
+			}
+		}
+		return null;
+	}
+
+	/**
+	 * Parses one Sogou News result page.
+	 *
+	 * @param htmlBody page body
+	 * @param proxy    proxy used when a "similar news" listing has to be fetched
+	 * @param word     keyword recorded on every produced record
+	 * @return the parsed records plus whether the pager offers a next page
+	 */
+	private static PageResult analysisData(String htmlBody, Proxy proxy, String word) {
+		List<NewsData> list = new ArrayList<NewsData>();
+		Document document = Jsoup.parse(htmlBody);
+		// jsoup's select() never returns null, so the pager text alone decides about a next page.
+		boolean more = document.select("div#pagebar_container").text().contains("下一页");
+		for (Element element : document.select("div.results").select("div.vrwrap")) {
+			try {
+				NewsData newsData = parseResult(element, word);
+				if (newsData != null) {
+					list.add(newsData);
+				}
+				// Only follow the "similar news" link when the hit actually carries one.
+				String similarHref = element.select("div.news-detail").select("div.news-info")
+						.select("p.news-txt").select("a#news_similar").attr("href");
+				if (similarHref.length() > 0) {
+					list.addAll(getOherSougouNewsData("http://news.sogou.com/news" + similarHref, word, proxy));
+				}
+			} catch (Exception e) {
+				// A malformed hit is logged and skipped; the remaining hits are still processed.
+				logger.error("搜狗新闻数据解析时出现问题，问题为:{}", e.getMessage(), e);
+			}
+		}
+		return new PageResult(list, more);
+	}
+
+	/**
+	 * Builds the record for a single {@code div.vrwrap} element; all state is local to the element.
+	 *
+	 * @return the record, or null when the element has no title text
+	 */
+	private static NewsData parseResult(Element element, String word) {
+		Elements titleAnchor = element.select("h3.vrTitle").select("a");
+		String link = titleAnchor.attr("href");
+		String title = titleAnchor.text();
+
+		Elements info = element.select("div.news-detail").select("div.news-info");
+		Elements from = info.select("p.news-from");
+
+		// The origin line reads "source&nbsp;time"; without the separator it is treated as time only.
+		String source = null;
+		String time = from.text();
+		String[] parts = from.html().split("&nbsp;");
+		if (parts.length > 1) {
+			source = parts[0];
+			time = parts[1];
+		}
+		time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");
+
+		// NOTE(review): the summary is looked up by the fixed id "summary_1" - confirm every hit uses that id.
+		String content = info.select("p.news-txt").select("span#summary_1").text();
+
+		// text() yields "" rather than null when nothing matched, so test for emptiness.
+		if (title == null || title.length() == 0) {
+			return null;
+		}
+		return new NewsData(link, title, source, time, content, pt, word);
+	}
+
+	/**
+	 * Builds the search URL.
+	 *
+	 * @param word search keyword; a null keyword yields a null URL
+	 * @param mode match rule: 1 for full-text matching, 2 for title matching
+	 * @param page page number; it is the only {@code page} parameter in the URL
+	 * @return the URL, or null when {@code word} is null
+	 */
+	private static String getUrl(String word, int mode ,int page){
+		if (word == null) {
+			return null;
+		}
+		return "http://news.sogou.com/news?mode=" + mode + "&media=&query="
+				+ URLCodeUtil.getURLEncode(word, "utf-8") + "&time=0&clusterId=&sort=1&dp=1&page=" + page;
+	}
+
+	/** Parsed records of one page together with the "has next page" flag. */
+	private static final class PageResult {
+		private final List<NewsData> data;
+		private final boolean more;
+
+		private PageResult(List<NewsData> data, boolean more) {
+			this.data = data;
+			this.more = more;
+		}
+	}
+
+}
--- a/src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
+package com.zhiwei.media_data_crawler.data;
+
+import java.net.Proxy;
+import java.util.List;
+
+import com.zhiwei.media_data_crawler.crawler.BaiduNewsCrawlerParse;
+import com.zhiwei.media_data_crawler.crawler.SoNewsCrawlerParse;
+import com.zhiwei.media_data_crawler.crawler.SougouNewsCrawlerParse;
+import com.zhiwei.media_data_crawler.entity.NewsData;
+
+/**
+ * Entry point that forwards each request to the matching crawler class.
+ *
+ * <p>Every method prints the stack trace of any exception raised by the crawler
+ * and returns null in that case instead of propagating it.
+ */
+public class DataCrawler {
+
+	/**
+	 * Baidu News, full-text matching on keyword and time range.
+	 *
+	 * @param word      search keyword
+	 * @param startTime start of the time range
+	 * @param endTime   end of the time range
+	 * @param proxy     proxy passed on to the crawler
+	 * @return the crawler's result, or null when the crawler threw
+	 */
+	public static List<NewsData> getBaiduNewsData(String word, String startTime, String endTime, Proxy proxy){
+		List<NewsData> result = null;
+		try {
+			result = BaiduNewsCrawlerParse.getBaiduNewsData(word, startTime, endTime, proxy);
+		} catch (Exception failure) {
+			failure.printStackTrace();
+		}
+		return result;
+	}
+
+	/**
+	 * Baidu News, title matching on keyword and time range.
+	 *
+	 * @param word      search keyword
+	 * @param startTime start of the time range
+	 * @param endTime   end of the time range
+	 * @param proxy     proxy passed on to the crawler
+	 * @return the crawler's result, or null when the crawler threw
+	 */
+	public static List<NewsData> getBaiduNewsDataByTitle(String word, String startTime, String endTime, Proxy proxy){
+		List<NewsData> result = null;
+		try {
+			result = BaiduNewsCrawlerParse.getBaiduNewsDataByTitle(word, startTime, endTime, proxy);
+		} catch (Exception failure) {
+			failure.printStackTrace();
+		}
+		return result;
+	}
+
+	/**
+	 * 360 News, full-text matching.
+	 *
+	 * @param word  search keyword
+	 * @param proxy proxy passed on to the crawler
+	 * @return the crawler's result, or null when the crawler threw
+	 */
+	public static List<NewsData> getSoNewsData(String word, Proxy proxy){
+		List<NewsData> result = null;
+		try {
+			result = SoNewsCrawlerParse.getSoNewsData(word, proxy);
+		} catch (Exception failure) {
+			failure.printStackTrace();
+		}
+		return result;
+	}
+
+	/**
+	 * 360 News, title matching.
+	 *
+	 * @param word  search keyword
+	 * @param proxy proxy passed on to the crawler
+	 * @return the crawler's result, or null when the crawler threw
+	 */
+	public static List<NewsData> getSoNewsDataByTitle(String word, Proxy proxy){
+		List<NewsData> result = null;
+		try {
+			result = SoNewsCrawlerParse.getSoNewsDataByTitle(word, proxy);
+		} catch (Exception failure) {
+			failure.printStackTrace();
+		}
+		return result;
+	}
+
+	/**
+	 * Sogou News, full-text matching.
+	 *
+	 * @param word  search keyword
+	 * @param proxy proxy passed on to the crawler
+	 * @return the crawler's result, or null when the crawler threw
+	 */
+	public static List<NewsData> getSougouNewsData(String word, Proxy proxy){
+		List<NewsData> result = null;
+		try {
+			result = SougouNewsCrawlerParse.getSougouNewsData(word, proxy);
+		} catch (Exception failure) {
+			failure.printStackTrace();
+		}
+		return result;
+	}
+
+	/**
+	 * Sogou News, title matching.
+	 *
+	 * @param word  search keyword
+	 * @param proxy proxy passed on to the crawler
+	 * @return the crawler's result, or null when the crawler threw
+	 */
+	public static List<NewsData> getSougouNewsDataByTitle(String word, Proxy proxy){
+		List<NewsData> result = null;
+		try {
+			result = SougouNewsCrawlerParse.getSougouNewsDataByTitle(word, proxy);
+		} catch (Exception failure) {
+			failure.printStackTrace();
+		}
+		return result;
+	}
+
+}
--- a/src/main/java/com/zhiwei/media_data_crawler/entity/NewsData.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/entity/NewsData.java
+package com.zhiwei.media_data_crawler.entity;
+
+import java.io.Serializable;
+
+/**
+ * @ClassName: NewsData 
+ * @Description:数据实体类
+ * @author hero 
+ * @date 2018年2月24日 下午5:51:31
+ */
+public class NewsData implements Serializable{
+
+	private static final long serialVersionUID = -4767006433365382515L;
+	
+	private String url;      //文章地址
+	private String title;     //文章标题
+	private String source;    //文章来源
+	private String time;       //文章时间
+	private String content;   //文章简介
+	private String pt;       //采集来源
+	private String word;      //采集关键词
+	
+	
+	
+	public NewsData() {}
+	
+	
+	public NewsData(String url, String title, String source, String time
+			,String content, String pt, String word) {
+		this.url = url;
+		this.title = title;
+		this.source = source;
+		this.time = time;
+		this.content = content;
+		this.pt = pt;
+		this.word = word;
+	}
+	
+	@Override
+	public String toString(){
+		return "new NewsData["
+				+ "url = " + url
+				+ ", title = " + title
+				+ ", source = " + source
+				+ ", time = " + time
+				+ ", content = " + content
+				+ ", pt = " + pt
+				+ ", word = " + word
+				+ "]";
+	}
+	
+	
+	
+	public String getUrl() {
+		return url;
+	}
+	public void setUrl(String url) {
+		this.url = url;
+	}
+	public String getTitle() {
+		return title;
+	}
+	public void setTitle(String title) {
+		this.title = title;
+	}
+	public String getSource() {
+		return source;
+	}
+	public void setSource(String source) {
+		this.source = source;
+	}
+	public String getTime() {
+		return time;
+	}
+	public void setTime(String time) {
+		this.time = time;
+	}
+	public String getContent() {
+		return content;
+	}
+	public void setContent(String content) {
+		this.content = content;
+	}
+	public String getPt() {
+		return pt;
+	}
+	public void setPt(String pt) {
+		this.pt = pt;
+	}
+	public String getWord() {
+		return word;
+	}
+	public void setWord(String word) {
+		this.word = word;
+	}
+	
+}