1.添加百度贴吧、天涯论坛、豆瓣（话题+日记）按照关键词采集程序

6c18504b · zhiwei · 06f917df · 6c18504b · 6c18504b · 6c18504b
Commit 6c18504b authored Mar 09, 2018 by zhiwei
9 changed files
--- a/README.md
+++ b/README.md
@@ -30,10 +30,26 @@
 	其它类的可看相应的源码，里面有休眠时间等设置
+	##### 更新提示（2018-03-08）
+	本次更新新增百度贴吧、天涯论坛、豆瓣（话题+日记）按关键词采集功能，使用demo如下：
+	//百度贴吧采集
+	String tiebaName = "京东";  //贴吧名称，指定贴吧内采集，无则为null
+	List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
+	//天涯论坛采集
+	List<LunTanData> luntanList = DataCrawler.getLunTanData(word, proxy, endTime);
+	//豆瓣采集
+	String type = "topic";   //topic 为指定话题采集，note为指定日记采集
+	List<DouBanData> doubanList = DataCrawler.getDouBanData(word, type, proxy);
 	##### 摘要
-	> 这是一个基于OKHttp+Jsoup实现的网页抓取及解析功能的搜索引擎采集爬虫，目前包含：百度新闻、搜狗新闻、360新闻、搜狗知乎采集四种根据关键词采集功能
+	> 这是一个基于OKHttp+Jsoup实现的网页抓取及解析功能的搜索引擎采集爬虫，
-	的爬虫项目
+	目前包含：百度新闻、搜狗新闻、360新闻、搜狗知乎采集、贴吧、天涯论坛、豆瓣 
+	按照关键词采集数据的爬虫项目
 	#####  maven
@@ -58,8 +74,18 @@
 		List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
 		//搜狗知乎采集
 		List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
+		//百度贴吧采集
+		String tiebaName = "京东";  //贴吧名称，指定贴吧内采集，无则为null
+		List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
+		//天涯论坛采集
+		List<LunTanData> luntanList = DataCrawler.getLunTanData(word, proxy, endTime);
+		//豆瓣采集
+		String type = "topic";   //topic 为指定话题采集，note为指定日记采集
+		List<DouBanData> doubanList = DataCrawler.getDouBanData(word, type, proxy);

--- a/src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
+package com.zhiwei.media_data_crawler.crawler;
+import java.io.IOException;
+import java.net.Proxy;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import com.zhiwei.media_data_crawler.entity.TiebaData;
+import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
+import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
+import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
+import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
+public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
+	private static Logger logger = LoggerFactory.getLogger(BaiduTiebaCrawlerParse.class);
+	/**
+	 * Searches Baidu Tieba for {@code word} and collects the parsed posts page by page.
+	 * <p>
+	 * Paging starts at page index 0 and stops as soon as a page body is {@code null},
+	 * the parser reports that there is no further page, or page index 50 has been
+	 * processed. {@code ZhiWeiTools.sleep(3000)} is called after every page.
+	 *
+	 * @param word      search keyword
+	 * @param proxy     proxy passed on to every HTTP request
+	 * @param tiebaName bar name passed on to the URL builder; may be {@code null}
+	 * @return the posts parsed from all visited pages, in page order
+	 * @throws Exception if downloading or parsing a page fails
+	 */
+	@SuppressWarnings("unchecked")
+	public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy, String tiebaName) throws Exception {
+		// Highest page index that may be requested.
+		final int maxPage = 50;
+		List<TiebaData> list = new ArrayList<TiebaData>();
+		int page = 0;
+		boolean more = true;
+		// The cap is part of the loop condition so that the "more" flag returned
+		// by the parser can never re-enable paging past the limit.
+		while (more && page <= maxPage) {
+			String htmlBody = downloadHtml(word, proxy, tiebaName, page);
+			if (htmlBody != null) {
+				Map<String, Object> dataMap = analysisData(htmlBody, proxy, word);
+				List<TiebaData> dataList = (List<TiebaData>) dataMap.get("data");
+				list.addAll(dataList);
+				more = (Boolean) dataMap.get("more");
+			} else {
+				more = false;
+			}
+			page++;
+			ZhiWeiTools.sleep(3000);
+		}
+		return list;
+	}
+	/**
+	 * Downloads one Baidu Tieba search result page, trying up to three times.
+	 *
+	 * @param word      search keyword
+	 * @param proxy     proxy passed on to the HTTP request
+	 * @param tiebaName bar name passed on to the URL builder; may be {@code null}
+	 * @param page      page index placed in the request URL
+	 * @return the response body returned by {@code get}
+	 * @throws Exception the {@link IOException} of the third failed attempt, or any
+	 *                   other exception raised while building or sending the request
+	 */
+	private static String downloadHtml(String word, Proxy proxy, String tiebaName,
+			int page) throws Exception{
+		// Start from the common request headers and add the site specific ones.
+		Map<String, String> headerMap = HeaderTool.getCommonHead();
+		String url = getUrl(word, tiebaName, page);
+		headerMap.put("Host", "tieba.baidu.com");
+		headerMap.put("Referer", url);
+		for (int i = 1; i <= 3; i++) {
+			try {
+				return get(url, proxy, headerMap);
+			} catch (IOException e) {
+				// Pass the exception itself as the trailing argument: calling
+				// fillInStackTrace() would replace its original stack trace.
+				logger.error("获取数据时出现问题,问题为：{}", e.toString(), e);
+				if (i == 3) {
+					throw e;
+				}
+			}
+		}
+		// Not reached: the loop either returns a body or rethrows on the third failure.
+		return null;
+	}
+	/**
+	 * Parses one Baidu Tieba search result page.
+	 * <p>
+	 * Every {@code div.s_post} element becomes one {@link TiebaData}. Entries whose
+	 * {@code font.p_violet} text does not contain a second space separated token
+	 * (used as the author) are skipped.
+	 *
+	 * @param htmlBody HTML of the result page
+	 * @param proxy    not used by this method
+	 * @param word     search keyword, stored on every created entry
+	 * @return a map with key {@code "data"} (a {@code List<TiebaData>}) and key
+	 *         {@code "more"} (a {@code Boolean}: the pager text contains "下一页" and
+	 *         the page held at least one post element)
+	 * @throws Exception declared for callers; parsing problems surface as runtime exceptions
+	 */
+	private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{
+		Map<String, Object> resultMap = new HashMap<String, Object>();
+		List<TiebaData> list = new ArrayList<TiebaData>();
+		Document document = Jsoup.parse(htmlBody);
+		// select() never returns null, so the existence of a next page is decided
+		// by the pager text alone.
+		boolean more = document.select("[class=\"pager pager-search\"]").text().contains("下一页");
+		Elements elementes = document.select("div.s_post");
+		for (Element element : elementes) {
+			Elements titleLink = element.select("span.p_title").select("a");
+			String title = titleLink.text().replace("回复:", "");
+			String link = "http://tieba.baidu.com" + titleLink.attr("href");
+			String tid = titleLink.attr("data-tid");
+			String source = element.select("a.p_forum").select("font.p_violet").text();
+			String content = element.select("div.p_content").text();
+			// The joined text is expected to be "<source> <author>"; check the token
+			// count explicitly instead of catching an index exception.
+			String[] violetParts = element.select("a").select("font.p_violet").text().split(" ");
+			if (violetParts.length < 2) {
+				logger.debug("无作者 或者 无来源");
+				continue;
+			}
+			String author = violetParts[1];
+			String time = element.select("font.p_date").text();
+			TiebaData tiebaData = new TiebaData(link, title, time, tid, source, author, content, word);
+			logger.debug("{}", tiebaData);
+			list.add(tiebaData);
+		}
+		// A page without any post element ends paging regardless of the pager text.
+		if (elementes.isEmpty()) {
+			more = false;
+		}
+		resultMap.put("data", list);
+		resultMap.put("more", more);
+		return resultMap;
+	}
+	/**
+	 * Builds the Baidu Tieba search URL.
+	 *
+	 * @param word      search keyword, URL-encoded into the {@code qw} parameter
+	 * @param tiebaName bar to search in, URL-encoded into the {@code kw} parameter;
+	 *                  {@code null} leaves {@code kw} empty
+	 * @param page      value of the {@code pn} parameter
+	 * @return the request URL, or {@code null} when {@code word} is {@code null}
+	 */
+	private static String getUrl(String word, String tiebaName, int page) {
+		String url = null;
+		if (word != null) {
+			String encodedWord = URLCodeUtil.getURLEncode(word, "utf-8");
+			// kw carries the bar name; it used to be filled with the keyword, which
+			// meant tiebaName was never sent to the server.
+			String encodedBar = tiebaName != null ? URLCodeUtil.getURLEncode(tiebaName, "utf-8") : "";
+			url = "http://tieba.baidu.com/f/search/res?isnew=1&kw=" + encodedBar + "&qw=" + encodedWord
+					+ "&rn=30&un=&only_thread=0&sm=1&sd=&ed=&pn=" + page;
+		}
+		logger.debug("Baidu Tieba search url: {}", url);
+		return url;
+	}
+}
--- a/src/main/java/com/zhiwei/media_data_crawler/crawler/DoubanCrawlerParse.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/crawler/DoubanCrawlerParse.java
+package com.zhiwei.media_data_crawler.crawler;
+import java.io.IOException;
+import java.net.Proxy;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import com.alibaba.fastjson.JSONObject;
+import com.zhiwei.media_data_crawler.entity.DouBanData;
+import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
+import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
+import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
+import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
+public class DoubanCrawlerParse extends HttpClientTemplateOK {
+	private static Logger logger = LoggerFactory.getLogger(DoubanCrawlerParse.class);
+	/**
+	 * Searches Douban for {@code word} and collects group topics or notes page by page.
+	 * <p>
+	 * {@code "topic"} pages are HTML and are handled by {@code analysisTopicData};
+	 * {@code "note"} pages are JSON and are handled by {@code analysisNoteData}.
+	 * Paging starts at page index 0 and stops when a page body is {@code null}, the
+	 * parser reports no further page, or page index 50 has been processed.
+	 *
+	 * @param word  search keyword
+	 * @param type  {@code "topic"} or {@code "note"}
+	 * @param proxy proxy passed on to every HTTP request
+	 * @return the entries parsed from all visited pages, in page order
+	 * @throws IllegalArgumentException if {@code type} is neither {@code "topic"} nor {@code "note"}
+	 * @throws Exception if downloading or parsing a page fails
+	 */
+	@SuppressWarnings("unchecked")
+	public static List<DouBanData> getDoubanData(String word, String type,Proxy proxy) throws Exception {
+		boolean topic = "topic".equals(type);
+		if (!topic && !"note".equals(type)) {
+			// Without a known type no URL can be built and no parser applies.
+			throw new IllegalArgumentException("unsupported type: " + type);
+		}
+		// Highest page index that may be requested.
+		final int maxPage = 50;
+		List<DouBanData> list = new ArrayList<DouBanData>();
+		int page = 0;
+		boolean more = true;
+		// The cap is part of the loop condition so that the parser's "more" flag
+		// cannot re-enable paging past the limit.
+		while (more && page <= maxPage) {
+			String htmlBody = downloadHtml(word, type, proxy, page);
+			if (htmlBody != null) {
+				// Each type must be routed to the parser that understands its format.
+				Map<String, Object> dataMap = topic
+						? analysisTopicData(htmlBody, proxy, word)
+						: analysisNoteData(htmlBody, proxy, word);
+				List<DouBanData> dataList = (List<DouBanData>) dataMap.get("data");
+				list.addAll(dataList);
+				more = (Boolean) dataMap.get("more");
+			} else {
+				more = false;
+			}
+			page++;
+		}
+		return list;
+	}
+	/**
+	 * Downloads one Douban search result page, trying up to three times.
+	 *
+	 * @param word  search keyword
+	 * @param type  {@code "topic"} or {@code "note"}, passed on to the URL builder
+	 * @param proxy proxy passed on to the HTTP request
+	 * @param page  page index used by the URL builder
+	 * @return the response body returned by {@code get}
+	 * @throws Exception the {@link IOException} of the third failed attempt, or any
+	 *                   other exception raised while building or sending the request
+	 */
+	private static String downloadHtml(String word, String type,Proxy proxy,
+			int page) throws Exception{
+		// Start from the common request headers and add the site specific ones.
+		Map<String, String> headerMap = HeaderTool.getCommonHead();
+		String url = getUrl(word, type, page);
+		// Both search URLs built by getUrl live on www.douban.com.
+		headerMap.put("Host", "www.douban.com");
+		headerMap.put("Referer", url);
+		for (int i = 1; i <= 3; i++) {
+			try {
+				return get(url, proxy, headerMap);
+			} catch (IOException e) {
+				// Pass the exception itself as the trailing argument: calling
+				// fillInStackTrace() would replace its original stack trace.
+				logger.error("获取数据时出现问题,问题为：{}", e.toString(), e);
+				if (i == 3) {
+					throw e;
+				}
+			}
+		}
+		// Not reached: the loop either returns a body or rethrows on the third failure.
+		return null;
+	}
+	/**
+	 * Downloads the page at {@code url}, trying up to three times.
+	 *
+	 * @param url   address to fetch; also sent as the {@code Referer} header
+	 * @param type  not used by this method
+	 * @param proxy proxy passed on to the HTTP request
+	 * @return the response body returned by {@code get}
+	 * @throws Exception the {@link IOException} of the third failed attempt, or any
+	 *                   other exception raised while sending the request
+	 */
+	private static String downloadHtml(String url, String type,Proxy proxy) throws Exception{
+		Map<String, String> headerMap = HeaderTool.getCommonHead();
+		headerMap.put("Referer", url);
+		for (int i = 1; i <= 3; i++) {
+			try {
+				return get(url, proxy, headerMap);
+			} catch (IOException e) {
+				// Pass the exception itself as the trailing argument: calling
+				// fillInStackTrace() would replace its original stack trace.
+				logger.error("获取数据时出现问题,问题为：{}", e.toString(), e);
+				if (i == 3) {
+					throw e;
+				}
+			}
+		}
+		// Not reached: the loop either returns a body or rethrows on the third failure.
+		return null;
+	}
+	/**
+	 * Parses one Douban group-topic search result page (HTML).
+	 * <p>
+	 * Every {@code tr.pl} row inside {@code div.topics} becomes one {@link DouBanData}
+	 * of type "话题"; the topic page behind each row is then fetched through
+	 * {@code getTopicSourceAndContent}. Rows for which that call returns
+	 * {@code null} are left out.
+	 *
+	 * @param htmlBody HTML of the result page
+	 * @param proxy    proxy used for the per-topic detail requests
+	 * @param word     not used by this method
+	 * @return a map with key {@code "data"} (a {@code List<DouBanData>}) and key
+	 *         {@code "more"} (a {@code Boolean}: the paginator shows "后页" and the
+	 *         page held at least one row)
+	 * @throws Exception declared for callers; parsing problems surface as runtime exceptions
+	 */
+	private static Map<String, Object> analysisTopicData(String htmlBody, Proxy proxy, String word) throws Exception{
+		Map<String, Object> resultMap = new HashMap<String, Object>();
+		List<DouBanData> list = new ArrayList<DouBanData>();
+		Document document = Jsoup.parse(htmlBody);
+		boolean more = document.select("div.paginator").select("span.next").text().contains("后页");
+		Elements elementes = document.select("div.topics").select("tr.pl");
+		if (elementes.isEmpty()) {
+			more = false;
+		}
+		for (Element element : elementes) {
+			Elements subjectLink = element.select("td.td-subject").select("a");
+			String link = subjectLink.attr("href");
+			String title = subjectLink.text();
+			String time = element.select("td.td-time").attr("title");
+			// The reply cell reads "<n>回应"; fall back to 0 when the number is missing
+			// or malformed so that one odd row cannot abort the whole crawl.
+			int replyCount = 0;
+			String[] replyParts = element.select("td.td-reply").select("span").text().split("回应");
+			if (replyParts.length > 0) {
+				try {
+					replyCount = Integer.parseInt(replyParts[0].trim());
+				} catch (NumberFormatException e) {
+					logger.debug("unparsable reply count for topic {}", link);
+				}
+			}
+			// The group name is read from the fourth cell of the row when present.
+			Elements cells = element.select("td");
+			String group = cells.size() > 3 ? cells.get(3).text() : null;
+			DouBanData douban = new DouBanData(link, title, group, null, time, null, "话题", replyCount, null);
+			douban = getTopicSourceAndContent(link, "话题",proxy, douban);
+			if (douban != null) {
+				list.add(douban);
+			}
+		}
+		resultMap.put("data", list);
+		resultMap.put("more", more);
+		return resultMap;
+	}
+	/**
+	 * Fetches a topic page and copies its time, source and content onto {@code douban}.
+	 * <p>
+	 * When the page body is {@code null} the entry is returned unchanged.
+	 * {@code ZhiWeiTools.sleep(1000)} is called after the request.
+	 *
+	 * @param url    address of the topic page
+	 * @param type   passed on to the download helper
+	 * @param proxy  proxy passed on to the HTTP request
+	 * @param douban entry to complete
+	 * @return {@code douban}, or {@code null} when downloading or parsing failed
+	 */
+	private static DouBanData getTopicSourceAndContent(String url,String type, Proxy proxy, DouBanData douban){
+		try {
+			String htmlBody =  downloadHtml(url, type, proxy);
+			if (htmlBody != null) {
+				Elements topicDoc = Jsoup.parse(htmlBody).select("div.topic-doc");
+				Elements header = topicDoc.select("h3");
+				douban.setTime(header.select("span.color-green").text());
+				douban.setSource(header.select("span.from").select("a").text());
+				douban.setContent(topicDoc.select("div#link-report").select("div.topic-content").text());
+			}
+			ZhiWeiTools.sleep(1000);
+			return douban;
+		} catch (Exception e) {
+			// Report through the class logger instead of stderr; the caller drops
+			// entries for which null is returned.
+			logger.error("failed to load douban topic detail, url={}", url, e);
+			return null;
+		}
+	}
+	/**
+	 * Parses one Douban note search response (JSON).
+	 * <p>
+	 * The {@code items} value is unescaped, parsed as HTML, and every
+	 * {@code div.result} element becomes one {@link DouBanData} of type "日记". The
+	 * note page behind each entry is then fetched through {@code getNoteSourceTime};
+	 * entries for which that call returns {@code null} are left out. Entries whose
+	 * {@code onclick} attribute carries no {@code "sid: "} marker are skipped.
+	 *
+	 * @param htmlBody JSON body of the search response
+	 * @param proxy    proxy used for the per-note detail requests
+	 * @param word     not used by this method
+	 * @return a map with key {@code "data"} (a {@code List<DouBanData>}) and key
+	 *         {@code "more"} (a {@code Boolean} taken from the JSON {@code more} field,
+	 *         {@code false} when {@code items} is absent)
+	 * @throws Exception if the body is not valid JSON
+	 */
+	private static Map<String, Object> analysisNoteData(String htmlBody, Proxy proxy, String word) throws Exception{
+		Map<String, Object> resultMap = new HashMap<String, Object>();
+		List<DouBanData> list = new ArrayList<DouBanData>();
+		JSONObject json = JSONObject.parseObject(htmlBody);
+		boolean more = json.getBooleanValue("more");
+		String rawItems = json.getString("items");
+		if (rawItems == null) {
+			// A response without items has nothing to parse and nothing to page to.
+			rawItems = "";
+			more = false;
+		}
+		// Strip the JSON escaping so the fragment can be parsed as HTML.
+		String items = rawItems.replace("\\n", "").replace("\\", "");
+		Elements elements = Jsoup.parse(items).select("div.result");
+		for (Element element : elements) {
+			Elements titleLink = element.select("div.title").select("h3").select("a");
+			// The note id sits between "sid: " and ", qcat" in the onclick handler.
+			String[] sidParts = titleLink.attr("onclick").split("sid: ");
+			if (sidParts.length < 2) {
+				logger.debug("skipping douban note without sid: {}", titleLink.text());
+				continue;
+			}
+			String sid = sidParts[1];
+			int sidEnd = sid.indexOf(", qcat");
+			if (sidEnd >= 0) {
+				sid = sid.substring(0, sidEnd);
+			}
+			String link = "https://www.douban.com/note/" + sid;
+			String title = titleLink.text();
+			// The info line reads "<n> 人喜欢" when the note has likes.
+			int likeNum = 0;
+			String info = element.select("div.title").select("div.info").text();
+			if (info.contains("人喜欢")) {
+				String[] likeParts = info.split(" 人喜欢");
+				if (likeParts.length > 0) {
+					try {
+						likeNum = Integer.parseInt(likeParts[0].trim());
+					} catch (NumberFormatException e) {
+						logger.debug("unparsable like count for note {}", link);
+					}
+				}
+			}
+			String content = element.select("div.content").select("p").text();
+			DouBanData douban = new DouBanData(link, title, null, null, null, content, "日记", null, likeNum);
+			douban = getNoteSourceTime(link, "日记", proxy, douban);
+			if (douban != null) {
+				list.add(douban);
+			}
+		}
+		resultMap.put("data", list);
+		resultMap.put("more", more);
+		return resultMap;
+	}
+	/**
+	 * Fetches a note page and copies its author, publication time and full text
+	 * onto {@code douban}.
+	 * <p>
+	 * The content already stored on the entry is only replaced when the note page
+	 * yields a non-empty text. When the page body is {@code null} the entry is
+	 * returned unchanged. {@code ZhiWeiTools.sleep(1000)} is called after the request.
+	 *
+	 * @param url    address of the note page
+	 * @param type   passed on to the download helper
+	 * @param proxy  proxy passed on to the HTTP request
+	 * @param douban entry to complete
+	 * @return {@code douban}, or {@code null} when downloading or parsing failed
+	 */
+	private static DouBanData getNoteSourceTime(String url,String type, Proxy proxy, DouBanData douban){
+		try {
+			String htmlBody =  downloadHtml(url, type, proxy);
+			if (htmlBody != null) {
+				Elements article = Jsoup.parse(htmlBody).select("div.article");
+				Elements container = article.select("div.note-container");
+				douban.setSource(container.attr("data-author"));
+				douban.setTime(container.select("span.pub-date").text());
+				// text() returns "" rather than null when nothing matches, so test
+				// for emptiness to keep the search snippet in that case.
+				String content = article.select("div#link-report").text();
+				if (!content.isEmpty()) {
+					douban.setContent(content);
+				}
+			}
+			ZhiWeiTools.sleep(1000);
+			return douban;
+		} catch (Exception e) {
+			// Report through the class logger instead of stderr; the caller drops
+			// entries for which null is returned.
+			logger.error("failed to load douban note detail, url={}", url, e);
+			return null;
+		}
+	}
+	/**
+	 * Builds the Douban search URL for the given result type.
+	 *
+	 * @param word search keyword, URL-encoded into the {@code q} parameter
+	 * @param type {@code "topic"} (group search, {@code start = page * 50}) or
+	 *             {@code "note"} (JSON search, {@code start = page * 20})
+	 * @param page zero-based page index
+	 * @return the request URL, or {@code null} when {@code word} is {@code null} or
+	 *         {@code type} is neither of the two supported values
+	 */
+	private static String getUrl(String word, String type,int page) {
+		String url = null;
+		if (word != null) {
+			String encodedWord = URLCodeUtil.getURLEncode(word, "utf-8");
+			// Constant-first equals keeps a null type from raising a NullPointerException.
+			if ("topic".equals(type)) {
+				url = "https://www.douban.com/group/search?q=" + encodedWord + "&start=" + page*50 + "&cat=1013&sort=time";
+			} else if ("note".equals(type)) {
+				url = "https://www.douban.com/j/search?q=" + encodedWord + "&start=" + page*20 + "&cat=1015";
+			}
+		}
+		logger.debug("Douban search url: {}", url);
+		return url;
+	}
+}
--- a/src/main/java/com/zhiwei/media_data_crawler/crawler/TianYaCrawlerParse.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/crawler/TianYaCrawlerParse.java
+package com.zhiwei.media_data_crawler.crawler;
+import java.io.IOException;
+import java.net.Proxy;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import com.zhiwei.media_data_crawler.entity.LunTanData;
+import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
+import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
+import com.zhiwei.zhiweiTools.timeParse.TimeParse;
+import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
+import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
+public class TianYaCrawlerParse extends HttpClientTemplateOK {
+	private static Logger logger = LoggerFactory.getLogger(TianYaCrawlerParse.class);
+	private static final String pt = "天涯论坛";
+	/**
+	 * Searches the Tianya forum for {@code word} and collects the parsed entries
+	 * page by page.
+	 * <p>
+	 * Paging starts at page index 0 and stops as soon as a page body is {@code null},
+	 * the parser reports that there is no further page (which includes reaching an
+	 * entry older than {@code endTime}), or page index 50 has been processed.
+	 * {@code ZhiWeiTools.sleep(3000)} is called after every page.
+	 *
+	 * @param word    search keyword
+	 * @param proxy   proxy passed on to every HTTP request
+	 * @param endTime time limit passed on to the parser; entries before it are dropped
+	 * @return the entries parsed from all visited pages, in page order
+	 * @throws Exception if downloading or parsing a page fails
+	 */
+	@SuppressWarnings("unchecked")
+	public static List<LunTanData> getLunTanData(String word, Proxy proxy, String endTime) throws Exception {
+		// Highest page index that may be requested.
+		final int maxPage = 50;
+		List<LunTanData> list = new ArrayList<LunTanData>();
+		int page = 0;
+		boolean more = true;
+		// The cap is part of the loop condition so that the parser's "more" flag
+		// cannot re-enable paging past the limit.
+		while (more && page <= maxPage) {
+			String htmlBody = downloadHtml(word, proxy, page);
+			if (htmlBody != null) {
+				Map<String, Object> dataMap = analysisData(htmlBody, proxy, word, endTime);
+				List<LunTanData> dataList = (List<LunTanData>) dataMap.get("data");
+				list.addAll(dataList);
+				more = (Boolean) dataMap.get("more");
+			} else {
+				more = false;
+			}
+			page++;
+			ZhiWeiTools.sleep(3000);
+		}
+		return list;
+	}
+	/**
+	 * Downloads one Tianya search result page, trying up to three times.
+	 *
+	 * @param word  search keyword
+	 * @param proxy proxy passed on to the HTTP request
+	 * @param page  page index placed in the request URL
+	 * @return the response body returned by {@code get}
+	 * @throws Exception the {@link IOException} of the third failed attempt, or any
+	 *                   other exception raised while building or sending the request
+	 */
+	private static String downloadHtml(String word, Proxy proxy,
+			int page) throws Exception{
+		// Start from the common request headers and add the site specific ones.
+		Map<String, String> headerMap = HeaderTool.getCommonHead();
+		String url = getUrl(word, page);
+		headerMap.put("Host", "search.tianya.cn");
+		headerMap.put("Referer", url);
+		for (int i = 1; i <= 3; i++) {
+			try {
+				return get(url, proxy, headerMap);
+			} catch (IOException e) {
+				// Pass the exception itself as the trailing argument: calling
+				// fillInStackTrace() would replace its original stack trace.
+				logger.error("获取数据时出现问题,问题为：{}", e.toString(), e);
+				if (i == 3) {
+					throw e;
+				}
+			}
+		}
+		// Not reached: the loop either returns a body or rethrows on the third failure.
+		return null;
+	}
+	/**
+	 * Parses one Tianya search result page.
+	 * <p>
+	 * Every {@code li} under {@code div.searchListOne ul} becomes one
+	 * {@link LunTanData}. Entries whose time is before {@code endTime} are dropped
+	 * and switch the {@code "more"} flag off; the remaining entries of the page are
+	 * still examined. Entries whose {@code p.source} block has fewer than two links
+	 * or two spans are skipped.
+	 *
+	 * @param htmlBody HTML of the result page
+	 * @param proxy    not used by this method
+	 * @param word     search keyword, stored on every created entry
+	 * @param endTime  time limit, parsed with {@code TimeParse.stringFormartDate}
+	 * @return a map with key {@code "data"} (a {@code List<LunTanData>}) and key
+	 *         {@code "more"} (a {@code Boolean})
+	 * @throws Exception declared for callers; parsing problems surface as runtime exceptions
+	 */
+	private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word, String endTime) throws Exception{
+		Map<String, Object> resultMap = new HashMap<String, Object>();
+		List<LunTanData> list = new ArrayList<LunTanData>();
+		Document document = Jsoup.parse(htmlBody);
+		boolean more = document.select("div.long-pages").select("a").text().contains("下一页");
+		Elements elementes = document.select("div.searchListOne").select("ul").select("li");
+		if (elementes.isEmpty()) {
+			more = false;
+		} else {
+			// The limit is the same for every entry, so parse it once per page.
+			Date endDate = TimeParse.stringFormartDate(endTime);
+			for (Element element : elementes) {
+				Elements sourceLinks = element.select("p.source").select("a");
+				Elements sourceSpans = element.select("p.source").select("span");
+				if (sourceLinks.size() < 2 || sourceSpans.size() < 2) {
+					// Not a regular result row; skipping it keeps one odd entry
+					// from aborting the whole crawl.
+					logger.debug("skipping tianya entry without source/author/time information");
+					continue;
+				}
+				Elements titleLink = element.select("div").select("h3").select("a");
+				String title = titleLink.text();
+				String link = titleLink.attr("href");
+				String content = element.select("div").select("p").text();
+				String source = sourceLinks.get(0).text();
+				String author = sourceLinks.get(1).text();
+				String time = sourceSpans.get(0).text();
+				Integer replyCount = 0;
+				try {
+					replyCount = Integer.valueOf(sourceSpans.get(1).text().trim());
+				} catch (NumberFormatException e) {
+					logger.debug("unparsable reply count for tianya entry {}", link);
+				}
+				LunTanData luntanData = new LunTanData(link, title, time, source, author, content, replyCount, pt, word);
+				Date date = TimeParse.stringFormartDate(time);
+				if (date.before(endDate)) {
+					more = false;
+				} else {
+					logger.debug("{}", luntanData);
+					list.add(luntanData);
+				}
+			}
+		}
+		resultMap.put("data", list);
+		resultMap.put("more", more);
+		return resultMap;
+	}
+	/**
+	 * Builds the Tianya search URL and prints it to standard output.
+	 *
+	 * @param word search keyword, URL-encoded into the {@code q} parameter
+	 * @param page value of the {@code pn} parameter
+	 * @return the request URL, or {@code null} when {@code word} is {@code null}
+	 */
+	private static String getUrl(String word, int page) {
+		String url;
+		if (word == null) {
+			url = null;
+		} else {
+			StringBuilder address = new StringBuilder("http://search.tianya.cn/bbs?q=");
+			address.append(URLCodeUtil.getURLEncode(word, "utf-8"));
+			address.append("&s=4&f=0&pn=").append(page);
+			url = address.toString();
+		}
+		System.out.println(url);
+		return url;
+	}
+}
--- a/src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
@@ -4,10 +4,16 @@ import java.net.Proxy;
 import java.util.List;
 import com.zhiwei.media_data_crawler.crawler.BaiduNewsCrawlerParse;
+import com.zhiwei.media_data_crawler.crawler.BaiduTiebaCrawlerParse;
+import com.zhiwei.media_data_crawler.crawler.DoubanCrawlerParse;
 import com.zhiwei.media_data_crawler.crawler.SoNewsCrawlerParse;
 import com.zhiwei.media_data_crawler.crawler.SougouNewsCrawlerParse;
 import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
+import com.zhiwei.media_data_crawler.crawler.TianYaCrawlerParse;
+import com.zhiwei.media_data_crawler.entity.DouBanData;
+import com.zhiwei.media_data_crawler.entity.LunTanData;
 import com.zhiwei.media_data_crawler.entity.NewsData;
+import com.zhiwei.media_data_crawler.entity.TiebaData;
 import com.zhiwei.media_data_crawler.entity.ZhiHuData;
 public class DataCrawler {
@@ -152,4 +158,80 @@ public class DataCrawler {
 		}
 	}
+	/**
+	 * Collects Baidu Tieba posts for a keyword without restricting the search to a bar.
+	 *
+	 * @param word  search keyword
+	 * @param proxy proxy passed on to the crawler
+	 * @return the crawled posts, or {@code null} when the crawler threw an exception
+	 *         (its stack trace is printed)
+	 */
+	public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy){
+		List<TiebaData> result = null;
+		try {
+			result = BaiduTiebaCrawlerParse.getBaiduTiebaData(word, proxy, null);
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+		return result;
+	}
+	/**
+	 * Collects Baidu Tieba posts for a keyword, passing a bar name to the crawler.
+	 *
+	 * @param word      search keyword
+	 * @param proxy     proxy passed on to the crawler
+	 * @param tiebaName bar name passed on to the crawler
+	 * @return the crawled posts, or {@code null} when the crawler threw an exception
+	 *         (its stack trace is printed)
+	 */
+	public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy, String tiebaName){
+		List<TiebaData> result = null;
+		try {
+			result = BaiduTiebaCrawlerParse.getBaiduTiebaData(word, proxy, tiebaName);
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+		return result;
+	}
+	/**
+	 * Collects Tianya forum entries for a keyword.
+	 *
+	 * @param word    search keyword
+	 * @param proxy   proxy passed on to the crawler
+	 * @param endTime time limit passed on to the crawler
+	 * @return the crawled entries, or {@code null} when the crawler threw an exception
+	 *         (its stack trace is printed)
+	 */
+	public static List<LunTanData> getLunTanData(String word, Proxy proxy, String endTime){
+		List<LunTanData> result = null;
+		try {
+			result = TianYaCrawlerParse.getLunTanData(word, proxy, endTime);
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+		return result;
+	}
+	/**
+	 * Collects Douban entries for a keyword.
+	 *
+	 * @param word  search keyword
+	 * @param type  result type passed on to the crawler: {@code "topic"} or {@code "note"}
+	 * @param proxy proxy passed on to the crawler
+	 * @return the crawled entries, or {@code null} when the crawler threw an exception
+	 *         (its stack trace is printed)
+	 */
+	public static List<DouBanData> getDouBanData(String word, String type, Proxy proxy){
+		List<DouBanData> result = null;
+		try {
+			result = DoubanCrawlerParse.getDoubanData(word, type, proxy);
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+		return result;
+	}
 }
--- a/src/main/java/com/zhiwei/media_data_crawler/entity/DouBanData.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/entity/DouBanData.java
+package com.zhiwei.media_data_crawler.entity;
+/**
+ * One Douban search hit (a group topic or a note) together with the details
+ * read from its page.
+ */
+public class DouBanData {
+	private String url;     // address of the entry
+	private String title;   // title
+	private String source;  // source
+	private String group;   // group
+	private String time;    // time
+	private String content; // content
+	private Integer reply_count; // number of replies
+	private String type;     // entry type
+	private Integer like_count;  // number of likes
+	/** Creates an entry with every field unset. */
+	public DouBanData() {}
+	/**
+	 * Returns a one-line description of the entry; {@code content} is not included.
+	 */
+	@Override
+	public String toString(){
+		return "new DouBanData["
+				+ "url = " + url
+				+ ", title = " + title
+				+ ", source = " + source
+				+ ", group = " + group
+				+ ", time = " + time
+				+ ", type = " + type
+				+ ", like_count = " + like_count
+				+ ", reply_count = " + reply_count
+				+ "]";
+	}
+	/**
+	 * Creates a fully specified entry.
+	 *
+	 * @param url         address of the entry
+	 * @param title       title
+	 * @param group       group
+	 * @param source      source
+	 * @param time        time
+	 * @param content     content
+	 * @param type        entry type
+	 * @param reply_count number of replies, may be {@code null}
+	 * @param like_count  number of likes, may be {@code null}
+	 */
+	public DouBanData(String url, String title, String group,String source, String time, 
+			String content, String type, Integer reply_count, Integer like_count) {
+		this.url = url;
+		this.title = title;
+		this.group = group;
+		this.source = source;
+		this.time = time;
+		this.content = content;
+		this.type = type;
+		this.reply_count = reply_count;
+		this.like_count = like_count;
+	}
+	public String getType() {
+		return type;
+	}
+	public void setType(String type) {
+		this.type = type;
+	}
+	public String getUrl() {
+		return url;
+	}
+	public void setUrl(String url) {
+		this.url = url;
+	}
+	public String getTitle() {
+		return title;
+	}
+	public void setTitle(String title) {
+		this.title = title;
+	}
+	public String getSource() {
+		return source;
+	}
+	public void setSource(String source) {
+		this.source = source;
+	}
+	public String getTime() {
+		return time;
+	}
+	public void setTime(String time) {
+		this.time = time;
+	}
+	public String getContent() {
+		return content;
+	}
+	public void setContent(String content) {
+		this.content = content;
+	}
+	public String getGroup() {
+		return group;
+	}
+	public void setGroup(String group) {
+		this.group = group;
+	}
+	// The two counters previously had setters only, so stored values could not
+	// be read back; the getters below close that gap.
+	public Integer getReply_count() {
+		return reply_count;
+	}
+	public void setReply_count(Integer reply_count) {
+		this.reply_count = reply_count;
+	}
+	public Integer getLike_count() {
+		return like_count;
+	}
+	public void setLike_count(Integer like_count) {
+		this.like_count = like_count;
+	}
+}
--- a/src/main/java/com/zhiwei/media_data_crawler/entity/LunTanData.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/entity/LunTanData.java
+package com.zhiwei.media_data_crawler.entity;
+import java.io.Serializable;
+public class LunTanData implements Serializable{
+	private static final long serialVersionUID = 6057811459180925060L;
+	private String url;     //地址
+	private String title;   //標題
+	private String time;    //時間
+	private String source;  //來源
+	private String author;  //回復者或樓主
+	private String content; //回復內容
+	private Integer reply_count; //回復數
+	private String pt;      //平台
+	private String word;    //關鍵詞
+	public LunTanData(){}
+	public LunTanData(String url, String title,String time, String source,
+			String author,String content, Integer reply_count,  String pt, String word){
+		this.url = url;
+		this.title = title;
+		this.time = time;
+		this.source = source;
+		this.author = author;
+		this.content = content;
+		this.reply_count = reply_count;
+		this.pt = pt;
+		this.word = word;
+	}
+	@Override
+	public String toString(){
+		return "new LunTanData["
+				+ "url = " + url
+				+ ", title = " + title
+				+ ", time = " + time
+				+ ", source = " + source
+				+ ", author = " + author
+				+ ", content = " + content
+				+ ", reply_count = " + reply_count
+				+ ", pt = " + pt
+				+ ", word = " + word
+				+ "]";
+	}
+	public String getUrl() {
+		return url;
+	}
+	public void setUrl(String url) {
+		this.url = url;
+	}
+	public String getTitle() {
+		return title;
+	}
+	public void setTitle(String title) {
+		this.title = title;
+	}
+	public String getTime() {
+		return time;
+	}
+	public void setTime(String time) {
+		this.time = time;
+	}
+	public String getSource() {
+		return source;
+	}
+	public void setSource(String source) {
+		this.source = source;
+	}
+	public String getAuthor() {
+		return author;
+	}
+	public void setAuthor(String author) {
+		this.author = author;
+	}
+	public String getContent() {
+		return content;
+	}
+	public void setContent(String content) {
+		this.content = content;
+	}
+	public Integer getReply_count() {
+		return reply_count;
+	}
+	public void setReply_count(Integer reply_count) {
+		this.reply_count = reply_count;
+	}
+	public String getPt() {
+		return pt;
+	}
+	public void setPt(String pt) {
+		this.pt = pt;
+	}
+	public String getWord() {
+		return word;
+	}
+	public void setWord(String word) {
+		this.word = word;
+	}
+}
--- a/src/main/java/com/zhiwei/media_data_crawler/entity/TiebaData.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/entity/TiebaData.java
+package com.zhiwei.media_data_crawler.entity;
+import java.io.Serializable;
+/**
+ * One Baidu Tieba search hit: the post, the bar it came from, and the keyword
+ * that produced it.
+ */
+public class TiebaData implements Serializable{
+	private static final long serialVersionUID = 1L;
+	private String url;     // address of the post
+	private String title;   // title
+	private String time;    // time
+	private String tid;     // tid
+	private String source;  // source
+	private String author;  // replier or original poster
+	private String content; // reply content
+	private String word;    // search keyword
+	/** Creates an entry with every field unset. */
+	public TiebaData(){}
+	/**
+	 * Creates a fully specified entry; each argument is stored in the field of
+	 * the same name.
+	 */
+	public TiebaData(String url, String title,String time, String tid,String source,
+			String author,String content, String word){
+		this.url = url;
+		this.title = title;
+		this.time = time;
+		this.tid = tid;
+		this.source = source;
+		this.author = author;
+		this.content = content;
+		this.word = word;
+	}
+	/** Returns a one-line description listing every field. */
+	@Override
+	public String toString(){
+		return String.format(
+				"new TiebaData[url = %s, title = %s, time = %s, tid = %s, source = %s, author = %s, content = %s, word = %s]",
+				url, title, time, tid, source, author, content, word);
+	}
+	public String getUrl() {
+		return this.url;
+	}
+	public void setUrl(String url) {
+		this.url = url;
+	}
+	public String getTitle() {
+		return this.title;
+	}
+	public void setTitle(String title) {
+		this.title = title;
+	}
+	public String getTime() {
+		return this.time;
+	}
+	public void setTime(String time) {
+		this.time = time;
+	}
+	public String getTid() {
+		return this.tid;
+	}
+	public void setTid(String tid) {
+		this.tid = tid;
+	}
+	public String getSource() {
+		return this.source;
+	}
+	public void setSource(String source) {
+		this.source = source;
+	}
+	public String getAuthor() {
+		return this.author;
+	}
+	public void setAuthor(String author) {
+		this.author = author;
+	}
+	public String getContent() {
+		return this.content;
+	}
+	public void setContent(String content) {
+		this.content = content;
+	}
+	public String getWord() {
+		return this.word;
+	}
+	public void setWord(String word) {
+		this.word = word;
+	}
+}
--- a/src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
+++ b/src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
-//package com.zhiwei.media_data_crawler.test;
+package com.zhiwei.media_data_crawler.test;
-//
-//import java.net.Proxy;
+import java.net.Proxy;
-//import java.util.List;
+import java.util.List;
-//
-//import org.junit.Test;
+import org.junit.Test;
-//
-//import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
+import com.zhiwei.media_data_crawler.crawler.BaiduTiebaCrawlerParse;
-//import com.zhiwei.media_data_crawler.data.DataCrawler;
+import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
-//import com.zhiwei.media_data_crawler.entity.NewsData;
+import com.zhiwei.media_data_crawler.data.DataCrawler;
-//import com.zhiwei.media_data_crawler.entity.ZhiHuData;
+import com.zhiwei.media_data_crawler.entity.DouBanData;
-//
+import com.zhiwei.media_data_crawler.entity.LunTanData;
-//public class DataCrawlerTest {
+import com.zhiwei.media_data_crawler.entity.NewsData;
-//	
+import com.zhiwei.media_data_crawler.entity.TiebaData;
-//	
+import com.zhiwei.media_data_crawler.entity.ZhiHuData;
-//	
-//	
+public class DataCrawlerTest {
-//	
-//	@Test
-//	public void getSoNewsTest(){
-//		String word = "马云";     //关键词
-//		String startTime = "2017-03-01 00:00:00";  //开始时间
-//		String endTime = "2017-03-01 23:59:59";    //结束时间
+	@Test
-//		Proxy proxy = null;      //代理IP，不用可不填写
+	public void getSoNewsTest(){
+		String word = "马云";     //关键词
+		String startTime = "2017-03-01 00:00:00";  //开始时间
+		String endTime = "2017-03-01 23:59:59";    //结束时间
+		Proxy proxy = null;      //代理IP，不用可不填写
+		try {
 //			//百度新闻采集demo
-////		List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
+//			List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
-////		//搜狗新闻关键词采集demo
+//			//搜狗新闻关键词采集demo
-////		List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
+//			List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
-////		//360新闻采集demo
+//			//360新闻采集demo
-////		List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
+//			List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
-////		//搜狗知乎采集
+//			//搜狗知乎采集
-////		List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
+//			List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
-////		System.out.println(zhihuList.size());
+//			System.out.println(zhihuList.size());
-//		
+//			//Baidu貼吧採集
-//	}
+//			String tiebaName = "京东";  //贴吧名称，指定贴吧内采集，无则为null
-//
+//			List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
-//}
+//			//天涯论坛采集
+//			List<LunTanData> list = DataCrawler.getLunTanData(word, proxy, endTime);
+			//豆瓣采集
+//			String type = "topic";   //topic 为指定话题采集，note为指定日记采集
+//			List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
+		} catch (Exception e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+		}
+	}
+}