更新百度新闻按照标题采集数据解析错误修复

ee34a906 · zhiwei · 6c18504b · ee34a906
Commit ee34a906 authored Mar 22, 2018 by zhiwei
Show whitespace changes
Inline Side-by-side

Showing with 82 additions and 1 deletions

src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
+82 -1

No files found.
--- a/src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
@@ -119,7 +119,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
 			}
 			String htmlBody = downloadHtml(word, startTime, endTime, proxy, "newstitle", page);
 			if (htmlBody != null) {
-				Map<String, Object> dataMap = analysisData(htmlBody, proxy, word);
+				Map<String, Object> dataMap = analysisDataByTitle(htmlBody, proxy, word);
 				List<NewsData> dataList = (List<NewsData>) dataMap.get("data");
 				list.addAll(dataList);
 				more = (Boolean) dataMap.get("more");
@@ -158,6 +158,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
 		Map<String, String> headerMap = HeaderTool.getCommonHead();
 		// 获取链接地址
 		String url = getUrl(word, startTime, endTime, tn, page);
+		System.out.println(url);
 		headerMap.put("Host", "news.baidu.com");
 		headerMap.put("Referer", url);
 		// 下载数据页面
@@ -245,6 +246,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
 				link = element.select("h3.c-title").select("a").attr("href");
 				title = element.select("h3.c-title").select("a").text();
 				soureAndtime = element.select("div.c-row").select("p.c-author").html();
+				System.out.println("time========"+soureAndtime);
 				/** 截取时间 */
 				if (soureAndtime.contains("&nbsp;&nbsp;")) {
 					String soureAndtimes[] = soureAndtime.split("&nbsp;&nbsp;");
@@ -288,6 +290,85 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
 		return resultMap;
 	}

+	
+	
+	
+	
+	/**
+	 * Parses one Baidu News "search by title" result page.
+	 *
+	 * <p>Each element matching {@code [class="result title"]} becomes one
+	 * {@link NewsData}. When a result carries a link inside
+	 * {@code div.c-title-author}, that link is followed through
+	 * {@code getOherBaiduNewsData} and the returned items are appended as well.
+	 * A result that fails to parse is logged and skipped; it does not abort the page.
+	 *
+	 * @param htmlBody raw HTML of the result page
+	 * @param proxy    proxy handed on to {@code getOherBaiduNewsData}
+	 * @param word     search keyword, stored on every produced {@link NewsData}
+	 * @return a map with two entries: {@code "data"} (a {@code List<NewsData>}) and
+	 *         {@code "more"} (a {@code Boolean}, true when the pager text contains "下一页")
+	 * @throws Exception declared for callers; per-result failures are caught inside the loop
+	 */
+	private static Map<String, Object> analysisDataByTitle(String htmlBody, Proxy proxy, String word) throws Exception{
+		Map<String, Object> resultMap = new HashMap<String, Object>();
+		List<NewsData> list = new ArrayList<NewsData>();
+
+		Document document = Jsoup.parse(htmlBody);
+		// jsoup's select() never returns null: a missing pager yields an empty
+		// selection whose text is "", so a single contains() check is sufficient.
+		boolean more = document.select("p#page").text().contains("下一页");
+
+		// Loop-invariant, so compiled once instead of once per result.
+		Pattern sameNewsPattern = Pattern.compile("\\d*条相同新闻");
+
+		Elements elementes = document.select("[class=\"result title\"]");
+		for (Element element : elementes) {
+			try {
+				// Per-result state is declared inside the loop so that a result lacking
+				// the source/time separator cannot inherit values from the previous one.
+				String time = null;
+				String source = null;
+
+				Elements titleAnchor = element.select("h3.c-title").select("a");
+				String link = titleAnchor.attr("href");
+				String title = titleAnchor.text();
+
+				// The author block is rendered as "source&nbsp;&nbsp;time".
+				String soureAndtime = element.select("div.c-title-author").html();
+				if (soureAndtime.contains("&nbsp;&nbsp;")) {
+					String soureAndtimes[] = soureAndtime.split("&nbsp;&nbsp;");
+					// Index 1 is read first: a malformed block with no time part throws
+					// here and the result is skipped by the catch below.
+					time = soureAndtimes[1];
+					source = soureAndtimes[0];
+				}
+				// NOTE(review): time is null when the separator is absent; how TimeParse
+				// treats null is not visible here - confirm.
+				time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");
+
+				// Summary = text of div.c-row with the leading author text cut off and
+				// the "N条相同新闻" / "-" / "百度快照" decorations removed.
+				Elements row = element.select("div.c-row");
+				String descript = row.text();
+				String soureAndtimeText = row.select("p.c-author").text();
+				String content = descript.substring(soureAndtimeText.length());
+				content = sameNewsPattern.matcher(content).replaceAll("").replace("-", "").replace("百度快照", "");
+
+				// pt is defined outside this method.
+				list.add(new NewsData(link, title, source, time, content, pt, word));
+
+				// Follow the link inside the author block, if any; attr() returns ""
+				// (never null) when the attribute is absent.
+				String otherUrl = element.select("div.c-title-author").select("a").attr("href");
+				if (!otherUrl.isEmpty()) {
+					String otherLink = "http://news.baidu.com" + otherUrl;
+					List<NewsData> otherDataList = getOherBaiduNewsData(otherLink, word, proxy);
+					list.addAll(otherDataList);
+					ZhiWeiTools.sleep(100);
+				}
+			} catch (Exception e) {
+				// Pass the exception itself: fillInStackTrace() would replace the original
+				// stack trace with this catch site and hide where the failure happened.
+				logger.error("百度新闻数据解析时出现问题，问题为:{}", e);
+			}
+		}
+		resultMap.put("data", list);
+		resultMap.put("more", more);
+
+		return resultMap;
+	}
+	
+	
+	
+	
 	/**
 	 * @Title: getOherBaiduNewsData
 	 * @author hero