修复搜狗新闻由于乱码引起的解析问题

f6fa753d · zhiwei · 630e8f87 · f6fa753d
Commit f6fa753d authored Feb 27, 2018 by zhiwei
Hide whitespace changes
Inline Side-by-side

Showing with 14 additions and 11 deletions

src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
+14 -11

No files found.
--- a/src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
@@ -2,6 +2,7 @@ package com.zhiwei.media_data_crawler.crawler;
 import java.io.IOException;
 import java.net.Proxy;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -47,7 +48,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
 				 more = false;
 			 }
 			 String htmlBody = downloadHtml(word, 1, proxy, page);
-			 if(htmlBody != null){
+			 if(htmlBody != null && !htmlBody.equals("")){
 				 Map<String,Object> dataMap = analysisData(htmlBody, proxy, word);
 				 List<NewsData> dataList = (List<NewsData>)dataMap.get("data");
 				 list.addAll(dataList);
@@ -184,30 +185,32 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
 				/**截取时间*/
 				if (soureAndtime.contains("&nbsp;")) {
 					String soureAndtimes[] = soureAndtime.split("&nbsp;");
-					time = soureAndtimes[1];
+					time = soureAndtimes[1].contains("<!--resultinfodat")?soureAndtimes[1].split("<!--resultinfodat")[0]:soureAndtimes[1];
 					source = soureAndtimes[0];
-				} else {
-					time = element.select("div.news-detail").select("div.news-info").select("p.news-from").text();
 				}
-				/**文章发布时间处理**/
+				if(time!=null && !time.equals("")){
-				time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss") ;
+					/**文章发布时间处理**/
+					time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss") ;
+				}
 				// 处理文章简介
 				content = element.select("div.news-detail").select("div.news-info").select("p.news-txt").select("span#summary_1").text();
 				//添加到数据集合中
-				if(title != null){
+				if(title != null && !title.equals("") && source!=null && time!=null){
 					NewsData newsData = new NewsData(link, title, source, time, content, pt, word);
+					logger.info("搜狗新闻数据:{}", newsData);
 					list.add(newsData);
 				}
 				/**采集相同新闻链接**/
-				if(element.select("div.news-detail").select("div.news-info").select("p.news-txt").select("a#news_similar")!=null)
+				String otherUrl = element.select("div.news-detail").select("div.news-info").select("p.news-txt").select("a#news_similar").attr("href");
+				if(otherUrl!=null && !otherUrl.equals(""))
 				{
-					String otherLink = "http://news.sogou.com/news"+element.select("div.news-detail").select("div.news-info").select("p.news-txt").select("a#news_similar").attr("href");
+					String otherLink = "http://news.sogou.com/news"+otherUrl;
 					List<NewsData> otherDataList = getOherSougouNewsData(otherLink, word, proxy);
 					list.addAll(otherDataList);
 				}
 			} catch (Exception e) {
 				e.printStackTrace();
-				logger.error("搜狗新闻数据解析时出现问题，问题为:{}", e.fillInStackTrace());
+//				logger.error("搜狗新闻数据解析时出现问题，问题为:{}", e.fillInStackTrace());
 				continue;
 			}
 		}
@@ -267,7 +270,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
 		String url = null;
 		if(word!=null){
 			url = "http://news.sogou.com/news?mode="+ mode +"&media=&query="
-                    + URLCodeUtil.getURLEncode(word, "utf-8") + "&time=0&clusterId=&sort=1&page=2&dp=1&page="+page;
+                    + URLCodeUtil.getURLEncode(word, "utf-8") + "&time=0&clusterId=&sort=1&dp=1";
 		}
 		return url;
 	}