Commit f6fa753d by zhiwei

修复搜狗新闻由于乱码引起的解析问题

parent 630e8f87
......@@ -2,6 +2,7 @@ package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException;
import java.net.Proxy;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
......@@ -47,7 +48,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
more = false;
}
String htmlBody = downloadHtml(word, 1, proxy, page);
if(htmlBody != null){
if(htmlBody != null && !htmlBody.equals("")){
Map<String,Object> dataMap = analysisData(htmlBody, proxy, word);
List<NewsData> dataList = (List<NewsData>)dataMap.get("data");
list.addAll(dataList);
......@@ -184,30 +185,32 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
/**截取时间*/
if (soureAndtime.contains("&nbsp;")) {
String soureAndtimes[] = soureAndtime.split("&nbsp;");
time = soureAndtimes[1];
time = soureAndtimes[1].contains("<!--resultinfodat")?soureAndtimes[1].split("<!--resultinfodat")[0]:soureAndtimes[1];
source = soureAndtimes[0];
} else {
time = element.select("div.news-detail").select("div.news-info").select("p.news-from").text();
}
/**文章发布时间处理**/
time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss") ;
if(time!=null && !time.equals("")){
/**文章发布时间处理**/
time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss") ;
}
// 处理文章简介
content = element.select("div.news-detail").select("div.news-info").select("p.news-txt").select("span#summary_1").text();
//添加到数据集合中
if(title != null){
if(title != null && !title.equals("") && source!=null && time!=null){
NewsData newsData = new NewsData(link, title, source, time, content, pt, word);
logger.info("搜狗新闻数据:{}", newsData);
list.add(newsData);
}
/**采集相同新闻链接**/
if(element.select("div.news-detail").select("div.news-info").select("p.news-txt").select("a#news_similar")!=null)
String otherUrl = element.select("div.news-detail").select("div.news-info").select("p.news-txt").select("a#news_similar").attr("href");
if(otherUrl!=null && !otherUrl.equals(""))
{
String otherLink = "http://news.sogou.com/news"+element.select("div.news-detail").select("div.news-info").select("p.news-txt").select("a#news_similar").attr("href");
String otherLink = "http://news.sogou.com/news"+otherUrl;
List<NewsData> otherDataList = getOherSougouNewsData(otherLink, word, proxy);
list.addAll(otherDataList);
}
} catch (Exception e) {
e.printStackTrace();
logger.error("搜狗新闻数据解析时出现问题,问题为:{}", e.fillInStackTrace());
// logger.error("搜狗新闻数据解析时出现问题,问题为:{}", e.fillInStackTrace());
continue;
}
}
......@@ -267,7 +270,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
String url = null;
if(word!=null){
url = "http://news.sogou.com/news?mode="+ mode +"&media=&query="
+ URLCodeUtil.getURLEncode(word, "utf-8") + "&time=0&clusterId=&sort=1&page=2&dp=1&page="+page;
+ URLCodeUtil.getURLEncode(word, "utf-8") + "&time=0&clusterId=&sort=1&dp=1";
}
return url;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment