Commit ba2389f8 by yangchen

搜狗修改 content解析修改

parent 0a584d05
......@@ -122,7 +122,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
//获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址
url = url + "&page" + page;
url = url + "&page=" + page;
headerMap.put("Host", "news.sogou.com");
headerMap.put("Referer", url);
//下载数据页面
......@@ -189,10 +189,20 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
}
if(time!=null && !time.equals("")){
/**文章发布时间处理**/
if(time.contains("\n")){
time = time.split("\n")[0];
}
time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss") ;
}
// 处理文章简介
content = element.select("div.news-detail").select("div.news-info").select("p.news-txt").select("span#summary_1").text();
// Fallback for the article summary: when the primary selector above produced
// no text, probe the numbered summary nodes (#summary_0 .. #summary_10) and
// keep the first one that yields non-empty text.
// NOTE(review): assuming `element` is a jsoup Element, text() on a selection
// with no match returns "" rather than null, so a null-only guard would never
// trigger this fallback; an empty string is therefore treated as "missing" too.
if(content == null || content.isEmpty()) {
    for(int i = 0;i < 11;i++) {
        content = element.select("#summary_"+i).text();
        if(content != null && !content.isEmpty()) {
            break;
        }
    }
}
//添加到数据集合中
if(title != null && !title.equals("") && source!=null && time!=null){
NewsData newsData = new NewsData(link, title, source, time, content, pt, word);
......@@ -269,7 +279,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
String url = null;
if(word!=null){
url = "http://news.sogou.com/news?mode="+ mode +"&media=&query="
+ URLCodeUtil.getURLEncode(word, "utf-8") + "&time=0&clusterId=&sort=1&dp=1";
+ URLCodeUtil.getURLEncode(word, "utf-8") + "&time=0&clusterId=&sort=1&dp=1&page="+page;
}
return url;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment