Commit 0a584d05 by yangchen

搜狗修改

parent f81fdcab
...@@ -6,7 +6,6 @@ import java.util.ArrayList; ...@@ -6,7 +6,6 @@ import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
...@@ -57,7 +56,6 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -57,7 +56,6 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
more = false; more = false;
} }
ZhiWeiTools.sleep(5000); ZhiWeiTools.sleep(5000);
System.out.println(list.size());
page++; page++;
} }
return list; return list;
...@@ -106,7 +104,6 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -106,7 +104,6 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址 //获取链接地址
String url = getMediaNewsUrl(word, mode, page); String url = getMediaNewsUrl(word, mode, page);
// System.out.println("搜狗主页=="+url);
headerMap.put("Host", "news.sogou.com"); headerMap.put("Host", "news.sogou.com");
headerMap.put("Referer", url.split("&page=")[0]+"&page="+(page-1)); headerMap.put("Referer", url.split("&page=")[0]+"&page="+(page-1));
//下载数据页面 //下载数据页面
...@@ -125,8 +122,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -125,8 +122,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
//获取通用请求头 //获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址 //获取链接地址
url = url + "&page=" + page; url = url + "&page" + page;
// System.out.println("搜狗相似新闻=="+url);
headerMap.put("Host", "news.sogou.com"); headerMap.put("Host", "news.sogou.com");
headerMap.put("Referer", url); headerMap.put("Referer", url);
//下载数据页面 //下载数据页面
...@@ -193,9 +189,6 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -193,9 +189,6 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
} }
if(time!=null && !time.equals("")){ if(time!=null && !time.equals("")){
/**文章发布时间处理**/ /**文章发布时间处理**/
if(time.contains("\n")) {
time = time.split("\n")[0];
}
time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss") ; time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss") ;
} }
// 处理文章简介 // 处理文章简介
...@@ -256,7 +249,6 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -256,7 +249,6 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
}else{ }else{
more = false; more = false;
} }
ZhiWeiTools.sleep(500);
page++; page++;
} }
return list; return list;
...@@ -277,7 +269,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -277,7 +269,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
String url = null; String url = null;
if(word!=null){ if(word!=null){
url = "http://news.sogou.com/news?mode="+ mode +"&media=&query=" url = "http://news.sogou.com/news?mode="+ mode +"&media=&query="
+ URLCodeUtil.getURLEncode(word, "utf-8") + "&time=0&clusterId=&sort=1&dp=1&page="+page; + URLCodeUtil.getURLEncode(word, "utf-8") + "&time=0&clusterId=&sort=1&dp=1";
} }
return url; return url;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment