Commit d82bc101 by yangchen

搜狗采集修改

parent f6fa753d
...@@ -7,6 +7,9 @@ import java.util.ArrayList; ...@@ -7,6 +7,9 @@ import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
...@@ -57,6 +60,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -57,6 +60,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
more = false; more = false;
} }
ZhiWeiTools.sleep(5000); ZhiWeiTools.sleep(5000);
System.out.println(list.size());
page++; page++;
} }
return list; return list;
...@@ -105,6 +109,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -105,6 +109,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址 //获取链接地址
String url = getMediaNewsUrl(word, mode, page); String url = getMediaNewsUrl(word, mode, page);
// System.out.println("搜狗主页=="+url);
headerMap.put("Host", "news.sogou.com"); headerMap.put("Host", "news.sogou.com");
headerMap.put("Referer", url.split("&page=")[0]+"&page="+(page-1)); headerMap.put("Referer", url.split("&page=")[0]+"&page="+(page-1));
//下载数据页面 //下载数据页面
...@@ -123,7 +128,8 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -123,7 +128,8 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
//获取通用请求头 //获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址 //获取链接地址
url = url + "&page" + page; url = url + "&page=" + page;
// System.out.println("搜狗相似新闻=="+url);
headerMap.put("Host", "news.sogou.com"); headerMap.put("Host", "news.sogou.com");
headerMap.put("Referer", url); headerMap.put("Referer", url);
//下载数据页面 //下载数据页面
...@@ -190,6 +196,9 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -190,6 +196,9 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
} }
if(time!=null && !time.equals("")){ if(time!=null && !time.equals("")){
/**文章发布时间处理**/ /**文章发布时间处理**/
if(time.contains("\n")) {
time = time.split("\n")[0];
}
time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss") ; time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss") ;
} }
// 处理文章简介 // 处理文章简介
...@@ -250,6 +259,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -250,6 +259,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
}else{ }else{
more = false; more = false;
} }
ZhiWeiTools.sleep(500);
page++; page++;
} }
return list; return list;
...@@ -270,7 +280,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -270,7 +280,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
String url = null; String url = null;
if(word!=null){ if(word!=null){
url = "http://news.sogou.com/news?mode="+ mode +"&media=&query=" url = "http://news.sogou.com/news?mode="+ mode +"&media=&query="
+ URLCodeUtil.getURLEncode(word, "utf-8") + "&time=0&clusterId=&sort=1&dp=1"; + URLCodeUtil.getURLEncode(word, "utf-8") + "&time=0&clusterId=&sort=1&dp=1&page="+page;
} }
return url; return url;
} }
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment