Commit 630e8f87 by zhiwei

修复百度新闻采集问题

parent f21ed458
......@@ -224,9 +224,10 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
NewsData newsData = new NewsData(link, title, source, time, content, pt, word);
list.add(newsData);
/**采集相同新闻链接**/
if(element.select("div.c-row").select("a.c-more_link")!=null)
String otherUrl = element.select("div.c-row").select("a.c-more_link").attr("href");
if(otherUrl!=null && !otherUrl.equals(""))
{
String otherLink = "http://news.baidu.com"+element.select("div.c-row").select("a.c-more_link").attr("href");
String otherLink = "http://news.baidu.com" + element.select("div.c-row").select("a.c-more_link").attr("href");
List<NewsData> otherDataList = getOherBaiduNewsData(otherLink, word, proxy);
list.addAll(otherDataList);
ZhiWeiTools.sleep(100);
......
......@@ -103,7 +103,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
//获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址
String url = getUrl(word, mode, page);
String url = getMediaNewsUrl(word, mode, page);
headerMap.put("Host", "news.sogou.com");
headerMap.put("Referer", url.split("&page=")[0]+"&page="+(page-1));
//下载数据页面
......@@ -263,7 +263,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return String 返回类型
*/
private static String getUrl(String word, int mode ,int page){
private static String getMediaNewsUrl(String word, int mode ,int page){
String url = null;
if(word!=null){
url = "http://news.sogou.com/news?mode="+ mode +"&media=&query="
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment