Commit 9d3ee2e7 by zhiwei

修复百度新闻根据标题匹配数据解析错误bug

parent ee34a906
......@@ -246,7 +246,6 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
link = element.select("h3.c-title").select("a").attr("href");
title = element.select("h3.c-title").select("a").text();
soureAndtime = element.select("div.c-row").select("p.c-author").html();
System.out.println("time========"+soureAndtime);
/** 截取时间 */
if (soureAndtime.contains("  ")) {
String soureAndtimes[] = soureAndtime.split("  ");
......@@ -350,7 +349,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
String otherUrl = element.select("div.c-title-author").select("a").attr("href");
if (otherUrl != null && !otherUrl.equals("")) {
String otherLink = "http://news.baidu.com" + otherUrl;
List<NewsData> otherDataList = getOherBaiduNewsData(otherLink, word, proxy);
List<NewsData> otherDataList = getOherBaiduNewsDataByTitle(otherLink, word, proxy);
list.addAll(otherDataList);
ZhiWeiTools.sleep(100);
}
......@@ -406,6 +405,31 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
}
return list;
}
/**
 * Collects news items from a paginated listing by downloading successive
 * pages of {@code url} and parsing each one with {@code analysisDataByTitle}.
 *
 * <p>Paging stops as soon as any of the following happens:
 * <ul>
 *   <li>page index 20 has been processed (hard upper bound);</li>
 *   <li>{@code downloadHtml} returns {@code null} for a page;</li>
 *   <li>the parsed result is missing or its {@code "more"} entry is not
 *       {@code Boolean.TRUE}.</li>
 * </ul>
 *
 * @param url   address of the listing to page through
 * @param word  value forwarded to {@code analysisDataByTitle} as its third argument
 *              (presumably the search keyword/title to match - confirm against the parser)
 * @param proxy proxy forwarded to {@code downloadHtml}
 * @return every item found under the {@code "data"} key of each parsed page,
 *         in page order; never {@code null}
 * @throws Exception if downloading or parsing a page fails
 */
public static List<NewsData> getOherBaiduNewsDataByTitle(String url, String word, Proxy proxy) throws Exception {
    List<NewsData> list = new ArrayList<NewsData>();
    // Highest page index that may be requested. The previous implementation set
    // its loop flag to false once this was exceeded, but then still downloaded the
    // page and overwrote the flag with the parser's "more" value, so the cap never
    // took effect.
    final int maxPage = 20;
    for (int page = 0; page <= maxPage; page++) {
        String htmlBody = downloadHtml(url, proxy, page);
        if (htmlBody == null) {
            // Nothing was downloaded for this page: stop paging.
            break;
        }
        // NOTE(review): the second argument is passed as null exactly as before;
        // its meaning is defined by analysisDataByTitle, not visible here.
        Map<String, Object> dataMap = analysisDataByTitle(htmlBody, null, word);
        if (dataMap == null) {
            break;
        }
        @SuppressWarnings("unchecked")
        List<NewsData> dataList = (List<NewsData>) dataMap.get("data");
        if (dataList != null) {
            list.addAll(dataList);
        }
        // Null-safe check: a missing "more" entry ends paging instead of
        // throwing a NullPointerException on unboxing.
        if (!Boolean.TRUE.equals(dataMap.get("more"))) {
            break;
        }
    }
    return list;
}
/**
* @Title: getUrl
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment