Commit ee34a906 by zhiwei

更新百度新闻按照标题采集数据解析错误修复

parent 6c18504b
......@@ -119,7 +119,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
}
String htmlBody = downloadHtml(word, startTime, endTime, proxy, "newstitle", page);
if (htmlBody != null) {
Map<String, Object> dataMap = analysisData(htmlBody, proxy, word);
Map<String, Object> dataMap = analysisDataByTitle(htmlBody, proxy, word);
List<NewsData> dataList = (List<NewsData>) dataMap.get("data");
list.addAll(dataList);
more = (Boolean) dataMap.get("more");
......@@ -158,6 +158,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址
String url = getUrl(word, startTime, endTime, tn, page);
System.out.println(url);
headerMap.put("Host", "news.baidu.com");
headerMap.put("Referer", url);
// 下载数据页面
......@@ -245,6 +246,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
link = element.select("h3.c-title").select("a").attr("href");
title = element.select("h3.c-title").select("a").text();
soureAndtime = element.select("div.c-row").select("p.c-author").html();
System.out.println("time========"+soureAndtime);
/** 截取时间 */
if (soureAndtime.contains("&nbsp;&nbsp;")) {
String soureAndtimes[] = soureAndtime.split("&nbsp;&nbsp;");
......@@ -288,6 +290,85 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
return resultMap;
}
/**
 * Parses one HTML page of Baidu News results obtained from a search by title.
 *
 * <p>Every element matching {@code [class="result title"]} is turned into a
 * {@link NewsData}. When a result carries a link inside
 * {@code div.c-title-author}, that link is followed through
 * {@code getOherBaiduNewsData} and the returned entries are appended as well.
 * A result that fails to parse is logged and skipped; it does not abort the page.
 *
 * @param htmlBody raw HTML of the result page
 * @param proxy    proxy handed on to {@code getOherBaiduNewsData}
 * @param word     search keyword, stored on every produced {@link NewsData}
 * @return a map with two entries: {@code "data"} (a {@code List<NewsData>}) and
 *         {@code "more"} (a {@code Boolean}, true when the pager text contains "下一页")
 * @throws Exception declared for compatibility with existing callers
 */
private static Map<String, Object> analysisDataByTitle(String htmlBody, Proxy proxy, String word) throws Exception {
    Map<String, Object> resultMap = new HashMap<String, Object>();
    List<NewsData> list = new ArrayList<NewsData>();

    Document document = Jsoup.parse(htmlBody);

    // Jsoup's select() never returns null; for a missing pager it returns an empty
    // Elements whose text() is "", so a single contains() check covers both cases.
    boolean more = document.select("p#page").text().contains("下一页");

    // Compiled once per page instead of once per result.
    Pattern sameNewsPattern = Pattern.compile("\\d*条相同新闻");

    for (Element element : document.select("[class=\"result title\"]")) {
        try {
            // Per-result state is declared inside the loop so that a result without
            // the "&nbsp;&nbsp;" separator cannot inherit the previous result's values.
            String time = null;
            String source = null;

            Elements titleAnchor = element.select("h3.c-title").select("a");
            String link = titleAnchor.attr("href");
            String title = titleAnchor.text();

            // The author block holds "<source>&nbsp;&nbsp;<time>".
            Elements authorBlock = element.select("div.c-title-author");
            String soureAndtime = authorBlock.html();
            if (soureAndtime.contains("&nbsp;&nbsp;")) {
                String[] soureAndtimes = soureAndtime.split("&nbsp;&nbsp;");
                // A malformed block with nothing after the separator throws here and
                // is handled by the catch below, i.e. the result is skipped.
                time = soureAndtimes[1];
                source = soureAndtimes[0];
            }

            // Normalise the publication time.
            time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");

            // Summary: the row text minus the leading author text, with the
            // "N条相同新闻" counter, dashes and the "百度快照" label removed.
            Elements row = element.select("div.c-row");
            String descript = row.text();
            String soureAndtimeText = row.select("p.c-author").text();
            String content = descript.substring(soureAndtimeText.length());
            content = sameNewsPattern.matcher(content).replaceAll("").replace("-", "").replace("百度快照", "");

            list.add(new NewsData(link, title, source, time, content, pt, word));

            // Follow the link in the author block, if any, and collect its entries too.
            String otherUrl = authorBlock.select("a").attr("href");
            if (otherUrl != null && !otherUrl.equals("")) {
                String otherLink = "http://news.baidu.com" + otherUrl;
                List<NewsData> otherDataList = getOherBaiduNewsData(otherLink, word, proxy);
                list.addAll(otherDataList);
                ZhiWeiTools.sleep(100);
            }
        } catch (Exception e) {
            // Pass the exception itself: fillInStackTrace() would replace the original
            // stack trace with this catch site and hide where the failure happened.
            logger.error("百度新闻数据解析时出现问题,问题为:{}", e);
        }
    }

    resultMap.put("data", list);
    resultMap.put("more", more);
    return resultMap;
}
/**
* @Title: getOherBaiduNewsData
* @author hero
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment