Commit 2ca74931 by zhiwei

1.修复搜狗新闻无限死循环问题

parent 9d3ee2e7
......@@ -406,7 +406,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
return list;
}
@SuppressWarnings("unchecked")
public static List<NewsData> getOherBaiduNewsDataByTitle(String url, String word, Proxy proxy) throws Exception{
List<NewsData> list = new ArrayList<NewsData>();
int page = 0;
......
......@@ -49,7 +49,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
}
String htmlBody = downloadHtml(word, 1, proxy, page);
if(htmlBody != null && !htmlBody.equals("")){
Map<String,Object> dataMap = analysisData(htmlBody, proxy, word);
Map<String,Object> dataMap = analysisData(htmlBody, proxy, word, "normal");
List<NewsData> dataList = (List<NewsData>)dataMap.get("data");
list.addAll(dataList);
more = (Boolean)dataMap.get("more");
......@@ -67,7 +67,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
public static Map<String,Object> getSougouNewsData(String word, Proxy proxy, int page) throws Exception{
String htmlBody = downloadHtml(word, 1, proxy, page);
if(htmlBody != null && !htmlBody.equals("")){
return analysisData(htmlBody, proxy, word);
return analysisData(htmlBody, proxy, word, "normal");
}
return null;
}
......@@ -85,7 +85,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
}
String htmlBody = downloadHtml(word, 2, proxy, page);
if(htmlBody != null){
Map<String,Object> dataMap = analysisData(htmlBody, proxy, word);
Map<String,Object> dataMap = analysisData(htmlBody, proxy, word, "normal");
List<NewsData> dataList = (List<NewsData>)dataMap.get("data");
list.addAll(dataList);
more = (Boolean)dataMap.get("more");
......@@ -168,7 +168,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return Map<String,Object> 返回类型
*/
private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{
private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word, String type) throws Exception{
Map<String,Object> resultMap = new HashMap<String,Object>();
List<NewsData> list = new ArrayList<NewsData>();
boolean more = true;
......@@ -228,6 +228,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
logger.info("搜狗新闻数据:{}", newsData);
list.add(newsData);
}
if(!type.equals("other")){
/**采集相同新闻链接**/
String otherUrl = element.select("div.news-detail").select("div.news-info").select("p.news-txt").select("a#news_similar").attr("href");
if(otherUrl!=null && !otherUrl.equals(""))
......@@ -236,9 +237,10 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
List<NewsData> otherDataList = getOherSougouNewsData(otherLink, word, proxy);
list.addAll(otherDataList);
}
}
} catch (Exception e) {
e.printStackTrace();
// logger.error("搜狗新闻数据解析时出现问题,问题为:{}", e.fillInStackTrace());
// e.printStackTrace();
logger.error("搜狗新闻数据解析时出现问题,问题为:{}", e.fillInStackTrace());
continue;
}
}
......@@ -271,7 +273,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
}
String htmlBody = downloadHtml(url, proxy, page);
if(htmlBody != null){
Map<String,Object> dataMap = analysisData(htmlBody, null, word);
Map<String,Object> dataMap = analysisData(htmlBody, null, word, "other");
List<NewsData> dataList = (List<NewsData>)dataMap.get("data");
list.addAll(dataList);
more = (Boolean)dataMap.get("more");
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment