Commit 2ca74931 by zhiwei

1.修复搜狗新闻无限死循环问题

parent 9d3ee2e7
...@@ -406,7 +406,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -406,7 +406,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
return list; return list;
} }
@SuppressWarnings("unchecked")
public static List<NewsData> getOherBaiduNewsDataByTitle(String url, String word, Proxy proxy) throws Exception{ public static List<NewsData> getOherBaiduNewsDataByTitle(String url, String word, Proxy proxy) throws Exception{
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
int page = 0; int page = 0;
......
...@@ -49,7 +49,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -49,7 +49,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
} }
String htmlBody = downloadHtml(word, 1, proxy, page); String htmlBody = downloadHtml(word, 1, proxy, page);
if(htmlBody != null && !htmlBody.equals("")){ if(htmlBody != null && !htmlBody.equals("")){
Map<String,Object> dataMap = analysisData(htmlBody, proxy, word); Map<String,Object> dataMap = analysisData(htmlBody, proxy, word, "normal");
List<NewsData> dataList = (List<NewsData>)dataMap.get("data"); List<NewsData> dataList = (List<NewsData>)dataMap.get("data");
list.addAll(dataList); list.addAll(dataList);
more = (Boolean)dataMap.get("more"); more = (Boolean)dataMap.get("more");
...@@ -67,7 +67,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -67,7 +67,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
public static Map<String,Object> getSougouNewsData(String word, Proxy proxy, int page) throws Exception{ public static Map<String,Object> getSougouNewsData(String word, Proxy proxy, int page) throws Exception{
String htmlBody = downloadHtml(word, 1, proxy, page); String htmlBody = downloadHtml(word, 1, proxy, page);
if(htmlBody != null && !htmlBody.equals("")){ if(htmlBody != null && !htmlBody.equals("")){
return analysisData(htmlBody, proxy, word); return analysisData(htmlBody, proxy, word, "normal");
} }
return null; return null;
} }
...@@ -85,7 +85,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -85,7 +85,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
} }
String htmlBody = downloadHtml(word, 2, proxy, page); String htmlBody = downloadHtml(word, 2, proxy, page);
if(htmlBody != null){ if(htmlBody != null){
Map<String,Object> dataMap = analysisData(htmlBody, proxy, word); Map<String,Object> dataMap = analysisData(htmlBody, proxy, word, "normal");
List<NewsData> dataList = (List<NewsData>)dataMap.get("data"); List<NewsData> dataList = (List<NewsData>)dataMap.get("data");
list.addAll(dataList); list.addAll(dataList);
more = (Boolean)dataMap.get("more"); more = (Boolean)dataMap.get("more");
...@@ -168,7 +168,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -168,7 +168,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件 * @param @return 设定文件
* @return Map<String,Object> 返回类型 * @return Map<String,Object> 返回类型
*/ */
private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{ private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word, String type) throws Exception{
Map<String,Object> resultMap = new HashMap<String,Object>(); Map<String,Object> resultMap = new HashMap<String,Object>();
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
boolean more = true; boolean more = true;
...@@ -228,17 +228,19 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -228,17 +228,19 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
logger.info("搜狗新闻数据:{}", newsData); logger.info("搜狗新闻数据:{}", newsData);
list.add(newsData); list.add(newsData);
} }
/**采集相同新闻链接**/ if(!type.equals("other")){
String otherUrl = element.select("div.news-detail").select("div.news-info").select("p.news-txt").select("a#news_similar").attr("href"); /**采集相同新闻链接**/
if(otherUrl!=null && !otherUrl.equals("")) String otherUrl = element.select("div.news-detail").select("div.news-info").select("p.news-txt").select("a#news_similar").attr("href");
{ if(otherUrl!=null && !otherUrl.equals(""))
String otherLink = "http://news.sogou.com/news"+otherUrl; {
List<NewsData> otherDataList = getOherSougouNewsData(otherLink, word, proxy); String otherLink = "http://news.sogou.com/news"+otherUrl;
list.addAll(otherDataList); List<NewsData> otherDataList = getOherSougouNewsData(otherLink, word, proxy);
list.addAll(otherDataList);
}
} }
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); // e.printStackTrace();
// logger.error("搜狗新闻数据解析时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("搜狗新闻数据解析时出现问题,问题为:{}", e.fillInStackTrace());
continue; continue;
} }
} }
...@@ -271,7 +273,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -271,7 +273,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
} }
String htmlBody = downloadHtml(url, proxy, page); String htmlBody = downloadHtml(url, proxy, page);
if(htmlBody != null){ if(htmlBody != null){
Map<String,Object> dataMap = analysisData(htmlBody, null, word); Map<String,Object> dataMap = analysisData(htmlBody, null, word, "other");
List<NewsData> dataList = (List<NewsData>)dataMap.get("data"); List<NewsData> dataList = (List<NewsData>)dataMap.get("data");
list.addAll(dataList); list.addAll(dataList);
more = (Boolean)dataMap.get("more"); more = (Boolean)dataMap.get("more");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment