Commit 806be52f by 马黎滨

知乎热搜榜单采集404页面解决

parent 141137f3
...@@ -29,14 +29,14 @@ public class ZhihuTopicSearchCrawler { ...@@ -29,14 +29,14 @@ public class ZhihuTopicSearchCrawler {
public static List<HotSearchList> getZhihuTopicSearch(){ public static List<HotSearchList> getZhihuTopicSearch(){
List<HotSearchList> list = new ArrayList<>(); List<HotSearchList> list = new ArrayList<>();
String url = "https://www.zhihu.com/topsearch"; String url = "https://www.zhihu.com/topsearch";
Map<String, String> headerMap = HeaderTool.getCommonHead();
JSONObject jsonObject = null; JSONObject jsonObject = null;
try { try {
for(int t=0 ;t<3&&jsonObject== null;t++) for(int t=0 ;t<3 && jsonObject== null;t++)
{ {
ZhiWeiTools.sleep(500L); // ZhiWeiTools.sleep(10000L);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),
ProxyHolder.NAT_HEAVY_PROXY).body().string(); ProxyHolder.NAT_HEAVY_PROXY).body().string();
log.info("页面内容获取:{}",htmlBody);
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
String html = document.getElementsByTag("script").select("#js-initialData").html(); String html = document.getElementsByTag("script").select("#js-initialData").html();
jsonObject = JSONObject.parseObject(html); jsonObject = JSONObject.parseObject(html);
...@@ -55,6 +55,7 @@ public class ZhihuTopicSearchCrawler { ...@@ -55,6 +55,7 @@ public class ZhihuTopicSearchCrawler {
return list; return list;
}else{ }else{
log.error("知乎热搜榜单页面获取异常,404"); log.error("知乎热搜榜单页面获取异常,404");
log.error(jsonObject);
} }
} catch (IOException e) { } catch (IOException e) {
log.error("知乎热搜获取异常", e); log.error("知乎热搜获取异常", e);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment