Commit 4237f42e by chenweitao

Merge branch 'working' into 'master'

Working

See merge request !201
parents 2782632a 50407ed7
...@@ -505,6 +505,7 @@ public class WeiboHotSearchCrawler { ...@@ -505,6 +505,7 @@ public class WeiboHotSearchCrawler {
String url = "https://s.weibo.com/weibo?q=" + encode + "&Refer=top"; String url = "https://s.weibo.com/weibo?q=" + encode + "&Refer=top";
String htmlBody = null; String htmlBody = null;
Map<String, String> headerMap = new HashMap<>(); Map<String, String> headerMap = new HashMap<>();
//该cookie有效期一年,微博pc端获取游客cookie链接 https://s.weibo.com/top/summary?cate=realtimehot
headerMap.put("Cookie", "SUB=_2AkMUShJMf8NxqwJRmP0RyWvgb4RwwgnEieKiFuOXJRMxHRl-yT92qlQvtRB6P8o8oso9Ew-s6vf16fdCca-Xz6DwwAMH; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFdAobr6HdAbgQQ9vbUQKDx"); headerMap.put("Cookie", "SUB=_2AkMUShJMf8NxqwJRmP0RyWvgb4RwwgnEieKiFuOXJRMxHRl-yT92qlQvtRB6P8o8oso9Ew-s6vf16fdCca-Xz6DwwAMH; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFdAobr6HdAbgQQ9vbUQKDx");
Request request = RequestUtils.wrapGet(url,headerMap); Request request = RequestUtils.wrapGet(url,headerMap);
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY); Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
......
...@@ -222,8 +222,8 @@ public class WeiboTopicCrawler { ...@@ -222,8 +222,8 @@ public class WeiboTopicCrawler {
*/ */
public static List<HotSearchList> startCrawlerByPc(Date date){ public static List<HotSearchList> startCrawlerByPc(Date date){
List<HotSearchList> topicList = new ArrayList<>(); List<HotSearchList> topicList = new ArrayList<>();
for(int page=1; page<=2; page++){ for(int page=1; page<=6; page++){
String pageUrl = "https://weibo.com/ajax/statuses/topic_band?sid=v_weibopro&category=all&page="+page+"&count=50"; String pageUrl = "https://weibo.com/ajax/statuses/topic_band?sid=v_weibopro&category=all&page="+page+"&count=10";
Request request = RequestUtils.wrapGet(pageUrl); Request request = RequestUtils.wrapGet(pageUrl);
String htmlBody = null; String htmlBody = null;
//重试三次 //重试三次
...@@ -236,8 +236,12 @@ public class WeiboTopicCrawler { ...@@ -236,8 +236,12 @@ public class WeiboTopicCrawler {
}else { }else {
htmlBody = response.bodyString(); htmlBody = response.bodyString();
} }
if (StringUtils.isNotBlank(htmlBody)) { if (htmlBody.contains("data") && Objects.nonNull(JSONObject.parseObject(htmlBody).get("data"))) {
topicList.addAll(parseTopicPcHtml(htmlBody,date)); JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
JSONArray statuses = data.getJSONArray("statuses");
if (statuses.size()>0) {
topicList.addAll(parseTopicPcHtml(statuses, date));
}
break; break;
} else { } else {
log.info("下载榜单列表页面时数据格式错误,页面为:{}", htmlBody); log.info("下载榜单列表页面时数据格式错误,页面为:{}", htmlBody);
...@@ -247,10 +251,8 @@ public class WeiboTopicCrawler { ...@@ -247,10 +251,8 @@ public class WeiboTopicCrawler {
return topicList; return topicList;
} }
private static List<HotSearchList> parseTopicPcHtml(String htmlBody,Date date) { private static List<HotSearchList> parseTopicPcHtml(JSONArray jsonArray,Date date) {
try { try {
JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
JSONArray jsonArray = data.getJSONArray("statuses");
List<HotSearchList> topicList = new ArrayList<>(); List<HotSearchList> topicList = new ArrayList<>();
for (int j=0; j< jsonArray.size(); j++){ for (int j=0; j< jsonArray.size(); j++){
JSONObject card = jsonArray.getJSONObject(j); JSONObject card = jsonArray.getJSONObject(j);
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment