Commit 50407ed7 by leiliangliang

更新微博话题采集程序

parent 2eda193a
......@@ -222,8 +222,8 @@ public class WeiboTopicCrawler {
*/
public static List<HotSearchList> startCrawlerByPc(Date date){
List<HotSearchList> topicList = new ArrayList<>();
for(int page=1; page<=2; page++){
String pageUrl = "https://weibo.com/ajax/statuses/topic_band?sid=v_weibopro&category=all&page="+page+"&count=50";
for(int page=1; page<=6; page++){
String pageUrl = "https://weibo.com/ajax/statuses/topic_band?sid=v_weibopro&category=all&page="+page+"&count=10";
Request request = RequestUtils.wrapGet(pageUrl);
String htmlBody = null;
//重试三次
......@@ -236,8 +236,12 @@ public class WeiboTopicCrawler {
}else {
htmlBody = response.bodyString();
}
if (StringUtils.isNotBlank(htmlBody)) {
topicList.addAll(parseTopicPcHtml(htmlBody,date));
if (htmlBody.contains("data") && Objects.nonNull(JSONObject.parseObject(htmlBody).get("data"))) {
JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
JSONArray statuses = data.getJSONArray("statuses");
if (statuses.size()>0) {
topicList.addAll(parseTopicPcHtml(statuses, date));
}
break;
} else {
log.info("下载榜单列表页面时数据格式错误,页面为:{}", htmlBody);
......@@ -247,10 +251,8 @@ public class WeiboTopicCrawler {
return topicList;
}
private static List<HotSearchList> parseTopicPcHtml(String htmlBody,Date date) {
private static List<HotSearchList> parseTopicPcHtml(JSONArray jsonArray,Date date) {
try {
JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
JSONArray jsonArray = data.getJSONArray("statuses");
List<HotSearchList> topicList = new ArrayList<>();
for (int j=0; j< jsonArray.size(); j++){
JSONObject card = jsonArray.getJSONObject(j);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment