Commit db96247a by zhiwei

修复排名问题

parent 67b48e23
...@@ -57,14 +57,23 @@ public class WeiboHuatiCrawler { ...@@ -57,14 +57,23 @@ public class WeiboHuatiCrawler {
String type = entry.getKey(); String type = entry.getKey();
for(int page= 1; page<=5; page++) { for(int page= 1; page<=5; page++) {
String pageUrl = url + "&page=" + page; String pageUrl = url + "&page=" + page;
try { //重试三次
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string(); for(int retryTimes = 1; retryTimes<=3; retryTimes++) {
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc1")) { try {
topicList.addAll(parseTopicRankHtml(htmlBody, type)); System.out.println("pageUrl=========="+pageUrl);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc1")) {
topicList.addAll(parseTopicRankHtml(page, htmlBody, type));
break;
}else {
logger.error("获取榜单列表页面时数据格式错误,页面为:{}", htmlBody);
}
} catch (Exception e) {
logger.error("获取榜单列表页面时出现错误,错误为:{}", e);
continue;
} }
} catch (Exception e) {
logger.error("获取榜单列表页面时出现错误,错误为:{}", e);
} }
} }
} }
return topicList; return topicList;
...@@ -78,10 +87,12 @@ public class WeiboHuatiCrawler { ...@@ -78,10 +87,12 @@ public class WeiboHuatiCrawler {
* @param type * @param type
* @return void * @return void
*/ */
private static List<WeiboTopic> parseTopicRankHtml(String htmlBody, String type) { private static List<WeiboTopic> parseTopicRankHtml(int page,String htmlBody, String type) {
try { try {
JSONArray list = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("list"); JSONArray list = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("list");
if(Objects.nonNull(list) && !list.isEmpty()) { if(Objects.nonNull(list) && !list.isEmpty()) {
page = (page-1)*20;
List<WeiboTopic> topicList = new ArrayList<>(); List<WeiboTopic> topicList = new ArrayList<>();
Integer toprank = null; Integer toprank = null;
String topicName = null; String topicName = null;
...@@ -92,7 +103,7 @@ public class WeiboHuatiCrawler { ...@@ -92,7 +103,7 @@ public class WeiboHuatiCrawler {
String url = null; String url = null;
for(int i=0;i<list.size();i++) { for(int i=0;i<list.size();i++) {
JSONObject data = list.getJSONObject(i); JSONObject data = list.getJSONObject(i);
toprank = data.getInteger("toprank"); toprank = page + data.getInteger("toprank");
topicName = data.getString("display_name"); topicName = data.getString("display_name");
id = data.getString("page_id"); id = data.getString("page_id");
score = data.getString("score"); score = data.getString("score");
...@@ -102,6 +113,7 @@ public class WeiboHuatiCrawler { ...@@ -102,6 +113,7 @@ public class WeiboHuatiCrawler {
WeiboTopic topic = new WeiboTopic(url, topicName, toprank, score, fensi, type); WeiboTopic topic = new WeiboTopic(url, topicName, toprank, score, fensi, type);
topic = getTopicInfo(id, topic); topic = getTopicInfo(id, topic);
System.out.println("topic====="+topic);
topicList.add(topic); topicList.add(topic);
} }
return topicList; return topicList;
...@@ -123,7 +135,7 @@ public class WeiboHuatiCrawler { ...@@ -123,7 +135,7 @@ public class WeiboHuatiCrawler {
* @return WeiboTopic * @return WeiboTopic
*/ */
private static WeiboTopic getTopicInfo(String id, WeiboTopic topic) { private static WeiboTopic getTopicInfo(String id, WeiboTopic topic) {
for(int i=1;i<=3;i++) { for(int retryTimes=1; retryTimes<=3; retryTimes++) {
try { try {
String url = "https://m.weibo.cn/api/container/getIndex?containerid="+ id; String url = "https://m.weibo.cn/api/container/getIndex?containerid="+ id;
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment