Commit 529937a4 by 马黎滨

Merge branch 'mlb-template-local' into 'mlbWork'

微博话题采集

See merge request !73
parents d3049158 84e8b8f1
...@@ -132,8 +132,8 @@ public class WeiboTopicCrawler { ...@@ -132,8 +132,8 @@ public class WeiboTopicCrawler {
*/ */
public static List<HotSearchList> startCrawlerByPhone(Date date){ public static List<HotSearchList> startCrawlerByPhone(Date date){
List<HotSearchList> topicList = new ArrayList<>(); List<HotSearchList> topicList = new ArrayList<>();
for(int page=1; page<=6; page++){ for(int page=1; page<=3; page++){
String pageUrl = "https://m.weibo.cn/api/container/getIndex?containerid=231648_-_2&page=" + page; String pageUrl = "https://api.weibo.cn/2/page?st_bottom_bar_new_style_enable=1&c=android&s=34dc160d&from=10A9295010&gsid=_2A25NH7inDeRxGeNH4lUX9ifIzTWIHXVvjUtvrDV6PUJbkdANLRjfkWpNSk7RXJ9vYwBfAr66TNj0zcFmOBPKZDuI&containerid=231648_-_4&page=" + page;
Request request = RequestUtils.wrapGet(pageUrl); Request request = RequestUtils.wrapGet(pageUrl);
String htmlBody = null; String htmlBody = null;
//重试三次 //重试三次
...@@ -144,7 +144,7 @@ public class WeiboTopicCrawler { ...@@ -144,7 +144,7 @@ public class WeiboTopicCrawler {
log.error("下载榜单列表页面时出现错误,错误为:{}", e); log.error("下载榜单列表页面时出现错误,错误为:{}", e);
continue; continue;
} }
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) { if (StringUtils.isNotBlank(htmlBody)) {
topicList.addAll(parseTopicHtml(htmlBody,date)); topicList.addAll(parseTopicHtml(htmlBody,date));
break; break;
} else { } else {
...@@ -158,58 +158,52 @@ public class WeiboTopicCrawler { ...@@ -158,58 +158,52 @@ public class WeiboTopicCrawler {
private static List<HotSearchList> parseTopicHtml(String htmlBody,Date date) { private static List<HotSearchList> parseTopicHtml(String htmlBody,Date date) {
try { try {
JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("cards"); JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("cards");
if(Objects.nonNull(jsonArray) && !jsonArray.isEmpty()) { if(Objects.nonNull(jsonArray) && !jsonArray.isEmpty()) {
JSONArray cards = jsonArray.getJSONObject(0).getJSONArray("card_group"); for (int j=0; j< jsonArray.size(); j++){
List<HotSearchList> topicList = new ArrayList<>(); JSONObject card = jsonArray.getJSONObject(j);
Integer rank = null; if(card.containsKey("card_group")){
String topicName = null; JSONArray cards = card.getJSONArray("card_group");
String url = null; List<HotSearchList> topicList = new ArrayList<>();
String description = null; Integer rank = null;
Integer commentNum = null; String topicName = null;
Integer readNum = null; String url = null;
String desc2 = null; String description = null;
for(int i=0; i<cards.size(); i++) { Integer commentNum = null;
JSONObject cardGroup = cards.getJSONObject(i); Integer readNum = null;
rank = cardGroup.getInteger("top_mark_text"); String desc2 = null;
topicName = cardGroup.getString("title_sub"); for(int i=0; i<cards.size(); i++) {
url = "https://s.weibo.com/weibo?q="+ URLCodeUtil.getURLEncode(topicName, "utf-8"); JSONObject cardGroup = cards.getJSONObject(i);
description = null; rank = cardGroup.getInteger("top_mark_text");
if(cardGroup.containsKey("card_expand")){ topicName = cardGroup.getString("title_sub");
description = cardGroup.getJSONObject("card_expand").getString("content"); url = "https://s.weibo.com/weibo?q="+ URLCodeUtil.getURLEncode(topicName, "utf-8");
description = null;
if(cardGroup.containsKey("card_expand")){
description = cardGroup.getJSONObject("card_expand").getString("content");
}
desc2 = cardGroup.getString("desc");
String commentNumStr = desc2.replaceAll("讨论.*", "").trim();
String readNumStr = desc2.replaceAll(".*讨论|阅读", "").trim();
try {
commentNum = TipsUtils.getHotCount(commentNumStr);
readNum = TipsUtils.getHotCount(readNumStr);
}catch (Exception e){
e.printStackTrace();
}
HotSearchList topic = new HotSearchList(url, topicName, readNum, rank, HotSearchType.微博话题.name(), commentNum, description,date);
if(cardGroup.containsKey("title_flag_pic")){
String titlePic = cardGroup.getString("title_flag_pic");
if(titlePic.contains("new")){
topic.setIcon("新");
}else if(titlePic.contains("hot")){
topic.setIcon("热");
}
}
topicList.add(topic);
}
return topicList;
} }
desc2 = cardGroup.getString("desc");
String commentNumStr = desc2.replaceAll("讨论.*", "").trim();
String readNumStr = desc2.replaceAll(".*讨论|阅读", "").trim();
try {
commentNum = TipsUtils.getHotCount(commentNumStr);
readNum = TipsUtils.getHotCount(readNumStr);
// if(commentNumStr.contains("万")){
// commentNumStr = commentNumStr.replaceAll("万", "");
// commentNum = (int)(Double.parseDouble(commentNumStr)*10000);
// }else if(commentNumStr.contains("亿")){
// commentNumStr = commentNumStr.replaceAll("亿", "");
// commentNum = (int)(Double.parseDouble(commentNumStr)*10000000);
// }else{
// commentNum = Integer.getInteger(commentNumStr);
// }
//
// if(readNumStr.contains("万")){
// readNumStr = readNumStr.replaceAll("万", "");
// readNum = (int)(Double.parseDouble(readNumStr)*10000);
// }else if(readNumStr.contains("亿")){
// readNumStr = readNumStr.replaceAll("亿", "");
// readNum = (int)(Double.parseDouble(readNumStr)*10000000);
// }else{
// readNum = Integer.getInteger(readNumStr);
// }
}catch (Exception e){
e.printStackTrace();
}
HotSearchList topic = new HotSearchList(url, topicName, readNum, rank, HotSearchType.微博话题.name(), commentNum, description,date);
topicList.add(topic);
} }
return topicList;
}else{ }else{
// log.info("html:{}",htmlBody); // log.info("html:{}",htmlBody);
} }
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment