Commit b6b50e01 by chenweitao

Merge branch 'working' into 'master'

更新微博话题采集程序

See merge request !177
parents ccbd8308 206c358e
...@@ -13,7 +13,9 @@ import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic; ...@@ -13,7 +13,9 @@ import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
import com.zhiwei.searchhotcrawler.util.TipsUtils; import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.MediaType;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.RequestBody;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
...@@ -129,14 +131,103 @@ public class WeiboTopicCrawler { ...@@ -129,14 +131,103 @@ public class WeiboTopicCrawler {
/** /**
* 微博平话题榜采集 * 微博平话题榜采集
*/ */
public static List<HotSearchList> startCrawlerByPhone(Date date){ // public static List<HotSearchList> startCrawlerByPhone(Date date){
// List<HotSearchList> topicList = new ArrayList<>();
// for(int page=1; page<=3; page++){
// String pageUrl = "https://api.weibo.cn/2/page?st_bottom_bar_new_style_enable=1&c=android&s=34dc160d&from=10A9295010&gsid=_2A25NH7inDeRxGeNH4lUX9ifIzTWIHXVvjUtvrDV6PUJbkdANLRjfkWpNSk7RXJ9vYwBfAr66TNj0zcFmOBPKZDuI&containerid=231648_-_4&page=" + page;
// Request request = RequestUtils.wrapGet(pageUrl);
// String htmlBody = null;
// //重试三次
// for(int retryTimes = 1; retryTimes<=5; retryTimes++) {
// Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
// if (response.hasCause()){
// Throwable cause = response.cause();
// log.error("下载榜单列表页面时出现错误,错误为:{}", cause);
// continue;
// }else {
// htmlBody = response.bodyString();
// }
// if (StringUtils.isNotBlank(htmlBody)) {
// topicList.addAll(parseTopicHtml(htmlBody,date));
// break;
// } else {
// log.info("下载榜单列表页面时数据格式错误,页面为:{}", htmlBody);
// }
// }
// }
// return topicList;
// }
//
//
// private static List<HotSearchList> parseTopicHtml(String htmlBody,Date date) {
// try {
// JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("cards");
// if(Objects.nonNull(jsonArray) && !jsonArray.isEmpty()) {
// for (int j=0; j< jsonArray.size(); j++){
// JSONObject card = jsonArray.getJSONObject(j);
// if(card.containsKey("card_group")){
// JSONArray cards = card.getJSONArray("card_group");
// List<HotSearchList> topicList = new ArrayList<>();
// Integer rank = null;
// String topicName = null;
// String url = null;
// String description = null;
// Long commentNum = null;
// Long readNum = null;
// String desc2 = null;
// for(int i=0; i<cards.size(); i++) {
// JSONObject cardGroup = cards.getJSONObject(i);
// rank = cardGroup.getInteger("top_mark_text");
// topicName = cardGroup.getString("title_sub");
// url = "https://s.weibo.com/weibo?q="+ URLCodeUtil.getURLEncode(topicName, "utf-8");
// description = null;
// if(cardGroup.containsKey("card_expand")){
// description = cardGroup.getJSONObject("card_expand").getString("content");
// }
// desc2 = cardGroup.getString("desc");
// String commentNumStr = desc2.replaceAll("讨论.*", "").trim();
// String readNumStr = desc2.replaceAll(".*讨论|阅读", "").trim();
// try {
// commentNum = TipsUtils.getHotCount(commentNumStr);
// readNum = TipsUtils.getHotCount(readNumStr);
// }catch (Exception e){
// e.printStackTrace();
// }
// HotSearchList topic = new HotSearchList(url, topicName, readNum, rank, HotSearchType.微博话题.name(), commentNum, description,date);
// if(cardGroup.containsKey("title_flag_pic")){
// String titlePic = cardGroup.getString("title_flag_pic");
// if(titlePic.contains("new")){
// topic.setIcon("新");
// }else if(titlePic.contains("hot")){
// topic.setIcon("热");
// }
// }
// topicList.add(topic);
// }
// return topicList;
// }
// }
// }else{
//// log.info("html:{}",htmlBody);
// }
// } catch (Exception e) {
// log.error("解析榜单列表页面时出现错误,错误为:{}", e);
// }
// return Collections.emptyList();
// }
/**
* 微博话题采集(PC端)
*/
public static List<HotSearchList> startCrawlerByPc(Date date){
List<HotSearchList> topicList = new ArrayList<>(); List<HotSearchList> topicList = new ArrayList<>();
for(int page=1; page<=3; page++){ for(int page=1; page<=2; page++){
String pageUrl = "https://api.weibo.cn/2/page?st_bottom_bar_new_style_enable=1&c=android&s=34dc160d&from=10A9295010&gsid=_2A25NH7inDeRxGeNH4lUX9ifIzTWIHXVvjUtvrDV6PUJbkdANLRjfkWpNSk7RXJ9vYwBfAr66TNj0zcFmOBPKZDuI&containerid=231648_-_4&page=" + page; String pageUrl = "https://weibo.com/ajax/statuses/topic_band?sid=v_weibopro&category=all&page="+page+"&count=50";
Request request = RequestUtils.wrapGet(pageUrl); Request request = RequestUtils.wrapGet(pageUrl);
String htmlBody = null; String htmlBody = null;
//重试三次 //重试三次
for(int retryTimes = 1; retryTimes<=5; retryTimes++) { for(int retryTimes = 1; retryTimes<=3; retryTimes++) {
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY); Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
if (response.hasCause()){ if (response.hasCause()){
Throwable cause = response.cause(); Throwable cause = response.cause();
...@@ -146,7 +237,7 @@ public class WeiboTopicCrawler { ...@@ -146,7 +237,7 @@ public class WeiboTopicCrawler {
htmlBody = response.bodyString(); htmlBody = response.bodyString();
} }
if (StringUtils.isNotBlank(htmlBody)) { if (StringUtils.isNotBlank(htmlBody)) {
topicList.addAll(parseTopicHtml(htmlBody,date)); topicList.addAll(parseTopicPcHtml(htmlBody,date));
break; break;
} else { } else {
log.info("下载榜单列表页面时数据格式错误,页面为:{}", htmlBody); log.info("下载榜单列表页面时数据格式错误,页面为:{}", htmlBody);
...@@ -156,67 +247,27 @@ public class WeiboTopicCrawler { ...@@ -156,67 +247,27 @@ public class WeiboTopicCrawler {
return topicList; return topicList;
} }
private static List<HotSearchList> parseTopicPcHtml(String htmlBody,Date date) {
private static List<HotSearchList> parseTopicHtml(String htmlBody,Date date) {
try { try {
JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("cards"); JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
if(Objects.nonNull(jsonArray) && !jsonArray.isEmpty()) { JSONArray jsonArray = data.getJSONArray("statuses");
List<HotSearchList> topicList = new ArrayList<>();
for (int j=0; j< jsonArray.size(); j++){ for (int j=0; j< jsonArray.size(); j++){
JSONObject card = jsonArray.getJSONObject(j); JSONObject card = jsonArray.getJSONObject(j);
if(card.containsKey("card_group")){ Integer rank = card.getInteger("rank");
JSONArray cards = card.getJSONArray("card_group"); String description = card.getString("summary");
List<HotSearchList> topicList = new ArrayList<>(); String topicName = card.getString("topic");
Integer rank = null; Long commentNum = card.getLong("mention");
String topicName = null; Long readNum = card.getLong("read");
String url = null; String url = "https://s.weibo.com/weibo?q="+ URLCodeUtil.getURLEncode("#"+topicName+"#", "utf-8");
String description = null;
Long commentNum = null;
Long readNum = null;
String desc2 = null;
for(int i=0; i<cards.size(); i++) {
JSONObject cardGroup = cards.getJSONObject(i);
rank = cardGroup.getInteger("top_mark_text");
topicName = cardGroup.getString("title_sub");
url = "https://s.weibo.com/weibo?q="+ URLCodeUtil.getURLEncode(topicName, "utf-8");
description = null;
if(cardGroup.containsKey("card_expand")){
description = cardGroup.getJSONObject("card_expand").getString("content");
}
desc2 = cardGroup.getString("desc");
String commentNumStr = desc2.replaceAll("讨论.*", "").trim();
String readNumStr = desc2.replaceAll(".*讨论|阅读", "").trim();
try {
commentNum = TipsUtils.getHotCount(commentNumStr);
readNum = TipsUtils.getHotCount(readNumStr);
}catch (Exception e){
e.printStackTrace();
}
HotSearchList topic = new HotSearchList(url, topicName, readNum, rank, HotSearchType.微博话题.name(), commentNum, description,date); HotSearchList topic = new HotSearchList(url, topicName, readNum, rank, HotSearchType.微博话题.name(), commentNum, description,date);
if(cardGroup.containsKey("title_flag_pic")){
String titlePic = cardGroup.getString("title_flag_pic");
if(titlePic.contains("new")){
topic.setIcon("新");
}else if(titlePic.contains("hot")){
topic.setIcon("热");
}
}
topicList.add(topic); topicList.add(topic);
} }
return topicList; return topicList;
}
}
}else{
// log.info("html:{}",htmlBody);
}
} catch (Exception e) { } catch (Exception e) {
log.error("解析榜单列表页面时出现错误,错误为:{}", e); log.error("解析榜单列表页面时出现错误,错误为:{}", e);
} }
return Collections.emptyList(); return Collections.emptyList();
} }
} }
...@@ -222,7 +222,7 @@ public class GatherTimer { ...@@ -222,7 +222,7 @@ public class GatherTimer {
public void crawlerWeiBoTopic(){ public void crawlerWeiBoTopic(){
log.info("微博话题开始采集..."); log.info("微博话题开始采集...");
Date date = DateUtils.getMillSecondTime(new Date()); Date date = DateUtils.getMillSecondTime(new Date());
List<HotSearchList> list = WeiboTopicCrawler.startCrawlerByPhone(date); List<HotSearchList> list = WeiboTopicCrawler.startCrawlerByPc(date);
log.info("{}, 微博话题此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0)); log.info("{}, 微博话题此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
TipsUtils.addHotList(HotSearchType.微博话题.name(),list); TipsUtils.addHotList(HotSearchType.微博话题.name(),list);
log.info("微博话题采集结束..."); log.info("微博话题采集结束...");
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment