Commit 3e5c72ea by chenweitao

Merge branch 'working' into 'master'

新增微博要闻榜采集功能

See merge request !136
parents 172e5b3c e07e6507
...@@ -113,6 +113,11 @@ public class HotSearchList implements Serializable{ ...@@ -113,6 +113,11 @@ public class HotSearchList implements Serializable{
**/ **/
private String rankPic; private String rankPic;
/**
* 主持人
*/
private String downtext;
public HotSearchList(){} public HotSearchList(){}
public HotSearchList(String url, String name, Long count,Boolean hot,Integer rank,String type,String icon,Date date){ public HotSearchList(String url, String name, Long count,Boolean hot,Integer rank,String type,String icon,Date date){
......
...@@ -29,5 +29,6 @@ public enum HotSearchType { ...@@ -29,5 +29,6 @@ public enum HotSearchType {
淘宝热搜, 淘宝热搜,
抖音同城榜, 抖音同城榜,
微博娱乐榜, 微博娱乐榜,
微博要闻榜,
} }
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.util.*;
/**
* @author ll
* @ClassName: WeiboNewsCrawler
* @Description: 微博要闻榜
* @date 2021年9月27日 上午10:54:31
*/
@Log4j2
public class WeiboNewsCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @return void 返回类型
* @Title: weiboNewsByPhone
* @author ll
* @Description: 手机端微博要闻榜采集
*/
public static List<HotSearchList> weiboNewsByPhone(Date date) {
String url1 = "https://api.weibo.cn/2/page?st_bottom_bar_new_style_enable=1&networktype=wifi&extparam=seat%3D1%26dgr%3D0%26pos%3D0_0%26position%3D%257B%2522objectid%2522%253A%25228008633020000000000%2522%252C%2522name%2522%253A%2522%255Cu5b81%255Cu6ce2%2522%257D%26mi_cid%3D100103%26c_type%3D30%26filter_type%3Drealtimehot%26cate%3D10103%26refresh_type%3D0%26display_time%3D1632707109%26pre_seqid%3D803934167&page_reform_enable=1&launchid=10000365--x&page_interrupt_enable=1&orifid=231619&uicode=10000011&ul_hid=893ae4f7-7b63-459e-a622-454a6fa3542c&ul_sid=893ae4f7-7b63-459e-a622-454a6fa3542c&moduleID=708&just_followed=false&wb_version=4880&lcardid=hot_search&c=android&s=f735389f&ft=0&ua=Xiaomi-Redmi%208__weibo__11.3.0__android__android9&wm=20005_0002&aid=01AxK-lU0KWwz9mhMfL6gTb37upA4rmFhPxq5T1hrsYVnDdks.&fid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&v_f=2&v_p=87&from=10B3095010&gsid=_2A25MVVJCDeRxGeNI7VMV9izPwjSIHXVtQ-KKrDV6PUJbkdCOLWz8kWpNSF_8k5iEXT9MqlN5YIgRREu9j71HIlCa&imsi=&lang=zh_CN&lfid=231619&page=1&skin=default&count=20&oldwm=20005_0002&sflag=1&oriuicode=10000010&containerid=231648_-_3&ignore_inturrpted_error=true&no_location_permission=1&luicode=10000010&android_id=0febc80e083662a7&header_skin_enable=0&ul_ctime=1632709238222&cum=2682A02C";
String url2 = "https://api.weibo.cn/2/page?st_bottom_bar_new_style_enable=1&networktype=wifi&extparam=seat%3D1%26dgr%3D0%26pos%3D0_0%26position%3D%257B%2522objectid%2522%253A%25228008633020000000000%2522%252C%2522name%2522%253A%2522%255Cu5b81%255Cu6ce2%2522%257D%26mi_cid%3D100103%26c_type%3D30%26filter_type%3Drealtimehot%26cate%3D10103%26refresh_type%3D0%26display_time%3D1632707109%26pre_seqid%3D803934167&page_reform_enable=1&launchid=10000365--x&page_interrupt_enable=1&orifid=231619&uicode=10000011&ul_hid=893ae4f7-7b63-459e-a622-454a6fa3542c&ul_sid=893ae4f7-7b63-459e-a622-454a6fa3542c&moduleID=708&just_followed=false&wb_version=4880&lcardid=hot_search&c=android&s=f735389f&ft=0&ua=Xiaomi-Redmi%208__weibo__11.3.0__android__android9&wm=20005_0002&aid=01AxK-lU0KWwz9mhMfL6gTb37upA4rmFhPxq5T1hrsYVnDdks.&fid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&v_f=2&v_p=87&from=10B3095010&gsid=_2A25MVVJCDeRxGeNI7VMV9izPwjSIHXVtQ-KKrDV6PUJbkdCOLWz8kWpNSF_8k5iEXT9MqlN5YIgRREu9j71HIlCa&imsi=&lang=zh_CN&lfid=231619&page=2&skin=default&count=20&oldwm=20005_0002&sflag=1&oriuicode=10000010&containerid=231648_-_3&ignore_inturrpted_error=true&no_location_permission=1&luicode=10000010&android_id=0febc80e083662a7&header_skin_enable=0&ul_ctime=1632709278776&cum=C4386412";
String url3 = "https://api.weibo.cn/2/page?st_bottom_bar_new_style_enable=1&networktype=wifi&extparam=seat%3D1%26dgr%3D0%26pos%3D0_0%26position%3D%257B%2522objectid%2522%253A%25228008633020000000000%2522%252C%2522name%2522%253A%2522%255Cu5b81%255Cu6ce2%2522%257D%26mi_cid%3D100103%26c_type%3D30%26filter_type%3Drealtimehot%26cate%3D10103%26refresh_type%3D0%26display_time%3D1632707109%26pre_seqid%3D803934167&page_reform_enable=1&launchid=10000365--x&page_interrupt_enable=1&orifid=231619&uicode=10000011&ul_hid=893ae4f7-7b63-459e-a622-454a6fa3542c&ul_sid=893ae4f7-7b63-459e-a622-454a6fa3542c&moduleID=708&just_followed=false&wb_version=4880&lcardid=hot_search&c=android&s=f735389f&ft=0&ua=Xiaomi-Redmi%208__weibo__11.3.0__android__android9&wm=20005_0002&aid=01AxK-lU0KWwz9mhMfL6gTb37upA4rmFhPxq5T1hrsYVnDdks.&fid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&v_f=2&v_p=87&from=10B3095010&gsid=_2A25MVVJCDeRxGeNI7VMV9izPwjSIHXVtQ-KKrDV6PUJbkdCOLWz8kWpNSF_8k5iEXT9MqlN5YIgRREu9j71HIlCa&imsi=&lang=zh_CN&lfid=231619&page=3&skin=default&count=20&oldwm=20005_0002&sflag=1&oriuicode=10000010&containerid=231648_-_3&ignore_inturrpted_error=true&no_location_permission=1&luicode=10000010&android_id=0febc80e083662a7&header_skin_enable=0&ul_ctime=1632709335385&cum=E51D64AB";
String htmlBody = null;
Request request1 = RequestUtils.wrapGet(url1);
Request request2 = RequestUtils.wrapGet(url2);
Request request3 = RequestUtils.wrapGet(url3);
for (int count = 0; count <= 5; count++) {
List<HotSearchList> result = new ArrayList();
//发送第一次请求获取前20条数据
try (Response response = httpBoot.syncCall(request1, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error("第一次请求解析微博要闻榜时出现连接失败", e);
continue;
}
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
try {
JSONArray cards = JSONObject.parseObject(htmlBody).getJSONArray("cards");
int rank = 0;
List<HotSearchList> list = parsWeiboNews(date, cards, rank);
result.addAll(list);
} catch (Exception e) {
log.error("解析微博要闻榜时出现解析错误,数据不是json结构", e);
}
} else {
log.info("第一次解析微博要闻榜时出现解析错误,页面结构有问题");
continue;
}
//发送第二次请求获取中间20条数据
try (Response response = httpBoot.syncCall(request2, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error("第二次请求解析微博要闻榜时出现连接失败", e);
continue;
}
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
try {
JSONArray cards = JSONObject.parseObject(htmlBody).getJSONArray("cards");
int rank = 20;
List<HotSearchList> list = parsWeiboNews(date, cards, rank);
result.addAll(list);
} catch (Exception e) {
log.error("解析微博要闻榜时出现解析错误,数据不是json结构", e);
}
} else {
log.info("第二次解析微博要闻榜时出现解析错误,页面结构有问题");
continue;
}
//发送第三次请求获取最后10条数据
try (Response response = httpBoot.syncCall(request3, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error("第三次请求解析微博要闻榜时出现连接失败", e);
continue;
}
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
try {
JSONArray cards = JSONObject.parseObject(htmlBody).getJSONArray("cards");
int rank = 40;
List<HotSearchList> list = parsWeiboNews(date, cards, rank);
result.addAll(list);
} catch (Exception e) {
log.error("解析微博要闻榜时出现解析错误,数据不是json结构", e);
}
} else {
log.info("第三次解析微博要闻榜时出现解析错误,页面结构有问题");
continue;
}
return result;
}
return Collections.emptyList();
}
//解析微博要闻榜
public static List<HotSearchList> parsWeiboNews(Date date,JSONArray cards,int rank) {
List<HotSearchList> weiBoNewsList = new ArrayList();
try {
JSONObject card = cards.getJSONObject(0);
JSONArray cardGroup = card.getJSONArray("card_group");
if (Objects.nonNull(cardGroup) && !cardGroup.isEmpty()) {
boolean hot = true;
for (int i = 0; i < cardGroup.size(); i++) {
JSONObject cardInfo = cardGroup.getJSONObject(i);
//获取标题
String title = cardInfo.getString("title_sub");
String name = title.replaceAll("#", "");
//获取热搜类型
String iconUrl = cardInfo.getString("title_flag_pic");
String icon = null;
if (StringUtils.isNotBlank(iconUrl)) {
icon = iconUrl.split("card8_")[1].split(".png")[0];
}
//获取链接
String id = cardInfo.getString("scheme");
String ul = "https://m.weibo.cn/search?" + id.split("[?]")[1];
//排名自增
rank++;
//获取主持人及阅读量
String desc = cardInfo.getString("desc");
Long commentCount = null;
String downtext =null;
if (Objects.nonNull(desc)) {
if (desc.split("[|]").length > 1){
//获取主持人
downtext = desc.split("[|]")[1].replace("@", "").trim();
String read = desc.split("[|]")[0];
if (read.contains("万阅读")) {
Double num = Double.valueOf(read.split("万")[0]) * 10000;
commentCount = new Double(num).longValue();
} else if (read.contains("亿阅读")) {
Double num = Double.valueOf(read.split("亿")[0]) * 100000000;
commentCount = new Double(num).longValue();
} else {
commentCount = Long.valueOf(read.split("阅读")[0]);
}
}
}
//默认热度值为零
Long hotCount = 0L;
HotSearchList hotSearch = new HotSearchList(ul, name, hotCount, hot, rank, HotSearchType.微博要闻榜.name(), icon, date);
//增加主持人
if (Objects.nonNull(downtext)) {
hotSearch.setDowntext(downtext);
}
//增加阅读量
hotSearch.setCommentCount(commentCount);
//增加热搜类型链接
if (Objects.nonNull(iconUrl)) {
hotSearch.setIconUrl(iconUrl);
}
weiBoNewsList.add(hotSearch);
}
} else {
log.info("card 数据结构为:{}", card);
}
} catch (Exception e) {
log.error("解析微博要闻榜时出现解析错误", e);
}
return weiBoNewsList;
}
}
...@@ -63,6 +63,11 @@ public class HotSearchCacheDAO { ...@@ -63,6 +63,11 @@ public class HotSearchCacheDAO {
if("虎嗅热文推荐".equals(hotSearch.getType())){ if("虎嗅热文推荐".equals(hotSearch.getType())){
document.put("comment_count", hotSearch.getCommentCount()); document.put("comment_count", hotSearch.getCommentCount());
} }
if("微博要闻榜".equals(hotSearch.getType())){
document.put("comment_count", hotSearch.getCommentCount());
document.put("iconUrl", hotSearch.getIconUrl());
document.put("downtext", hotSearch.getDowntext());
}
if("百度热搜".equals(hotSearch.getType())){ if("百度热搜".equals(hotSearch.getType())){
document.put("topic_lead", hotSearch.getTopicLead()); document.put("topic_lead", hotSearch.getTopicLead());
} }
...@@ -80,6 +85,9 @@ public class HotSearchCacheDAO { ...@@ -80,6 +85,9 @@ public class HotSearchCacheDAO {
if("百度热搜".equals(hotSearch.getType())){ if("百度热搜".equals(hotSearch.getType())){
document.remove("topic_lead"); document.remove("topic_lead");
} }
if("微博要闻榜".equals(hotSearch.getType())){
document.remove("downtext");
}
dataes.add(document); dataes.add(document);
}); });
return dataes; return dataes;
...@@ -210,7 +218,10 @@ public class HotSearchCacheDAO { ...@@ -210,7 +218,10 @@ public class HotSearchCacheDAO {
if("虎嗅热文推荐".equals(type)){ if("虎嗅热文推荐".equals(type)){
nowDoc.put("comment_count",document.getLong("comment_count")); nowDoc.put("comment_count",document.getLong("comment_count"));
} }
if("微博要闻榜".equals(type)){
nowDoc.put("downtext",document.getString("downtext"));
nowDoc.put("comment_count",document.getLong("comment_count"));
}
if(topicResult != null){ if(topicResult != null){
nowDoc.put("topicResult",topicResult); nowDoc.put("topicResult",topicResult);
} }
......
...@@ -588,4 +588,17 @@ public class GatherTimer { ...@@ -588,4 +588,17 @@ public class GatherTimer {
TipsUtils.addHotList(HotSearchType.微博娱乐榜.name(), weiBoEntertainmentList); TipsUtils.addHotList(HotSearchType.微博娱乐榜.name(), weiBoEntertainmentList);
logger.info("微博娱乐榜采集结束..."); logger.info("微博娱乐榜采集结束...");
} }
/**
 * Scheduled collection of the Weibo news board (微博要闻榜).
 *
 * <p>Triggered by the cron expression {@code "0 * * * * ? "} and executed asynchronously on
 * the {@code myScheduler} executor. The crawled entries are handed to
 * {@code TipsUtils.addHotList} under the type name {@code 微博要闻榜}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeiBoNews(){
    logger.info("微博要闻榜开始采集...");
    // NOTE(review): DateUtils.getMillSecondTime presumably normalizes the timestamp - confirm.
    Date date = DateUtils.getMillSecondTime(new Date());
    List<HotSearchList> weiboNewsList = WeiboNewsCrawler.weiboNewsByPhone(date);
    logger.info("{}, 微博要闻榜此轮采集到的数据量为:{}", new Date(), weiboNewsList != null ? weiboNewsList.size() : 0);
    TipsUtils.addHotList(HotSearchType.微博要闻榜.name(), weiboNewsList);
    logger.info("微博要闻榜采集结束...");
}
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment