Commit 67e92947 by leiliangliang

微博视频榜采集上线

parent 4c1756a3
...@@ -33,5 +33,6 @@ public enum HotSearchType { ...@@ -33,5 +33,6 @@ public enum HotSearchType {
B站综合热门, B站综合热门,
微视热榜, 微视热榜,
微博出圈榜, 微博出圈榜,
微博视频榜,
} }
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import org.apache.commons.lang3.StringUtils;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
/**
* @ClassName: WeiboVideoCrawlerTest
* @Description: 微博视频榜采集
* @author ll
* @date 2022年6月20日 上午09:54:31
*/
@Log4j2
public class WeiboVideoCrawler {
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/**
* @return void 返回类型
* @Title: weiboVideoCrawler
* @author ll
* @Description: 移动端微博视频榜采集
*/
public static List<HotSearchList> weiboVideoCrawler(Date date) {
String url = "https://api.weibo.cn/2/guest/page?st_bottom_bar_new_style_enable=1&networktype=wifi&extparam=seat%3D1%26lcate%3D1001%26filter_type%3Drealtimehot%26dgr%3D0%26mi_cid%3D100103%26position%3D%257B%2522objectid%2522%253A%25228008622060000000000%2522%252C%2522name%2522%253A%2522%255Cu767d%255Cu5c71%255Cu5e02%2522%257D%26c_type%3D30%26pos%3D0_0%26cate%3D10103%26display_time%3D1655705732%26pre_seqid%3D850625824&page_reform_enable=1&search_other_channel=1&launchid=10000365--x&page_interrupt_enable=1&orifid=102803_ctg1_1770_-_ctg1_1770&uicode=10000011&ul_hid=cda76bff-e6f6-4f97-a374-156f2d72850e&ul_sid=cda76bff-e6f6-4f97-a374-156f2d72850e&moduleID=708&checktoken=b43889d54a9ba7b530ab24a251bc5527&just_followed=false&wb_version=5573&refresh_type=0&lcardid=hot_search&c=android&s=fae4ea79&ft=0&ua=Xiaomi-Redmi%208__weibo__12.5.0__android__android9&wm=20005_0002&aid=01AxK-lU0KWwz9mhMfL6gTb37upA4rmFhPxq5T1hrsYVnDdks.&did=0f607264fc6318a92b9e13c65db7cd3cd6178b96&fid=106003_-_type%3A9_-_filter_type%3Avideoband&uid=2004639399897&v_f=2&v_p=90&from=10C5095010&gsid=_2AkMV83aHf8NhqwFRmPwTz2LhZYR_ww_EieKjr4dcJRM3HRl-wT9jqmYstRV6PfAyN0c5v_amzoB_UpmSTeVJb7T3W8hA&imsi=&lang=zh_CN&lfid=102803_ctg1_1770_-_ctg1_1770&page=1&skin=default&count=20&oldwm=20005_0002&sflag=1&oriuicode=10000512&containerid=106003_-_type%3A9_-_filter_type%3Avideoband&ignore_inturrpted_error=true&no_location_permission=1&luicode=10000512&android_id=0febc80e083662a7&header_skin_enable=0&is_top_header=1&is_album_water_fall=1&ul_ctime=1655705749720&cum=ADDC9D1A";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
for (int x = 0; x <= 2; x++) {
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
if (response.hasCause()){
Throwable cause = response.cause();
log.error("解析微博视频榜时出现连接失败", cause);
}else {
htmlBody = response.bodyString();
}
List<HotSearchList> result = new ArrayList();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
try {
JSONArray videoList = JSONObject.parseObject(htmlBody).getJSONArray("cards").getJSONObject(0).getJSONArray("card_group");
Integer rank=0;
for (Object object : videoList) {
rank++;
JSONObject json = (JSONObject)JSONObject.toJSON(object);
String title = json.getString("title");
String video = json.getString("video_count_string");
//热度
long videoCount =0L;
if (video.contains("万")) {
videoCount = new Double(Double.parseDouble(video.split("万")[0])).longValue() * 10000;
}
String play = json.getString("play_count_string");
//播放量
long playCount =0L;
if (play.contains("万")) {
playCount = new Double(Double.parseDouble(play.split("万")[0])).longValue() * 10000;
}
HotSearchList hotSearchList = new HotSearchList(null,title,videoCount,rank,HotSearchType.微博视频榜.name(),date);
hotSearchList.setView(playCount);
result.add(hotSearchList);
}
return result;
} catch (Exception e) {
log.error("解析微博视频榜出现解析错误,数据不是json结构", e);
}
} else {
log.info("解析微博视频榜时出现解析错误,页面结构有问题");
}
}
return Collections.emptyList();
}
}
...@@ -682,4 +682,18 @@ public class GatherTimer { ...@@ -682,4 +682,18 @@ public class GatherTimer {
WeiBoSearchHotWordsCrawler.weiBoSearchHotWords(date); WeiBoSearchHotWordsCrawler.weiBoSearchHotWords(date);
log.info("微博热词采集结束........"); log.info("微博热词采集结束........");
} }
/**
 * Scheduled collection of the Weibo video ranking.
 *
 * <p>Triggered by the cron expression below (second 0 of every minute) on the
 * "myScheduler" executor. Crawls the ranking, logs how many rows were
 * collected and hands them to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeiBoVideo(){
    log.info("微博视频榜采集开始........");
    final Date date = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> weiBoVideoList = WeiboVideoCrawler.weiboVideoCrawler(date);

    // Row count for the log line; a null list is reported as 0.
    int collected = 0;
    if (weiBoVideoList != null) {
        collected = weiBoVideoList.size();
    }
    log.info("{}, 此轮微博视频榜采集到的数据量为:{}", new Date(), Integer.valueOf(collected));

    TipsUtils.addHotList("微博视频榜",weiBoVideoList);
    log.info("微博视频榜采集结束........");
}
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment