Commit 310520db by leiliangliang

新增B站综合热门采集

parent c51af150
...@@ -94,7 +94,7 @@ public class HotSearchList implements Serializable{ ...@@ -94,7 +94,7 @@ public class HotSearchList implements Serializable{
private String topicResult; private String topicResult;
/** /**
* 观看数(目前近B站排行榜使用) * 观看数(目前近B站排行榜及综合热门使用)
*/ */
private Long view; private Long view;
...@@ -122,7 +122,6 @@ public class HotSearchList implements Serializable{ ...@@ -122,7 +122,6 @@ public class HotSearchList implements Serializable{
* 内容 * 内容
*/ */
private String content; private String content;
public HotSearchList(){} public HotSearchList(){}
public HotSearchList(String url, String name, Long count,Boolean hot,Integer rank,String type,String icon,Date date){ public HotSearchList(String url, String name, Long count,Boolean hot,Integer rank,String type,String icon,Date date){
......
...@@ -30,5 +30,6 @@ public enum HotSearchType { ...@@ -30,5 +30,6 @@ public enum HotSearchType {
抖音同城榜, 抖音同城榜,
微博娱乐榜, 微博娱乐榜,
微博要闻榜, 微博要闻榜,
B站综合热门,
} }
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.*;
/**
* @author ll
* @ClassName: BiliComprehensiveHotCrawler
* @Description: B站综合热门
* @date 2021年12月09日 下午14:54:31
*/
@Log4j2
public class BiliComprehensiveHotCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @return List<HotSearchList>
* @Title: getBiliComprehensiveHot
* @author ll
* @Description: pc端B站综合热门采集
*/
public static List<HotSearchList> getBiliComprehensiveHot(Date date) {
DateFormat fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
//十次链接存list集合
List<String> urlList = new ArrayList<>();
for (int j = 1; j <= 10; j++) {
String url="https://api.bilibili.com/x/web-interface/popular?ps=20&pn="+j;
urlList.add(url);
}
String htmlBody = null;
//循环六次拿完整数据 六次都失败则返回空
for (int count = 0; count <= 5; count++) {
int b=0;
//数据集存入result集合
List<HotSearchList> result = new ArrayList();
for (int i = 0; i < urlList.size(); i++) {
Request request = RequestUtils.wrapGet(urlList.get(i));
//发送请求每次获取20条数据
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error(fmt.format(date)+":第"+i+1+"次请求解析B站综合热门时出现连接失败", e);
}
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) {
try {
JSONArray biliList = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("list");
List<HotSearchList> list = parsBiliComprehensiveHot(date, biliList, i);
result.addAll(list);
} catch (Exception e) {
log.error("解析B站综合热门时出现解析错误,数据不是json结构", e);
}
} else {
log.info(fmt.format(date)+":第"+i+1+"次解析B站综合热门时出现解析错误,页面结构有问题");
--i;
b++;
if (b==4){
log.info(fmt.format(date)+":分钟数据舍弃");
return Collections.emptyList();
}
}
}
//返回采集到的200条数据
return result;
}
return Collections.emptyList();
}
//B站综合热门
public static List<HotSearchList> parsBiliComprehensiveHot(Date date,JSONArray list,int a) {
//定义rank变量
int rank;
switch (a) {
case 0:
rank = 0;
break;
case 1:
rank = 20;
break;
case 2:
rank = 40;
break;
case 3:
rank = 60;
break;
case 4:
rank = 80;
break;
case 5:
rank = 100;
break;
case 6:
rank = 120;
break;
case 7:
rank = 140;
break;
case 8:
rank = 160;
break;
case 9:
rank = 180;
break;
default:
rank =0;
}
List<HotSearchList> biliComprehensiveHotList = new ArrayList();
try {
if (Objects.nonNull(list) && !list.isEmpty()) {
boolean hot = true;
for (int i = 0; i < list.size(); i++) {
JSONObject cardInfo = list.getJSONObject(i);
//获取标题
String name = cardInfo.getString("title");
//获取图片链接
String pictureUrl = cardInfo.getString("pic");
//获取链接
String url = cardInfo.getString("short_link");
//排名自增
rank++;
//获取主持人
String downtext = cardInfo.getJSONObject("owner").getString("name");
//获取播放量
Long view =Long.valueOf(cardInfo.getJSONObject("stat").getString("view")) ;
//获取讨论量
Long commentCount = Long.valueOf(cardInfo.getJSONObject("stat").getString("danmaku"));
//获取标签
String label = cardInfo.getJSONObject("rcmd_reason").getString("content");
//默认热度值为null
Long hotCount = null;
HotSearchList hotSearch = new HotSearchList(url, name, hotCount, rank, HotSearchType.B站综合热门.name(), date);
//增加主持人
hotSearch.setDowntext(downtext);
//增加图片链接
hotSearch.setPictureUrl(pictureUrl);
//增加播放量
hotSearch.setView(view);
//增加讨论量
hotSearch.setCommentCount(commentCount);
//增加标签
if (Objects.nonNull(label)) {
hotSearch.setHeatLabel(label);
}
biliComprehensiveHotList.add(hotSearch);
}
} else {
log.info("list 数据结构为:{}", list);
}
} catch (Exception e) {
log.error("解析B站综合热门时出现解析错误", e);
}
return biliComprehensiveHotList;
}
}
...@@ -97,6 +97,12 @@ public class HotSearchCacheDAO { ...@@ -97,6 +97,12 @@ public class HotSearchCacheDAO {
document.put("barrage", hotSearch.getBarrage()); document.put("barrage", hotSearch.getBarrage());
document.put("pictureUrl", hotSearch.getPictureUrl()); document.put("pictureUrl", hotSearch.getPictureUrl());
} }
if ("B站综合热门".equals(hotSearch.getType())) {
document.put("heatLabel", hotSearch.getHeatLabel());
document.put("view", hotSearch.getView());
document.put("pictureUrl", hotSearch.getPictureUrl());
document.put("commentCount", hotSearch.getCommentCount());
}
addAndUpdateData(document); addAndUpdateData(document);
if ("百度热搜".equals(hotSearch.getType())) { if ("百度热搜".equals(hotSearch.getType())) {
document.remove("topic_lead"); document.remove("topic_lead");
...@@ -269,6 +275,9 @@ public class HotSearchCacheDAO { ...@@ -269,6 +275,9 @@ public class HotSearchCacheDAO {
if (picTypes.contains(type)) { if (picTypes.contains(type)) {
nowDoc.put("pictureUrl", pictureUrl); nowDoc.put("pictureUrl", pictureUrl);
} }
if ("B站综合热门".equals(type)) {
nowDoc.put("pictureUrl", pictureUrl);
}
if ("微博热搜".equals(type)) { if ("微博热搜".equals(type)) {
nowDoc = WeiboHotSearchCrawler.weiboUpdate(nowDoc); nowDoc = WeiboHotSearchCrawler.weiboUpdate(nowDoc);
//更新微博话题贡献者,关于功能 //更新微博话题贡献者,关于功能
...@@ -368,7 +377,9 @@ public class HotSearchCacheDAO { ...@@ -368,7 +377,9 @@ public class HotSearchCacheDAO {
// } // }
if ("脉脉热榜".equals(type)) { if ("脉脉热榜".equals(type)) {
duration = duration + 30; duration = duration + 30;
} else { }else if("B站综合热门".equals(type)){
duration = duration + 60;
}else {
duration = duration + 1; duration = duration + 1;
} }
return duration; return duration;
......
...@@ -610,4 +610,19 @@ public class GatherTimer { ...@@ -610,4 +610,19 @@ public class GatherTimer {
WeiBoSearchBoxHotWordsCrawler.weiBoSearchBoxHotWords(date); WeiBoSearchBoxHotWordsCrawler.weiBoSearchBoxHotWords(date);
log.info("微博搜索框热词采集结束........"); log.info("微博搜索框热词采集结束........");
} }
/**
 * Scheduled collection of the Bilibili comprehensive popular list.
 * The cron expression fires at minute 0, second 0 of every hour.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 0 0/1 * * ? ")
public void crawlerBiliComprehensiveHot() {
    log.info("B站综合热门开始采集...");
    final Date collectTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotList = BiliComprehensiveHotCrawler.getBiliComprehensiveHot(collectTime);
    // Report 0 when the crawler hands back null instead of a list.
    int collected = 0;
    if (hotList != null) {
        collected = hotList.size();
    }
    log.info("{}, B站综合热门此轮采集到的数据量为:{}", new Date(), Integer.valueOf(collected));
    TipsUtils.addHotList(HotSearchType.B站综合热门.name(), hotList);
    log.info("B站综合热门采集结束...");
}
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment