Commit 8a4f438f by leiliangliang

新增微视热榜采集程序及变更微博主持人字段为定时更新

parent 14470085
......@@ -31,5 +31,6 @@ public enum HotSearchType {
微博娱乐榜,
微博要闻榜,
B站综合热门,
微视热榜,
}
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.MediaType;
import okhttp3.Request;
import okhttp3.RequestBody;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
/**
* @ClassName: WeiShiCrawlerTest
* @Description: 微视热榜采集
* @author ll
* @date 2022年2月22日 上午09:54:31
*/
@Log4j2
public class WeiShiHotSearchCrawler {
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/**
* @return void 返回类型
* @Title: WeiShiCrawlerTest
* @author ll
* @Description: 微视热榜采集
*/
public static List<HotSearchList> weiShiHotSearch(Date date) {
String url = "https://api.weishi.qq.com/trpc.wesee.weishi_search_hotrank.SearchHotrank/GetHotRankV2?_csrf=";
String htmlBody = null;
Map<String, String> headerMap = new HashMap<>();
headerMap.put("Content-Length","85");
headerMap.put("Content-Type","multipart/form-data;charset=UTF-8");
headerMap.put("Host","api.weishi.qq.com");
Request request = RequestUtils.wrapPost(url,headerMap,RequestBody.create(MediaType.get("application/json"),"{\"req_body\":{\"hotRankID\":\"\",\"attachInfo\":\"\",\"hotRankType\":1,\"sourceID\":\"WSSearchH5\"}}"));
for (int count = 0; count <=3; count++) {
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
if (response.hasCause()){
Throwable cause = response.cause();
log.error("解析微视热榜时出现连接失败", cause);
}else {
htmlBody = response.bodyString();
}
List<HotSearchList> result = new ArrayList();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("rsp_body")) {
try {
JSONObject car = JSONObject.parseObject(htmlBody).getJSONObject("rsp_body");
JSONArray cards = car.getJSONArray("hotRankEvents");
for (Object card : cards) {
String ul=null;
boolean hot = true;
JSONObject json = (JSONObject)JSONObject.toJSON(card);
Integer rank = json.getInteger("pos");
String title = json.getString("title");
Long hotCount = json.getLong("hotCount");
JSONObject label = json.getJSONObject("label");
String labelName =null;
String labelUrl =null;
if (Objects.nonNull(label)){
labelName = label.getString("name");
labelUrl = label.getString("labelURL");
}
HotSearchList hotSearch = new HotSearchList(ul, title, hotCount, hot, rank, HotSearchType.微视热榜.name(), labelName, date);
hotSearch.setIconUrl(labelUrl);
result.add(hotSearch);
}
return result;
} catch (Exception e) {
log.error("解析微视热榜出现解析错误,数据不是json结构", e);
}
} else {
log.info("解析微视热榜时出现解析错误,页面结构有问题");
}
}
return Collections.emptyList();
}
}
......@@ -29,6 +29,7 @@ public class HotSearchCacheDAO {
/**
* 存储数据
*
* @param dataList
* @return
*/
......@@ -114,7 +115,14 @@ public class HotSearchCacheDAO {
document.put("view", hotSearch.getView());
document.put("fans", hotSearch.getFans());
}
if ("微视热榜".equals(hotSearch.getType())) {
document.put("iconUrl", hotSearch.getIconUrl());
addAndUpdateData(document,true);
}else {
addAndUpdateData(document);
}
if ("百度热搜".equals(hotSearch.getType())) {
document.remove("topic_lead");
}
......@@ -135,9 +143,19 @@ public class HotSearchCacheDAO {
/**
 * Adds or updates the cached record for the given document using the default save mode.
 *
 * <p>Delegates to {@code addAndUpdateData(Document, boolean)} with the flag set to
 * {@code false}.
 *
 * @param document the hot-search document to store
 */
public void addAndUpdateData(Document document) {
    final boolean isMostSave = false;
    this.addAndUpdateData(document, isMostSave);
}
/**
* 添加及更新相应数据表中的数据
*
* @param document
*/
public void addAndUpdateData(Document document, boolean isMostSave) {
try {
String name = document.getString("name");
String type = document.getString("type");
......@@ -155,7 +173,6 @@ public class HotSearchCacheDAO {
String id = name + "_" + type;
boolean recommend = false;
// Integer readCount = document.getInteger("comment_count");
if ("微博热搜".equals(type)) {
//排位标判断 例如 https://simg.s.weibo.com/20210408_search_point_orange.png
String rankPic = document.getString("rankPic");
......@@ -170,8 +187,8 @@ public class HotSearchCacheDAO {
}
Document query = new Document("_id", id);
//判断是否为微博推荐位,推荐位微博无排名,所以不纳入总的缓存表
if (nonNull(lastRank) && lastRank > 0) {
//判断是否为微博推荐位,推荐位微博无排名,所以不纳入总的缓存表 当isMostSave为true时忽略排名因素
if (nonNull(lastRank) && (lastRank > 0 || isMostSave)) {
Document nowDoc = (Document) collection.find(query).first();
if (nonNull(nowDoc)) {
Integer highestRank = nowDoc.getInteger("highestRank");
......@@ -193,7 +210,7 @@ public class HotSearchCacheDAO {
}
//判断真实最高排名
if ("微博热搜".equals(type) && nonNull(realLastRank) && nonNull(realHighestRank)) {
if ( realHighestRank < 0) {
if (realHighestRank < 0) {
realHighestRank = realLastRank;
}
if (realLastRank > 0 && realHighestRank > 0 && realLastRank < realHighestRank) {
......@@ -209,7 +226,6 @@ public class HotSearchCacheDAO {
long firstCount = Long.parseLong(nowDoc.get("firstCount").toString());
riseSpeed = ((double) (lastCount - firstCount) / (double) firstCount) * 1000 / ((double) duration);
}
// endTime = getEndTime(type, new Date());
//更新相应信息
if (url != null && !url.equals(lastUrl)) {
nowDoc.put("url", url);
......@@ -224,13 +240,10 @@ public class HotSearchCacheDAO {
nowDoc.put("duration", durationNow);
nowDoc.put("recommend", recommend);
nowDoc.put("riseSpeed", riseSpeed);
if ("微博热搜".equals(type)){
if ("微博热搜".equals(type)) {
nowDoc.put("realLastRank", realLastRank);
nowDoc.put("realHighestRank", realHighestRank);
}
// if(readCount != null){
// nowDoc.put("readCount",readCount);
// }
if (topicResult != null) {
nowDoc.put("topicResult", topicResult);
}
......@@ -241,6 +254,12 @@ public class HotSearchCacheDAO {
}
}
}
//定时更新主持人字段 讨论量 阅读量 用在榜时长来确定更新时间
if ("微博热搜".equals(type)) {
if (durationNow%10==0){
nowDoc = WeiboHotSearchCrawler.weiboUpdate(nowDoc);
}
}
collection.replaceOne(query, nowDoc);
} else {
nowDoc = new Document();
......@@ -253,7 +272,7 @@ public class HotSearchCacheDAO {
nowDoc.put("type", type);
nowDoc.put("lastRank", lastRank);
nowDoc.put("highestRank", lastRank);
if ("微博热搜".equals(type)){
if ("微博热搜".equals(type)) {
nowDoc.put("realLastRank", realLastRank);
nowDoc.put("realHighestRank", realLastRank);
}
......@@ -267,9 +286,6 @@ public class HotSearchCacheDAO {
nowDoc.put("recommend", recommend);
nowDoc.put("firstCount", lastCount);
nowDoc.put("riseSpeed", 0.00);
// if(readCount != null){
// nowDoc.put("readCount",readCount);
// }
if ("脉脉热榜".equals(type)) {
nowDoc.put("content", document.getString("content"));
}
......@@ -329,15 +345,16 @@ public class HotSearchCacheDAO {
collection.insertOne(nowDoc);
}
}
}catch (MongoWriteException e1){
log.error("数据写入时出错,数据为【{}】:", document,e1);
}catch (Exception e) {
log.error("数据存储时出错,数据为【{}】:", document,e);
} catch (MongoWriteException e1) {
log.error("数据写入时出错,数据为【{}】:", document, e1);
} catch (Exception e) {
log.error("数据存储时出错,数据为【{}】:", document, e);
}
}
/**
* 抖音链接更新
*
* @param document
*/
public void updateDouyinUrl(Document document) {
......@@ -366,6 +383,7 @@ public class HotSearchCacheDAO {
/**
* 计算热搜时长
*
* @param type
* @param duration
* @return
......@@ -398,9 +416,9 @@ public class HotSearchCacheDAO {
// }
if ("脉脉热榜".equals(type)) {
duration = duration + 30;
}else if("B站综合热门".equals(type)){
} else if ("B站综合热门".equals(type)) {
duration = duration + 60;
}else {
} else {
duration = duration + 1;
}
return duration;
......@@ -409,6 +427,7 @@ public class HotSearchCacheDAO {
/**
* 计算结束时间
*
* @param type
* @param time
* @return
......@@ -445,6 +464,7 @@ public class HotSearchCacheDAO {
/**
* 根据主键查询对应热搜
*
* @param id
* @return
*/
......
......@@ -625,4 +625,18 @@ public class GatherTimer {
log.info("B站综合热门采集结束...");
}
/**
 * Scheduled collection of the Weishi hot-search ranking.
 *
 * <p>Triggered by the cron expression {@code "0 * * * * ? "} on the {@code myScheduler}
 * executor. Fetches the ranking, logs how many entries were collected, and hands the list to
 * {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeiShiHotSearch() {
    log.info(" 微视热榜采集开始........");

    final Date date = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> weiShiList = WeiShiHotSearchCrawler.weiShiHotSearch(date);

    // Timestamp is taken before the size lookup, matching the original argument order.
    final Date loggedAt = new Date();
    final int collected;
    if (weiShiList == null) {
        collected = 0;
    } else {
        collected = weiShiList.size();
    }
    log.info("{}, 微视热榜此轮采集到的数据量为:{}", loggedAt, collected);

    TipsUtils.addHotList(HotSearchType.微视热榜.name(), weiShiList);

    log.info(" 微视热榜采集结束........");
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment