Commit 8a4f438f by leiliangliang

新增微视热榜采集程序及变更微博主持人字段为定时更新

parent 14470085
...@@ -31,5 +31,6 @@ public enum HotSearchType { ...@@ -31,5 +31,6 @@ public enum HotSearchType {
微博娱乐榜, 微博娱乐榜,
微博要闻榜, 微博要闻榜,
B站综合热门, B站综合热门,
微视热榜,
} }
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.MediaType;
import okhttp3.Request;
import okhttp3.RequestBody;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
/**
* @ClassName: WeiShiCrawlerTest
* @Description: 微视热榜采集
* @author ll
* @date 2022年2月22日 上午09:54:31
*/
@Log4j2
public class WeiShiHotSearchCrawler {
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/**
* @return void 返回类型
* @Title: WeiShiCrawlerTest
* @author ll
* @Description: 微视热榜采集
*/
public static List<HotSearchList> weiShiHotSearch(Date date) {
String url = "https://api.weishi.qq.com/trpc.wesee.weishi_search_hotrank.SearchHotrank/GetHotRankV2?_csrf=";
String htmlBody = null;
Map<String, String> headerMap = new HashMap<>();
headerMap.put("Content-Length","85");
headerMap.put("Content-Type","multipart/form-data;charset=UTF-8");
headerMap.put("Host","api.weishi.qq.com");
Request request = RequestUtils.wrapPost(url,headerMap,RequestBody.create(MediaType.get("application/json"),"{\"req_body\":{\"hotRankID\":\"\",\"attachInfo\":\"\",\"hotRankType\":1,\"sourceID\":\"WSSearchH5\"}}"));
for (int count = 0; count <=3; count++) {
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
if (response.hasCause()){
Throwable cause = response.cause();
log.error("解析微视热榜时出现连接失败", cause);
}else {
htmlBody = response.bodyString();
}
List<HotSearchList> result = new ArrayList();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("rsp_body")) {
try {
JSONObject car = JSONObject.parseObject(htmlBody).getJSONObject("rsp_body");
JSONArray cards = car.getJSONArray("hotRankEvents");
for (Object card : cards) {
String ul=null;
boolean hot = true;
JSONObject json = (JSONObject)JSONObject.toJSON(card);
Integer rank = json.getInteger("pos");
String title = json.getString("title");
Long hotCount = json.getLong("hotCount");
JSONObject label = json.getJSONObject("label");
String labelName =null;
String labelUrl =null;
if (Objects.nonNull(label)){
labelName = label.getString("name");
labelUrl = label.getString("labelURL");
}
HotSearchList hotSearch = new HotSearchList(ul, title, hotCount, hot, rank, HotSearchType.微视热榜.name(), labelName, date);
hotSearch.setIconUrl(labelUrl);
result.add(hotSearch);
}
return result;
} catch (Exception e) {
log.error("解析微视热榜出现解析错误,数据不是json结构", e);
}
} else {
log.info("解析微视热榜时出现解析错误,页面结构有问题");
}
}
return Collections.emptyList();
}
}
...@@ -29,6 +29,7 @@ public class HotSearchCacheDAO { ...@@ -29,6 +29,7 @@ public class HotSearchCacheDAO {
/** /**
* 存储数据 * 存储数据
*
* @param dataList * @param dataList
* @return * @return
*/ */
...@@ -114,7 +115,14 @@ public class HotSearchCacheDAO { ...@@ -114,7 +115,14 @@ public class HotSearchCacheDAO {
document.put("view", hotSearch.getView()); document.put("view", hotSearch.getView());
document.put("fans", hotSearch.getFans()); document.put("fans", hotSearch.getFans());
} }
if ("微视热榜".equals(hotSearch.getType())) {
document.put("iconUrl", hotSearch.getIconUrl());
addAndUpdateData(document,true);
}else {
addAndUpdateData(document); addAndUpdateData(document);
}
if ("百度热搜".equals(hotSearch.getType())) { if ("百度热搜".equals(hotSearch.getType())) {
document.remove("topic_lead"); document.remove("topic_lead");
} }
...@@ -135,9 +143,19 @@ public class HotSearchCacheDAO { ...@@ -135,9 +143,19 @@ public class HotSearchCacheDAO {
/** /**
* 添加及更新相应数据表中的数据 * 添加及更新相应数据表中的数据
*
* @param document * @param document
*/ */
public void addAndUpdateData(Document document) { public void addAndUpdateData(Document document) {
addAndUpdateData(document,false);
}
/**
* 添加及更新相应数据表中的数据
*
* @param document
*/
public void addAndUpdateData(Document document, boolean isMostSave) {
try { try {
String name = document.getString("name"); String name = document.getString("name");
String type = document.getString("type"); String type = document.getString("type");
...@@ -155,7 +173,6 @@ public class HotSearchCacheDAO { ...@@ -155,7 +173,6 @@ public class HotSearchCacheDAO {
String id = name + "_" + type; String id = name + "_" + type;
boolean recommend = false; boolean recommend = false;
// Integer readCount = document.getInteger("comment_count");
if ("微博热搜".equals(type)) { if ("微博热搜".equals(type)) {
//排位标判断 例如 https://simg.s.weibo.com/20210408_search_point_orange.png //排位标判断 例如 https://simg.s.weibo.com/20210408_search_point_orange.png
String rankPic = document.getString("rankPic"); String rankPic = document.getString("rankPic");
...@@ -170,8 +187,8 @@ public class HotSearchCacheDAO { ...@@ -170,8 +187,8 @@ public class HotSearchCacheDAO {
} }
Document query = new Document("_id", id); Document query = new Document("_id", id);
//判断是否为微博推荐位,推荐位微博无排名,所以不纳入总的缓存表 //判断是否为微博推荐位,推荐位微博无排名,所以不纳入总的缓存表 当isMostSave为true时忽略排名因素
if (nonNull(lastRank) && lastRank > 0) { if (nonNull(lastRank) && (lastRank > 0 || isMostSave)) {
Document nowDoc = (Document) collection.find(query).first(); Document nowDoc = (Document) collection.find(query).first();
if (nonNull(nowDoc)) { if (nonNull(nowDoc)) {
Integer highestRank = nowDoc.getInteger("highestRank"); Integer highestRank = nowDoc.getInteger("highestRank");
...@@ -193,7 +210,7 @@ public class HotSearchCacheDAO { ...@@ -193,7 +210,7 @@ public class HotSearchCacheDAO {
} }
//判断真实最高排名 //判断真实最高排名
if ("微博热搜".equals(type) && nonNull(realLastRank) && nonNull(realHighestRank)) { if ("微博热搜".equals(type) && nonNull(realLastRank) && nonNull(realHighestRank)) {
if ( realHighestRank < 0) { if (realHighestRank < 0) {
realHighestRank = realLastRank; realHighestRank = realLastRank;
} }
if (realLastRank > 0 && realHighestRank > 0 && realLastRank < realHighestRank) { if (realLastRank > 0 && realHighestRank > 0 && realLastRank < realHighestRank) {
...@@ -209,7 +226,6 @@ public class HotSearchCacheDAO { ...@@ -209,7 +226,6 @@ public class HotSearchCacheDAO {
long firstCount = Long.parseLong(nowDoc.get("firstCount").toString()); long firstCount = Long.parseLong(nowDoc.get("firstCount").toString());
riseSpeed = ((double) (lastCount - firstCount) / (double) firstCount) * 1000 / ((double) duration); riseSpeed = ((double) (lastCount - firstCount) / (double) firstCount) * 1000 / ((double) duration);
} }
// endTime = getEndTime(type, new Date());
//更新相应信息 //更新相应信息
if (url != null && !url.equals(lastUrl)) { if (url != null && !url.equals(lastUrl)) {
nowDoc.put("url", url); nowDoc.put("url", url);
...@@ -224,13 +240,10 @@ public class HotSearchCacheDAO { ...@@ -224,13 +240,10 @@ public class HotSearchCacheDAO {
nowDoc.put("duration", durationNow); nowDoc.put("duration", durationNow);
nowDoc.put("recommend", recommend); nowDoc.put("recommend", recommend);
nowDoc.put("riseSpeed", riseSpeed); nowDoc.put("riseSpeed", riseSpeed);
if ("微博热搜".equals(type)){ if ("微博热搜".equals(type)) {
nowDoc.put("realLastRank", realLastRank); nowDoc.put("realLastRank", realLastRank);
nowDoc.put("realHighestRank", realHighestRank); nowDoc.put("realHighestRank", realHighestRank);
} }
// if(readCount != null){
// nowDoc.put("readCount",readCount);
// }
if (topicResult != null) { if (topicResult != null) {
nowDoc.put("topicResult", topicResult); nowDoc.put("topicResult", topicResult);
} }
...@@ -241,6 +254,12 @@ public class HotSearchCacheDAO { ...@@ -241,6 +254,12 @@ public class HotSearchCacheDAO {
} }
} }
} }
//定时更新主持人字段 讨论量 阅读量 用在榜时长来确定更新时间
if ("微博热搜".equals(type)) {
if (durationNow%10==0){
nowDoc = WeiboHotSearchCrawler.weiboUpdate(nowDoc);
}
}
collection.replaceOne(query, nowDoc); collection.replaceOne(query, nowDoc);
} else { } else {
nowDoc = new Document(); nowDoc = new Document();
...@@ -253,7 +272,7 @@ public class HotSearchCacheDAO { ...@@ -253,7 +272,7 @@ public class HotSearchCacheDAO {
nowDoc.put("type", type); nowDoc.put("type", type);
nowDoc.put("lastRank", lastRank); nowDoc.put("lastRank", lastRank);
nowDoc.put("highestRank", lastRank); nowDoc.put("highestRank", lastRank);
if ("微博热搜".equals(type)){ if ("微博热搜".equals(type)) {
nowDoc.put("realLastRank", realLastRank); nowDoc.put("realLastRank", realLastRank);
nowDoc.put("realHighestRank", realLastRank); nowDoc.put("realHighestRank", realLastRank);
} }
...@@ -267,9 +286,6 @@ public class HotSearchCacheDAO { ...@@ -267,9 +286,6 @@ public class HotSearchCacheDAO {
nowDoc.put("recommend", recommend); nowDoc.put("recommend", recommend);
nowDoc.put("firstCount", lastCount); nowDoc.put("firstCount", lastCount);
nowDoc.put("riseSpeed", 0.00); nowDoc.put("riseSpeed", 0.00);
// if(readCount != null){
// nowDoc.put("readCount",readCount);
// }
if ("脉脉热榜".equals(type)) { if ("脉脉热榜".equals(type)) {
nowDoc.put("content", document.getString("content")); nowDoc.put("content", document.getString("content"));
} }
...@@ -329,15 +345,16 @@ public class HotSearchCacheDAO { ...@@ -329,15 +345,16 @@ public class HotSearchCacheDAO {
collection.insertOne(nowDoc); collection.insertOne(nowDoc);
} }
} }
}catch (MongoWriteException e1){ } catch (MongoWriteException e1) {
log.error("数据写入时出错,数据为【{}】:", document,e1); log.error("数据写入时出错,数据为【{}】:", document, e1);
}catch (Exception e) { } catch (Exception e) {
log.error("数据存储时出错,数据为【{}】:", document,e); log.error("数据存储时出错,数据为【{}】:", document, e);
} }
} }
/** /**
* 抖音链接更新 * 抖音链接更新
*
* @param document * @param document
*/ */
public void updateDouyinUrl(Document document) { public void updateDouyinUrl(Document document) {
...@@ -366,6 +383,7 @@ public class HotSearchCacheDAO { ...@@ -366,6 +383,7 @@ public class HotSearchCacheDAO {
/** /**
* 计算热搜时长 * 计算热搜时长
*
* @param type * @param type
* @param duration * @param duration
* @return * @return
...@@ -398,9 +416,9 @@ public class HotSearchCacheDAO { ...@@ -398,9 +416,9 @@ public class HotSearchCacheDAO {
// } // }
if ("脉脉热榜".equals(type)) { if ("脉脉热榜".equals(type)) {
duration = duration + 30; duration = duration + 30;
}else if("B站综合热门".equals(type)){ } else if ("B站综合热门".equals(type)) {
duration = duration + 60; duration = duration + 60;
}else { } else {
duration = duration + 1; duration = duration + 1;
} }
return duration; return duration;
...@@ -409,6 +427,7 @@ public class HotSearchCacheDAO { ...@@ -409,6 +427,7 @@ public class HotSearchCacheDAO {
/** /**
* 计算结束时间 * 计算结束时间
*
* @param type * @param type
* @param time * @param time
* @return * @return
...@@ -445,6 +464,7 @@ public class HotSearchCacheDAO { ...@@ -445,6 +464,7 @@ public class HotSearchCacheDAO {
/** /**
* 根据主键查询对应热搜 * 根据主键查询对应热搜
*
* @param id * @param id
* @return * @return
*/ */
......
...@@ -625,4 +625,18 @@ public class GatherTimer { ...@@ -625,4 +625,18 @@ public class GatherTimer {
log.info("B站综合热门采集结束..."); log.info("B站综合热门采集结束...");
} }
/**
 * Scheduled WeiShi hot-search collection job.
 *
 * <p>Runs asynchronously on the {@code myScheduler} executor according to the cron expression
 * below, crawls the ranking via {@link WeiShiHotSearchCrawler#weiShiHotSearch(Date)}, logs how
 * many entries were collected and passes them to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeiShiHotSearch() {
    log.info(" 微视热榜采集开始........");

    // One timestamp, taken up front, is passed to the crawler for the whole run.
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotList = WeiShiHotSearchCrawler.weiShiHotSearch(crawlTime);

    int collected = 0;
    if (hotList != null) {
        collected = hotList.size();
    }
    log.info("{}, 微视热榜此轮采集到的数据量为:{}", new Date(), collected);

    TipsUtils.addHotList(HotSearchType.微视热榜.name(), hotList);
    log.info(" 微视热榜采集结束........");
}
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment