Commit 8fd37881 by leiliangliang

微博出圈榜采集程序上线

parent 8a4f438f
......@@ -145,6 +145,11 @@ public class HotSearchList implements Serializable{
* 收藏数(目前B站排行榜在用)
*/
private Long collectCount;
/**
* Index score (currently used by the Weibo "out-of-circle" ranking).
*/
private Double exponent;
/** No-argument constructor with an empty body. */
public HotSearchList(){}
public HotSearchList(String url, String name, Long count,Boolean hot,Integer rank,String type,String icon,Date date){
......
......@@ -32,5 +32,6 @@ public enum HotSearchType {
微博要闻榜,
B站综合热门,
微视热榜,
微博出圈榜,
}
package com.zhiwei.searchhotcrawler.crawler;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.*;
/**
* @ClassName: WeiboOutCircleCrawler
* @Description: 微博出圈榜采集
* @author ll
* @date 2022年3月8日 上午09:54:31
*/
@Log4j2
public class WeiboOutCircleCrawler {
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/**
* @return void 返回类型
* @Title: weiboOutCircleByPc
* @author ll
* @Description: pc端微博出圈榜采集
*/
public static List<HotSearchList> weiboOutCircleByPc(Date date) {
String url = "https://s.weibo.com/top/summary/summary?cate=entrank";
Map<String, String> headerMap = new HashMap<>();
headerMap.put("cookie","SUB=_2AkMVeV6pf8NxqwJRmf8dxWjnaI93zA_EieKjJa9yJRMxHRl-yT9jqhwjtRB6PvlwRg9GTjkyY4HPafAcoZ51IoeH2BEB; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFPedwOHQuV9wiApfyy8Myw;");
String htmlBody = null;
Request request = RequestUtils.wrapGet(url,headerMap);
for (int x = 0; x <= 2; x++) {
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
if (response.hasCause()){
Throwable cause = response.cause();
log.error("解析微博出圈榜时出现连接失败", cause);
}else {
htmlBody = response.bodyString();
}
List<HotSearchList> result = new ArrayList();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("margin-top")) {
try {
String body = htmlBody.substring(htmlBody.indexOf("margin-top") + 19, htmlBody.indexOf("class=\"m-footer\"") - 6);
Document document = Jsoup.parse(body);
Elements select = document.select("tr");
for (int i = 0; i < select.size(); i++) {
if (i==0){continue;}
Element element = select.get(i);
Integer rank = Integer.valueOf(element.select("td.ranktop").text());
String href = element.select("a").attr("href");
String ur= "https://s.weibo.com"+href;
String name = element.select("a").text();
String str = element.select("span").text();
String con = str.split("[|]")[0].split(" ")[1].trim();
Long count = Long.valueOf(con);
String zh = str.split("[|]")[1].replaceAll("出圈指数","").replaceAll("%"," ").trim();
Double exponent = Double.valueOf(zh);
String iconUrl =null;
if (!element.select("td.td-03").select("i").isEmpty()) {
String attr = element.select("i").attr("style");
iconUrl = attr.substring(attr.indexOf("(")+1, attr.indexOf(")"));
}
HotSearchList weiboOutCircle = new HotSearchList(ur, name, count, rank, HotSearchType.微博出圈榜.name(), date);
weiboOutCircle.setExponent(exponent);
weiboOutCircle.setIconUrl(iconUrl);
result.add(weiboOutCircle);
}
return result;
} catch (Exception e) {
log.error("解析微博出圈榜出现解析错误,数据不是json结构", e);
}
} else {
log.info("解析微博出圈榜时出现解析错误,页面结构有问题");
}
}
return Collections.emptyList();
}
}
......@@ -116,6 +116,11 @@ public class HotSearchCacheDAO {
document.put("fans", hotSearch.getFans());
}
if ("微博出圈榜".equals(hotSearch.getType())) {
document.put("exponent", hotSearch.getExponent());
document.put("iconUrl", hotSearch.getIconUrl());
}
if ("微视热榜".equals(hotSearch.getType())) {
document.put("iconUrl", hotSearch.getIconUrl());
addAndUpdateData(document,true);
......
......@@ -639,4 +639,17 @@ public class GatherTimer {
log.info(" 微视热榜采集结束........");
}
/**
 * Scheduled collection of the Weibo "out-of-circle" ranking.
 *
 * <p>Runs on the {@code myScheduler} executor with the cron expression
 * {@code "0 * * * * ? "}. Each run fetches the ranking through
 * {@link WeiboOutCircleCrawler#weiboOutCircleByPc(Date)}, logs how many entries were
 * collected and passes the list to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeiBoOutCircle(){
    log.info(" 微博出圈榜采集开始........");

    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> ranking = WeiboOutCircleCrawler.weiboOutCircleByPc(crawlTime);

    // A null list is reported as zero entries.
    int collected = 0;
    if (ranking != null) {
        collected = ranking.size();
    }
    log.info("{}, 微博出圈榜此轮采集到的数据量为:{}", new Date(), collected);

    TipsUtils.addHotList(HotSearchType.微博出圈榜.name(), ranking);
    log.info(" 微博出圈榜采集结束........");
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment