Commit 49d50468 by chenweitao

Merge branch 'working' into 'master'

新增微博娱乐榜采集功能

See merge request !125
parents fd08d8f7 d5a49080
......@@ -27,5 +27,7 @@ public enum HotSearchType {
虎嗅热文推荐,
快手热榜,
淘宝热搜,
抖音同城榜,
微博娱乐榜,
}
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.util.*;
/**
 * Crawler for the Weibo entertainment ranking (微博娱乐榜).
 *
 * <p>Requests the ranking page from the Weibo mobile API and converts the entries of the
 * first card's {@code card_group} into {@link HotSearchList} records.
 *
 * @author ll
 * @date 2021-09-01 10:54:31
 */
@Log4j2
public class WeiboEntertainmentCrawler {
    /**
     * Mobile-API endpoint of the entertainment ranking.
     * NOTE(review): the query string embeds what look like device/session tokens
     * (gsid, checktoken, aid, did ...); presumably they can expire - confirm and consider
     * moving them to configuration.
     */
    private static final String RANKING_URL = "https://api.weibo.cn/2/guest/page?st_bottom_bar_new_style_enable=1&networktype=wifi&extparam=seat%3D1%26dgr%3D0%26pos%3D0_0%26position%3D%255B%255D%26mi_cid%3D100103%26c_type%3D30%26filter_type%3Drealtimehot%26cate%3D10103%26refresh_type%3D0%26display_time%3D1630311727%26pre_seqid%3D759583440&page_reform_enable=1&launchid=10000365--x&page_interrupt_enable=1&orifid=231619&uicode=10000011&ul_hid=069f1ce5-c01b-452a-8e35-63cc129b4922&ul_sid=069f1ce5-c01b-452a-8e35-63cc129b4922&moduleID=708&checktoken=49e4ed3181ae0f794326d93b345953a6&just_followed=false&wb_version=4880&lcardid=hot_search&c=android&s=14a75bb5&ft=0&ua=Xiaomi-Redmi%208__weibo__11.3.0__android__android9&wm=20005_0002&aid=01AxK-lU0KWwz9mhMfL6gTb37upA4rmFhPxq5T1hrsYVnDdks.&did=0f607264fc6318a92b9e13c65db7cd3c4421c4d0&fid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&uid=2004639399897&v_f=2&v_p=87&from=10B3095010&gsid=_2AkMWcBzYf8NhqwFRmPwTz2LhZYR_ww_EieKgLO0DJRM3HRl-wT9kqmIltRV6PfAyN0yL-qVVp2I3Kl7SamvpS9NmO7Ur&imsi=&lang=zh_CN&lfid=231619&page=1&skin=default&count=20&oldwm=20005_0002&sflag=1&oriuicode=10000512&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Dfun&ignore_inturrpted_error=true&no_location_permission=1&luicode=10000512&android_id=0febc80e083662a7&header_skin_enable=0&ul_ctime=1630311759544&cum=E30CEEEA";

    /** Number of fetch-and-parse attempts made before giving up. */
    private static final int MAX_ATTEMPTS = 6;

    /** Prefix prepended to the query part of a card's {@code scheme} to build the entry URL. */
    private static final String SEARCH_URL_PREFIX = "https://m.weibo.cn/search?";

    /** Name of the card that is dropped from the result instead of being stored as an entry. */
    private static final String IGNORED_ENTRY_NAME = "娱乐动态数据详情";

    /** Shared HTTP client, built with {@code retryTimes(3)}. */
    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /**
     * Collects the Weibo entertainment ranking through the mobile API.
     *
     * <p>Up to {@value #MAX_ATTEMPTS} attempts are made. An attempt is abandoned when the
     * request fails, the body is blank or does not contain {@code "cards"}, the body is not
     * valid JSON, or any card in the group cannot be parsed. The first attempt that parses
     * cleanly determines the result.
     *
     * @param date timestamp passed through to every {@link HotSearchList} that is created
     * @return the parsed entries (empty when the first card has no {@code card_group}), or an
     *         immutable empty list when every attempt failed
     * @author ll
     * @date 2021-09-02 16:10:31
     */
    public static List<HotSearchList> weiboEntertainmentByPhone(Date date) {
        Request request = RequestUtils.wrapGet(RANKING_URL);
        for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
            // The body is scoped to the attempt so that a failed fetch can never fall back to
            // re-parsing the response of an earlier attempt.
            String htmlBody = fetchBody(request);
            if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("cards")) {
                log.info("解析微博娱乐榜时出现解析错误,页面结构有问题");
                continue;
            }
            JSONArray cards;
            try {
                cards = JSONObject.parseObject(htmlBody).getJSONArray("cards");
            } catch (Exception e) {
                log.error("解析微博娱乐榜时出现解析错误,数据不是json结构", e);
                continue;
            }
            try {
                return parseFirstCard(cards, date);
            } catch (Exception e) {
                // Any malformed card aborts this attempt; the next attempt refetches the page.
                log.error("解析微博娱乐榜时出现解析错误", e);
            }
        }
        return Collections.emptyList();
    }

    /** Performs one request and returns the response body, or {@code null} when it is unavailable. */
    private static String fetchBody(Request request) {
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            if (response == null || response.body() == null) {
                return null;
            }
            return response.body().string();
        } catch (IOException e) {
            log.error("解析微博娱乐榜时出现连接失败", e);
            return null;
        }
    }

    /** Converts the {@code card_group} of the first card into ranking entries; throws on malformed cards. */
    private static List<HotSearchList> parseFirstCard(JSONArray cards, Date date) {
        List<HotSearchList> result = new ArrayList<>();
        JSONObject card = cards.getJSONObject(0);
        JSONArray cardGroup = card.getJSONArray("card_group");
        if (Objects.isNull(cardGroup) || cardGroup.isEmpty()) {
            log.info("card 数据结构为:{}", card);
            return result;
        }
        final boolean hot = true;
        int rank = 0;
        for (int j = 0; j < cardGroup.size(); j++) {
            JSONObject cardInfo = cardGroup.getJSONObject(j);
            String name = cardInfo.getString("desc");
            long hotCount = cardInfo.getLongValue("desc_extr");
            String icon = cardInfo.getString("icon");
            if (StringUtils.isNotBlank(icon)) {
                // Keep the text between the first '_' and the literal ".png". The previous
                // split(".png") treated '.' as a regex wildcard.
                icon = StringUtils.substringBefore(icon.split("_")[1], ".png");
            }
            String scheme = cardInfo.getString("scheme");
            String entryUrl = SEARCH_URL_PREFIX + scheme.split("[?]")[1];
            // The rank advances for every card, including the one filtered out below.
            rank++;
            HotSearchList hotSearch = new HotSearchList(entryUrl, name, hotCount, hot, rank, HotSearchType.微博娱乐榜.name(), icon, date);
            if (!IGNORED_ENTRY_NAME.equals(hotSearch.getName())) {
                result.add(hotSearch);
            }
        }
        return result;
    }
}
package com.zhiwei.searchhotcrawler.test;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.KuaiShouHotSearchCrawler;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
@Log4j2
public class WeiBoYuLeRun extends Thread{
@Override
public void run() {
boolean f = true;
while(f) {
try {
getHotList();
TimeUnit.MINUTES.sleep(1);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*60*1000);
}
ZhiWeiTools.sleep(50);
}
}
private void getHotList() {
log.info("微博娱乐榜采集开始........");
List<HotSearchList> weiBoEntertainmentList = WeiboEntertainmentCrawlerTest.weiboEntertainmentByPhone(new Date());
log.info("{}, 此轮微博娱乐榜采集到的数据量为:{}", new Date(), Integer.valueOf(weiBoEntertainmentList != null ? weiBoEntertainmentList.size() : 0));
TipsUtils.addHotList("快手热榜",weiBoEntertainmentList);
log.info("微博娱乐榜采集结束........");
}
}
\ No newline at end of file
package com.zhiwei.searchhotcrawler.test;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.util.*;
/**
 * Test-package variant of the Weibo entertainment ranking (微博娱乐榜) crawler.
 *
 * <p>Requests the ranking page from the Weibo mobile API and converts the entries of the
 * first card's {@code card_group} into {@link HotSearchList} records.
 * NOTE(review): appears to mirror {@code com.zhiwei.searchhotcrawler.crawler.WeiboEntertainmentCrawler};
 * confirm whether both copies are still needed.
 *
 * @author ll
 * @date 2021-09-01 10:54:31
 */
@Log4j2
public class WeiboEntertainmentCrawlerTest {
    /**
     * Mobile-API endpoint of the entertainment ranking.
     * NOTE(review): the query string embeds what look like device/session tokens
     * (gsid, checktoken, aid, did ...); presumably they can expire - confirm and consider
     * moving them to configuration.
     */
    private static final String RANKING_URL = "https://api.weibo.cn/2/guest/page?st_bottom_bar_new_style_enable=1&networktype=wifi&extparam=seat%3D1%26dgr%3D0%26pos%3D0_0%26position%3D%255B%255D%26mi_cid%3D100103%26c_type%3D30%26filter_type%3Drealtimehot%26cate%3D10103%26refresh_type%3D0%26display_time%3D1630311727%26pre_seqid%3D759583440&page_reform_enable=1&launchid=10000365--x&page_interrupt_enable=1&orifid=231619&uicode=10000011&ul_hid=069f1ce5-c01b-452a-8e35-63cc129b4922&ul_sid=069f1ce5-c01b-452a-8e35-63cc129b4922&moduleID=708&checktoken=49e4ed3181ae0f794326d93b345953a6&just_followed=false&wb_version=4880&lcardid=hot_search&c=android&s=14a75bb5&ft=0&ua=Xiaomi-Redmi%208__weibo__11.3.0__android__android9&wm=20005_0002&aid=01AxK-lU0KWwz9mhMfL6gTb37upA4rmFhPxq5T1hrsYVnDdks.&did=0f607264fc6318a92b9e13c65db7cd3c4421c4d0&fid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&uid=2004639399897&v_f=2&v_p=87&from=10B3095010&gsid=_2AkMWcBzYf8NhqwFRmPwTz2LhZYR_ww_EieKgLO0DJRM3HRl-wT9kqmIltRV6PfAyN0yL-qVVp2I3Kl7SamvpS9NmO7Ur&imsi=&lang=zh_CN&lfid=231619&page=1&skin=default&count=20&oldwm=20005_0002&sflag=1&oriuicode=10000512&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Dfun&ignore_inturrpted_error=true&no_location_permission=1&luicode=10000512&android_id=0febc80e083662a7&header_skin_enable=0&ul_ctime=1630311759544&cum=E30CEEEA";

    /** Number of fetch-and-parse attempts made before giving up. */
    private static final int MAX_ATTEMPTS = 6;

    /** Prefix prepended to the query part of a card's {@code scheme} to build the entry URL. */
    private static final String SEARCH_URL_PREFIX = "https://m.weibo.cn/search?";

    /** Name of the card that is dropped from the result instead of being stored as an entry. */
    private static final String IGNORED_ENTRY_NAME = "娱乐动态数据详情";

    /** Shared HTTP client, built with {@code retryTimes(3)}. */
    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /**
     * Collects the Weibo entertainment ranking through the mobile API.
     *
     * <p>Up to {@value #MAX_ATTEMPTS} attempts are made. An attempt is abandoned when the
     * request fails, the body is blank or does not contain {@code "cards"}, the body is not
     * valid JSON, or any card in the group cannot be parsed. The first attempt that parses
     * cleanly determines the result.
     *
     * @param date timestamp passed through to every {@link HotSearchList} that is created
     * @return the parsed entries (empty when the first card has no {@code card_group}), or an
     *         immutable empty list when every attempt failed
     * @author ll
     */
    public static List<HotSearchList> weiboEntertainmentByPhone(Date date) {
        Request request = RequestUtils.wrapGet(RANKING_URL);
        for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
            // The body is scoped to the attempt so that a failed fetch can never fall back to
            // re-parsing the response of an earlier attempt.
            String htmlBody = fetchBody(request);
            if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("cards")) {
                log.info("解析微博娱乐榜时出现解析错误,页面结构有问题");
                continue;
            }
            JSONArray cards;
            try {
                cards = JSONObject.parseObject(htmlBody).getJSONArray("cards");
            } catch (Exception e) {
                log.error("解析微博娱乐榜时出现解析错误,数据不是json结构", e);
                continue;
            }
            try {
                return parseFirstCard(cards, date);
            } catch (Exception e) {
                // Any malformed card aborts this attempt; the next attempt refetches the page.
                log.error("解析微博娱乐榜时出现解析错误", e);
            }
        }
        return Collections.emptyList();
    }

    /** Performs one request and returns the response body, or {@code null} when it is unavailable. */
    private static String fetchBody(Request request) {
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            if (response == null || response.body() == null) {
                return null;
            }
            return response.body().string();
        } catch (IOException e) {
            log.error("解析微博娱乐榜时出现连接失败", e);
            return null;
        }
    }

    /** Converts the {@code card_group} of the first card into ranking entries; throws on malformed cards. */
    private static List<HotSearchList> parseFirstCard(JSONArray cards, Date date) {
        List<HotSearchList> result = new ArrayList<>();
        JSONObject card = cards.getJSONObject(0);
        JSONArray cardGroup = card.getJSONArray("card_group");
        if (Objects.isNull(cardGroup) || cardGroup.isEmpty()) {
            log.info("card 数据结构为:{}", card);
            return result;
        }
        final boolean hot = true;
        int rank = 0;
        for (int j = 0; j < cardGroup.size(); j++) {
            JSONObject cardInfo = cardGroup.getJSONObject(j);
            String name = cardInfo.getString("desc");
            long hotCount = cardInfo.getLongValue("desc_extr");
            String icon = cardInfo.getString("icon");
            if (StringUtils.isNotBlank(icon)) {
                // Keep the text between the first '_' and the literal ".png". The previous
                // split(".png") treated '.' as a regex wildcard.
                icon = StringUtils.substringBefore(icon.split("_")[1], ".png");
            }
            String scheme = cardInfo.getString("scheme");
            String entryUrl = SEARCH_URL_PREFIX + scheme.split("[?]")[1];
            // The rank advances for every card, including the one filtered out below.
            rank++;
            HotSearchList hotSearch = new HotSearchList(entryUrl, name, hotCount, hot, rank, HotSearchType.微博娱乐榜.name(), icon, date);
            if (!IGNORED_ENTRY_NAME.equals(hotSearch.getName())) {
                result.add(hotSearch);
            }
        }
        return result;
    }
}
......@@ -10,6 +10,7 @@ import com.zhiwei.searchhotcrawler.dao.RedisDao;
import com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO;
import com.zhiwei.searchhotcrawler.crawler.HotSearch36KrCrawler;
import com.zhiwei.searchhotcrawler.crawler.HuXiuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.test.DouYinTongChengCrawlerTest;
import com.zhiwei.searchhotcrawler.timer.TouTiaoExecutor;
import com.zhiwei.searchhotcrawler.util.DateUtils;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
......@@ -533,4 +534,59 @@ public class GatherTimer {
// TipsUtils.addHotList(HotSearchType.淘宝热搜.name(), taoBaoList);
// logger.info("淘宝热搜采集结束...");
// }
//
// /**
// * 抖音同城榜的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 * * * * ? ")
// public void crawlerDouYinTongCheng(){
// logger.info("抖音同城榜开始采集...");
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> douyinTongChengList = DouYinTongChengCrawlerTest.DouYinTongChengCrawler(date);
// logger.info("{}, 抖音同城榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(douyinTongChengList != null ? douyinTongChengList.size() : 0));
// TipsUtils.addHotList(HotSearchType.抖音同城榜.name(),douyinTongChengList);
// logger.info("抖音同城榜采集结束...");
// }
//
// /**
// * 抖音同城链接的更新
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 0/5 * * * ? ")
// public void updateDouYinTongChengUrl(){
// logger.info("抖音同城链接更新开始...");
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// List<HotSearchList> douyinTongChengList = DouYinTongChengCrawlerTest.list;
// if(douyinTongChengList!=null && douyinTongChengList.size()>0){
// for(int i=0; i<douyinTongChengList.size(); i++){
// String name = douyinTongChengList.get(i).getName();
// String id = name+"_"+douyinTongChengList.get(i).getType();
// String url = DouYinTongChengCrawlerTest.getDouyinTongChengUrl("https://api5-normal-c-lq.amemv.com/aweme/v1/hot/search/video/list/?hotword="+name);
// if(url != null) {
// Document document = new Document();
// document.put("id", id);
// document.put("url", url);
// hotSearchCacheDAO.updateDouyinUrl(document);
// }
// }
// logger.info("抖音同城链接更新结束");
// }else{
// logger.info("抖音同城链接更新失败,抖音同城榜列表获取为空。");
// }
// }
/**
 * Scheduled job that collects the Weibo entertainment ranking.
 *
 * <p>Triggered by the cron expression {@code "0 * * * * ? "} on the {@code myScheduler}
 * executor. The crawl result is logged by size and then handed to
 * {@code TipsUtils.addHotList} under the {@code HotSearchType.微博娱乐榜} name.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeiBoEntertainment(){
    logger.info("微博娱乐榜开始采集...");
    // NOTE(review): presumably normalizes the timestamp precision - confirm in DateUtils.
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> entries = WeiboEntertainmentCrawler.weiboEntertainmentByPhone(crawlTime);
    // A null result is reported as zero entries.
    int collected = 0;
    if (entries != null) {
        collected = entries.size();
    }
    logger.info("{}, 微博娱乐榜此轮采集到的数据量为:{}", new Date(), collected);
    TipsUtils.addHotList(HotSearchType.微博娱乐榜.name(), entries);
    logger.info("微博娱乐榜采集结束...");
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment