Commit 7e156432 by leiliangliang

新增微博话题采集话题贡献者,关于功能

parent f986b5c8
......@@ -73,4 +73,10 @@ public class WeiBoUser implements Serializable {
this.profileImageUrl = profileImageUrl;
}
/**
 * Creates a user that carries only its identity, the topic it was found under and a timestamp.
 *
 * @param userId   value assigned to the {@code userId} field
 * @param userName value assigned to the {@code userName} field
 * @param topic    value assigned to the {@code topic} field
 * @param time     value assigned to the {@code time} field (stored by reference, not copied)
 */
public WeiBoUser(String userId, String userName, String topic, Date time) {
    this.time = time;
    this.topic = topic;
    this.userName = userName;
    this.userId = userId;
}
}
......@@ -328,17 +328,18 @@ public class WeiboHotSearchCrawler {
}
/**
* 微博热搜数据更新话题贡献者排行,阅读量,讨论量,关于
* 微博热搜数据更新话题贡献者排行,关于
*
* @param document
* @return
*/
public static Document weiboUpdatePC(Document document) {
document.getString("name");
String name = document.getString("name");
String gb = "#" + name + "#";
String encode =null;
String topic = document.getString("name");
String gb = "#" + topic + "#";
String encode = null;
try {
encode = URLEncoder.encode(gb, "utf-8");
encode = URLEncoder.encode(gb, "utf-8");
} catch (UnsupportedEncodingException e) {
log.error("字符解析成URl模式异常", e);
}
......@@ -356,35 +357,37 @@ public class WeiboHotSearchCrawler {
org.jsoup.nodes.Document documen = Jsoup.parse(htmlBody);
//获取贡献者信息
try {
Elements li = documen.select("ul.card-user-list-a").select("li");
if (Objects.isNull(weiBoUserDao)) {
weiBoUserDao = new WeiBoUserDao();
}
if (Objects.nonNull(li)) {
Date date = new Date();
for (Element element : li) {
WeiBoUser weiBoUser = new WeiBoUser();
//获取用户名
String userName = element.select("a.name").text();
//获取用户id
String attr = element.select("span.avator").select("a").first().attr("href");
String userId = attr.substring(14);
String type = "话题贡献者";
String id = userId + "_" + type + "_" + name;
weiBoUser.setType(type);
weiBoUser.setId(id);
weiBoUser.setUserName(userName);
weiBoUser.setUserId(userId);
weiBoUser.setTopic(name);
weiBoUser.setTime(date);
weiBoUserDao.addWeiBoUser(weiBoUser);
Elements cardUser = documen.select("div.card-user");
for (Element element : cardUser) {
if (!element.select("div.card-head").text().isEmpty()) {
Elements li = element.select("ul.card-user-list-a").select("li");
if (Objects.nonNull(li)) {
//循环获取话题贡献者相关信息
for (Element eleme : li) {
String type = "话题贡献者";
writeUser(eleme, type, topic);
}
}
} else {
Elements li = element.select("ul.card-user-list-a").select("li");
if (Objects.nonNull(li)) {
//循环获取话题贡献者相关信息
for (Element eleme : li) {
String type = "当事人";
writeUser(eleme, type, topic);
}
}
}
}
} catch (Exception e) {
log.error("话题贡献者排行采集异常",e);
log.error("话题贡献者排行采集异常", e);
}
Elements dt = documen.select("div.card-about").select("dt");
if (Objects.nonNull(dt)) {
//获取微博关于的相关信息
Elements dd = documen.select("div.card-about").select("dd");
Document dtDocument = new Document();
Document ddDocument = new Document();
......@@ -407,12 +410,36 @@ public class WeiboHotSearchCrawler {
}
return docm;
} catch (Exception e) {
log.error("解析微博话题时出现解析错误",e);
log.error("解析微博话题时出现解析错误", e);
}
}
return document;
}
/**
 * Builds a {@code WeiBoUser} from one list item of a topic page and stores it through
 * {@code weiBoUserDao} (created lazily on first use).
 *
 * <p>The record id is {@code userId + "_" + type + "_" + topic}. Items that have no avatar
 * link, or whose link is too short to contain a user id, are skipped with a warning so that one
 * malformed item does not abort the caller's whole loop.
 *
 * @param eleme one {@code li} element of the user list
 * @param type  category label stored on the user and used as part of the record id
 * @param topic topic name stored on the user and used as part of the record id
 */
private static void writeUser(Element eleme, String type, String topic) {
    // Number of leading href characters that precede the user id.
    // NOTE(review): 14 presumably matches the prefix "//weibo.com/u/" - confirm against the page markup.
    final int userIdOffset = 14;

    Date date = new Date();
    if (Objects.isNull(weiBoUserDao)) {
        weiBoUserDao = new WeiBoUserDao();
    }

    // Display name of the user.
    String userName = eleme.select("a.name").text();

    // first() yields null when the selection is empty, so guard before dereferencing.
    Element avatarLink = eleme.select("span.avator").select("a").first();
    if (avatarLink == null) {
        log.warn("话题用户缺少头像链接,已跳过, topic=" + topic + ", type=" + type);
        return;
    }
    String href = avatarLink.attr("href");
    if (href.length() <= userIdOffset) {
        log.warn("话题用户链接无法解析出用户id,已跳过, href=" + href + ", topic=" + topic);
        return;
    }

    // User id is whatever follows the fixed-length href prefix.
    String userId = href.substring(userIdOffset);
    String id = userId + "_" + type + "_" + topic;

    // Argument order must follow the constructor: (userId, userName, topic, time).
    WeiBoUser weiBoUser = new WeiBoUser(userId, userName, topic, date);
    weiBoUser.setType(type);
    weiBoUser.setId(id);
    weiBoUserDao.addWeiBoUser(weiBoUser);
}
/**
* 解析微博信息
......
......@@ -29,496 +29,496 @@ import java.util.*;
@EnableScheduling
@EnableAsync
public class GatherTimer {
//
// private Logger logger = LoggerFactory.getLogger(GatherTimer.class);
//
// private RedisDao redisDao = new RedisDao();
// /** 知乎数码子分类 */
// private String DIGITAL = "digital";
// /** 知乎国际子分类 */
// private String FOCUS = "focus";
// /** 知乎时事子分类 */
// private String DEPTH = "depth";
//
//
// /**
// * 虎嗅热文推荐的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 * * * * ?")
// public void crawlerHuXiu() {
// logger.info("虎嗅热文推荐开始采集...");
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> huXiuList = HuXiuHotSearchCrawler.HuXiuHotArticleRecommended(date);
// logger.info("{}, 虎嗅热文推荐此轮采集到的数据量为:{}", new Date(), Integer.valueOf(huXiuList != null ? huXiuList.size() : 0));
// TipsUtils.addHotList(HotSearchType.虎嗅热文推荐.name(), huXiuList);
// logger.info("虎嗅热文推荐采集结束...");
//
// /**
// * 36氪人气榜的采集
// */
// logger.info("36氪人气榜开始采集...");
// List<HotSearchList> list36Kr = HotSearch36KrCrawler.hotSearch36Kr(date);
// logger.info("{}, 36氪人气榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list36Kr != null ? list36Kr.size() : 0));
// TipsUtils.addHotList(HotSearchType.人气榜36氪.name(), list36Kr);
// logger.info("36氪人气榜采集结束...");
// }
//
// /**
// * 微博热搜的采集
// */
/** Logger shared by every job in this class. */
private static final Logger logger = LoggerFactory.getLogger(GatherTimer.class);
/** Redis access object; {@code updateWeiBo} uses it to read and clear the pending hot-search id set. */
private final RedisDao redisDao = new RedisDao();
/** Zhihu "digital" sub-category key. */
private static final String DIGITAL = "digital";
/** Zhihu "international" sub-category key. */
private static final String FOCUS = "focus";
/** Zhihu "current affairs" sub-category key. */
private static final String DEPTH = "depth";
/**
 * Collects the HuXiu recommended hot articles and, in the same run, the 36Kr popularity list.
 * Both lists are fetched with the same timestamp and handed to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ?")
public void crawlerHuXiu() {
    logger.info("虎嗅热文推荐开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> huXiuItems = HuXiuHotSearchCrawler.HuXiuHotArticleRecommended(crawlTime);
    logger.info("{}, 虎嗅热文推荐此轮采集到的数据量为:{}", new Date(), huXiuItems == null ? 0 : huXiuItems.size());
    TipsUtils.addHotList(HotSearchType.虎嗅热文推荐.name(), huXiuItems);
    logger.info("虎嗅热文推荐采集结束...");

    // Second half: the 36Kr popularity list, reusing the timestamp taken above.
    logger.info("36氪人气榜开始采集...");
    final List<HotSearchList> kr36Items = HotSearch36KrCrawler.hotSearch36Kr(crawlTime);
    logger.info("{}, 36氪人气榜此轮采集到的数据量为:{}", new Date(), kr36Items == null ? 0 : kr36Items.size());
    TipsUtils.addHotList(HotSearchType.人气榜36.name(), kr36Items);
    logger.info("36氪人气榜采集结束...");
}
/**
 * Collects the Weibo hot-search list and hands it to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeiBo() {
    logger.info("微博热搜开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = WeiboHotSearchCrawler.weiboHotSearchByPhone(crawlTime);
    logger.info("{}, 微博此轮采集到的数据量为:{}", new Date(), hotItems == null ? 0 : hotItems.size());
    TipsUtils.addHotList(HotSearchType.微博热搜.name(), hotItems);
    logger.info("微博热搜采集结束...");
}
/**
 * Refreshes the lead text, read count and discussion count of cached Weibo hot-search entries.
 *
 * <p>Reads the pending id set from Redis, clears that key, then for every id that still has a
 * cached document runs {@code WeiboHotSearchCrawler.weiboUpdate} and persists the result when
 * it contains at least one of {@code topicLead}, {@code readCount} or {@code discussCount}.
 * Sleeps 3 seconds after each processed document.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "45 0/10 * * * ? ")
public void updateWeiBo() {
    logger.info("微博热搜导语更新...");
    HotSearchCacheDAO cacheDao = new HotSearchCacheDAO();
    Set<String> pendingIds = redisDao.getRedisSetData(RedisConfig.WEIBO_HOTSEARCHIDS);
    redisDao.removeRedis(RedisConfig.WEIBO_HOTSEARCHIDS);

    for (String id : pendingIds) {
        Document cached = cacheDao.getHotSearchById(id);
        if (cached == null) {
            continue;
        }
        Document refreshed = WeiboHotSearchCrawler.weiboUpdate(cached);
        boolean hasNewData = refreshed.containsKey("topicLead")
                || refreshed.containsKey("readCount")
                || refreshed.containsKey("discussCount");
        if (hasNewData) {
            cacheDao.updateWeibo(refreshed, id);
        }
        ZhiWeiTools.sleep(3000L);
    }
    logger.info("微博热搜导语更新结束...");
}
/**
 * Collects the Toutiao hot-search list, stores it, and then triggers the read-count update
 * for the same list through {@code TouTiaoExecutor.countTouTiaoReadCount}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerTouTiao() {
    logger.info("今日头条热搜开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone(crawlTime);
    logger.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), hotItems == null ? 0 : hotItems.size());
    TipsUtils.addHotList(HotSearchType.今日头条热搜.name(), hotItems);
    logger.info("今日头条热搜采集结束...");

    logger.info("今日头条热搜详情趋势阅读量更新...");
    TouTiaoExecutor.countTouTiaoReadCount(hotItems);
}
/**
 * Collects the Baidu hot-search list and hands it to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerBaiDu() {
    logger.info("百度热搜开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = BaiDuHotSearchCrawler.baiduHotSearch(crawlTime);
    logger.info("{}, 百度热搜此轮采集到的数据量为:{}", new Date(), hotItems == null ? 0 : hotItems.size());
    TipsUtils.addHotList(HotSearchType.百度热搜.name(), hotItems);
    logger.info("百度热搜采集结束...");
}
/**
 * Collects the Douyin hot-search list and hands it to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerDouYin() {
    logger.info("抖音热搜开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = DouyinHotSearchCrawler.getMobileDouyinHotList(crawlTime);
    logger.info("{}, 抖音热搜此轮采集到的数据量为:{}", new Date(), hotItems == null ? 0 : hotItems.size());
    TipsUtils.addHotList(HotSearchType.抖音热搜.name(), hotItems);
    logger.info("抖音热搜采集结束...");
}
/**
 * Updates the stored URL of every entry in {@code DouyinHotSearchCrawler.list}.
 *
 * <p>For each entry the id is {@code name + "_" + type}; the URL is resolved through
 * {@code DouyinHotSearchCrawler.getDouyinUrl} and written only when it is non-null.
 * When the list is null or empty, only a failure message is logged.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 0/5 * * * ? ")
public void updateDouYinUrl() {
    logger.info("抖音链接更新开始...");
    HotSearchCacheDAO cacheDao = new HotSearchCacheDAO();
    List<HotSearchList> douyinList = DouyinHotSearchCrawler.list;

    if (douyinList == null || douyinList.isEmpty()) {
        logger.info("抖音链接更新失败,抖音热搜列表获取为空。");
        return;
    }

    for (int idx = 0; idx < douyinList.size(); idx++) {
        HotSearchList entry = douyinList.get(idx);
        String name = entry.getName();
        String id = name + "_" + entry.getType();
        // NOTE(review): the hot word is appended to the query string as-is; confirm that
        // getDouyinUrl takes care of URL-encoding it.
        String url = DouyinHotSearchCrawler.getDouyinUrl("https://aweme-hl.snssdk.com/aweme/v1/hot/search/video/list/?hotword=" + name);
        if (url == null) {
            continue;
        }
        Document update = new Document();
        update.put("id", id);
        update.put("url", url);
        cacheDao.updateDouyinUrl(update);
    }
    logger.info("抖音链接更新结束");
}
/**
 * Collects the Zhihu hot list and hands it to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerZhihu() {
    logger.info("知乎热搜开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = ZhihuHotSearchCrawler.getMobileZhihuHotList(crawlTime);
    logger.info("{}, 知乎热搜此轮采集到的数据量为:{}", new Date(), hotItems == null ? 0 : hotItems.size());
    TipsUtils.addHotList(HotSearchType.知乎热搜.name(), hotItems);
    logger.info("知乎热搜采集结束...");
}
/**
 * Collects the Sogou WeChat hot words and hands them to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeChat() {
    logger.info("搜狗微信热词开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotWords = SougoHotSearchCrawler.sougoHotSearch(crawlTime);
    logger.info("{}, 搜狗微信热词采集到的数据量为:{}", new Date(), hotWords == null ? 0 : hotWords.size());
    TipsUtils.addHotList(HotSearchType.搜狗微信热搜.name(), hotWords);
    logger.info("搜狗微信热词采集结束...");
}
/**
 * Collects the Sogou WeChat hot-search list (app-side source) and hands it to
 * {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "20 * * * * ? ")
public void ceawlerSougouHotData() {
    logger.info("搜狗微信热搜开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = SougoHotSearchCrawler.sougouHotDataCrawler(crawlTime);
    logger.info("{}, 搜狗微信热搜此轮采集到的数据量为:{}", new Date(), hotItems == null ? 0 : hotItems.size());
    TipsUtils.addHotList(HotSearchType.搜狗微信客户端热搜.name(), hotItems);
    logger.info("搜狗微信热搜采集结束...");
}
/**
 * Collects the Weibo topic list and hands it to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeiBoTopic() {
    logger.info("微博话题开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> topics = WeiboTopicCrawler.startCrawlerByPhone(crawlTime);
    logger.info("{}, 微博话题此轮采集到的数据量为:{}", new Date(), topics == null ? 0 : topics.size());
    TipsUtils.addHotList(HotSearchType.微博话题.name(), topics);
    logger.info("微博话题采集结束...");
}
/**
 * Collects the Tencent News hot list and hands it to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "10 * * * * ? ")
public void crawlerTengXun() {
    final List<HotSearchList> hotItems =
            TengXunCrawler.getTengXunHotList(DateUtils.getMillSecondTime(new Date()));
    TipsUtils.addHotList(HotSearchType.腾讯新闻.name(), hotItems);
}
/**
 * Collects the Sina hot-spot list and hands it to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "10 * * * * ? ")
public void crawlerXinLangHotSpot() {
    final List<HotSearchList> hotItems =
            XinLangHotSearchCrawler.getXinLangHotSpot(DateUtils.getMillSecondTime(new Date()));
    TipsUtils.addHotList(HotSearchType.新浪热点.name(), hotItems);
}
/**
 * Collects the Sina hot-ranking list and hands it to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "10 * * * * ? ")
public void crawlerXinLangHotSearch() {
    final List<HotSearchList> hotItems =
            XinLangHotSearchCrawler.getXinLangHotSearch(DateUtils.getMillSecondTime(new Date()));
    TipsUtils.addHotList(HotSearchType.新浪热榜.name(), hotItems);
}
/**
 * Collects the NetEase News hot-ranking list and hands it to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "10 * * * * ? ")
public void crawlerWangYiHotSearch() {
    final List<HotSearchList> hotItems =
            WangYiHotSearchCrawler.getWangYiHotSearch(DateUtils.getMillSecondTime(new Date()));
    TipsUtils.addHotList(HotSearchType.网易热榜.name(), hotItems);
}
/**
 * Collects the NetEase News most-discussed (comment) list and hands it to
 * {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "10 * * * * ? ")
public void crawlerWangYiHotComment() {
    final List<HotSearchList> hotItems =
            WangYiHotSearchCrawler.getWangYicomment(DateUtils.getMillSecondTime(new Date()));
    TipsUtils.addHotList(HotSearchType.网易跟帖热议.name(), hotItems);
}
/**
 * Collects the Ifeng (Phoenix) News hot-ranking list and hands it to
 * {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "10 * * * * ? ")
public void crawlerFengHuangHotData() {
    final List<HotSearchList> hotItems =
            FengHuangSearchCrawler.getFengHuangHotData(DateUtils.getMillSecondTime(new Date()));
    TipsUtils.addHotList(HotSearchType.凤凰新闻热榜.name(), hotItems);
}
/**
* 凤凰新闻热搜的采集
*/
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 * * * * ? ")
// public void crawlerWeiBo(){
// logger.info("微博热搜开始采集...");
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> weiboList = WeiboHotSearchCrawler.weiboHotSearchByPhone(date);
// logger.info("{}, 微博此轮采集到的数据量为:{}", new Date(), weiboList != null ? weiboList.size() : 0);
// TipsUtils.addHotList(HotSearchType.微博热搜.name(),weiboList);
// logger.info("微博热搜采集结束...");
// }
//
// /**
// * 微博热搜导语,阅读量,讨论量更新
// */
// @Scheduled(cron = "10 * * * * ? ")
public void crawlerFengHuangHotSearch() {
    // Fetch the Ifeng (Phoenix) News hot-search list for the current timestamp and store it.
    final List<HotSearchList> hotItems =
            FengHuangSearchCrawler.getFengHuangHotSearch(DateUtils.getMillSecondTime(new Date()));
    TipsUtils.addHotList(HotSearchType.凤凰新闻热搜.name(), hotItems);
}
/**
 * Collects the Tencent Jiaozhen (fact-check) list and hands it to {@code TipsUtils.addHotList}.
 *
 * <p>The collected count is logged null-safely, in line with the other crawler jobs of this
 * class, so a crawler that returns {@code null} no longer fails the job with a
 * {@code NullPointerException} before the list is handed over.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "10 * * * * ? ")
public void crawlerTengXunVerificationHotSearch() {
    logger.info("{},腾讯较真辟谣榜开始采集", new Date());
    Date date = DateUtils.getMillSecondTime(new Date());
    List<HotSearchList> list = TengXunCrawler.getTengXunVerificationList(date);
    // Guard against a null result before reading the size.
    logger.info("腾讯较真辟谣榜本轮采集数量:{}", list != null ? list.size() : 0);
    TipsUtils.addHotList(HotSearchType.腾讯较真榜.name(), list);
    logger.info("{},腾讯较真辟谣榜采集结束", new Date());
}
/**
 * Collects the Sohu topic list and hands it to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "20 * * * * ? ")
public void crawlerSouHuTopic() {
    final List<HotSearchList> topics =
            SouhuTopicCrawler.getSouhuTopic(DateUtils.getMillSecondTime(new Date()));
    TipsUtils.addHotList(HotSearchType.搜狐话题.name(), topics);
}
/**
 * Collects the Zhihu hot-search topic list and hands it to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "20 * * * * ? ")
public void crawlerZhihuHotTopic() {
    logger.info("知乎热搜话题开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> topics = ZhihuTopicSearchCrawler.getZhihuTopicSearch(crawlTime);
    logger.info("{}, 知乎热搜话题此轮采集到的数据量为:{}", new Date(), topics == null ? 0 : topics.size());
    TipsUtils.addHotList(HotSearchType.知乎热搜榜单.name(), topics);
    logger.info("知乎热搜话题采集结束...");
}
/**
 * Collects the Weibo pre-heat list and hands it to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "20 * * * * ? ")
public void crawlerWeiBoPreheat() {
    logger.info("微博预热榜开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = WeiboHotSearchCrawler.weiboPreheatSearch(crawlTime);
    logger.info("{},微博预热榜此轮采集到的数据量为:{}", new Date(), hotItems == null ? 0 : hotItems.size());
    TipsUtils.addHotList(HotSearchType.微博预热榜.name(), hotItems);
    logger.info("微博预热榜采集结束...");
}
/**
 * Collects the Zhihu hot list of the "digital" sub-category by delegating to
 * {@code crawlerZhiHuChild}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "20 * * * * ? ")
public void crawlerZhiHuDigital() {
    crawlerZhiHuChild(DIGITAL);
}
/**
 * Collects the Zhihu hot list of the "international" sub-category by delegating to
 * {@code crawlerZhiHuChild}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "20 * * * * ? ")
public void crawlerZhiHuFocus() {
    crawlerZhiHuChild(FOCUS);
}
/**
 * Collects the Zhihu hot list of the "current affairs" sub-category by delegating to
 * {@code crawlerZhiHuChild}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "20 * * * * ? ")
public void crawlerZhiHuDepth() {
    crawlerZhiHuChild(DEPTH);
}
/**
 * Collects the Maimai hot list, retrying while the result is empty, and hands the final result
 * to {@code TipsUtils.addHotList}.
 *
 * <p>Up to 10 retries are made with a 5 second pause before each. A {@code null} result is
 * treated like an empty one, so it triggers a retry instead of a {@code NullPointerException}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "30 0/30 * * * ? ")
public void crawlerMaiMaiHotSearch() {
    final int maxRetries = 10;
    final long retryDelayMillis = 5000L;

    Date date = DateUtils.getMillSecondTime(new Date());
    List<HotSearchList> list = MaiMaiHotSearchCrawler.getMaiMaiHotData(date);
    int retries = 0;
    // Null-safe emptiness check: the crawler result is not guaranteed to be non-null here.
    while ((list == null || list.isEmpty()) && retries < maxRetries) {
        ZhiWeiTools.sleep(retryDelayMillis);
        list = MaiMaiHotSearchCrawler.getMaiMaiHotData(date);
        retries++;
    }
    TipsUtils.addHotList(HotSearchType.脉脉热榜.name(), list);
}
/**
 * Collects the Bilibili ranking list and hands it to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "30 * * * * ? ")
public void crawlerBilibiliHotSearch() {
    final List<HotSearchList> ranking =
            BililiCrawler.getBilibiliHotSearch(DateUtils.getMillSecondTime(new Date()));
    TipsUtils.addHotList(HotSearchType.B站排行榜.name(), ranking);
}
/**
 * Collects the Bilibili hot-search list and hands it to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "30 * * * * ? ")
public void crawlerBilibiliHotData() {
    final List<HotSearchList> hotItems =
            BililiCrawler.getBiHotData(DateUtils.getMillSecondTime(new Date()));
    TipsUtils.addHotList(HotSearchType.B站热搜.name(), hotItems);
}
/**
 * Collects the Weibo super-topic list, converts every entry into a {@code Document} and stores
 * the batch through {@code WeiboSuperTopicDAO.addTopicList}.
 *
 * <p>A {@code null} crawler result ends the run after logging, instead of failing with a
 * {@code NullPointerException} in the conversion loop. The closing log message names the
 * super-topic job, so it can no longer be confused with the one written by the plain
 * Weibo-topic job.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 0 0/3 * * ? ")
public void crawlerWeiBoSuperTopic() {
    logger.info("微博超话采集开始........");
    WeiboSuperTopicDAO weiboTopicDAO = new WeiboSuperTopicDAO();
    List<WeiboSuperTopic> list = WeiboSuperTopicCrawler.startCrawler();
    logger.info("{}, 微博超话此轮采集到的数据量为:{}", new Date(), list != null ? list.size() : 0);
    if (list == null) {
        // Nothing to convert or store.
        logger.info("微博超话采集结束........");
        return;
    }

    List<Document> data = new ArrayList<>(list.size());
    for (WeiboSuperTopic topic : list) {
        logger.info("topic::::{}", topic);
        Document doc = new Document();
        doc.put("_id", topic.getId());
        doc.put("name", topic.getTopicName());
        doc.put("rank", topic.getRank());
        doc.put("score_num", topic.getScore());
        doc.put("fensi_num", topic.getFensi());
        doc.put("post_num", topic.getPostNum());
        doc.put("type", topic.getType());
        doc.put("day", topic.getDay());
        doc.put("time", topic.getTime());
        doc.put("url", topic.getUrl());
        data.add(doc);
    }
    weiboTopicDAO.addTopicList(data);
    logger.info("微博超话采集结束........");
}
// @Async(value = "myScheduler")
// @Scheduled(cron = "45 0/10 * * * ? ")
// public void updateWeiBo(){
// logger.info("微博热搜导语更新...");
// @Scheduled(cron = "0 05 09 * * ? ")
// public void updateWeiboHistory(){
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// Set<String> hotSearchIdSet = redisDao.getRedisSetData(RedisConfig.WEIBO_HOTSEARCHIDS);
// redisDao.removeRedis(RedisConfig.WEIBO_HOTSEARCHIDS);
// Iterator<String> hotSearchIterator = hotSearchIdSet.iterator();
// while (hotSearchIterator.hasNext()){
// String id = hotSearchIterator.next();
// Document document = hotSearchCacheDAO.getHotSearchById(id);
// List<Document> documentList = hotSearchCacheDAO.getHotSearchList();
// int i=0;
// for (Document document : documentList){
// document = WeiboHotSearchCrawler.updateWeiBoTopic(document);
// if(document != null){
// document = WeiboHotSearchCrawler.weiboUpdate(document);
// if(document.containsKey("topicLead") || document.containsKey("readCount") || document.containsKey("discussCount")) {
// hotSearchCacheDAO.updateWeibo(document, id);
// }
// ZhiWeiTools.sleep(3000L);
// }
// }
// logger.info("微博热搜导语更新结束...");
// }
//
// /**
// * 今日头条热搜的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 * * * * ? ")
// public void crawlerTouTiao(){
// logger.info("今日头条热搜开始采集...");
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> toutiaoList = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone(date);
// logger.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), toutiaoList != null ? toutiaoList.size() : 0);
// TipsUtils.addHotList(HotSearchType.今日头条热搜.name(),toutiaoList);
// logger.info("今日头条热搜采集结束...");
// logger.info("今日头条热搜详情趋势阅读量更新...");
// TouTiaoExecutor.countTouTiaoReadCount(toutiaoList);
// }
//
// /**
// * 百度热搜的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 * * * * ? ")
// public void crawlerBaiDu(){
// logger.info("百度热搜开始采集...");
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> baiduList = BaiDuHotSearchCrawler.baiduHotSearch(date);
// logger.info("{}, 百度热搜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(baiduList != null ? baiduList.size() : 0));
// TipsUtils.addHotList(HotSearchType.百度热搜.name(),baiduList);
// logger.info("百度热搜采集结束...");
// }
//
// /**
// * 抖音热搜的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 * * * * ? ")
// public void crawlerDouYin(){
// logger.info("抖音热搜开始采集...");
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> douyinList = DouyinHotSearchCrawler.getMobileDouyinHotList(date);
// logger.info("{}, 抖音热搜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(douyinList != null ? douyinList.size() : 0));
// TipsUtils.addHotList(HotSearchType.抖音热搜.name(),douyinList);
// logger.info("抖音热搜采集结束...");
// }
//
// /**
// * 抖音链接的更新
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 0/5 * * * ? ")
// public void updateDouYinUrl(){
// logger.info("抖音链接更新开始...");
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// List<HotSearchList> douyinList = DouyinHotSearchCrawler.list;
// if(douyinList!=null && douyinList.size()>0){
// for(int i=0; i<douyinList.size(); i++){
// String name = douyinList.get(i).getName();
// String id = name+"_"+douyinList.get(i).getType();
// String url = DouyinHotSearchCrawler.getDouyinUrl("https://aweme-hl.snssdk.com/aweme/v1/hot/search/video/list/?hotword="+name);
// if(url != null) {
// Document document = new Document();
// document.put("id", id);
// document.put("url", url);
// hotSearchCacheDAO.updateDouyinUrl(document);
// }
// hotSearchCacheDAO.updateWeibo(document,document.getString("_id"));
// ZhiWeiTools.sleep(500L);
// }
// logger.info("抖音链接更新结束");
// }else{
// logger.info("抖音链接更新失败,抖音热搜列表获取为空。");
// }
// }
//
// /**
// * 知乎热榜的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 * * * * ? ")
// public void crawlerZhihu(){
// logger.info("知乎热搜开始采集...");
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> zhihuList = ZhihuHotSearchCrawler.getMobileZhihuHotList(date);
// logger.info("{}, 知乎热搜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(zhihuList != null ? zhihuList.size() : 0));
// TipsUtils.addHotList(HotSearchType.知乎热搜.name(),zhihuList);
// logger.info("知乎热搜采集结束...");
// }
//
// /**
// * 搜狗微信热词的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 * * * * ? ")
// public void crawlerWeChat(){
// logger.info("搜狗微信热词开始采集...");
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> list = SougoHotSearchCrawler.sougoHotSearch(date);
// logger.info("{}, 搜狗微信热词采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
// TipsUtils.addHotList(HotSearchType.搜狗微信热搜.name(),list);
// logger.info("搜狗微信热词采集结束...");
// }
//
// /**
// * 搜狗微信热搜的采集(app端采集链接)
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "20 * * * * ? ")
// public void ceawlerSougouHotData(){
// logger.info("搜狗微信热搜开始采集...");
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> list = SougoHotSearchCrawler.sougouHotDataCrawler(date);
// logger.info("{}, 搜狗微信热搜此轮采集到的数据量为:{}", new Date(), list != null ? list.size() : 0);
// TipsUtils.addHotList(HotSearchType.搜狗微信客户端热搜.name(),list);
// logger.info("搜狗微信热搜采集结束...");
// }
//
// /**
// * 微博话题的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 * * * * ? ")
// public void crawlerWeiBoTopic(){
// logger.info("微博话题开始采集...");
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> list = WeiboTopicCrawler.startCrawlerByPhone(date);
// logger.info("{}, 微博话题此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
// TipsUtils.addHotList(HotSearchType.微博话题.name(),list);
// logger.info("微博话题采集结束...");
// }
//
// /**
// * 腾讯新闻热点的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "10 * * * * ? ")
// public void crawlerTengXun(){
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> list = TengXunCrawler.getTengXunHotList(date);
// TipsUtils.addHotList(HotSearchType.腾讯新闻.name(),list);
// }
//
// /**
// * 新浪热点的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "10 * * * * ? ")
// public void crawlerXinLangHotSpot(){
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> list = XinLangHotSearchCrawler.getXinLangHotSpot(date);
// TipsUtils.addHotList(HotSearchType.新浪热点.name(),list);
// }
//
// /**
// * 新浪热榜的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "10 * * * * ? ")
// public void crawlerXinLangHotSearch(){
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> list = XinLangHotSearchCrawler.getXinLangHotSearch(date);
// TipsUtils.addHotList(HotSearchType.新浪热榜.name(),list);
// }
//
// /**
// * 网易新闻热榜的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "10 * * * * ? ")
// public void crawlerWangYiHotSearch(){
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> list = WangYiHotSearchCrawler.getWangYiHotSearch(date);
// TipsUtils.addHotList(HotSearchType.网易热榜.name(),list);
// }
//
// /**
// * 网易新闻跟帖热议的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "10 * * * * ? ")
// public void crawlerWangYiHotComment(){
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> list = WangYiHotSearchCrawler.getWangYicomment(date);
// TipsUtils.addHotList(HotSearchType.网易跟帖热议.name(),list);
// }
//
// /**
// * 凤凰新闻热榜的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "10 * * * * ? ")
// public void crawlerFengHuangHotData(){
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> list = FengHuangSearchCrawler.getFengHuangHotData(date);
// TipsUtils.addHotList(HotSearchType.凤凰新闻热榜.name(),list);
// }
//
// /**
// * 凤凰新闻热搜的采集
// */
//// @Async(value = "myScheduler")
//// @Scheduled(cron = "10 * * * * ? ")
// public void crawlerFengHuangHotSearch(){
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> list = FengHuangSearchCrawler.getFengHuangHotSearch(date);
// TipsUtils.addHotList(HotSearchType.凤凰新闻热搜.name(),list);
// }
//
// /**
// * 腾讯较真辟谣榜采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "10 * * * * ? ")
// public void crawlerTengXunVerificationHotSearch(){
// logger.info("{},腾讯较真辟谣榜开始采集", new Date());
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> list = TengXunCrawler.getTengXunVerificationList(date);
// logger.info("腾讯较真辟谣榜本轮采集数量:{}",list.size());
// TipsUtils.addHotList(HotSearchType.腾讯较真榜.name(), list);
// logger.info("{},腾讯较真辟谣榜采集结束", new Date());
// }
//
// /**
// * 搜狐话题的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "20 * * * * ? ")
// public void crawlerSouHuTopic(){
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> list = SouhuTopicCrawler.getSouhuTopic(date);
// TipsUtils.addHotList(HotSearchType.搜狐话题.name(),list);
// }
//
// /**
// * 知乎热搜话题的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "20 * * * * ? ")
// public void crawlerZhihuHotTopic(){
// logger.info("知乎热搜话题开始采集...");
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> list = ZhihuTopicSearchCrawler.getZhihuTopicSearch(date);
// logger.info("{}, 知乎热搜话题此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
// TipsUtils.addHotList(HotSearchType.知乎热搜榜单.name(),list);
// logger.info("知乎热搜话题采集结束...");
// }
//
// /**
// * 微博预热榜的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "20 * * * * ? ")
// public void crawlerWeiBoPreheat(){
// logger.info("微博预热榜开始采集...");
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> list = WeiboHotSearchCrawler.weiboPreheatSearch(date);
// logger.info("{},微博预热榜此轮采集到的数据量为:{}", new Date(),Integer.valueOf(list != null ? list.size() : 0));
// TipsUtils.addHotList(HotSearchType.微博预热榜.name(),list);
// logger.info("微博预热榜采集结束...");
// }
//
// /**
// * 知乎热搜数码分类采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "20 * * * * ? ")
// public void crawlerZhiHuDigital(){
// this.crawlerZhiHuChild(DIGITAL);
// }
//
// /**
// * 知乎热搜国际分类采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "20 * * * * ? ")
// public void crawlerZhiHuFocus(){
// this.crawlerZhiHuChild(FOCUS);
// }
//
// /**
// * 知乎热搜时事分类采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "20 * * * * ? ")
// public void crawlerZhiHuDepth(){
// this.crawlerZhiHuChild(DEPTH);
// }
//
// /**
// * maimai采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "30 0/30 * * * ? ")
// public void crawlerMaiMaiHotSearch(){
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> list = MaiMaiHotSearchCrawler.getMaiMaiHotData(date);
// int i=0;
// while (list.size()==0 && i<10){
// ZhiWeiTools.sleep(5000L);
// list = MaiMaiHotSearchCrawler.getMaiMaiHotData(date);
// i++;
// logger.info("更新进度:{}",i*100/documentList.size());
// }
// TipsUtils.addHotList(HotSearchType.脉脉热榜.name(),list);
// }
//
// /**
// * B站排行榜采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "30 * * * * ? ")
// public void crawlerBilibiliHotSearch(){
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> list =BililiCrawler.getBilibiliHotSearch(date);
// TipsUtils.addHotList(HotSearchType.B站排行榜.name(),list);
// }
//
// /**
// * B站热搜采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "30 * * * * ? ")
// public void crawlerBilibiliHotData() {
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> list = BililiCrawler.getBiHotData(date);
// TipsUtils.addHotList(HotSearchType.B站热搜.name(),list);
// }
//
// /**
// * 微博超话的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 0 0/3 * * ? ")
// public void crawlerWeiBoSuperTopic(){
// logger.info("微博超话采集开始........");
// Date date = DateUtils.getMillSecondTime(new Date());
// WeiboSuperTopicDAO weiboTopicDAO = new WeiboSuperTopicDAO();
// List<WeiboSuperTopic> list = WeiboSuperTopicCrawler.startCrawler();
// logger.info("{}, 微博超话此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
// List<Document> data = new ArrayList<>();
// for(WeiboSuperTopic topic : list){
// logger.info("topic::::{}", topic);
// Document doc = new Document();
// doc.put("_id", topic.getId());
// doc.put("name", topic.getTopicName());
// doc.put("rank", topic.getRank());
// doc.put("score_num", topic.getScore());
// doc.put("fensi_num", topic.getFensi());
// doc.put("post_num", topic.getPostNum());
// doc.put("type", topic.getType());
// doc.put("day", topic.getDay());
// doc.put("time", topic.getTime());
// doc.put("url", topic.getUrl());
// data.add(doc);
// }
// weiboTopicDAO.addTopicList(data);
// logger.info("微博话题采集结束........");
// }
//
//
//// @Async(value = "myScheduler")
//// @Scheduled(cron = "0 05 09 * * ? ")
//// public void updateWeiboHistory(){
//// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
//// List<Document> documentList = hotSearchCacheDAO.getHotSearchList();
//// int i=0;
//// for (Document document : documentList){
//// document = WeiboHotSearchCrawler.updateWeiBoTopic(document);
//// if(document != null){
//// hotSearchCacheDAO.updateWeibo(document,document.getString("_id"));
//// ZhiWeiTools.sleep(500L);
//// }
//// i++;
//// logger.info("更新进度:{}",i*100/documentList.size());
//// }
//// logger.info("更新结束");
//// }
//
// /**
// * 知乎子类采集函数
// * @param type
// */
// private void crawlerZhiHuChild(String type){
// Date date = DateUtils.getMillSecondTime(new Date());
// String name = this.getTypeName(type);
// logger.info("知乎{}话题热榜采集开始...", name);
// List<HotSearchList> list = ZhihuChildHotSearchCrawler.getZhihuTopicSearch(type,name,date);
// logger.info("{}, 知乎{}话题此轮采集到的数据量为:{}", new Date(),name, Integer.valueOf(list != null ? list.size() : 0));
// TipsUtils.addHotList(name,list);
// logger.info("知乎{}话题热榜采集结束...", name);
// }
//
// private String getTypeName(String type){
// String name;
// switch (type) {
// case "digital":
// name = "数码";
// break;
// case "focus":
// name = "国际";
// break;
// case "depth":
// name = "时事";
// break;
// default:
// name = "";
// }
// return name;
// }
// /**
// *快手热榜采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 * * * * ? ")
// public void crawlerKuaiShou(){
// logger.info("快手热榜开始采集...");
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> kuaiShouList = KuaiShouHotSearchCrawler.KuaiShouHotSearchCrawler(date);
// logger.info("{}, 快手此轮采集到的数据量为:{}", new Date(), kuaiShouList != null ? kuaiShouList.size() : 0);
// TipsUtils.addHotList(HotSearchType.快手热榜.name(), kuaiShouList);
// logger.info("快手热榜采集结束...");
// logger.info("更新结束");
// }
/**
 * Shared collection routine for the Zhihu sub-category hot lists.
 *
 * <p>Resolves the display name of the sub-category through {@code getTypeName}, fetches the
 * list and stores it under that display name.
 *
 * @param type sub-category key passed to {@code getTypeName} and to the crawler
 */
private void crawlerZhiHuChild(String type) {
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final String displayName = this.getTypeName(type);
    logger.info("知乎{}话题热榜采集开始...", displayName);
    final List<HotSearchList> hotItems =
            ZhihuChildHotSearchCrawler.getZhihuTopicSearch(type, displayName, crawlTime);
    logger.info("{}, 知乎{}话题此轮采集到的数据量为:{}", new Date(), displayName, hotItems == null ? 0 : hotItems.size());
    TipsUtils.addHotList(displayName, hotItems);
    logger.info("知乎{}话题热榜采集结束...", displayName);
}
/**
 * Maps a Zhihu sub-category key to its display name.
 *
 * @param type sub-category key; must not be {@code null} (the switch would throw)
 * @return the display name for {@code "digital"}, {@code "focus"} or {@code "depth"},
 *         otherwise an empty string
 */
private String getTypeName(String type) {
    switch (type) {
        case "digital":
            return "数码";
        case "focus":
            return "国际";
        case "depth":
            return "时事";
        default:
            return "";
    }
}
/**
 * Collects the Kuaishou hot list and hands it to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerKuaiShou() {
    logger.info("快手热榜开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = KuaiShouHotSearchCrawler.KuaiShouHotSearchCrawler(crawlTime);
    logger.info("{}, 快手此轮采集到的数据量为:{}", new Date(), hotItems == null ? 0 : hotItems.size());
    TipsUtils.addHotList(HotSearchType.快手热榜.name(), hotItems);
    logger.info("快手热榜采集结束...");
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment