Commit f8861322 by chenweitao

Merge branch 'mlbWork' into 'master'

Mlb work

See merge request !50
parents 81f2f704 987430de
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
17.网易新闻跟帖热议 17.网易新闻跟帖热议
18.搜狗微信热搜 18.搜狗微信热搜
19.微博话题 19.微博话题
20.微博预热榜
#### Mongo内网 #### Mongo内网
192.168.0.101,192.168.0.106,192.168.0.108 192.168.0.101,192.168.0.106,192.168.0.108
...@@ -42,6 +43,12 @@ ...@@ -42,6 +43,12 @@
30000 30000
#### Mongo数据表名 #### Mongo数据表名
hot_search_list hot_search_list
#### zookeeper
zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182
#### redis
redis.host = 192.168.0.39
redis.port = 6379
redis.database = 1
......
...@@ -112,6 +112,13 @@ ...@@ -112,6 +112,13 @@
<artifactId>spring-tx</artifactId> <artifactId>spring-tx</artifactId>
<version>${spring.version}</version> <version>${spring.version}</version>
</dependency> </dependency>
<!-- redis写 -->
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.8.1</version>
</dependency>
</dependencies> </dependencies>
<build> <build>
......
...@@ -80,7 +80,20 @@ public class HotSearchCache { ...@@ -80,7 +80,20 @@ public class HotSearchCache {
*/ */
private Boolean recommend; private Boolean recommend;
/**
* Read count.
*/
private Integer readCount;
/**
* Discussion count.
*/
private Integer discussCount;
/**
* Truth verdict of the topic (used by the Tencent "Jiaozhen" fact-check list).
*/
private String topicResult;
public HotSearchCache(String url, String name, String topicLead, Integer highestCount, Integer lastCount, Boolean hot, public HotSearchCache(String url, String name, String topicLead, Integer highestCount, Integer lastCount, Boolean hot,
Date startTime, Date endTime, Integer highestRank, Integer lastRank, String type, Integer duration){ Date startTime, Date endTime, Integer highestRank, Integer lastRank, String type, Integer duration){
...@@ -107,4 +120,28 @@ public class HotSearchCache { ...@@ -107,4 +120,28 @@ public class HotSearchCache {
public void setRecommend(Boolean recommend) { public void setRecommend(Boolean recommend) {
this.recommend = recommend; this.recommend = recommend;
} }
/** @return the stored read count (may be null; the field is a boxed Integer) */
public Integer getReadCount() {
return readCount;
}
/** @param readCount the read count to store */
public void setReadCount(Integer readCount) {
this.readCount = readCount;
}
/** @return the stored discussion count (may be null; the field is a boxed Integer) */
public Integer getDiscussCount() {
return discussCount;
}
/** @param discussCount the discussion count to store */
public void setDiscussCount(Integer discussCount) {
this.discussCount = discussCount;
}
/** @return the stored topic lead text */
public String getTopicLead() {
return topicLead;
}
/** @param topicLead the topic lead text to store */
public void setTopicLead(String topicLead) {
this.topicLead = topicLead;
}
} }
...@@ -75,10 +75,15 @@ public class HotSearchList implements Serializable{ ...@@ -75,10 +75,15 @@ public class HotSearchList implements Serializable{
private String icon; private String icon;
/** /**
* 话题讨论量 * 话题讨论量或阅读量
*/ */
private Integer commentCount; private Integer commentCount;
/**
* Truth verdict of the topic (used by the Tencent "Jiaozhen" fact-check list).
*/
private String topicResult;
public HotSearchList(){} public HotSearchList(){}
public HotSearchList(String url, String name, Integer count,Boolean hot,Integer rank,String type,String icon,Date date){ public HotSearchList(String url, String name, Integer count,Boolean hot,Integer rank,String type,String icon,Date date){
...@@ -122,4 +127,17 @@ public class HotSearchList implements Serializable{ ...@@ -122,4 +127,17 @@ public class HotSearchList implements Serializable{
this.topicLead = topicLead; this.topicLead = topicLead;
} }
public HotSearchList(String url, String name, Integer count, Boolean hot,Integer rank, String type, Date date, String icon, String topicResult){
this.id = name + "_" + new Date().getTime() + "_" + type;
this.url = url;
this.name = name;
this.hot = hot;
this.count = count;
this.rank = rank;
this.time = date;
this.day = TimeParse.dateFormartString(date, "yyyy-MM-dd");
this.type = type;
this.icon = icon;
this.topicResult = topicResult;
}
} }
...@@ -17,5 +17,6 @@ public enum HotSearchType { ...@@ -17,5 +17,6 @@ public enum HotSearchType {
凤凰新闻热搜, 凤凰新闻热搜,
网易热榜, 网易热榜,
网易跟帖热议, 网易跟帖热议,
微博预热榜 微博预热榜,
腾讯较真榜
} }
package com.zhiwei.searchhotcrawler.config;
import java.io.IOException;
import java.util.Properties;
/**
 * Static holder for the Redis connection settings read once, at class-initialisation
 * time, from {@code redis.properties} on the classpath.
 *
 * <p>Loading is best-effort: if the resource is missing or unreadable the problem is
 * reported on {@code System.err} and the fields keep their {@code null} defaults.
 * A key that is absent or blank also yields {@code null}. An integer key whose value
 * is present but not numeric aborts initialisation with an
 * {@link IllegalStateException} naming the key.
 */
public class RedisConfig {

    /** Classpath resource the settings are read from. */
    private static final String RESOURCE = "redis.properties";

    public static String redisHost;

    public static Integer redisPort;

    public static String redisPassword;

    public static Integer redisDataBase;

    public static Integer redisMaxIdle;

    public static Integer redisMinIdle;

    public static Integer redisMaxTotal;

    public static Integer redisTimeout;

    /** Redis set key holding the ids of collected Weibo hot searches. */
    public static final String WEIBO_HOTSEARCHIDS = "weibo_hotsearchIds";

    static {
        Properties redisProperties = new Properties();
        // try-with-resources closes the stream; a null resource is legal and simply skipped.
        try (java.io.InputStream in = RedisConfig.class.getClassLoader().getResourceAsStream(RESOURCE)) {
            if (in == null) {
                System.err.println("RedisConfig: " + RESOURCE + " not found on the classpath; Redis settings are unset");
            } else {
                redisProperties.load(in);
                redisHost = stringProperty(redisProperties, "redis.host");
                redisPort = intProperty(redisProperties, "redis.port");
                // The password is taken verbatim: surrounding whitespace could be significant.
                redisPassword = redisProperties.getProperty("redis.password");
                redisDataBase = intProperty(redisProperties, "redis.database");
                redisMaxIdle = intProperty(redisProperties, "redis.maxIdle");
                redisMinIdle = intProperty(redisProperties, "redis.minIdle");
                redisMaxTotal = intProperty(redisProperties, "redis.maxTotal");
                redisTimeout = intProperty(redisProperties, "redis.timeout");
            }
        } catch (IOException e) {
            System.err.println("RedisConfig: failed to read " + RESOURCE + ": " + e);
            e.printStackTrace();
        }
    }

    /** Returns the trimmed value of {@code key}, or {@code null} when absent or blank. */
    private static String stringProperty(Properties props, String key) {
        String raw = props.getProperty(key);
        if (raw == null) {
            return null;
        }
        String trimmed = raw.trim();
        return trimmed.isEmpty() ? null : trimmed;
    }

    /**
     * Returns {@code key} parsed as an integer, or {@code null} when absent or blank.
     * The value is trimmed first because {@link Properties#load} keeps trailing whitespace.
     */
    private static Integer intProperty(Properties props, String key) {
        String value = stringProperty(props, key);
        if (value == null) {
            return null;
        }
        try {
            return Integer.valueOf(value);
        } catch (NumberFormatException e) {
            throw new IllegalStateException(
                    RESOURCE + ": '" + key + "' must be an integer but was '" + value + "'", e);
        }
    }
}
...@@ -79,4 +79,39 @@ public class TengXunCrawler { ...@@ -79,4 +79,39 @@ public class TengXunCrawler {
log.info("腾讯新闻采集结束"); log.info("腾讯新闻采集结束");
return list; return list;
} }
/**
 * Collects the Tencent "Jiaozhen" fact-check ranking.
 *
 * <p>The endpoint is requested up to 3 times. An attempt counts as failed when the
 * call throws, the body does not contain {@code "data"}, the {@code data} array is
 * missing, or the body cannot be parsed; failures are logged and the next attempt
 * is made. The first attempt that parses cleanly ends the loop.
 *
 * @param date collection time stamped on every returned entry
 * @return the parsed entries, or an empty list when every attempt failed
 */
public static List<HotSearchList> getTengXunVerificationList(Date date) {
    List<HotSearchList> list = new ArrayList<>();
    String url = "https://vp.fact.qq.com/hotlistData?num=20";
    Request request = RequestUtils.wrapGet(url);
    // Retry at most 3 times when nothing usable was collected.
    for (int t = 0; t < 3; t++) {
        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("腾讯较真榜采集时出现连接失败", e);
        }
        if (htmlBody == null || !htmlBody.contains("data")) {
            continue;
        }
        try {
            JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("data");
            if (jsonArray == null) {
                // "data" appeared somewhere in the body but not as a top-level array.
                continue;
            }
            for (int i = 0; i < jsonArray.size(); i++) {
                JSONObject jsonObject = jsonArray.getJSONObject(i);
                Integer rank = jsonObject.getIntValue("index");
                String name = jsonObject.getString("title");
                Integer count = jsonObject.getIntValue("score");
                String tengxunUrl = jsonObject.getString("link");
                String topicResult = jsonObject.getString("result");
                HotSearchList hotSearchList = new HotSearchList(tengxunUrl, name, count, false, rank,
                        HotSearchType.腾讯较真榜.name(), date, null, topicResult);
                list.add(hotSearchList);
            }
            return list;
        } catch (RuntimeException e) {
            // Malformed JSON or an unexpected element: drop partial results and retry.
            log.error("腾讯较真榜采集时出现解析错误", e);
            list.clear();
        }
    }
    return list;
}
} }
package com.zhiwei.searchhotcrawler.crawler; package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
...@@ -7,13 +8,17 @@ import com.zhiwei.crawler.proxy.ProxyHolder; ...@@ -7,13 +8,17 @@ import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response; import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.*;
...@@ -96,6 +101,86 @@ public class ToutiaoHotSearchCrawler { ...@@ -96,6 +101,86 @@ public class ToutiaoHotSearchCrawler {
} }
// /**
// * 采集今日头条数据
// * @param date
// * @return
// */
// public static List<HotSearchList> toutiaoHotSearchByPhone(Date date){
// List<HotSearchList> hotSearchLists = new ArrayList<>();
// //采集头条内容
// String url = "https://api5-normal-c-lq.snssdk.com/api/feed/hotboard_online/v1/?category=hotboard_online&count=50";
// Map<String,Object> headerMap = new HashMap<>();
// headerMap.put("upgrade-insecure-requests","1");
// headerMap.put("user-agent","Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36");
// String htmlBody = null;
// Request request = RequestUtils.wrapGet(url,headerMap);
// try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
// htmlBody = response.body().string();
// log.info(htmlBody);
// } catch (IOException e1) {
// log.error("解析今日头条实时热搜时出现连接失败",e1);
// }
// if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) {
// try {
// JSONArray words = JSONObject.parseObject(htmlBody).getJSONArray("data");
// for (int i = 0; i < words.size(); i++) {
// JSONObject jsonObject = JSON.parseObject(words.get(i).toString());
// int rank = i+1;
// String name =jsonObject.getJSONObject("content").getJSONObject("raw_data").getString("title");
// String link = jsonObject.getJSONObject("content").getJSONObject("raw_data").getString("schema");
// if(link.contains("keyword=")) {
// link = "https://so.toutiao.com/search/?"+link.substring(link.indexOf("keyword="), link.indexOf("&search_json="));
// }else{
// link = null;
// }
// HotSearchList hotSearch = new HotSearchList(link, name, null, true, rank, HotSearchType.今日头条热搜.name(), null,date);
// hotSearchLists.add(hotSearch);
// }
// } catch (Exception e) {
// log.error("解析今日头条实时热搜时出现解析错误,数据不是json结构", e);
// }
// } else {
// log.info("解析今日头条实时热搜时出现解析错误,页面结构有问题");
// }
// return hotSearchLists;
// }
/**
 * Looks up the read count shown on a Toutiao hot-search detail page and stores it
 * in the entry's comment-count field.
 *
 * <p>Entries without a URL are returned untouched. Otherwise the page is requested
 * up to 6 times, pausing one second after every attempt that did not yield a count.
 *
 * @param hotSearchList the entry to enrich
 * @return the same entry, with the comment count set when a read count was found
 */
public static HotSearchList toutiaoReadCount(HotSearchList hotSearchList){
    final String detailUrl = hotSearchList.getUrl();
    if (detailUrl == null) {
        return hotSearchList;
    }

    final Request detailRequest = RequestUtils.wrapGet(detailUrl);
    String pageHtml = null;
    int attempt = 0;
    while (attempt <= 5) {
        try (Response detailResponse = httpBoot.syncCall(detailRequest, ProxyHolder.NAT_HEAVY_PROXY)) {
            pageHtml = detailResponse.body().string();
        } catch (IOException connectFailure) {
            log.error("解析今日头条热搜详情页面出现连接失败", connectFailure);
        }

        if (StringUtils.isNotBlank(pageHtml)) {
            Elements matches = Jsoup.parse(pageHtml)
                    .select(".result-content .cs-view .cs-topone-tail .cs-view .margin-bottom-m .margin-left-m");
            if (matches != null && !matches.isEmpty()) {
                // Strip the "read" label so only the number text remains.
                String readText = matches.first().text().replaceAll("阅读", "");
                Integer parsedCount = TipsUtils.getHotCount(readText);
                log.info("{},阅读量:{}", hotSearchList.getName(), parsedCount);
                hotSearchList.setCommentCount(parsedCount);
                return hotSearchList;
            }
        }

        ZhiWeiTools.sleep(1000L);
        attempt++;
    }
    return hotSearchList;
}
/** /**
* 热搜类型 * 热搜类型
* @param wordsType * @param wordsType
......
...@@ -5,12 +5,16 @@ import java.util.*; ...@@ -5,12 +5,16 @@ import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSON;
import com.zhiwei.searchhotcrawler.bean.HotSearchCache;
import com.zhiwei.searchhotcrawler.config.RedisConfig;
import com.zhiwei.searchhotcrawler.dao.RedisDao;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response; import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.bson.Document;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.slf4j.Logger; import org.slf4j.Logger;
...@@ -25,6 +29,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList; ...@@ -25,6 +29,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.mail.SendMailWeibo; import com.zhiwei.searchhotcrawler.mail.SendMailWeibo;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
import org.springframework.beans.factory.annotation.Autowired;
/** /**
* @ClassName: WeiboHotSearch * @ClassName: WeiboHotSearch
...@@ -107,6 +112,7 @@ public class WeiboHotSearchCrawler { ...@@ -107,6 +112,7 @@ public class WeiboHotSearchCrawler {
* @return void 返回类型 * @return void 返回类型
*/ */
public static List<HotSearchList> weiboHotSearchByPhone(Date date){ public static List<HotSearchList> weiboHotSearchByPhone(Date date){
RedisDao redisDao = new RedisDao();
String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583"; String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583";
Map<String,String> headerMap = new HashMap<>(); Map<String,String> headerMap = new HashMap<>();
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"); headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
...@@ -147,10 +153,12 @@ public class WeiboHotSearchCrawler { ...@@ -147,10 +153,12 @@ public class WeiboHotSearchCrawler {
if (StringUtils.isNotBlank(icon)) { if (StringUtils.isNotBlank(icon)) {
icon = icon.split("_")[1].split(".png")[0]; icon = icon.split("_")[1].split(".png")[0];
} }
String id = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top"; // String id = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
String id = cardInfo.getString("scheme");
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date); HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
result.add(hotSearch); result.add(hotSearch);
rank++; rank++;
redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS,name+"_微博热搜");
} }
} else { } else {
log.info("card 数据结构为:{}", card); log.info("card 数据结构为:{}", card);
...@@ -214,4 +222,104 @@ public class WeiboHotSearchCrawler { ...@@ -214,4 +222,104 @@ public class WeiboHotSearchCrawler {
return result; return result;
} }
/**
 * Enriches a Weibo hot-search document with its topic lead, read count and
 * discussion count, fetched from the m.weibo.cn container API.
 *
 * <p>The request URL is derived from the first query parameter of the document's
 * {@code url} field. The call is attempted up to 6 times; connection and parse
 * failures are logged and retried.
 *
 * <p>On failure the document is returned unchanged rather than {@code null}, so
 * callers that immediately query the result do not hit a NullPointerException.
 *
 * @param document the cached hot-search document; updated in place
 * @return the same document (possibly with {@code topicLead}, {@code readCount} and
 *         {@code discussCount} added), or {@code null} only when {@code document} is null
 */
public static Document weiboUpdate(Document document) {
    if (document == null) {
        return null;
    }
    log.info("更新微博热搜{}导语阅读量和讨论量",document.getString("name"));
    String url = weiboDetailApiUrl(document.getString("url"));
    if (url == null) {
        log.warn("微博热搜{}的url无法解析,跳过更新", document.getString("name"));
        return document;
    }
    Request request = RequestUtils.wrapGet(url);
    for (int count = 0; count <= 5; count++) {
        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博热搜详情页面时出现连接失败", e);
        }
        if (htmlBody == null || !htmlBody.contains("data")) {
            continue;
        }
        try {
            if (applyWeiboDetail(document, htmlBody)) {
                return document;
            }
        } catch (RuntimeException e) {
            // Malformed JSON or an unexpected node type: log and retry.
            log.error("解析微博热搜详情页面时出现解析错误", e);
        }
    }
    return document;
}

/**
 * Builds the container-API URL from the first query parameter of {@code sourceUrl}
 * (the text between {@code ?} and the next {@code &}, or the end of the string).
 * Returns {@code null} when the URL is null or has no non-empty query part.
 */
private static String weiboDetailApiUrl(String sourceUrl) {
    if (sourceUrl == null) {
        return null;
    }
    int queryStart = sourceUrl.indexOf('?');
    if (queryStart < 0) {
        return null;
    }
    int paramEnd = sourceUrl.indexOf('&', queryStart + 1);
    String firstParam = paramEnd < 0
            ? sourceUrl.substring(queryStart + 1)
            : sourceUrl.substring(queryStart + 1, paramEnd);
    if (firstParam.isEmpty()) {
        return null;
    }
    return "https://m.weibo.cn/api/container/getIndex?" + firstParam;
}

/**
 * Copies lead/read/discussion values from the {@code data.cardlistInfo} node of the
 * response into {@code document}.
 *
 * @return {@code true} when {@code data.cardlistInfo} was present, {@code false} otherwise
 */
private static boolean applyWeiboDetail(Document document, String htmlBody) {
    JSONObject root = JSONObject.parseObject(htmlBody);
    JSONObject data = root == null ? null : root.getJSONObject("data");
    JSONObject json = data == null ? null : data.getJSONObject("cardlistInfo");
    if (json == null) {
        return false;
    }
    String topicLead = json.getString("desc");
    if (topicLead != null && !"".equals(topicLead)) {
        document.put("topicLead", topicLead);
    }
    com.alibaba.fastjson.JSONArray headCards = json.getJSONArray("cardlist_head_cards");
    if (headCards != null && !headCards.isEmpty()) {
        JSONObject readJson = headCards.getJSONObject(0);
        JSONObject headData = readJson == null ? null : readJson.getJSONObject("head_data");
        String midText = headData == null ? null : headData.getString("midtext");
        if (midText != null) {
            // midtext is split around the "阅读" / "讨论" / "详情" labels.
            String read = midText.replaceAll("阅读", "").replaceAll("讨论.*", "").trim();
            String discussCount = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim();
            document.put("readCount", TipsUtils.getHotCount(read));
            document.put("discussCount", TipsUtils.getHotCount(discussCount));
        }
    }
    return true;
}
// /**
// * 微博更新历史数据
// * @param hotSearch
// * @return
// */
// public static Document updateWeiBoTopic(Document hotSearch){
// String hotUrl = hotSearch.getString("url");
// String htmlBody = null;
// if(hotUrl != null && !"http://s.weibo.comjavascript:void(0);".equals(hotUrl)) {
// Request request = RequestUtils.wrapGet(hotUrl);
// try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
// htmlBody = response.body().string();
// org.jsoup.nodes.Document document = Jsoup.parse(htmlBody);
// Element element = document.select(".m-wrap .m-con-r .card-wrap .card-interest .card-content .item-topic .info h2 a").first();
// if (element != null) {
// String topicLeadUrl = element.attr("href");
// return analyHtml(hotSearch,topicLeadUrl);
// } else{
// return analyHtml(hotSearch,hotUrl);
// }
// } catch (Exception e) {
// log.error("解析微博话题榜时出现解析错误,页面结构有问题", e);
// }
// }
// return null;
// }
//
// public static Document analyHtml(Document hotSearch,String topicUrl){
// String htmlBody = null;
// Request request = RequestUtils.wrapGet(topicUrl);
// try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
// htmlBody = response.body().string();
// org.jsoup.nodes.Document document = Jsoup.parse(htmlBody);
// if(document != null) {
// Element body = document.body();
// Element read = body.select(".card-topic-a .info .total span").first();
// if(read != null) {
// String readCount = read.text().replaceAll("阅读", "");
// hotSearch.put("readCount",TipsUtils.getHotCount(readCount));
// }
// Element dis = body.select(".card-topic-a .info .total span").last();
// if(dis != null) {
// String disCount = dis.text().replaceAll("讨论", "");
// hotSearch.put("discussCount",TipsUtils.getHotCount(disCount));
// }
// Element topicLead = body.select(".m-wrap .card-wrap .card-topic-lead p").first();
// if(topicLead != null) {
// String topicLeadString = topicLead.html().replaceAll("<strong>导语:</strong>", "");
// topicLeadString = topicLeadString.length() > 150 ? topicLeadString.substring(0,150) : topicLeadString;
// hotSearch.put("topicLead",topicLeadString);
// }
// return hotSearch;
// }
// } catch (Exception e) {
// log.error("解析微博话题导语时出现解析错误,页面结构有问题", e);
// }
// return null;
// }
} }
...@@ -121,7 +121,7 @@ public class ZhihuHotSearchCrawler { ...@@ -121,7 +121,7 @@ public class ZhihuHotSearchCrawler {
hotCount = (int) (Double.parseDouble(hotText) * 10000); hotCount = (int) (Double.parseDouble(hotText) * 10000);
} else if (hotText.contains("亿")) { } else if (hotText.contains("亿")) {
hotText = hotText.replaceAll("亿.*", "").trim(); hotText = hotText.replaceAll("亿.*", "").trim();
hotCount = (int) (Double.parseDouble(hotText) * 10000000); hotCount = (int) (Double.parseDouble(hotText) * 100000000);
} else { } else {
hotCount = Integer.getInteger(hotText); hotCount = Integer.getInteger(hotText);
} }
......
package com.zhiwei.searchhotcrawler.dao; package com.zhiwei.searchhotcrawler.dao;
import com.mongodb.BasicDBObject;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.config.DBConfig; import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate; import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import org.bson.Document; import org.bson.Document;
...@@ -44,6 +48,12 @@ public class HotSearchCacheDAO { ...@@ -44,6 +48,12 @@ public class HotSearchCacheDAO {
document.put("topic_lead", hotSearch.getTopicLead()); document.put("topic_lead", hotSearch.getTopicLead());
document.put("comment_count", hotSearch.getCommentCount()); document.put("comment_count", hotSearch.getCommentCount());
} }
if("今日头条热搜".equals(hotSearch.getType())){
document.put("comment_count", hotSearch.getCommentCount());
}
if("腾讯较真榜".equals(hotSearch.getType())){
document.put("topic_result",hotSearch.getTopicResult());
}
addAndUpdateData(document); addAndUpdateData(document);
dataes.add(document); dataes.add(document);
}); });
...@@ -69,8 +79,10 @@ public class HotSearchCacheDAO { ...@@ -69,8 +79,10 @@ public class HotSearchCacheDAO {
String topicLead = document.getString("topic_lead")!=null?document.getString("topic_lead"):null; String topicLead = document.getString("topic_lead")!=null?document.getString("topic_lead"):null;
boolean hot = document.getBoolean("hot")!=null?document.getBoolean("hot"):true; boolean hot = document.getBoolean("hot")!=null?document.getBoolean("hot"):true;
String url = document.getString("url")!=null?document.getString("url"):null; String url = document.getString("url")!=null?document.getString("url"):null;
String topicResult = document.getString("topic_result")!=null?document.getString("topic_result"):null;
String id = name + "_" + type; String id = name + "_" + type;
boolean recommend = false; boolean recommend = false;
Integer readCount = document.getInteger("comment_count");
if("微博热搜".equals(type)){ if("微博热搜".equals(type)){
String icon = document.getString("icon"); String icon = document.getString("icon");
if("recom".equals(icon) || "jian".equals(icon)){ if("recom".equals(icon) || "jian".equals(icon)){
...@@ -87,7 +99,7 @@ public class HotSearchCacheDAO { ...@@ -87,7 +99,7 @@ public class HotSearchCacheDAO {
Integer highestCount = nowDoc.getInteger("highestCount"); Integer highestCount = nowDoc.getInteger("highestCount");
Integer preRank = nowDoc.getInteger("lastRank"); Integer preRank = nowDoc.getInteger("lastRank");
Integer preCount = nowDoc.getInteger("lastCount"); Integer preCount = nowDoc.getInteger("lastCount");
String lastUrl = nowDoc.getString("url");
//判断最大热度值 //判断最大热度值
if (Objects.nonNull(lastCount) && Objects.nonNull(highestCount) && lastCount > highestCount) { if (Objects.nonNull(lastCount) && Objects.nonNull(highestCount) && lastCount > highestCount) {
highestCount = lastCount; highestCount = lastCount;
...@@ -104,6 +116,9 @@ public class HotSearchCacheDAO { ...@@ -104,6 +116,9 @@ public class HotSearchCacheDAO {
int durationNow = getDuration(type, duration); int durationNow = getDuration(type, duration);
// endTime = getEndTime(type, new Date()); // endTime = getEndTime(type, new Date());
//更新相应信息 //更新相应信息
if(url != null && !url.equals(lastUrl)){
nowDoc.put("url",url);
}
nowDoc.put("endTime", endTime); nowDoc.put("endTime", endTime);
nowDoc.put("lastRank", lastRank); nowDoc.put("lastRank", lastRank);
nowDoc.put("lastCount", lastCount); nowDoc.put("lastCount", lastCount);
...@@ -113,6 +128,12 @@ public class HotSearchCacheDAO { ...@@ -113,6 +128,12 @@ public class HotSearchCacheDAO {
nowDoc.put("preCount", preCount); nowDoc.put("preCount", preCount);
nowDoc.put("duration", durationNow); nowDoc.put("duration", durationNow);
nowDoc.put("recommend",recommend); nowDoc.put("recommend",recommend);
if(readCount != null){
nowDoc.put("readCount",readCount);
}
if(topicResult != null){
nowDoc.put("topicResult",topicResult);
}
collection.replaceOne(query, nowDoc); collection.replaceOne(query, nowDoc);
} else { } else {
nowDoc = new Document(); nowDoc = new Document();
...@@ -133,12 +154,25 @@ public class HotSearchCacheDAO { ...@@ -133,12 +154,25 @@ public class HotSearchCacheDAO {
nowDoc.put("preRank", null); nowDoc.put("preRank", null);
nowDoc.put("preCount", null); nowDoc.put("preCount", null);
nowDoc.put("recommend",recommend); nowDoc.put("recommend",recommend);
if(readCount != null){
nowDoc.put("readCount",readCount);
}
if(topicResult != null){
nowDoc.put("topicResult",topicResult);
}
if("微博热搜".equals(type)){
nowDoc = WeiboHotSearchCrawler.weiboUpdate(nowDoc);
if(nowDoc.containsKey("topicLead")){
nowDoc.put("topicLead", nowDoc.getString("topicLead"));
}
if(nowDoc.containsKey("readCount") && nowDoc.containsKey("discussCount")) {
nowDoc.put("readCount", nowDoc.getInteger("readCount"));
nowDoc.put("discussCount", nowDoc.getInteger("discussCount"));
}
}
collection.insertOne(nowDoc); collection.insertOne(nowDoc);
} }
} }
}catch (Exception e){ }catch (Exception e){
log.info("数据存储时出错:{}", e); log.info("数据存储时出错:{}", e);
} }
...@@ -158,6 +192,19 @@ public class HotSearchCacheDAO { ...@@ -158,6 +192,19 @@ public class HotSearchCacheDAO {
} }
} }
// public List<Document> getHotSearchList(){
// List<Document> documentList = new ArrayList<>();
// Document query = new Document("type","微博热搜");
// query.put("endTime",new BasicDBObject("$gte", new Date(1604851200000L)).append("$lt",new Date(1604973600000L)));
// query.put("readCount",new BasicDBObject("$exists",false));
// FindIterable<Document> findIterable = collection.find(query);
// MongoCursor<Document> mongoCursor = findIterable.iterator();
// while(mongoCursor.hasNext()){
// documentList.add(mongoCursor.next());
// }
// return documentList;
// }
/** /**
* 计算热搜时长 * 计算热搜时长
...@@ -232,6 +279,34 @@ public class HotSearchCacheDAO { ...@@ -232,6 +279,34 @@ public class HotSearchCacheDAO {
return new Date(timeLong); return new Date(timeLong);
} }
/**
 * Looks up a cached hot-search document by its primary key.
 *
 * @param id value of the {@code _id} field
 * @return the first matching document, as returned by the collection query
 */
public Document getHotSearchById(String id){
    final Document byPrimaryKey = new Document("_id", id);
    Object firstMatch = collection.find(byPrimaryKey).first();
    return (Document) firstMatch;
}
/**
 * Copies Weibo detail values from {@code document} onto the stored document with
 * the given id and writes the stored document back.
 *
 * <p>{@code topicLead} is copied when present and non-null; {@code readCount} and
 * {@code discussCount} are copied only when both keys are present. Nothing happens
 * when no stored document has that id.
 *
 * @param document source of the new values
 * @param id       value of the {@code _id} field of the stored document
 */
public void updateWeibo(Document document,String id){
    final Document byPrimaryKey = new Document("_id", id);
    final Document stored = (Document) collection.find(byPrimaryKey).first();
    if (stored == null) {
        return;
    }

    String lead = document.containsKey("topicLead") ? document.getString("topicLead") : null;
    if (lead != null) {
        stored.put("topicLead", lead);
    }

    boolean hasBothCounts = document.containsKey("readCount") && document.containsKey("discussCount");
    if (hasBothCounts) {
        stored.put("readCount", document.getInteger("readCount"));
        stored.put("discussCount", document.getInteger("discussCount"));
    }

    collection.replaceOne(byPrimaryKey, stored);
}
} }
package com.zhiwei.searchhotcrawler.dao;
import com.zhiwei.searchhotcrawler.config.RedisConfig;
import lombok.extern.log4j.Log4j2;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import java.util.Set;
/**
* redis基础操作类
*/
@Log4j2
public class RedisDao {
private Jedis jedis;
public RedisDao(){
JedisPoolConfig poolConfig = new JedisPoolConfig();
poolConfig.setMaxIdle(RedisConfig.redisMaxIdle);
poolConfig.setMaxTotal(RedisConfig.redisMaxTotal);
poolConfig.setMinIdle(RedisConfig.redisMinIdle);
JedisPool jedisPool = new JedisPool(poolConfig,RedisConfig.redisHost,RedisConfig.redisPort,RedisConfig.redisTimeout);
jedis = jedisPool.getResource();
jedis.select(RedisConfig.redisDataBase);
}
/**
* 存储redis数据
* @param key
* @param value
*/
public void setRedisData(String key, String value){
jedis.set(key,value);
}
/**
* 读取redis缓存数据
* @param key
* @return
*/
public String getRedisData(String key){
return jedis.get(key);
}
/**
* redis存值set集合,
* @param key
* @param value
*/
public void addDataToSet(String key, String value){
jedis.sadd(key,value);
}
/**
* redis读取set集合的值
* @param key
* @return
*/
public Set<String> getRedisSetData(String key){
return jedis.smembers(key);
}
/**
* 移除redis
* @param key
*/
public void removeRedis(String key){
jedis.del(key);
}
}
package com.zhiwei.searchhotcrawler.timer;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.ToutiaoHotSearchCrawler;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
@Log4j2
public class TouTiaoExecutor extends Thread {
private HotSearchList hotSearchList;
private static List<HotSearchList> resultList;
public TouTiaoExecutor(HotSearchList hotSearchList){
this.hotSearchList = hotSearchList;
}
@Override
public void run() {
try {
hotSearchList = ToutiaoHotSearchCrawler.toutiaoReadCount(hotSearchList);
resultList.add(hotSearchList);
}catch (Exception e){
e.printStackTrace();
}
}
/**
* 今日头条阅读量统计
* @param list
* @return
*/
public static List<HotSearchList> countTouTiaoReadCount(List<HotSearchList> list){
resultList= new ArrayList<>();
// ExecutorService service = Executors.newFixedThreadPool(list.size());
for(int i=0; i<list.size(); i++){
TipsUtils.service.execute(new TouTiaoExecutor(list.get(i)));
}
// TipsUtils.service.shutdown();
try {
if(!TipsUtils.service.awaitTermination(1, TimeUnit.MINUTES)){
log.info("查询今日头条阅读量超时");
}
} catch (InterruptedException e) {
log.info(e.fillInStackTrace());
}
return resultList;
}
}
...@@ -3,11 +3,15 @@ package com.zhiwei.searchhotcrawler.timer.quartz; ...@@ -3,11 +3,15 @@ package com.zhiwei.searchhotcrawler.timer.quartz;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic; import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
import com.zhiwei.searchhotcrawler.config.RedisConfig;
import com.zhiwei.searchhotcrawler.crawler.*; import com.zhiwei.searchhotcrawler.crawler.*;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.RedisDao;
import com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO; import com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO;
import com.zhiwei.searchhotcrawler.timer.TouTiaoExecutor;
import com.zhiwei.searchhotcrawler.util.DateUtils; import com.zhiwei.searchhotcrawler.util.DateUtils;
import com.zhiwei.searchhotcrawler.util.TipsUtils; import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.bson.Document; import org.bson.Document;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
...@@ -17,9 +21,7 @@ import org.springframework.scheduling.annotation.EnableScheduling; ...@@ -17,9 +21,7 @@ import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled; import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import java.util.ArrayList; import java.util.*;
import java.util.Date;
import java.util.List;
@Component @Component
@EnableScheduling @EnableScheduling
...@@ -49,6 +51,32 @@ public class GatherTimer { ...@@ -49,6 +51,32 @@ public class GatherTimer {
} }
/** /**
* 微博热搜导语,阅读量,讨论量更新
*/
@Async(value = "myScheduler")
@Scheduled(cron = "45 0/10 * * * ? ")
public void updateWeiBo(){
    // Refreshes lead / read count / discussion count for every Weibo hot-search id
    // queued in Redis. The queue is read and then deleted up front, so a failure on
    // one id must not abort the loop, or the remaining ids would be lost.
    // NOTE(review): ids added between the read and the delete are dropped; confirm
    // whether that window matters.
    logger.info("微博热搜导语更新...");
    RedisDao redisDao = new RedisDao();
    HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
    Set<String> hotSearchIdSet = redisDao.getRedisSetData(RedisConfig.WEIBO_HOTSEARCHIDS);
    redisDao.removeRedis(RedisConfig.WEIBO_HOTSEARCHIDS);
    if (hotSearchIdSet != null) {
        for (String id : hotSearchIdSet) {
            Document document = null;
            try {
                document = hotSearchCacheDAO.getHotSearchById(id);
                if (document != null) {
                    Document updated = WeiboHotSearchCrawler.weiboUpdate(document);
                    // weiboUpdate may yield null when the detail page could not be fetched.
                    if (updated != null
                            && (updated.containsKey("topicLead")
                                || updated.containsKey("readCount")
                                || updated.containsKey("discussCount"))) {
                        hotSearchCacheDAO.updateWeibo(updated, id);
                    }
                }
            } catch (Exception e) {
                logger.error("微博热搜导语更新失败, id: {}", id, e);
            }
            if (document != null) {
                // Throttle only after an id that actually triggered a crawl.
                ZhiWeiTools.sleep(3000L);
            }
        }
    }
    logger.info("微博热搜导语更新结束...");
}
/**
* 今日头条热搜的采集 * 今日头条热搜的采集
*/ */
@Async(value = "myScheduler") @Async(value = "myScheduler")
...@@ -57,8 +85,10 @@ public class GatherTimer { ...@@ -57,8 +85,10 @@ public class GatherTimer {
logger.info("今日头条热搜开始采集..."); logger.info("今日头条热搜开始采集...");
Date date = DateUtils.getMillSecondTime(new Date()); Date date = DateUtils.getMillSecondTime(new Date());
List<HotSearchList> toutiaoList = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone(date); List<HotSearchList> toutiaoList = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone(date);
logger.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), Integer.valueOf(toutiaoList != null ? toutiaoList.size() : 0)); List<HotSearchList> toutiaoResult = new ArrayList<>();
TipsUtils.addHotList(HotSearchType.今日头条热搜.name(),toutiaoList); toutiaoResult = TouTiaoExecutor.countTouTiaoReadCount(toutiaoList);
logger.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), toutiaoResult != null ? toutiaoResult.size() : 0);
TipsUtils.addHotList(HotSearchType.今日头条热搜.name(),toutiaoResult);
logger.info("今日头条热搜采集结束..."); logger.info("今日头条热搜采集结束...");
} }
...@@ -228,8 +258,8 @@ public class GatherTimer { ...@@ -228,8 +258,8 @@ public class GatherTimer {
/** /**
* 凤凰新闻热搜的采集 * 凤凰新闻热搜的采集
*/ */
@Async(value = "myScheduler") // @Async(value = "myScheduler")
@Scheduled(cron = "10 * * * * ? ") // @Scheduled(cron = "10 * * * * ? ")
public void crawlerFengHuangHotSearch(){ public void crawlerFengHuangHotSearch(){
Date date = DateUtils.getMillSecondTime(new Date()); Date date = DateUtils.getMillSecondTime(new Date());
List<HotSearchList> list = FengHuangSearchCrawler.getFengHuangHotSearch(date); List<HotSearchList> list = FengHuangSearchCrawler.getFengHuangHotSearch(date);
...@@ -237,6 +267,20 @@ public class GatherTimer { ...@@ -237,6 +267,20 @@ public class GatherTimer {
} }
/** /**
* 腾讯较真辟谣榜采集
*/
// Scheduled job: collects the Tencent fact-check (rumor-refuting) ranking
// and hands it to TipsUtils.addHotList. Fires at second 10 of every minute.
@Async(value = "myScheduler")
@Scheduled(cron = "10 * * * * ? ")
public void crawlerTengXunVerificationHotSearch(){
    logger.info("{},腾讯较真辟谣榜开始采集", new Date());
    Date date = DateUtils.getMillSecondTime(new Date());
    List<HotSearchList> list = TengXunCrawler.getTengXunVerificationList(date);
    // Null-safe size, matching the Toutiao job's logging: a failed crawl must not
    // throw here and skip addHotList / the end-of-run log line.
    logger.info("腾讯较真辟谣榜本轮采集数量:{}", list != null ? list.size() : 0);
    // The list is passed through unchanged (possibly null), as the Toutiao job does
    // with its own result -- presumably addHotList handles that case; confirm.
    TipsUtils.addHotList(HotSearchType.腾讯较真榜.name(), list);
    logger.info("{},腾讯较真辟谣榜采集结束", new Date());
}
/**
* 搜狐话题的采集 * 搜狐话题的采集
*/ */
@Async(value = "myScheduler") @Async(value = "myScheduler")
...@@ -261,6 +305,9 @@ public class GatherTimer { ...@@ -261,6 +305,9 @@ public class GatherTimer {
logger.info("知乎热搜话题采集结束..."); logger.info("知乎热搜话题采集结束...");
} }
/**
* 微博预热榜的采集
*/
@Async(value = "myScheduler") @Async(value = "myScheduler")
@Scheduled(cron = "20 * * * * ? ") @Scheduled(cron = "20 * * * * ? ")
public void crawlerWeiBoPreheat(){ public void crawlerWeiBoPreheat(){
...@@ -330,6 +377,25 @@ public class GatherTimer { ...@@ -330,6 +377,25 @@ public class GatherTimer {
logger.info("微博话题采集结束........"); logger.info("微博话题采集结束........");
} }
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 05 09 * * ? ")
// public void updateWeiboHistory(){
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// List<Document> documentList = hotSearchCacheDAO.getHotSearchList();
// int i=0;
// for (Document document : documentList){
// document = WeiboHotSearchCrawler.updateWeiBoTopic(document);
// if(document != null){
// hotSearchCacheDAO.updateWeibo(document,document.getString("_id"));
// ZhiWeiTools.sleep(500L);
// }
// i++;
// logger.info("更新进度:{}",i*100/documentList.size());
// }
// logger.info("更新结束");
// }
/** /**
* 知乎子类采集函数 * 知乎子类采集函数
* @param type * @param type
......
...@@ -9,6 +9,8 @@ import org.slf4j.Logger; ...@@ -9,6 +9,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.util.*; import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/** /**
* 预警发送 * 预警发送
...@@ -23,6 +25,8 @@ public class TipsUtils { ...@@ -23,6 +25,8 @@ public class TipsUtils {
private static Map<String,Date> typeTips = new HashMap<>(); private static Map<String,Date> typeTips = new HashMap<>();
public static ExecutorService service = Executors.newFixedThreadPool(30);
//未采集到数据发送预警信息 //未采集到数据发送预警信息
public static void sendTips(String type, Date time){ public static void sendTips(String type, Date time){
HotSearchListDAO hotSearchListDAO = new HotSearchListDAO(); HotSearchListDAO hotSearchListDAO = new HotSearchListDAO();
......
#redis.host=127.0.0.1
#redis.port=6379
#redis.password=
#redis
redis.host = 192.168.0.39
redis.port = 6379
redis.database = 1
#maxIdle
redis.maxIdle=10
#minIdle
redis.minIdle=5
#maxTotal
redis.maxTotal=10
#timeout
redis.timeout=5000
redis.testOnBorrow=false
redis.testOnReturn=false
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment