Commit f8861322 by chenweitao

Merge branch 'mlbWork' into 'master'

Mlb work

See merge request !50
parents 81f2f704 987430de
......@@ -33,6 +33,7 @@
17.网易新闻跟帖热议
18.搜狗微信热搜
19.微博话题
20.微博预热榜
#### Mongo内网
192.168.0.101,192.168.0.106,192.168.0.108
......@@ -42,6 +43,12 @@
30000
#### Mongo数据表名
hot_search_list
#### zookeeper
zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182
#### redis
redis.host = 192.168.0.39
redis.port = 6379
redis.database = 1
......
......@@ -112,6 +112,13 @@
<artifactId>spring-tx</artifactId>
<version>${spring.version}</version>
</dependency>
<!-- redis写 -->
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.8.1</version>
</dependency>
</dependencies>
<build>
......
......@@ -80,7 +80,20 @@ public class HotSearchCache {
*/
private Boolean recommend;
/**
* 阅读量
*/
private Integer readCount;
/**
* 讨论量
*/
private Integer discussCount;
/**
* 话题真假(腾讯较真榜使用)
*/
private String topicResult;
public HotSearchCache(String url, String name, String topicLead, Integer highestCount, Integer lastCount, Boolean hot,
Date startTime, Date endTime, Integer highestRank, Integer lastRank, String type, Integer duration){
......@@ -107,4 +120,28 @@ public class HotSearchCache {
/**
 * Sets the recommend flag of this cache entry.
 *
 * @param recommend new value of the {@code recommend} field
 */
public void setRecommend(Boolean recommend) {
this.recommend = recommend;
}
/**
 * Returns the read count of this entry.
 *
 * @return the stored read count; may be {@code null} when it was never set
 */
public Integer getReadCount() {
return readCount;
}
/**
 * Sets the read count of this entry.
 *
 * @param readCount new read count
 */
public void setReadCount(Integer readCount) {
this.readCount = readCount;
}
/**
 * Returns the discussion count of this entry.
 *
 * @return the stored discussion count; may be {@code null} when it was never set
 */
public Integer getDiscussCount() {
return discussCount;
}
/**
 * Sets the discussion count of this entry.
 *
 * @param discussCount new discussion count
 */
public void setDiscussCount(Integer discussCount) {
this.discussCount = discussCount;
}
/**
 * Returns the value of the {@code topicLead} field.
 * NOTE(review): presumably the topic's lead/intro text - the field declaration is outside this view; confirm.
 *
 * @return the stored topic lead
 */
public String getTopicLead() {
return topicLead;
}
/**
 * Sets the value of the {@code topicLead} field.
 *
 * @param topicLead new topic lead
 */
public void setTopicLead(String topicLead) {
this.topicLead = topicLead;
}
}
......@@ -75,10 +75,15 @@ public class HotSearchList implements Serializable{
private String icon;
/**
* 话题讨论量
* 话题讨论量或阅读量
*/
private Integer commentCount;
/**
* 话题真假结果(腾讯较真榜使用)
*/
private String topicResult;
public HotSearchList(){}
public HotSearchList(String url, String name, Integer count,Boolean hot,Integer rank,String type,String icon,Date date){
......@@ -122,4 +127,17 @@ public class HotSearchList implements Serializable{
this.topicLead = topicLead;
}
/**
 * Creates a ranking entry that also carries a fact-check verdict (used for the Tencent fact-check board).
 *
 * <p>Parameter order differs from the 8-argument constructor: here {@code date} comes before
 * {@code icon}, there {@code icon} comes before {@code date}.
 *
 * @param url         link of the entry
 * @param name        title of the entry; also the first part of the generated id
 * @param count       value stored in the {@code count} field (the {@code commentCount} field is not touched here)
 * @param hot         hot flag
 * @param rank        position on the board
 * @param type        board type name; also the last part of the generated id
 * @param date        crawl time; stored as {@code time} and formatted as {@code yyyy-MM-dd} into {@code day}
 * @param icon        icon marker
 * @param topicResult fact-check verdict of the topic
 */
public HotSearchList(String url, String name, Integer count, Boolean hot,Integer rank, String type, Date date, String icon, String topicResult){
// NOTE(review): the id embeds the current wall-clock time, not the supplied `date` - confirm this is intended.
this.id = name + "_" + new Date().getTime() + "_" + type;
this.url = url;
this.name = name;
this.hot = hot;
this.count = count;
this.rank = rank;
this.time = date;
// NOTE(review): behaviour for a null `date` depends on TimeParse.dateFormartString, which is not visible here.
this.day = TimeParse.dateFormartString(date, "yyyy-MM-dd");
this.type = type;
this.icon = icon;
this.topicResult = topicResult;
}
}
......@@ -17,5 +17,6 @@ public enum HotSearchType {
凤凰新闻热搜,
网易热榜,
网易跟帖热议,
微博预热榜
微博预热榜,
腾讯较真榜
}
package com.zhiwei.searchhotcrawler.config;
import java.io.IOException;
import java.util.Properties;
/**
 * Redis connection settings, loaded once from {@code redis.properties} on the classpath.
 *
 * <p>Values are exposed as public static fields. A field stays {@code null} when the resource
 * is missing or unreadable, or when its key is absent, blank or (for numeric settings) not a
 * valid integer. Loading never aborts class initialisation.
 */
public class RedisConfig {
    public static String redisHost;
    public static Integer redisPort;
    public static String redisPassword;
    public static Integer redisDataBase;
    public static Integer redisMaxIdle;
    public static Integer redisMinIdle;
    public static Integer redisMaxTotal;
    public static Integer redisTimeout;

    /** Redis set key under which the collected Weibo hot-search ids are stored. */
    public static String WEIBO_HOTSEARCHIDS = "weibo_hotsearchIds";

    static {
        Properties redisProperties = new Properties();
        // try-with-resources closes the stream; the explicit null check avoids the
        // NullPointerException that Properties.load(null) raises when the file is missing.
        try (java.io.InputStream in =
                 RedisConfig.class.getClassLoader().getResourceAsStream("redis.properties")) {
            if (in == null) {
                System.err.println("redis.properties not found on the classpath; Redis settings are unset");
            } else {
                redisProperties.load(in);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        redisHost = redisProperties.getProperty("redis.host");
        redisPort = toInteger(redisProperties.getProperty("redis.port"));
        redisPassword = redisProperties.getProperty("redis.password");
        redisDataBase = toInteger(redisProperties.getProperty("redis.database"));
        redisMaxIdle = toInteger(redisProperties.getProperty("redis.maxIdle"));
        redisMinIdle = toInteger(redisProperties.getProperty("redis.minIdle"));
        redisMaxTotal = toInteger(redisProperties.getProperty("redis.maxTotal"));
        redisTimeout = toInteger(redisProperties.getProperty("redis.timeout"));
    }

    /**
     * Parses a property value as an integer, tolerating surrounding whitespace.
     *
     * @param raw raw property value, may be {@code null}
     * @return the parsed value, or {@code null} when {@code raw} is null, blank or not an integer
     */
    static Integer toInteger(String raw) {
        if (raw == null) {
            return null;
        }
        String trimmed = raw.trim();
        if (trimmed.isEmpty()) {
            return null;
        }
        try {
            return Integer.valueOf(trimmed);
        } catch (NumberFormatException e) {
            System.err.println("Invalid integer in redis.properties: " + raw);
            return null;
        }
    }
}
......@@ -79,4 +79,39 @@ public class TengXunCrawler {
log.info("腾讯新闻采集结束");
return list;
}
/**
 * Crawls the Tencent fact-check (rumour-refutation) board.
 *
 * <p>The request is attempted up to 3 times. The first response whose {@code data} array can
 * be parsed is converted to entries and returned; a failed request or an unparseable body is
 * logged and counts as a used attempt.
 *
 * @param date crawl time stamped on every returned entry
 * @return the parsed entries, or an empty list when no attempt produced usable data
 */
public static List<HotSearchList> getTengXunVerificationList(Date date) {
    List<HotSearchList> list = new ArrayList<>();
    String url = "https://vp.fact.qq.com/hotlistData?num=20";
    Request request = RequestUtils.wrapGet(url);
    for (int t = 0; t < 3; t++) {
        // Reset per attempt so a failed request never re-evaluates the previous body.
        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("腾讯较真辟谣榜请求失败", e);
        }
        if (htmlBody == null || !htmlBody.contains("data")) {
            continue;
        }
        try {
            JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("data");
            if (jsonArray == null) {
                continue;
            }
            // Build into a scratch list so a failure half-way leaves nothing partial in the result.
            List<HotSearchList> parsed = new ArrayList<>(jsonArray.size());
            for (int i = 0; i < jsonArray.size(); i++) {
                JSONObject jsonObject = jsonArray.getJSONObject(i);
                Integer rank = jsonObject.getIntValue("index");
                String name = jsonObject.getString("title");
                Integer count = jsonObject.getIntValue("score");
                String tengxunUrl = jsonObject.getString("link");
                String topicResult = jsonObject.getString("result");
                parsed.add(new HotSearchList(tengxunUrl, name, count, false, rank,
                        HotSearchType.腾讯较真榜.name(), date, null, topicResult));
            }
            list.addAll(parsed);
            return list;
        } catch (RuntimeException e) {
            // A body that merely contains the word "data" need not be the expected JSON.
            log.error("腾讯较真辟谣榜数据解析失败", e);
        }
    }
    return list;
}
}
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
......@@ -7,13 +8,17 @@ import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.*;
......@@ -96,6 +101,86 @@ public class ToutiaoHotSearchCrawler {
}
// /**
// * 采集今日头条数据
// * @param date
// * @return
// */
// public static List<HotSearchList> toutiaoHotSearchByPhone(Date date){
// List<HotSearchList> hotSearchLists = new ArrayList<>();
// //采集头条内容
// String url = "https://api5-normal-c-lq.snssdk.com/api/feed/hotboard_online/v1/?category=hotboard_online&count=50";
// Map<String,Object> headerMap = new HashMap<>();
// headerMap.put("upgrade-insecure-requests","1");
// headerMap.put("user-agent","Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36");
// String htmlBody = null;
// Request request = RequestUtils.wrapGet(url,headerMap);
// try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
// htmlBody = response.body().string();
// log.info(htmlBody);
// } catch (IOException e1) {
// log.error("解析今日头条实时热搜时出现连接失败",e1);
// }
// if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) {
// try {
// JSONArray words = JSONObject.parseObject(htmlBody).getJSONArray("data");
// for (int i = 0; i < words.size(); i++) {
// JSONObject jsonObject = JSON.parseObject(words.get(i).toString());
// int rank = i+1;
// String name =jsonObject.getJSONObject("content").getJSONObject("raw_data").getString("title");
// String link = jsonObject.getJSONObject("content").getJSONObject("raw_data").getString("schema");
// if(link.contains("keyword=")) {
// link = "https://so.toutiao.com/search/?"+link.substring(link.indexOf("keyword="), link.indexOf("&search_json="));
// }else{
// link = null;
// }
// HotSearchList hotSearch = new HotSearchList(link, name, null, true, rank, HotSearchType.今日头条热搜.name(), null,date);
// hotSearchLists.add(hotSearch);
// }
// } catch (Exception e) {
// log.error("解析今日头条实时热搜时出现解析错误,数据不是json结构", e);
// }
// } else {
// log.info("解析今日头条实时热搜时出现解析错误,页面结构有问题");
// }
// return hotSearchLists;
// }
/**
 * Fetches the read count shown on a Toutiao hot-search detail page and stores it on the entry.
 *
 * <p>Nothing is requested when the entry has no url. Otherwise the page is requested up to
 * 6 times with a one-second pause after every attempt that yields no read-count element.
 * On success the parsed number is written via {@code setCommentCount}.
 *
 * @param hotSearchList entry whose url points at the detail page
 * @return the same entry instance, with the comment count set when a read count was found
 */
public static HotSearchList toutiaoReadCount(HotSearchList hotSearchList){
    String url = hotSearchList.getUrl();
    if (url == null) {
        return hotSearchList;
    }
    Request request = RequestUtils.wrapGet(url);
    // Declared outside the loop on purpose: a body from an earlier attempt stays visible to later ones.
    String htmlBody = null;
    int attempt = 0;
    while (attempt <= 5) {
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e1) {
            log.error("解析今日头条热搜详情页面出现连接失败", e1);
        }
        Element readCountNode = null;
        if (StringUtils.isNotBlank(htmlBody)) {
            readCountNode = Jsoup.parse(htmlBody)
                    .select(".result-content .cs-view .cs-topone-tail .cs-view .margin-bottom-m .margin-left-m")
                    .first();
        }
        if (readCountNode != null) {
            Integer count = TipsUtils.getHotCount(readCountNode.text().replaceAll("阅读", ""));
            log.info("{},阅读量:{}", hotSearchList.getName(), count);
            hotSearchList.setCommentCount(count);
            return hotSearchList;
        }
        ZhiWeiTools.sleep(1000L);
        attempt++;
    }
    return hotSearchList;
}
/**
* 热搜类型
* @param wordsType
......
......@@ -5,12 +5,16 @@ import java.util.*;
import java.util.stream.Collectors;
import com.alibaba.fastjson.JSON;
import com.zhiwei.searchhotcrawler.bean.HotSearchCache;
import com.zhiwei.searchhotcrawler.config.RedisConfig;
import com.zhiwei.searchhotcrawler.dao.RedisDao;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.bson.Document;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
......@@ -25,6 +29,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.mail.SendMailWeibo;
import com.zhiwei.tools.tools.URLCodeUtil;
import org.springframework.beans.factory.annotation.Autowired;
/**
* @ClassName: WeiboHotSearch
......@@ -34,7 +39,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
*/
@Log4j2
public class WeiboHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @Title: weiboHotSearchTest
......@@ -107,6 +112,7 @@ public class WeiboHotSearchCrawler {
* @return void 返回类型
*/
public static List<HotSearchList> weiboHotSearchByPhone(Date date){
RedisDao redisDao = new RedisDao();
String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583";
Map<String,String> headerMap = new HashMap<>();
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
......@@ -147,10 +153,12 @@ public class WeiboHotSearchCrawler {
if (StringUtils.isNotBlank(icon)) {
icon = icon.split("_")[1].split(".png")[0];
}
String id = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
// String id = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
String id = cardInfo.getString("scheme");
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
result.add(hotSearch);
rank++;
redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS,name+"_微博热搜");
}
} else {
log.info("card 数据结构为:{}", card);
......@@ -214,4 +222,104 @@ public class WeiboHotSearchCrawler {
return result;
}
/**
 * Refreshes the lead text, read count and discussion count of one Weibo hot-search document.
 *
 * <p>The detail endpoint is derived from the query part of the document's {@code url} field
 * and requested up to 6 times. The first response that carries {@code data.cardlistInfo} is
 * applied to the document in place.
 *
 * <p>The document is always returned, also when every attempt failed; in that case it is
 * simply unchanged. Callers in this project dereference the result without a null check, so
 * returning {@code null} on failure would abort them.
 *
 * @param document hot-search document; its {@code name} and {@code url} fields are read
 * @return the same document instance, enriched when a usable response was obtained
 */
public static Document weiboUpdate(Document document) {
    log.info("更新微博热搜{}导语阅读量和讨论量",document.getString("name"));
    String sourceUrl = document.getString("url");
    if (sourceUrl == null) {
        log.warn("微博热搜{}缺少url,跳过更新", document.getString("name"));
        return document;
    }
    // Take the first query parameter; fall back to the rest of the string when there is no
    // '&' after it, where an unconditional substring would throw.
    int queryStart = sourceUrl.indexOf('?') + 1;
    int queryEnd = sourceUrl.indexOf('&', queryStart);
    if (queryEnd < 0) {
        queryEnd = sourceUrl.length();
    }
    String url = "https://m.weibo.cn/api/container/getIndex?" + sourceUrl.substring(queryStart, queryEnd);
    Request request = RequestUtils.wrapGet(url);
    for (int count = 0; count <= 5; count++) {
        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博热搜详情页面时出现连接失败", e);
        }
        if (htmlBody == null || !htmlBody.contains("data")) {
            continue;
        }
        try {
            JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
            JSONObject cardlistInfo = data == null ? null : data.getJSONObject("cardlistInfo");
            if (cardlistInfo == null) {
                continue;
            }
            applyCardlistInfo(document, cardlistInfo);
            return document;
        } catch (RuntimeException e) {
            // The body contained "data" but was not the expected JSON structure.
            log.error("解析微博热搜详情页面时出现解析错误", e);
        }
    }
    return document;
}

/** Copies lead text and read/discussion counts from a cardlistInfo node into the document. */
private static void applyCardlistInfo(Document document, JSONObject cardlistInfo) {
    String topicLead = cardlistInfo.getString("desc");
    if (topicLead != null && !topicLead.isEmpty()) {
        document.put("topicLead", topicLead);
    }
    com.alibaba.fastjson.JSONArray headCards = cardlistInfo.getJSONArray("cardlist_head_cards");
    if (headCards == null || headCards.isEmpty()) {
        return;
    }
    JSONObject firstCard = headCards.getJSONObject(0);
    JSONObject headData = firstCard == null ? null : firstCard.getJSONObject("head_data");
    String midText = headData == null ? null : headData.getString("midtext");
    if (midText == null) {
        return;
    }
    // midtext holds both numbers in one string: the read count precedes "讨论",
    // the discussion count follows it and may be trailed by "详情...".
    String read = midText.replaceAll("阅读", "").replaceAll("讨论.*", "").trim();
    String discussCount = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim();
    document.put("readCount", TipsUtils.getHotCount(read));
    document.put("discussCount", TipsUtils.getHotCount(discussCount));
}
// /**
// * 微博更新历史数据
// * @param hotSearch
// * @return
// */
// public static Document updateWeiBoTopic(Document hotSearch){
// String hotUrl = hotSearch.getString("url");
// String htmlBody = null;
// if(hotUrl != null && !"http://s.weibo.comjavascript:void(0);".equals(hotUrl)) {
// Request request = RequestUtils.wrapGet(hotUrl);
// try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
// htmlBody = response.body().string();
// org.jsoup.nodes.Document document = Jsoup.parse(htmlBody);
// Element element = document.select(".m-wrap .m-con-r .card-wrap .card-interest .card-content .item-topic .info h2 a").first();
// if (element != null) {
// String topicLeadUrl = element.attr("href");
// return analyHtml(hotSearch,topicLeadUrl);
// } else{
// return analyHtml(hotSearch,hotUrl);
// }
// } catch (Exception e) {
// log.error("解析微博话题榜时出现解析错误,页面结构有问题", e);
// }
// }
// return null;
// }
//
// public static Document analyHtml(Document hotSearch,String topicUrl){
// String htmlBody = null;
// Request request = RequestUtils.wrapGet(topicUrl);
// try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
// htmlBody = response.body().string();
// org.jsoup.nodes.Document document = Jsoup.parse(htmlBody);
// if(document != null) {
// Element body = document.body();
// Element read = body.select(".card-topic-a .info .total span").first();
// if(read != null) {
// String readCount = read.text().replaceAll("阅读", "");
// hotSearch.put("readCount",TipsUtils.getHotCount(readCount));
// }
// Element dis = body.select(".card-topic-a .info .total span").last();
// if(dis != null) {
// String disCount = dis.text().replaceAll("讨论", "");
// hotSearch.put("discussCount",TipsUtils.getHotCount(disCount));
// }
// Element topicLead = body.select(".m-wrap .card-wrap .card-topic-lead p").first();
// if(topicLead != null) {
// String topicLeadString = topicLead.html().replaceAll("<strong>导语:</strong>", "");
// topicLeadString = topicLeadString.length() > 150 ? topicLeadString.substring(0,150) : topicLeadString;
// hotSearch.put("topicLead",topicLeadString);
// }
// return hotSearch;
// }
// } catch (Exception e) {
// log.error("解析微博话题导语时出现解析错误,页面结构有问题", e);
// }
// return null;
// }
}
......@@ -121,7 +121,7 @@ public class ZhihuHotSearchCrawler {
hotCount = (int) (Double.parseDouble(hotText) * 10000);
} else if (hotText.contains("亿")) {
hotText = hotText.replaceAll("亿.*", "").trim();
hotCount = (int) (Double.parseDouble(hotText) * 10000000);
hotCount = (int) (Double.parseDouble(hotText) * 100000000);
} else {
hotCount = Integer.getInteger(hotText);
}
......
package com.zhiwei.searchhotcrawler.dao;
import com.mongodb.BasicDBObject;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
......@@ -44,6 +48,12 @@ public class HotSearchCacheDAO {
document.put("topic_lead", hotSearch.getTopicLead());
document.put("comment_count", hotSearch.getCommentCount());
}
if("今日头条热搜".equals(hotSearch.getType())){
document.put("comment_count", hotSearch.getCommentCount());
}
if("腾讯较真榜".equals(hotSearch.getType())){
document.put("topic_result",hotSearch.getTopicResult());
}
addAndUpdateData(document);
dataes.add(document);
});
......@@ -69,8 +79,10 @@ public class HotSearchCacheDAO {
String topicLead = document.getString("topic_lead")!=null?document.getString("topic_lead"):null;
boolean hot = document.getBoolean("hot")!=null?document.getBoolean("hot"):true;
String url = document.getString("url")!=null?document.getString("url"):null;
String topicResult = document.getString("topic_result")!=null?document.getString("topic_result"):null;
String id = name + "_" + type;
boolean recommend = false;
Integer readCount = document.getInteger("comment_count");
if("微博热搜".equals(type)){
String icon = document.getString("icon");
if("recom".equals(icon) || "jian".equals(icon)){
......@@ -87,7 +99,7 @@ public class HotSearchCacheDAO {
Integer highestCount = nowDoc.getInteger("highestCount");
Integer preRank = nowDoc.getInteger("lastRank");
Integer preCount = nowDoc.getInteger("lastCount");
String lastUrl = nowDoc.getString("url");
//判断最大热度值
if (Objects.nonNull(lastCount) && Objects.nonNull(highestCount) && lastCount > highestCount) {
highestCount = lastCount;
......@@ -104,6 +116,9 @@ public class HotSearchCacheDAO {
int durationNow = getDuration(type, duration);
// endTime = getEndTime(type, new Date());
//更新相应信息
if(url != null && !url.equals(lastUrl)){
nowDoc.put("url",url);
}
nowDoc.put("endTime", endTime);
nowDoc.put("lastRank", lastRank);
nowDoc.put("lastCount", lastCount);
......@@ -113,6 +128,12 @@ public class HotSearchCacheDAO {
nowDoc.put("preCount", preCount);
nowDoc.put("duration", durationNow);
nowDoc.put("recommend",recommend);
if(readCount != null){
nowDoc.put("readCount",readCount);
}
if(topicResult != null){
nowDoc.put("topicResult",topicResult);
}
collection.replaceOne(query, nowDoc);
} else {
nowDoc = new Document();
......@@ -133,12 +154,25 @@ public class HotSearchCacheDAO {
nowDoc.put("preRank", null);
nowDoc.put("preCount", null);
nowDoc.put("recommend",recommend);
if(readCount != null){
nowDoc.put("readCount",readCount);
}
if(topicResult != null){
nowDoc.put("topicResult",topicResult);
}
if("微博热搜".equals(type)){
nowDoc = WeiboHotSearchCrawler.weiboUpdate(nowDoc);
if(nowDoc.containsKey("topicLead")){
nowDoc.put("topicLead", nowDoc.getString("topicLead"));
}
if(nowDoc.containsKey("readCount") && nowDoc.containsKey("discussCount")) {
nowDoc.put("readCount", nowDoc.getInteger("readCount"));
nowDoc.put("discussCount", nowDoc.getInteger("discussCount"));
}
}
collection.insertOne(nowDoc);
}
}
}catch (Exception e){
log.info("数据存储时出错:{}", e);
}
......@@ -158,6 +192,19 @@ public class HotSearchCacheDAO {
}
}
// public List<Document> getHotSearchList(){
// List<Document> documentList = new ArrayList<>();
// Document query = new Document("type","微博热搜");
// query.put("endTime",new BasicDBObject("$gte", new Date(1604851200000L)).append("$lt",new Date(1604973600000L)));
// query.put("readCount",new BasicDBObject("$exists",false));
// FindIterable<Document> findIterable = collection.find(query);
// MongoCursor<Document> mongoCursor = findIterable.iterator();
// while(mongoCursor.hasNext()){
// documentList.add(mongoCursor.next());
// }
// return documentList;
// }
/**
* 计算热搜时长
......@@ -232,6 +279,34 @@ public class HotSearchCacheDAO {
return new Date(timeLong);
}
/**
 * Looks up a cached hot-search entry by its primary key.
 *
 * @param id value matched against the {@code _id} field
 * @return the first matching document, or {@code null} when none matches
 */
public Document getHotSearchById(String id){
    return (Document) collection.find(new Document("_id", id)).first();
}
/**
 * Writes refreshed Weibo detail fields onto the cached hot-search entry with the given id.
 *
 * <p>Only {@code topicLead} (when non-null) and the {@code readCount}/{@code discussCount}
 * pair (when both keys are present) are written, using a single {@code $set} update.
 * Fields maintained by the crawl path (rank, count, duration, ...) are therefore never
 * overwritten with a stale copy, which a read-modify-replace of the whole document could do.
 * Nothing is written when no entry has this id or when there is nothing to set.
 *
 * @param document source of the refreshed values
 * @param id       {@code _id} of the cache entry to update
 */
public void updateWeibo(Document document,String id){
    Document changes = new Document();
    String topicLead = document.getString("topicLead");
    if (topicLead != null) {
        changes.put("topicLead", topicLead);
    }
    if (document.containsKey("readCount") && document.containsKey("discussCount")) {
        changes.put("readCount", document.getInteger("readCount"));
        changes.put("discussCount", document.getInteger("discussCount"));
    }
    if (changes.isEmpty()) {
        return;
    }
    collection.updateOne(new Document("_id", id), new Document("$set", changes));
}
}
package com.zhiwei.searchhotcrawler.dao;
import com.zhiwei.searchhotcrawler.config.RedisConfig;
import lombok.extern.log4j.Log4j2;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import java.util.Set;
/**
 * Basic Redis operations backed by one connection pool shared by all instances.
 *
 * <p>Each operation borrows a connection from the pool and returns it when done, so creating
 * {@code RedisDao} objects repeatedly (as the scheduled jobs do) neither builds a new pool nor
 * leaves a connection checked out.
 */
@Log4j2
public class RedisDao {

    /** Holds the shared pool; it is built on first access from the {@link RedisConfig} values. */
    private static final class PoolHolder {
        private static final JedisPool POOL = createPool();
    }

    private final JedisPool pool;

    public RedisDao(){
        this.pool = PoolHolder.POOL;
    }

    /** Builds the pool; database selection and optional authentication are applied per connection. */
    private static JedisPool createPool() {
        JedisPoolConfig poolConfig = new JedisPoolConfig();
        poolConfig.setMaxIdle(RedisConfig.redisMaxIdle);
        poolConfig.setMaxTotal(RedisConfig.redisMaxTotal);
        poolConfig.setMinIdle(RedisConfig.redisMinIdle);
        // An absent or blank password means "no AUTH"; a configured one is honoured.
        String password = RedisConfig.redisPassword;
        if (password != null && password.trim().isEmpty()) {
            password = null;
        }
        return new JedisPool(poolConfig, RedisConfig.redisHost, RedisConfig.redisPort,
                RedisConfig.redisTimeout, password, RedisConfig.redisDataBase);
    }

    /**
     * Stores a string value.
     *
     * @param key   Redis key
     * @param value value to store
     */
    public void setRedisData(String key, String value){
        try (Jedis jedis = pool.getResource()) {
            jedis.set(key, value);
        }
    }

    /**
     * Reads a string value.
     *
     * @param key Redis key
     * @return the value returned by {@code GET} for this key
     */
    public String getRedisData(String key){
        try (Jedis jedis = pool.getResource()) {
            return jedis.get(key);
        }
    }

    /**
     * Adds a member to a Redis set.
     *
     * @param key   key of the set
     * @param value member to add
     */
    public void addDataToSet(String key, String value){
        try (Jedis jedis = pool.getResource()) {
            jedis.sadd(key, value);
        }
    }

    /**
     * Reads all members of a Redis set.
     *
     * @param key key of the set
     * @return the members returned by {@code SMEMBERS} for this key
     */
    public Set<String> getRedisSetData(String key){
        try (Jedis jedis = pool.getResource()) {
            return jedis.smembers(key);
        }
    }

    /**
     * Deletes a key.
     *
     * @param key key to delete
     */
    public void removeRedis(String key){
        try (Jedis jedis = pool.getResource()) {
            jedis.del(key);
        }
    }
}
package com.zhiwei.searchhotcrawler.timer;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.ToutiaoHotSearchCrawler;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
@Log4j2
public class TouTiaoExecutor extends Thread {
private HotSearchList hotSearchList;
private static List<HotSearchList> resultList;
public TouTiaoExecutor(HotSearchList hotSearchList){
this.hotSearchList = hotSearchList;
}
@Override
public void run() {
try {
hotSearchList = ToutiaoHotSearchCrawler.toutiaoReadCount(hotSearchList);
resultList.add(hotSearchList);
}catch (Exception e){
e.printStackTrace();
}
}
/**
* 今日头条阅读量统计
* @param list
* @return
*/
public static List<HotSearchList> countTouTiaoReadCount(List<HotSearchList> list){
resultList= new ArrayList<>();
// ExecutorService service = Executors.newFixedThreadPool(list.size());
for(int i=0; i<list.size(); i++){
TipsUtils.service.execute(new TouTiaoExecutor(list.get(i)));
}
// TipsUtils.service.shutdown();
try {
if(!TipsUtils.service.awaitTermination(1, TimeUnit.MINUTES)){
log.info("查询今日头条阅读量超时");
}
} catch (InterruptedException e) {
log.info(e.fillInStackTrace());
}
return resultList;
}
}
......@@ -3,11 +3,15 @@ package com.zhiwei.searchhotcrawler.timer.quartz;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
import com.zhiwei.searchhotcrawler.config.RedisConfig;
import com.zhiwei.searchhotcrawler.crawler.*;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.RedisDao;
import com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO;
import com.zhiwei.searchhotcrawler.timer.TouTiaoExecutor;
import com.zhiwei.searchhotcrawler.util.DateUtils;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.bson.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -17,9 +21,7 @@ import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.*;
@Component
@EnableScheduling
......@@ -49,6 +51,32 @@ public class GatherTimer {
}
/**
 * Refreshes lead text, read count and discussion count of recently collected Weibo hot searches.
 *
 * <p>The ids are read from the Redis set {@code RedisConfig.WEIBO_HOTSEARCHIDS}, which is then
 * deleted. Each id that has a cache entry is re-crawled and, when the result carries any of
 * the three fields, written back. A failure for one id is logged and does not stop the rest
 * of the batch.
 *
 * <p>NOTE(review): reading and deleting the set are two separate Redis calls, so an id added
 * in between is dropped without being processed - confirm whether that is acceptable.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "45 0/10 * * * ? ")
public void updateWeiBo(){
    logger.info("微博热搜导语更新...");
    RedisDao redisDao = new RedisDao();
    HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
    Set<String> hotSearchIdSet = redisDao.getRedisSetData(RedisConfig.WEIBO_HOTSEARCHIDS);
    redisDao.removeRedis(RedisConfig.WEIBO_HOTSEARCHIDS);
    for (String id : hotSearchIdSet) {
        boolean crawled = false;
        try {
            Document document = hotSearchCacheDAO.getHotSearchById(id);
            if (document != null) {
                crawled = true;
                Document updated = WeiboHotSearchCrawler.weiboUpdate(document);
                // The crawler may hand back null when every attempt failed.
                if (updated != null
                        && (updated.containsKey("topicLead")
                            || updated.containsKey("readCount")
                            || updated.containsKey("discussCount"))) {
                    hotSearchCacheDAO.updateWeibo(updated, id);
                }
            }
        } catch (RuntimeException e) {
            // The Redis set is already gone, so keep going with the remaining ids.
            logger.error("微博热搜{}导语更新失败", id, e);
        }
        if (crawled) {
            // Pause only after an actual request, to throttle calls to Weibo.
            ZhiWeiTools.sleep(3000L);
        }
    }
    logger.info("微博热搜导语更新结束...");
}
/**
* 今日头条热搜的采集
*/
@Async(value = "myScheduler")
......@@ -57,8 +85,10 @@ public class GatherTimer {
logger.info("今日头条热搜开始采集...");
Date date = DateUtils.getMillSecondTime(new Date());
List<HotSearchList> toutiaoList = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone(date);
logger.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), Integer.valueOf(toutiaoList != null ? toutiaoList.size() : 0));
TipsUtils.addHotList(HotSearchType.今日头条热搜.name(),toutiaoList);
List<HotSearchList> toutiaoResult = new ArrayList<>();
toutiaoResult = TouTiaoExecutor.countTouTiaoReadCount(toutiaoList);
logger.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), toutiaoResult != null ? toutiaoResult.size() : 0);
TipsUtils.addHotList(HotSearchType.今日头条热搜.name(),toutiaoResult);
logger.info("今日头条热搜采集结束...");
}
......@@ -228,8 +258,8 @@ public class GatherTimer {
/**
* 凤凰新闻热搜的采集
*/
@Async(value = "myScheduler")
@Scheduled(cron = "10 * * * * ? ")
// @Async(value = "myScheduler")
// @Scheduled(cron = "10 * * * * ? ")
public void crawlerFengHuangHotSearch(){
Date date = DateUtils.getMillSecondTime(new Date());
List<HotSearchList> list = FengHuangSearchCrawler.getFengHuangHotSearch(date);
......@@ -237,6 +267,20 @@ public class GatherTimer {
}
/**
 * Scheduled crawl of the Tencent fact-check (rumour-refutation) board.
 *
 * <p>Logs start and end, crawls the board with a timestamp from {@code DateUtils.getMillSecondTime},
 * logs the number of entries and passes them to {@code TipsUtils.addHotList} under the board's type name.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "10 * * * * ? ")
public void crawlerTengXunVerificationHotSearch(){
    logger.info("{},腾讯较真辟谣榜开始采集", new Date());
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> entries = TengXunCrawler.getTengXunVerificationList(crawlTime);
    final int entryCount = entries.size();
    logger.info("腾讯较真辟谣榜本轮采集数量:{}", entryCount);
    final String boardType = HotSearchType.腾讯较真榜.name();
    TipsUtils.addHotList(boardType, entries);
    logger.info("{},腾讯较真辟谣榜采集结束", new Date());
}
/**
* 搜狐话题的采集
*/
@Async(value = "myScheduler")
......@@ -261,6 +305,9 @@ public class GatherTimer {
logger.info("知乎热搜话题采集结束...");
}
/**
* 微博预热榜的采集
*/
@Async(value = "myScheduler")
@Scheduled(cron = "20 * * * * ? ")
public void crawlerWeiBoPreheat(){
......@@ -330,6 +377,25 @@ public class GatherTimer {
logger.info("微博话题采集结束........");
}
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 05 09 * * ? ")
// public void updateWeiboHistory(){
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// List<Document> documentList = hotSearchCacheDAO.getHotSearchList();
// int i=0;
// for (Document document : documentList){
// document = WeiboHotSearchCrawler.updateWeiBoTopic(document);
// if(document != null){
// hotSearchCacheDAO.updateWeibo(document,document.getString("_id"));
// ZhiWeiTools.sleep(500L);
// }
// i++;
// logger.info("更新进度:{}",i*100/documentList.size());
// }
// logger.info("更新结束");
// }
/**
* 知乎子类采集函数
* @param type
......
......@@ -9,6 +9,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
* 预警发送
......@@ -23,6 +25,8 @@ public class TipsUtils {
private static Map<String,Date> typeTips = new HashMap<>();
public static ExecutorService service = Executors.newFixedThreadPool(30);
//未采集到数据发送预警信息
public static void sendTips(String type, Date time){
HotSearchListDAO hotSearchListDAO = new HotSearchListDAO();
......
#redis.host=127.0.0.1
#redis.port=6379
#redis.password=
#redis
redis.host = 192.168.0.39
redis.port = 6379
redis.database = 1
#maxIdle
redis.maxIdle=10
#minIdle
redis.minIdle=5
#maxTotal
redis.maxTotal=10
#timeout
redis.timeout=5000
redis.testOnBorrow=false
redis.testOnReturn=false
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment