Commit 9ce337bb by zhiwei

添加总热搜表缓存+mongo修改为多节点读取+微博话题类型错误修复

parent dd95f27b
package com.zhiwei.searchhotcrawler.bean;
import lombok.Data;
import lombok.ToString;
import java.util.Date;
@ToString
@Data
public class HotSearchCache {
/**
* 主键
*/
private String id;
/**
* 消息链接
*/
private String url;
/**
* 热搜关键词,且为消息主键
*/
private String name;
/**
* 热搜或话题导语
*/
private String topicLead;
/**
* 最高热搜值
*/
private Integer highestCount;
/**
* 最新热搜热度值
*/
private Integer lastCount;
/**
* 状态(true 为热搜; false为时时上升)
*/
private Boolean hot;
/**
* 话题开始时间
*/
private Date startTime;
/**
* 话题结束时间
*/
private Date endTime;
/**
* 最高排名
*/
private Integer highestRank;
/**
* 最新排名
*/
private Integer lastRank;
/**
* 热搜分类
*/
private String type;
/**
* 热搜持续时长
*/
private Integer duration;
public HotSearchCache(String url, String name, String topicLead, Integer highestCount, Integer lastCount, Boolean hot,
Date startTime, Date endTime, Integer highestRank, Integer lastRank, String type, Integer duration){
this.id = name + "_" + type;
this.url = url;
this.name = name;
this.topicLead = topicLead;
this.hot = hot;
this.highestCount = highestCount;
this.lastCount = lastCount;
this.hot = hot;
this.startTime = startTime;
this.endTime = endTime;
this.highestRank = highestRank;
this.lastRank = lastRank;
this.type = type;
this.duration = duration;
}
}
......@@ -40,7 +40,7 @@ public class HotSearchList implements Serializable{
private String topicLead;
/**
* 时时热搜量
* 热搜量
*/
private Integer count;
......@@ -60,11 +60,6 @@ public class HotSearchList implements Serializable{
private Date time;
/**
* 据上分钟变化量
*/
private Integer changeCount;
/**
* 排名
*/
private Integer rank;
......
......@@ -3,7 +3,7 @@ package com.zhiwei.searchhotcrawler.config;
import java.io.InputStream;
import java.util.Properties;
public class Config {
public class DBConfig {
static {
Properties conf = null;
try {
......@@ -12,29 +12,22 @@ public class Config {
conf = new Properties();
conf.load(is);
is.close();
mongoIp = conf.getProperty("mongoIp");
mongoPort = Integer.valueOf(conf.getProperty("mongoPort"));
userName = conf.getProperty("db.username");
userPwd = conf.getProperty("db.paasword");
authDB = conf.getProperty("db.certifiedDB");
mongoUri = conf.getProperty("mongoUri");
dbName = conf.getProperty("dbName");
searchCollName = conf.getProperty("searchCollName");
searchCacheCollName = conf.getProperty("searchCacheCollName");
topicCollName = conf.getProperty("topicCollName");
collWechatUserName = conf.getProperty("collWechatUserName");
} catch (Exception e) {
e.printStackTrace();
}
}
public static String mongoIp;
public static int mongoPort;
public static String userName;
public static String userPwd;
public static String authDB;
public static String mongoUri;
public static String dbName;
public static String searchCollName;
public static String searchCacheCollName;
public static String topicCollName;
public static String collWechatUserName;
}
package com.zhiwei.searchhotcrawler.crawler;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
......@@ -107,6 +103,8 @@ public class WeiboHotSearchCrawler {
* @return void 返回类型
*/
public static List<HotSearchList> weiboHotSearchByPhone(){
for(int count =0; count<=5; count++){
String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583";
Map<String,String> headerMap = new HashMap<>();
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
......@@ -123,9 +121,10 @@ public class WeiboHotSearchCrawler {
try {
JSONObject card = cards.getJSONObject(i);
JSONArray cardGroup = card.getJSONArray("card_group");
if(Objects.nonNull(cardGroup) && !cardGroup.isEmpty()){
String title = card.getString("title");
boolean hot = true;
if(title.contains("实时上升热点")){
if(Objects.nonNull(title) && title.contains("实时上升热点")){
hot = false;
rank = 50;
}
......@@ -142,6 +141,9 @@ public class WeiboHotSearchCrawler {
result.add(hotSearch);
rank++;
}
}else{
log.info("card 数据结构为:{}", card);
}
} catch (Exception e) {
log.error("解析微博时时热搜时出现解析错误",e);
continue;
......@@ -150,14 +152,13 @@ public class WeiboHotSearchCrawler {
return result;
} catch (Exception e) {
log.error("解析微博时时热搜时出现解析错误,数据不是json结构",e);
return Collections.emptyList();
}
}else{
log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
}
} catch (IOException e1) {
log.error("解析微博时时热搜时出现连接失败",e1);
return Collections.emptyList();
}
}
return Collections.emptyList();
}
......
......@@ -61,7 +61,7 @@ public class WeiboSuperTopicCrawler {
//重试三次
for(int retryTimes = 1; retryTimes<=3; retryTimes++) {
try {
System.out.println("pageUrl=========="+pageUrl);
// System.out.println("pageUrl=========="+pageUrl);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc1")) {
topicList.addAll(parseTopicRankHtml(page, htmlBody, type));
......
......@@ -134,7 +134,7 @@ public class WeiboTopicCrawler {
//重试三次
for(int retryTimes = 1; retryTimes<=5; retryTimes++) {
try {
log.info("pageUrl::{}", pageUrl);
// log.info("pageUrl::{}", pageUrl);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("top_mark_text")) {
topicList.addAll(parseTopicHtml(htmlBody));
......@@ -202,7 +202,7 @@ public class WeiboTopicCrawler {
}
return topicList;
}else{
log.info("html:{}",htmlBody);
// log.info("html:{}",htmlBody);
}
} catch (Exception e) {
log.error("解析榜单列表页面时出现错误,错误为:{}", e);
......
package com.zhiwei.searchhotcrawler.dao;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.mongodb.client.MongoCollection;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import java.util.Date;
import java.util.List;
import java.util.Objects;
/**
* 热搜基础结果表
*/
@Log4j2
public class HotSearchCacheDAO {
private static MongoCollection collection = MongoDBTemplate.getCollection(DBConfig.dbName, DBConfig.searchCacheCollName);
/**
* 添加及更新相应数据表中的数据
* @param document
*/
public void addAndUpdateData(Document document){
try {
String name = document.getString("name");
String type = document.getString("type");
int lastRank = document.getInteger("rank")!=null?document.getInteger("rank"): -1;
int lastCount = document.getInteger("count")!=null?document.getInteger("count"): -1;
Date startTime = document.getDate("time");
Date endTime = new Date(startTime.getTime() + (60 * 1000));
String topicLead = document.getString("topic_lead")!=null?document.getString("topic_lead"):null;
boolean hot = document.getBoolean("hot")!=null?document.getBoolean("hot"):true;
String url = document.getString("url")!=null?document.getString("url"):null;
String id = name + "_" + type;
Document query = new Document("_id", id);
Document nowDoc = (Document) collection.find(query).first();
if (Objects.nonNull(nowDoc)) {
int highestRank = nowDoc.getInteger("highestRank");
int highestCount = nowDoc.getInteger("highestCount");
//判断最大热度值
if (lastCount > highestCount) {
highestCount = lastCount;
}
//判断最高排名
if (lastRank < highestRank) {
highestRank = lastRank;
}
//计算热搜时长
int duration = nowDoc.getInteger("duration");
int durationNow = getDuration(type, duration);
endTime = new Date(new Date().getTime() + (60 * 1000));
//更新相应信息
nowDoc.put("endTime", endTime);
nowDoc.put("lastRank", lastRank);
nowDoc.put("lastCount", lastCount);
nowDoc.put("highestRank", highestRank);
nowDoc.put("highestCount", highestCount);
nowDoc.put("duration", durationNow);
collection.replaceOne(query, nowDoc);
} else {
nowDoc = new Document();
int durationNow = getDuration(type, 0);
nowDoc.put("_id", id);
nowDoc.put("url", url);
nowDoc.put("name", name);
nowDoc.put("hot", hot);
nowDoc.put("topicLead", topicLead);
nowDoc.put("type", type);
nowDoc.put("lastRank", lastRank);
nowDoc.put("highestRank", lastRank);
nowDoc.put("lastCount", lastCount);
nowDoc.put("highestCount", lastCount);
nowDoc.put("startTime", startTime);
nowDoc.put("endTime", endTime);
nowDoc.put("duration", durationNow);
collection.insertOne(nowDoc);
}
}catch (Exception e){
log.info("数据存储时出错:{}", e);
}
}
/**
* 计算热搜时长
* @param type
* @param duration
* @return
*/
private int getDuration(String type, int duration){
switch (type){
case "微博热搜" :
duration = duration + 1;
break;
case "百度热搜" :
duration = duration + 5;
break;
case "知乎热搜" :
duration = duration + 10;
break;
case "抖音热搜" :
duration = duration + 10;
break;
case "搜狗微信热搜" :
duration = duration + 5;
break;
case "微博话题" :
duration = duration + 3;
break;
default :
duration = duration + 1;
}
return duration;
}
}
......@@ -6,7 +6,12 @@ import java.util.Date;
import java.util.List;
import java.util.Objects;
import com.zhiwei.searchhotcrawler.config.Config;
import com.mongodb.client.ListIndexesIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.IndexOptions;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import lombok.extern.log4j.Log4j2;
import com.mongodb.BasicDBObject;
......@@ -14,138 +19,50 @@ import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.cache.CacheManager;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.tools.timeparse.TimeParse;
import org.bson.BsonDocument;
import org.bson.Document;
import org.bson.conversions.Bson;
@Log4j2
public class HotSearchListDAO extends MongoDBTemplate{
public class HotSearchListDAO{
public static MongoDatabase mongoDatabase = MongoDBTemplate.getDB(DBConfig.dbName);
public static MongoCollection mongoCollection;
public HotSearchListDAO() {
super();
super.setDbName(Config.dbName);
String time = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
String year = time.substring(0,4);
String month = time.substring(5,7);
String collName = Config.searchCollName + year + "_" + month;
super.setCollName(collName);
String collName = DBConfig.searchCollName + year + "_" + month;
mongoCollection = mongoDatabase.getCollection(collName);
//给数据表创建索引
createIndex();
MongoDBTemplate.createIndex(DBConfig.dbName, collName);
}
/**
* 初次创建表及创建相应的索引
*/
private void createIndex(){
List<DBObject> indexList = this.getReadColl().getIndexInfo();
if(Objects.isNull(indexList) && indexList.isEmpty()){
DBObject countIndexDoc = new BasicDBObject();
countIndexDoc.put("count", -1);
DBObject timeIndexDoc = new BasicDBObject();
timeIndexDoc.put("time", -1);
DBObject rankIndexDoc = new BasicDBObject();
rankIndexDoc.put("rank", -1);
DBObject nameIndexDoc = new BasicDBObject();
nameIndexDoc.put("name", -1);
DBObject typeIndexDoc = new BasicDBObject();
typeIndexDoc.put("type", -1);
try {
super.getReadColl().createIndex(countIndexDoc, new BasicDBObject("name", "count_desc"));
super.getReadColl().createIndex(timeIndexDoc, new BasicDBObject("name", "time_desc"));
super.getReadColl().createIndex(rankIndexDoc, new BasicDBObject("name", "rank_desc"));
super.getReadColl().createIndex(nameIndexDoc, new BasicDBObject("name", "name_desc"));
super.getReadColl().createIndex(typeIndexDoc, new BasicDBObject("name", "type_desc"));
} catch (Exception e) {
e.printStackTrace();
}
}
}
/**
* 添加数据入库
* @param list
*/
public void addHotSearchList(List<DBObject> list){
try {
this.getReadColl().insert(list);
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
}
public void addHotSearch(DBObject doc){
try {
this.getReadColl().insert(doc);
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
}
/**
* 查询据上次变化量
* @Title: getChangeCount
* @author hero
* @param @param weiboHotSearch
* @param @return 设定文件
* @return int 返回类型
*/
public int getChangeCount(HotSearchList weiboHotSearch){
int result = 0;
DBObject query = new BasicDBObject();
query.put("name", weiboHotSearch.getName());
DBObject sort = new BasicDBObject();
sort.put("time", -1);
public void addHotSearchList(List<Document> list){
try {
DBCursor cur = this.getReadColl().find(query).sort(sort).limit(1);
while(cur.hasNext()){
DBObject doc = cur.next();
if(doc.get("count")!=null) {
result = weiboHotSearch.getCount() - Integer.valueOf(doc.get("count").toString());
break;
}
}
cur.close();
mongoCollection.insertMany(list);
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
return result;
}
return result;
}
/**
* @Title: getWeiboHotOneHour
* @author hero
* @Description: 查询最近1小时内新增的微博热搜
* @param @return 设定文件
* @return List<DBObject> 返回类型
*/
public List<DBObject> getHotOneHour(String type){
List<DBObject> list = new ArrayList<>();
Date date = new Date((new Date().getTime()-60*60*1000));
DBObject query = new BasicDBObject();
query.put("time", new BasicDBObject("$gte", date));
query.put("changeCount", 0);
query.put("type", type);
public void addHotSearch(Document doc){
try {
DBCursor cur = this.getReadColl().find(query);
while(cur.hasNext()){
DBObject doc = cur.next();
String name = doc.get("name").toString();
if(CacheManager.getCacheByKey(name)==null){
CacheManager.putCache(name, doc, 48*60*60*1000);
list.add(doc);
}
}
cur.close();
mongoCollection.insertOne(doc);
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
return list;
}
}
package com.zhiwei.searchhotcrawler.dao;
import java.util.Collections;
import java.util.List;
import com.zhiwei.searchhotcrawler.config.Config;
import lombok.extern.log4j.Log4j2;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
@Log4j2
public class WechatUserDao extends MongoDBTemplate{
public WechatUserDao() {
super();
super.setDbName(Config.dbName);
super.setCollName(Config.collWechatUserName);
}
/**
* 添加分组用户
* @param userlist
* @param groupName
* @param groupId
*/
public void addWechatUser(List<String> userlist, String groupName, Integer groupId){
for(int i=0; i<3; i++){
try {
DBObject doc = new BasicDBObject();
doc.put("_id", groupId+"-"+groupName);
doc.put("groupId", groupId);
doc.put("groupName", groupName);
doc.put("user", userlist);
this.getReadColl().save(doc);
break;
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
}
}
/**
* 根据分组名称查询分组用户
* @param group
* @return
*/
@SuppressWarnings("unchecked")
public List<String> getWechatUserByGroup(String group){
try {
DBObject query = new BasicDBObject();
query.put("groupName", group);
DBObject doc = this.getReadColl().findOne(query);
if(doc != null){
return (List<String>)doc.get("user");
}
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
return Collections.emptyList();
}
}
......@@ -3,76 +3,46 @@ package com.zhiwei.searchhotcrawler.dao;
import java.util.Date;
import java.util.List;
import java.util.Objects;
import com.zhiwei.searchhotcrawler.config.Config;
import lombok.extern.log4j.Log4j2;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import lombok.extern.log4j.Log4j2;
import com.zhiwei.tools.timeparse.TimeParse;
import org.bson.Document;
@Log4j2
public class WeiboSuperTopicDAO extends MongoDBTemplate{
public class WeiboSuperTopicDAO{
public static MongoDatabase mongoDatabase = MongoDBTemplate.getDB(DBConfig.dbName);
public static MongoCollection mongoCollection;
public WeiboSuperTopicDAO() {
super();
super.setDbName(Config.dbName);
String time = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
String year = time.substring(0,4);
String month = time.substring(5,7);
String collName = Config.topicCollName + year + "_" + month;
super.setCollName(collName);
String collName = DBConfig.searchCollName + year + "_" + month;
mongoCollection = mongoDatabase.getCollection(collName);
createIndex();
//给数据表创建索引
MongoDBTemplate.createIndex(DBConfig.dbName, collName);
}
/**
* 初次创建表及创建相应的索引
*/
private void createIndex(){
List<DBObject> indexList = this.getReadColl().getIndexInfo();
if(Objects.isNull(indexList) && indexList.isEmpty()){
DBObject countIndexDoc = new BasicDBObject();
countIndexDoc.put("score_num", -1);
DBObject timeIndexDoc = new BasicDBObject();
timeIndexDoc.put("time", -1);
DBObject rankIndexDoc = new BasicDBObject();
rankIndexDoc.put("rank", -1);
DBObject nameIndexDoc = new BasicDBObject();
nameIndexDoc.put("name", -1);
DBObject typeIndexDoc = new BasicDBObject();
typeIndexDoc.put("type", -1);
try {
super.getReadColl().createIndex(countIndexDoc, new BasicDBObject("name", "score_desc"));
super.getReadColl().createIndex(timeIndexDoc, new BasicDBObject("name", "time_desc"));
super.getReadColl().createIndex(rankIndexDoc, new BasicDBObject("name", "rank_desc"));
super.getReadColl().createIndex(nameIndexDoc, new BasicDBObject("name", "name_desc"));
super.getReadColl().createIndex(typeIndexDoc, new BasicDBObject("name", "type_desc"));
} catch (Exception e) {
e.printStackTrace();
}
}
}
/**
* 添加数据入库
* @param list
*/
public void addTopicList(List<DBObject> list){
public void addTopicList(List<Document> list){
try {
this.getReadColl().insert(list);
mongoCollection.insertMany(list);
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
}
public void addTopic(DBObject doc){
public void addTopic(Document doc){
try {
this.getReadColl().insert(doc);
mongoCollection.insertOne(doc);
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
......
package com.zhiwei.searchhotcrawler.dbtemplate;
import java.util.Arrays;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.Mongo;
import com.mongodb.MongoClient;
import com.mongodb.MongoCredential;
import com.mongodb.MongoException;
import com.mongodb.ServerAddress;
import com.zhiwei.searchhotcrawler.config.Config;
import com.mongodb.MongoClientOptions;
import com.mongodb.MongoClientURI;
import com.mongodb.WriteConcern;
import com.mongodb.client.ListIndexesIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.IndexOptions;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import java.util.Objects;
@Log4j2
public enum MongoDBTemplate {
instance;
private MongoClient mongoClient;
static {
MongoClientOptions options = new MongoClientOptions.Builder()
.connectionsPerHost(300) //连接池设置为300个连接,默认为100
.connectTimeout(15000) //连接超时,推荐>3000毫秒
.maxWaitTime(5000)
.socketTimeout(0) // 套接字超时时间,0无限制
.threadsAllowedToBlockForConnectionMultiplier(5000) // 线程队列数,如果连接线程排满了队列就会抛出“Out of semaphores to get db”错误。
.writeConcern(WriteConcern.W1) //
.build();
log.info("MongoDBTemplate.static initializer : {}", DBConfig.mongoUri);
MongoClientURI mongoClientURI = new MongoClientURI(DBConfig.mongoUri);
instance.mongoClient = new MongoClient(mongoClientURI);
}
/**
/**
* 获取DB实例 - 指定DB
*
* @Description: MongoDB模板类
* @author Tou Tang
* @date 2014-11-14 下午3:24:40
* @param databaseName
* @return
*/
public class MongoDBTemplate {
protected static Mongo reader;
protected static Mongo writer;
protected String collName;
protected String dbName;
@SuppressWarnings("deprecation")
public MongoDBTemplate() {
try {
MongoCredential credential = MongoCredential.createCredential(Config.userName, Config.authDB, Config.userPwd.toCharArray());
ServerAddress address = new ServerAddress(Config.mongoIp, Config.mongoPort);
if(reader==null)
{
reader = new MongoClient(address, Arrays.asList(credential));
// reader = new MongoClient(address);
}
if(writer==null)
{
writer = new MongoClient(address, Arrays.asList(credential));
// writer = new MongoClient(address);
}
} catch (MongoException e) {
e.printStackTrace();
}
}
public DBCollection getReadColl() {
@SuppressWarnings("deprecation")
final DB db = getReader().getDB(dbName);
final DBCollection coll = db.getCollection(collName);
return coll;
}
protected Mongo getReader() {
return reader;
public static MongoDatabase getDB(String databaseName) {
return instance.mongoClient.getDatabase(databaseName);
}
public DBCollection getWriteColl() {
@SuppressWarnings("deprecation")
final DB db = getWriter().getDB(dbName);
final DBCollection coll = db.getCollection(collName);
return coll;
}
protected Mongo getWriter() {
return writer;
/**
* 获取collection对象 - 指定Collection
*
* @param databaseName
* @param collectionName
* @return
*/
public static MongoCollection<Document> getCollection(String databaseName, String collectionName) {
MongoDatabase db = instance.mongoClient.getDatabase(databaseName);
return db.getCollection(collectionName);
}
protected void setCollName(final String collName) {
this.collName = collName;
}
protected void setDbName(final String dbName) {
this.dbName = dbName;
}
@SuppressWarnings("static-access")
protected void setReader(final Mongo reader) {
this.reader = reader;
/**
* 创建索引
* @param databaseName
* @param collectionName
*/
public static void createIndex(String databaseName, String collectionName){
MongoDatabase db = instance.mongoClient.getDatabase(databaseName);
MongoCollection mongoCollection = db.getCollection(collectionName);
ListIndexesIterable<Document> indexList = mongoCollection.listIndexes();
if(Objects.isNull(indexList)){
Document countIndexDoc = new Document();
countIndexDoc.put("score_num", -1);
Document timeIndexDoc = new Document();
timeIndexDoc.put("time", -1);
Document rankIndexDoc = new Document();
rankIndexDoc.put("rank", -1);
Document nameIndexDoc = new Document();
nameIndexDoc.put("name", -1);
Document typeIndexDoc = new Document();
typeIndexDoc.put("type", -1);
try {
mongoCollection.createIndex(countIndexDoc, new IndexOptions().name("count_desc"));
mongoCollection.createIndex(timeIndexDoc, new IndexOptions().name("time_desc"));
mongoCollection.createIndex(rankIndexDoc, new IndexOptions().name("rank_desc"));
mongoCollection.createIndex(nameIndexDoc, new IndexOptions().name( "name_desc"));
mongoCollection.createIndex(typeIndexDoc, new IndexOptions().name( "type_desc"));
} catch (Exception e) {
e.printStackTrace();
}
@SuppressWarnings("static-access")
protected void setWriter(final Mongo writer) {
this.writer = writer;
}
public static void main(String[] args) {
}
}
......@@ -20,9 +20,12 @@ public class HotSearchRun {
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
ProxyFactory.init(simpleConfig);
new UpdateWechatUserRun().start();
ZhiWeiTools.sleep(10000);
new CacheListener().startListen();
// new UpdateWechatUserRun().start();
// ZhiWeiTools.sleep(10000);
// new CacheListener().startListen();
//推送程序启动
// new SendWeiboHotSearchRun().start();
// new SendZhihuHotSearchRun().start();
// ScheduledExecutorService scheduledThreadPool = Executors.newScheduledThreadPool(6);
//
......@@ -47,8 +50,6 @@ public class HotSearchRun {
new ZhihuHotSearchRun().start();
new WeiboSuperTopicRun().start();
new WeiboTopicRun().start();
//推送程序启动
new SendWeiboHotSearchRun().start();
new SendZhihuHotSearchRun().start();
}
}
package com.zhiwei.searchhotcrawler.test;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class HotSearchListTest{
public static void main(String[] args) {
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
.group(ProxyConfig.group).appId(10000013).appName("zzw").build();
ProxyFactory.init(simpleConfig);
String url = "http://app.myzaker.com/news/app.php?f=";
HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
try{
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
Elements elements = Jsoup.parse(htmlBody).select("div.titlebar>a");
for(Element element : elements){
String lableUrl = "http://app.myzaker.com/news/app.php" + element.attr("href");
System.out.println("lableUrl========="+lableUrl);
String htmlBodyLable = httpBoot.syncCall(RequestUtils.wrapGet(lableUrl), ProxyHolder.NAT_HEAVY_PROXY).body().string();
Elements elementsLable = Jsoup.parse(htmlBodyLable).select("div#infinite_scroll>a");
for(Element elementLable : elementsLable){
System.out.println(elementLable.attr("href") + "=============" + elementLable.text());
}
}
}catch (Exception e){
e.printStackTrace();
}
// MongoCredential credential = MongoCredential.createCredential(Config.userName, Config.authDB, Config.userPwd.toCharArray());
// ServerAddress address = new ServerAddress(Config.mongoIp, Config.mongoPort);
// Mongo mongo = new MongoClient(address, Arrays.asList(credential));
//
// DB db = mongo.getDB("hot_search_list");
// DBCollection coll = db.getCollection("hot_search_list2019_09");
//
//// MongoCredential credentialNew = MongoCredential.createCredential("datapush", "admin", "4d8ce5c42073c".toCharArray());
//// ServerAddress addressNew = new ServerAddress(Config.mongoIp, Config.mongoPort);
//// Mongo mongoNew = new MongoClient(address, Arrays.asList(credentialNew));
//// DB dbNew = mongoNew.getDB("hot_search_list");
//
// Map<String,String> timLine = TimeParse.getTimeMap("2019-10-01 00:00:00", "2019-10-09 23:59:59", "dd", 1);
//
// timLine.forEach((start, end) ->{
//
// String year = end.substring(0,4);
// String month = end.substring(5,7);
// Date startDate = TimeParse.stringFormartDate(start);
// Date endDate = TimeParse.stringFormartDate(end);
//
// String collName = "hot_search_list"+year+"_"+month;
// System.out.println("collName=========="+collName);
//// DBCollection collNew = dbNew.getCollection(collName);
//// DBObject countIndexDoc = new BasicDBObject();
//// countIndexDoc.put("count", -1);
//// DBObject timeIndexDoc = new BasicDBObject();
//// timeIndexDoc.put("time", -1);
//// DBObject rankIndexDoc = new BasicDBObject();
//// rankIndexDoc.put("rank", -1);
//// DBObject nameIndexDoc = new BasicDBObject();
//// nameIndexDoc.put("name", -1);
//// DBObject typeIndexDoc = new BasicDBObject();
//// typeIndexDoc.put("type", -1);
//// try {
//// collNew.createIndex(countIndexDoc, new BasicDBObject("name", "count_desc"));
//// collNew.createIndex(timeIndexDoc, new BasicDBObject("name", "time_desc"));
//// collNew.createIndex(rankIndexDoc, new BasicDBObject("name", "rank_desc"));
//// collNew.createIndex(nameIndexDoc, new BasicDBObject("name", "name_desc"));
//// collNew.createIndex(typeIndexDoc, new BasicDBObject("name", "type_desc"));
//// } catch (Exception e) {
//// e.printStackTrace();
//// }
//
// DBObject query = new BasicDBObject(new BasicDBObject("time",
// new BasicDBObject("$gte",startDate).append("$lte", endDate)));
// System.out.println(query);
// WriteResult wr = coll.remove(query);
// System.out.println("========"+wr.getN());
//// int i = 0;
//// DBCursor cur = coll.remove(query);
//// System.out.println(query +"======="+ cur.count());
//// List<DBObject> dataList = new ArrayList<>();
//// while(cur.hasNext()) {
//// DBObject doc = cur.next();
//// try {
////// collNew.save(doc);
//// i++;
//// coll.remove(doc);
//// } catch (Exception e2) {
//// e2.printStackTrace();
//// }
//// dataList.add(doc);
//// }
//// System.out.println(collName +"数据量大小" +dataList.size());
//// cur.close();
//// if(!dataList.isEmpty()) {
//// try {
//// collNew.insert(dataList);
//// } catch (Exception e) {
//// e.printStackTrace();
//// }
//// }
// });
// mongo.close();
}
}
......@@ -6,7 +6,9 @@ import java.util.List;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -39,23 +41,24 @@ public class BaiduHotSearchRun extends Thread{
private void getHotList() {
log.info("百度风云榜采集开始........");
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
List<HotSearchList> list = BaiDuHotSearchCrawler.baiduHotSearch();
log.info("{}, 此轮百度风云榜采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> saveDataList = new ArrayList<>();
List<Document> saveDataList = new ArrayList<>();
if(Objects.nonNull(list) && !list.isEmpty()) {
list.forEach(baiduHotSearch ->{
int changeCount = hotSearchDAO.getChangeCount(baiduHotSearch);
DBObject doc = new BasicDBObject();
Document doc = new Document();
doc.put("_id", baiduHotSearch.getId());
doc.put("name", baiduHotSearch.getName());
doc.put("url", baiduHotSearch.getUrl());
doc.put("count", baiduHotSearch.getCount());
doc.put("day", baiduHotSearch.getDay());
doc.put("time", baiduHotSearch.getTime());
doc.put("changeCount", changeCount);
doc.put("rank", baiduHotSearch.getRank());
doc.put("type", baiduHotSearch.getType());
saveDataList.add(doc);
hotSearchCacheDAO.addAndUpdateData(doc);
});
}
hotSearchDAO.addHotSearchList(saveDataList);
......
......@@ -5,7 +5,9 @@ import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -42,23 +44,24 @@ public class DouyinHotSearchRun extends Thread{
private void getHotList() {
log.info("抖音热搜榜采集开始........");
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
List<HotSearchList> list = DouyinHotSearchCrawler.getMobileDouyinHotList();
log.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
List<Document> data = new ArrayList<>();
for(HotSearchList douyinHotSearch : list){
int changeCount = hotSearchDAO.getChangeCount(douyinHotSearch);
DBObject douyin = new BasicDBObject();
Document douyin = new Document();
douyin.put("_id", douyinHotSearch.getId());
douyin.put("name", douyinHotSearch.getName());
douyin.put("rank", douyinHotSearch.getRank());
douyin.put("count", douyinHotSearch.getCount());
douyin.put("hot", douyinHotSearch.getHot());
douyin.put("day", douyinHotSearch.getDay());
douyin.put("time", douyinHotSearch.getTime());
douyin.put("changeCount", changeCount);
douyin.put("url", null);
douyin.put("type", douyinHotSearch.getType());
data.add(douyin);
hotSearchDAO.addHotSearch(douyin);
hotSearchCacheDAO.addAndUpdateData(douyin);
}
log.info("抖音热搜榜采集结束........");
}
......
package com.zhiwei.searchhotcrawler.timer;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.dao.WechatUserDao;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.searchhotcrawler.util.Template;
import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
import com.zhiwei.searchhotcrawler.util.WechatConstant;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class SendWeiboHotSearchRun extends Thread {
private HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
private static WechatUserDao wechatUserDao = new WechatUserDao();
@Override
public void run() {
while (true) {
try {
Calendar calendar = Calendar.getInstance();
int hour = calendar.get(Calendar.HOUR_OF_DAY);
log.info("微博推送,当前系统时间为:" + hour);
if (hour > 6 && hour < 23) {
List<DBObject> list = hotSearchDAO.getHotOneHour(HotSearchType.微博热搜.name());
if (list != null && !list.isEmpty()) {
for (DBObject weibo : list) {
String title = weibo.get("name").toString();
String time = TimeParse.dateFormartString((Date) weibo.get("time"), "yyyy-MM-dd HH:mm:ss");
String url = weibo.get("url").toString();
sendTemplateByUserIds(title, time, url);
}
} else {
log.info("微博最近一小时无数据");
sendTemplateByUserIds("最近一小时无数据",
TimeParse.dateFormartString(new Date(), "yyyy-MM-dd HH:mm:ss"), null);
}
}
ZhiWeiTools.sleep(1 * 60 * 60 * 1000);
} catch (Exception e) {
log.debug("微博热搜推送出现问题,问题为:::{}", e.fillInStackTrace());
ZhiWeiTools.sleep(1 * 60 * 60 * 1000);
continue;
}
}
}
/**
* @Title: sendTemplateByUserIds
* @author hero
* @Description: 发送模版消息
* @param @param
* microTouTiao
* @param @param
* userList 设定文件
* @return void 返回类型
*/
public static void sendTemplateByUserIds(String title, String time, String url) {
Map<String, Object> dataMap = new HashMap<String, Object>();
JSONObject first = new JSONObject();
first.put("value", "您好,有一条来自微博热搜榜的预警通知。");
dataMap.put("first", first);
JSONObject keyword1 = new JSONObject();
keyword1.put("value", title);
keyword1.put("color", "#173177");
dataMap.put("keyword1", keyword1);
JSONObject keyword2 = new JSONObject();
keyword2.put("value", "微博热搜榜");
keyword2.put("color", "#173177");
dataMap.put("keyword2", keyword2);
JSONObject keyword3 = new JSONObject();
keyword3.put("value", time);
keyword3.put("color", "#173177");
dataMap.put("keyword3", keyword3);
JSONObject remark = new JSONObject();
remark.put("value", "知微情报监测服务");
dataMap.put("remark", remark);
List<String> userList = getUserList();
if (userList != null && userList.size() > 0) {
for (String openId : userList) {
Template template = new Template();
template.setTouser(openId);
if (url != null) {
template.setUrl(url);
}
template.setTemplate_id(WechatConstant.WECHAT_TEMPLATEID_EARLY_IT);
template.setData(dataMap);
JSONObject templateJson = (JSONObject) JSONObject.toJSON(template);
WechatCodeUtil.sendDataJson(templateJson);
}
} else {
log.info("拉取微博用户列表失败");
}
}
/**
* @Title: getUserList
* @author hero
* @Description: 用户列表
* @param @param
* projectName
* @param @return
* 设定文件
* @return List<String> 返回类型
*/
public static List<String> getUserList() {
List<String> userList = wechatUserDao.getWechatUserByGroup("weibohot");
if(userList==null){
userList = WechatCodeUtil.getUserListByGroupName("weibohot");
}
return userList;
}
}
//package com.zhiwei.searchhotcrawler.timer;
//
//import java.util.Calendar;
//import java.util.Date;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//
//import lombok.extern.log4j.Log4j2;
//
//import com.alibaba.fastjson.JSONObject;
//import com.mongodb.DBObject;
//import com.zhiwei.searchhotcrawler.bean.HotSearchType;
//import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
//import com.zhiwei.searchhotcrawler.util.Template;
//import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
//import com.zhiwei.searchhotcrawler.util.WechatConstant;
//import com.zhiwei.tools.timeparse.TimeParse;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//@Log4j2
//public class SendWeiboHotSearchRun extends Thread {
// private HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
// private static WechatUserDao wechatUserDao = new WechatUserDao();
// @Override
// public void run() {
// while (true) {
// try {
// Calendar calendar = Calendar.getInstance();
// int hour = calendar.get(Calendar.HOUR_OF_DAY);
// log.info("微博推送,当前系统时间为:" + hour);
// if (hour > 6 && hour < 23) {
// List<DBObject> list = hotSearchDAO.getHotOneHour(HotSearchType.微博热搜.name());
// if (list != null && !list.isEmpty()) {
// for (DBObject weibo : list) {
// String title = weibo.get("name").toString();
// String time = TimeParse.dateFormartString((Date) weibo.get("time"), "yyyy-MM-dd HH:mm:ss");
// String url = weibo.get("url").toString();
// sendTemplateByUserIds(title, time, url);
// }
// } else {
// log.info("微博最近一小时无数据");
// sendTemplateByUserIds("最近一小时无数据",
// TimeParse.dateFormartString(new Date(), "yyyy-MM-dd HH:mm:ss"), null);
// }
// }
// ZhiWeiTools.sleep(1 * 60 * 60 * 1000);
// } catch (Exception e) {
// log.debug("微博热搜推送出现问题,问题为:::{}", e.fillInStackTrace());
// ZhiWeiTools.sleep(1 * 60 * 60 * 1000);
// continue;
// }
// }
// }
//
// /**
// * @Title: sendTemplateByUserIds
// * @author hero
// * @Description: 发送模版消息
// * @param @param
// * microTouTiao
// * @param @param
// * userList 设定文件
// * @return void 返回类型
// */
// public static void sendTemplateByUserIds(String title, String time, String url) {
// Map<String, Object> dataMap = new HashMap<String, Object>();
// JSONObject first = new JSONObject();
// first.put("value", "您好,有一条来自微博热搜榜的预警通知。");
// dataMap.put("first", first);
// JSONObject keyword1 = new JSONObject();
// keyword1.put("value", title);
// keyword1.put("color", "#173177");
// dataMap.put("keyword1", keyword1);
// JSONObject keyword2 = new JSONObject();
// keyword2.put("value", "微博热搜榜");
// keyword2.put("color", "#173177");
// dataMap.put("keyword2", keyword2);
// JSONObject keyword3 = new JSONObject();
// keyword3.put("value", time);
// keyword3.put("color", "#173177");
// dataMap.put("keyword3", keyword3);
// JSONObject remark = new JSONObject();
// remark.put("value", "知微情报监测服务");
// dataMap.put("remark", remark);
// List<String> userList = getUserList();
// if (userList != null && userList.size() > 0) {
// for (String openId : userList) {
// Template template = new Template();
// template.setTouser(openId);
// if (url != null) {
// template.setUrl(url);
// }
// template.setTemplate_id(WechatConstant.WECHAT_TEMPLATEID_EARLY_IT);
// template.setData(dataMap);
//
// JSONObject templateJson = (JSONObject) JSONObject.toJSON(template);
// WechatCodeUtil.sendDataJson(templateJson);
// }
// } else {
// log.info("拉取微博用户列表失败");
// }
// }
//
// /**
// * @Title: getUserList
// * @author hero
// * @Description: 用户列表
// * @param @param
// * projectName
// * @param @return
// * 设定文件
// * @return List<String> 返回类型
// */
// public static List<String> getUserList() {
// List<String> userList = wechatUserDao.getWechatUserByGroup("weibohot");
// if(userList==null){
// userList = WechatCodeUtil.getUserListByGroupName("weibohot");
// }
// return userList;
// }
//}
package com.zhiwei.searchhotcrawler.timer;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.searchhotcrawler.dao.WechatUserDao;
import com.zhiwei.searchhotcrawler.util.Template;
import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
import com.zhiwei.searchhotcrawler.util.WechatConstant;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class SendZhihuHotSearchRun extends Thread{
private HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
private static WechatUserDao wechatUserDao = new WechatUserDao();
@Override
public void run() {
while(true) {
try {
Calendar calendar = Calendar.getInstance();
int hour = calendar.get(Calendar.HOUR_OF_DAY);
log.info("知乎推送,当前系统时间为:"+hour);
if(hour > 6 && hour <23){
List<DBObject> list = hotSearchDAO.getHotOneHour(HotSearchType.知乎热搜.name());
if(list!=null && !list.isEmpty()){
for(DBObject zhihu : list){
String title = zhihu.get("display_query").toString();
String time = TimeParse.dateFormartString((Date)zhihu.get("time"), "yyyy-MM-dd HH:mm:ss");
String url = zhihu.get("_id").toString();
if(calendar.get(Calendar.HOUR_OF_DAY) > 6 && calendar.get(Calendar.HOUR_OF_DAY) < 23){
sendTemplateByUserIds(title, time, url);
}
}
}else{
log.info("知乎最近一小时无数据");
sendTemplateByUserIds("最近一小时无数据", TimeParse.dateFormartString(new Date(), "yyyy-MM-dd HH:mm:ss"), null);
}
}
ZhiWeiTools.sleep(1*60*60*1000);
} catch (Exception e) {
log.debug("知乎热搜推送出现问题,问题为:::{}",e.fillInStackTrace());
ZhiWeiTools.sleep(1*60*60*1000);
}
}
}
/**
* @Title: sendTemplateByUserIds
* @author hero
* @Description: 发送模版消息
* @param @param microTouTiao
* @param @param userList 设定文件
* @return void 返回类型
*/
public static void sendTemplateByUserIds(String title,String time, String url) {
Map<String, Object> dataMap = new HashMap<>();
JSONObject first = new JSONObject();
first.put("value", "您好,有一条来自知乎热搜榜的预警通知。");
dataMap.put("first", first);
JSONObject keyword1 = new JSONObject();
keyword1.put("value", title);
keyword1.put("color", "#173177");
dataMap.put("keyword1", keyword1);
JSONObject keyword2 = new JSONObject();
keyword2.put("value", "知乎热搜榜");
keyword2.put("color", "#173177");
dataMap.put("keyword2", keyword2);
JSONObject keyword3 = new JSONObject();
keyword3.put("value", time);
keyword3.put("color", "#173177");
dataMap.put("keyword3", keyword3);
JSONObject remark = new JSONObject();
remark.put("value", "知微情报监测服务");
dataMap.put("remark", remark);
List<String> userList = getUserList();
if(userList!=null && !userList.isEmpty()) {
for (String openId : userList) {
Template template = new Template();
template.setTouser(openId);
if(url!=null){
template.setUrl(url);
}
template.setTemplate_id(WechatConstant.WECHAT_TEMPLATEID_EARLY_IT);
template.setData(dataMap);
JSONObject templateJson = (JSONObject)JSONObject.toJSON(template);
WechatCodeUtil.sendDataJson(templateJson);
}
}else {
log.info("知乎推送拉取用户列表失败");
}
}
/**
* @Title: getUserList
* @author hero
* @Description: 用户列表
* @param @param projectName
* @param @return 设定文件
* @return List<String> 返回类型
*/
private static List<String> getUserList()
{
List<String> userList = wechatUserDao.getWechatUserByGroup("LP组");
if(userList==null){
userList = WechatCodeUtil.getUserListByGroupName("LP组");
}
return userList;
}
}
//package com.zhiwei.searchhotcrawler.timer;
//
//import java.util.Calendar;
//import java.util.Date;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//
//import lombok.extern.log4j.Log4j2;
//
//import com.alibaba.fastjson.JSONObject;
//import com.mongodb.DBObject;
//import com.zhiwei.searchhotcrawler.bean.HotSearchType;
//import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
//import com.zhiwei.searchhotcrawler.util.Template;
//import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
//import com.zhiwei.searchhotcrawler.util.WechatConstant;
//import com.zhiwei.tools.timeparse.TimeParse;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//@Log4j2
//public class SendZhihuHotSearchRun extends Thread{
// private HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
// private static WechatUserDao wechatUserDao = new WechatUserDao();
// @Override
// public void run() {
//
// while(true) {
// try {
// Calendar calendar = Calendar.getInstance();
// int hour = calendar.get(Calendar.HOUR_OF_DAY);
// log.info("知乎推送,当前系统时间为:"+hour);
// if(hour > 6 && hour <23){
// List<DBObject> list = hotSearchDAO.getHotOneHour(HotSearchType.知乎热搜.name());
// if(list!=null && !list.isEmpty()){
// for(DBObject zhihu : list){
// String title = zhihu.get("display_query").toString();
// String time = TimeParse.dateFormartString((Date)zhihu.get("time"), "yyyy-MM-dd HH:mm:ss");
// String url = zhihu.get("_id").toString();
// if(calendar.get(Calendar.HOUR_OF_DAY) > 6 && calendar.get(Calendar.HOUR_OF_DAY) < 23){
// sendTemplateByUserIds(title, time, url);
// }
// }
// }else{
// log.info("知乎最近一小时无数据");
// sendTemplateByUserIds("最近一小时无数据", TimeParse.dateFormartString(new Date(), "yyyy-MM-dd HH:mm:ss"), null);
// }
// }
// ZhiWeiTools.sleep(1*60*60*1000);
// } catch (Exception e) {
// log.debug("知乎热搜推送出现问题,问题为:::{}",e.fillInStackTrace());
// ZhiWeiTools.sleep(1*60*60*1000);
// }
// }
// }
//
// /**
// * @Title: sendTemplateByUserIds
// * @author hero
// * @Description: 发送模版消息
// * @param @param microTouTiao
// * @param @param userList 设定文件
// * @return void 返回类型
// */
// public static void sendTemplateByUserIds(String title,String time, String url) {
//
// Map<String, Object> dataMap = new HashMap<>();
// JSONObject first = new JSONObject();
// first.put("value", "您好,有一条来自知乎热搜榜的预警通知。");
// dataMap.put("first", first);
// JSONObject keyword1 = new JSONObject();
// keyword1.put("value", title);
// keyword1.put("color", "#173177");
// dataMap.put("keyword1", keyword1);
// JSONObject keyword2 = new JSONObject();
// keyword2.put("value", "知乎热搜榜");
// keyword2.put("color", "#173177");
// dataMap.put("keyword2", keyword2);
// JSONObject keyword3 = new JSONObject();
// keyword3.put("value", time);
// keyword3.put("color", "#173177");
// dataMap.put("keyword3", keyword3);
// JSONObject remark = new JSONObject();
// remark.put("value", "知微情报监测服务");
// dataMap.put("remark", remark);
//
// List<String> userList = getUserList();
// if(userList!=null && !userList.isEmpty()) {
// for (String openId : userList) {
// Template template = new Template();
// template.setTouser(openId);
// if(url!=null){
// template.setUrl(url);
// }
// template.setTemplate_id(WechatConstant.WECHAT_TEMPLATEID_EARLY_IT);
// template.setData(dataMap);
//
// JSONObject templateJson = (JSONObject)JSONObject.toJSON(template);
// WechatCodeUtil.sendDataJson(templateJson);
// }
// }else {
// log.info("知乎推送拉取用户列表失败");
// }
//
// }
//
// /**
// * @Title: getUserList
// * @author hero
// * @Description: 用户列表
// * @param @param projectName
// * @param @return 设定文件
// * @return List<String> 返回类型
// */
//// private static List<String> getUserList()
//// {
//// List<String> userList = wechatUserDao.getWechatUserByGroup("LP组");
//// if(userList==null){
//// userList = WechatCodeUtil.getUserListByGroupName("LP组");
//// }
//// return userList;
//// }
//
//}
......@@ -5,7 +5,9 @@ import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -37,12 +39,13 @@ public class SougoHotSearchRun extends Thread {
private void getHotList() {
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
log.info("搜狗微信采集开始........");
List<HotSearchList> list = SougoHotSearchCrawler.sougoHotSearch();
log.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
List<Document> data = new ArrayList<>();
for(HotSearchList sougoHotSearch : list){
DBObject doc = new BasicDBObject();
Document doc = new Document();
doc.put("_id", sougoHotSearch.getId());
doc.put("name", sougoHotSearch.getName());
doc.put("url", sougoHotSearch.getUrl());
......@@ -51,6 +54,7 @@ public class SougoHotSearchRun extends Thread {
doc.put("rank", sougoHotSearch.getRank());
doc.put("type", sougoHotSearch.getType());
data.add(doc);
hotSearchCacheDAO.addAndUpdateData(doc);
}
hotSearchDAO.addHotSearchList(data);
log.info("搜狗微信采集结束........");
......
package com.zhiwei.searchhotcrawler.timer;
import java.util.Calendar;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.searchhotcrawler.dao.WechatUserDao;
import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class UpdateWechatUserRun extends Thread{
private WechatUserDao wechatUserDao = new WechatUserDao();
@Override
public void run() {
log.info("开始更新用户数据");
while(true) {
try {
Calendar calendar = Calendar.getInstance();
int hour = calendar.get(Calendar.HOUR_OF_DAY);
if(hour > 6 ){
Map<String,Integer> groupMap = WechatCodeUtil.getAllGroupIp();
log.info("此公众号的分组数量为:::{}", groupMap.size());
if(!groupMap.isEmpty() && groupMap!=null){
for(Entry<String,Integer> group : groupMap.entrySet()){
log.info("此公众号的分组名称及IP为:::{},{}", group.getKey(), group.getValue());
List<String> userList = WechatCodeUtil.getUserListByGroupId(group.getValue());
log.info("{},此分组下的用户数量为::{}", group.getKey(), userList.size());
if(userList!=null && !userList.isEmpty()){
wechatUserDao.addWechatUser(userList, group.getKey(), group.getValue());
}
}
}
}
ZhiWeiTools.sleep(1*60*60*1000);
} catch (Exception e) {
log.debug("知乎热搜推送出现问题,问题为:::{}",e.fillInStackTrace());
ZhiWeiTools.sleep(1*60*60*1000);
continue;
}
}
}
}
//package com.zhiwei.searchhotcrawler.timer;
//
//import java.util.Calendar;
//import java.util.List;
//import java.util.Map;
//import java.util.Map.Entry;
//
//import lombok.extern.log4j.Log4j2;
//
//import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//@Log4j2
//public class UpdateWechatUserRun extends Thread{
// private WechatUserDao wechatUserDao = new WechatUserDao();
// @Override
// public void run() {
// log.info("开始更新用户数据");
// while(true) {
// try {
// Calendar calendar = Calendar.getInstance();
// int hour = calendar.get(Calendar.HOUR_OF_DAY);
// if(hour > 6 ){
// Map<String,Integer> groupMap = WechatCodeUtil.getAllGroupIp();
// log.info("此公众号的分组数量为:::{}", groupMap.size());
// if(!groupMap.isEmpty() && groupMap!=null){
// for(Entry<String,Integer> group : groupMap.entrySet()){
// log.info("此公众号的分组名称及IP为:::{},{}", group.getKey(), group.getValue());
// List<String> userList = WechatCodeUtil.getUserListByGroupId(group.getValue());
// log.info("{},此分组下的用户数量为::{}", group.getKey(), userList.size());
// if(userList!=null && !userList.isEmpty()){
// wechatUserDao.addWechatUser(userList, group.getKey(), group.getValue());
// }
// }
// }
// }
// ZhiWeiTools.sleep(1*60*60*1000);
// } catch (Exception e) {
// log.debug("知乎热搜推送出现问题,问题为:::{}",e.fillInStackTrace());
// ZhiWeiTools.sleep(1*60*60*1000);
// continue;
// }
// }
// }
//
//
//}
......@@ -5,16 +5,13 @@ import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.bson.Document;
@Log4j2
public class WeiboHotSearchRun extends Thread{
......@@ -38,13 +35,13 @@ public class WeiboHotSearchRun extends Thread{
private void getHotList() {
log.info("微博话题采集开始........");
HotSearchListDAO weiboHotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearch();
List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearchByPhone();
log.info("{}, 微博此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
List<Document> data = new ArrayList<>();
for(HotSearchList weiboHotSearch : list){
int changeCount = weiboHotSearchDAO.getChangeCount(weiboHotSearch);
DBObject doc = new BasicDBObject();
Document doc = new Document();
doc.put("_id", weiboHotSearch.getId());
doc.put("name", weiboHotSearch.getName());
doc.put("url", weiboHotSearch.getUrl());
......@@ -52,11 +49,11 @@ public class WeiboHotSearchRun extends Thread{
doc.put("hot", weiboHotSearch.getHot());
doc.put("day", weiboHotSearch.getDay());
doc.put("time", weiboHotSearch.getTime());
doc.put("changeCount", changeCount);
doc.put("rank", weiboHotSearch.getRank());
doc.put("type", weiboHotSearch.getType());
doc.put("icon", weiboHotSearch.getIcon());
data.add(doc);
hotSearchCacheDAO.addAndUpdateData(doc);
}
weiboHotSearchDAO.addHotSearchList(data);
log.info("微博话题采集结束........");
......
......@@ -9,11 +9,7 @@ import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
import com.zhiwei.searchhotcrawler.crawler.WeiboSuperTopicCrawler;
import com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import org.bson.Document;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
......@@ -40,10 +36,10 @@ public class WeiboSuperTopicRun extends Thread{
log.info("微博超话采集开始........");
List<WeiboSuperTopic> list = WeiboSuperTopicCrawler.startCrawler();
log.info("{}, 微博超话此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
List<Document> data = new ArrayList<>();
for(WeiboSuperTopic topic : list){
log.info("topic::::{}", topic);
DBObject doc = new BasicDBObject();
Document doc = new Document();
doc.put("_id", topic.getId());
doc.put("name", topic.getTopicName());
doc.put("rank", topic.getRank());
......
package com.zhiwei.searchhotcrawler.timer;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.WeiboTopicCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
......@@ -32,12 +33,13 @@ public class WeiboTopicRun extends Thread{
private void getTopicList() {
HotSearchListDAO weiboHotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
log.info("微博话题采集开始........");
List<HotSearchList> list = WeiboTopicCrawler.startCrawlerByPhone();
log.info("{}, 微博话题此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
List<Document> data = new ArrayList<>();
for(HotSearchList topic : list){
DBObject doc = new BasicDBObject();
Document doc = new Document();
doc.put("_id", topic.getId());
doc.put("name", topic.getName());
doc.put("url", topic.getUrl());
......@@ -50,6 +52,7 @@ public class WeiboTopicRun extends Thread{
doc.put("topic_lead", topic.getTopicLead());
doc.put("comment_count", topic.getCommentCount());
data.add(doc);
hotSearchCacheDAO.addAndUpdateData(doc);
}
weiboHotSearchDAO.addHotSearchList(data);
log.info("微博话题采集结束........");
......
package com.zhiwei.searchhotcrawler.timer;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -36,13 +39,15 @@ public class ZhihuHotSearchRun extends Thread{
private void getHotList() {
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
log.info("知乎话题采集开始...,当前线程名字:{}", Thread.currentThread().getName());
List<HotSearchList> list = ZhihuHotSearchCrawler.getZhihuHotList();
List<HotSearchList> mobilelist = ZhihuHotSearchCrawler.getMobileZhihuHotList();
list.addAll(mobilelist);
log.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<Document> dataList = new ArrayList<>();
for(HotSearchList zhihuHotSearch : list){
DBObject zhihu = new BasicDBObject();
Document zhihu = new Document();
zhihu.put("_id", zhihuHotSearch.getId());
zhihu.put("name", zhihuHotSearch.getName());
zhihu.put("url", zhihuHotSearch.getUrl());
......@@ -50,11 +55,12 @@ public class ZhihuHotSearchRun extends Thread{
zhihu.put("hot", zhihuHotSearch.getHot());
zhihu.put("day", zhihuHotSearch.getDay());
zhihu.put("time", zhihuHotSearch.getTime());
zhihu.put("changeCount", 0);
zhihu.put("rank", zhihuHotSearch.getRank());
zhihu.put("type", zhihuHotSearch.getType());
hotSearchDAO.addHotSearch(zhihu);
dataList.add(zhihu);
hotSearchCacheDAO.addAndUpdateData(zhihu);
}
hotSearchDAO.addHotSearchList(dataList);
log.info("知乎话题采集结束........");
}
......
#mongoIp=202.107.192.94
mongoIp=192.168.0.101
mongoPort=30000
#mongoIp=192.168.0.81
#mongoPort=27017
db.username=searchhotcrawleruser
db.paasword=searchhotcrawler1q2w3e4r
db.certifiedDB=admin
#local service
#mongoUri=mongodb://searchhotcrawleruser:searchhotcrawler1q2w3e4r@202.107.192.94:30000/istarshine_data?authSource=admin&authMechanism=SCRAM-SHA-1
#local
#mongoUri=mongodb://192.168.0.81:27017/istarshine_data
#service
mongoUri=mongodb://istarshineuser:istarshine1q2w3e4r@192.168.0.101:30000,192.168.0.106:30000,192.168.0.108:30000/istarshine_data?authSource=admin&authMechanism=SCRAM-SHA-1
dbName=hot_search_list
searchCollName=hot_search_list
searchCacheCollName=hot_search_cache
topicCollName=topic_list
collWechatUserName=wechat_user
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment