Commit 33f69a5d by zhiwei

采集调整为每分钟采集一次

parent bc9cabb1
......@@ -7,7 +7,6 @@ import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import javax.print.Doc;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
......@@ -135,31 +134,32 @@ public class HotSearchCacheDAO {
* @return
*/
private int getDuration(String type, int duration){
    // Per the change description for this revision, every source is now
    // collected once per minute, so one observation always adds one minute.
    // The former per-type table (1/5/10/10/5/3/1 minutes) no longer applies.
    // `type` is intentionally unused; it stays in the signature so existing
    // callers keep compiling.
    return duration + 1;
}
......@@ -171,32 +171,32 @@ public class HotSearchCacheDAO {
* @return
*/
private Date getEndTime(String type, Date time){
    // All sources are sampled once per minute, so an observation taken at
    // `time` is considered valid until one minute later, whatever its type.
    // The earlier per-type offsets (1/5/10/10/5/3 minutes) and the second,
    // conflicting declaration of the local they fed were removed.
    // `type` is intentionally unused and kept only for signature compatibility.
    final long oneMinuteMillis = 60 * 1000L;
    return new Date(time.getTime() + oneMinuteMillis);
}
......
package com.zhiwei.searchhotcrawler.dbtemplate;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientOptions;
import com.mongodb.MongoClientURI;
import com.mongodb.WriteConcern;
import com.mongodb.client.ListIndexesIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.IndexOptions;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import java.util.Objects;
@Log4j2
public enum MongoDBLocalTemplate {
instance;
private MongoClient mongoClient;
static {
MongoClientOptions options = new MongoClientOptions.Builder()
.connectionsPerHost(300) //连接池设置为300个连接,默认为100
.connectTimeout(15000) //连接超时,推荐>3000毫秒
.maxWaitTime(5000)
.socketTimeout(0) // 套接字超时时间,0无限制
.threadsAllowedToBlockForConnectionMultiplier(5000) // 线程队列数,如果连接线程排满了队列就会抛出“Out of semaphores to get db”错误。
.writeConcern(WriteConcern.W1) //
.build();
log.info("MongoDBTemplate.static initializer : {}", DBConfig.mongoLocalUri);
MongoClientURI mongoClientURI = new MongoClientURI(DBConfig.mongoLocalUri);
instance.mongoClient = new MongoClient(mongoClientURI);
}
/**
* 获取DB实例 - 指定DB
*
* @param databaseName
* @return
*/
public static MongoDatabase getDB(String databaseName) {
return instance.mongoClient.getDatabase(databaseName);
}
/**
* 获取collection对象 - 指定Collection
*
* @param databaseName
* @param collectionName
* @return
*/
public static MongoCollection<Document> getCollection(String databaseName, String collectionName) {
MongoDatabase db = instance.mongoClient.getDatabase(databaseName);
return db.getCollection(collectionName);
}
/**
* 创建索引
* @param databaseName
* @param collectionName
*/
public static void createIndex(String databaseName, String collectionName){
MongoDatabase db = instance.mongoClient.getDatabase(databaseName);
MongoCollection mongoCollection = db.getCollection(collectionName);
ListIndexesIterable<Document> indexList = mongoCollection.listIndexes();
if(Objects.isNull(indexList)){
Document countIndexDoc = new Document();
if(collectionName.contains("hot_search_list")){
countIndexDoc.put("count", -1);
}else{
countIndexDoc.put("score_num", -1);
}
Document timeIndexDoc = new Document();
timeIndexDoc.put("time", -1);
Document rankIndexDoc = new Document();
rankIndexDoc.put("rank", -1);
Document nameIndexDoc = new Document();
nameIndexDoc.put("name", -1);
Document typeIndexDoc = new Document();
typeIndexDoc.put("type", -1);
try {
mongoCollection.createIndex(countIndexDoc, new IndexOptions().name("count_desc"));
mongoCollection.createIndex(timeIndexDoc, new IndexOptions().name("time_desc"));
mongoCollection.createIndex(rankIndexDoc, new IndexOptions().name("rank_desc"));
mongoCollection.createIndex(nameIndexDoc, new IndexOptions().name( "name_desc"));
mongoCollection.createIndex(typeIndexDoc, new IndexOptions().name( "type_desc"));
} catch (Exception e) {
e.printStackTrace();
}
}
}
}
//package com.zhiwei.searchhotcrawler.test;
//
//import com.mongodb.client.MongoCollection;
//import com.mongodb.client.MongoCursor;
//import com.mongodb.client.MongoDatabase;
//import com.zhiwei.searchhotcrawler.config.DBConfig;
//import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
//import com.zhiwei.tools.timeparse.TimeParse;
//import lombok.extern.log4j.Log4j2;
//import org.bson.Document;
//
//import java.util.*;
//
//@Log4j2
//public class TopicTest {
//
// private static MongoDatabase mongoDB = MongoDBTemplate.getDB(DBConfig.dbName);
//
// public static void main(String[] args) {
//// repairTopic();
//
package com.zhiwei.searchhotcrawler.test;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBLocalTemplate;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.tools.timeparse.TimeParse;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import java.util.*;
@Log4j2
public class TopicTest {
private static MongoDatabase mongoDB = MongoDBTemplate.getDB(DBConfig.dbName);
private static MongoDatabase mongoDBLocal = MongoDBLocalTemplate.getDB(DBConfig.dbName);
/**
 * Entry point for the one-off maintenance tasks in this class.
 * It currently runs {@code repairHotType()}; the call to
 * {@code updateHotSearchCache()} is left disabled and has to be enabled by hand.
 *
 * @param args ignored
 */
public static void main(String[] args) {
repairHotType();
// updateHotSearchCache();
// A fully commented-out earlier revision of this class (repairTopic,
// updateHotSearchCache and getDuration) used to be embedded at this spot;
// it was dead text and has been dropped.
}
/**
 * Copies every document whose {@code time} is later than the hard-coded
 * cut-off (2020-06-17 12:59:59) from the remote {@code hot_search_list2020_06}
 * collection into the collection of the same name in the local database.
 *
 * <p>Documents are streamed through a single cursor. The earlier paged
 * variant iterated {@code page < pages}, which skipped the final page (and
 * copied nothing at all for 10000 matches or fewer), and it paged with
 * {@code time > lastSeen}, which dropped documents sharing the timestamp of a
 * page's last document.
 *
 * <p>Inserts stay best-effort: a document that cannot be inserted is counted
 * and logged at debug level, and the copy continues.
 * NOTE(review): presumably these failures are duplicate {@code _id}s from an
 * earlier run — confirm.
 */
public static void repairHotType(){
    try {
        for (int month = 6; month <= 6; month++) {
            String collectionName = String.format("hot_search_list2020_%02d", month);
            MongoCollection<Document> source = mongoDB.getCollection(collectionName);
            MongoCollection<Document> target = mongoDBLocal.getCollection(collectionName);

            Date cutoff = TimeParse.stringFormartDate("2020-06-17 12:59:59");
            Document query = new Document("time", new Document("$gt", cutoff));
            long count = source.countDocuments(query);
            log.info("collection {} : {} documents match {}", collectionName, count, query);

            long copied = 0;
            long skipped = 0;
            // try-with-resources closes the cursor even when iteration fails.
            try (MongoCursor<Document> cursor = source.find(query).iterator()) {
                while (cursor.hasNext()) {
                    Document document = cursor.next();
                    try {
                        target.insertOne(document);
                        copied++;
                    } catch (Exception e) {
                        skipped++;
                        log.debug("skipped document {} : {}", document.get("_id"), e.getMessage());
                    }
                }
            }
            log.info("collection {} : copied {} , skipped {}", collectionName, copied, skipped);
        }
    } catch (Exception e) {
        log.error("repairHotType failed", e);
    }
}
/**
 * Rebuilds per-topic summaries from the remote monthly collections
 * {@code hot_search_list2019_03} .. {@code hot_search_list2019_12} and merges
 * them into the local {@code hot_search_cache} collection.
 *
 * <p>For each month every snapshot is read in ascending {@code time} order and
 * folded into one summary per {@code name + "_" + type}; the summaries are
 * then inserted into, or merged with, the cache.
 *
 * <p>Fixes compared with the earlier version:
 * <ul>
 *   <li>snapshots are streamed through one sorted cursor; the paged loop
 *       ({@code page < pages}, {@code time > lastSeen}) skipped the last page
 *       and any snapshot sharing the timestamp of a page's last row;</li>
 *   <li>the stored duration is read from the cached document (it was read from
 *       the new summary, which doubled the new duration instead of adding the
 *       stored one);</li>
 *   <li>highest rank / count comparisons no longer unbox a {@code null} and a
 *       missing running value is replaced by the first usable one.</li>
 * </ul>
 *
 * NOTE(review): the unbounded sort relies on the source collections having an
 * index on {@code time}; without one the server may reject the sort — confirm.
 */
public static void updateHotSearchCache(){
    MongoCollection<Document> cache = mongoDBLocal.getCollection("hot_search_cache");
    for (int month = 3; month <= 12; month++) {
        String collectionName = String.format("hot_search_list2019_%02d", month);
        log.info("collectionName is {}", collectionName);
        MongoCollection<Document> source = mongoDB.getCollection(collectionName);
        log.info("count====={}", source.countDocuments());

        // Summaries for this month, keyed by "<name>_<type>".
        Map<String, Document> resultMap = new HashMap<>();
        try (MongoCursor<Document> cursor = source.find().sort(new Document("time", 1)).iterator()) {
            while (cursor.hasNext()) {
                accumulateSnapshot(resultMap, cursor.next());
            }
        }

        log.info("list size is {}", resultMap.size());
        for (Document summary : resultMap.values()) {
            mergeIntoCache(cache, summary);
        }
    }
}

/** Folds one raw snapshot into the in-memory summary of its topic, creating the summary on first sight. */
private static void accumulateSnapshot(Map<String, Document> resultMap, Document snapshot) {
    String name = snapshot.getString("name");
    String type = snapshot.getString("type");
    Integer rank = snapshot.getInteger("rank");
    Integer count = snapshot.getInteger("count");
    Date startTime = snapshot.getDate("time");
    Date endTime = new Date(startTime.getTime() + (60 * 1000));
    String id = name + "_" + type;

    Document summary = resultMap.get(id);
    if (summary == null) {
        Boolean hot = snapshot.getBoolean("hot");
        summary = new Document();
        summary.put("_id", id);
        summary.put("url", snapshot.getString("url"));
        summary.put("name", name);
        summary.put("hot", hot == null || hot); // a missing flag counts as hot
        summary.put("topicLead", snapshot.getString("topic_lead"));
        summary.put("type", type);
        summary.put("lastRank", rank);
        summary.put("highestRank", rank);
        summary.put("lastCount", count);
        summary.put("highestCount", count);
        summary.put("startTime", startTime);
        summary.put("endTime", endTime);
        summary.put("duration", getDuration(type, 0));
        summary.put("preRank", null);
        summary.put("preCount", null);
        resultMap.put(id, summary);
        return;
    }

    // Read the previous values before they are overwritten below.
    Integer previousRank = summary.getInteger("lastRank");
    Integer previousCount = summary.getInteger("lastCount");
    Integer highestRank = betterRank(summary.getInteger("highestRank"), rank);
    Integer highestCount = higherCount(summary.getInteger("highestCount"), count);
    int duration = getDuration(type, summary.getInteger("duration"));

    summary.put("endTime", endTime);
    summary.put("lastRank", rank);
    summary.put("lastCount", count);
    summary.put("preRank", previousRank);
    summary.put("preCount", previousCount);
    summary.put("highestRank", highestRank);
    summary.put("highestCount", highestCount);
    summary.put("duration", duration);
}

/** Inserts the summary into the cache, or merges it with the cached document that has the same {@code _id}. */
private static void mergeIntoCache(MongoCollection<Document> cache, Document summary) {
    Document query = new Document("_id", summary.get("_id"));
    Document stored = cache.find(query).first();
    if (stored == null) {
        cache.insertOne(summary);
        return;
    }

    Integer highestRank = betterRank(summary.getInteger("highestRank"), stored.getInteger("highestRank"));
    Integer highestCount = higherCount(summary.getInteger("highestCount"), stored.getInteger("highestCount"));
    // Total duration = what the cache already holds + what this run observed.
    int duration = summary.getInteger("duration") + stored.getInteger("duration", 0);

    stored.put("endTime", summary.getDate("endTime"));
    stored.put("lastRank", summary.getInteger("lastRank"));
    stored.put("lastCount", summary.getInteger("lastCount"));
    stored.put("highestRank", highestRank);
    stored.put("highestCount", highestCount);
    stored.put("duration", duration);
    stored.put("preRank", summary.getInteger("preRank"));
    stored.put("preCount", summary.getInteger("preCount"));
    cache.findOneAndReplace(query, stored);
}

/** Returns the numerically smaller (better) of two ranks; a {@code null} or non-positive candidate is ignored. */
private static Integer betterRank(Integer current, Integer candidate) {
    if (candidate == null || candidate <= 0) {
        return current;
    }
    return (current == null || candidate < current) ? candidate : current;
}

/** Returns the larger of two heat counts; a {@code null} or non-positive candidate is ignored. */
private static Integer higherCount(Integer current, Integer candidate) {
    if (candidate == null || candidate <= 0) {
        return current;
    }
    return (current == null || candidate > current) ? candidate : current;
}
/**
 * Adds the sampling interval of one observation to an accumulated duration.
 *
 * @param type     hot-search source name; selects the interval
 * @param duration duration accumulated so far
 * @return {@code duration} plus 10 for "抖音热搜", 5 for "搜狗微信热搜",
 *         3 for "微博话题", and 1 for every other type
 */
private static int getDuration(String type, int duration){
    switch (type) {
        case "抖音热搜":
            return duration + 10;
        case "搜狗微信热搜":
            return duration + 5;
        case "微博话题":
            return duration + 3;
        default:
            // "微博热搜", "百度热搜", "知乎热搜" and any unlisted type
            return duration + 1;
    }
}
}
......@@ -28,7 +28,7 @@ public class BaiduHotSearchRun extends Thread{
while(f) {
try {
getHotList();
TimeUnit.MINUTES.sleep(5);
TimeUnit.MINUTES.sleep(1);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*60*1000);
......
......@@ -27,7 +27,7 @@ public class DouyinHotSearchRun extends Thread{
while(f) {
try {
getHotList();
TimeUnit.MINUTES.sleep(10);
TimeUnit.MINUTES.sleep(1);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*60*1000);
......
......@@ -27,7 +27,7 @@ public class SougoHotSearchRun extends Thread {
while(f) {
try {
getHotList();
TimeUnit.MINUTES.sleep(5);
TimeUnit.MINUTES.sleep(1);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*60*1000);
......
......@@ -21,7 +21,7 @@ public class WeiboTopicRun extends Thread{
while(f) {
try {
getTopicList();
TimeUnit.MINUTES.sleep(3);
TimeUnit.MINUTES.sleep(1);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*60*1000);
......
......@@ -27,7 +27,7 @@ public class ZhihuHotSearchRun extends Thread{
while(f) {
try {
getHotList();
TimeUnit.MINUTES.sleep(10);
TimeUnit.MINUTES.sleep(1);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*60*1000);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment