Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
33f69a5d
Commit
33f69a5d
authored
Jun 17, 2020
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
采集调整为每分钟采集一次
parent
bc9cabb1
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
398 additions
and
270 deletions
+398
-270
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+51
-51
src/main/java/com/zhiwei/searchhotcrawler/dbtemplate/MongoDBLocalTemplate.java
+97
-0
src/main/java/com/zhiwei/searchhotcrawler/test/TopicTest.java
+245
-214
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
+1
-1
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
+1
-1
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
+1
-1
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboTopicRun.java
+1
-1
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
+1
-1
No files found.
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
View file @
33f69a5d
...
...
@@ -7,7 +7,6 @@ import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
import
javax.print.Doc
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
...
...
@@ -135,31 +134,32 @@ public class HotSearchCacheDAO {
* @return
*/
private int getDuration(String type, int duration){
    // Every source is now crawled once per minute, so each observation extends the
    // accumulated duration by exactly one minute regardless of the source type.
    // Before this change the increment followed the per-source crawl interval
    // (Weibo hot search 1, Baidu 5, Zhihu 10, Douyin 10, Sogou WeChat 5,
    // Weibo topic 3, Toutiao 1, anything else 1).
    // `type` is kept in the signature so existing callers stay unchanged.
    return duration + 1;
}
...
...
@@ -171,32 +171,32 @@ public class HotSearchCacheDAO {
* @return
*/
private Date getEndTime(String type, Date time){
    // Every source is now crawled once per minute, so an observation made at `time`
    // is considered valid until one minute later, whatever the source type is.
    // Before this change the offset followed the per-source crawl interval
    // (1, 5, 10, 10, 5, 3 or 1 minutes depending on `type`).
    // `type` is kept in the signature so existing callers stay unchanged.
    long timeLong = time.getTime() + 1 * 60 * 1000;
    return new Date(timeLong);
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/dbtemplate/MongoDBLocalTemplate.java
0 → 100644
View file @
33f69a5d
package
com
.
zhiwei
.
searchhotcrawler
.
dbtemplate
;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientOptions;
import com.mongodb.MongoClientURI;
import com.mongodb.WriteConcern;
import com.mongodb.client.ListIndexesIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.IndexOptions;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;

import java.util.HashSet;
import java.util.Objects;
import java.util.Set;
/**
 * Singleton holder for the {@link MongoClient} connected to the "local" MongoDB
 * instance configured by {@code DBConfig.mongoLocalUri}.
 *
 * <p>All accessors are static and delegate to the single enum constant.
 */
@Log4j2
public enum MongoDBLocalTemplate {

    instance;

    private MongoClient mongoClient;

    // The client is created here rather than in the enum constructor because the
    // Lombok-generated static logger is not yet initialised while enum constants
    // are being constructed.
    static {
        MongoClientOptions.Builder options = new MongoClientOptions.Builder()
                .connectionsPerHost(300) // connection pool size: 300 connections (driver default is 100)
                .connectTimeout(15000) // connect timeout in ms; more than 3000 ms is recommended
                .maxWaitTime(5000)
                .socketTimeout(0) // socket timeout; 0 means unlimited
                // Multiplier for the number of threads allowed to wait for a connection; when the
                // wait queue is full the driver fails with "Out of semaphores to get db".
                .threadsAllowedToBlockForConnectionMultiplier(5000)
                .writeConcern(WriteConcern.W1);
        // NOTE(review): this logs the full connection string; if it embeds credentials
        // they end up in the log file.
        log.info("MongoDBLocalTemplate.static initializer : {}", DBConfig.mongoLocalUri);
        // Hand the builder to the URI so the options above are actually applied; they act as
        // defaults that options given in the URI string itself may override.
        MongoClientURI mongoClientURI = new MongoClientURI(DBConfig.mongoLocalUri, options);
        instance.mongoClient = new MongoClient(mongoClientURI);
    }

    /**
     * Returns the database with the given name from the local client.
     *
     * @param databaseName name of the database
     * @return the database handle
     */
    public static MongoDatabase getDB(String databaseName) {
        return instance.mongoClient.getDatabase(databaseName);
    }

    /**
     * Returns the named collection of the named database from the local client.
     *
     * @param databaseName   name of the database
     * @param collectionName name of the collection
     * @return the collection handle
     */
    public static MongoCollection<Document> getCollection(String databaseName, String collectionName) {
        MongoDatabase db = instance.mongoClient.getDatabase(databaseName);
        return db.getCollection(collectionName);
    }

    /**
     * Makes sure the descending single-field indexes used by the crawler exist on the
     * collection: {@code count} (or {@code score_num} for collections whose name does not
     * contain {@code hot_search_list}), {@code time}, {@code rank}, {@code name} and
     * {@code type}.
     *
     * <p>Indexes are looked up by name and only the missing ones are created. A failure to
     * create one index is logged and does not prevent the others from being created.
     *
     * @param databaseName   name of the database
     * @param collectionName name of the collection
     */
    public static void createIndex(String databaseName, String collectionName) {
        MongoCollection<Document> mongoCollection = getCollection(databaseName, collectionName);

        // listIndexes() never returns null, so the existing index names have to be inspected
        // to decide what is missing.
        Set<String> existingNames = new HashSet<>();
        for (Document index : mongoCollection.listIndexes()) {
            existingNames.add(index.getString("name"));
        }

        // The "count_desc" name is kept for both variants to stay compatible with indexes
        // that may already have been created under that name.
        String countField = collectionName.contains("hot_search_list") ? "count" : "score_num";
        ensureDescendingIndex(mongoCollection, existingNames, countField, "count_desc");
        ensureDescendingIndex(mongoCollection, existingNames, "time", "time_desc");
        ensureDescendingIndex(mongoCollection, existingNames, "rank", "rank_desc");
        ensureDescendingIndex(mongoCollection, existingNames, "name", "name_desc");
        ensureDescendingIndex(mongoCollection, existingNames, "type", "type_desc");
    }

    /** Creates a descending index on {@code field} named {@code indexName} unless that name already exists. */
    private static void ensureDescendingIndex(MongoCollection<Document> collection, Set<String> existingNames,
                                              String field, String indexName) {
        if (existingNames.contains(indexName)) {
            return;
        }
        try {
            collection.createIndex(new Document(field, -1), new IndexOptions().name(indexName));
        } catch (Exception e) {
            log.warn("Failed to create index {} on {}", indexName, collection.getNamespace(), e);
        }
    }
}
src/main/java/com/zhiwei/searchhotcrawler/test/TopicTest.java
View file @
33f69a5d
//package com.zhiwei.searchhotcrawler.test;
//
//import com.mongodb.client.MongoCollection;
//import com.mongodb.client.MongoCursor;
//import com.mongodb.client.MongoDatabase;
//import com.zhiwei.searchhotcrawler.config.DBConfig;
//import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
//import com.zhiwei.tools.timeparse.TimeParse;
//import lombok.extern.log4j.Log4j2;
//import org.bson.Document;
//
//import java.util.*;
//
//@Log4j2
//public class TopicTest {
//
// private static MongoDatabase mongoDB = MongoDBTemplate.getDB(DBConfig.dbName);
//
// public static void main(String[] args) {
//// repairTopic();
//
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.mongodb.client.MongoCollection
;
import
com.mongodb.client.MongoCursor
;
import
com.mongodb.client.MongoDatabase
;
import
com.zhiwei.searchhotcrawler.config.DBConfig
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBLocalTemplate
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
import
java.util.*
;
@Log4j2
public
class
TopicTest
{
private
static
MongoDatabase
mongoDB
=
MongoDBTemplate
.
getDB
(
DBConfig
.
dbName
);
private
static
MongoDatabase
mongoDBLocal
=
MongoDBLocalTemplate
.
getDB
(
DBConfig
.
dbName
);
public
static
void
main
(
String
[]
args
)
{
repairHotType
();
// updateHotSearchCache();
// }
//
// /**
// * 修复热搜话题类型错误问题
// */
// public static void repairTopic(){
// MongoCollection mongoCollection = mongoDB.getCollection("hot_search_list2020_04");
// Document query = new Document("comment_count", new Document("$ne", null));
// query.put("type", "微博热搜");
// Date time = TimeParse.stringFormartDate("2020-03-12 18:00:00");
//
// long count = mongoCollection.countDocuments(query);
// log.info("count is {}", count);
// for(int i=0;i<55;i++){
// MongoCursor<Document> cursor = mongoCollection.find(query).limit(1000).iterator();
// while(cursor.hasNext()){
// Document update = cursor.next();
// update.put("type", "微博话题");
// Document query2 = new Document();
// query2.put("_id", update.getString("_id"));
// mongoCollection.findOneAndReplace(query2, update);
// time = update.getDate("time");
// }
// log.info("i========{}", i);
// }
// }
//
//
//
//
// public static void updateHotSearchCache(){
// for(int month = 3; month<=3; month++){
// String collectionName = "hot_search_list2020_0" + month;
// if(month>=10){
// collectionName = "hot_search_list2020_" + month;
// }
// log.info("collectionName is {}", collectionName);
// MongoCollection mongoCollection = mongoDB.getCollection(collectionName);
// MongoCollection mongoCollectionLocal = mongoDBLocal.getCollection("hot_search_cache");
//
// long count = mongoCollection.countDocuments();
// int pageCount = 10000;
// int pages = (int)Math.ceil((double)count/(double)pageCount);
// log.info("count====={},pages====={}",count, pages);
// Date date = TimeParse.stringFormartDate("2020-03-12 18:00:00");
//
// Map<String,Document> resultMap = new HashMap<>();
//
// for(int page = 1; page<pages; page++){
// Document query = new Document();
// if(page>1) {
// query.put("time", new Document("$gt", date));
// }
// log.info("page is {} ,query is {},coutn is {}", page ,query ,mongoCollection.countDocuments(query));
// MongoCursor<Document> cursor = mongoCollection.find(query).limit(pageCount).sort(new Document("time",1)).iterator();
// while(cursor.hasNext()){
// Document document = cursor.next();
// String name = document.getString("name");
// String type = document.getString("type");
// int lastRank = document.getInteger("rank")!=null?document.getInteger("rank"): -1;
// int lastCount = document.getInteger("count")!=null?document.getInteger("count"): -1;
// Date startTime = document.getDate("time");
// Date endTime = new Date(startTime.getTime() + (60 * 1000));
// String topicLead = document.getString("topic_lead")!=null?document.getString("topic_lead"):null;
// boolean hot = document.getBoolean("hot")!=null?document.getBoolean("hot"):true;
// String url = document.getString("url")!=null?document.getString("url"):null;
// String id = name + "_" + type;
//
// Document nowDoc = resultMap.get(id);
// if (Objects.nonNull(nowDoc)) {
// int highestRank = nowDoc.getInteger("highestRank");
// int highestCount = nowDoc.getInteger("highestCount");
// //判断最大热度值
// if (lastCount>0 && lastCount > highestCount) {
// highestCount = lastCount;
// }
// //判断最高排名
// if (lastRank>0 && lastRank < highestRank) {
// highestRank = lastRank;
// }
// //计算热搜时长
// int duration = nowDoc.getInteger("duration");
// int durationNow = getDuration(type, duration);
//
// //更新相应信息
// nowDoc.put("endTime", endTime);
// nowDoc.put("lastRank", lastRank);
// nowDoc.put("lastCount", lastCount);
// nowDoc.put("highestRank", highestRank);
// nowDoc.put("highestCount", highestCount);
// nowDoc.put("duration", durationNow);
// } else {
// nowDoc = new Document();
// int durationNow = getDuration(type, 0);
// nowDoc.put("_id", id);
// nowDoc.put("url", url);
// nowDoc.put("name", name);
// nowDoc.put("hot", hot);
// nowDoc.put("topicLead", topicLead);
// nowDoc.put("type", type);
// nowDoc.put("lastRank", lastRank);
// nowDoc.put("highestRank", lastRank);
// nowDoc.put("lastCount", lastCount);
// nowDoc.put("highestCount", lastCount);
// nowDoc.put("startTime", startTime);
// nowDoc.put("endTime", endTime);
// nowDoc.put("duration", durationNow);
// }
// resultMap.put(id, nowDoc);
// date = startTime;
// }
// cursor.close();
// }
//
// log.info("list size is {}", resultMap.size());
// for (Map.Entry<String,Document> entry: resultMap.entrySet()){
// String id = entry.getKey();
// Document document = entry.getValue();
// String name = document.getString("name");
// String type = document.getString("type");
// int lastRank = document.getInteger("lastRank");
// int lastCount = document.getInteger("lastCount");
// int highestRank = document.getInteger("highestRank");
// int highestCount = document.getInteger("highestCount");
// int duration = document.getInteger("duration");
//
// Document query = new Document("_id", id);
// Document resultDoc = (Document) mongoCollectionLocal.find(query).first();
// if(Objects.isNull(resultDoc)){
// mongoCollectionLocal.insertOne(document);
// }else{
//
// int highestRankResult = resultDoc.getInteger("highestRank");
// int highestCountResult = resultDoc.getInteger("highestCount");
// int durationResult = document.getInteger("duration");
// //判断最大热度值
// if (highestCountResult > highestCount) {
// highestCount = highestCountResult;
// }
// //判断最高排名
// if (highestRankResult < highestRank) {
// highestRank = highestRankResult;
// }
// //计算热搜时长
// int durationNow = duration + durationResult;
// Date endTime = document.getDate("endTime");
// //更新相应信息
// resultDoc.put("endTime", endTime);
// resultDoc.put("lastRank", lastRank);
// resultDoc.put("lastCount", lastCount);
// resultDoc.put("highestRank", highestRank);
// resultDoc.put("highestCount", highestCount);
// resultDoc.put("duration", durationNow);
// mongoCollectionLocal.findOneAndReplace(query, resultDoc);
// }
// }
// }
// }
//
//
// /**
// * 计算热搜时长
// * @param type
// * @param duration
// * @return
// */
// private static int getDuration(String type, int duration){
// switch (type){
// case "微博热搜" :
// duration = duration + 1;
// break;
// case "百度热搜" :
// duration = duration + 5;
// break;
// case "知乎热搜" :
// duration = duration + 10;
// break;
// case "抖音热搜" :
// duration = duration + 10;
// break;
// case "搜狗微信热搜" :
// duration = duration + 5;
// break;
// case "微博话题" :
// duration = duration + 3;
// break;
// default :
// duration = duration + 1;
// }
// return duration;
// }
//
//}
}
/**
* 修复热搜话题类型错误问题
*/
public
static
void
repairHotType
(){
try
{
for
(
int
month
=
6
;
month
<=
6
;
month
++){
String
collectionName
=
"hot_search_list2020_0"
+
month
;
MongoCollection
mongoCollection
=
mongoDB
.
getCollection
(
collectionName
);
MongoCollection
mongoLocalCollection
=
mongoDBLocal
.
getCollection
(
collectionName
);
Date
date
=
TimeParse
.
stringFormartDate
(
"2020-06-17 12:59:59"
);
Document
query
=
new
Document
();
query
.
put
(
"time"
,
new
Document
(
"$gt"
,
date
));
long
count
=
mongoCollection
.
countDocuments
(
query
);
int
pageCount
=
10000
;
int
pages
=
(
int
)
Math
.
ceil
((
double
)
count
/(
double
)
pageCount
);
log
.
info
(
"count====={},pages====={}"
,
count
,
pages
);
for
(
int
page
=
1
;
page
<
pages
;
page
++){
query
.
put
(
"time"
,
new
Document
(
"$gt"
,
date
));
log
.
info
(
"page is {} ,query is {},coutn is {}"
,
page
,
query
,
mongoCollection
.
countDocuments
(
query
));
MongoCursor
<
Document
>
cursor
=
mongoCollection
.
find
(
query
).
limit
(
pageCount
).
sort
(
new
Document
(
"time"
,
1
)).
iterator
();
List
<
Document
>
dataList
=
new
ArrayList
<>();
while
(
cursor
.
hasNext
())
{
Document
document
=
cursor
.
next
();
date
=
document
.
getDate
(
"time"
);;
dataList
.
add
(
document
);
try
{
mongoLocalCollection
.
insertOne
(
document
);
}
catch
(
Exception
e
){
continue
;
}
}
cursor
.
close
();
}
}
}
catch
(
Exception
e
){
e
.
printStackTrace
();
}
}
public
static
void
updateHotSearchCache
(){
for
(
int
month
=
3
;
month
<=
12
;
month
++){
String
collectionName
=
"hot_search_list2019_0"
+
month
;
if
(
month
>=
10
){
collectionName
=
"hot_search_list2019_"
+
month
;
}
log
.
info
(
"collectionName is {}"
,
collectionName
);
MongoCollection
mongoCollection
=
mongoDB
.
getCollection
(
collectionName
);
MongoCollection
mongoCollectionLocal
=
mongoDBLocal
.
getCollection
(
"hot_search_cache"
);
long
count
=
mongoCollection
.
countDocuments
();
int
pageCount
=
10000
;
int
pages
=
(
int
)
Math
.
ceil
((
double
)
count
/(
double
)
pageCount
);
log
.
info
(
"count====={},pages====={}"
,
count
,
pages
);
Date
date
=
TimeParse
.
stringFormartDate
(
"2020-03-12 18:00:00"
);
Map
<
String
,
Document
>
resultMap
=
new
HashMap
<>();
for
(
int
page
=
1
;
page
<
pages
;
page
++){
Document
query
=
new
Document
();
if
(
page
>
1
)
{
query
.
put
(
"time"
,
new
Document
(
"$gt"
,
date
));
}
log
.
info
(
"page is {} ,query is {},coutn is {}"
,
page
,
query
,
mongoCollection
.
countDocuments
(
query
));
MongoCursor
<
Document
>
cursor
=
mongoCollection
.
find
(
query
).
limit
(
pageCount
).
sort
(
new
Document
(
"time"
,
1
)).
iterator
();
while
(
cursor
.
hasNext
()){
Document
document
=
cursor
.
next
();
String
name
=
document
.
getString
(
"name"
);
String
type
=
document
.
getString
(
"type"
);
Integer
lastRank
=
document
.
getInteger
(
"rank"
)!=
null
?
document
.
getInteger
(
"rank"
):
null
;
Integer
lastCount
=
document
.
getInteger
(
"count"
)!=
null
?
document
.
getInteger
(
"count"
):
null
;
Date
startTime
=
document
.
getDate
(
"time"
);
Date
endTime
=
new
Date
(
startTime
.
getTime
()
+
(
60
*
1000
));
String
topicLead
=
document
.
getString
(
"topic_lead"
)!=
null
?
document
.
getString
(
"topic_lead"
):
null
;
boolean
hot
=
document
.
getBoolean
(
"hot"
)!=
null
?
document
.
getBoolean
(
"hot"
):
true
;
String
url
=
document
.
getString
(
"url"
)!=
null
?
document
.
getString
(
"url"
):
null
;
String
id
=
name
+
"_"
+
type
;
Document
nowDoc
=
resultMap
.
get
(
id
);
if
(
Objects
.
nonNull
(
nowDoc
))
{
Integer
highestRank
=
nowDoc
.
getInteger
(
"highestRank"
);
Integer
highestCount
=
nowDoc
.
getInteger
(
"highestCount"
);
Integer
preRank
=
nowDoc
.
getInteger
(
"lastRank"
);
Integer
preCount
=
nowDoc
.
getInteger
(
"lastCount"
);
//判断最大热度值
if
(
Objects
.
nonNull
(
lastCount
)
&&
Objects
.
nonNull
(
highestCount
)
&&
lastCount
>
0
&&
lastCount
>
highestCount
)
{
highestCount
=
lastCount
;
}
//判断最高排名
if
(
Objects
.
nonNull
(
lastRank
)
&&
Objects
.
nonNull
(
highestRank
)
&&
lastRank
>
0
&&
lastRank
<
highestRank
)
{
highestRank
=
lastRank
;
}
//计算热搜时长
int
duration
=
nowDoc
.
getInteger
(
"duration"
);
int
durationNow
=
getDuration
(
type
,
duration
);
//更新相应信息
nowDoc
.
put
(
"endTime"
,
endTime
);
nowDoc
.
put
(
"lastRank"
,
lastRank
);
nowDoc
.
put
(
"lastCount"
,
lastCount
);
nowDoc
.
put
(
"preRank"
,
preRank
);
nowDoc
.
put
(
"preCount"
,
preCount
);
nowDoc
.
put
(
"highestRank"
,
highestRank
);
nowDoc
.
put
(
"highestCount"
,
highestCount
);
nowDoc
.
put
(
"duration"
,
durationNow
);
}
else
{
nowDoc
=
new
Document
();
int
durationNow
=
getDuration
(
type
,
0
);
nowDoc
.
put
(
"_id"
,
id
);
nowDoc
.
put
(
"url"
,
url
);
nowDoc
.
put
(
"name"
,
name
);
nowDoc
.
put
(
"hot"
,
hot
);
nowDoc
.
put
(
"topicLead"
,
topicLead
);
nowDoc
.
put
(
"type"
,
type
);
nowDoc
.
put
(
"lastRank"
,
lastRank
);
nowDoc
.
put
(
"highestRank"
,
lastRank
);
nowDoc
.
put
(
"lastCount"
,
lastCount
);
nowDoc
.
put
(
"highestCount"
,
lastCount
);
nowDoc
.
put
(
"startTime"
,
startTime
);
nowDoc
.
put
(
"endTime"
,
endTime
);
nowDoc
.
put
(
"duration"
,
durationNow
);
nowDoc
.
put
(
"preRank"
,
null
);
nowDoc
.
put
(
"preCount"
,
null
);
}
resultMap
.
put
(
id
,
nowDoc
);
date
=
startTime
;
}
cursor
.
close
();
}
log
.
info
(
"list size is {}"
,
resultMap
.
size
());
for
(
Map
.
Entry
<
String
,
Document
>
entry:
resultMap
.
entrySet
()){
String
id
=
entry
.
getKey
();
Document
document
=
entry
.
getValue
();
String
name
=
document
.
getString
(
"name"
);
String
type
=
document
.
getString
(
"type"
);
Integer
lastRank
=
document
.
getInteger
(
"lastRank"
);
Integer
lastCount
=
document
.
getInteger
(
"lastCount"
);
Integer
highestRank
=
document
.
getInteger
(
"highestRank"
);
Integer
highestCount
=
document
.
getInteger
(
"highestCount"
);
Integer
duration
=
document
.
getInteger
(
"duration"
);
Integer
preRank
=
document
.
getInteger
(
"preRank"
);
Integer
preCount
=
document
.
getInteger
(
"preCount"
);
Document
query
=
new
Document
(
"_id"
,
id
);
Document
resultDoc
=
(
Document
)
mongoCollectionLocal
.
find
(
query
).
first
();
if
(
Objects
.
isNull
(
resultDoc
)){
mongoCollectionLocal
.
insertOne
(
document
);
}
else
{
Integer
highestRankResult
=
resultDoc
.
getInteger
(
"highestRank"
);
Integer
highestCountResult
=
resultDoc
.
getInteger
(
"highestCount"
);
Integer
durationResult
=
document
.
getInteger
(
"duration"
);
//判断最大热度值
if
(
Objects
.
nonNull
(
highestRankResult
)
&&
Objects
.
nonNull
(
highestCount
)
&&
highestCountResult
>
highestCount
)
{
highestCount
=
highestCountResult
;
}
//判断最高排名
if
(
Objects
.
nonNull
(
highestRankResult
)
&&
Objects
.
nonNull
(
highestRank
)
&&
highestRankResult
<
highestRank
)
{
highestRank
=
highestRankResult
;
}
//计算热搜时长
int
durationNow
=
duration
+
durationResult
;
Date
endTime
=
document
.
getDate
(
"endTime"
);
//更新相应信息
resultDoc
.
put
(
"endTime"
,
endTime
);
resultDoc
.
put
(
"lastRank"
,
lastRank
);
resultDoc
.
put
(
"lastCount"
,
lastCount
);
resultDoc
.
put
(
"highestRank"
,
highestRank
);
resultDoc
.
put
(
"highestCount"
,
highestCount
);
resultDoc
.
put
(
"duration"
,
durationNow
);
resultDoc
.
put
(
"preRank"
,
preRank
);
resultDoc
.
put
(
"preCount"
,
preCount
);
mongoCollectionLocal
.
findOneAndReplace
(
query
,
resultDoc
);
}
}
}
}
/**
* 计算热搜时长
* @param type
* @param duration
* @return
*/
private
static
int
getDuration
(
String
type
,
int
duration
){
switch
(
type
){
case
"微博热搜"
:
duration
=
duration
+
1
;
break
;
case
"百度热搜"
:
duration
=
duration
+
1
;
break
;
case
"知乎热搜"
:
duration
=
duration
+
1
;
break
;
case
"抖音热搜"
:
duration
=
duration
+
10
;
break
;
case
"搜狗微信热搜"
:
duration
=
duration
+
5
;
break
;
case
"微博话题"
:
duration
=
duration
+
3
;
break
;
default
:
duration
=
duration
+
1
;
}
return
duration
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
View file @
33f69a5d
...
...
@@ -28,7 +28,7 @@ public class BaiduHotSearchRun extends Thread{
while
(
f
)
{
try
{
getHotList
();
TimeUnit
.
MINUTES
.
sleep
(
5
);
TimeUnit
.
MINUTES
.
sleep
(
1
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
View file @
33f69a5d
...
...
@@ -27,7 +27,7 @@ public class DouyinHotSearchRun extends Thread{
while
(
f
)
{
try
{
getHotList
();
TimeUnit
.
MINUTES
.
sleep
(
1
0
);
TimeUnit
.
MINUTES
.
sleep
(
1
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
View file @
33f69a5d
...
...
@@ -27,7 +27,7 @@ public class SougoHotSearchRun extends Thread {
while
(
f
)
{
try
{
getHotList
();
TimeUnit
.
MINUTES
.
sleep
(
5
);
TimeUnit
.
MINUTES
.
sleep
(
1
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboTopicRun.java
View file @
33f69a5d
...
...
@@ -21,7 +21,7 @@ public class WeiboTopicRun extends Thread{
while
(
f
)
{
try
{
getTopicList
();
TimeUnit
.
MINUTES
.
sleep
(
3
);
TimeUnit
.
MINUTES
.
sleep
(
1
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
View file @
33f69a5d
...
...
@@ -27,7 +27,7 @@ public class ZhihuHotSearchRun extends Thread{
while
(
f
)
{
try
{
getHotList
();
TimeUnit
.
MINUTES
.
sleep
(
1
0
);
TimeUnit
.
MINUTES
.
sleep
(
1
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment