Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
6ec7a116
You need to sign in or sign up before continuing.
Commit
6ec7a116
authored
Feb 25, 2022
by
chenweitao
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'working' into 'master'
新增微视热榜采集程序及变更微博主持人字段为定时更新 See merge request
!179
parents
21336ec9
8a4f438f
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
143 additions
and
19 deletions
+143
-19
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+1
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiShiHotSearchCrawler.java
+89
-0
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+39
-19
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+14
-0
No files found.
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
6ec7a116
...
...
@@ -31,5 +31,6 @@ public enum HotSearchType {
微博娱乐榜
,
微博要闻榜
,
B
站综合热门
,
微视热榜
,
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiShiHotSearchCrawler.java
0 → 100644
View file @
6ec7a116
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.MediaType
;
import
okhttp3.Request
;
import
okhttp3.RequestBody
;
import
org.apache.commons.lang3.StringUtils
;
import
java.util.*
;
/**
* @ClassName: WeiShiCrawlerTest
* @Description: 微视热榜采集
* @author ll
* @date 2022年2月22日 上午09:54:31
*/
@Log4j2
public
class
WeiShiHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
build
();
/**
* @return void 返回类型
* @Title: WeiShiCrawlerTest
* @author ll
* @Description: 微视热榜采集
*/
public
static
List
<
HotSearchList
>
weiShiHotSearch
(
Date
date
)
{
String
url
=
"https://api.weishi.qq.com/trpc.wesee.weishi_search_hotrank.SearchHotrank/GetHotRankV2?_csrf="
;
String
htmlBody
=
null
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"Content-Length"
,
"85"
);
headerMap
.
put
(
"Content-Type"
,
"multipart/form-data;charset=UTF-8"
);
headerMap
.
put
(
"Host"
,
"api.weishi.qq.com"
);
Request
request
=
RequestUtils
.
wrapPost
(
url
,
headerMap
,
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
"{\"req_body\":{\"hotRankID\":\"\",\"attachInfo\":\"\",\"hotRankType\":1,\"sourceID\":\"WSSearchH5\"}}"
));
for
(
int
count
=
0
;
count
<=
3
;
count
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微视热榜时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
List
<
HotSearchList
>
result
=
new
ArrayList
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"rsp_body"
))
{
try
{
JSONObject
car
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"rsp_body"
);
JSONArray
cards
=
car
.
getJSONArray
(
"hotRankEvents"
);
for
(
Object
card
:
cards
)
{
String
ul
=
null
;
boolean
hot
=
true
;
JSONObject
json
=
(
JSONObject
)
JSONObject
.
toJSON
(
card
);
Integer
rank
=
json
.
getInteger
(
"pos"
);
String
title
=
json
.
getString
(
"title"
);
Long
hotCount
=
json
.
getLong
(
"hotCount"
);
JSONObject
label
=
json
.
getJSONObject
(
"label"
);
String
labelName
=
null
;
String
labelUrl
=
null
;
if
(
Objects
.
nonNull
(
label
)){
labelName
=
label
.
getString
(
"name"
);
labelUrl
=
label
.
getString
(
"labelURL"
);
}
HotSearchList
hotSearch
=
new
HotSearchList
(
ul
,
title
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微视热榜
.
name
(),
labelName
,
date
);
hotSearch
.
setIconUrl
(
labelUrl
);
result
.
add
(
hotSearch
);
}
return
result
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微视热榜出现解析错误,数据不是json结构"
,
e
);
}
}
else
{
log
.
info
(
"解析微视热榜时出现解析错误,页面结构有问题"
);
}
}
return
Collections
.
emptyList
();
}
}
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
View file @
6ec7a116
...
...
@@ -29,6 +29,7 @@ public class HotSearchCacheDAO {
/**
* 存储数据
*
* @param dataList
* @return
*/
...
...
@@ -114,7 +115,14 @@ public class HotSearchCacheDAO {
document
.
put
(
"view"
,
hotSearch
.
getView
());
document
.
put
(
"fans"
,
hotSearch
.
getFans
());
}
if
(
"微视热榜"
.
equals
(
hotSearch
.
getType
()))
{
document
.
put
(
"iconUrl"
,
hotSearch
.
getIconUrl
());
addAndUpdateData
(
document
,
true
);
}
else
{
addAndUpdateData
(
document
);
}
if
(
"百度热搜"
.
equals
(
hotSearch
.
getType
()))
{
document
.
remove
(
"topic_lead"
);
}
...
...
@@ -135,9 +143,19 @@ public class HotSearchCacheDAO {
/**
 * Adds or updates the given document by delegating to
 * {@code addAndUpdateData(document, false)}.
 *
 * @param document the document to add or update
 */
public void addAndUpdateData(Document document) {
    // This overload always passes false for the isMostSave flag.
    final boolean isMostSave = false;
    addAndUpdateData(document, isMostSave);
}
/**
* 添加及更新相应数据表中的数据
*
* @param document
*/
public
void
addAndUpdateData
(
Document
document
,
boolean
isMostSave
)
{
try
{
String
name
=
document
.
getString
(
"name"
);
String
type
=
document
.
getString
(
"type"
);
...
...
@@ -155,7 +173,6 @@ public class HotSearchCacheDAO {
String
id
=
name
+
"_"
+
type
;
boolean
recommend
=
false
;
// Integer readCount = document.getInteger("comment_count");
if
(
"微博热搜"
.
equals
(
type
))
{
//排位标判断 例如 https://simg.s.weibo.com/20210408_search_point_orange.png
String
rankPic
=
document
.
getString
(
"rankPic"
);
...
...
@@ -170,8 +187,8 @@ public class HotSearchCacheDAO {
}
Document
query
=
new
Document
(
"_id"
,
id
);
//判断是否为微博推荐位,推荐位微博无排名,所以不纳入总的缓存表
if
(
nonNull
(
lastRank
)
&&
lastRank
>
0
)
{
//判断是否为微博推荐位,推荐位微博无排名,所以不纳入总的缓存表
当isMostSave为true时忽略排名因素
if
(
nonNull
(
lastRank
)
&&
(
lastRank
>
0
||
isMostSave
)
)
{
Document
nowDoc
=
(
Document
)
collection
.
find
(
query
).
first
();
if
(
nonNull
(
nowDoc
))
{
Integer
highestRank
=
nowDoc
.
getInteger
(
"highestRank"
);
...
...
@@ -193,7 +210,7 @@ public class HotSearchCacheDAO {
}
//判断真实最高排名
if
(
"微博热搜"
.
equals
(
type
)
&&
nonNull
(
realLastRank
)
&&
nonNull
(
realHighestRank
))
{
if
(
realHighestRank
<
0
)
{
if
(
realHighestRank
<
0
)
{
realHighestRank
=
realLastRank
;
}
if
(
realLastRank
>
0
&&
realHighestRank
>
0
&&
realLastRank
<
realHighestRank
)
{
...
...
@@ -209,7 +226,6 @@ public class HotSearchCacheDAO {
long
firstCount
=
Long
.
parseLong
(
nowDoc
.
get
(
"firstCount"
).
toString
());
riseSpeed
=
((
double
)
(
lastCount
-
firstCount
)
/
(
double
)
firstCount
)
*
1000
/
((
double
)
duration
);
}
// endTime = getEndTime(type, new Date());
//更新相应信息
if
(
url
!=
null
&&
!
url
.
equals
(
lastUrl
))
{
nowDoc
.
put
(
"url"
,
url
);
...
...
@@ -224,13 +240,10 @@ public class HotSearchCacheDAO {
nowDoc
.
put
(
"duration"
,
durationNow
);
nowDoc
.
put
(
"recommend"
,
recommend
);
nowDoc
.
put
(
"riseSpeed"
,
riseSpeed
);
if
(
"微博热搜"
.
equals
(
type
)){
if
(
"微博热搜"
.
equals
(
type
))
{
nowDoc
.
put
(
"realLastRank"
,
realLastRank
);
nowDoc
.
put
(
"realHighestRank"
,
realHighestRank
);
}
// if(readCount != null){
// nowDoc.put("readCount",readCount);
// }
if
(
topicResult
!=
null
)
{
nowDoc
.
put
(
"topicResult"
,
topicResult
);
}
...
...
@@ -241,6 +254,12 @@ public class HotSearchCacheDAO {
}
}
}
//定时更新主持人字段 讨论量 阅读量 用在榜时长来确定更新时间
if
(
"微博热搜"
.
equals
(
type
))
{
if
(
durationNow
%
10
==
0
){
nowDoc
=
WeiboHotSearchCrawler
.
weiboUpdate
(
nowDoc
);
}
}
collection
.
replaceOne
(
query
,
nowDoc
);
}
else
{
nowDoc
=
new
Document
();
...
...
@@ -253,7 +272,7 @@ public class HotSearchCacheDAO {
nowDoc
.
put
(
"type"
,
type
);
nowDoc
.
put
(
"lastRank"
,
lastRank
);
nowDoc
.
put
(
"highestRank"
,
lastRank
);
if
(
"微博热搜"
.
equals
(
type
)){
if
(
"微博热搜"
.
equals
(
type
))
{
nowDoc
.
put
(
"realLastRank"
,
realLastRank
);
nowDoc
.
put
(
"realHighestRank"
,
realLastRank
);
}
...
...
@@ -267,9 +286,6 @@ public class HotSearchCacheDAO {
nowDoc
.
put
(
"recommend"
,
recommend
);
nowDoc
.
put
(
"firstCount"
,
lastCount
);
nowDoc
.
put
(
"riseSpeed"
,
0.00
);
// if(readCount != null){
// nowDoc.put("readCount",readCount);
// }
if
(
"脉脉热榜"
.
equals
(
type
))
{
nowDoc
.
put
(
"content"
,
document
.
getString
(
"content"
));
}
...
...
@@ -329,15 +345,16 @@ public class HotSearchCacheDAO {
collection
.
insertOne
(
nowDoc
);
}
}
}
catch
(
MongoWriteException
e1
)
{
log
.
error
(
"数据写入时出错,数据为【{}】:"
,
document
,
e1
);
}
catch
(
Exception
e
)
{
log
.
error
(
"数据存储时出错,数据为【{}】:"
,
document
,
e
);
}
catch
(
MongoWriteException
e1
)
{
log
.
error
(
"数据写入时出错,数据为【{}】:"
,
document
,
e1
);
}
catch
(
Exception
e
)
{
log
.
error
(
"数据存储时出错,数据为【{}】:"
,
document
,
e
);
}
}
/**
* 抖音链接更新
*
* @param document
*/
public
void
updateDouyinUrl
(
Document
document
)
{
...
...
@@ -366,6 +383,7 @@ public class HotSearchCacheDAO {
/**
* 计算热搜时长
*
* @param type
* @param duration
* @return
...
...
@@ -398,9 +416,9 @@ public class HotSearchCacheDAO {
// }
if
(
"脉脉热榜"
.
equals
(
type
))
{
duration
=
duration
+
30
;
}
else
if
(
"B站综合热门"
.
equals
(
type
))
{
}
else
if
(
"B站综合热门"
.
equals
(
type
))
{
duration
=
duration
+
60
;
}
else
{
}
else
{
duration
=
duration
+
1
;
}
return
duration
;
...
...
@@ -409,6 +427,7 @@ public class HotSearchCacheDAO {
/**
* 计算结束时间
*
* @param type
* @param time
* @return
...
...
@@ -445,6 +464,7 @@ public class HotSearchCacheDAO {
/**
* 根据主键查询对应热搜
*
* @param id
* @return
*/
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
View file @
6ec7a116
...
...
@@ -625,4 +625,18 @@ public class GatherTimer {
log
.
info
(
"B站综合热门采集结束..."
);
}
/**
 * Scheduled job that collects the WeiShi hot-rank list.
 *
 * <p>Fetches the list through {@code WeiShiHotSearchCrawler.weiShiHotSearch}, logs how many
 * entries were collected, and passes the list to {@code TipsUtils.addHotList} under the
 * {@code 微视热榜} type name.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeiShiHotSearch() {
    log.info(" 微视热榜采集开始........");

    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> collected = WeiShiHotSearchCrawler.weiShiHotSearch(crawlTime);

    // A null list is reported as zero entries.
    final Date loggedAt = new Date();
    int collectedSize = 0;
    if (collected != null) {
        collectedSize = collected.size();
    }
    log.info("{}, 微视热榜此轮采集到的数据量为:{}", loggedAt, collectedSize);

    TipsUtils.addHotList(HotSearchType.微视热榜.name(), collected);
    log.info(" 微视热榜采集结束........");
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment