Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
98f0116b
Commit
98f0116b
authored
Jan 11, 2021
by
chenweitao
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'mlbWork' into 'master'
Mlb work See merge request
!66
parents
c263ec99
ccb11676
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
229 additions
and
25 deletions
+229
-25
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchCache.java
+33
-24
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
+25
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+3
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/BililiCrawler.java
+127
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+2
-0
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+17
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+22
-0
No files found.
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchCache.java
View file @
98f0116b
...
...
@@ -95,6 +95,19 @@ public class HotSearchCache {
*/
private
String
topicResult
;
/**
* 首次上榜热度
*/
private
Integer
firstCount
;
/** 详情页图片(微博平台) */
private
String
pictureUrl
;
/**
* 上升速度
*/
private
double
riseSpeed
;
public
HotSearchCache
(
String
url
,
String
name
,
String
topicLead
,
Integer
highestCount
,
Integer
lastCount
,
Boolean
hot
,
Date
startTime
,
Date
endTime
,
Integer
highestRank
,
Integer
lastRank
,
String
type
,
Integer
duration
){
this
.
id
=
name
+
"_"
+
type
;
...
...
@@ -113,35 +126,31 @@ public class HotSearchCache {
this
.
duration
=
duration
;
}
public
Boolean
getRecommend
()
{
return
recommend
;
}
public
Boolean
getRecommend
()
{
return
recommend
;
}
public
void
setRecommend
(
Boolean
recommend
)
{
this
.
recommend
=
recommend
;
}
public
void
setRecommend
(
Boolean
recommend
)
{
this
.
recommend
=
recommend
;
}
public
Integer
getReadCount
()
{
return
readCount
;
}
public
Integer
getReadCount
()
{
return
readCount
;
}
public
void
setReadCount
(
Integer
readCount
)
{
this
.
readCount
=
readCount
;
}
public
void
setReadCount
(
Integer
readCount
)
{
this
.
readCount
=
readCount
;
}
public
Integer
getDiscussCount
()
{
return
discussCount
;
}
public
Integer
getDiscussCount
()
{
return
discussCount
;
}
public
void
setDiscussCount
(
Integer
discussCount
)
{
this
.
discussCount
=
discussCount
;
}
public
void
setDiscussCount
(
Integer
discussCount
)
{
this
.
discussCount
=
discussCount
;
}
public
String
getTopicLead
()
{
return
topicLead
;
}
public
String
getTopicLead
()
{
return
topicLead
;
}
public
void
setTopicLead
(
String
topicLead
)
{
this
.
topicLead
=
topicLead
;
}
public
void
setTopicLead
(
String
topicLead
)
{
this
.
topicLead
=
topicLead
;
}
/**
 * Returns the heat value recorded when the entry first appeared on the list.
 *
 * @return the first-appearance heat value; may be {@code null}
 */
public Integer getFirstCount() {
    return this.firstCount;
}
/**
 * Sets the heat value recorded when the entry first appeared on the list.
 *
 * @param firstCount the first-appearance heat value
 */
public void setFirstCount(Integer firstCount) {
    this.firstCount = firstCount;
}
/**
 * Returns the detail-page picture URL (Weibo platform).
 *
 * @return the picture URL; may be {@code null}
 */
public String getPictureUrl() {
    return this.pictureUrl;
}
/**
 * Sets the detail-page picture URL (Weibo platform).
 *
 * @param pictureUrl the picture URL
 */
public void setPictureUrl(String pictureUrl) {
    this.pictureUrl = pictureUrl;
}
/**
 * Returns the rise speed of the entry.
 *
 * @return the rise speed
 */
public double getRiseSpeed() {
    return this.riseSpeed;
}
/**
 * Sets the rise speed of the entry.
 *
 * @param riseSpeed the rise speed
 */
public void setRiseSpeed(double riseSpeed) {
    this.riseSpeed = riseSpeed;
}
}
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
View file @
98f0116b
...
...
@@ -84,6 +84,16 @@ public class HotSearchList implements Serializable{
*/
private
String
topicResult
;
/**
* View count (currently used only by the Bilibili ranking list)
*/
private
Integer
view
;
/**
* 弹幕数(目前仅B站排行榜使用)
*/
private
Integer
barrage
;
public
HotSearchList
(){}
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
String
icon
,
Date
date
){
...
...
@@ -140,4 +150,19 @@ public class HotSearchList implements Serializable{
this
.
icon
=
icon
;
this
.
topicResult
=
topicResult
;
}
public
HotSearchList
(
String
url
,
String
name
,
String
topicLead
,
Integer
count
,
Boolean
hot
,
Date
time
,
Integer
rank
,
String
type
,
Integer
view
,
Integer
barrage
)
{
this
.
id
=
name
+
"_"
+
new
Date
().
getTime
()+
"_"
+
type
;
this
.
url
=
url
;
this
.
name
=
name
;
this
.
topicLead
=
topicLead
;
this
.
count
=
count
;
this
.
hot
=
hot
;
this
.
time
=
time
;
this
.
rank
=
rank
;
this
.
day
=
TimeParse
.
dateFormartString
(
time
,
"yyyy-MM-dd"
);
this
.
type
=
type
;
this
.
view
=
view
;
this
.
barrage
=
barrage
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
98f0116b
...
...
@@ -20,5 +20,7 @@ public enum HotSearchType {
网易跟帖热议
,
微博预热榜
,
腾讯较真榜
,
脉脉热榜
脉脉热榜
,
B
站排行榜
,
B
站热搜
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/BililiCrawler.java
0 → 100644
View file @
98f0116b
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Date
;
import
java.util.List
;
@Log4j2
public
class
BililiCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* B站排行榜的采集
* @param date
* @return
*/
public
static
List
<
HotSearchList
>
getBilibiliHotSearch
(
Date
date
){
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
log
.
info
(
"bilibili排行榜开始采集..."
);
JSONArray
dataJson
=
null
;
String
htmlBody
=
null
;
String
url
=
"https://api.bilibili.com/x/web-interface/ranking/v2?rid=0&type=all"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"B站排行榜页面连接失败"
,
e
.
fillInStackTrace
());
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
dataJson
=
jsonObject
.
getJSONArray
(
"list"
);
if
(
dataJson
!=
null
)
{
for
(
int
i
=
0
;
i
<
dataJson
.
size
();
i
++)
{
JSONObject
data
=
dataJson
.
getJSONObject
(
i
);
int
rank
=
i
+
1
;
String
name
=
data
.
getString
(
"title"
);
String
topicLead
=
data
.
getString
(
"desc"
);
int
count
=
data
.
getIntValue
(
"score"
);
String
bvid
=
data
.
getString
(
"bvid"
);
String
bUrl
=
"https://www.bilibili.com/video/"
+
bvid
;
Integer
view
=
null
;
Integer
barrage
=
null
;
if
(
data
.
containsKey
(
"stat"
))
{
JSONObject
stat
=
data
.
getJSONObject
(
"stat"
);
view
=
stat
.
getIntValue
(
"view"
);
barrage
=
stat
.
getIntValue
(
"danmaku"
);
}
HotSearchList
hotSearchList
=
new
HotSearchList
(
bUrl
,
name
,
topicLead
,
count
,
null
,
date
,
rank
,
HotSearchType
.
B
站排行榜
.
name
(),
view
,
barrage
);
hotSearchLists
.
add
(
hotSearchList
);
}
}
}
ZhiWeiTools
.
sleep
(
3000L
);
}
log
.
info
(
"{}, B站排行榜此轮采集到的数据量为:{}"
,
new
Date
(),
hotSearchLists
!=
null
?
hotSearchLists
.
size
()
:
0
);
log
.
info
(
"B站排行榜采集结束"
);
return
hotSearchLists
;
}
/**
* B站热搜的采集
* @param date
* @return
*/
public
static
List
<
HotSearchList
>
getBiHotData
(
Date
date
)
{
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
log
.
info
(
"B站热搜榜开始采集..."
);
JSONArray
dataJson
=
null
;
String
htmlBody
=
null
;
String
url
=
"https://app.biliapi.com/x/v2/search/square?build=616050&limit=10"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"B站热搜页面连接失败"
,
e
.
fillInStackTrace
());
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
dataJson
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"data"
).
getJSONObject
(
0
).
getJSONObject
(
"data"
).
getJSONArray
(
"list"
);
if
(
dataJson
!=
null
)
{
for
(
int
i
=
0
;
i
<
dataJson
.
size
();
i
++)
{
JSONObject
data
=
dataJson
.
getJSONObject
(
i
);
int
rank
=
i
+
1
;
String
name
=
data
.
getString
(
"show_name"
);
String
icon
=
null
;
if
(
data
.
containsKey
(
"icon"
)){
String
iconPicture
=
data
.
getString
(
"icon"
);
if
(
iconPicture
.
contains
(
"e9e7a2d8497d4063421b685e72680bf1cfb99a0d"
)){
icon
=
"热"
;
}
else
if
(
iconPicture
.
contains
(
"4d579fb61f9655316582db193118bba3a721eec0"
)){
icon
=
"新"
;
}
else
{
icon
=
"未知"
;
}
}
String
keyWord
=
data
.
getString
(
"keyword"
);
String
biliUrl
=
"https://search.bilibili.com/all?keyword="
+
URLCodeUtil
.
getURLEncode
(
keyWord
,
"utf-8"
);
HotSearchList
hotSearchList
=
new
HotSearchList
(
biliUrl
,
name
,
null
,
rank
,
HotSearchType
.
B
站热搜
.
name
(),
date
);
hotSearchLists
.
add
(
hotSearchList
);
}
}
}
ZhiWeiTools
.
sleep
(
3000L
);
}
log
.
info
(
"{}, B站热搜榜此轮采集到的数据量为:{}"
,
new
Date
(),
hotSearchLists
!=
null
?
hotSearchLists
.
size
()
:
0
);
log
.
info
(
"B站热搜榜采集结束"
);
return
hotSearchLists
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
98f0116b
...
...
@@ -254,8 +254,10 @@ public class WeiboHotSearchCrawler {
String
midText
=
readJson
.
getJSONObject
(
"head_data"
).
getString
(
"midtext"
);
String
read
=
midText
.
replaceAll
(
"阅读"
,
""
).
replaceAll
(
"讨论.*"
,
""
).
trim
();
String
discussCount
=
midText
.
replaceAll
(
".*讨论"
,
""
).
replaceAll
(
"详情.*"
,
""
).
trim
();
String
pictureUrl
=
readJson
.
getJSONObject
(
"head_data"
).
getString
(
"portrait_url"
);
document
.
put
(
"readCount"
,
TipsUtils
.
getHotCount
(
read
));
document
.
put
(
"discussCount"
,
TipsUtils
.
getHotCount
(
discussCount
));
document
.
put
(
"pictureUrl"
,
pictureUrl
);
}
}
return
document
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
View file @
98f0116b
...
...
@@ -55,6 +55,11 @@ public class HotSearchCacheDAO {
if
(
"腾讯较真榜"
.
equals
(
hotSearch
.
getType
())){
document
.
put
(
"topic_result"
,
hotSearch
.
getTopicResult
());
}
if
(
"B站排行榜"
.
equals
(
hotSearch
.
getType
())){
document
.
put
(
"topic_lead"
,
hotSearch
.
getTopicLead
());
document
.
put
(
"view"
,
hotSearch
.
getView
());
document
.
put
(
"barrage"
,
hotSearch
.
getBarrage
());
}
addAndUpdateData
(
document
);
dataes
.
add
(
document
);
});
...
...
@@ -115,6 +120,12 @@ public class HotSearchCacheDAO {
//计算热搜时长
int
duration
=
nowDoc
.
getInteger
(
"duration"
);
int
durationNow
=
getDuration
(
type
,
duration
);
//计算上升速度
double
riseSpeed
=
nowDoc
.
containsKey
(
"riseSpeed"
)?
nowDoc
.
getDouble
(
"riseSpeed"
):
0.00
;
if
(
Objects
.
nonNull
(
lastCount
)
&&
nowDoc
.
containsKey
(
"firstCount"
))
{
int
firstCount
=
nowDoc
.
getInteger
(
"firstCount"
);
riseSpeed
=
((
double
)(
lastCount
-
firstCount
)/(
double
)
firstCount
)*
1000
/((
double
)
duration
);
}
// endTime = getEndTime(type, new Date());
//更新相应信息
if
(
url
!=
null
&&
!
url
.
equals
(
lastUrl
)){
...
...
@@ -129,6 +140,7 @@ public class HotSearchCacheDAO {
nowDoc
.
put
(
"preCount"
,
preCount
);
nowDoc
.
put
(
"duration"
,
durationNow
);
nowDoc
.
put
(
"recommend"
,
recommend
);
nowDoc
.
put
(
"riseSpeed"
,
riseSpeed
);
// if(readCount != null){
// nowDoc.put("readCount",readCount);
// }
...
...
@@ -155,6 +167,8 @@ public class HotSearchCacheDAO {
nowDoc
.
put
(
"preRank"
,
null
);
nowDoc
.
put
(
"preCount"
,
null
);
nowDoc
.
put
(
"recommend"
,
recommend
);
nowDoc
.
put
(
"firstCount"
,
lastCount
);
nowDoc
.
put
(
"riseSpeed"
,
0.00
);
// if(readCount != null){
// nowDoc.put("readCount",readCount);
// }
...
...
@@ -170,6 +184,9 @@ public class HotSearchCacheDAO {
nowDoc
.
put
(
"readCount"
,
nowDoc
.
getInteger
(
"readCount"
));
nowDoc
.
put
(
"discussCount"
,
nowDoc
.
getInteger
(
"discussCount"
));
}
if
(
nowDoc
.
containsKey
(
"pictureUrl"
))
{
nowDoc
.
put
(
"pictureUrl"
,
nowDoc
.
getString
(
"pictureUrl"
));
}
}
collection
.
insertOne
(
nowDoc
);
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
View file @
98f0116b
...
...
@@ -373,6 +373,28 @@ public class GatherTimer {
}
/**
* B站排行榜采集
*/
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"30 * * * * ? "
)
public
void
crawlerBilibiliHotSearch
(){
Date
date
=
DateUtils
.
getMillSecondTime
(
new
Date
());
List
<
HotSearchList
>
list
=
BililiCrawler
.
getBilibiliHotSearch
(
date
);
TipsUtils
.
addHotList
(
HotSearchType
.
B
站排行榜
.
name
(),
list
);
}
/**
* B站热搜采集
*/
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"30 * * * * ? "
)
public
void
crawlerBilibiliHotData
()
{
Date
date
=
DateUtils
.
getMillSecondTime
(
new
Date
());
List
<
HotSearchList
>
list
=
BililiCrawler
.
getBiHotData
(
date
);
TipsUtils
.
addHotList
(
HotSearchType
.
B
站热搜
.
name
(),
list
);
}
/**
* 微博超话的采集
*/
@Async
(
value
=
"myScheduler"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment