Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
98a6f728
Commit
98a6f728
authored
Dec 14, 2021
by
chenweitao
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'working' into 'master'
新增B站综合热门采集 See merge request
!162
parents
04c6bdac
310520db
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
203 additions
and
3 deletions
+203
-3
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
+1
-2
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+1
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/BiliComprehensiveHotCrawler.java
+174
-0
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+12
-1
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+15
-0
No files found.
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
View file @
98a6f728
...
@@ -94,7 +94,7 @@ public class HotSearchList implements Serializable{
...
@@ -94,7 +94,7 @@ public class HotSearchList implements Serializable{
private
String
topicResult
;
private
String
topicResult
;
/**
/**
* 观看数(目前近B站排行榜使用)
* 观看数(目前近B站排行榜
及综合热门
使用)
*/
*/
private
Long
view
;
private
Long
view
;
...
@@ -122,7 +122,6 @@ public class HotSearchList implements Serializable{
...
@@ -122,7 +122,6 @@ public class HotSearchList implements Serializable{
* 内容
* 内容
*/
*/
private
String
content
;
private
String
content
;
public
HotSearchList
(){}
public
HotSearchList
(){}
public
HotSearchList
(
String
url
,
String
name
,
Long
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
String
icon
,
Date
date
){
public
HotSearchList
(
String
url
,
String
name
,
Long
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
String
icon
,
Date
date
){
...
...
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
98a6f728
...
@@ -30,5 +30,6 @@ public enum HotSearchType {
...
@@ -30,5 +30,6 @@ public enum HotSearchType {
抖音同城榜
,
抖音同城榜
,
微博娱乐榜
,
微博娱乐榜
,
微博要闻榜
,
微博要闻榜
,
B
站综合热门
,
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/BiliComprehensiveHotCrawler.java
0 → 100644
View file @
98a6f728
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
import
java.text.DateFormat
;
import
java.text.SimpleDateFormat
;
import
java.util.*
;
/**
* @author ll
* @ClassName: BiliComprehensiveHotCrawler
* @Description: B站综合热门
* @date 2021年12月09日 下午14:54:31
*/
@Log4j2
public
class
BiliComprehensiveHotCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @return List<HotSearchList>
* @Title: getBiliComprehensiveHot
* @author ll
* @Description: pc端B站综合热门采集
*/
public
static
List
<
HotSearchList
>
getBiliComprehensiveHot
(
Date
date
)
{
DateFormat
fmt
=
new
SimpleDateFormat
(
"yyyy-MM-dd HH:mm:ss"
);
//十次链接存list集合
List
<
String
>
urlList
=
new
ArrayList
<>();
for
(
int
j
=
1
;
j
<=
10
;
j
++)
{
String
url
=
"https://api.bilibili.com/x/web-interface/popular?ps=20&pn="
+
j
;
urlList
.
add
(
url
);
}
String
htmlBody
=
null
;
//循环六次拿完整数据 六次都失败则返回空
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
int
b
=
0
;
//数据集存入result集合
List
<
HotSearchList
>
result
=
new
ArrayList
();
for
(
int
i
=
0
;
i
<
urlList
.
size
();
i
++)
{
Request
request
=
RequestUtils
.
wrapGet
(
urlList
.
get
(
i
));
//发送请求每次获取20条数据
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
fmt
.
format
(
date
)+
":第"
+
i
+
1
+
"次请求解析B站综合热门时出现连接失败"
,
e
);
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
try
{
JSONArray
biliList
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONArray
(
"list"
);
List
<
HotSearchList
>
list
=
parsBiliComprehensiveHot
(
date
,
biliList
,
i
);
result
.
addAll
(
list
);
}
catch
(
Exception
e
)
{
log
.
error
(
"解析B站综合热门时出现解析错误,数据不是json结构"
,
e
);
}
}
else
{
log
.
info
(
fmt
.
format
(
date
)+
":第"
+
i
+
1
+
"次解析B站综合热门时出现解析错误,页面结构有问题"
);
--
i
;
b
++;
if
(
b
==
4
){
log
.
info
(
fmt
.
format
(
date
)+
":分钟数据舍弃"
);
return
Collections
.
emptyList
();
}
}
}
//返回采集到的200条数据
return
result
;
}
return
Collections
.
emptyList
();
}
//B站综合热门
public
static
List
<
HotSearchList
>
parsBiliComprehensiveHot
(
Date
date
,
JSONArray
list
,
int
a
)
{
//定义rank变量
int
rank
;
switch
(
a
)
{
case
0
:
rank
=
0
;
break
;
case
1
:
rank
=
20
;
break
;
case
2
:
rank
=
40
;
break
;
case
3
:
rank
=
60
;
break
;
case
4
:
rank
=
80
;
break
;
case
5
:
rank
=
100
;
break
;
case
6
:
rank
=
120
;
break
;
case
7
:
rank
=
140
;
break
;
case
8
:
rank
=
160
;
break
;
case
9
:
rank
=
180
;
break
;
default
:
rank
=
0
;
}
List
<
HotSearchList
>
biliComprehensiveHotList
=
new
ArrayList
();
try
{
if
(
Objects
.
nonNull
(
list
)
&&
!
list
.
isEmpty
())
{
boolean
hot
=
true
;
for
(
int
i
=
0
;
i
<
list
.
size
();
i
++)
{
JSONObject
cardInfo
=
list
.
getJSONObject
(
i
);
//获取标题
String
name
=
cardInfo
.
getString
(
"title"
);
//获取图片链接
String
pictureUrl
=
cardInfo
.
getString
(
"pic"
);
//获取链接
String
url
=
cardInfo
.
getString
(
"short_link"
);
//排名自增
rank
++;
//获取主持人
String
downtext
=
cardInfo
.
getJSONObject
(
"owner"
).
getString
(
"name"
);
//获取播放量
Long
view
=
Long
.
valueOf
(
cardInfo
.
getJSONObject
(
"stat"
).
getString
(
"view"
))
;
//获取讨论量
Long
commentCount
=
Long
.
valueOf
(
cardInfo
.
getJSONObject
(
"stat"
).
getString
(
"danmaku"
));
//获取标签
String
label
=
cardInfo
.
getJSONObject
(
"rcmd_reason"
).
getString
(
"content"
);
//默认热度值为null
Long
hotCount
=
null
;
HotSearchList
hotSearch
=
new
HotSearchList
(
url
,
name
,
hotCount
,
rank
,
HotSearchType
.
B
站综合热门
.
name
(),
date
);
//增加主持人
hotSearch
.
setDowntext
(
downtext
);
//增加图片链接
hotSearch
.
setPictureUrl
(
pictureUrl
);
//增加播放量
hotSearch
.
setView
(
view
);
//增加讨论量
hotSearch
.
setCommentCount
(
commentCount
);
//增加标签
if
(
Objects
.
nonNull
(
label
))
{
hotSearch
.
setHeatLabel
(
label
);
}
biliComprehensiveHotList
.
add
(
hotSearch
);
}
}
else
{
log
.
info
(
"list 数据结构为:{}"
,
list
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析B站综合热门时出现解析错误"
,
e
);
}
return
biliComprehensiveHotList
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
View file @
98a6f728
...
@@ -97,6 +97,12 @@ public class HotSearchCacheDAO {
...
@@ -97,6 +97,12 @@ public class HotSearchCacheDAO {
document
.
put
(
"barrage"
,
hotSearch
.
getBarrage
());
document
.
put
(
"barrage"
,
hotSearch
.
getBarrage
());
document
.
put
(
"pictureUrl"
,
hotSearch
.
getPictureUrl
());
document
.
put
(
"pictureUrl"
,
hotSearch
.
getPictureUrl
());
}
}
if
(
"B站综合热门"
.
equals
(
hotSearch
.
getType
()))
{
document
.
put
(
"heatLabel"
,
hotSearch
.
getHeatLabel
());
document
.
put
(
"view"
,
hotSearch
.
getView
());
document
.
put
(
"pictureUrl"
,
hotSearch
.
getPictureUrl
());
document
.
put
(
"commentCount"
,
hotSearch
.
getCommentCount
());
}
addAndUpdateData
(
document
);
addAndUpdateData
(
document
);
if
(
"百度热搜"
.
equals
(
hotSearch
.
getType
()))
{
if
(
"百度热搜"
.
equals
(
hotSearch
.
getType
()))
{
document
.
remove
(
"topic_lead"
);
document
.
remove
(
"topic_lead"
);
...
@@ -269,6 +275,9 @@ public class HotSearchCacheDAO {
...
@@ -269,6 +275,9 @@ public class HotSearchCacheDAO {
if
(
picTypes
.
contains
(
type
))
{
if
(
picTypes
.
contains
(
type
))
{
nowDoc
.
put
(
"pictureUrl"
,
pictureUrl
);
nowDoc
.
put
(
"pictureUrl"
,
pictureUrl
);
}
}
if
(
"B站综合热门"
.
equals
(
type
))
{
nowDoc
.
put
(
"pictureUrl"
,
pictureUrl
);
}
if
(
"微博热搜"
.
equals
(
type
))
{
if
(
"微博热搜"
.
equals
(
type
))
{
nowDoc
=
WeiboHotSearchCrawler
.
weiboUpdate
(
nowDoc
);
nowDoc
=
WeiboHotSearchCrawler
.
weiboUpdate
(
nowDoc
);
//更新微博话题贡献者,关于功能
//更新微博话题贡献者,关于功能
...
@@ -368,7 +377,9 @@ public class HotSearchCacheDAO {
...
@@ -368,7 +377,9 @@ public class HotSearchCacheDAO {
// }
// }
if
(
"脉脉热榜"
.
equals
(
type
))
{
if
(
"脉脉热榜"
.
equals
(
type
))
{
duration
=
duration
+
30
;
duration
=
duration
+
30
;
}
else
{
}
else
if
(
"B站综合热门"
.
equals
(
type
)){
duration
=
duration
+
60
;
}
else
{
duration
=
duration
+
1
;
duration
=
duration
+
1
;
}
}
return
duration
;
return
duration
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
View file @
98a6f728
...
@@ -610,4 +610,19 @@ public class GatherTimer {
...
@@ -610,4 +610,19 @@ public class GatherTimer {
WeiBoSearchBoxHotWordsCrawler
.
weiBoSearchBoxHotWords
(
date
);
WeiBoSearchBoxHotWordsCrawler
.
weiBoSearchBoxHotWords
(
date
);
log
.
info
(
"微博搜索框热词采集结束........"
);
log
.
info
(
"微博搜索框热词采集结束........"
);
}
}
/**
* B站综合热门的采集 每小时一次
*/
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"0 0 0/1 * * ? "
)
public
void
crawlerBiliComprehensiveHot
(){
log
.
info
(
"B站综合热门开始采集..."
);
Date
date
=
DateUtils
.
getMillSecondTime
(
new
Date
());
List
<
HotSearchList
>
BiliComprehensiveHotList
=
BiliComprehensiveHotCrawler
.
getBiliComprehensiveHot
(
date
);
log
.
info
(
"{}, B站综合热门此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
BiliComprehensiveHotList
!=
null
?
BiliComprehensiveHotList
.
size
()
:
0
));
TipsUtils
.
addHotList
(
HotSearchType
.
B
站综合热门
.
name
(),
BiliComprehensiveHotList
);
log
.
info
(
"B站综合热门采集结束..."
);
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment