Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
1fd52a37
Commit
1fd52a37
authored
Jan 10, 2022
by
leiliangliang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
新增B站标签采集和知乎热搜标签采集
parent
d59803e9
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
214 additions
and
57 deletions
+214
-57
pom.xml
+6
-1
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
+11
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/BililiCrawler.java
+97
-24
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
+82
-31
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+18
-0
No files found.
pom.xml
View file @
1fd52a37
...
...
@@ -48,7 +48,12 @@
<artifactId>
crawler-core
</artifactId>
<version>
0.6.7.4-SNAPSHOT
</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.conscrypt/conscrypt-openjdk-uber -->
<dependency>
<groupId>
org.conscrypt
</groupId>
<artifactId>
conscrypt-openjdk-uber
</artifactId>
<version>
2.5.2
</version>
</dependency>
<!-- 日志依赖 -->
<!-- https://mvnrepository.com/artifact/org.apache.logging.log4j/log4j-core -->
<dependency>
...
...
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
View file @
1fd52a37
...
...
@@ -94,7 +94,7 @@ public class HotSearchList implements Serializable{
private
String
topicResult
;
/**
* 观看数(目前近B站排行榜及综合热门使用)
* 观看数(目前近B站排行榜及综合热门
,知乎浏览量
使用)
*/
private
Long
view
;
...
...
@@ -122,6 +122,16 @@ public class HotSearchList implements Serializable{
* 内容
*/
private
String
content
;
/**
* 粉丝数(目前仅B站排行榜和知乎热搜使用)
*/
private
Long
fans
;
/**
* 标签(目前仅B站排行榜和知乎热搜使用)
*/
private
String
tag
;
public
HotSearchList
(){}
public
HotSearchList
(
String
url
,
String
name
,
Long
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
String
icon
,
Date
date
){
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/BililiCrawler.java
View file @
1fd52a37
...
...
@@ -7,19 +7,21 @@ import com.zhiwei.crawler.core.proxy.ProxyHolder;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.concurrent.ExecutorService
;
import
java.util.concurrent.Executors
;
@Log4j2
public
class
BililiCrawler
{
...
...
@@ -32,6 +34,7 @@ public class BililiCrawler {
*/
public
static
List
<
HotSearchList
>
getBilibiliHotSearch
(
Date
date
){
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
ExecutorService
executor
=
Executors
.
newFixedThreadPool
(
10
);
log
.
info
(
"bilibili排行榜开始采集..."
);
JSONArray
dataJson
=
null
;
String
htmlBody
=
null
;
...
...
@@ -43,38 +46,108 @@ public class BililiCrawler {
}
catch
(
IOException
e
)
{
log
.
error
(
"B站排行榜页面连接失败"
,
e
.
fillInStackTrace
());
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
dataJson
=
jsonObject
.
getJSONArray
(
"list"
);
if
(
dataJson
!=
null
)
{
for
(
int
i
=
0
;
i
<
dataJson
.
size
();
i
++)
{
JSONObject
data
=
dataJson
.
getJSONObject
(
i
);
int
rank
=
i
+
1
;
String
name
=
data
.
getString
(
"title"
);
String
topicLead
=
data
.
getString
(
"desc"
);
long
count
=
data
.
getLongValue
(
"score"
);
String
bvid
=
data
.
getString
(
"bvid"
);
String
pic
=
data
.
getString
(
"pic"
);
String
bUrl
=
"https://www.bilibili.com/video/"
+
bvid
;
Long
view
=
null
;
Long
barrage
=
null
;
if
(
data
.
containsKey
(
"stat"
))
{
JSONObject
stat
=
data
.
getJSONObject
(
"stat"
);
view
=
stat
.
getLongValue
(
"view"
);
barrage
=
stat
.
getLongValue
(
"danmaku"
);
try
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
dataJson
=
jsonObject
.
getJSONArray
(
"list"
);
if
(
dataJson
!=
null
)
{
for
(
int
i
=
0
;
i
<
dataJson
.
size
();
i
++)
{
JSONObject
data
=
dataJson
.
getJSONObject
(
i
);
int
rank
=
i
+
1
;
String
name
=
data
.
getString
(
"title"
);
String
topicLead
=
data
.
getString
(
"desc"
);
long
count
=
data
.
getLongValue
(
"score"
);
String
bvid
=
data
.
getString
(
"bvid"
);
String
pic
=
data
.
getString
(
"pic"
);
String
bUrl
=
"https://www.bilibili.com/video/"
+
bvid
;
Long
view
=
null
;
Long
barrage
=
null
;
if
(
data
.
containsKey
(
"stat"
))
{
JSONObject
stat
=
data
.
getJSONObject
(
"stat"
);
view
=
stat
.
getLongValue
(
"view"
);
barrage
=
stat
.
getLongValue
(
"danmaku"
);
}
//获取主持人
String
downtext
=
null
;
if
(
data
.
containsKey
(
"owner"
))
{
JSONObject
stat
=
data
.
getJSONObject
(
"owner"
);
downtext
=
stat
.
getString
(
"name"
);
}
HotSearchList
hotSearchList
=
new
HotSearchList
(
bUrl
,
name
,
topicLead
,
count
,
null
,
date
,
rank
,
HotSearchType
.
B
站排行榜
.
name
(),
view
,
barrage
,
pic
);
hotSearchList
.
setDowntext
(
downtext
);
executor
.
execute
(
new
Runnable
()
{
@Override
public
void
run
()
{
HotSearchList
tag
=
getTag
(
bUrl
,
hotSearchList
);
hotSearchLists
.
add
(
tag
);
}
});
}
HotSearchList
hotSearchList
=
new
HotSearchList
(
bUrl
,
name
,
topicLead
,
count
,
null
,
date
,
rank
,
HotSearchType
.
B
站排行榜
.
name
(),
view
,
barrage
,
pic
);
hotSearchLists
.
add
(
hotSearchList
);
//进行多线程任务是否执行完毕 如到达指定时间也结束循环
executor
.
shutdown
();
long
time
=
0L
;
while
(
true
){
if
(
executor
.
isTerminated
()){
break
;
}
try
{
Thread
.
sleep
(
3000
);
time
=
3000
+
time
;
if
(
time
>
50000
){
break
;
}
}
catch
(
InterruptedException
e
)
{
e
.
printStackTrace
();
}
}
}
}
}
catch
(
Exception
e
)
{
log
.
error
(
"B站排行榜页面解析异常:{}"
,
e
);
}
ZhiWeiTools
.
sleep
(
3000L
);
}
log
.
info
(
"{}, B站排行榜此轮采集到的数据量为:{}"
,
new
Date
(),
hotSearchLists
!=
null
?
hotSearchLists
.
size
()
:
0
);
log
.
info
(
"B站排行榜采集结束"
);
return
hotSearchLists
;
}
/**
 * Fetches a single Bilibili video page and enriches the given entry with the
 * video's tags and, when the page exposes it, the uploader's follower count.
 *
 * <p>The method is best-effort: any failure (network, parsing, unexpected page
 * layout) is logged and the entry is returned with whatever fields were set
 * before the failure.
 *
 * @param url           video page URL to request
 * @param hotSearchList entry to enrich; mutated in place
 * @return the same {@code hotSearchList} instance that was passed in
 */
private static HotSearchList getTag(String url, HotSearchList hotSearchList) {
    Request request = RequestUtils.wrapGet(url);
    try {
        // NOTE(review): this is a JVM-wide property and it also enables the legacy
        // SSLv3/TLSv1 protocols; kept as-is because other requests may rely on it —
        // confirm whether it is still required.
        System.setProperty("https.protocols", "TLSv1,TLSv1.1,TLSv1.2,SSLv3");
        // try-with-resources closes the response (and releases its connection)
        // on every path, including parse failures below.
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            String htmlBody = response.body().string();
            // Only parse pages that contain the "v-wrap" marker.
            if (htmlBody != null && htmlBody.contains("v-wrap")) {
                Document document = Jsoup.parse(htmlBody);

                // Tags: the space-separated text of all li.tag elements is rewritten
                // into the form `tag1;`tag2;...
                String tags = "`" + document.select("li.tag").text() + ";";
                String tag = tags.replaceAll(" ", ";`");
                hotSearchList.setTag(tag);

                // Follower count: only attempted when the page carries the uploader block.
                if (htmlBody.contains("v_upinfo")) {
                    String text = document.select("div.follow-btn").select("span").text();
                    // The third space-separated token is taken as the follower figure;
                    // a shorter text throws and is handled by the catch below.
                    String fan = text.split(" ")[2];
                    Long fanCount;
                    if (fan.contains("万")) {
                        // A figure suffixed with "万" is multiplied by 10000.
                        double dou = Double.parseDouble(fan.replaceAll("万", " "));
                        fanCount = (long) (dou * 10000);
                    } else {
                        fanCount = Long.valueOf(fan);
                    }
                    hotSearchList.setFans(fanCount);
                }
            }
        }
    } catch (Exception e) {
        log.error("单条B站排行榜数据页面连接失败:{}", e);
    }
    return hotSearchList;
}
/**
* B站热搜的采集
* @param date
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
View file @
1fd52a37
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.*
;
import
com.zhiwei.crawler.core.config.SslProvider
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
...
...
@@ -20,6 +18,12 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
static
java
.
util
.
Objects
.
nonNull
;
/**
* @ClassName: ZhihuHotCrawler
...
...
@@ -30,7 +34,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
@Log4j2
public
class
ZhihuHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
sslProvider
(
SslProvider
.
CONSCRYPT
).
retryTimes
(
3
).
build
();
/**
* @Title: getZhihuHotList
* @author hero
...
...
@@ -100,37 +104,84 @@ public class ZhihuHotSearchCrawler {
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
return
list
;
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"author"
))
{
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
dataJson
=
topSearch
.
getJSONArray
(
"data"
);
String
link
=
null
;
String
displayQuery
=
null
;
Long
hotCount
=
null
;
String
hotText
=
null
;
for
(
int
i
=
0
;
i
<
dataJson
.
size
();
i
++)
{
JSONObject
data
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"target"
);
displayQuery
=
data
.
getString
(
"title"
);
link
=
"https://www.zhihu.com/question/"
+
data
.
getLongValue
(
"id"
);
hotText
=
dataJson
.
getJSONObject
(
i
).
getString
(
"detail_text"
);
try
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"author"
))
{
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
dataJson
=
topSearch
.
getJSONArray
(
"data"
);
String
link
=
null
;
String
displayQuery
=
null
;
Long
hotCount
=
null
;
String
hotText
=
null
;
for
(
int
i
=
0
;
i
<
dataJson
.
size
();
i
++)
{
JSONObject
data
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"target"
);
displayQuery
=
data
.
getString
(
"title"
);
link
=
"https://www.zhihu.com/question/"
+
data
.
getLongValue
(
"id"
);
hotText
=
dataJson
.
getJSONObject
(
i
).
getString
(
"detail_text"
);
//计算热度
try
{
if
(
hotText
.
contains
(
"万"
))
{
hotText
=
hotText
.
replaceAll
(
"万.*"
,
""
).
trim
();
hotCount
=
(
long
)
(
Double
.
parseDouble
(
hotText
)
*
10000
);
}
else
if
(
hotText
.
contains
(
"亿"
))
{
hotText
=
hotText
.
replaceAll
(
"亿.*"
,
""
).
trim
();
hotCount
=
(
long
)
(
Double
.
parseDouble
(
hotText
)
*
100000000
);
}
else
{
hotCount
=
Long
.
getLong
(
hotText
);
//计算热度
try
{
if
(
hotText
.
contains
(
"万"
))
{
hotText
=
hotText
.
replaceAll
(
"万.*"
,
""
).
trim
();
hotCount
=
(
long
)
(
Double
.
parseDouble
(
hotText
)
*
10000
);
}
else
if
(
hotText
.
contains
(
"亿"
))
{
hotText
=
hotText
.
replaceAll
(
"亿.*"
,
""
).
trim
();
hotCount
=
(
long
)
(
Double
.
parseDouble
(
hotText
)
*
100000000
);
}
else
{
hotCount
=
Long
.
getLong
(
hotText
);
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
org
.
bson
.
Document
doc
=
getTag
(
link
);
String
tog
=
nonNull
(
doc
.
get
(
"tag"
))
?
doc
.
getString
(
"tag"
)
:
null
;
Long
view
=
nonNull
(
doc
.
get
(
"view"
))
?
Long
.
valueOf
(
doc
.
get
(
"view"
).
toString
())
:
null
;
Long
fans
=
nonNull
(
doc
.
get
(
"fans"
))
?
Long
.
valueOf
(
doc
.
get
(
"fans"
).
toString
())
:
null
;
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
hotCount
,
i
+
1
,
HotSearchType
.
知乎热搜
.
name
(),
date
);
zhihu
.
setFans
(
fans
);
zhihu
.
setView
(
view
);
zhihu
.
setTag
(
tog
);
list
.
add
(
zhihu
);
}
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
hotCount
,
i
+
1
,
HotSearchType
.
知乎热搜
.
name
(),
date
);
list
.
add
(
zhihu
);
}
}
catch
(
Exception
e
)
{
log
.
info
(
"知乎热搜解析异常"
,
e
);
}
return
list
;
}
/**
 * Requests the desktop (PC) question page and extracts its tags, follower
 * count and view count.
 *
 * <p>The returned document always contains the keys {@code "tag"},
 * {@code "view"} and {@code "fans"}; a key stays {@code null} when the page
 * could not be fetched or the value could not be parsed. Failures are logged
 * and never propagated.
 *
 * @param url question page URL to request
 * @return a document holding the {@code tag}, {@code view} and {@code fans} values
 */
private static org.bson.Document getTag(String url) {
    org.bson.Document result = new org.bson.Document();
    result.put("tag", null);
    // view count
    result.put("view", null);
    // follower count
    result.put("fans", null);

    Map<String, String> headers = HeaderTool.getCommonHead();
    headers.put("cookie", "_xsrf=7NFWM5qBcOutfs8MaW7bhQQH65t3Xia4");
    Request request = RequestUtils.wrapGet(url, headers);

    try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
        String htmlBody = response.body().string();
        // Nothing to extract unless the page contains the question header marker.
        if (htmlBody == null || !htmlBody.contains("QuestionHeader")) {
            return result;
        }
        Document page = Jsoup.parse(htmlBody);

        // Tags: space-separated text of all div.Tag elements, rewritten as `tag1;`tag2;...
        String rawTags = document(page, "div.Tag");
        String label = ("`" + rawTags + ";").replaceAll(" ", ";`");
        result.put("tag", label.trim());

        // The number board text is split on spaces: first token is stored as
        // "fans", second as "view"; thousands separators are stripped first.
        String boardText = page.select("div.NumberBoard-itemInner").select("strong").text();
        String[] numbers = boardText.split(" ");
        String fansText = numbers[0].replaceAll(",", "").trim();
        result.put("fans", Long.valueOf(fansText));
        String viewText = numbers[1].replaceAll(",", "").trim();
        result.put("view", Long.valueOf(viewText));
    } catch (Exception e) {
        log.error("单条知乎热搜数据页面连接失败", e);
    }
    return result;
}

/** Returns the combined text of all elements matching {@code cssQuery} in {@code page}. */
private static String document(Document page, String cssQuery) {
    return page.select(cssQuery).text();
}
}
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
View file @
1fd52a37
...
...
@@ -96,6 +96,9 @@ public class HotSearchCacheDAO {
document
.
put
(
"view"
,
hotSearch
.
getView
());
document
.
put
(
"barrage"
,
hotSearch
.
getBarrage
());
document
.
put
(
"pictureUrl"
,
hotSearch
.
getPictureUrl
());
document
.
put
(
"tag"
,
hotSearch
.
getTag
());
document
.
put
(
"downtext"
,
hotSearch
.
getDowntext
());
document
.
put
(
"fans"
,
hotSearch
.
getFans
());
}
if
(
"B站综合热门"
.
equals
(
hotSearch
.
getType
()))
{
document
.
put
(
"heatLabel"
,
hotSearch
.
getHeatLabel
());
...
...
@@ -103,6 +106,11 @@ public class HotSearchCacheDAO {
document
.
put
(
"pictureUrl"
,
hotSearch
.
getPictureUrl
());
document
.
put
(
"commentCount"
,
hotSearch
.
getCommentCount
());
}
if
(
"知乎热搜"
.
equals
(
hotSearch
.
getType
()))
{
document
.
put
(
"tag"
,
hotSearch
.
getTag
());
document
.
put
(
"view"
,
hotSearch
.
getView
());
document
.
put
(
"fans"
,
hotSearch
.
getFans
());
}
addAndUpdateData
(
document
);
if
(
"百度热搜"
.
equals
(
hotSearch
.
getType
()))
{
document
.
remove
(
"topic_lead"
);
...
...
@@ -113,6 +121,9 @@ public class HotSearchCacheDAO {
if
(
"网易热榜"
.
equals
(
hotSearch
.
getType
()))
{
document
.
remove
(
"downtext"
);
}
if
(
"B站排行榜"
.
equals
(
hotSearch
.
getType
()))
{
document
.
remove
(
"downtext"
);
}
dataes
.
add
(
document
);
}
return
dataes
;
...
...
@@ -278,6 +289,13 @@ public class HotSearchCacheDAO {
if
(
"B站综合热门"
.
equals
(
type
))
{
nowDoc
.
put
(
"pictureUrl"
,
pictureUrl
);
}
if
(
"知乎热搜"
.
equals
(
type
))
{
nowDoc
.
put
(
"tag"
,
nonNull
(
document
.
get
(
"tag"
))
?
document
.
getString
(
"tag"
)
:
null
);
}
if
(
"B站排行榜"
.
equals
(
type
))
{
nowDoc
.
put
(
"tag"
,
nonNull
(
document
.
get
(
"tag"
))
?
document
.
getString
(
"tag"
)
:
null
);
nowDoc
.
put
(
"downtext"
,
nonNull
(
document
.
get
(
"downtext"
))
?
document
.
getString
(
"downtext"
)
:
null
);
}
if
(
"微博热搜"
.
equals
(
type
))
{
nowDoc
=
WeiboHotSearchCrawler
.
weiboUpdate
(
nowDoc
);
//更新微博话题贡献者,关于功能
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment