Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
ad09acf2
Commit
ad09acf2
authored
Sep 06, 2022
by
leiliangliang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
抖音娱乐榜,微博品牌(9个子榜单)榜上线,更新微博pc端游客cookie
parent
466ef41d
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
195 additions
and
6 deletions
+195
-6
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
+10
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+10
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
+47
-3
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiBoBrandCrawler.java
+0
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+1
-1
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+19
-2
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+108
-0
No files found.
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
View file @
ad09acf2
...
...
@@ -150,6 +150,16 @@ public class HotSearchList implements Serializable{
*/
private
Double
exponent
;
/**
* Read count (used by the Weibo brand boards).
*/
private
Long
readCount
;
/**
* Discussion count (used by the Weibo brand boards).
*/
private
Long
discussCount
;
public
HotSearchList
(){}
public
HotSearchList
(
String
url
,
String
name
,
Long
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
String
icon
,
Date
date
){
...
...
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
ad09acf2
...
...
@@ -34,5 +34,15 @@ public enum HotSearchType {
微视热榜
,
微博出圈榜
,
微博视频榜
,
抖音娱乐榜
,
微博品牌总榜
,
微博品牌汽车榜
,
微博品牌手机榜
,
微博品牌美妆榜
,
微博品牌奢侈品榜
,
微博品牌食品饮料榜
,
微博品牌家电榜
,
微博品牌服装鞋帽榜
,
微博品牌母婴榜
,
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
View file @
ad09acf2
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.*
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
...
...
@@ -34,6 +32,8 @@ public class DouyinHotSearchCrawler {
public
static
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
/**
* Keywords collected from the Douyin entertainment board.
* getMobileDouyinEntertainmentList adds every parsed keyword here, and
* GatherTimer.updateDouYinEntertainmentUrl copies the contents and then clears it.
* NOTE(review): this is a plain HashSet with no synchronization visible here, while
* both users run from @Async scheduled jobs - confirm this sharing is safe.
*/
public
static
Set
<
String
>
set
=
new
HashSet
<>();
/**
* @Title: getMobileDouyinHotList
* @author hero
...
...
@@ -113,4 +113,48 @@ public class DouyinHotSearchCrawler {
return
resultUrl
;
}
/**
 * Fetches the mobile Douyin entertainment board and converts each entry of
 * {@code data.word_list} into a {@link HotSearchList}.
 *
 * <p>Every keyword that is kept is also added to the static {@code set} field.
 *
 * <p>The method never aborts a whole run because of bad data: a failed request, a
 * body without {@code data.word_list}, and single entries whose {@code word} is blank
 * or whose {@code position} / {@code hot_value} are missing or not numeric are
 * skipped.
 *
 * @param date collection timestamp passed to every created entry
 * @return the parsed entries in response order; empty (never {@code null}) when
 *         nothing usable was returned
 */
public static List<HotSearchList> getMobileDouyinEntertainmentList(Date date) {
    List<HotSearchList> entertainmentList = new ArrayList<>();
    String url = "https://api5-normal-c-lq.amemv.com/aweme/v1/hot/search/list/?board_type=2&board_sub_type=2&version_code=140900";
    Request request = RequestUtils.wrapGet(url);
    Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
    if (response.hasCause()) {
        Throwable cause = response.cause();
        log.debug("获取抖音娱乐榜榜时出现问题:{}", cause);
        return entertainmentList;
    }

    String htmlBody = response.bodyString();
    if (!StringUtils.isNotBlank(htmlBody) || !htmlBody.contains("word_list")) {
        return entertainmentList;
    }

    // "word_list" may appear somewhere in the body without data.word_list existing,
    // so both levels are null-checked before iterating.
    JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
    JSONArray wordList = data == null ? null : data.getJSONArray("word_list");
    if (wordList == null) {
        return entertainmentList;
    }

    for (int i = 0; i < wordList.size(); i++) {
        JSONObject wl = wordList.getJSONObject(i);
        if (wl == null) {
            continue;
        }

        // Keyword
        String word = wl.getString("word");
        if (!StringUtils.isNotBlank(word)) {
            continue;
        }

        Integer position;
        Long hotValue;
        try {
            // Rank
            position = Integer.valueOf(wl.getString("position"));
            // Heat value
            hotValue = Long.valueOf(wl.getString("hot_value"));
        } catch (NumberFormatException e) {
            // A single malformed entry must not discard the rest of the board.
            log.warn("抖音娱乐榜条目的排名或热度值无法解析,已跳过:{}", word);
            continue;
        }

        HotSearchList douyinEntertainment =
                new HotSearchList(null, word, hotValue, position, HotSearchType.抖音娱乐榜.name(), date);
        entertainmentList.add(douyinEntertainment);
        set.add(word);
    }
    return entertainmentList;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiBoBrandCrawler.java
0 → 100644
View file @
ad09acf2
This diff is collapsed.
Click to expand it.
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
ad09acf2
...
...
@@ -505,7 +505,7 @@ public class WeiboHotSearchCrawler {
String
url
=
"https://s.weibo.com/weibo?q="
+
encode
+
"&Refer=top"
;
String
htmlBody
=
null
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"Cookie"
,
"SUB=_2AkM
WEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN
"
);
headerMap
.
put
(
"Cookie"
,
"SUB=_2AkM
UShJMf8NxqwJRmP0RyWvgb4RwwgnEieKiFuOXJRMxHRl-yT92qlQvtRB6P8o8oso9Ew-s6vf16fdCca-Xz6DwwAMH; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFdAobr6HdAbgQQ9vbUQKDx
"
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
View file @
ad09acf2
...
...
@@ -120,7 +120,10 @@ public class HotSearchCacheDAO {
document
.
put
(
"exponent"
,
hotSearch
.
getExponent
());
document
.
put
(
"iconUrl"
,
hotSearch
.
getIconUrl
());
}
if
(
hotSearch
.
getType
().
contains
(
"微博品牌"
))
{
document
.
put
(
"readCount"
,
hotSearch
.
getReadCount
());
document
.
put
(
"discussCount"
,
hotSearch
.
getDiscussCount
());
}
if
(
"微视热榜"
.
equals
(
hotSearch
.
getType
()))
{
document
.
put
(
"iconUrl"
,
hotSearch
.
getIconUrl
());
addAndUpdateData
(
document
,
true
);
...
...
@@ -140,6 +143,10 @@ public class HotSearchCacheDAO {
if
(
"B站排行榜"
.
equals
(
hotSearch
.
getType
()))
{
document
.
remove
(
"downtext"
);
}
if
(
hotSearch
.
getType
().
contains
(
"微博品牌"
))
{
document
.
remove
(
"readCount"
);
document
.
remove
(
"discussCount"
);
}
dataes
.
add
(
document
);
}
return
dataes
;
...
...
@@ -245,6 +252,10 @@ public class HotSearchCacheDAO {
nowDoc
.
put
(
"duration"
,
durationNow
);
nowDoc
.
put
(
"recommend"
,
recommend
);
nowDoc
.
put
(
"riseSpeed"
,
riseSpeed
);
if
(
type
.
contains
(
"微博品牌"
))
{
nowDoc
.
put
(
"readCount"
,
nonNull
(
document
.
get
(
"readCount"
))
?
Long
.
valueOf
(
document
.
get
(
"readCount"
).
toString
())
:
null
);
nowDoc
.
put
(
"discussCount"
,
nonNull
(
document
.
get
(
"discussCount"
))
?
Long
.
valueOf
(
document
.
get
(
"discussCount"
).
toString
())
:
null
);
}
if
(
"微博热搜"
.
equals
(
type
))
{
nowDoc
.
put
(
"realLastRank"
,
realLastRank
);
nowDoc
.
put
(
"realHighestRank"
,
realHighestRank
);
...
...
@@ -320,6 +331,10 @@ public class HotSearchCacheDAO {
nowDoc
.
put
(
"tag"
,
nonNull
(
document
.
get
(
"tag"
))
?
document
.
getString
(
"tag"
)
:
null
);
nowDoc
.
put
(
"downtext"
,
nonNull
(
document
.
get
(
"downtext"
))
?
document
.
getString
(
"downtext"
)
:
null
);
}
if
(
type
.
contains
(
"微博品牌"
))
{
nowDoc
.
put
(
"readCount"
,
nonNull
(
document
.
get
(
"readCount"
))
?
Long
.
valueOf
(
document
.
get
(
"readCount"
).
toString
())
:
null
);
nowDoc
.
put
(
"discussCount"
,
nonNull
(
document
.
get
(
"discussCount"
))
?
Long
.
valueOf
(
document
.
get
(
"discussCount"
).
toString
())
:
null
);
}
if
(
"微博热搜"
.
equals
(
type
))
{
nowDoc
=
WeiboHotSearchCrawler
.
weiboUpdate
(
nowDoc
);
//更新微博话题贡献者,关于功能
...
...
@@ -423,7 +438,9 @@ public class HotSearchCacheDAO {
duration
=
duration
+
30
;
}
else
if
(
"B站综合热门"
.
equals
(
type
))
{
duration
=
duration
+
60
;
}
else
{
}
else
if
(
type
.
contains
(
"微博品牌"
)){
duration
=
duration
+
60
;
}
else
{
duration
=
duration
+
1
;
}
return
duration
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
View file @
ad09acf2
...
...
@@ -696,4 +696,112 @@ public class GatherTimer {
TipsUtils
.
addHotList
(
"微博视频榜"
,
weiBoVideoList
);
log
.
info
(
"微博视频榜采集结束........"
);
}
/**
 * Scheduled collection of the Douyin entertainment board.
 *
 * <p>Triggered by the cron expression below on the {@code myScheduler} executor:
 * crawls the board, logs how many entries came back, and hands them to
 * {@code TipsUtils.addHotList} under the board's name.
 */
@Async("myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerDouYinEntertainment() {
    log.info("抖音娱乐榜开始采集...");

    final Date collectedAt = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> entries =
            DouyinHotSearchCrawler.getMobileDouyinEntertainmentList(collectedAt);

    // A null result is reported as zero collected entries.
    final int entryCount;
    if (entries == null) {
        entryCount = 0;
    } else {
        entryCount = entries.size();
    }
    log.info("{}, 抖音娱乐榜此轮采集到的数据量为:{}", new Date(), entryCount);

    TipsUtils.addHotList(HotSearchType.抖音娱乐榜.name(), entries);
    log.info("抖音娱乐榜采集结束...");
}
/**
 * Scheduled refresh of the links stored for Douyin entertainment board keywords.
 *
 * <p>Copies the keywords currently held in {@code DouyinHotSearchCrawler.set}, clears
 * that set, resolves a URL for each keyword through
 * {@code DouyinHotSearchCrawler.getDouyinUrl}, and stores every non-null result via
 * {@code HotSearchCacheDAO.updateDouyinUrl} under the id {@code <word>_<board name>}.
 */
@Async("myScheduler")
@Scheduled(cron = "0 0/5 * * * ? ")
public void updateDouYinEntertainmentUrl() {
    log.info("抖音娱乐榜链接更新开始...");
    HotSearchCacheDAO cacheDao = new HotSearchCacheDAO();

    // Work on a private copy so the shared set can be emptied right away.
    // NOTE(review): copy and clear are two separate steps on an unsynchronized set;
    // a keyword added in between would be dropped - confirm whether that matters.
    Set<String> pendingWords = new HashSet<>();
    pendingWords.addAll(DouyinHotSearchCrawler.set);
    DouyinHotSearchCrawler.set.clear();

    if (pendingWords.isEmpty()) {
        log.info("抖音娱乐榜链接更新失败,抖音娱乐榜列表获取为空。");
        return;
    }

    final String idSuffix = "_" + HotSearchType.抖音娱乐榜.name();
    for (String word : pendingWords) {
        // NOTE(review): the keyword is appended to the query string as-is; verify
        // that getDouyinUrl (or the HTTP layer) takes care of URL encoding.
        String url = DouyinHotSearchCrawler.getDouyinUrl(
                "https://aweme-hl.snssdk.com/aweme/v1/hot/search/video/list/?hotword=" + word);
        if (url == null) {
            continue;
        }
        Document document = new Document();
        document.put("id", word + idSuffix);
        document.put("url", url);
        cacheDao.updateDouyinUrl(document);
    }
    log.info("抖音娱乐榜链接更新结束");
}
/**
 * Scheduled collection of the nine Weibo brand boards (overall, cars, phones,
 * make-up, luxury, food and drink, home appliances, clothing, mother and infant).
 *
 * <p>Triggered by the cron expression below on the {@code myScheduler} executor. The
 * boards are crawled one after another with the same collection timestamp, and each
 * result is published under its board name.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 0 0/1 * * ? ")
public void crawlerWeiBoBrandTotalList() {
    Date date = DateUtils.getMillSecondTime(new Date());

    log.info("微博品牌总榜采集开始........");
    publishWeiBoBrandBoard(HotSearchType.微博品牌总榜, WeiBoBrandCrawler.weiBoBrandTotalList(date));

    log.info("微博品牌汽车榜采集开始........");
    publishWeiBoBrandBoard(HotSearchType.微博品牌汽车榜, WeiBoBrandCrawler.weiBoBrandCar(date));

    log.info("微博品牌手机榜采集开始........");
    publishWeiBoBrandBoard(HotSearchType.微博品牌手机榜, WeiBoBrandCrawler.weiBoBrandPhone(date));

    log.info("微博品牌美妆榜采集开始........");
    publishWeiBoBrandBoard(HotSearchType.微博品牌美妆榜, WeiBoBrandCrawler.weiBoBrandMakeup(date));

    log.info("微博品牌奢侈品榜采集开始........");
    publishWeiBoBrandBoard(HotSearchType.微博品牌奢侈品榜, WeiBoBrandCrawler.weiBoBrandLuxury(date));

    log.info("微博品牌食品饮料榜采集开始........");
    publishWeiBoBrandBoard(HotSearchType.微博品牌食品饮料榜, WeiBoBrandCrawler.weiBoBrandFood(date));

    log.info("微博品牌家电榜采集开始........");
    publishWeiBoBrandBoard(HotSearchType.微博品牌家电榜, WeiBoBrandCrawler.weiBoBrandHomeAppliance(date));

    log.info("微博品牌服装鞋帽榜采集开始........");
    publishWeiBoBrandBoard(HotSearchType.微博品牌服装鞋帽榜, WeiBoBrandCrawler.weiBoBrandDress(date));

    log.info("微博品牌母婴榜采集开始........");
    publishWeiBoBrandBoard(HotSearchType.微博品牌母婴榜, WeiBoBrandCrawler.weiBoBrandMotherAndInfant(date));
}

/**
 * Logs how many entries one Weibo brand board produced, hands them to
 * {@code TipsUtils.addHotList} under the board's name, and logs the end marker.
 *
 * @param board   the brand board the entries belong to; its enum name is the publish key
 * @param entries the crawled entries; {@code null} is logged as zero and passed through as-is
 */
private void publishWeiBoBrandBoard(HotSearchType board, List<HotSearchList> entries) {
    String boardName = board.name();
    int collected = entries != null ? entries.size() : 0;
    log.info("{}, 此轮{}采集到的数据量为:{}", new Date(), boardName, collected);
    TipsUtils.addHotList(boardName, entries);
    log.info("{}采集结束........", boardName);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment