Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
ad09acf2
Commit
ad09acf2
authored
Sep 06, 2022
by
leiliangliang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
抖音娱乐榜,微博品牌(9个子榜单)榜上线,更新微博pc端游客cookie
parent
466ef41d
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
589 additions
and
6 deletions
+589
-6
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
+10
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+10
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
+47
-3
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiBoBrandCrawler.java
+394
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+1
-1
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+19
-2
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+108
-0
No files found.
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
View file @
ad09acf2
...
...
@@ -150,6 +150,16 @@ public class HotSearchList implements Serializable{
*/
private
Double
exponent
;
/**
* Read (view) count; used by the Weibo brand boards.
*/
private
Long
readCount
;
/**
* Discussion count; used by the Weibo brand boards.
*/
private
Long
discussCount
;
public
HotSearchList
(){}
public
HotSearchList
(
String
url
,
String
name
,
Long
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
String
icon
,
Date
date
){
...
...
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
ad09acf2
...
...
@@ -34,5 +34,15 @@ public enum HotSearchType {
微视热榜
,
微博出圈榜
,
微博视频榜
,
抖音娱乐榜
,
微博品牌总榜
,
微博品牌汽车榜
,
微博品牌手机榜
,
微博品牌美妆榜
,
微博品牌奢侈品榜
,
微博品牌食品饮料榜
,
微博品牌家电榜
,
微博品牌服装鞋帽榜
,
微博品牌母婴榜
,
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
View file @
ad09acf2
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.*
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
...
...
@@ -34,6 +32,8 @@ public class DouyinHotSearchCrawler {
// Shared, publicly mutable list of hot-search entries; nothing in this excerpt shows who fills or reads it.
public
static
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
// Words collected by getMobileDouyinEntertainmentList, which adds every parsed word here.
// GatherTimer.updateDouYinEntertainmentUrl copies this set and then clears it.
// NOTE(review): this is a plain HashSet touched from two scheduled jobs - confirm they can never run concurrently.
public
static
Set
<
String
>
set
=
new
HashSet
<>();
/**
* @Title: getMobileDouyinHotList
* @author hero
...
...
@@ -113,4 +113,48 @@ public class DouyinHotSearchCrawler {
return
resultUrl
;
}
/**
 * Fetches the mobile Douyin entertainment board and converts it into hot-search entries.
 *
 * <p>Every accepted keyword is also added to the shared {@code set}, so that its link can be
 * resolved later by a separate job.
 *
 * <p>Malformed input never aborts the whole crawl: an unparsable payload yields an empty
 * list, and a single entry with a missing or non-numeric field is skipped while the
 * remaining entries are still returned.
 *
 * @author hero
 * @param date collection timestamp stamped on every returned entry
 * @return the parsed entries in response order; empty (never {@code null}) when the request
 *         fails or the payload contains no usable {@code word_list}
 */
public static List<HotSearchList> getMobileDouyinEntertainmentList(Date date) {
    List<HotSearchList> entertainmentList = new ArrayList<>();
    String url = "https://api5-normal-c-lq.amemv.com/aweme/v1/hot/search/list/?board_type=2&board_sub_type=2&version_code=140900";
    Request request = RequestUtils.wrapGet(url);
    Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
    if (response.hasCause()) {
        Throwable cause = response.cause();
        log.debug("获取抖音娱乐榜榜时出现问题:{}", cause);
        return entertainmentList;
    }

    String htmlBody = response.bodyString();
    if (!StringUtils.isNotBlank(htmlBody) || !htmlBody.contains("word_list")) {
        return entertainmentList;
    }

    // The substring check above does not prove the body is JSON, nor that
    // data.word_list exists, so resolve the array defensively.
    JSONArray wordList;
    try {
        JSONObject root = JSONObject.parseObject(htmlBody);
        JSONObject data = root == null ? null : root.getJSONObject("data");
        wordList = data == null ? null : data.getJSONArray("word_list");
    } catch (RuntimeException e) {
        log.warn("抖音娱乐榜返回内容无法解析为json", e);
        return entertainmentList;
    }
    if (wordList == null) {
        return entertainmentList;
    }

    for (int i = 0; i < wordList.size(); i++) {
        try {
            JSONObject wl = wordList.getJSONObject(i);
            if (wl == null) {
                continue;
            }
            // keyword
            String word = wl.getString("word");
            // rank
            String positionStr = wl.getString("position");
            // heat value
            String hotValueStr = wl.getString("hot_value");
            if (!StringUtils.isNotBlank(word) || positionStr == null || hotValueStr == null) {
                log.warn("抖音娱乐榜第{}条数据缺少必要字段,已跳过", i);
                continue;
            }
            Integer position = Integer.valueOf(positionStr);
            Long hotValue = Long.valueOf(hotValueStr);

            HotSearchList douyinEntertainment =
                    new HotSearchList(null, word, hotValue, position, HotSearchType.抖音娱乐榜.name(), date);
            entertainmentList.add(douyinEntertainment);
            set.add(word);
        } catch (RuntimeException e) {
            // One bad entry (non-numeric rank/heat, wrong node type) must not discard the rest.
            log.warn("抖音娱乐榜第" + i + "条数据解析失败,已跳过", e);
        }
    }
    return entertainmentList;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiBoBrandCrawler.java
0 → 100644
View file @
ad09acf2
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;

import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @author ll
* @ClassName: WeiBoBrandCrawler
* @Description: 微博品牌全榜采集
* @date 2022年2月22日 上午09:54:31
*/
@Log4j2
public
class
WeiBoBrandCrawler
{
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
build
();
/**
* @return void 返回类型
* @Title: weiBoBrandTotalList
* @author ll
* @Description: 微博品牌总榜采集
*/
public
static
List
<
HotSearchList
>
weiBoBrandTotalList
(
Date
date
)
{
List
<
HotSearchList
>
result
=
new
ArrayList
();
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
String
url
=
"https://api.weibo.cn/2/guest/page?page="
+
i
+
"&containerid=231648_-_10_-_brand_v2_-_%E5%93%81%E7%89%8C%E8%AF%9D%E9%A2%98%E6%A6%9C"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌总榜时出现连接失败"
,
cause
);
}
else
{
String
htmlBody
=
response
.
bodyString
();
if
(
htmlBody
.
contains
(
"cards"
)
&&
Objects
.
nonNull
(
JSONObject
.
parseObject
(
htmlBody
).
get
(
"cards"
)))
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
if
(
cards
.
size
()
>
0
)
{
JSONArray
cardGroup
=
cards
.
getJSONObject
(
0
).
getJSONArray
(
"card_group"
);
List
<
HotSearchList
>
list
=
ansysData
(
cardGroup
,
date
,
HotSearchType
.
微博品牌总榜
.
name
());
result
.
addAll
(
list
);
}
break
;
}
}
}
}
return
result
;
}
/**
* @return void 返回类型
* @Title: weiBoBrandCar
* @author ll
* @Description: 微博品牌汽车榜采集
*/
public
static
List
<
HotSearchList
>
weiBoBrandCar
(
Date
date
)
{
List
<
HotSearchList
>
result
=
new
ArrayList
();
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
String
url
=
"https://api.weibo.cn/2/guest/page?page="
+
i
+
"&containerid=231648_-_10_-_brand_car_v2"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌汽车榜时出现连接失败"
,
cause
);
}
else
{
String
htmlBody
=
response
.
bodyString
();
if
(
htmlBody
.
contains
(
"cards"
)
&&
Objects
.
nonNull
(
JSONObject
.
parseObject
(
htmlBody
).
get
(
"cards"
)))
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
if
(
cards
.
size
()
>
0
)
{
JSONArray
cardGroup
=
cards
.
getJSONObject
(
0
).
getJSONArray
(
"card_group"
);
List
<
HotSearchList
>
list
=
ansysData
(
cardGroup
,
date
,
HotSearchType
.
微博品牌汽车榜
.
name
());
result
.
addAll
(
list
);
}
break
;
}
}
}
}
return
result
;
}
/**
* @return void 返回类型
* @Title: weiBoBrandPhone
* @author ll
* @Description: 微博品牌手机榜采集
*/
public
static
List
<
HotSearchList
>
weiBoBrandPhone
(
Date
date
)
{
List
<
HotSearchList
>
result
=
new
ArrayList
();
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
String
url
=
"https://api.weibo.cn/2/guest/page?page="
+
i
+
"&containerid=231648_-_10_-_brand_phone_v2"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌手机榜时出现连接失败"
,
cause
);
}
else
{
String
htmlBody
=
response
.
bodyString
();
if
(
htmlBody
.
contains
(
"cards"
)
&&
Objects
.
nonNull
(
JSONObject
.
parseObject
(
htmlBody
).
get
(
"cards"
)))
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
if
(
cards
.
size
()
>
0
)
{
JSONArray
cardGroup
=
cards
.
getJSONObject
(
0
).
getJSONArray
(
"card_group"
);
List
<
HotSearchList
>
list
=
ansysData
(
cardGroup
,
date
,
HotSearchType
.
微博品牌手机榜
.
name
());
result
.
addAll
(
list
);
}
break
;
}
}
}
}
return
result
;
}
/**
* @return void 返回类型
* @Title: weiBoBrandMakeup
* @author ll
* @Description: 微博品牌美妆榜采集
*/
public
static
List
<
HotSearchList
>
weiBoBrandMakeup
(
Date
date
)
{
List
<
HotSearchList
>
result
=
new
ArrayList
();
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
String
url
=
"https://api.weibo.cn/2/guest/page?page="
+
i
+
"&containerid=231648_-_10_-_brand_makeup_v2"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌美妆榜时出现连接失败"
,
cause
);
}
else
{
String
htmlBody
=
response
.
bodyString
();
if
(
htmlBody
.
contains
(
"cards"
)
&&
Objects
.
nonNull
(
JSONObject
.
parseObject
(
htmlBody
).
get
(
"cards"
)))
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
if
(
cards
.
size
()
>
0
)
{
JSONArray
cardGroup
=
cards
.
getJSONObject
(
0
).
getJSONArray
(
"card_group"
);
List
<
HotSearchList
>
list
=
ansysData
(
cardGroup
,
date
,
HotSearchType
.
微博品牌美妆榜
.
name
());
result
.
addAll
(
list
);
}
break
;
}
}
}
}
return
result
;
}
/**
* @return void 返回类型
* @Title: weiBoBrandLuxury
* @author ll
* @Description: 微博品牌奢侈品榜采集
*/
public
static
List
<
HotSearchList
>
weiBoBrandLuxury
(
Date
date
)
{
List
<
HotSearchList
>
result
=
new
ArrayList
();
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
String
url
=
"https://api.weibo.cn/2/guest/page?page="
+
i
+
"&containerid=231648_-_10_-_brand_luxury_v2"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌奢侈品榜时出现连接失败"
,
cause
);
}
else
{
String
htmlBody
=
response
.
bodyString
();
if
(
htmlBody
.
contains
(
"cards"
)
&&
Objects
.
nonNull
(
JSONObject
.
parseObject
(
htmlBody
).
get
(
"cards"
)))
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
if
(
cards
.
size
()
>
0
)
{
JSONArray
cardGroup
=
cards
.
getJSONObject
(
0
).
getJSONArray
(
"card_group"
);
List
<
HotSearchList
>
list
=
ansysData
(
cardGroup
,
date
,
HotSearchType
.
微博品牌奢侈品榜
.
name
());
result
.
addAll
(
list
);
}
break
;
}
}
}
}
return
result
;
}
/**
* @return void 返回类型
* @Title: weiBoBrandFood
* @author ll
* @Description: 微博品牌食品饮料榜采集
*/
public
static
List
<
HotSearchList
>
weiBoBrandFood
(
Date
date
)
{
List
<
HotSearchList
>
result
=
new
ArrayList
();
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
String
url
=
"https://api.weibo.cn/2/guest/page?page="
+
i
+
"&containerid=231648_-_10_-_brand_food_v2"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌食品饮料榜时出现连接失败"
,
cause
);
}
else
{
String
htmlBody
=
response
.
bodyString
();
if
(
htmlBody
.
contains
(
"cards"
)
&&
Objects
.
nonNull
(
JSONObject
.
parseObject
(
htmlBody
).
get
(
"cards"
)))
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
if
(
cards
.
size
()
>
0
)
{
JSONArray
cardGroup
=
cards
.
getJSONObject
(
0
).
getJSONArray
(
"card_group"
);
List
<
HotSearchList
>
list
=
ansysData
(
cardGroup
,
date
,
HotSearchType
.
微博品牌食品饮料榜
.
name
());
result
.
addAll
(
list
);
}
break
;
}
}
}
}
return
result
;
}
/**
* @return void 返回类型
* @Title: weiBoBrandHomeAppliance
* @author ll
* @Description: 微博品牌家电榜采集
*/
public
static
List
<
HotSearchList
>
weiBoBrandHomeAppliance
(
Date
date
)
{
List
<
HotSearchList
>
result
=
new
ArrayList
();
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
String
url
=
"https://api.weibo.cn/2/guest/page?page="
+
i
+
"&containerid=231648_-_10_-_brand_happ_v2"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌家电榜时出现连接失败"
,
cause
);
}
else
{
String
htmlBody
=
response
.
bodyString
();
if
(
htmlBody
.
contains
(
"cards"
)
&&
Objects
.
nonNull
(
JSONObject
.
parseObject
(
htmlBody
).
get
(
"cards"
)))
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
if
(
cards
.
size
()
>
0
)
{
JSONArray
cardGroup
=
cards
.
getJSONObject
(
0
).
getJSONArray
(
"card_group"
);
List
<
HotSearchList
>
list
=
ansysData
(
cardGroup
,
date
,
HotSearchType
.
微博品牌家电榜
.
name
());
result
.
addAll
(
list
);
}
break
;
}
}
}
}
return
result
;
}
/**
* @return void 返回类型
* @Title: weiBoBrandDress
* @author ll
* @Description: 微博品牌服装鞋帽榜采集
*/
public
static
List
<
HotSearchList
>
weiBoBrandDress
(
Date
date
)
{
List
<
HotSearchList
>
result
=
new
ArrayList
();
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
String
url
=
"https://api.weibo.cn/2/guest/page?page="
+
i
+
"&containerid=231648_-_10_-_brand_clothing_v2"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌服装鞋帽榜时出现连接失败"
,
cause
);
}
else
{
String
htmlBody
=
response
.
bodyString
();
if
(
htmlBody
.
contains
(
"cards"
)
&&
Objects
.
nonNull
(
JSONObject
.
parseObject
(
htmlBody
).
get
(
"cards"
)))
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
if
(
cards
.
size
()
>
0
)
{
JSONArray
cardGroup
=
cards
.
getJSONObject
(
0
).
getJSONArray
(
"card_group"
);
List
<
HotSearchList
>
list
=
ansysData
(
cardGroup
,
date
,
HotSearchType
.
微博品牌服装鞋帽榜
.
name
());
result
.
addAll
(
list
);
}
break
;
}
}
}
}
return
result
;
}
/**
* @return void 返回类型
* @Title: weiBoBrandMotherAndInfant
* @author ll
* @Description: 微博品牌母婴榜采集
*/
public
static
List
<
HotSearchList
>
weiBoBrandMotherAndInfant
(
Date
date
)
{
List
<
HotSearchList
>
result
=
new
ArrayList
();
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
String
url
=
"https://api.weibo.cn/2/guest/page?page="
+
i
+
"&containerid=231648_-_10_-_brand_mb_v2"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌服装鞋帽榜时出现连接失败"
,
cause
);
}
else
{
String
htmlBody
=
response
.
bodyString
();
if
(
htmlBody
.
contains
(
"cards"
)
&&
Objects
.
nonNull
(
JSONObject
.
parseObject
(
htmlBody
).
get
(
"cards"
)))
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
if
(
cards
.
size
()
>
0
)
{
JSONArray
cardGroup
=
cards
.
getJSONObject
(
0
).
getJSONArray
(
"card_group"
);
List
<
HotSearchList
>
list
=
ansysData
(
cardGroup
,
date
,
HotSearchType
.
微博品牌母婴榜
.
name
());
result
.
addAll
(
list
);
}
break
;
}
}
}
}
return
result
;
}
/**
* 解析数据
*
* @param cardGroup
* @param date
* @param type
* @return List<HotSearchList>
*/
private
static
List
<
HotSearchList
>
ansysData
(
JSONArray
cardGroup
,
Date
date
,
String
type
)
{
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
try
{
for
(
Object
card
:
cardGroup
)
{
JSONObject
json
=
(
JSONObject
)
JSONObject
.
toJSON
(
card
);
//获取标题
String
name
=
json
.
getString
(
"title_sub"
).
replaceAll
(
"#"
,
""
);
//获取排名
Integer
rank
=
json
.
getInteger
(
"top_mark_text"
);
//获取链接
String
scheme
=
json
.
getString
(
"scheme"
).
split
(
"[?]"
)[
1
];
String
url
=
"https://s.weibo.com/weibo?"
+
scheme
;
String
desc
=
json
.
getString
(
"desc"
);
String
[]
dis
=
desc
.
split
(
"讨论"
);
Long
discussCount
=
null
;
Long
readCount
=
null
;
if
(
dis
.
length
==
2
)
{
discussCount
=
getCount
(
dis
[
0
]);
readCount
=
getCount
(
dis
[
1
]);
}
HotSearchList
hotSearchList
=
new
HotSearchList
(
url
,
name
,
null
,
rank
,
type
,
date
);
hotSearchList
.
setReadCount
(
readCount
);
hotSearchList
.
setDiscussCount
(
discussCount
);
list
.
add
(
hotSearchList
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析"
+
type
+
"时出现解析错误,数据不是json结构"
,
e
);
}
return
list
;
}
private
static
Long
getCount
(
String
string
)
{
Long
count
=
null
;
if
(
string
.
contains
(
"万"
))
{
String
[]
split
=
string
.
split
(
"万"
);
Double
aDouble
=
Double
.
valueOf
(
split
[
0
])
*
10000
;
count
=
new
Double
(
aDouble
).
longValue
();
}
else
if
(
string
.
contains
(
"亿"
))
{
String
[]
split
=
string
.
split
(
"亿"
);
Double
aDouble
=
Double
.
valueOf
(
split
[
0
])
*
100000000
;
count
=
new
Double
(
aDouble
).
longValue
();
}
else
{
count
=
Long
.
valueOf
(
string
);
}
return
count
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
ad09acf2
...
...
@@ -505,7 +505,7 @@ public class WeiboHotSearchCrawler {
String
url
=
"https://s.weibo.com/weibo?q="
+
encode
+
"&Refer=top"
;
String
htmlBody
=
null
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"Cookie"
,
"SUB=_2AkM
WEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN
"
);
headerMap
.
put
(
"Cookie"
,
"SUB=_2AkM
UShJMf8NxqwJRmP0RyWvgb4RwwgnEieKiFuOXJRMxHRl-yT92qlQvtRB6P8o8oso9Ew-s6vf16fdCca-Xz6DwwAMH; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFdAobr6HdAbgQQ9vbUQKDx
"
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
View file @
ad09acf2
...
...
@@ -120,7 +120,10 @@ public class HotSearchCacheDAO {
document
.
put
(
"exponent"
,
hotSearch
.
getExponent
());
document
.
put
(
"iconUrl"
,
hotSearch
.
getIconUrl
());
}
if
(
hotSearch
.
getType
().
contains
(
"微博品牌"
))
{
document
.
put
(
"readCount"
,
hotSearch
.
getReadCount
());
document
.
put
(
"discussCount"
,
hotSearch
.
getDiscussCount
());
}
if
(
"微视热榜"
.
equals
(
hotSearch
.
getType
()))
{
document
.
put
(
"iconUrl"
,
hotSearch
.
getIconUrl
());
addAndUpdateData
(
document
,
true
);
...
...
@@ -140,6 +143,10 @@ public class HotSearchCacheDAO {
if
(
"B站排行榜"
.
equals
(
hotSearch
.
getType
()))
{
document
.
remove
(
"downtext"
);
}
if
(
hotSearch
.
getType
().
contains
(
"微博品牌"
))
{
document
.
remove
(
"readCount"
);
document
.
remove
(
"discussCount"
);
}
dataes
.
add
(
document
);
}
return
dataes
;
...
...
@@ -245,6 +252,10 @@ public class HotSearchCacheDAO {
nowDoc
.
put
(
"duration"
,
durationNow
);
nowDoc
.
put
(
"recommend"
,
recommend
);
nowDoc
.
put
(
"riseSpeed"
,
riseSpeed
);
if
(
type
.
contains
(
"微博品牌"
))
{
nowDoc
.
put
(
"readCount"
,
nonNull
(
document
.
get
(
"readCount"
))
?
Long
.
valueOf
(
document
.
get
(
"readCount"
).
toString
())
:
null
);
nowDoc
.
put
(
"discussCount"
,
nonNull
(
document
.
get
(
"discussCount"
))
?
Long
.
valueOf
(
document
.
get
(
"discussCount"
).
toString
())
:
null
);
}
if
(
"微博热搜"
.
equals
(
type
))
{
nowDoc
.
put
(
"realLastRank"
,
realLastRank
);
nowDoc
.
put
(
"realHighestRank"
,
realHighestRank
);
...
...
@@ -320,6 +331,10 @@ public class HotSearchCacheDAO {
nowDoc
.
put
(
"tag"
,
nonNull
(
document
.
get
(
"tag"
))
?
document
.
getString
(
"tag"
)
:
null
);
nowDoc
.
put
(
"downtext"
,
nonNull
(
document
.
get
(
"downtext"
))
?
document
.
getString
(
"downtext"
)
:
null
);
}
if
(
type
.
contains
(
"微博品牌"
))
{
nowDoc
.
put
(
"readCount"
,
nonNull
(
document
.
get
(
"readCount"
))
?
Long
.
valueOf
(
document
.
get
(
"readCount"
).
toString
())
:
null
);
nowDoc
.
put
(
"discussCount"
,
nonNull
(
document
.
get
(
"discussCount"
))
?
Long
.
valueOf
(
document
.
get
(
"discussCount"
).
toString
())
:
null
);
}
if
(
"微博热搜"
.
equals
(
type
))
{
nowDoc
=
WeiboHotSearchCrawler
.
weiboUpdate
(
nowDoc
);
//更新微博话题贡献者,关于功能
...
...
@@ -423,7 +438,9 @@ public class HotSearchCacheDAO {
duration
=
duration
+
30
;
}
else
if
(
"B站综合热门"
.
equals
(
type
))
{
duration
=
duration
+
60
;
}
else
{
}
else
if
(
type
.
contains
(
"微博品牌"
)){
duration
=
duration
+
60
;
}
else
{
duration
=
duration
+
1
;
}
return
duration
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
View file @
ad09acf2
...
...
@@ -696,4 +696,112 @@ public class GatherTimer {
TipsUtils
.
addHotList
(
"微博视频榜"
,
weiBoVideoList
);
log
.
info
(
"微博视频榜采集结束........"
);
}
/**
 * Scheduled collection of the Douyin entertainment board.
 *
 * <p>Crawls the board, logs how many entries were collected and hands the result to
 * {@code TipsUtils.addHotList} under the board's type name.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerDouYinEntertainment() {
    log.info("抖音娱乐榜开始采集...");

    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> entries = DouyinHotSearchCrawler.getMobileDouyinEntertainmentList(crawlTime);

    // A null result is reported as zero collected entries.
    int collected = 0;
    if (entries != null) {
        collected = entries.size();
    }
    log.info("{}, 抖音娱乐榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(collected));

    TipsUtils.addHotList(HotSearchType.抖音娱乐榜.name(), entries);
    log.info("抖音娱乐榜采集结束...");
}
/**
 * Scheduled refresh of the links of Douyin entertainment board entries.
 *
 * <p>Takes a snapshot of the words accumulated in {@code DouyinHotSearchCrawler.set},
 * clears that set, resolves a URL for every word and stores each resolved URL through
 * {@code HotSearchCacheDAO.updateDouyinUrl}. Words whose URL resolves to {@code null}
 * are left untouched.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 0/5 * * * ? ")
public void updateDouYinEntertainmentUrl() {
    log.info("抖音娱乐榜链接更新开始...");
    final HotSearchCacheDAO cacheDao = new HotSearchCacheDAO();

    // Copy first, then clear the shared set, so this run works on a private snapshot.
    final Set<String> sharedWords = DouyinHotSearchCrawler.set;
    final Set<String> pendingWords = new HashSet<>();
    pendingWords.addAll(sharedWords);
    DouyinHotSearchCrawler.set.clear();

    if (pendingWords.isEmpty()) {
        log.info("抖音娱乐榜链接更新失败,抖音娱乐榜列表获取为空。");
        return;
    }

    for (String hotWord : pendingWords) {
        final String documentId = hotWord + "_" + HotSearchType.抖音娱乐榜.name();
        final String resolvedUrl = DouyinHotSearchCrawler.getDouyinUrl(
                "https://aweme-hl.snssdk.com/aweme/v1/hot/search/video/list/?hotword=" + hotWord);
        if (resolvedUrl == null) {
            continue;
        }
        final Document update = new Document();
        update.put("id", documentId);
        update.put("url", resolvedUrl);
        cacheDao.updateDouyinUrl(update);
    }
    log.info("抖音娱乐榜链接更新结束");
}
/**
 * Scheduled collection of the Weibo brand boards (cron: at the top of every hour).
 *
 * <p>Collects the overall board and the eight category boards one after another, all
 * stamped with the same collection time. For each board the start is logged, the board is
 * crawled, and the result is published through {@code publishWeiBoBrandBoard}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 0 0/1 * * ? ")
public void crawlerWeiBoBrandTotalList() {
    log.info("微博品牌总榜采集开始........");
    Date date = DateUtils.getMillSecondTime(new Date());
    publishWeiBoBrandBoard(HotSearchType.微博品牌总榜.name(), WeiBoBrandCrawler.weiBoBrandTotalList(date));

    log.info("微博品牌汽车榜采集开始........");
    publishWeiBoBrandBoard(HotSearchType.微博品牌汽车榜.name(), WeiBoBrandCrawler.weiBoBrandCar(date));

    log.info("微博品牌手机榜采集开始........");
    publishWeiBoBrandBoard(HotSearchType.微博品牌手机榜.name(), WeiBoBrandCrawler.weiBoBrandPhone(date));

    log.info("微博品牌美妆榜采集开始........");
    publishWeiBoBrandBoard(HotSearchType.微博品牌美妆榜.name(), WeiBoBrandCrawler.weiBoBrandMakeup(date));

    log.info("微博品牌奢侈品榜采集开始........");
    publishWeiBoBrandBoard(HotSearchType.微博品牌奢侈品榜.name(), WeiBoBrandCrawler.weiBoBrandLuxury(date));

    log.info("微博品牌食品饮料榜采集开始........");
    publishWeiBoBrandBoard(HotSearchType.微博品牌食品饮料榜.name(), WeiBoBrandCrawler.weiBoBrandFood(date));

    log.info("微博品牌家电榜采集开始........");
    publishWeiBoBrandBoard(HotSearchType.微博品牌家电榜.name(), WeiBoBrandCrawler.weiBoBrandHomeAppliance(date));

    log.info("微博品牌服装鞋帽榜采集开始........");
    publishWeiBoBrandBoard(HotSearchType.微博品牌服装鞋帽榜.name(), WeiBoBrandCrawler.weiBoBrandDress(date));

    log.info("微博品牌母婴榜采集开始........");
    publishWeiBoBrandBoard(HotSearchType.微博品牌母婴榜.name(), WeiBoBrandCrawler.weiBoBrandMotherAndInfant(date));
}

/**
 * Logs the size of one collected brand board, publishes it and logs the end of the board.
 *
 * @param boardName board name; used as the hot-list key and inside the log messages
 * @param boardData collected entries, may be {@code null} (logged as size 0 and passed on unchanged)
 */
private void publishWeiBoBrandBoard(String boardName, List<HotSearchList> boardData) {
    int collected = boardData != null ? boardData.size() : 0;
    log.info("{}, 此轮{}采集到的数据量为:{}", new Date(), boardName, collected);
    TipsUtils.addHotList(boardName, boardData);
    log.info("{}采集结束........", boardName);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment