Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
3e5c72ea
Commit
3e5c72ea
authored
Sep 30, 2021
by
chenweitao
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'working' into 'master'
新增微博要闻榜采集功能 See merge request
!136
parents
172e5b3c
e07e6507
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
215 additions
and
1 deletions
+215
-1
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
+5
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+1
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboNewsCrawler.java
+184
-0
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+12
-1
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+13
-0
No files found.
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
View file @
3e5c72ea
...
@@ -113,6 +113,11 @@ public class HotSearchList implements Serializable{
...
@@ -113,6 +113,11 @@ public class HotSearchList implements Serializable{
**/
**/
private
String
rankPic
;
private
String
rankPic
;
/**
* 主持人
*/
private
String
downtext
;
public
HotSearchList
(){}
public
HotSearchList
(){}
public
HotSearchList
(
String
url
,
String
name
,
Long
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
String
icon
,
Date
date
){
public
HotSearchList
(
String
url
,
String
name
,
Long
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
String
icon
,
Date
date
){
...
...
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
3e5c72ea
...
@@ -29,5 +29,6 @@ public enum HotSearchType {
...
@@ -29,5 +29,6 @@ public enum HotSearchType {
淘宝热搜
,
淘宝热搜
,
抖音同城榜
,
抖音同城榜
,
微博娱乐榜
,
微博娱乐榜
,
微博要闻榜
,
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboNewsCrawler.java
0 → 100644
View file @
3e5c72ea
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
import
java.util.*
;
/**
* @author ll
* @ClassName: WeiboNewsCrawler
* @Description: 微博要闻榜
* @date 2021年9月27日 上午10:54:31
*/
@Log4j2
public
class
WeiboNewsCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @return void 返回类型
* @Title: weiboNewsByPhone
* @author ll
* @Description: 手机端微博要闻榜采集
*/
public
static
List
<
HotSearchList
>
weiboNewsByPhone
(
Date
date
)
{
String
url1
=
"https://api.weibo.cn/2/page?st_bottom_bar_new_style_enable=1&networktype=wifi&extparam=seat%3D1%26dgr%3D0%26pos%3D0_0%26position%3D%257B%2522objectid%2522%253A%25228008633020000000000%2522%252C%2522name%2522%253A%2522%255Cu5b81%255Cu6ce2%2522%257D%26mi_cid%3D100103%26c_type%3D30%26filter_type%3Drealtimehot%26cate%3D10103%26refresh_type%3D0%26display_time%3D1632707109%26pre_seqid%3D803934167&page_reform_enable=1&launchid=10000365--x&page_interrupt_enable=1&orifid=231619&uicode=10000011&ul_hid=893ae4f7-7b63-459e-a622-454a6fa3542c&ul_sid=893ae4f7-7b63-459e-a622-454a6fa3542c&moduleID=708&just_followed=false&wb_version=4880&lcardid=hot_search&c=android&s=f735389f&ft=0&ua=Xiaomi-Redmi%208__weibo__11.3.0__android__android9&wm=20005_0002&aid=01AxK-lU0KWwz9mhMfL6gTb37upA4rmFhPxq5T1hrsYVnDdks.&fid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&v_f=2&v_p=87&from=10B3095010&gsid=_2A25MVVJCDeRxGeNI7VMV9izPwjSIHXVtQ-KKrDV6PUJbkdCOLWz8kWpNSF_8k5iEXT9MqlN5YIgRREu9j71HIlCa&imsi=&lang=zh_CN&lfid=231619&page=1&skin=default&count=20&oldwm=20005_0002&sflag=1&oriuicode=10000010&containerid=231648_-_3&ignore_inturrpted_error=true&no_location_permission=1&luicode=10000010&android_id=0febc80e083662a7&header_skin_enable=0&ul_ctime=1632709238222&cum=2682A02C"
;
String
url2
=
"https://api.weibo.cn/2/page?st_bottom_bar_new_style_enable=1&networktype=wifi&extparam=seat%3D1%26dgr%3D0%26pos%3D0_0%26position%3D%257B%2522objectid%2522%253A%25228008633020000000000%2522%252C%2522name%2522%253A%2522%255Cu5b81%255Cu6ce2%2522%257D%26mi_cid%3D100103%26c_type%3D30%26filter_type%3Drealtimehot%26cate%3D10103%26refresh_type%3D0%26display_time%3D1632707109%26pre_seqid%3D803934167&page_reform_enable=1&launchid=10000365--x&page_interrupt_enable=1&orifid=231619&uicode=10000011&ul_hid=893ae4f7-7b63-459e-a622-454a6fa3542c&ul_sid=893ae4f7-7b63-459e-a622-454a6fa3542c&moduleID=708&just_followed=false&wb_version=4880&lcardid=hot_search&c=android&s=f735389f&ft=0&ua=Xiaomi-Redmi%208__weibo__11.3.0__android__android9&wm=20005_0002&aid=01AxK-lU0KWwz9mhMfL6gTb37upA4rmFhPxq5T1hrsYVnDdks.&fid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&v_f=2&v_p=87&from=10B3095010&gsid=_2A25MVVJCDeRxGeNI7VMV9izPwjSIHXVtQ-KKrDV6PUJbkdCOLWz8kWpNSF_8k5iEXT9MqlN5YIgRREu9j71HIlCa&imsi=&lang=zh_CN&lfid=231619&page=2&skin=default&count=20&oldwm=20005_0002&sflag=1&oriuicode=10000010&containerid=231648_-_3&ignore_inturrpted_error=true&no_location_permission=1&luicode=10000010&android_id=0febc80e083662a7&header_skin_enable=0&ul_ctime=1632709278776&cum=C4386412"
;
String
url3
=
"https://api.weibo.cn/2/page?st_bottom_bar_new_style_enable=1&networktype=wifi&extparam=seat%3D1%26dgr%3D0%26pos%3D0_0%26position%3D%257B%2522objectid%2522%253A%25228008633020000000000%2522%252C%2522name%2522%253A%2522%255Cu5b81%255Cu6ce2%2522%257D%26mi_cid%3D100103%26c_type%3D30%26filter_type%3Drealtimehot%26cate%3D10103%26refresh_type%3D0%26display_time%3D1632707109%26pre_seqid%3D803934167&page_reform_enable=1&launchid=10000365--x&page_interrupt_enable=1&orifid=231619&uicode=10000011&ul_hid=893ae4f7-7b63-459e-a622-454a6fa3542c&ul_sid=893ae4f7-7b63-459e-a622-454a6fa3542c&moduleID=708&just_followed=false&wb_version=4880&lcardid=hot_search&c=android&s=f735389f&ft=0&ua=Xiaomi-Redmi%208__weibo__11.3.0__android__android9&wm=20005_0002&aid=01AxK-lU0KWwz9mhMfL6gTb37upA4rmFhPxq5T1hrsYVnDdks.&fid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&v_f=2&v_p=87&from=10B3095010&gsid=_2A25MVVJCDeRxGeNI7VMV9izPwjSIHXVtQ-KKrDV6PUJbkdCOLWz8kWpNSF_8k5iEXT9MqlN5YIgRREu9j71HIlCa&imsi=&lang=zh_CN&lfid=231619&page=3&skin=default&count=20&oldwm=20005_0002&sflag=1&oriuicode=10000010&containerid=231648_-_3&ignore_inturrpted_error=true&no_location_permission=1&luicode=10000010&android_id=0febc80e083662a7&header_skin_enable=0&ul_ctime=1632709335385&cum=E51D64AB"
;
String
htmlBody
=
null
;
Request
request1
=
RequestUtils
.
wrapGet
(
url1
);
Request
request2
=
RequestUtils
.
wrapGet
(
url2
);
Request
request3
=
RequestUtils
.
wrapGet
(
url3
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
List
<
HotSearchList
>
result
=
new
ArrayList
();
//发送第一次请求获取前20条数据
try
(
Response
response
=
httpBoot
.
syncCall
(
request1
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"第一次请求解析微博要闻榜时出现连接失败"
,
e
);
continue
;
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
try
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
int
rank
=
0
;
List
<
HotSearchList
>
list
=
parsWeiboNews
(
date
,
cards
,
rank
);
result
.
addAll
(
list
);
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博要闻榜时出现解析错误,数据不是json结构"
,
e
);
}
}
else
{
log
.
info
(
"第一次解析微博要闻榜时出现解析错误,页面结构有问题"
);
continue
;
}
//发送第二次请求获取中间20条数据
try
(
Response
response
=
httpBoot
.
syncCall
(
request2
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"第二次请求解析微博要闻榜时出现连接失败"
,
e
);
continue
;
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
try
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
int
rank
=
20
;
List
<
HotSearchList
>
list
=
parsWeiboNews
(
date
,
cards
,
rank
);
result
.
addAll
(
list
);
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博要闻榜时出现解析错误,数据不是json结构"
,
e
);
}
}
else
{
log
.
info
(
"第二次解析微博要闻榜时出现解析错误,页面结构有问题"
);
continue
;
}
//发送第三次请求获取最后10条数据
try
(
Response
response
=
httpBoot
.
syncCall
(
request3
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"第三次请求解析微博要闻榜时出现连接失败"
,
e
);
continue
;
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
try
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
int
rank
=
40
;
List
<
HotSearchList
>
list
=
parsWeiboNews
(
date
,
cards
,
rank
);
result
.
addAll
(
list
);
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博要闻榜时出现解析错误,数据不是json结构"
,
e
);
}
}
else
{
log
.
info
(
"第三次解析微博要闻榜时出现解析错误,页面结构有问题"
);
continue
;
}
return
result
;
}
return
Collections
.
emptyList
();
}
//解析微博要闻榜
public
static
List
<
HotSearchList
>
parsWeiboNews
(
Date
date
,
JSONArray
cards
,
int
rank
)
{
List
<
HotSearchList
>
weiBoNewsList
=
new
ArrayList
();
try
{
JSONObject
card
=
cards
.
getJSONObject
(
0
);
JSONArray
cardGroup
=
card
.
getJSONArray
(
"card_group"
);
if
(
Objects
.
nonNull
(
cardGroup
)
&&
!
cardGroup
.
isEmpty
())
{
boolean
hot
=
true
;
for
(
int
i
=
0
;
i
<
cardGroup
.
size
();
i
++)
{
JSONObject
cardInfo
=
cardGroup
.
getJSONObject
(
i
);
//获取标题
String
title
=
cardInfo
.
getString
(
"title_sub"
);
String
name
=
title
.
replaceAll
(
"#"
,
""
);
//获取热搜类型
String
iconUrl
=
cardInfo
.
getString
(
"title_flag_pic"
);
String
icon
=
null
;
if
(
StringUtils
.
isNotBlank
(
iconUrl
))
{
icon
=
iconUrl
.
split
(
"card8_"
)[
1
].
split
(
".png"
)[
0
];
}
//获取链接
String
id
=
cardInfo
.
getString
(
"scheme"
);
String
ul
=
"https://m.weibo.cn/search?"
+
id
.
split
(
"[?]"
)[
1
];
//排名自增
rank
++;
//获取主持人及阅读量
String
desc
=
cardInfo
.
getString
(
"desc"
);
Long
commentCount
=
null
;
String
downtext
=
null
;
if
(
Objects
.
nonNull
(
desc
))
{
if
(
desc
.
split
(
"[|]"
).
length
>
1
){
//获取主持人
downtext
=
desc
.
split
(
"[|]"
)[
1
].
replace
(
"@"
,
""
).
trim
();
String
read
=
desc
.
split
(
"[|]"
)[
0
];
if
(
read
.
contains
(
"万阅读"
))
{
Double
num
=
Double
.
valueOf
(
read
.
split
(
"万"
)[
0
])
*
10000
;
commentCount
=
new
Double
(
num
).
longValue
();
}
else
if
(
read
.
contains
(
"亿阅读"
))
{
Double
num
=
Double
.
valueOf
(
read
.
split
(
"亿"
)[
0
])
*
100000000
;
commentCount
=
new
Double
(
num
).
longValue
();
}
else
{
commentCount
=
Long
.
valueOf
(
read
.
split
(
"阅读"
)[
0
]);
}
}
}
//默认热度值为零
Long
hotCount
=
0L
;
HotSearchList
hotSearch
=
new
HotSearchList
(
ul
,
name
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微博要闻榜
.
name
(),
icon
,
date
);
//增加主持人
if
(
Objects
.
nonNull
(
downtext
))
{
hotSearch
.
setDowntext
(
downtext
);
}
//增加阅读量
hotSearch
.
setCommentCount
(
commentCount
);
//增加热搜类型链接
if
(
Objects
.
nonNull
(
iconUrl
))
{
hotSearch
.
setIconUrl
(
iconUrl
);
}
weiBoNewsList
.
add
(
hotSearch
);
}
}
else
{
log
.
info
(
"card 数据结构为:{}"
,
card
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博要闻榜时出现解析错误"
,
e
);
}
return
weiBoNewsList
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
View file @
3e5c72ea
...
@@ -63,6 +63,11 @@ public class HotSearchCacheDAO {
...
@@ -63,6 +63,11 @@ public class HotSearchCacheDAO {
if
(
"虎嗅热文推荐"
.
equals
(
hotSearch
.
getType
())){
if
(
"虎嗅热文推荐"
.
equals
(
hotSearch
.
getType
())){
document
.
put
(
"comment_count"
,
hotSearch
.
getCommentCount
());
document
.
put
(
"comment_count"
,
hotSearch
.
getCommentCount
());
}
}
if
(
"微博要闻榜"
.
equals
(
hotSearch
.
getType
())){
document
.
put
(
"comment_count"
,
hotSearch
.
getCommentCount
());
document
.
put
(
"iconUrl"
,
hotSearch
.
getIconUrl
());
document
.
put
(
"downtext"
,
hotSearch
.
getDowntext
());
}
if
(
"百度热搜"
.
equals
(
hotSearch
.
getType
())){
if
(
"百度热搜"
.
equals
(
hotSearch
.
getType
())){
document
.
put
(
"topic_lead"
,
hotSearch
.
getTopicLead
());
document
.
put
(
"topic_lead"
,
hotSearch
.
getTopicLead
());
}
}
...
@@ -80,6 +85,9 @@ public class HotSearchCacheDAO {
...
@@ -80,6 +85,9 @@ public class HotSearchCacheDAO {
if
(
"百度热搜"
.
equals
(
hotSearch
.
getType
())){
if
(
"百度热搜"
.
equals
(
hotSearch
.
getType
())){
document
.
remove
(
"topic_lead"
);
document
.
remove
(
"topic_lead"
);
}
}
if
(
"微博要闻榜"
.
equals
(
hotSearch
.
getType
())){
document
.
remove
(
"downtext"
);
}
dataes
.
add
(
document
);
dataes
.
add
(
document
);
});
});
return
dataes
;
return
dataes
;
...
@@ -210,7 +218,10 @@ public class HotSearchCacheDAO {
...
@@ -210,7 +218,10 @@ public class HotSearchCacheDAO {
if
(
"虎嗅热文推荐"
.
equals
(
type
)){
if
(
"虎嗅热文推荐"
.
equals
(
type
)){
nowDoc
.
put
(
"comment_count"
,
document
.
getLong
(
"comment_count"
));
nowDoc
.
put
(
"comment_count"
,
document
.
getLong
(
"comment_count"
));
}
}
if
(
"微博要闻榜"
.
equals
(
type
)){
nowDoc
.
put
(
"downtext"
,
document
.
getString
(
"downtext"
));
nowDoc
.
put
(
"comment_count"
,
document
.
getLong
(
"comment_count"
));
}
if
(
topicResult
!=
null
){
if
(
topicResult
!=
null
){
nowDoc
.
put
(
"topicResult"
,
topicResult
);
nowDoc
.
put
(
"topicResult"
,
topicResult
);
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
View file @
3e5c72ea
...
@@ -588,4 +588,17 @@ public class GatherTimer {
...
@@ -588,4 +588,17 @@ public class GatherTimer {
TipsUtils
.
addHotList
(
HotSearchType
.
微博娱乐榜
.
name
(),
weiBoEntertainmentList
);
TipsUtils
.
addHotList
(
HotSearchType
.
微博娱乐榜
.
name
(),
weiBoEntertainmentList
);
logger
.
info
(
"微博娱乐榜采集结束..."
);
logger
.
info
(
"微博娱乐榜采集结束..."
);
}
}
/**
 * Scheduled collection of the Weibo news hot list (微博要闻榜).
 *
 * <p>Runs asynchronously on the "myScheduler" executor; the cron expression fires at second 0
 * of every minute. The crawl result is handed to {@code TipsUtils.addHotList} under the
 * {@code 微博要闻榜} type.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeiBoNews() {
    logger.info("微博要闻榜开始采集...");
    Date date = DateUtils.getMillSecondTime(new Date());
    List<HotSearchList> weiboNewsList = WeiboNewsCrawler.weiboNewsByPhone(date);
    // Null-safe size for the log line only; the list itself is passed on unchanged.
    int collected = weiboNewsList != null ? weiboNewsList.size() : 0;
    logger.info("{}, 微博要闻榜此轮采集到的数据量为:{}", new Date(), collected);
    TipsUtils.addHotList(HotSearchType.微博要闻榜.name(), weiboNewsList);
    logger.info("微博要闻榜采集结束...");
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment