Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
49d50468
Commit
49d50468
authored
Sep 02, 2021
by
chenweitao
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'working' into 'master'
新增微博娱乐榜采集功能 See merge request
!125
parents
fd08d8f7
d5a49080
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
283 additions
and
0 deletions
+283
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+2
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboEntertainmentCrawler.java
+92
-0
src/main/java/com/zhiwei/searchhotcrawler/test/WeiBoYuLeRun.java
+40
-0
src/main/java/com/zhiwei/searchhotcrawler/test/WeiboEntertainmentCrawlerTest.java
+93
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+56
-0
No files found.
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
49d50468
...
...
@@ -27,5 +27,7 @@ public enum HotSearchType {
虎嗅热文推荐
,
快手热榜
,
淘宝热搜
,
抖音同城榜
,
微博娱乐榜
,
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboEntertainmentCrawler.java
0 → 100644
View file @
49d50468
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
import
java.util.*
;
/**
* @ClassName: weiboEntertainmentByPhone
* @Description: 微博娱乐榜采集
* @author ll
* @date 2021年9月1日 上午10:54:31
*/
@Log4j2
public
class
WeiboEntertainmentCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @return void 返回类型
* @Title: weiboHotSearchYuLeByPhone
* @author ll
* @Description: 手机端微博娱乐榜采集
* @date 2021年9月2日 下午16:10:31
*/
public
static
List
<
HotSearchList
>
weiboEntertainmentByPhone
(
Date
date
)
{
String
url
=
"https://api.weibo.cn/2/guest/page?st_bottom_bar_new_style_enable=1&networktype=wifi&extparam=seat%3D1%26dgr%3D0%26pos%3D0_0%26position%3D%255B%255D%26mi_cid%3D100103%26c_type%3D30%26filter_type%3Drealtimehot%26cate%3D10103%26refresh_type%3D0%26display_time%3D1630311727%26pre_seqid%3D759583440&page_reform_enable=1&launchid=10000365--x&page_interrupt_enable=1&orifid=231619&uicode=10000011&ul_hid=069f1ce5-c01b-452a-8e35-63cc129b4922&ul_sid=069f1ce5-c01b-452a-8e35-63cc129b4922&moduleID=708&checktoken=49e4ed3181ae0f794326d93b345953a6&just_followed=false&wb_version=4880&lcardid=hot_search&c=android&s=14a75bb5&ft=0&ua=Xiaomi-Redmi%208__weibo__11.3.0__android__android9&wm=20005_0002&aid=01AxK-lU0KWwz9mhMfL6gTb37upA4rmFhPxq5T1hrsYVnDdks.&did=0f607264fc6318a92b9e13c65db7cd3c4421c4d0&fid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&uid=2004639399897&v_f=2&v_p=87&from=10B3095010&gsid=_2AkMWcBzYf8NhqwFRmPwTz2LhZYR_ww_EieKgLO0DJRM3HRl-wT9kqmIltRV6PfAyN0yL-qVVp2I3Kl7SamvpS9NmO7Ur&imsi=&lang=zh_CN&lfid=231619&page=1&skin=default&count=20&oldwm=20005_0002&sflag=1&oriuicode=10000512&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Dfun&ignore_inturrpted_error=true&no_location_permission=1&luicode=10000512&android_id=0febc80e083662a7&header_skin_enable=0&ul_ctime=1630311759544&cum=E30CEEEA"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博娱乐榜时出现连接失败"
,
e
);
}
List
<
HotSearchList
>
result
=
new
ArrayList
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
try
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
int
rank
=
0
;
try
{
JSONObject
card
=
cards
.
getJSONObject
(
0
);
JSONArray
cardGroup
=
card
.
getJSONArray
(
"card_group"
);
if
(
Objects
.
nonNull
(
cardGroup
)
&&
!
cardGroup
.
isEmpty
())
{
boolean
hot
=
true
;
for
(
int
j
=
0
;
j
<
cardGroup
.
size
();
j
++)
{
JSONObject
cardInfo
=
cardGroup
.
getJSONObject
(
j
);
String
name
=
cardInfo
.
getString
(
"desc"
);
long
hotCount
=
cardInfo
.
getLongValue
(
"desc_extr"
);
String
icon
=
cardInfo
.
getString
(
"icon"
);
if
(
StringUtils
.
isNotBlank
(
icon
))
{
icon
=
icon
.
split
(
"_"
)[
1
].
split
(
".png"
)[
0
];
}
String
id
=
cardInfo
.
getString
(
"scheme"
);
String
ul
=
"https://m.weibo.cn/search?"
+
id
.
split
(
"[?]"
)[
1
];
rank
++;
HotSearchList
hotSearch
=
new
HotSearchList
(
ul
,
name
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微博娱乐榜
.
name
(),
icon
,
date
);
if
(!
"娱乐动态数据详情"
.
equals
(
hotSearch
.
getName
())){
result
.
add
(
hotSearch
);
}
}
}
else
{
log
.
info
(
"card 数据结构为:{}"
,
card
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博娱乐榜时出现解析错误"
,
e
);
continue
;
}
return
result
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博娱乐榜时出现解析错误,数据不是json结构"
,
e
);
}
}
else
{
log
.
info
(
"解析微博娱乐榜时出现解析错误,页面结构有问题"
);
}
}
return
Collections
.
emptyList
();
}
}
src/main/java/com/zhiwei/searchhotcrawler/test/WeiBoYuLeRun.java
0 → 100644
View file @
49d50468
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.crawler.KuaiShouHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
@Log4j2
public
class
WeiBoYuLeRun
extends
Thread
{
@Override
public
void
run
()
{
boolean
f
=
true
;
while
(
f
)
{
try
{
getHotList
();
TimeUnit
.
MINUTES
.
sleep
(
1
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
}
ZhiWeiTools
.
sleep
(
50
);
}
}
private
void
getHotList
()
{
log
.
info
(
"微博娱乐榜采集开始........"
);
List
<
HotSearchList
>
weiBoEntertainmentList
=
WeiboEntertainmentCrawlerTest
.
weiboEntertainmentByPhone
(
new
Date
());
log
.
info
(
"{}, 此轮微博娱乐榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
weiBoEntertainmentList
!=
null
?
weiBoEntertainmentList
.
size
()
:
0
));
TipsUtils
.
addHotList
(
"快手热榜"
,
weiBoEntertainmentList
);
log
.
info
(
"微博娱乐榜采集结束........"
);
}
}
\ No newline at end of file
src/main/java/com/zhiwei/searchhotcrawler/test/WeiboEntertainmentCrawlerTest.java
0 → 100644
View file @
49d50468
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
import
java.util.*
;
/**
* @ClassName: weiboEntertainmentByPhone
* @Description: 微博娱乐榜采集
* @author ll
* @date 2021年9月1日 上午10:54:31
*/
@Log4j2
public
class
WeiboEntertainmentCrawlerTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @return void 返回类型
* @Title: weiboHotSearchYuLeByPhone
* @author ll
* @Description: 手机端微博娱乐榜采集
*/
public
static
List
<
HotSearchList
>
weiboEntertainmentByPhone
(
Date
date
)
{
String
url
=
"https://api.weibo.cn/2/guest/page?st_bottom_bar_new_style_enable=1&networktype=wifi&extparam=seat%3D1%26dgr%3D0%26pos%3D0_0%26position%3D%255B%255D%26mi_cid%3D100103%26c_type%3D30%26filter_type%3Drealtimehot%26cate%3D10103%26refresh_type%3D0%26display_time%3D1630311727%26pre_seqid%3D759583440&page_reform_enable=1&launchid=10000365--x&page_interrupt_enable=1&orifid=231619&uicode=10000011&ul_hid=069f1ce5-c01b-452a-8e35-63cc129b4922&ul_sid=069f1ce5-c01b-452a-8e35-63cc129b4922&moduleID=708&checktoken=49e4ed3181ae0f794326d93b345953a6&just_followed=false&wb_version=4880&lcardid=hot_search&c=android&s=14a75bb5&ft=0&ua=Xiaomi-Redmi%208__weibo__11.3.0__android__android9&wm=20005_0002&aid=01AxK-lU0KWwz9mhMfL6gTb37upA4rmFhPxq5T1hrsYVnDdks.&did=0f607264fc6318a92b9e13c65db7cd3c4421c4d0&fid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&uid=2004639399897&v_f=2&v_p=87&from=10B3095010&gsid=_2AkMWcBzYf8NhqwFRmPwTz2LhZYR_ww_EieKgLO0DJRM3HRl-wT9kqmIltRV6PfAyN0yL-qVVp2I3Kl7SamvpS9NmO7Ur&imsi=&lang=zh_CN&lfid=231619&page=1&skin=default&count=20&oldwm=20005_0002&sflag=1&oriuicode=10000512&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Dfun&ignore_inturrpted_error=true&no_location_permission=1&luicode=10000512&android_id=0febc80e083662a7&header_skin_enable=0&ul_ctime=1630311759544&cum=E30CEEEA"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博娱乐榜时出现连接失败"
,
e
);
}
List
<
HotSearchList
>
result
=
new
ArrayList
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
try
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
int
rank
=
0
;
try
{
JSONObject
card
=
cards
.
getJSONObject
(
0
);
JSONArray
cardGroup
=
card
.
getJSONArray
(
"card_group"
);
if
(
Objects
.
nonNull
(
cardGroup
)
&&
!
cardGroup
.
isEmpty
())
{
boolean
hot
=
true
;
for
(
int
j
=
0
;
j
<
cardGroup
.
size
();
j
++)
{
JSONObject
cardInfo
=
cardGroup
.
getJSONObject
(
j
);
String
name
=
cardInfo
.
getString
(
"desc"
);
long
hotCount
=
cardInfo
.
getLongValue
(
"desc_extr"
);
String
icon
=
cardInfo
.
getString
(
"icon"
);
if
(
StringUtils
.
isNotBlank
(
icon
))
{
icon
=
icon
.
split
(
"_"
)[
1
].
split
(
".png"
)[
0
];
}
String
id
=
cardInfo
.
getString
(
"scheme"
);
String
ul
=
"https://m.weibo.cn/search?"
+
id
.
split
(
"[?]"
)[
1
];
rank
++;
HotSearchList
hotSearch
=
new
HotSearchList
(
ul
,
name
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微博娱乐榜
.
name
(),
icon
,
date
);
if
(!
"娱乐动态数据详情"
.
equals
(
hotSearch
.
getName
())){
result
.
add
(
hotSearch
);
}
}
}
else
{
log
.
info
(
"card 数据结构为:{}"
,
card
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博娱乐榜时出现解析错误"
,
e
);
continue
;
}
return
result
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博娱乐榜时出现解析错误,数据不是json结构"
,
e
);
}
}
else
{
log
.
info
(
"解析微博娱乐榜时出现解析错误,页面结构有问题"
);
}
}
return
Collections
.
emptyList
();
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
View file @
49d50468
...
...
@@ -10,6 +10,7 @@ import com.zhiwei.searchhotcrawler.dao.RedisDao;
import
com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO
;
import
com.zhiwei.searchhotcrawler.crawler.HotSearch36KrCrawler
;
import
com.zhiwei.searchhotcrawler.crawler.HuXiuHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.test.DouYinTongChengCrawlerTest
;
import
com.zhiwei.searchhotcrawler.timer.TouTiaoExecutor
;
import
com.zhiwei.searchhotcrawler.util.DateUtils
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
...
...
@@ -533,4 +534,59 @@ public class GatherTimer {
// TipsUtils.addHotList(HotSearchType.淘宝热搜.name(), taoBaoList);
// logger.info("淘宝热搜采集结束...");
// }
//
// /**
// * 抖音同城榜的采集
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 * * * * ? ")
// public void crawlerDouYinTongCheng(){
// logger.info("抖音同城榜开始采集...");
// Date date = DateUtils.getMillSecondTime(new Date());
// List<HotSearchList> douyinTongChengList = DouYinTongChengCrawlerTest.DouYinTongChengCrawler(date);
// logger.info("{}, 抖音同城榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(douyinTongChengList != null ? douyinTongChengList.size() : 0));
// TipsUtils.addHotList(HotSearchType.抖音同城榜.name(),douyinTongChengList);
// logger.info("抖音同城榜采集结束...");
// }
//
// /**
// * 抖音同城链接的更新
// */
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 0/5 * * * ? ")
// public void updateDouYinTongChengUrl(){
// logger.info("抖音同城链接更新开始...");
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// List<HotSearchList> douyinTongChengList = DouYinTongChengCrawlerTest.list;
// if(douyinTongChengList!=null && douyinTongChengList.size()>0){
// for(int i=0; i<douyinTongChengList.size(); i++){
// String name = douyinTongChengList.get(i).getName();
// String id = name+"_"+douyinTongChengList.get(i).getType();
// String url = DouYinTongChengCrawlerTest.getDouyinTongChengUrl("https://api5-normal-c-lq.amemv.com/aweme/v1/hot/search/video/list/?hotword="+name);
// if(url != null) {
// Document document = new Document();
// document.put("id", id);
// document.put("url", url);
// hotSearchCacheDAO.updateDouyinUrl(document);
// }
// }
// logger.info("抖音同城链接更新结束");
// }else{
// logger.info("抖音同城链接更新失败,抖音同城榜列表获取为空。");
// }
// }
/**
 * Scheduled collection of the Weibo entertainment board.
 *
 * <p>Triggered by the cron expression {@code "0 * * * * ? "} (second 0 of every minute)
 * and executed asynchronously on the {@code myScheduler} executor. The crawl result is
 * handed to {@code TipsUtils.addHotList} under the {@code 微博娱乐榜} type name.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeiBoEntertainment() {
    logger.info("微博娱乐榜开始采集...");

    // Timestamp shared by every entry of this round; presumably normalised by
    // DateUtils.getMillSecondTime - TODO confirm its exact rounding.
    final Date collectedAt = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> entries = WeiboEntertainmentCrawler.weiboEntertainmentByPhone(collectedAt);

    int collected = 0;
    if (entries != null) {
        collected = entries.size();
    }
    logger.info("{}, 微博娱乐榜此轮采集到的数据量为:{}", new Date(), collected);

    TipsUtils.addHotList(HotSearchType.微博娱乐榜.name(), entries);
    logger.info("微博娱乐榜采集结束...");
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment