Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
21336ec9
Commit
21336ec9
authored
Feb 08, 2022
by
chenweitao
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'working' into 'master'
更改移动端话题采集程序 See merge request
!178
parents
b6b50e01
14470085
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
84 additions
and
84 deletions
+84
-84
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
+84
-84
No files found.
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
View file @
21336ec9
...
...
@@ -131,90 +131,90 @@ public class WeiboTopicCrawler {
/**
* Crawls the Weibo topic ranking through the mobile (phone) API.
*/
//
public static List<HotSearchList> startCrawlerByPhone(Date date){
//
List<HotSearchList> topicList = new ArrayList<>();
//
for(int page=1; page<=3; page++){
//
String pageUrl = "https://api.weibo.cn/2/page?st_bottom_bar_new_style_enable=1&c=android&s=34dc160d&from=10A9295010&gsid=_2A25NH7inDeRxGeNH4lUX9ifIzTWIHXVvjUtvrDV6PUJbkdANLRjfkWpNSk7RXJ9vYwBfAr66TNj0zcFmOBPKZDuI&containerid=231648_-_4&page=" + page;
//
Request request = RequestUtils.wrapGet(pageUrl);
//
String htmlBody = null;
//
//重试三次
//
for(int retryTimes = 1; retryTimes<=5; retryTimes++) {
//
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
//
if (response.hasCause()){
//
Throwable cause = response.cause();
//
log.error("下载榜单列表页面时出现错误,错误为:{}", cause);
//
continue;
//
}else {
//
htmlBody = response.bodyString();
//
}
//
if (StringUtils.isNotBlank(htmlBody)) {
//
topicList.addAll(parseTopicHtml(htmlBody,date));
//
break;
//
} else {
//
log.info("下载榜单列表页面时数据格式错误,页面为:{}", htmlBody);
//
}
//
}
//
}
//
return topicList;
//
}
//
//
//
private static List<HotSearchList> parseTopicHtml(String htmlBody,Date date) {
//
try {
//
JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("cards");
//
if(Objects.nonNull(jsonArray) && !jsonArray.isEmpty()) {
//
for (int j=0; j< jsonArray.size(); j++){
//
JSONObject card = jsonArray.getJSONObject(j);
//
if(card.containsKey("card_group")){
//
JSONArray cards = card.getJSONArray("card_group");
//
List<HotSearchList> topicList = new ArrayList<>();
//
Integer rank = null;
//
String topicName = null;
//
String url = null;
//
String description = null;
//
Long commentNum = null;
//
Long readNum = null;
//
String desc2 = null;
//
for(int i=0; i<cards.size(); i++) {
//
JSONObject cardGroup = cards.getJSONObject(i);
//
rank = cardGroup.getInteger("top_mark_text");
//
topicName = cardGroup.getString("title_sub");
//
url = "https://s.weibo.com/weibo?q="+ URLCodeUtil.getURLEncode(topicName, "utf-8");
//
description = null;
//
if(cardGroup.containsKey("card_expand")){
//
description = cardGroup.getJSONObject("card_expand").getString("content");
//
}
//
desc2 = cardGroup.getString("desc");
//
String commentNumStr = desc2.replaceAll("讨论.*", "").trim();
//
String readNumStr = desc2.replaceAll(".*讨论|阅读", "").trim();
//
try {
//
commentNum = TipsUtils.getHotCount(commentNumStr);
//
readNum = TipsUtils.getHotCount(readNumStr);
//
}catch (Exception e){
//
e.printStackTrace();
//
}
//
HotSearchList topic = new HotSearchList(url, topicName, readNum, rank, HotSearchType.微博话题.name(), commentNum, description,date);
//
if(cardGroup.containsKey("title_flag_pic")){
//
String titlePic = cardGroup.getString("title_flag_pic");
//
if(titlePic.contains("new")){
//
topic.setIcon("新");
//
}else if(titlePic.contains("hot")){
//
topic.setIcon("热");
//
}
//
}
//
topicList.add(topic);
//
}
//
return topicList;
//
}
//
}
//
}else{
//
//
log.info("html:{}",htmlBody);
//
}
//
} catch (Exception e) {
//
log.error("解析榜单列表页面时出现错误,错误为:{}", e);
//
}
//
return Collections.emptyList();
//
}
/**
 * Downloads the Weibo topic ranking through the mobile API and returns the parsed topics.
 *
 * <p>Pages 1 to 3 of the ranking are requested in order. Each page is attempted up to five
 * times; an attempt counts as failed when the response carries a cause or the body is blank.
 * The first non-blank body is handed to {@code parseTopicHtml} and the page is considered
 * done, even if parsing yields no topics. A page whose attempts all fail is logged and skipped.
 *
 * @param date timestamp passed through to every parsed {@code HotSearchList} entry
 * @return all topics parsed from the pages that could be downloaded; never {@code null},
 *         possibly empty
 */
public static List<HotSearchList> startCrawlerByPhone(Date date) {
    // NOTE(review): the gsid query parameter looks like a session credential committed to
    // source control — consider moving it to configuration and rotating it.
    final String pageUrlPrefix = "https://api.weibo.cn/2/page?st_bottom_bar_new_style_enable=1&c=android&s=34dc160d&from=10A9295010&gsid=_2A25NH7inDeRxGeNH4lUX9ifIzTWIHXVvjUtvrDV6PUJbkdANLRjfkWpNSk7RXJ9vYwBfAr66TNj0zcFmOBPKZDuI&containerid=231648_-_4&page=";
    final int lastPage = 3;
    final int maxAttempts = 5;

    List<HotSearchList> topicList = new ArrayList<>();
    for (int page = 1; page <= lastPage; page++) {
        Request request = RequestUtils.wrapGet(pageUrlPrefix + page);
        boolean fetched = false;

        // Try each page up to maxAttempts times before giving up on it.
        for (int attempt = 1; attempt <= maxAttempts; attempt++) {
            Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
            if (response.hasCause()) {
                Throwable cause = response.cause();
                log.error("下载榜单列表页面时出现错误,错误为:{}", cause);
                continue;
            }

            String htmlBody = response.bodyString();
            if (StringUtils.isNotBlank(htmlBody)) {
                topicList.addAll(parseTopicHtml(htmlBody, date));
                fetched = true;
                break;
            }
            log.info("下载榜单列表页面时数据格式错误,页面为:{}", htmlBody);
        }

        if (!fetched) {
            // Make a dropped page visible instead of silently moving on.
            log.error("下载榜单列表页面失败,已重试{}次,页码为:{}", maxAttempts, page);
        }
    }
    return topicList;
}
/**
 * Parses one page of the mobile topic-ranking response into topic entries.
 *
 * <p>The body is read as a JSON object with a {@code cards} array. The first card that has a
 * {@code card_group} key supplies the topics; later cards are not inspected. Any exception
 * raised while parsing is logged and results in an empty list.
 *
 * @param htmlBody response body of the ranking page (JSON text despite the name)
 * @param date     timestamp stored on every created entry
 * @return the topics of the first card group, or an empty list when there is none or
 *         parsing fails
 */
private static List<HotSearchList> parseTopicHtml(String htmlBody, Date date) {
    try {
        JSONArray pageCards = JSONObject.parseObject(htmlBody).getJSONArray("cards");
        if (Objects.isNull(pageCards) || pageCards.isEmpty()) {
            return Collections.emptyList();
        }

        for (int j = 0; j < pageCards.size(); j++) {
            JSONObject card = pageCards.getJSONObject(j);
            if (!card.containsKey("card_group")) {
                continue;
            }

            JSONArray cards = card.getJSONArray("card_group");
            List<HotSearchList> topicList = new ArrayList<>();
            for (int i = 0; i < cards.size(); i++) {
                topicList.add(parsePhoneTopicCard(cards.getJSONObject(i), date));
            }
            // Only the first card carrying a card_group is used.
            return topicList;
        }
    } catch (Exception e) {
        log.error("解析榜单列表页面时出现错误,错误为:{}", e);
    }
    return Collections.emptyList();
}

/**
 * Builds a single topic entry from one element of a {@code card_group} array.
 *
 * <p>All per-topic values are local to this call, so a topic whose counts cannot be parsed
 * gets {@code null} counts rather than values left over from the previous topic. Missing
 * {@code desc}, {@code card_expand} or {@code title_flag_pic} values are tolerated.
 */
private static HotSearchList parsePhoneTopicCard(JSONObject cardGroup, Date date) {
    Integer rank = cardGroup.getInteger("top_mark_text");
    String topicName = cardGroup.getString("title_sub");
    String url = "https://s.weibo.com/weibo?q=" + URLCodeUtil.getURLEncode(topicName, "utf-8");

    String description = null;
    JSONObject expand = cardGroup.getJSONObject("card_expand");
    if (expand != null) {
        description = expand.getString("content");
    }

    Long commentNum = null;
    Long readNum = null;
    String desc = cardGroup.getString("desc");
    if (desc != null) {
        // The text before "讨论" is the discussion count; the text between "讨论" and "阅读"
        // is the read count.
        String commentNumStr = desc.replaceAll("讨论.*", "").trim();
        String readNumStr = desc.replaceAll(".*讨论|阅读", "").trim();
        try {
            commentNum = TipsUtils.getHotCount(commentNumStr);
            readNum = TipsUtils.getHotCount(readNumStr);
        } catch (Exception e) {
            // Best effort: keep the topic and leave the unparsed count(s) as null.
            log.error("解析话题讨论数和阅读数时出现错误,内容为:{}", desc, e);
        }
    }

    HotSearchList topic = new HotSearchList(url, topicName, readNum, rank,
            HotSearchType.微博话题.name(), commentNum, description, date);

    String titlePic = cardGroup.getString("title_flag_pic");
    if (titlePic != null) {
        if (titlePic.contains("new")) {
            topic.setIcon("新");
        } else if (titlePic.contains("hot")) {
            topic.setIcon("热");
        }
    }
    return topic;
}
/**
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment