Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
206c358e
Commit
206c358e
authored
Feb 08, 2022
by
leiliangliang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
更新微博话题采集程序
parent
70ceeae4
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
107 additions
and
56 deletions
+107
-56
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
+106
-55
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+1
-1
No files found.
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
View file @
206c358e
...
...
@@ -13,7 +13,9 @@ import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.MediaType
;
import
okhttp3.Request
;
import
okhttp3.RequestBody
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Element
;
...
...
@@ -129,14 +131,103 @@ public class WeiboTopicCrawler {
/**
* 微博平话题榜采集
*/
public
static
List
<
HotSearchList
>
startCrawlerByPhone
(
Date
date
){
// public static List<HotSearchList> startCrawlerByPhone(Date date){
// List<HotSearchList> topicList = new ArrayList<>();
// for(int page=1; page<=3; page++){
// String pageUrl = "https://api.weibo.cn/2/page?st_bottom_bar_new_style_enable=1&c=android&s=34dc160d&from=10A9295010&gsid=_2A25NH7inDeRxGeNH4lUX9ifIzTWIHXVvjUtvrDV6PUJbkdANLRjfkWpNSk7RXJ9vYwBfAr66TNj0zcFmOBPKZDuI&containerid=231648_-_4&page=" + page;
// Request request = RequestUtils.wrapGet(pageUrl);
// String htmlBody = null;
// //重试三次
// for(int retryTimes = 1; retryTimes<=5; retryTimes++) {
// Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
// if (response.hasCause()){
// Throwable cause = response.cause();
// log.error("下载榜单列表页面时出现错误,错误为:{}", cause);
// continue;
// }else {
// htmlBody = response.bodyString();
// }
// if (StringUtils.isNotBlank(htmlBody)) {
// topicList.addAll(parseTopicHtml(htmlBody,date));
// break;
// } else {
// log.info("下载榜单列表页面时数据格式错误,页面为:{}", htmlBody);
// }
// }
// }
// return topicList;
// }
//
//
// private static List<HotSearchList> parseTopicHtml(String htmlBody,Date date) {
// try {
// JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("cards");
// if(Objects.nonNull(jsonArray) && !jsonArray.isEmpty()) {
// for (int j=0; j< jsonArray.size(); j++){
// JSONObject card = jsonArray.getJSONObject(j);
// if(card.containsKey("card_group")){
// JSONArray cards = card.getJSONArray("card_group");
// List<HotSearchList> topicList = new ArrayList<>();
// Integer rank = null;
// String topicName = null;
// String url = null;
// String description = null;
// Long commentNum = null;
// Long readNum = null;
// String desc2 = null;
// for(int i=0; i<cards.size(); i++) {
// JSONObject cardGroup = cards.getJSONObject(i);
// rank = cardGroup.getInteger("top_mark_text");
// topicName = cardGroup.getString("title_sub");
// url = "https://s.weibo.com/weibo?q="+ URLCodeUtil.getURLEncode(topicName, "utf-8");
// description = null;
// if(cardGroup.containsKey("card_expand")){
// description = cardGroup.getJSONObject("card_expand").getString("content");
// }
// desc2 = cardGroup.getString("desc");
// String commentNumStr = desc2.replaceAll("讨论.*", "").trim();
// String readNumStr = desc2.replaceAll(".*讨论|阅读", "").trim();
// try {
// commentNum = TipsUtils.getHotCount(commentNumStr);
// readNum = TipsUtils.getHotCount(readNumStr);
// }catch (Exception e){
// e.printStackTrace();
// }
// HotSearchList topic = new HotSearchList(url, topicName, readNum, rank, HotSearchType.微博话题.name(), commentNum, description,date);
// if(cardGroup.containsKey("title_flag_pic")){
// String titlePic = cardGroup.getString("title_flag_pic");
// if(titlePic.contains("new")){
// topic.setIcon("新");
// }else if(titlePic.contains("hot")){
// topic.setIcon("热");
// }
// }
// topicList.add(topic);
// }
// return topicList;
// }
// }
// }else{
//// log.info("html:{}",htmlBody);
// }
// } catch (Exception e) {
// log.error("解析榜单列表页面时出现错误,错误为:{}", e);
// }
// return Collections.emptyList();
// }
/**
* 微博话题采集(PC端)
*/
public
static
List
<
HotSearchList
>
startCrawlerByPc
(
Date
date
){
List
<
HotSearchList
>
topicList
=
new
ArrayList
<>();
for
(
int
page
=
1
;
page
<=
3
;
page
++){
String
pageUrl
=
"https://
api.weibo.cn/2/page?st_bottom_bar_new_style_enable=1&c=android&s=34dc160d&from=10A9295010&gsid=_2A25NH7inDeRxGeNH4lUX9ifIzTWIHXVvjUtvrDV6PUJbkdANLRjfkWpNSk7RXJ9vYwBfAr66TNj0zcFmOBPKZDuI&containerid=231648_-_4&page="
+
page
;
for
(
int
page
=
1
;
page
<=
2
;
page
++){
String
pageUrl
=
"https://
weibo.com/ajax/statuses/topic_band?sid=v_weibopro&category=all&page="
+
page
+
"&count=50"
;
Request
request
=
RequestUtils
.
wrapGet
(
pageUrl
);
String
htmlBody
=
null
;
//重试三次
for
(
int
retryTimes
=
1
;
retryTimes
<=
5
;
retryTimes
++)
{
for
(
int
retryTimes
=
1
;
retryTimes
<=
3
;
retryTimes
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
...
...
@@ -146,7 +237,7 @@ public class WeiboTopicCrawler {
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
topicList
.
addAll
(
parseTopicHtml
(
htmlBody
,
date
));
topicList
.
addAll
(
parseTopic
Pc
Html
(
htmlBody
,
date
));
break
;
}
else
{
log
.
info
(
"下载榜单列表页面时数据格式错误,页面为:{}"
,
htmlBody
);
...
...
@@ -156,67 +247,27 @@ public class WeiboTopicCrawler {
return
topicList
;
}
private
static
List
<
HotSearchList
>
parseTopicHtml
(
String
htmlBody
,
Date
date
)
{
private
static
List
<
HotSearchList
>
parseTopicPcHtml
(
String
htmlBody
,
Date
date
)
{
try
{
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
if
(
Objects
.
nonNull
(
jsonArray
)
&&
!
jsonArray
.
isEmpty
())
{
JSONObject
data
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
JSONArray
jsonArray
=
data
.
getJSONArray
(
"statuses"
);
List
<
HotSearchList
>
topicList
=
new
ArrayList
<>();
for
(
int
j
=
0
;
j
<
jsonArray
.
size
();
j
++){
JSONObject
card
=
jsonArray
.
getJSONObject
(
j
);
if
(
card
.
containsKey
(
"card_group"
)){
JSONArray
cards
=
card
.
getJSONArray
(
"card_group"
);
List
<
HotSearchList
>
topicList
=
new
ArrayList
<>();
Integer
rank
=
null
;
String
topicName
=
null
;
String
url
=
null
;
String
description
=
null
;
Long
commentNum
=
null
;
Long
readNum
=
null
;
String
desc2
=
null
;
for
(
int
i
=
0
;
i
<
cards
.
size
();
i
++)
{
JSONObject
cardGroup
=
cards
.
getJSONObject
(
i
);
rank
=
cardGroup
.
getInteger
(
"top_mark_text"
);
topicName
=
cardGroup
.
getString
(
"title_sub"
);
url
=
"https://s.weibo.com/weibo?q="
+
URLCodeUtil
.
getURLEncode
(
topicName
,
"utf-8"
);
description
=
null
;
if
(
cardGroup
.
containsKey
(
"card_expand"
)){
description
=
cardGroup
.
getJSONObject
(
"card_expand"
).
getString
(
"content"
);
}
desc2
=
cardGroup
.
getString
(
"desc"
);
String
commentNumStr
=
desc2
.
replaceAll
(
"讨论.*"
,
""
).
trim
();
String
readNumStr
=
desc2
.
replaceAll
(
".*讨论|阅读"
,
""
).
trim
();
try
{
commentNum
=
TipsUtils
.
getHotCount
(
commentNumStr
);
readNum
=
TipsUtils
.
getHotCount
(
readNumStr
);
}
catch
(
Exception
e
){
e
.
printStackTrace
();
}
Integer
rank
=
card
.
getInteger
(
"rank"
);
String
description
=
card
.
getString
(
"summary"
);
String
topicName
=
card
.
getString
(
"topic"
);
Long
commentNum
=
card
.
getLong
(
"mention"
);
Long
readNum
=
card
.
getLong
(
"read"
);
String
url
=
"https://s.weibo.com/weibo?q="
+
URLCodeUtil
.
getURLEncode
(
"#"
+
topicName
+
"#"
,
"utf-8"
);
HotSearchList
topic
=
new
HotSearchList
(
url
,
topicName
,
readNum
,
rank
,
HotSearchType
.
微博话题
.
name
(),
commentNum
,
description
,
date
);
if
(
cardGroup
.
containsKey
(
"title_flag_pic"
)){
String
titlePic
=
cardGroup
.
getString
(
"title_flag_pic"
);
if
(
titlePic
.
contains
(
"new"
)){
topic
.
setIcon
(
"新"
);
}
else
if
(
titlePic
.
contains
(
"hot"
)){
topic
.
setIcon
(
"热"
);
}
}
topicList
.
add
(
topic
);
}
return
topicList
;
}
}
}
else
{
// log.info("html:{}",htmlBody);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析榜单列表页面时出现错误,错误为:{}"
,
e
);
}
return
Collections
.
emptyList
();
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
View file @
206c358e
...
...
@@ -222,7 +222,7 @@ public class GatherTimer {
public
void
crawlerWeiBoTopic
(){
log
.
info
(
"微博话题开始采集..."
);
Date
date
=
DateUtils
.
getMillSecondTime
(
new
Date
());
List
<
HotSearchList
>
list
=
WeiboTopicCrawler
.
startCrawlerByP
hone
(
date
);
List
<
HotSearchList
>
list
=
WeiboTopicCrawler
.
startCrawlerByP
c
(
date
);
log
.
info
(
"{}, 微博话题此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
TipsUtils
.
addHotList
(
HotSearchType
.
微博话题
.
name
(),
list
);
log
.
info
(
"微博话题采集结束..."
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment