Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
15c0ee7f
Commit
15c0ee7f
authored
Sep 18, 2021
by
chenweitao
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'working' into 'master'
微博恢复到原来采集状态 See merge request
!133
parents
897ed7ac
bd60cfd9
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
108 additions
and
108 deletions
+108
-108
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+107
-107
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+1
-1
No files found.
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
15c0ee7f
...
...
@@ -125,102 +125,6 @@ public class WeiboHotSearchCrawler {
// }
/**
 * Collects the Weibo real-time hot-search list from the {@code api.weibo.cn} guest page
 * endpoint and converts every entry of the first card's {@code card_group} into a
 * {@link HotSearchList}.
 *
 * <p>The request is attempted up to six times. An attempt is abandoned (and the next one
 * started) when the connection fails, the body is blank or does not contain {@code "cards"},
 * the body cannot be parsed, or any entry cannot be converted. As a side effect, each converted
 * entry's name (suffixed with {@code "_微博热搜"}) is added to the
 * {@code RedisConfig.WEIBO_HOTSEARCHIDS} set.
 *
 * @param date timestamp passed unchanged to every created {@link HotSearchList}
 * @return the entries of the first successful attempt, or an immutable empty list when all
 *         attempts fail
 * @author hero
 */
public static List<HotSearchList> weiboHotSearchByPhone(Date date) {
    // Alternative endpoint (m.weibo.cn container API), kept for reference:
    //String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583";
    String url = "https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot";
    Map<String, String> headerMap = new HashMap<>();
    headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
    Request request = RequestUtils.wrapGet(url, headerMap);
    // Loop-invariant: removes every character that is not an ASCII digit.
    Pattern nonDigits = Pattern.compile("[^0-9]");

    for (int count = 0; count <= 5; count++) {
        // Scoped per attempt so a body from an earlier, already-rejected attempt is never
        // parsed again after a later connection failure.
        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博时热搜时出现连接失败", e);
        }

        if (!(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards"))) {
            log.info("解析微博时热搜时出现解析错误,页面结构有问题");
            continue;
        }

        List<HotSearchList> result = new ArrayList<>();
        try {
            JSONArray json = JSONObject.parseObject(htmlBody).getJSONArray("cards");
            try {
                // Only the first card is read.
                JSONObject card = json.getJSONObject(0);
                JSONArray cardGroup = card.getJSONArray("card_group");
                // Validate before touching element 0; otherwise a missing or empty group
                // surfaces as an NPE/IndexOutOfBounds and this diagnostic is never logged.
                if (cardGroup == null || cardGroup.isEmpty()) {
                    log.info("card 数据结构为:{}", card);
                    continue;
                }

                // Ranking starts at 0 when the first entry carries a "pic" key, else at 1.
                // NOTE(review): presumably the "pic" entry is a pinned item - confirm.
                int rank = cardGroup.getJSONObject(0).containsKey("pic") ? 0 : 1;
                boolean hot = true;

                for (int j = 0; j < cardGroup.size(); j++) {
                    JSONObject cardInfo = cardGroup.getJSONObject(j);
                    String name = cardInfo.getString("desc");
                    String desc_extr = cardInfo.getString("desc_extr");

                    String heatLabel = null;
                    Long hotCount = 0L;
                    if (!StringUtils.isEmpty(desc_extr)) {
                        String num = nonDigits.matcher(desc_extr).replaceAll("").trim();
                        // A value without any digit keeps the default count instead of
                        // throwing NumberFormatException and discarding the whole list.
                        if (!num.isEmpty()) {
                            hotCount = Long.valueOf(num);
                        }
                        String[] split = desc_extr.split(" ");
                        if (split.length > 1) {
                            String heat = split[0].trim();
                            // A non-numeric leading token is treated as the heat label.
                            if (!StringUtils.isNumeric(heat)) {
                                heatLabel = heat;
                            }
                        }
                    }

                    String iconUrl = cardInfo.getString("icon");
                    String icon = null;
                    if (StringUtils.isNotBlank(iconUrl)) {
                        // Icon name = text between the first and second '_' with a literal
                        // ".png" suffix cut off; stays null when the URL has no '_'.
                        String[] underscoreParts = iconUrl.split("_");
                        if (underscoreParts.length > 1) {
                            String[] nameParts = underscoreParts[1].split("\\.png");
                            if (nameParts.length > 0) {
                                icon = nameParts[0];
                            }
                        }
                    }

                    // The query string of "scheme" is re-attached to the mobile search URL.
                    // A scheme that is null or has no '?' throws here and aborts the attempt.
                    String id = cardInfo.getString("scheme");
                    String url1 = "https://m.weibo.cn/search?" + id.split("[?]")[1];

                    HotSearchList hotSearch = new HotSearchList(url1, name, hotCount, hot, rank,
                            HotSearchType.微博热搜.name(), icon, date);
                    hotSearch.setHeatLabel(heatLabel);
                    if (Objects.nonNull(iconUrl)) {
                        hotSearch.setIconUrl(iconUrl);
                    }
                    result.add(hotSearch);
                    rank++;
                    redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS, name + "_微博热搜");
                }
            } catch (Exception e) {
                log.error("解析微博时热搜时出现解析错误", e);
                continue;
            }
            return result;
        } catch (Exception e) {
            log.error("解析微博时热搜时出现解析错误,数据不是json结构", e);
        }
    }
    return Collections.emptyList();
}
// /**
// * @return void 返回类型
// * @Title: weiboHotSearchByPhoneTest
...
...
@@ -243,12 +147,11 @@ public class WeiboHotSearchCrawler {
// List<HotSearchList> result = new ArrayList<HotSearchList>();
// if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
// try {
// JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data");
// JSONArray cards = json.getJSONArray("cards");
// JSONArray json = JSONObject.parseObject(htmlBody).getJSONArray("cards");
// int rank = 0;
//// for (int i = 0; i < cards.size(); i++) {
// try {
// JSONObject card =
cards
.getJSONObject(0);
// JSONObject card =
json
.getJSONObject(0);
// JSONArray cardGroup = card.getJSONArray("card_group");
// JSONObject topCard = cardGroup.getJSONObject(0);
// if (!topCard.containsKey("pic")) {
...
...
@@ -266,16 +169,22 @@ public class WeiboHotSearchCrawler {
// String name = cardInfo.getString("desc");
// String desc_extr = cardInfo.getString("desc_extr");
// String heatLabel=null;
// Long hotCount =null;
// if (Objects.nonNull(desc_extr)){
// Long hotCount =0L;
// if (!StringUtils.isEmpty(desc_extr)&&Objects.nonNull(desc_extr)){
// String regEx="[^0-9]";
// Pattern p = Pattern.compile(regEx);
// Matcher m = p.matcher(desc_extr);
// String num = m.replaceAll("").trim();
// hotCount = Long.valueOf(num);
// String[] split = desc_extr.split(" ");
// if (split.length>1){
//
heatLabel
= split[0].trim();
//
hotCount= Long.valueOf(split[1].trim()
);
//
//
}else {
//
hotCount = cardInfo.getLongValue("desc_extr");
//
String heat
= split[0].trim();
//
boolean flag = StringUtils.isNumeric(heat
);
//
if (!flag){
//
heatLabel= split[0].trim();
//
}
// }
//
// }
// String iconUrl = cardInfo.getString("icon");
// String icon=null;
...
...
@@ -284,7 +193,8 @@ public class WeiboHotSearchCrawler {
// }
//// String id = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
// String id = cardInfo.getString("scheme");
// HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
// String url1 = "https://m.weibo.cn/search?"+id.split("[?]")[1];
// HotSearchList hotSearch = new HotSearchList(url1, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
// hotSearch.setHeatLabel(heatLabel);
// if (Objects.nonNull(iconUrl)){hotSearch.setIconUrl(iconUrl);}
// result.add(hotSearch);
...
...
@@ -312,6 +222,96 @@ public class WeiboHotSearchCrawler {
// }
/**
 * Collects the Weibo real-time hot-search list from the {@code m.weibo.cn} container API and
 * converts every entry of the first card's {@code card_group} (under {@code data.cards}) into
 * a {@link HotSearchList}.
 *
 * <p>The request is attempted up to six times. An attempt is abandoned (and the next one
 * started) when the connection fails, the body is blank or does not contain {@code "cards"},
 * the body cannot be parsed, or any entry cannot be converted.
 *
 * @param date timestamp passed unchanged to every created {@link HotSearchList}
 * @return the entries of the first successful attempt, or an immutable empty list when all
 *         attempts fail
 * @author hero
 */
public static List<HotSearchList> weiboHotSearchByPhone(Date date) {
    String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583";
    // Alternative endpoint (api.weibo.cn guest page), kept for reference:
    //String url = "https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot";
    Map<String, String> headerMap = new HashMap<>();
    headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
    Request request = RequestUtils.wrapGet(url, headerMap);

    for (int count = 0; count <= 5; count++) {
        // Scoped per attempt so a body from an earlier, already-rejected attempt is never
        // parsed again after a later connection failure.
        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博时热搜时出现连接失败", e);
        }

        if (!(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards"))) {
            log.info("解析微博时热搜时出现解析错误,页面结构有问题");
            continue;
        }

        List<HotSearchList> result = new ArrayList<>();
        try {
            JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data");
            JSONArray cards = json.getJSONArray("cards");
            try {
                // Only the first card is read.
                JSONObject card = cards.getJSONObject(0);
                JSONArray cardGroup = card.getJSONArray("card_group");
                // Validate before touching element 0; otherwise a missing or empty group
                // surfaces as an NPE/IndexOutOfBounds and this diagnostic is never logged.
                if (cardGroup == null || cardGroup.isEmpty()) {
                    log.info("card 数据结构为:{}", card);
                    continue;
                }

                // Ranking starts at 0 when the first entry carries a "pic" key, else at 1.
                // NOTE(review): presumably the "pic" entry is a pinned item - confirm.
                int rank = cardGroup.getJSONObject(0).containsKey("pic") ? 0 : 1;
                boolean hot = true;

                for (int j = 0; j < cardGroup.size(); j++) {
                    JSONObject cardInfo = cardGroup.getJSONObject(j);
                    String name = cardInfo.getString("desc");
                    String desc_extr = cardInfo.getString("desc_extr");

                    String heatLabel = null;
                    Long hotCount = null;
                    if (Objects.nonNull(desc_extr)) {
                        String[] split = desc_extr.split(" ");
                        if (split.length > 1) {
                            // "<label> <count>": first token is the label, second the count.
                            // A non-numeric second token throws and aborts the attempt.
                            heatLabel = split[0].trim();
                            hotCount = Long.valueOf(split[1].trim());
                        } else {
                            // Single token: let the JSON library coerce the value to long.
                            hotCount = cardInfo.getLongValue("desc_extr");
                        }
                    }

                    String iconUrl = cardInfo.getString("icon");
                    String icon = null;
                    if (StringUtils.isNotBlank(iconUrl)) {
                        // Icon name = text between the first and second '_' with a literal
                        // ".png" suffix cut off; stays null when the URL has no '_'.
                        String[] underscoreParts = iconUrl.split("_");
                        if (underscoreParts.length > 1) {
                            String[] nameParts = underscoreParts[1].split("\\.png");
                            if (nameParts.length > 0) {
                                icon = nameParts[0];
                            }
                        }
                    }

                    // The raw "scheme" value is used as the entry's identifier/URL.
                    String id = cardInfo.getString("scheme");

                    HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank,
                            HotSearchType.微博热搜.name(), icon, date);
                    hotSearch.setHeatLabel(heatLabel);
                    if (Objects.nonNull(iconUrl)) {
                        hotSearch.setIconUrl(iconUrl);
                    }
                    result.add(hotSearch);
                    rank++;
                }
            } catch (Exception e) {
                log.error("解析微博时热搜时出现解析错误", e);
                continue;
            }
            return result;
        } catch (Exception e) {
            log.error("解析微博时热搜时出现解析错误,数据不是json结构", e);
        }
    }
    return Collections.emptyList();
}
/**
* 微博预热榜(实时上升热点采集)
*
* @param date
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
View file @
15c0ee7f
...
...
@@ -218,7 +218,7 @@ public class HotSearchCacheDAO {
nowDoc
.
put
(
"pictureUrl"
,
pictureUrl
);
}
if
(
"微博热搜"
.
equals
(
type
)){
//
nowDoc = WeiboHotSearchCrawler.weiboUpdate(nowDoc);
nowDoc
=
WeiboHotSearchCrawler
.
weiboUpdate
(
nowDoc
);
//更新微博话题贡献者,关于功能
Document
documentPC
=
WeiboHotSearchCrawler
.
weiboUpdatePC
(
nowDoc
);
if
(
documentPC
.
containsKey
(
"分类"
))
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment