Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
b59879ef
Commit
b59879ef
authored
Jul 12, 2021
by
leiliangliang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
新增微博话题采集话题贡献者,关于功能
parent
96ffc323
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
590 additions
and
444 deletions
+590
-444
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiBoUser.java
+11
-2
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+558
-440
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+11
-0
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiBoUserDao.java
+10
-2
No files found.
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiBoUser.java
View file @
b59879ef
...
...
@@ -53,9 +53,13 @@ public class WeiBoUser implements Serializable {
* 头像地址
*/
private
String
profileImageUrl
;
/**
* User category within the topic; the crawler's writeUser stores labels such as "话题贡献者" or "当事人" here.
*/
private
String
type
;
/**
* Empty constructor; it performs no field assignments itself.
*/
public
WeiBoUser
()
{
}
public
WeiBoUser
(
String
userId
,
String
attestationMassage
,
String
userName
,
String
topic
,
Date
time
,
Long
followerCount
,
String
profileImageUrl
)
{
this
.
id
=
userId
+
"_"
+
HotSearchType
.
微博热搜
.
name
()+
"_"
+
topic
;
...
...
@@ -66,6 +70,11 @@ public class WeiBoUser implements Serializable {
this
.
time
=
time
;
this
.
followerCount
=
followerCount
;
this
.
profileImageUrl
=
profileImageUrl
;
}
/**
 * Creates a user record that carries only the identity, the topic and the capture time.
 * The {@code id} field is not assigned by this constructor.
 *
 * @param userId   Weibo user id
 * @param userName display name of the user
 * @param topic    topic the user was collected for
 * @param time     capture time
 */
public WeiBoUser(String userId, String userName, String topic, Date time) {
    this.time = time;
    this.topic = topic;
    this.userName = userName;
    this.userId = userId;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
b59879ef
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.io.IOException
;
import
java.io.UnsupportedEncodingException
;
import
java.net.URLEncoder
;
import
java.text.ParseException
;
import
java.text.SimpleDateFormat
;
import
java.util.*
;
import
java.util.stream.Collectors
;
import
com.alibaba.fastjson.JSON
;
import
com.mongodb.client.result.UpdateResult
;
import
com.zhiwei.searchhotcrawler.bean.*
;
import
com.zhiwei.searchhotcrawler.config.RedisConfig
;
import
com.zhiwei.searchhotcrawler.dao.RedisDao
;
...
...
@@ -17,6 +20,12 @@ import lombok.extern.log4j.Log4j2;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpGet
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.HttpClients
;
import
org.apache.http.util.EntityUtils
;
import
org.bson.Document
;
import
org.checkerframework.checker.units.qual.C
;
import
org.jsoup.Jsoup
;
...
...
@@ -37,26 +46,26 @@ import org.springframework.beans.factory.annotation.Autowired;
import
static
java
.
util
.
Objects
.
nonNull
;
/**
* @author hero
* @ClassName: WeiboHotSearch
* @Description: 微博实时热搜采集
* @author hero
* @date 2017年9月15日 上午10:54:31
*/
@Log4j2
public
class
WeiboHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
RedisDao
redisDao
=
new
RedisDao
();
private
static
RedisDao
redisDao
=
new
RedisDao
();
static
WeiBoUserDao
weiBoUserDao
=
new
WeiBoUserDao
();
static
WeiBoMassageDao
weiBoMassageDao
=
new
WeiBoMassageDao
();
/**
* @Title: weiboHotSearchTest
* @author hero
* @Description: TODO(PC端微博热搜采集)
* @return void 返回类型
*/
static
WeiBoUserDao
weiBoUserDao
=
new
WeiBoUserDao
();
static
WeiBoMassageDao
weiBoMassageDao
=
new
WeiBoMassageDao
();
/**
* @Title: weiboHotSearchTest
* @author hero
* @Description: TODO(PC端微博热搜采集)
* @return void 返回类型
*/
// public static List<HotSearchList> weiboHotSearch(){
// String url = "https://s.weibo.com/top/summary?cate=realtimehot";
//
...
...
@@ -113,444 +122,553 @@ public class WeiboHotSearchCrawler {
// }
/**
* @Title: weiboHotSearchByPhoneTest
* @author hero
* @Description: TODO(手机端Iphone 微博热搜采集)
* @return void 返回类型
*/
public
static
List
<
HotSearchList
>
weiboHotSearchByPhone
(
Date
date
){
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583"
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
);
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
count
=
0
;
count
<=
5
;
count
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博时热搜时出现连接失败"
,
e
);
}
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
JSONArray
cards
=
json
.
getJSONArray
(
"cards"
);
int
rank
=
0
;
/**
* @return void 返回类型
* @Title: weiboHotSearchByPhoneTest
* @author hero
* @Description: TODO(手机端Iphone 微博热搜采集)
*/
public
static
List
<
HotSearchList
>
weiboHotSearchByPhone
(
Date
date
)
{
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583"
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
);
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博时热搜时出现连接失败"
,
e
);
}
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
JSONArray
cards
=
json
.
getJSONArray
(
"cards"
);
int
rank
=
0
;
// for (int i = 0; i < cards.size(); i++) {
try
{
JSONObject
card
=
cards
.
getJSONObject
(
0
);
JSONArray
cardGroup
=
card
.
getJSONArray
(
"card_group"
);
JSONObject
topCard
=
cardGroup
.
getJSONObject
(
0
);
if
(!
topCard
.
containsKey
(
"pic"
))
{
rank
=
1
;
}
if
(
Objects
.
nonNull
(
cardGroup
)
&&
!
cardGroup
.
isEmpty
())
{
try
{
JSONObject
card
=
cards
.
getJSONObject
(
0
);
JSONArray
cardGroup
=
card
.
getJSONArray
(
"card_group"
);
JSONObject
topCard
=
cardGroup
.
getJSONObject
(
0
);
if
(!
topCard
.
containsKey
(
"pic"
))
{
rank
=
1
;
}
if
(
Objects
.
nonNull
(
cardGroup
)
&&
!
cardGroup
.
isEmpty
())
{
// String title = card.getString("title");
boolean
hot
=
true
;
boolean
hot
=
true
;
// if (Objects.nonNull(title) && title.contains("实时上升热点")) {
// hot = false;
// rank = 51;
// }
for
(
int
j
=
0
;
j
<
cardGroup
.
size
();
j
++)
{
JSONObject
cardInfo
=
cardGroup
.
getJSONObject
(
j
);
String
name
=
cardInfo
.
getString
(
"desc"
);
long
hotCount
=
cardInfo
.
getLongValue
(
"desc_extr"
);
String
icon
=
cardInfo
.
getString
(
"icon"
);
if
(
StringUtils
.
isNotBlank
(
icon
))
{
icon
=
icon
.
split
(
"_"
)[
1
].
split
(
".png"
)[
0
];
}
String
rankPic
=
cardInfo
.
getString
(
"pic"
);
for
(
int
j
=
0
;
j
<
cardGroup
.
size
();
j
++)
{
JSONObject
cardInfo
=
cardGroup
.
getJSONObject
(
j
);
String
name
=
cardInfo
.
getString
(
"desc"
);
long
hotCount
=
cardInfo
.
getLongValue
(
"desc_extr"
);
String
icon
=
cardInfo
.
getString
(
"icon"
);
if
(
StringUtils
.
isNotBlank
(
icon
))
{
icon
=
icon
.
split
(
"_"
)[
1
].
split
(
".png"
)[
0
];
}
// String id = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
String
urlScheme
=
cardInfo
.
getString
(
"scheme"
);
HotSearchList
hotSearch
=
new
HotSearchList
(
urlScheme
,
name
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微博热搜
.
name
(),
icon
,
date
);
hotSearch
.
setRankPic
(
rankPic
);
result
.
add
(
hotSearch
);
rank
++;
redisDao
.
addDataToSet
(
RedisConfig
.
WEIBO_HOTSEARCHIDS
,
name
+
"_微博热搜"
);
}
}
else
{
log
.
info
(
"card 数据结构为:{}"
,
card
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博时热搜时出现解析错误"
,
e
);
continue
;
}
String
id
=
cardInfo
.
getString
(
"scheme"
);
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微博热搜
.
name
(),
icon
,
date
);
result
.
add
(
hotSearch
);
rank
++;
redisDao
.
addDataToSet
(
RedisConfig
.
WEIBO_HOTSEARCHIDS
,
name
+
"_微博热搜"
);
}
}
else
{
log
.
info
(
"card 数据结构为:{}"
,
card
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博时热搜时出现解析错误"
,
e
);
continue
;
}
// }
return
result
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博时热搜时出现解析错误,数据不是json结构"
,
e
);
}
}
else
{
log
.
info
(
"解析微博时热搜时出现解析错误,页面结构有问题"
);
}
}
return
Collections
.
emptyList
();
}
/**
* 微博预热榜(实时上升热点采集)
* @param date
* @return
*/
public
static
List
<
HotSearchList
>
weiboPreheatSearch
(
Date
date
){
String
url
=
"https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博热搜时出现连接失败"
,
e
);
}
List
<
HotSearchList
>
result
=
new
ArrayList
<>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
)){
JSONArray
cardArray
=
JSON
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
if
(
cardArray
.
size
()
>
1
)
{
JSONObject
jsonObject
=
cardArray
.
getJSONObject
(
1
);
if
(
"实时上升热点"
.
equals
(
jsonObject
.
getString
(
"title"
))
&&
jsonObject
.
containsKey
(
"card_group"
))
{
JSONArray
jsonArray
=
jsonObject
.
getJSONArray
(
"card_group"
);
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++){
JSONObject
cardInfo
=
jsonArray
.
getJSONObject
(
i
);
String
name
=
cardInfo
.
getString
(
"desc"
);
long
hotCount
=
cardInfo
.
getIntValue
(
"desc_extr"
);
String
weiboUrl
=
"http://s.weibo.com/weibo/"
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&Refer=top"
;
HotSearchList
hotSearchList
=
new
HotSearchList
(
weiboUrl
,
name
,
hotCount
,
null
,
HotSearchType
.
微博预热榜
.
name
(),
date
);
result
.
add
(
hotSearchList
);
}
//根据热度排序,赋值排名
result
=
result
.
stream
().
sorted
(
Comparator
.
comparing
(
HotSearchList:
:
getCount
).
reversed
()).
collect
(
Collectors
.
toList
());
int
rank
=
1
;
for
(
HotSearchList
hotSearchList
:
result
){
hotSearchList
.
setRank
(
rank
);
rank
++;
}
}
}
}
return
result
;
}
/**
 * Refreshes the lead text, read count, discussion count, picture and host text of one Weibo
 * hot-search document, and stores the hot posts and users found on the topic page.
 *
 * <p>The topic page is requested up to six times. The first response that contains a
 * {@code data.cardlistInfo} object is processed and ends the loop; responses without it are
 * treated like a failed attempt. A failure while handling the {@code cards} array is logged
 * and does not prevent the document from being returned.
 *
 * @param document hot-search document; its "url" value supplies the query string and its
 *                 "name" value the topic. It is updated in place.
 * @return the given document after the update, or {@code null} when no attempt produced a
 *         usable response
 */
public static Document weiboUpdate(Document document) {
    String topic = document.getString("name");
    log.info("更新微博热搜{}导语阅读量和讨论量", topic);

    // Keep only the first query parameter of the stored url; tolerate a url without '&'.
    String sourceUrl = document.getString("url");
    int queryStart = sourceUrl.indexOf("?") + 1;
    int queryEnd = sourceUrl.indexOf("&", queryStart);
    if (queryEnd < 0) {
        queryEnd = sourceUrl.length();
    }
    String url = "https://m.weibo.cn/api/container/getIndex?" + sourceUrl.substring(queryStart, queryEnd);

    String htmlBody = null;
    Request request = RequestUtils.wrapGet(url);
    for (int count = 0; count <= 5; count++) {
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博热搜详情页面时出现连接失败", e);
        }
        if (htmlBody == null || !htmlBody.contains("data")) {
            continue;
        }

        // Parse once; the substring check above does not guarantee the JSON keys exist.
        JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
        JSONObject json = data == null ? null : data.getJSONObject("cardlistInfo");
        if (json == null) {
            continue;
        }

        // Null or empty values are not written to the document.
        String topicLead = json.getString("desc");
        if (StringUtils.isNotEmpty(topicLead)) {
            document.put("topicLead", topicLead);
        }

        JSONArray headCards = json.getJSONArray("cardlist_head_cards");
        JSONObject firstHeadCard = (headCards == null || headCards.isEmpty()) ? null : headCards.getJSONObject(0);
        JSONObject headData = firstHeadCard == null ? null : firstHeadCard.getJSONObject("head_data");
        if (headData != null) {
            String midText = headData.getString("midtext");
            if (midText != null) {
                String read = midText.replaceAll("阅读", "").replaceAll("讨论.*", "").trim();
                String discussCount = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim();
                document.put("readCount", TipsUtils.getHotCount(read));
                document.put("discussCount", TipsUtils.getHotCount(discussCount));
            }
            document.put("pictureUrl", headData.getString("portrait_url"));
            String downtext = headData.getString("downtext");
            if (StringUtils.isNotEmpty(downtext)) {
                document.put("downtext", downtext.replaceAll("主持人:", ""));
            }
        }

        try {
            // Parse the cards to collect hot posts and users.
            if (Objects.isNull(weiBoMassageDao)) {
                weiBoMassageDao = new WeiBoMassageDao();
            }
            if (Objects.isNull(weiBoUserDao)) {
                weiBoUserDao = new WeiBoUserDao();
            }
            JSONArray cards = data.getJSONArray("cards");
            if (cards == null) {
                log.error("解析cards失败,未获得热门微博,人物信息");
            } else {
                for (int i = 0; i < cards.size(); i++) {
                    JSONObject card = cards.getJSONObject(i);
                    if (card == null || card.isEmpty()) {
                        continue;
                    }
                    if (card.containsKey("mblog")) {
                        JSONObject mblog = card.getJSONObject("mblog");
                        if (mblog != null && mblog.containsKey("title")) {
                            WeiBoMassage weiBoMassage = analysisWeiboMBlog(card, topic);
                            if (Objects.nonNull(weiBoMassage)) {
                                weiBoMassageDao.addWeiBoMassage(weiBoMassage);
                            }
                        }
                    } else if (card.containsKey("card_group")) {
                        JSONArray cardGroup = card.getJSONArray("card_group");
                        WeiBoMassage weiBoMassage = analysisWeiboMassage(cardGroup, topic);
                        if (Objects.nonNull(weiBoMassage)) {
                            weiBoMassageDao.addWeiBoMassage(weiBoMassage);
                        }
                        for (WeiBoUser weiBoUser : analysisWeiBoUsers(cardGroup, topic)) {
                            weiBoUserDao.addWeiBoUser(weiBoUser);
                        }
                    }
                }
            }
        } catch (Exception e) {
            log.error("解析cards失败,未获得热门微博,人物信息", e);
        }
        return document;
    }
    return null;
}
/**
 * Finds the first card of the group that holds a titled post and converts it.
 *
 * <p>A card qualifies when it has an "mblog" entry and that entry has a "title" key; the
 * conversion itself is delegated to {@code analysisWeiboMBlog}.
 *
 * @param cardGroup cards of one "card_group"
 * @param topic     topic the cards belong to
 * @return the converted post of the first qualifying card, or {@code null} when none qualifies
 */
public static WeiBoMassage analysisWeiboMassage(JSONArray cardGroup, String topic) {
    for (int index = 0; index < cardGroup.size(); index++) {
        JSONObject card = cardGroup.getJSONObject(index);
        if (!card.containsKey("mblog")) {
            continue;
        }
        if (!card.getJSONObject("mblog").containsKey("title")) {
            continue;
        }
        return analysisWeiboMBlog(card, topic);
    }
    return null;
}
/**
 * Extracts the users of the first user-bearing card in a card group.
 *
 * <p>Cards are scanned in order. The first card whose {@code card_type} is 3 (a "users"
 * array) or 10 (a single "user" object) decides the result: its users are converted and
 * returned, and later cards are not inspected. Cards without a {@code card_type} are skipped.
 *
 * @param cardGroup cards of one "card_group"
 * @param topic     topic the users are collected for
 * @return the users of the first card of type 3 or 10 (possibly empty), or an empty list
 *         when the group holds no such card
 */
public static List<WeiBoUser> analysisWeiBoUsers(JSONArray cardGroup, String topic) {
    List<WeiBoUser> weiBoUserList = new ArrayList<>();
    Date date = new Date();
    for (int i = 0; i < cardGroup.size(); i++) {
        JSONObject card = cardGroup.getJSONObject(i);
        // getIntValue yields 0 for a missing card_type instead of throwing.
        int cardType = card.getIntValue("card_type");
        if (cardType == 3) {
            JSONArray users = card.getJSONArray("users");
            if (users != null) {
                for (int j = 0; j < users.size(); j++) {
                    weiBoUserList.add(toTopicUser(users.getJSONObject(j), topic, date));
                }
            }
            return weiBoUserList;
        }
        if (cardType == 10) {
            JSONObject user = card.getJSONObject("user");
            if (user != null) {
                weiBoUserList.add(toTopicUser(user, topic, date));
            }
            return weiBoUserList;
        }
    }
    return Collections.emptyList();
}

/** Builds a WeiBoUser from one user JSON object of the topic page. */
private static WeiBoUser toTopicUser(JSONObject user, String topic, Date date) {
    String userId = user.getString("id");
    String userName = user.getString("screen_name");
    String attestationMassage = user.getString("verified_reason");
    Long followerCount = parseFollowerCount(user.getString("followers_count"));
    String profileImageUrl = user.getString("profile_image_url");
    return new WeiBoUser(userId, attestationMassage, userName, topic, date, followerCount, profileImageUrl);
}

/**
 * Converts a follower-count text such as "1234", "12.5万" or "1.2亿" to a number.
 * Returns null for a null or blank text; other unparseable text raises NumberFormatException.
 */
private static Long parseFollowerCount(String text) {
    if (StringUtils.isBlank(text)) {
        return null;
    }
    String digits = text.trim();
    long unit = 1L;
    int unitAt = digits.indexOf("万");
    if (unitAt >= 0) {
        unit = 10_000L;
    } else {
        // NOTE(review): "亿" (10^8) is assumed to be used for very large counts — confirm.
        unitAt = digits.indexOf("亿");
        if (unitAt >= 0) {
            unit = 100_000_000L;
        }
    }
    if (unitAt >= 0) {
        digits = digits.substring(0, unitAt).trim();
    }
    // BigDecimal accepts the decimal part that Long.valueOf rejects ("12.5" -> 125000 with 万).
    return new java.math.BigDecimal(digits).multiply(java.math.BigDecimal.valueOf(unit)).longValue();
}
/**
 * Converts one card that carries an "mblog" object into a {@link WeiBoMassage}.
 *
 * <p>Reads the interaction counts, creation and edit time, author, source and text of the
 * post. When the post is a repost ("retweeted_status" present) the root post's fields are
 * copied as well and the forward flag is set to 1; pictures and play count are then taken
 * from the root post, otherwise from the post itself.
 *
 * @param jsonObject card object; must contain "mblog" (with a "title"), "card_type" and
 *                   "show_type"
 * @param topic      topic the post belongs to
 * @return the populated message
 * @throws NumberFormatException when "card_type", "show_type" or a count is not numeric
 */
public static WeiBoMassage analysisWeiboMBlog(JSONObject jsonObject, String topic) {
    JSONObject mblog = jsonObject.getJSONObject("mblog");
    String type = mblog.getJSONObject("title").getString("text");
    Integer cardType = Integer.valueOf(jsonObject.getString("card_type"));
    Integer showType = Integer.valueOf(jsonObject.getString("show_type"));

    // Likes, comments and reposts; each may be abbreviated, e.g. "1.2万".
    Long attitudeCount = parseMblogCount(mblog.getString("attitudes_count"));
    Long commentCount = parseMblogCount(mblog.getString("comments_count"));
    Long repostCount = parseMblogCount(mblog.getString("reposts_count"));

    Date createTime = null;
    Date editTime = null;
    try {
        SimpleDateFormat simpleDateFormat =
                new SimpleDateFormat("EEE MMM dd HH:mm:ss z yyyy", java.util.Locale.US);
        String createdAt = mblog.getString("created_at");
        if (createdAt != null) {
            createTime = simpleDateFormat.parse(createdAt);
        }
        String editAt = mblog.getString("edit_at");
        if (editAt != null) {
            editTime = simpleDateFormat.parse(editAt);
        }
    } catch (ParseException e) {
        log.error("创建时间和编辑时间解析异常", e);
    }

    String mid = mblog.getString("mid");
    JSONObject author = mblog.getJSONObject("user");
    String userId = author.getString("id");
    String userName = author.getString("screen_name");
    String source = mblog.getString("source");
    String profileImageUrl = author.getString("profile_image_url");

    // Strip markup from the text only when it looks like HTML.
    String content = mblog.getString("text");
    if (content != null && content.contains("<")) {
        content = Jsoup.parse(content).text();
    }

    WeiBoMassage weiBoMassage = new WeiBoMassage(userId, content, userName, mid, createTime, editTime,
            cardType, showType, repostCount, commentCount, attitudeCount, source, type, topic, profileImageUrl);
    // Not a repost by default.
    weiBoMassage.setForward(0);

    // The post whose media is read: the root post for reposts, the post itself otherwise.
    JSONObject weiboJson = mblog;
    JSONObject retweeted = mblog.getJSONObject("retweeted_status");
    if (retweeted != null) {
        weiboJson = retweeted;
        String rootHtml = retweeted.getString("text");
        // NOTE(review): the root post's "user" may be absent (e.g. deleted root post) — confirm.
        JSONObject rootUser = retweeted.getJSONObject("user");
        weiBoMassage.setRoot_mid(retweeted.getString("mid"));
        weiBoMassage.setRoot_id(rootUser == null ? null : rootUser.getString("id"));
        weiBoMassage.setRoot_source(retweeted.getString("source"));
        weiBoMassage.setRoot_text(rootHtml == null ? null : Jsoup.parse(rootHtml).text());
        weiBoMassage.setRoot_name(rootUser == null ? null : rootUser.getString("screen_name"));
        weiBoMassage.setForward(1);
    }

    List<String> pictureUrlList = new ArrayList<>();
    Long playCount = null;
    // Picture links when the post has pictures, otherwise the play count of its page info.
    JSONArray picIds = weiboJson.getJSONArray("pic_ids");
    if (picIds != null && !picIds.isEmpty()) {
        JSONArray pics = weiboJson.getJSONArray("pics");
        if (pics != null) {
            for (int i = 0; i < pics.size(); i++) {
                pictureUrlList.add(pics.getJSONObject(i).getString("url"));
            }
        }
    } else if (weiboJson.containsKey("page_info")) {
        JSONObject pageInfo = weiboJson.getJSONObject("page_info");
        String play = pageInfo == null ? null : pageInfo.getString("play_count");
        if (play != null) {
            if (play.contains("万")) {
                playCount = parseMblogCount(play);
            } else if (play.contains("次")) {
                playCount = parseMblogCount(play.substring(0, play.indexOf("次")));
            }
        }
    }
    weiBoMassage.setPlayCount(playCount);
    weiBoMassage.setPictureUrlList(pictureUrlList);
    return weiBoMassage;
}

/**
 * Converts a count text such as "1234", "12.5万" or "1.2亿" to a number.
 * Returns null for a null or blank text; other unparseable text raises NumberFormatException.
 */
private static Long parseMblogCount(String text) {
    if (StringUtils.isBlank(text)) {
        return null;
    }
    String digits = text.trim();
    long unit = 1L;
    int unitAt = digits.indexOf("万");
    if (unitAt >= 0) {
        unit = 10_000L;
    } else {
        // NOTE(review): "亿" (10^8) is assumed to be used for very large counts — confirm.
        unitAt = digits.indexOf("亿");
        if (unitAt >= 0) {
            unit = 100_000_000L;
        }
    }
    if (unitAt >= 0) {
        digits = digits.substring(0, unitAt).trim();
    }
    // BigDecimal accepts the decimal part that Long.valueOf rejects ("1.2" -> 12000 with 万).
    return new java.math.BigDecimal(digits).multiply(java.math.BigDecimal.valueOf(unit)).longValue();
}
return
result
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博时热搜时出现解析错误,数据不是json结构"
,
e
);
}
}
else
{
log
.
info
(
"解析微博时热搜时出现解析错误,页面结构有问题"
);
}
}
return
Collections
.
emptyList
();
}
/**
* 微博预热榜(实时上升热点采集)
*
* @param date
* @return
*/
public
static
List
<
HotSearchList
>
weiboPreheatSearch
(
Date
date
)
{
String
url
=
"https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博热搜时出现连接失败"
,
e
);
}
List
<
HotSearchList
>
result
=
new
ArrayList
<>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
JSONArray
cardArray
=
JSON
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
if
(
cardArray
.
size
()
>
1
)
{
JSONObject
jsonObject
=
cardArray
.
getJSONObject
(
1
);
if
(
"实时上升热点"
.
equals
(
jsonObject
.
getString
(
"title"
))
&&
jsonObject
.
containsKey
(
"card_group"
))
{
JSONArray
jsonArray
=
jsonObject
.
getJSONArray
(
"card_group"
);
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
JSONObject
cardInfo
=
jsonArray
.
getJSONObject
(
i
);
String
name
=
cardInfo
.
getString
(
"desc"
);
long
hotCount
=
cardInfo
.
getIntValue
(
"desc_extr"
);
String
weiboUrl
=
"http://s.weibo.com/weibo/"
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&Refer=top"
;
HotSearchList
hotSearchList
=
new
HotSearchList
(
weiboUrl
,
name
,
hotCount
,
null
,
HotSearchType
.
微博预热榜
.
name
(),
date
);
result
.
add
(
hotSearchList
);
}
//根据热度排序,赋值排名
result
=
result
.
stream
().
sorted
(
Comparator
.
comparing
(
HotSearchList:
:
getCount
).
reversed
()).
collect
(
Collectors
.
toList
());
int
rank
=
1
;
for
(
HotSearchList
hotSearchList
:
result
)
{
hotSearchList
.
setRank
(
rank
);
rank
++;
}
}
}
}
return
result
;
}
/**
 * Refreshes the lead text, read count, discussion count, picture and host text of one Weibo
 * hot-search document, and stores the hot posts and users found on the topic page.
 *
 * <p>The topic page is requested up to six times. The first response that contains a
 * {@code data.cardlistInfo} object is processed and ends the loop; responses without it are
 * treated like a failed attempt. A failure while handling the {@code cards} array is logged
 * and does not prevent the document from being returned.
 *
 * @param document hot-search document; its "url" value supplies the query string and its
 *                 "name" value the topic. It is updated in place.
 * @return the given document after the update, or {@code null} when no attempt produced a
 *         usable response
 */
public static Document weiboUpdate(Document document) {
    String topic = document.getString("name");
    log.info("更新微博热搜{}导语阅读量和讨论量", topic);

    // Keep only the first query parameter of the stored url; tolerate a url without '&'.
    String sourceUrl = document.getString("url");
    int queryStart = sourceUrl.indexOf("?") + 1;
    int queryEnd = sourceUrl.indexOf("&", queryStart);
    if (queryEnd < 0) {
        queryEnd = sourceUrl.length();
    }
    String url = "https://m.weibo.cn/api/container/getIndex?" + sourceUrl.substring(queryStart, queryEnd);

    String htmlBody = null;
    Request request = RequestUtils.wrapGet(url);
    for (int count = 0; count <= 5; count++) {
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博热搜详情页面时出现连接失败", e);
        }
        if (htmlBody == null || !htmlBody.contains("data")) {
            continue;
        }

        // Parse once; the substring check above does not guarantee the JSON keys exist.
        JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
        JSONObject json = data == null ? null : data.getJSONObject("cardlistInfo");
        if (json == null) {
            continue;
        }

        // Null or empty values are not written to the document.
        String topicLead = json.getString("desc");
        if (StringUtils.isNotEmpty(topicLead)) {
            document.put("topicLead", topicLead);
        }

        JSONArray headCards = json.getJSONArray("cardlist_head_cards");
        JSONObject firstHeadCard = (headCards == null || headCards.isEmpty()) ? null : headCards.getJSONObject(0);
        JSONObject headData = firstHeadCard == null ? null : firstHeadCard.getJSONObject("head_data");
        if (headData != null) {
            String midText = headData.getString("midtext");
            if (midText != null) {
                String read = midText.replaceAll("阅读", "").replaceAll("讨论.*", "").trim();
                String discussCount = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim();
                document.put("readCount", TipsUtils.getHotCount(read));
                document.put("discussCount", TipsUtils.getHotCount(discussCount));
            }
            document.put("pictureUrl", headData.getString("portrait_url"));
            String downtext = headData.getString("downtext");
            if (StringUtils.isNotEmpty(downtext)) {
                document.put("downtext", downtext.replaceAll("主持人:", ""));
            }
        }

        try {
            // Parse the cards to collect hot posts and users.
            if (Objects.isNull(weiBoMassageDao)) {
                weiBoMassageDao = new WeiBoMassageDao();
            }
            if (Objects.isNull(weiBoUserDao)) {
                weiBoUserDao = new WeiBoUserDao();
            }
            JSONArray cards = data.getJSONArray("cards");
            if (cards == null) {
                log.error("解析cards失败,未获得热门微博,人物信息");
            } else {
                for (int i = 0; i < cards.size(); i++) {
                    JSONObject card = cards.getJSONObject(i);
                    if (card == null || card.isEmpty()) {
                        continue;
                    }
                    if (card.containsKey("mblog")) {
                        JSONObject mblog = card.getJSONObject("mblog");
                        if (mblog != null && mblog.containsKey("title")) {
                            WeiBoMassage weiBoMassage = analysisWeiboMBlog(card, topic);
                            if (Objects.nonNull(weiBoMassage)) {
                                weiBoMassageDao.addWeiBoMassage(weiBoMassage);
                            }
                        }
                    } else if (card.containsKey("card_group")) {
                        JSONArray cardGroup = card.getJSONArray("card_group");
                        WeiBoMassage weiBoMassage = analysisWeiboMassage(cardGroup, topic);
                        if (Objects.nonNull(weiBoMassage)) {
                            weiBoMassageDao.addWeiBoMassage(weiBoMassage);
                        }
                        for (WeiBoUser weiBoUser : analysisWeiBoUsers(cardGroup, topic)) {
                            weiBoUserDao.addWeiBoUser(weiBoUser);
                        }
                    }
                }
            }
        } catch (Exception e) {
            log.error("解析cards失败,未获得热门微博,人物信息", e);
        }
        return document;
    }
    return null;
}
/**
 * Collects the topic contributors and the "about" section of a topic from the PC search page.
 *
 * <p>The page for {@code #name#} is requested once. Every user listed in a
 * {@code div.card-user} block is stored through {@code writeUser}: as "话题贡献者" when the
 * block has a non-empty {@code div.card-head}, as "当事人" otherwise. The {@code dt}/{@code dd}
 * pairs of {@code div.card-about} are gathered into a new document, keyed by the {@code dt}
 * text, with the link texts of the matching {@code dd} as value.
 *
 * @param document hot-search document whose "name" value is the topic
 * @return a new document holding only the "about" entries when the page was fetched and
 *         parsed; the given document unchanged when the request fails, the page lacks
 *         "m-main", or parsing throws
 */
public static Document weiboUpdatePC(Document document) {
    String topic = document.getString("name");
    String encode = null;
    try {
        encode = URLEncoder.encode("#" + topic + "#", "utf-8");
    } catch (UnsupportedEncodingException e) {
        log.error("字符解析成URl模式异常", e);
    }

    Request request = RequestUtils.wrapGet("https://s.weibo.com/weibo?q=" + encode);
    String htmlBody = null;
    try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
        htmlBody = response.body().string();
    } catch (IOException e) {
        log.error("解析微博时热搜时出现连接失败", e);
    }
    if (htmlBody == null || !htmlBody.contains("m-main")) {
        return document;
    }

    Document docm = new Document();
    try {
        org.jsoup.nodes.Document page = Jsoup.parse(htmlBody);

        // Contributor / involved-party users; a failure here must not block the "about" part.
        try {
            for (Element userCard : page.select("div.card-user")) {
                boolean hasHead = !userCard.select("div.card-head").text().isEmpty();
                String type = hasHead ? "话题贡献者" : "当事人";
                Elements entries = userCard.select("ul.card-user-list-a").select("li");
                if (Objects.nonNull(entries)) {
                    for (Element entry : entries) {
                        writeUser(entry, type, topic);
                    }
                }
            }
        } catch (Exception e) {
            log.error("话题贡献者排行采集异常", e);
        }

        // "About" section: the i-th dt names the entry, the i-th dd supplies its link texts.
        Elements dt = page.select("div.card-about").select("dt");
        if (Objects.nonNull(dt)) {
            Elements dd = page.select("div.card-about").select("dd");
            for (int i = 0; i < dt.size(); i++) {
                String key = dt.get(i).text().replaceAll(":", "").trim();
                List<String> linkTexts = null;
                if (i < dd.size()) {
                    linkTexts = new ArrayList<>();
                    for (Element anchor : dd.get(i).select("a")) {
                        linkTexts.add(anchor.text());
                    }
                }
                docm.put(key, linkTexts);
            }
        }
        return docm;
    } catch (Exception e) {
        log.error("解析微博话题时出现解析错误", e);
    }
    return document;
}
/**
 * Stores one user taken from a user list entry of the PC topic page.
 *
 * <p>The user name is the text of {@code a.name}; the user id is the part of the avatar
 * link's {@code href} after its first 14 characters. The record id is
 * {@code userId_type_topic}. Entries without a usable avatar link are logged and skipped.
 *
 * @param eleme {@code li} element describing the user
 * @param type  user category stored on the record
 * @param topic topic the user was collected for
 */
private static void writeUser(Element eleme, String type, String topic) {
    if (Objects.isNull(weiBoUserDao)) {
        weiBoUserDao = new WeiBoUserDao();
    }
    String userName = eleme.select("a.name").text();

    // Elements.first() returns null when the entry has no avatar link.
    Element avatarLink = eleme.select("span.avator").select("a").first();
    String href = avatarLink == null ? "" : avatarLink.attr("href");
    // NOTE(review): 14 presumably is the length of a prefix such as "//weibo.com/u/" — confirm.
    final int idOffset = 14;
    if (href.length() <= idOffset) {
        log.warn("话题[{}]的用户[{}]缺少可解析的主页链接,已跳过", topic, userName);
        return;
    }
    String userId = href.substring(idOffset);

    // The constructor takes (userId, userName, topic, time); the two were passed swapped before.
    WeiBoUser weiBoUser = new WeiBoUser(userId, userName, topic, new Date());
    weiBoUser.setType(type);
    weiBoUser.setId(userId + "_" + type + "_" + topic);
    weiBoUserDao.addWeiBoUser(weiBoUser);
}
/**
 * Scans a card group for the first card that holds a titled weibo and parses it.
 *
 * <p>A card qualifies when it has an {@code mblog} entry which in turn has a
 * {@code title} entry; parsing is delegated to {@code analysisWeiboMBlog}.
 *
 * @param cardGroup the cards to scan, in order
 * @param topic     the topic passed through to the parsed message
 * @return the parsed message of the first qualifying card, or {@code null} if none qualifies
 */
public static WeiBoMassage analysisWeiboMassage(JSONArray cardGroup, String topic) {
    int index = 0;
    while (index < cardGroup.size()) {
        JSONObject card = cardGroup.getJSONObject(index);
        index++;

        boolean hasTitledBlog =
                card.containsKey("mblog") && card.getJSONObject("mblog").containsKey("title");
        if (hasTitledBlog) {
            return analysisWeiboMBlog(card, topic);
        }
    }
    return null;
}
/**
 * Extracts the users attached to the first user-bearing card of a card group.
 *
 * <p>Cards are scanned in order. The first card whose {@code card_type} is 3 or 10 ends
 * the scan: type 3 contributes every entry of its {@code users} array, type 10 contributes
 * its single {@code user} object. If that card lacks the expected key, an empty list is
 * returned.
 *
 * @param cardGroup the cards to scan
 * @param topic     the topic recorded on every extracted user
 * @return the users of the first matching card, or an empty list when no card matches
 * @throws NumberFormatException if a card's {@code card_type} or a follower count is not numeric
 */
public static List<WeiBoUser> analysisWeiBoUsers(JSONArray cardGroup, String topic) {
    List<WeiBoUser> weiBoUserList = new ArrayList<>();
    // One timestamp shared by every user extracted in this call.
    Date date = new Date();

    for (int i = 0; i < cardGroup.size(); i++) {
        JSONObject card = cardGroup.getJSONObject(i);
        int cardType = Integer.parseInt(card.getString("card_type"));

        if (cardType == 3) {
            if (card.containsKey("users")) {
                JSONArray users = card.getJSONArray("users");
                for (int j = 0; j < users.size(); j++) {
                    weiBoUserList.add(buildWeiBoUserFromJson(users.getJSONObject(j), topic, date));
                }
            }
            return weiBoUserList;
        }

        if (cardType == 10) {
            if (card.containsKey("user")) {
                weiBoUserList.add(buildWeiBoUserFromJson(card.getJSONObject("user"), topic, date));
            }
            return weiBoUserList;
        }
    }
    return Collections.emptyList();
}

/** Maps one user JSON object onto a {@code WeiBoUser} for the given topic and timestamp. */
private static WeiBoUser buildWeiBoUserFromJson(JSONObject user, String topic, Date date) {
    String userId = user.getString("id");
    String userName = user.getString("screen_name");
    String attestationMassage = user.getString("verified_reason");
    Long followerCount = parseFollowerCount(user.getString("followers_count"));
    String profileImageUrl = user.getString("profile_image_url");
    return new WeiBoUser(userId, attestationMassage, userName, topic, date,
            followerCount, profileImageUrl);
}

/**
 * Converts a follower count such as "1234", "12.5万" or "1.2亿" to a number.
 * Returns {@code null} for a missing or blank value; fractions below one are truncated.
 */
private static Long parseFollowerCount(String raw) {
    if (raw == null || raw.trim().isEmpty()) {
        return null;
    }
    String text = raw.trim();
    java.math.BigDecimal multiplier = java.math.BigDecimal.ONE;

    int unitIndex = text.indexOf("亿");
    if (unitIndex >= 0) {
        multiplier = java.math.BigDecimal.valueOf(100000000L);
    } else {
        unitIndex = text.indexOf("万");
        if (unitIndex >= 0) {
            multiplier = java.math.BigDecimal.valueOf(10000L);
        }
    }

    // Only the digits before the unit are significant; BigDecimal accepts "12.5" where Long does not.
    String number = unitIndex >= 0 ? text.substring(0, unitIndex) : text;
    return new java.math.BigDecimal(number.trim()).multiply(multiplier).longValue();
}
/**
 * Parses one card's {@code mblog} entry into a {@code WeiBoMassage}.
 *
 * <p>Reads the author, text, counters, timestamps and source of the post. When the post
 * is a retweet ({@code retweeted_status} present), the original post's mid, source, text
 * and author are recorded as well, {@code forward} is set to 1, and pictures / play count
 * are taken from the original post; otherwise {@code forward} is 0 and they are taken
 * from the post itself.
 *
 * <p>Counters accept abbreviated values such as "12.5万". Optional pieces (retweet author,
 * picture list, page info) that are absent are left empty instead of raising an exception.
 *
 * @param jsonObject the card containing {@code mblog}, {@code card_type} and {@code show_type}
 * @param topic      the topic recorded on the message
 * @return the populated message
 * @throws NumberFormatException if {@code card_type}, {@code show_type} or a counter is not numeric
 */
public static WeiBoMassage analysisWeiboMBlog(JSONObject jsonObject, String topic) {
    JSONObject mblog = jsonObject.getJSONObject("mblog");
    String type = mblog.getJSONObject("title").getString("text");
    Integer cardType = Integer.valueOf(jsonObject.getString("card_type"));
    Integer showType = Integer.valueOf(jsonObject.getString("show_type"));

    Long attitudeCount = parseMblogCount(mblog.getString("attitudes_count"));
    Long commentCount = parseMblogCount(mblog.getString("comments_count"));
    Long repostCount = parseMblogCount(mblog.getString("reposts_count"));

    Date createTime = null;
    Date editTime = null;
    try {
        SimpleDateFormat simpleDateFormat =
                new SimpleDateFormat("EEE MMM dd HH:mm:ss z yyyy", java.util.Locale.US);
        createTime = simpleDateFormat.parse(mblog.getString("created_at"));
        if (mblog.containsKey("edit_at")) {
            editTime = simpleDateFormat.parse(mblog.getString("edit_at"));
        }
    } catch (ParseException e) {
        log.error("创建时间和编辑时间解析异常", e);
    }

    String mid = mblog.getString("mid");
    JSONObject author = mblog.getJSONObject("user");
    String userId = author.getString("id");
    String userName = author.getString("screen_name");
    String source = mblog.getString("source");
    String profileImageUrl = author.getString("profile_image_url");

    // Strip markup only when the text actually contains tags; plain text is kept verbatim.
    String rawText = mblog.getString("text");
    String content = rawText != null && rawText.contains("<")
            ? Jsoup.parse(rawText).text()
            : rawText;

    WeiBoMassage weiBoMassage = new WeiBoMassage(userId, content, userName, mid, createTime,
            editTime, cardType, showType, repostCount, commentCount, attitudeCount, source,
            type, topic, profileImageUrl);

    JSONObject retweeted = mblog.containsKey("retweeted_status")
            ? mblog.getJSONObject("retweeted_status")
            : null;

    // Pictures and play count come from the original post when this one is a retweet.
    JSONObject weiboJson;
    if (retweeted != null) {
        weiboJson = retweeted;

        String rootHtml = retweeted.getString("text");
        String rootText = rootHtml == null ? null : Jsoup.parse(rootHtml).text();

        // The original post may carry no author object; leave the root author empty then.
        JSONObject rootUser = retweeted.containsKey("user")
                ? retweeted.getJSONObject("user")
                : null;
        String rootId = rootUser == null ? null : rootUser.getString("id");
        String rootName = rootUser == null ? null : rootUser.getString("screen_name");

        weiBoMassage.setRoot_mid(retweeted.getString("mid"));
        weiBoMassage.setRoot_id(rootId);
        weiBoMassage.setRoot_source(retweeted.getString("source"));
        weiBoMassage.setRoot_text(rootText);
        weiBoMassage.setRoot_name(rootName);
        weiBoMassage.setForward(1);
    } else {
        weiboJson = mblog;
        weiBoMassage.setForward(0);
    }

    List<String> pictureUrlList = new ArrayList<>();
    Long playCount = null;

    JSONArray picIds = weiboJson.containsKey("pic_ids") ? weiboJson.getJSONArray("pic_ids") : null;
    if (picIds != null && picIds.size() > 0) {
        JSONArray pics = weiboJson.containsKey("pics") ? weiboJson.getJSONArray("pics") : null;
        if (pics != null) {
            for (int i = 0; i < pics.size(); i++) {
                pictureUrlList.add(pics.getJSONObject(i).getString("url"));
            }
        }
    } else if (weiboJson.containsKey("page_info")) {
        JSONObject pageInfo = weiboJson.getJSONObject("page_info");
        if (pageInfo != null && pageInfo.containsKey("play_count")) {
            String play = pageInfo.getString("play_count");
            // As before, a play count is recorded only when it carries a recognised unit.
            if (play != null
                    && (play.contains("亿") || play.contains("万") || play.contains("次"))) {
                playCount = parseMblogCount(play);
            }
        }
    }

    weiBoMassage.setPlayCount(playCount);
    weiBoMassage.setPictureUrlList(pictureUrlList);
    return weiBoMassage;
}

/**
 * Converts a counter such as "1234", "12.5万", "1.2亿" or "345次" to a number.
 * Text after the first unit character is ignored. Returns {@code null} for a missing or
 * blank value; fractions below one are truncated.
 */
private static Long parseMblogCount(String raw) {
    if (raw == null || raw.trim().isEmpty()) {
        return null;
    }
    String text = raw.trim();
    java.math.BigDecimal multiplier = java.math.BigDecimal.ONE;

    int unitIndex = text.indexOf("亿");
    if (unitIndex >= 0) {
        multiplier = java.math.BigDecimal.valueOf(100000000L);
    } else {
        unitIndex = text.indexOf("万");
        if (unitIndex >= 0) {
            multiplier = java.math.BigDecimal.valueOf(10000L);
        } else {
            // "次" is a plain counter suffix with no scaling.
            unitIndex = text.indexOf("次");
        }
    }

    // BigDecimal accepts a fractional prefix such as "12.5" where Long.valueOf does not.
    String number = unitIndex >= 0 ? text.substring(0, unitIndex) : text;
    return new java.math.BigDecimal(number.trim()).multiply(multiplier).longValue();
}
// /**
// * 微博更新历史数据
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
View file @
b59879ef
...
...
@@ -208,6 +208,17 @@ public class HotSearchCacheDAO {
}
if
(
"微博热搜"
.
equals
(
type
)){
nowDoc
=
WeiboHotSearchCrawler
.
weiboUpdate
(
nowDoc
);
//更新微博话题贡献者,关于功能
Document
documentPC
=
WeiboHotSearchCrawler
.
weiboUpdatePC
(
nowDoc
);
if
(
documentPC
.
containsKey
(
"分类"
))
{
nowDoc
.
put
(
"classify"
,
documentPC
.
get
(
"分类"
));
}
if
(
documentPC
.
containsKey
(
"地区"
))
{
nowDoc
.
put
(
"region"
,
documentPC
.
get
(
"地区"
));
}
if
(
documentPC
.
containsKey
(
"标签"
))
{
nowDoc
.
put
(
"label"
,
documentPC
.
get
(
"标签"
));
}
if
(
nowDoc
.
containsKey
(
"topicLead"
)){
nowDoc
.
put
(
"topicLead"
,
nowDoc
.
getString
(
"topicLead"
));
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiBoUserDao.java
View file @
b59879ef
...
...
@@ -32,6 +32,7 @@ public class WeiBoUserDao {
* @param weiBoUser
*/
public
void
addWeiBoUser
(
WeiBoUser
weiBoUser
){
try
{
Document
document
=
new
Document
();
document
.
put
(
"_id"
,
weiBoUser
.
getId
());
...
...
@@ -42,8 +43,15 @@ public class WeiBoUserDao {
document
.
put
(
"userName"
,
weiBoUser
.
getUserName
());
document
.
put
(
"topic"
,
weiBoUser
.
getTopic
());
document
.
put
(
"time"
,
weiBoUser
.
getTime
());
document
.
put
(
"followerCount"
,
weiBoUser
.
getFollowerCount
());
document
.
put
(
"profileImageUrl"
,
weiBoUser
.
getProfileImageUrl
());
if
(
Objects
.
nonNull
(
weiBoUser
.
getType
())){
document
.
put
(
"type"
,
weiBoUser
.
getType
());
}
if
(
Objects
.
nonNull
(
weiBoUser
.
getFollowerCount
())){
document
.
put
(
"followerCount"
,
weiBoUser
.
getFollowerCount
());
}
if
(
Objects
.
nonNull
(
weiBoUser
.
getProfileImageUrl
())){
document
.
put
(
"profileImageUrl"
,
weiBoUser
.
getProfileImageUrl
());
}
try
{
mongoCollection
.
insertOne
(
document
);
}
catch
(
Exception
e
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment