Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
d59803e9
Commit
d59803e9
authored
Jan 05, 2022
by
leiliangliang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
增加微博信息及用户信息异常捕获
parent
402290c1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
180 additions
and
171 deletions
+180
-171
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+180
-171
No files found.
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
d59803e9
...
@@ -623,68 +623,72 @@ public class WeiboHotSearchCrawler {
...
@@ -623,68 +623,72 @@ public class WeiboHotSearchCrawler {
public
static
List
<
WeiBoUser
>
analysisWeiBoUsers
(
JSONArray
cardGroup
,
String
topic
)
{
public
static
List
<
WeiBoUser
>
analysisWeiBoUsers
(
JSONArray
cardGroup
,
String
topic
)
{
List
<
WeiBoUser
>
weiBoUserList
=
new
ArrayList
();
List
<
WeiBoUser
>
weiBoUserList
=
new
ArrayList
();
//解析weibo人物信息
//解析weibo人物信息
Date
date
=
new
Date
();
try
{
for
(
int
i
=
0
;
i
<
cardGroup
.
size
();
i
++)
{
Date
date
=
new
Date
();
Integer
cardType
=
Integer
.
valueOf
(
cardGroup
.
getJSONObject
(
i
).
getString
(
"card_type"
));
for
(
int
i
=
0
;
i
<
cardGroup
.
size
();
i
++)
{
if
(
24
==
cardType
||
3
==
cardType
)
{
Integer
cardType
=
Integer
.
valueOf
(
cardGroup
.
getJSONObject
(
i
).
getString
(
"card_type"
));
if
(
cardGroup
.
getJSONObject
(
i
).
containsKey
(
"users"
))
{
if
(
24
==
cardType
||
3
==
cardType
)
{
JSONArray
users
=
cardGroup
.
getJSONObject
(
i
).
getJSONArray
(
"users"
);
if
(
cardGroup
.
getJSONObject
(
i
).
containsKey
(
"users"
))
{
for
(
int
i1
=
0
;
i1
<
users
.
size
();
i1
++)
{
JSONArray
users
=
cardGroup
.
getJSONObject
(
i
).
getJSONArray
(
"users"
);
for
(
int
i1
=
0
;
i1
<
users
.
size
();
i1
++)
{
//获取用户id
String
userId
=
users
.
getJSONObject
(
i1
).
getString
(
"id"
);
//获取用户名
String
userName
=
users
.
getJSONObject
(
i1
).
getString
(
"screen_name"
);
//获取认证信息
String
attestationMassage
=
users
.
getJSONObject
(
i1
).
getString
(
"verified_reason"
);
//获取粉丝数量
String
followers_count
=
users
.
getJSONObject
(
i1
).
getString
(
"followers_count"
);
Long
followerCount
=
null
;
if
(!
followers_count
.
contains
(
"万"
))
{
followerCount
=
Long
.
valueOf
(
followers_count
);
}
else
{
String
[]
split
=
followers_count
.
split
(
"万"
);
double
foll
=
Double
.
parseDouble
(
split
[
0
]);
followerCount
=
new
Double
(
foll
*
10000
).
longValue
();
// followerCount = Long.valueOf(split[0]) * 10000;
}
//用户头像地址
String
profileImageUrl
=
users
.
getJSONObject
(
i1
).
getString
(
"profile_image_url"
);
WeiBoUser
weiBoUser
=
new
WeiBoUser
(
userId
,
attestationMassage
,
userName
,
topic
,
date
,
followerCount
,
profileImageUrl
);
weiBoUserList
.
add
(
weiBoUser
);
}
}
return
weiBoUserList
;
}
else
if
(
10
==
Integer
.
valueOf
(
cardGroup
.
getJSONObject
(
i
).
getString
(
"card_type"
)))
{
if
(
cardGroup
.
getJSONObject
(
i
).
containsKey
(
"user"
))
{
JSONObject
user
=
cardGroup
.
getJSONObject
(
i
).
getJSONObject
(
"user"
);
//获取用户id
//获取用户id
String
userId
=
user
s
.
getJSONObject
(
i1
)
.
getString
(
"id"
);
String
userId
=
user
.
getString
(
"id"
);
//获取用户名
//获取用户名
String
userName
=
user
s
.
getJSONObject
(
i1
)
.
getString
(
"screen_name"
);
String
userName
=
user
.
getString
(
"screen_name"
);
//获取认证信息
//获取认证信息
String
attestationMassage
=
users
.
getJSONObject
(
i1
).
getString
(
"verified_reason"
);
String
attestationMassage
=
user
.
getString
(
"verified_reason"
);
//获取粉丝数
//获取粉丝数量
String
followers_count
=
user
.
getString
(
"followers_count"
);
String
followers_count
=
users
.
getJSONObject
(
i1
).
getString
(
"followers_count"
);
Long
followerCount
=
null
;
Long
followerCount
=
null
;
if
(!
followers_count
.
contains
(
"万"
))
{
if
(
followers_count
.
contains
(
"万"
))
{
followerCount
=
Long
.
valueOf
(
followers_count
);
}
else
{
String
[]
split
=
followers_count
.
split
(
"万"
);
String
[]
split
=
followers_count
.
split
(
"万"
);
double
foll
=
Double
.
parseDouble
(
split
[
0
]);
Double
aDouble
=
Double
.
valueOf
(
split
[
0
])
*
10000
;
followerCount
=
new
Double
(
foll
*
10000
).
longValue
();
followerCount
=
new
Double
(
aDouble
).
longValue
();
// followerCount = Long.valueOf(split[0]) * 10000;
}
else
if
(
followers_count
.
contains
(
"亿"
))
{
String
[]
split
=
followers_count
.
split
(
"亿"
);
Double
aDouble
=
Double
.
valueOf
(
split
[
0
])
*
100000000
;
followerCount
=
new
Double
(
aDouble
).
longValue
();
}
else
{
followerCount
=
Long
.
valueOf
(
followers_count
);
}
}
//用户头像地址
//用户头像地址
String
profileImageUrl
=
user
s
.
getJSONObject
(
i1
)
.
getString
(
"profile_image_url"
);
String
profileImageUrl
=
user
.
getString
(
"profile_image_url"
);
WeiBoUser
weiBoUser
=
new
WeiBoUser
(
userId
,
attestationMassage
,
userName
,
topic
,
date
,
followerCount
,
profileImageUrl
);
WeiBoUser
weiBoUser
=
new
WeiBoUser
(
userId
,
attestationMassage
,
userName
,
topic
,
date
,
followerCount
,
profileImageUrl
);
weiBoUserList
.
add
(
weiBoUser
);
weiBoUserList
.
add
(
weiBoUser
);
}
}
return
weiBoUserList
;
}
}
return
weiBoUserList
;
}
else
if
(
10
==
Integer
.
valueOf
(
cardGroup
.
getJSONObject
(
i
).
getString
(
"card_type"
)))
{
if
(
cardGroup
.
getJSONObject
(
i
).
containsKey
(
"user"
))
{
JSONObject
user
=
cardGroup
.
getJSONObject
(
i
).
getJSONObject
(
"user"
);
//获取用户id
String
userId
=
user
.
getString
(
"id"
);
//获取用户名
String
userName
=
user
.
getString
(
"screen_name"
);
//获取认证信息
String
attestationMassage
=
user
.
getString
(
"verified_reason"
);
//获取粉丝数
String
followers_count
=
user
.
getString
(
"followers_count"
);
Long
followerCount
=
null
;
if
(
followers_count
.
contains
(
"万"
))
{
String
[]
split
=
followers_count
.
split
(
"万"
);
Double
aDouble
=
Double
.
valueOf
(
split
[
0
])
*
10000
;
followerCount
=
new
Double
(
aDouble
).
longValue
();
}
else
if
(
followers_count
.
contains
(
"亿"
))
{
String
[]
split
=
followers_count
.
split
(
"亿"
);
Double
aDouble
=
Double
.
valueOf
(
split
[
0
])
*
100000000
;
followerCount
=
new
Double
(
aDouble
).
longValue
();
}
else
{
followerCount
=
Long
.
valueOf
(
followers_count
);
}
//用户头像地址
String
profileImageUrl
=
user
.
getString
(
"profile_image_url"
);
WeiBoUser
weiBoUser
=
new
WeiBoUser
(
userId
,
attestationMassage
,
userName
,
topic
,
date
,
followerCount
,
profileImageUrl
);
weiBoUserList
.
add
(
weiBoUser
);
}
return
weiBoUserList
;
}
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析人物信息失败"
,
e
);
}
}
return
Collections
.
emptyList
();
return
Collections
.
emptyList
();
}
}
...
@@ -698,137 +702,142 @@ public class WeiboHotSearchCrawler {
...
@@ -698,137 +702,142 @@ public class WeiboHotSearchCrawler {
* @return
* @return
*/
*/
public
static
WeiBoMassage
analysisWeiboMBlog
(
JSONObject
jsonObject
,
String
topic
)
{
public
static
WeiBoMassage
analysisWeiboMBlog
(
JSONObject
jsonObject
,
String
topic
)
{
JSONObject
mblog
=
jsonObject
.
getJSONObject
(
"mblog"
);
WeiBoMassage
weiBoMassage
=
null
;
String
type
=
mblog
.
getJSONObject
(
"title"
).
getString
(
"text"
);
try
{
String
card_type
=
jsonObject
.
getString
(
"card_type"
);
JSONObject
mblog
=
jsonObject
.
getJSONObject
(
"mblog"
);
Integer
cardType
=
Integer
.
valueOf
(
card_type
);
String
type
=
mblog
.
getJSONObject
(
"title"
).
getString
(
"text"
);
String
show_type
=
jsonObject
.
getString
(
"show_type"
);
String
card_type
=
jsonObject
.
getString
(
"card_type"
);
Integer
showType
=
Integer
.
valueOf
(
show_type
);
Integer
cardType
=
Integer
.
valueOf
(
card_type
);
//点赞数
String
show_type
=
jsonObject
.
getString
(
"show_type"
);
String
attitudes_count
=
mblog
.
getString
(
"attitudes_count"
);
Integer
showType
=
Integer
.
valueOf
(
show_type
);
Long
attitudeCount
=
null
;
//点赞数
if
(
attitudes_count
.
contains
(
"万"
))
{
String
attitudes_count
=
mblog
.
getString
(
"attitudes_count"
);
String
[]
split
=
attitudes_count
.
split
(
"万"
);
Long
attitudeCount
=
null
;
attitudeCount
=
Long
.
valueOf
(
split
[
0
])
*
10000
;
if
(
attitudes_count
.
contains
(
"万"
))
{
}
else
{
String
[]
split
=
attitudes_count
.
split
(
"万"
);
attitudeCount
=
Long
.
valueOf
(
attitudes_count
);
attitudeCount
=
Long
.
valueOf
(
split
[
0
])
*
10000
;
}
}
else
{
attitudeCount
=
Long
.
valueOf
(
attitudes_count
);
}
//评论数
//评论数
String
comments_count
=
mblog
.
getString
(
"comments_count"
);
String
comments_count
=
mblog
.
getString
(
"comments_count"
);
Long
commentCount
=
null
;
Long
commentCount
=
null
;
if
(
comments_count
.
contains
(
"万"
))
{
if
(
comments_count
.
contains
(
"万"
))
{
String
[]
split
=
comments_count
.
split
(
"万"
);
String
[]
split
=
comments_count
.
split
(
"万"
);
commentCount
=
Long
.
valueOf
(
split
[
0
])
*
10000
;
commentCount
=
Long
.
valueOf
(
split
[
0
])
*
10000
;
}
else
{
}
else
{
commentCount
=
Long
.
valueOf
(
comments_count
);
commentCount
=
Long
.
valueOf
(
comments_count
);
}
}
//转发数
//转发数
String
reposts_count
=
mblog
.
getString
(
"reposts_count"
);
String
reposts_count
=
mblog
.
getString
(
"reposts_count"
);
Long
repostCount
=
null
;
Long
repostCount
=
null
;
if
(
reposts_count
.
contains
(
"万"
))
{
if
(
reposts_count
.
contains
(
"万"
))
{
String
[]
split
=
reposts_count
.
split
(
"万"
);
String
[]
split
=
reposts_count
.
split
(
"万"
);
repostCount
=
Long
.
valueOf
(
split
[
0
])
*
10000
;
repostCount
=
Long
.
valueOf
(
split
[
0
])
*
10000
;
}
else
{
}
else
{
repostCount
=
Long
.
valueOf
(
reposts_count
);
repostCount
=
Long
.
valueOf
(
reposts_count
);
}
}
Date
createTime
=
null
;
Date
createTime
=
null
;
Date
editTime
=
null
;
Date
editTime
=
null
;
try
{
try
{
SimpleDateFormat
simpleDateFormat
=
new
SimpleDateFormat
(
"EEE MMM dd HH:mm:ss z yyyy"
,
java
.
util
.
Locale
.
US
);
SimpleDateFormat
simpleDateFormat
=
new
SimpleDateFormat
(
"EEE MMM dd HH:mm:ss z yyyy"
,
Locale
.
US
);
//创建时间
//创建时间
String
created_at
=
mblog
.
getString
(
"created_at"
);
String
created_at
=
mblog
.
getString
(
"created_at"
);
createTime
=
simpleDateFormat
.
parse
(
created_at
);
createTime
=
simpleDateFormat
.
parse
(
created_at
);
//编辑时间
//编辑时间
if
(
mblog
.
containsKey
(
"edit_at"
))
{
if
(
mblog
.
containsKey
(
"edit_at"
))
{
String
edit_at
=
mblog
.
getString
(
"edit_at"
);
String
edit_at
=
mblog
.
getString
(
"edit_at"
);
editTime
=
simpleDateFormat
.
parse
(
edit_at
);
editTime
=
simpleDateFormat
.
parse
(
edit_at
);
}
}
catch
(
ParseException
e
)
{
log
.
error
(
"创建时间和编辑时间解析异常"
,
e
);
}
}
}
catch
(
ParseException
e
)
{
log
.
error
(
"创建时间和编辑时间解析异常"
,
e
);
}
String
mid
=
mblog
.
getString
(
"mid"
);
String
mid
=
mblog
.
getString
(
"mid"
);
//用户id
//用户id
String
userId
=
mblog
.
getJSONObject
(
"user"
).
getString
(
"id"
);
String
userId
=
mblog
.
getJSONObject
(
"user"
).
getString
(
"id"
);
//用户名
//用户名
String
userName
=
mblog
.
getJSONObject
(
"user"
).
getString
(
"screen_name"
);
String
userName
=
mblog
.
getJSONObject
(
"user"
).
getString
(
"screen_name"
);
//来源
//来源
String
source
=
mblog
.
getString
(
"source"
);
String
source
=
mblog
.
getString
(
"source"
);
//用户头像地址
//用户头像地址
String
profileImageUrl
=
mblog
.
getJSONObject
(
"user"
).
getString
(
"profile_image_url"
);
String
profileImageUrl
=
mblog
.
getJSONObject
(
"user"
).
getString
(
"profile_image_url"
);
//内容
//内容
String
content
=
null
;
String
content
=
null
;
if
(
mblog
.
getString
(
"text"
).
contains
(
"<"
))
{
if
(
mblog
.
getString
(
"text"
).
contains
(
"<"
))
{
String
text
=
mblog
.
getString
(
"text"
);
String
text
=
mblog
.
getString
(
"text"
);
org
.
jsoup
.
nodes
.
Document
parse
=
Jsoup
.
parse
(
text
);
org
.
jsoup
.
nodes
.
Document
parse
=
Jsoup
.
parse
(
text
);
content
=
parse
.
text
();
content
=
parse
.
text
();
}
else
{
content
=
mblog
.
getString
(
"text"
);
}
WeiBoMassage
weiBoMassage
=
new
WeiBoMassage
(
userId
,
content
,
userName
,
mid
,
createTime
,
editTime
,
cardType
,
showType
,
}
else
{
repostCount
,
commentCount
,
attitudeCount
,
source
,
type
,
topic
,
profileImageUrl
);
content
=
mblog
.
getString
(
"text"
);
//默认不转发为0
weiBoMassage
.
setForward
(
0
);
JSONObject
weiboJson
=
null
;
//微博实体 是否转发
if
(
mblog
.
containsKey
(
"retweeted_status"
))
{
weiboJson
=
mblog
.
getJSONObject
(
"retweeted_status"
);
//处理转发特有的
//weiBoMassage.set
//源mid
String
rootMid
=
weiboJson
.
getString
(
"mid"
);
//源来源
String
rootSource
=
weiboJson
.
getString
(
"source"
);
//源text
String
text
=
weiboJson
.
getString
(
"text"
);
//解析
org
.
jsoup
.
nodes
.
Document
parse
=
Jsoup
.
parse
(
text
);
String
rootText
=
parse
.
text
();
//源用户id
String
rootId
=
weiboJson
.
getJSONObject
(
"user"
).
getString
(
"id"
);
//源用户名
String
rootName
=
weiboJson
.
getJSONObject
(
"user"
).
getString
(
"screen_name"
);
//数据保存到对象中
weiBoMassage
.
setRoot_mid
(
rootMid
);
weiBoMassage
.
setRoot_id
(
rootId
);
weiBoMassage
.
setRoot_source
(
rootSource
);
weiBoMassage
.
setRoot_text
(
rootText
);
weiBoMassage
.
setRoot_name
(
rootName
);
//转发为1
weiBoMassage
.
setForward
(
1
);
}
else
{
weiboJson
=
mblog
;
}
List
<
String
>
pictureUrlList
=
new
ArrayList
();
Long
playCount
=
null
;
//获取播放量和图片链接
if
(
weiboJson
.
getJSONArray
(
"pic_ids"
).
size
()
>
0
)
{
JSONArray
jsonArray
=
weiboJson
.
getJSONArray
(
"pics"
);
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
String
picUrl
=
jsonArray
.
getJSONObject
(
i
).
getString
(
"url"
);
pictureUrlList
.
add
(
picUrl
);
}
}
}
else
if
(
weiboJson
.
containsKey
(
"page_info"
))
{
if
(
weiboJson
.
getJSONObject
(
"page_info"
).
containsKey
(
"play_count"
))
{
weiBoMassage
=
new
WeiBoMassage
(
userId
,
content
,
userName
,
mid
,
createTime
,
editTime
,
cardType
,
showType
,
String
play
=
weiboJson
.
getJSONObject
(
"page_info"
).
getString
(
"play_count"
);
repostCount
,
commentCount
,
attitudeCount
,
source
,
type
,
topic
,
profileImageUrl
);
if
(
play
.
contains
(
"万"
))
{
//默认不转发为0
String
[]
split
=
play
.
split
(
"万"
);
weiBoMassage
.
setForward
(
0
);
playCount
=
Long
.
valueOf
(
split
[
0
])
*
10000
;
}
else
if
(
play
.
contains
(
"次"
))
{
JSONObject
weiboJson
=
null
;
String
[]
split
=
play
.
split
(
"次"
);
//微博实体 是否转发
playCount
=
Long
.
valueOf
(
split
[
0
]);
if
(
mblog
.
containsKey
(
"retweeted_status"
))
{
weiboJson
=
mblog
.
getJSONObject
(
"retweeted_status"
);
//处理转发特有的
//weiBoMassage.set
//源mid
String
rootMid
=
weiboJson
.
getString
(
"mid"
);
//源来源
String
rootSource
=
weiboJson
.
getString
(
"source"
);
//源text
String
text
=
weiboJson
.
getString
(
"text"
);
//解析
org
.
jsoup
.
nodes
.
Document
parse
=
Jsoup
.
parse
(
text
);
String
rootText
=
parse
.
text
();
//源用户id
String
rootId
=
weiboJson
.
getJSONObject
(
"user"
).
getString
(
"id"
);
//源用户名
String
rootName
=
weiboJson
.
getJSONObject
(
"user"
).
getString
(
"screen_name"
);
//数据保存到对象中
weiBoMassage
.
setRoot_mid
(
rootMid
);
weiBoMassage
.
setRoot_id
(
rootId
);
weiBoMassage
.
setRoot_source
(
rootSource
);
weiBoMassage
.
setRoot_text
(
rootText
);
weiBoMassage
.
setRoot_name
(
rootName
);
//转发为1
weiBoMassage
.
setForward
(
1
);
}
else
{
weiboJson
=
mblog
;
}
List
<
String
>
pictureUrlList
=
new
ArrayList
();
Long
playCount
=
null
;
//获取播放量和图片链接
if
(
weiboJson
.
getJSONArray
(
"pic_ids"
).
size
()
>
0
)
{
JSONArray
jsonArray
=
weiboJson
.
getJSONArray
(
"pics"
);
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
String
picUrl
=
jsonArray
.
getJSONObject
(
i
).
getString
(
"url"
);
pictureUrlList
.
add
(
picUrl
);
}
}
else
if
(
weiboJson
.
containsKey
(
"page_info"
))
{
if
(
weiboJson
.
getJSONObject
(
"page_info"
).
containsKey
(
"play_count"
))
{
String
play
=
weiboJson
.
getJSONObject
(
"page_info"
).
getString
(
"play_count"
);
if
(
play
.
contains
(
"万"
))
{
String
[]
split
=
play
.
split
(
"万"
);
playCount
=
Long
.
valueOf
(
split
[
0
])
*
10000
;
}
else
if
(
play
.
contains
(
"次"
))
{
String
[]
split
=
play
.
split
(
"次"
);
playCount
=
Long
.
valueOf
(
split
[
0
]);
}
}
}
}
}
weiBoMassage
.
setPlayCount
(
playCount
);
weiBoMassage
.
setPictureUrlList
(
pictureUrlList
);
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博信息失败"
,
e
);
}
}
weiBoMassage
.
setPlayCount
(
playCount
);
weiBoMassage
.
setPictureUrlList
(
pictureUrlList
);
return
weiBoMassage
;
return
weiBoMassage
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment