Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
c495fcc6
Commit
c495fcc6
authored
Jun 04, 2021
by
leiliangliang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
微博话题解析新增采集微博信息和微博用户
parent
f01e39b6
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
1042 additions
and
60 deletions
+1042
-60
dependency-reduced-pom.xml
+26
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiBoMassage.java
+131
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiBoUser.java
+65
-0
src/main/java/com/zhiwei/searchhotcrawler/config/DBConfig.java
+5
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+272
-3
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiBoMassageDao.java
+81
-0
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiBoUserDao.java
+59
-0
src/test/java/weiboTest/WeiboHotSearchTest.java
+403
-57
No files found.
dependency-reduced-pom.xml
View file @
c495fcc6
...
...
@@ -71,6 +71,32 @@
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.6.7.2-RELEASE
</version>
<scope>
test
</scope>
</dependency>
<dependency>
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
<version>
4.13
</version>
<scope>
test
</scope>
</dependency>
<dependency>
<groupId>
org.projectlombok
</groupId>
<artifactId>
lombok
</artifactId>
<version>
1.18.20
</version>
<scope>
test
</scope>
</dependency>
<dependency>
<groupId>
org.springframework
</groupId>
<artifactId>
spring-test
</artifactId>
<version>
5.3.6
</version>
<scope>
test
</scope>
</dependency>
</dependencies>
<properties>
<project.reporting.outputEncoding>
UTF-8
</project.reporting.outputEncoding>
<project.build.sourceEncoding>
UTF-8
</project.build.sourceEncoding>
...
...
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiBoMassage.java
0 → 100644
View file @
c495fcc6
package
com
.
zhiwei
.
searchhotcrawler
.
bean
;
/**
* @ClassName: WeiBoMassage
* @Description: 微博主要信息
* @author ll
* @date 2021年5月27日 下午2:26:11
*/
import
lombok.Data
;
import
lombok.ToString
;
import
java.io.Serializable
;
import
java.util.Date
;
import
java.util.List
;
/**
 * A single Weibo post collected while parsing a hot-search topic page.
 *
 * <p>Lombok's {@code @Data} generates the getters, setters, {@code equals},
 * {@code hashCode} and {@code toString}. The {@code root_*} fields keep their
 * snake_case names because the generated accessors ({@code getRoot_mid()} ...)
 * are part of the public interface.
 *
 * <p>{@link #pictureUrlList} and {@link #forward} carry non-null defaults so that
 * code which reads them without a prior setter call does not hit a
 * {@code NullPointerException}.
 */
@Data
@ToString
public class WeiBoMassage implements Serializable {

    private static final long serialVersionUID = 5640606453392799871L;

    /** Primary key; only populated by the full constructor. */
    private String id;

    /** Id of the posting user. */
    private String userId;

    /** Post content. */
    private String text;

    /** Screen name of the posting user. */
    private String userName;

    /** Weibo's own identifier of the post (the {@code mid} value). */
    private String mid;

    /** Creation time of the post. */
    private Date creatTime;

    /** Last edit time of the post; {@code null} when the post was never edited. */
    private Date editTime;

    /** Card type of the card the post was taken from. */
    private Integer cardType;

    /** Show type of the card the post was taken from. */
    private Integer showType;

    /** Number of reposts. */
    private Long repostCount;

    /** Number of comments. */
    private Long commentCount;

    /** Number of likes. */
    private Long attitudeCount;

    /** Number of video plays; {@code null} when not available. */
    private Long playCount;

    /** Picture URLs attached to the post; empty (never {@code null}) by default. */
    private List<String> pictureUrlList = new java.util.ArrayList<>();

    /** Source of the post. */
    private String source;

    /** Type label of the post. */
    private String type;

    /** Hot-search topic the post was collected for. */
    private String topic;

    /** Repost flag: 1 when the post is a repost, 0 (the default) otherwise. */
    private Integer forward = 0;

    /** Repost only: {@code mid} of the original post. */
    private String root_mid;

    /** Repost only: user id of the original post's author. */
    private String root_id;

    /** Repost only: screen name of the original post's author. */
    private String root_name;

    /** Repost only: text of the original post. */
    private String root_text;

    /** Repost only: source of the original post. */
    private String root_source;

    /** Creates an empty instance; {@link #id} stays {@code null} until set explicitly. */
    public WeiBoMassage() {
    }

    /**
     * Creates a post and derives its primary key as
     * {@code mid + "_" + HotSearchType.微博热搜.name() + "_" + topic}.
     *
     * <p>{@code playCount}, {@code pictureUrlList}, {@code forward} and the
     * {@code root_*} fields are not set here; they keep their defaults until the
     * caller invokes the corresponding setters.
     */
    public WeiBoMassage(String userId, String text, String userName, String mid, Date creatTime,
                        Date editTime, Integer cardType, Integer showType, Long repostCount,
                        Long commentCount, Long attitudeCount, String source, String type,
                        String topic) {
        this.id = mid + "_" + HotSearchType.微博热搜.name() + "_" + topic;
        this.userId = userId;
        this.text = text;
        this.userName = userName;
        this.mid = mid;
        this.creatTime = creatTime;
        this.editTime = editTime;
        this.cardType = cardType;
        this.showType = showType;
        this.repostCount = repostCount;
        this.commentCount = commentCount;
        this.attitudeCount = attitudeCount;
        this.source = source;
        this.type = type;
        this.topic = topic;
    }
}
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiBoUser.java
0 → 100644
View file @
c495fcc6
package
com
.
zhiwei
.
searchhotcrawler
.
bean
;
/**
* @ClassName: WeiBoUser
* @Description: 微博用户
* @author ll
* @date 2021年5月27日 下午3:26:11
*/
import
lombok.Data
;
import
lombok.ToString
;
import
java.io.Serializable
;
import
java.util.Date
;
/**
 * A Weibo user that appeared on a hot-search topic page.
 *
 * <p>Accessors, {@code equals}, {@code hashCode} and {@code toString} are
 * generated by Lombok's {@code @Data}.
 */
@Data
@ToString
public class WeiBoUser implements Serializable {

    private static final long serialVersionUID = -2856936638431788899L;

    /** Primary key; only populated by the full constructor. */
    private String id;

    /** Weibo user id. */
    private String userId;

    /** Verification (account certification) text. */
    private String attestationMassage;

    /** Screen name. */
    private String userName;

    /** Hot-search topic the user was collected for. */
    private String topic;

    /** Timestamp supplied by the caller. */
    private Date time;

    /** Number of followers. */
    private Long followerCount;

    /** Creates an empty instance; every field stays {@code null}. */
    public WeiBoUser() {
    }

    /**
     * Creates a user record whose primary key is the user id, the name of
     * {@code HotSearchType.微博热搜} and the topic joined by underscores.
     */
    public WeiBoUser(String userId, String attestationMassage, String userName, String topic,
                     Date time, Long followerCount) {
        this.userId = userId;
        this.userName = userName;
        this.attestationMassage = attestationMassage;
        this.followerCount = followerCount;
        this.topic = topic;
        this.time = time;
        // String.join renders a null element as "null", exactly like string concatenation does.
        this.id = String.join("_", userId, HotSearchType.微博热搜.name(), topic);
    }
}
src/main/java/com/zhiwei/searchhotcrawler/config/DBConfig.java
View file @
c495fcc6
...
...
@@ -19,6 +19,9 @@ public class DBConfig {
searchCacheCollName
=
conf
.
getProperty
(
"searchCacheCollName"
);
topicCollName
=
conf
.
getProperty
(
"topicCollName"
);
collWechatUserName
=
conf
.
getProperty
(
"collWechatUserName"
);
weiBoMassageCollName
=
conf
.
getProperty
(
"weiBoMassageCollName"
);
weiBoUserCollName
=
conf
.
getProperty
(
"weiBoUserCollName"
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
...
...
@@ -32,4 +35,6 @@ public class DBConfig {
public
static
String
searchCacheCollName
;
public
static
String
topicCollName
;
public
static
String
collWechatUserName
;
public
static
String
weiBoMassageCollName
;
public
static
String
weiBoUserCollName
;
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
c495fcc6
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.io.IOException
;
import
java.text.ParseException
;
import
java.text.SimpleDateFormat
;
import
java.util.*
;
import
java.util.stream.Collectors
;
import
com.alibaba.fastjson.JSON
;
import
com.zhiwei.searchhotcrawler.bean.
HotSearchCache
;
import
com.zhiwei.searchhotcrawler.bean.
*
;
import
com.zhiwei.searchhotcrawler.config.RedisConfig
;
import
com.zhiwei.searchhotcrawler.dao.RedisDao
;
import
com.zhiwei.searchhotcrawler.dao.WeiBoMassageDao
;
import
com.zhiwei.searchhotcrawler.dao.WeiBoUserDao
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
...
...
@@ -25,12 +29,12 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.mail.SendMailWeibo
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
org.springframework.beans.factory.annotation.Autowired
;
import
static
java
.
util
.
Objects
.
nonNull
;
/**
* @ClassName: WeiboHotSearch
* @Description: 微博实时热搜采集
...
...
@@ -169,6 +173,7 @@ public class WeiboHotSearchCrawler {
continue
;
}
// }
return
result
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博时热搜时出现解析错误,数据不是json结构"
,
e
);
...
...
@@ -242,6 +247,7 @@ public class WeiboHotSearchCrawler {
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONObject
(
"cardlistInfo"
);
List
<
JSONObject
>
cardsJsons
=
(
List
<
JSONObject
>)
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
get
(
"cards"
);
if
(
json
.
containsKey
(
"desc"
)){
String
topicLead
=
json
.
getString
(
"desc"
);
if
(!
""
.
equals
(
topicLead
))
{
...
...
@@ -266,12 +272,275 @@ public class WeiboHotSearchCrawler {
}
}
}
//调用weiBoMassageDao添加数据
WeiBoMassageDao
weiBoMassageDao
=
new
WeiBoMassageDao
();
//解析cards,获取热门微博、人物
for
(
JSONObject
jsonObject
:
cardsJsons
)
{
if
(
nonNull
(
jsonObject
)
&&
!
jsonObject
.
isEmpty
())
{
if
(
jsonObject
.
containsKey
(
"mblog"
))
{
if
(
jsonObject
.
getJSONObject
(
"mblog"
).
containsKey
(
"title"
))
{
WeiBoMassage
weiBoMassage
=
analysisWeiboMBlog
(
jsonObject
,
document
.
getString
(
"name"
));
if
(
Objects
.
nonNull
(
weiBoMassage
))
{
weiBoMassageDao
.
addWeiBoMassage
(
weiBoMassage
);
}
}
}
else
if
(
jsonObject
.
containsKey
(
"card_group"
))
{
JSONArray
cardGroup
=
jsonObject
.
getJSONArray
(
"card_group"
);
WeiBoMassage
weiBoMassage
=
analysisWeiboMassage
(
cardGroup
,
document
.
getString
(
"name"
));
if
(
Objects
.
nonNull
(
weiBoMassage
))
{
weiBoMassageDao
.
addWeiBoMassage
(
weiBoMassage
);
}
analysisWeiBoUsers
(
cardGroup
,
document
.
getString
(
"name"
));
}
}
else
{
log
.
info
(
"获取数据失败"
);
}
}
return
document
;
}
}
return
null
;
}
/**
 * Parses the first post of a card group that carries a title.
 *
 * <p>The group is scanned in order; the first card whose {@code mblog} object
 * contains a {@code title} key is handed to {@code analysisWeiboMBlog} and the
 * scan stops. {@code null} cards and {@code null} {@code mblog} values are
 * skipped instead of raising a {@code NullPointerException}.
 *
 * @param cardGroup cards of one {@code card_group} entry; may be {@code null}
 * @param topic     hot-search topic name passed through to the parsed post
 * @return the parsed post, or {@code null} when the group is {@code null} or
 *         holds no titled post
 */
public static WeiBoMassage analysisWeiboMassage(JSONArray cardGroup, String topic) {
    if (cardGroup == null) {
        return null;
    }
    for (int i = 0; i < cardGroup.size(); i++) {
        // Look each card up once instead of re-reading it for every check.
        JSONObject card = cardGroup.getJSONObject(i);
        if (card == null) {
            continue;
        }
        JSONObject mblog = card.getJSONObject("mblog");
        if (mblog != null && mblog.containsKey("title")) {
            return analysisWeiboMBlog(card, topic);
        }
    }
    return null;
}
/**
 * Extracts the users of a card group and stores each one through {@code WeiBoUserDao}.
 *
 * <p>Two card shapes are handled: cards with {@code card_type} 3 carry a
 * {@code users} array, cards with {@code card_type} 10 carry a single
 * {@code user} object. Cards without a {@code card_type} or with any other
 * value are ignored.
 *
 * @param cardGroup cards of one {@code card_group} entry; {@code null} is a no-op
 * @param topic     hot-search topic name stored with every user
 */
public static void analysisWeiBoUsers(JSONArray cardGroup, String topic) {
    if (cardGroup == null) {
        return;
    }
    WeiBoUserDao weiBoUserDao = new WeiBoUserDao();
    // One timestamp for the whole group so all users of a crawl share it.
    Date date = new Date();
    for (int i = 0; i < cardGroup.size(); i++) {
        JSONObject card = cardGroup.getJSONObject(i);
        if (card == null) {
            continue;
        }
        Integer cardType = card.getInteger("card_type");
        if (cardType == null) {
            continue;
        }
        if (cardType == 3) {
            JSONArray users = card.getJSONArray("users");
            if (users == null) {
                continue;
            }
            for (int j = 0; j < users.size(); j++) {
                saveWeiBoUser(weiBoUserDao, users.getJSONObject(j), topic, date);
            }
        } else if (cardType == 10) {
            if (card.containsKey("user")) {
                saveWeiBoUser(weiBoUserDao, card.getJSONObject("user"), topic, date);
            }
        }
    }
}

/** Builds a {@code WeiBoUser} from one user JSON object and hands it to the DAO. */
private static void saveWeiBoUser(WeiBoUserDao weiBoUserDao, JSONObject user, String topic, Date date) {
    if (user == null) {
        log.info("未采集到用户信息");
        return;
    }
    String userId = user.getString("id");
    String userName = user.getString("screen_name");
    String attestationMassage = user.getString("verified_reason");
    Long followerCount = parseFollowerCount(user.getString("followers_count"));
    weiBoUserDao.addWeiBoUser(
            new WeiBoUser(userId, attestationMassage, userName, topic, date, followerCount));
}

/**
 * Converts a follower count such as {@code "12345"}, {@code "123万"},
 * {@code "123.4万"} or {@code "1.2亿"} into a number.
 *
 * <p>The leading decimal number is read and multiplied by 10,000 when it is
 * followed by 万 or by 100,000,000 when it is followed by 亿; any other trailing
 * text is ignored.
 *
 * @return the count, or {@code null} when {@code raw} is {@code null} or does
 *         not start with a number
 */
private static Long parseFollowerCount(String raw) {
    if (raw == null) {
        return null;
    }
    String text = raw.trim();
    int end = 0;
    while (end < text.length()) {
        char c = text.charAt(end);
        if ((c < '0' || c > '9') && c != '.') {
            break;
        }
        end++;
    }
    if (end == 0) {
        return null;
    }
    long unit = 1L;
    if (end < text.length()) {
        char suffix = text.charAt(end);
        if (suffix == '万') {
            unit = 10_000L;
        } else if (suffix == '亿') {
            unit = 100_000_000L;
        }
    }
    try {
        // BigDecimal keeps "123.4万" exact (1234000); Long.valueOf would reject the decimal point.
        return new java.math.BigDecimal(text.substring(0, end))
                .multiply(java.math.BigDecimal.valueOf(unit))
                .longValue();
    } catch (NumberFormatException e) {
        log.warn("无法解析粉丝数:{}", raw);
        return null;
    }
}
/**
 * Parses one card that carries an {@code mblog} object into a {@link WeiBoMassage}.
 *
 * <p>Read from the card: {@code card_type}, {@code show_type}. Read from the
 * {@code mblog}: title text, like/comment/repost counts, {@code created_at},
 * {@code edit_at}, {@code mid}, user id and screen name, {@code source} and
 * {@code text} (HTML is reduced to plain text when the value contains
 * {@code '<'}). When the post has a {@code retweeted_status}, the original
 * post's fields are copied into the {@code root_*} properties, {@code forward}
 * is set to 1, and pictures / play count are taken from the original post
 * instead of the repost.
 *
 * <p>Missing JSON entries yield {@code null} properties rather than exceptions.
 *
 * @param jsonObject card JSON holding the {@code mblog}
 * @param topic      hot-search topic name stored on the result
 * @return the parsed post, or {@code null} when the card is {@code null} or
 *         has no {@code mblog}
 */
public static WeiBoMassage analysisWeiboMBlog(JSONObject jsonObject, String topic) {
    JSONObject mblog = jsonObject == null ? null : jsonObject.getJSONObject("mblog");
    if (mblog == null) {
        return null;
    }

    JSONObject title = mblog.getJSONObject("title");
    String type = title == null ? null : title.getString("text");
    Integer cardType = jsonObject.getInteger("card_type");
    Integer showType = jsonObject.getInteger("show_type");

    Long attitudeCount = parseWeiboCount(mblog.getString("attitudes_count"));
    Long commentCount = parseWeiboCount(mblog.getString("comments_count"));
    Long repostCount = parseWeiboCount(mblog.getString("reposts_count"));

    // A fresh formatter per call: SimpleDateFormat is not thread-safe.
    SimpleDateFormat weiboDateFormat =
            new SimpleDateFormat("EEE MMM dd HH:mm:ss z yyyy", java.util.Locale.US);
    Date createTime = parseWeiboDate(weiboDateFormat, mblog.getString("created_at"));
    Date editTime = parseWeiboDate(weiboDateFormat, mblog.getString("edit_at"));

    String mid = mblog.getString("mid");
    JSONObject user = mblog.getJSONObject("user");
    String userId = user == null ? null : user.getString("id");
    String userName = user == null ? null : user.getString("screen_name");
    String source = mblog.getString("source");

    // Only markup-bearing text goes through Jsoup; plain text is stored untouched.
    String content = mblog.getString("text");
    if (content != null && content.contains("<")) {
        content = Jsoup.parse(content).text();
    }

    WeiBoMassage weiBoMassage = new WeiBoMassage(userId, content, userName, mid, createTime,
            editTime, cardType, showType, repostCount, commentCount, attitudeCount, source,
            type, topic);
    // 0 = original post, 1 = repost.
    weiBoMassage.setForward(0);

    // Pictures and play count come from the original post when this one is a repost.
    JSONObject mediaSource = mblog;
    JSONObject retweeted = mblog.getJSONObject("retweeted_status");
    if (retweeted != null) {
        String rootHtml = retweeted.getString("text");
        // The original post's user object may be absent; tolerate that.
        JSONObject rootUser = retweeted.getJSONObject("user");

        weiBoMassage.setRoot_mid(retweeted.getString("mid"));
        weiBoMassage.setRoot_id(rootUser == null ? null : rootUser.getString("id"));
        weiBoMassage.setRoot_source(retweeted.getString("source"));
        weiBoMassage.setRoot_text(rootHtml == null ? null : Jsoup.parse(rootHtml).text());
        weiBoMassage.setRoot_name(rootUser == null ? null : rootUser.getString("screen_name"));
        weiBoMassage.setForward(1);
        mediaSource = retweeted;
    }

    List<String> pictureUrlList = new ArrayList<>();
    Long playCount = null;
    JSONArray picIds = mediaSource.getJSONArray("pic_ids");
    if (picIds != null && !picIds.isEmpty()) {
        JSONArray pics = mediaSource.getJSONArray("pics");
        if (pics != null) {
            for (int i = 0; i < pics.size(); i++) {
                JSONObject pic = pics.getJSONObject(i);
                String picUrl = pic == null ? null : pic.getString("url");
                if (picUrl != null) {
                    pictureUrlList.add(picUrl);
                }
            }
        }
    } else {
        JSONObject pageInfo = mediaSource.getJSONObject("page_info");
        if (pageInfo != null) {
            playCount = parseWeiboCount(pageInfo.getString("play_count"));
        }
    }

    weiBoMassage.setPlayCount(playCount);
    weiBoMassage.setPictureUrlList(pictureUrlList);
    return weiBoMassage;
}

/**
 * Converts a Weibo counter such as {@code "1234"}, {@code "100万+"},
 * {@code "123.4万次播放"}, {@code "5678次播放"} or {@code "1.2亿"} into a number.
 *
 * <p>The leading decimal number is read and multiplied by 10,000 when it is
 * followed by 万 or by 100,000,000 when it is followed by 亿; any other trailing
 * text is ignored.
 *
 * @return the count, or {@code null} when {@code raw} is {@code null} or does
 *         not start with a number
 */
private static Long parseWeiboCount(String raw) {
    if (raw == null) {
        return null;
    }
    String text = raw.trim();
    int end = 0;
    while (end < text.length()) {
        char c = text.charAt(end);
        if ((c < '0' || c > '9') && c != '.') {
            break;
        }
        end++;
    }
    if (end == 0) {
        return null;
    }
    long unit = 1L;
    if (end < text.length()) {
        char suffix = text.charAt(end);
        if (suffix == '万') {
            unit = 10_000L;
        } else if (suffix == '亿') {
            unit = 100_000_000L;
        }
    }
    try {
        // BigDecimal keeps decimal values such as "123.4万" exact; Long.valueOf would reject them.
        return new java.math.BigDecimal(text.substring(0, end))
                .multiply(java.math.BigDecimal.valueOf(unit))
                .longValue();
    } catch (NumberFormatException e) {
        log.warn("无法解析微博计数:{}", raw);
        return null;
    }
}

/**
 * Parses one Weibo timestamp; returns {@code null} for a {@code null} input or
 * when the text does not match the formatter (the failure is logged).
 */
private static Date parseWeiboDate(SimpleDateFormat format, String value) {
    if (value == null) {
        return null;
    }
    try {
        return format.parse(value);
    } catch (ParseException e) {
        log.error("创建时间和编辑时间解析异常", e);
        return null;
    }
}
// /**
// * 微博更新历史数据
// * @param hotSearch
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiBoMassageDao.java
0 → 100644
View file @
c495fcc6
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
com.mongodb.client.MongoCollection
;
import
com.mongodb.client.MongoDatabase
;
import
com.zhiwei.searchhotcrawler.bean.WeiBoMassage
;
import
com.zhiwei.searchhotcrawler.config.DBConfig
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Objects
;
/**
*微博信息入库
*/
@Log4j2
public
class
WeiBoMassageDao
{
public
static
MongoDatabase
mongoDatabase
=
MongoDBTemplate
.
getDB
(
DBConfig
.
dbName
);
public
static
MongoCollection
mongoCollection
;
public
WeiBoMassageDao
()
{
String
collName
=
DBConfig
.
weiBoMassageCollName
;
mongoCollection
=
mongoDatabase
.
getCollection
(
collName
);
//给数据表创建索引
MongoDBTemplate
.
createIndex
(
DBConfig
.
dbName
,
collName
);
}
/**
* 添加数据入库
* @param weiBoMassage
*/
public
void
addWeiBoMassage
(
WeiBoMassage
weiBoMassage
){
log
.
info
(
"weiBoMassage对象开始转document对象"
);
Document
document
=
new
Document
();
document
.
put
(
"_id"
,
weiBoMassage
.
getId
());
document
.
put
(
"userId"
,
weiBoMassage
.
getUserId
());
document
.
put
(
"text"
,
weiBoMassage
.
getText
());
document
.
put
(
"userName"
,
weiBoMassage
.
getUserName
());
document
.
put
(
"mid"
,
weiBoMassage
.
getMid
());
document
.
put
(
"creatTime"
,
weiBoMassage
.
getCreatTime
());
if
(
Objects
.
nonNull
(
weiBoMassage
.
getEditTime
())){
document
.
put
(
"editTime"
,
weiBoMassage
.
getEditTime
());
}
document
.
put
(
"cardType"
,
weiBoMassage
.
getCardType
());
document
.
put
(
"showType"
,
weiBoMassage
.
getShowType
());
document
.
put
(
"repostCount"
,
weiBoMassage
.
getRepostCount
());
document
.
put
(
"commentCount"
,
weiBoMassage
.
getCommentCount
());
document
.
put
(
"attitudeCount"
,
weiBoMassage
.
getAttitudeCount
());
if
(
Objects
.
nonNull
(
weiBoMassage
.
getPlayCount
())){
document
.
put
(
"playCount"
,
weiBoMassage
.
getPlayCount
());
}
if
(
weiBoMassage
.
getPictureUrlList
().
size
()!=
0
){
document
.
put
(
"pictureUrlList"
,
weiBoMassage
.
getPictureUrlList
());
}
document
.
put
(
"source"
,
weiBoMassage
.
getSource
());
document
.
put
(
"type"
,
weiBoMassage
.
getType
());
document
.
put
(
"topic"
,
weiBoMassage
.
getTopic
());
document
.
put
(
"forward"
,
weiBoMassage
.
getForward
());
if
(
0
!=
weiBoMassage
.
getForward
()){
document
.
put
(
"root_mid"
,
weiBoMassage
.
getRoot_mid
());
document
.
put
(
"root_id"
,
weiBoMassage
.
getRoot_id
());
document
.
put
(
"root_name"
,
weiBoMassage
.
getRoot_name
());
document
.
put
(
"root_text"
,
weiBoMassage
.
getRoot_text
());
document
.
put
(
"root_source"
,
weiBoMassage
.
getRoot_source
());
}
log
.
info
(
"weiBoMassage对象转document对象完成"
);
try
{
mongoCollection
.
insertOne
(
document
);
log
.
info
(
"数据插入成功"
);
}
catch
(
Exception
e
)
{
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiBoUserDao.java
0 → 100644
View file @
c495fcc6
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
com.mongodb.client.MongoCollection
;
import
com.mongodb.client.MongoDatabase
;
import
com.zhiwei.searchhotcrawler.bean.WeiBoMassage
;
import
com.zhiwei.searchhotcrawler.bean.WeiBoUser
;
import
com.zhiwei.searchhotcrawler.config.DBConfig
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Objects
;
@Log4j2
public
class
WeiBoUserDao
{
public
static
MongoDatabase
mongoDatabase
=
MongoDBTemplate
.
getDB
(
DBConfig
.
dbName
);
public
static
MongoCollection
mongoCollection
;
public
WeiBoUserDao
()
{
String
collName
=
DBConfig
.
weiBoUserCollName
;
mongoCollection
=
mongoDatabase
.
getCollection
(
collName
);
//给数据表创建索引
MongoDBTemplate
.
createIndex
(
DBConfig
.
dbName
,
collName
);
}
/**
* 添加数据入库
* @param weiBoUser
*/
public
void
addWeiBoUser
(
WeiBoUser
weiBoUser
){
log
.
info
(
"WeiBoUser对象开始转document对象"
);
Document
document
=
new
Document
();
document
.
put
(
"_id"
,
weiBoUser
.
getId
());
document
.
put
(
"userId"
,
weiBoUser
.
getUserId
());
if
(
Objects
.
nonNull
(
weiBoUser
.
getAttestationMassage
())){
document
.
put
(
"attestationMassage"
,
weiBoUser
.
getAttestationMassage
());
}
document
.
put
(
"userName"
,
weiBoUser
.
getUserName
());
document
.
put
(
"topic"
,
weiBoUser
.
getTopic
());
document
.
put
(
"time"
,
weiBoUser
.
getTime
());
document
.
put
(
"followerCount"
,
weiBoUser
.
getFollowerCount
());
log
.
info
(
"WeiBoUser对象转document对象完成"
);
try
{
mongoCollection
.
insertOne
(
document
);
log
.
info
(
"数据插入成功"
);
}
catch
(
Exception
e
)
{
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
}
}
src/test/java/weiboTest/WeiboHotSearchTest.java
View file @
c495fcc6
...
...
@@ -4,12 +4,19 @@ package weiboTest;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.config.RedisConfig
;
import
com.zhiwei.searchhotcrawler.bean.WeiBoMassage
;
import
com.zhiwei.searchhotcrawler.bean.WeiBoUser
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.dao.WeiBoMassageDao
;
import
com.zhiwei.searchhotcrawler.dao.WeiBoUserDao
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
...
...
@@ -20,9 +27,14 @@ import org.junit.Test;
import
org.junit.runner.RunWith
;
import
org.springframework.test.context.ContextConfiguration
;
import
org.springframework.test.context.junit4.SpringJUnit4ClassRunner
;
import
java.io.IOException
;
import
java.text.ParseException
;
import
java.text.SimpleDateFormat
;
import
java.util.*
;
import
java.util.concurrent.TimeUnit
;
import
static
java
.
util
.
Objects
.
nonNull
;
/**
* @author cwt
...
...
@@ -31,69 +43,161 @@ import java.util.*;
@Log4j2
@RunWith
(
SpringJUnit4ClassRunner
.
class
)
@ContextConfiguration
(
locations
=
{
"classpath:applicationContext.xml"
})
public
class
WeiboHotSearchTest
{
{
"classpath:applicationContext.xml"
})
public
class
WeiboHotSearchTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
@Test
public
void
test
(){
Document
document
=
Jsoup
.
parse
(
"a href=\"https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23&extparam=%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23&luicode=10000011&lfid=100103type%3D1%26t%3D10%26q%3D%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23\" data-hide=\"\"><span class=\"surl-text\">#邓伦讲戏专业#</span></a><a href=\"https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E6%9E%81%E9%99%90%E6%8C%91%E6%88%98%23&luicode=10000011&lfid=100103type%3D1%26t%3D10%26q%3D%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23\" data-hide=\"\"><span class=\"surl-text\">#极限挑战#</span></a> <a href='/n/邓伦'>@邓伦</a> 和<a href='/n/景甜'>@景甜</a> 改编《甄嬛传》剧本,伦伦认真讲戏的样子让人瞬间穿越到拍摄现场。看来戏瘾上身的邓伦还过了一把导演的瘾,这专业的模样要不要考虑跨界当当导演呀~<span class=\"url-icon\"><img alt=[哈哈] src=\"https://h5.sinaimg.cn/m/emoticon/icon/default/d_haha-0ec05e6dad.png\" style=\"width:1em; height:1em;\" /></span><a data-url=\"http://t.cn/A6VJPN9w\" href=\"https://video.weibo.com/show?fid=1034:4640837901156490\" data-hide=\"\"><span class='url-icon'><img style='width: 1rem;height: 1rem' src='https://h5.sinaimg.cn/upload/2015/09/25/3/timeline_card_small_video_default.png'></span><span class=\"surl-text\">东方卫视极限挑战的微博视频</span></a>"
);
public
void
test
()
{
Document
document
=
Jsoup
.
parse
(
"<a href=\\\"https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E5%91%A8%E6%9F%AF%E5%AE%87%E7%88%B8%E7%88%B8%23&extparam=%23%E5%91%A8%E6%9F%AF%E5%AE%87%E7%88%B8%E7%88%B8%23&luicode=10000011&lfid=231522type%3D1%26q%3D%23%E5%91%A8%E6%9F%AF%E5%AE%87%E7%88%B8%E7%88%B8%23\\\" data-hide=\\\"\\\"><span class=\\\"surl-text\\\">#周柯宇爸爸#</span></a> \uD83E\uDDD0<a href=\\\"https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E5%91%A8%E6%9F%AF%E5%AE%87%E4%BC%A0%E9%94%80%E4%B9%8B%E5%AD%90%23&extparam=%23%E5%91%A8%E6%9F%AF%E5%AE%87%E4%BC%A0%E9%94%80%E4%B9%8B%E5%AD%90%23&luicode=10000011&lfid=231522type%3D1%26q%3D%23%E5%91%A8%E6%9F%AF%E5%AE%87%E7%88%B8%E7%88%B8%23\\\" data-hide=\\\"\\\"><span class=\\\"surl-text\\\">#周柯宇传销之子#</span></a> <br />周柯宇粉丝今天懂法了吗?没有我一会再来普法。周柯宇粉丝为传销洗地,周柯宇偶像失格,周柯宇粉丝素质低下,道德沦丧 \"\n"
);
System
.
out
.
println
(
document
.
text
());
}
/** Prints whether the sample fragment {@code "<a href"} begins with {@code "<"}. */
@Test
public void test1() {
    final String fragment = "<a href";
    final boolean startsWithTag = fragment.startsWith("<");
    System.out.println(startsWithTag);
}
@Test
public
void
testHotWeibo
(){
public
void
testHotWeibo
()
{
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
Date
date
=
new
Date
();
while
(
true
)
{
try
{
Date
date
=
new
Date
();
List
<
HotSearchList
>
hotSearchLists
=
weiboHotSearchByPhone
(
date
);
for
(
HotSearchList
hotSearchList
:
hotSearchLists
)
{
try
{
org
.
bson
.
Document
document
=
new
org
.
bson
.
Document
();
//System.out.println(hotSearchList);
document
.
put
(
"url"
,
hotSearchList
.
getUrl
());
document
.
put
(
"name"
,
hotSearchList
.
getName
());
test12
(
document
);
}
catch
(
Exception
e
)
{
log
.
info
(
"数据解析异常"
,
e
);
}
}
log
.
info
(
"本轮微博话题采集解析完毕"
);
log
.
info
(
hotSearchLists
.
size
());
TimeUnit
.
MINUTES
.
sleep
(
1
);
}
catch
(
Exception
e
)
{
log
.
info
(
"微博热搜采集异常"
,
e
);
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
}
ZhiWeiTools
.
sleep
(
50
);
}
//
// Date date = new Date();
// List<HotSearchList> hotSearchLists = weiboHotSearchByPhone(date);
// for (HotSearchList hotSearchList : hotSearchLists) {
// System.out.println(hotSearchList);
// }
}
/**
 * Fetches the m.weibo.cn container page of one hot-search topic, copies the
 * topic summary into {@code document} and stores the posts and users found in
 * the page's cards.
 *
 * <p>Not a JUnit test itself (it takes a parameter); it is invoked with a
 * document holding the topic's {@code name} and {@code url}. The request is
 * attempted up to three times and processing stops after the first response
 * that contains a {@code data} object.
 *
 * <p>NOTE(review): the proxy factory is initialised on every call — confirm
 * that repeated {@code ProxyFactory.init} calls are harmless.
 *
 * @param document topic document; gains {@code topicLead}, {@code readCount},
 *                 {@code discussCount}, {@code pictureUrl} and {@code downtext}
 *                 when the page provides them
 */
public void test12(org.bson.Document document) {
    SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
            .group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
    ProxyFactory.init(simpleConfig);

    String topicName = document.getString("name");
    log.info("更新微博热搜{}导语阅读量和讨论量", topicName);

    String topicUrl = document.getString("url");
    if (topicUrl == null) {
        log.info("获取数据失败");
        return;
    }
    // Keep only the first query parameter; tolerate URLs that have no '&' at all.
    int queryStart = topicUrl.indexOf("?") + 1;
    int queryEnd = topicUrl.indexOf("&", queryStart);
    if (queryEnd < 0) {
        queryEnd = topicUrl.length();
    }
    String url = "https://m.weibo.cn/api/container/getIndex?" + topicUrl.substring(queryStart, queryEnd);

    Request request = RequestUtils.wrapGet(url);
    for (int count = 0; count <= 2; count++) {
        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博热搜详情页面时出现连接失败", e);
        }
        if (htmlBody == null || !htmlBody.contains("data")) {
            continue;
        }
        JSONObject dataJson = JSONObject.parseObject(htmlBody).getJSONObject("data");
        if (dataJson == null) {
            // The body merely mentioned "data"; try again.
            continue;
        }
        JSONObject cardlistInfoJson = dataJson.getJSONObject("cardlistInfo");
        if (cardlistInfoJson != null) {
            fillTopicSummary(document, cardlistInfoJson);
        }
        JSONArray cards = dataJson.getJSONArray("cards");
        if (cards != null) {
            storeHotCards(cards, topicName);
        }
        break;
    }
}

/** Copies lead text, read/discussion counts, picture URL and host text from {@code cardlistInfo}. */
private void fillTopicSummary(org.bson.Document document, JSONObject cardlistInfoJson) {
    String topicLead = cardlistInfoJson.getString("desc");
    if (topicLead != null && !topicLead.isEmpty()) {
        document.put("topicLead", topicLead);
    }

    JSONArray headCards = cardlistInfoJson.getJSONArray("cardlist_head_cards");
    if (headCards == null || headCards.isEmpty()) {
        return;
    }
    JSONObject readJson = headCards.getJSONObject(0);
    JSONObject headData = readJson == null ? null : readJson.getJSONObject("head_data");
    if (headData == null) {
        return;
    }

    String midText = headData.getString("midtext");
    if (midText != null) {
        // midtext has the shape "阅读<count> 讨论<count> 详情..."; cut each count out of it.
        String read = midText.replaceAll("阅读", "").replaceAll("讨论.*", "").trim();
        String discussCount = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim();
        document.put("readCount", TipsUtils.getHotCount(read));
        document.put("discussCount", TipsUtils.getHotCount(discussCount));
    }
    document.put("pictureUrl", headData.getString("portrait_url"));

    String downtext = headData.getString("downtext");
    if (downtext != null && !downtext.isEmpty()) {
        document.put("downtext", downtext.replaceAll("主持人:", ""));
    }
}

/** Parses every card of the page and stores the posts and users it yields. */
private void storeHotCards(JSONArray cards, String topicName) {
    WeiBoMassageDao weiBoMassageDao = new WeiBoMassageDao();
    for (int i = 0; i < cards.size(); i++) {
        JSONObject card = cards.getJSONObject(i);
        if (card == null || card.isEmpty()) {
            log.info("获取数据失败");
            continue;
        }
        if (card.containsKey("mblog")) {
            JSONObject mblog = card.getJSONObject("mblog");
            if (mblog != null && mblog.containsKey("title")) {
                WeiBoMassage weiBoMassage = analysisWeiboMBlog(card, topicName);
                if (Objects.nonNull(weiBoMassage)) {
                    weiBoMassageDao.addWeiBoMassage(weiBoMassage);
                }
            }
        } else if (card.containsKey("card_group")) {
            JSONArray cardGroup = card.getJSONArray("card_group");
            WeiBoMassage weiBoMassage = analysisWeiboMassage(cardGroup, topicName);
            if (Objects.nonNull(weiBoMassage)) {
                weiBoMassageDao.addWeiBoMassage(weiBoMassage);
            }
            analysisWeiBoUsers(cardGroup, topicName);
        }
    }
}
/**
* 微博热搜数据更新导语,阅读量,讨论量
*
* @param document
* @return
*/
public
static
org
.
bson
.
Document
weiboUpdate
(
org
.
bson
.
Document
document
)
{
log
.
info
(
"更新微博热搜{}导语阅读量和讨论量"
,
document
.
getString
(
"name"
));
String
url
=
"https://m.weibo.cn/api/container/getIndex?"
+
document
.
getString
(
"url"
).
substring
(
document
.
getString
(
"url"
).
indexOf
(
"?"
)
+
1
,
document
.
getString
(
"url"
).
indexOf
(
"&"
));
log
.
info
(
"更新微博热搜{}导语阅读量和讨论量"
,
document
.
getString
(
"name"
));
String
url
=
"https://m.weibo.cn/api/container/getIndex?"
+
document
.
getString
(
"url"
).
substring
(
document
.
getString
(
"url"
).
indexOf
(
"?"
)
+
1
,
document
.
getString
(
"url"
).
indexOf
(
"&"
));
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
for
(
int
count
=
0
;
count
<=
2
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
...
...
@@ -104,13 +208,13 @@ public class WeiboHotSearchTest{
JSONObject
cardlistInfoJson
=
dataJson
.
getJSONObject
(
"cardlistInfo"
);
List
<
JSONObject
>
cardsJsons
=
(
List
<
JSONObject
>)
dataJson
.
get
(
"cards"
);
//解析cardlistInfo,讨论、导语、阅读
if
(
cardlistInfoJson
.
containsKey
(
"desc"
))
{
if
(
cardlistInfoJson
.
containsKey
(
"desc"
))
{
String
topicLead
=
cardlistInfoJson
.
getString
(
"desc"
);
if
(!
""
.
equals
(
topicLead
))
{
if
(!
""
.
equals
(
topicLead
))
{
document
.
put
(
"topicLead"
,
topicLead
);
}
}
if
(
cardlistInfoJson
.
containsKey
(
"cardlist_head_cards"
))
{
if
(
cardlistInfoJson
.
containsKey
(
"cardlist_head_cards"
))
{
JSONObject
readJson
=
cardlistInfoJson
.
getJSONArray
(
"cardlist_head_cards"
).
getJSONObject
(
0
);
if
(
readJson
.
containsKey
(
"head_data"
))
{
String
midText
=
readJson
.
getJSONObject
(
"head_data"
).
getString
(
"midtext"
);
...
...
@@ -119,63 +223,304 @@ public class WeiboHotSearchTest{
String
pictureUrl
=
readJson
.
getJSONObject
(
"head_data"
).
getString
(
"portrait_url"
);
document
.
put
(
"readCount"
,
TipsUtils
.
getHotCount
(
read
));
document
.
put
(
"discussCount"
,
TipsUtils
.
getHotCount
(
discussCount
));
document
.
put
(
"pictureUrl"
,
pictureUrl
);
if
(
readJson
.
getJSONObject
(
"head_data"
).
containsKey
(
"downtext"
)){
document
.
put
(
"pictureUrl"
,
pictureUrl
);
if
(
readJson
.
getJSONObject
(
"head_data"
).
containsKey
(
"downtext"
))
{
String
downtext
=
readJson
.
getJSONObject
(
"head_data"
).
getString
(
"downtext"
);
if
(!
""
.
equals
(
downtext
))
{
document
.
put
(
"downtext"
,
downtext
.
replaceAll
(
"主持人:"
,
""
));
if
(!
""
.
equals
(
downtext
))
{
document
.
put
(
"downtext"
,
downtext
.
replaceAll
(
"主持人:"
,
""
));
}
}
}
}
//调用weiBoMassageDao添加数据
WeiBoMassageDao
weiBoMassageDao
=
new
WeiBoMassageDao
();
//解析cards,获取热门微博、人物
for
(
JSONObject
jsonObject
:
cardsJsons
)
{
if
(
nonNull
(
jsonObject
)
&&
!
jsonObject
.
isEmpty
())
{
if
(
jsonObject
.
containsKey
(
"mblog"
))
{
if
(
jsonObject
.
getJSONObject
(
"mblog"
).
containsKey
(
"title"
))
{
WeiBoMassage
weiBoMassage
=
analysisWeiboMBlog
(
jsonObject
,
document
.
getString
(
"name"
));
if
(
Objects
.
nonNull
(
weiBoMassage
))
{
weiBoMassageDao
.
addWeiBoMassage
(
weiBoMassage
);
}
}
}
else
if
(
jsonObject
.
containsKey
(
"card_group"
))
{
JSONArray
cardGroup
=
jsonObject
.
getJSONArray
(
"card_group"
);
WeiBoMassage
weiBoMassage
=
analysisWeiboMassage
(
cardGroup
,
document
.
getString
(
"name"
));
if
(
Objects
.
nonNull
(
weiBoMassage
))
{
weiBoMassageDao
.
addWeiBoMassage
(
weiBoMassage
);
}
analysisWeiBoUsers
(
cardGroup
,
document
.
getString
(
"name"
));
}
}
else
{
log
.
info
(
"获取数据失败"
);
}
}
return
document
;
}
}
return
null
;
}
/**
 * Scans a card group for the first post that carries a title and converts it
 * into a {@link WeiBoMassage}.
 *
 * @param cardGroup the {@code card_group} array to scan
 * @param topic     topic name handed through to {@code analysisWeiboMBlog}
 * @return the message built from the first entry whose {@code mblog} contains a
 *         {@code title}, or {@code null} when no entry qualifies
 */
public static WeiBoMassage analysisWeiboMassage(JSONArray cardGroup, String topic) {
    int index = 0;
    while (index < cardGroup.size()) {
        JSONObject entry = cardGroup.getJSONObject(index);
        index++;
        // Entries without an mblog are not posts; skip them.
        if (!entry.containsKey("mblog")) {
            continue;
        }
        // Only posts that carry a title are converted.
        if (!entry.getJSONObject("mblog").containsKey("title")) {
            continue;
        }
        // The first qualifying entry wins; later entries are never inspected.
        return analysisWeiboMBlog(entry, topic);
    }
    return null;
}
/**
 * Extracts the Weibo users contained in a card group and stores each of them.
 *
 * <p>Two card layouts are handled: cards whose {@code card_type} is 3 carry a
 * {@code users} array, and cards whose {@code card_type} is 10 carry a single
 * {@code user} object. Every user found is written through
 * {@link WeiBoUserDao#addWeiBoUser}. Cards of any other type, and cards that
 * lack the expected user data, are skipped.
 *
 * @param cardGroup the {@code card_group} array of a card; {@code null} is treated as empty
 * @param topic     the topic name stored with every user
 */
public static void analysisWeiBoUsers(JSONArray cardGroup, String topic) {
    if (cardGroup == null) {
        return;
    }
    WeiBoUserDao weiBoUserDao = new WeiBoUserDao();
    // A single Date instance is shared by every WeiBoUser created in this call.
    Date date = new Date();
    for (int i = 0; i < cardGroup.size(); i++) {
        JSONObject card = cardGroup.getJSONObject(i);
        if (card == null) {
            continue;
        }
        // Compare as text so a missing card_type is skipped instead of throwing.
        String cardType = card.getString("card_type");
        if ("3".equals(cardType)) {
            JSONArray users = card.getJSONArray("users");
            if (users == null) {
                continue;
            }
            for (int j = 0; j < users.size(); j++) {
                storeWeiBoUser(weiBoUserDao, users.getJSONObject(j), topic, date);
            }
        } else if ("10".equals(cardType)) {
            storeWeiBoUser(weiBoUserDao, card.getJSONObject("user"), topic, date);
        }
    }
}

/**
 * Converts one user JSON object into a {@link WeiBoUser} and stores it.
 *
 * @param weiBoUserDao DAO used to store the user
 * @param user         the user JSON; {@code null} is logged and ignored
 * @param topic        the topic name stored with the user
 * @param date         the date stored with the user
 */
private static void storeWeiBoUser(WeiBoUserDao weiBoUserDao, JSONObject user, String topic, Date date) {
    if (user == null) {
        log.info("未采集到用户信息");
        return;
    }
    String userId = user.getString("id");
    String userName = user.getString("screen_name");
    String attestationMassage = user.getString("verified_reason");
    Long followerCount = parseFollowerCount(user.getString("followers_count"));
    weiBoUserDao.addWeiBoUser(
            new WeiBoUser(userId, attestationMassage, userName, topic, date, followerCount));
}

/**
 * Parses a follower count that may be a plain number ("12345") or use a Chinese
 * unit suffix ("1.5万" = 15000, "1.2亿" = 120000000).
 *
 * @param raw the raw count text; may be {@code null}
 * @return the count, or {@code null} when the text is missing or not numeric
 */
private static Long parseFollowerCount(String raw) {
    if (raw == null) {
        return null;
    }
    String text = raw.trim();
    long multiplier = 1L;
    int cut = text.indexOf('万');
    if (cut >= 0) {
        multiplier = 10000L;
    } else {
        cut = text.indexOf('亿');
        if (cut >= 0) {
            multiplier = 100000000L;
        }
    }
    // Keep only the numeric part in front of the unit, as the original split did.
    String number = (cut >= 0 ? text.substring(0, cut) : text).trim();
    if (number.isEmpty()) {
        return null;
    }
    try {
        // BigDecimal keeps values such as "1.5" exact before scaling by the unit.
        return new java.math.BigDecimal(number)
                .multiply(java.math.BigDecimal.valueOf(multiplier))
                .longValue();
    } catch (NumberFormatException e) {
        log.error("Unparseable followers_count: " + raw, e);
        return null;
    }
}
public
JSONObject
analysisWeiboSon
(
JSONObject
readJson
){
/**
 * Builds a {@link WeiBoMassage} from a card that carries an {@code mblog} object.
 *
 * <p>The post's author, text, counters, timestamps and source are read from
 * {@code mblog}; {@code card_type} and {@code show_type} are read from the card
 * itself. When {@code mblog} contains {@code retweeted_status}, the original
 * post's mid, source, text and author are copied into the {@code root_*}
 * properties and {@code forward} is set to 1; otherwise {@code forward} is 0.
 * Picture URLs and the play count are taken from the retweeted post when there
 * is one, and from the post itself otherwise.
 *
 * @param jsonObject the card JSON that holds {@code mblog}
 * @param topic      the topic name stored with the message
 * @return the populated message, or {@code null} when the card has no {@code mblog}
 */
public static WeiBoMassage analysisWeiboMBlog(JSONObject jsonObject, String topic) {
    JSONObject mblog = jsonObject == null ? null : jsonObject.getJSONObject("mblog");
    if (mblog == null) {
        return null;
    }

    JSONObject title = mblog.getJSONObject("title");
    String type = title == null ? null : title.getString("text");

    String cardTypeText = jsonObject.getString("card_type");
    Integer cardType = cardTypeText == null ? null : Integer.valueOf(cardTypeText);
    String showTypeText = jsonObject.getString("show_type");
    Integer showType = showTypeText == null ? null : Integer.valueOf(showTypeText);

    // Likes, comments and reposts share the same "12345" / "1.2万" text format.
    Long attitudeCount = parseWeiboCount(mblog.getString("attitudes_count"));
    Long commentCount = parseWeiboCount(mblog.getString("comments_count"));
    Long repostCount = parseWeiboCount(mblog.getString("reposts_count"));

    Date createTime = null;
    Date editTime = null;
    try {
        SimpleDateFormat simpleDateFormat =
                new SimpleDateFormat("EEE MMM dd HH:mm:ss z yyyy", java.util.Locale.US);
        String createdAt = mblog.getString("created_at");
        if (createdAt != null) {
            createTime = simpleDateFormat.parse(createdAt);
        }
        String editAt = mblog.getString("edit_at");
        if (editAt != null) {
            editTime = simpleDateFormat.parse(editAt);
        }
    } catch (ParseException e) {
        // Best effort: the message is still built, with whichever dates were parsed.
        log.error("创建时间和编辑时间解析异常", e);
    }

    String mid = mblog.getString("mid");
    JSONObject author = mblog.getJSONObject("user");
    String userId = author == null ? null : author.getString("id");
    String userName = author == null ? null : author.getString("screen_name");
    String source = mblog.getString("source");

    // Only run the text through Jsoup when it actually contains markup.
    String rawText = mblog.getString("text");
    String content = (rawText != null && rawText.contains("<")) ? Jsoup.parse(rawText).text() : rawText;

    WeiBoMassage weiBoMassage = new WeiBoMassage(userId, content, userName, mid, createTime, editTime,
            cardType, showType, repostCount, commentCount, attitudeCount, source, type, topic);
    // Not a repost unless retweeted_status says otherwise.
    weiBoMassage.setForward(0);

    // For a repost, media is read from the original post; otherwise from the post itself.
    JSONObject weiboJson = mblog;
    JSONObject retweeted = mblog.getJSONObject("retweeted_status");
    if (retweeted != null) {
        weiboJson = retweeted;
        String rootText = retweeted.getString("text");
        JSONObject rootUser = retweeted.getJSONObject("user");
        weiBoMassage.setRoot_mid(retweeted.getString("mid"));
        weiBoMassage.setRoot_id(rootUser == null ? null : rootUser.getString("id"));
        weiBoMassage.setRoot_source(retweeted.getString("source"));
        weiBoMassage.setRoot_text(rootText == null ? null : Jsoup.parse(rootText).text());
        weiBoMassage.setRoot_name(rootUser == null ? null : rootUser.getString("screen_name"));
        weiBoMassage.setForward(1);
    }

    List<String> pictureUrlList = new ArrayList<>();
    Long playCount = null;
    JSONArray picIds = weiboJson.getJSONArray("pic_ids");
    if (picIds != null && picIds.size() > 0) {
        JSONArray pics = weiboJson.getJSONArray("pics");
        if (pics != null) {
            for (int i = 0; i < pics.size(); i++) {
                JSONObject pic = pics.getJSONObject(i);
                String picUrl = pic == null ? null : pic.getString("url");
                if (picUrl != null) {
                    pictureUrlList.add(picUrl);
                }
            }
        }
    } else {
        // The play count is only looked up for posts without pictures.
        JSONObject pageInfo = weiboJson.getJSONObject("page_info");
        if (pageInfo != null) {
            playCount = parseWeiboCount(pageInfo.getString("play_count"));
        }
    }
    weiBoMassage.setPlayCount(playCount);
    weiBoMassage.setPictureUrlList(pictureUrlList);
    return weiBoMassage;
}

/**
 * Parses a Weibo counter that may be a plain number ("12345"), use a Chinese unit
 * ("1.2万" = 12000, "1.2亿" = 120000000), or carry a trailing "次" ("123次").
 *
 * @param raw the raw counter text; may be {@code null}
 * @return the counter value, or {@code null} when the text is missing or not numeric
 */
private static Long parseWeiboCount(String raw) {
    if (raw == null) {
        return null;
    }
    String text = raw.trim();
    long multiplier = 1L;
    int cut = text.indexOf('万');
    if (cut >= 0) {
        multiplier = 10000L;
    } else {
        cut = text.indexOf('亿');
        if (cut >= 0) {
            multiplier = 100000000L;
        } else {
            // "次" is a plain suffix with no scaling.
            cut = text.indexOf('次');
        }
    }
    String number = (cut >= 0 ? text.substring(0, cut) : text).trim();
    if (number.isEmpty()) {
        return null;
    }
    try {
        // BigDecimal keeps values such as "1.2" exact before scaling by the unit.
        return new java.math.BigDecimal(number)
                .multiply(java.math.BigDecimal.valueOf(multiplier))
                .longValue();
    } catch (NumberFormatException e) {
        log.error("Unparseable Weibo count: " + raw, e);
        return null;
    }
}
/**
 * Collects the Weibo hot-search list from the mobile (m.weibo.cn) endpoint.
 *
 * @author hero
 * @param date date supplied by the caller (TODO: document how it is applied)
 * @return the collected hot-search entries
 */
public
static
List
<
HotSearchList
>
weiboHotSearchByPhone
(
Date
date
){
public
static
List
<
HotSearchList
>
weiboHotSearchByPhone
(
Date
date
)
{
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583"
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
);
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博时热搜时出现连接失败"
,
e
);
log
.
error
(
"解析微博时热搜时出现连接失败"
,
e
);
}
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
...
...
@@ -187,11 +532,11 @@ public class WeiboHotSearchTest{
try
{
JSONObject
card
=
cards
.
getJSONObject
(
0
);
JSONArray
cardGroup
=
card
.
getJSONArray
(
"card_group"
);
JSONObject
topCard
=
cardGroup
.
getJSONObject
(
0
);
if
(!
topCard
.
containsKey
(
"pic"
))
{
JSONObject
topCard
=
cardGroup
.
getJSONObject
(
0
);
if
(!
topCard
.
containsKey
(
"pic"
))
{
rank
=
1
;
}
if
(
Objects
.
nonNull
(
cardGroup
)
&&
!
cardGroup
.
isEmpty
())
{
if
(
nonNull
(
cardGroup
)
&&
!
cardGroup
.
isEmpty
())
{
// String title = card.getString("title");
boolean
hot
=
true
;
// if (Objects.nonNull(title) && title.contains("实时上升热点")) {
...
...
@@ -232,4 +577,5 @@ public class WeiboHotSearchTest{
return
Collections
.
emptyList
();
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment