Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
241bc05a
Commit
241bc05a
authored
Jul 02, 2021
by
chenweitao
Browse files
Options
Browse Files
Download
Plain Diff
Merge remote-tracking branch 'origin/working' into working
parents
eb71665b
d544547c
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
465 additions
and
12 deletions
+465
-12
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+1
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiBoMassage.java
+7
-2
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiBoUser.java
+7
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
+0
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/KuaiShouHotSearchCrawler.java
+92
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+9
-5
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+6
-0
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiBoMassageDao.java
+1
-0
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiBoUserDao.java
+1
-0
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchRunTest.java
+21
-0
src/main/java/com/zhiwei/searchhotcrawler/test/KuaiShouHotSearchCrawlerTest.java
+93
-0
src/main/java/com/zhiwei/searchhotcrawler/test/KuaiShouHotSearchRun.java
+42
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+13
-0
src/test/java/hotSaerchTest/HotSearchTest.java
+163
-0
src/test/java/weiboTest/WeiboHotSearchTest.java
+9
-4
No files found.
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
241bc05a
...
...
@@ -25,4 +25,5 @@ public enum HotSearchType {
B
站热搜
,
人气榜
36
氪
,
虎嗅热文推荐
,
快手热榜
,
}
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiBoMassage.java
View file @
241bc05a
...
...
@@ -88,7 +88,10 @@ public class WeiBoMassage implements Serializable {
* 话题
*/
private
String
topic
;
/**
* 头像地址
*/
private
String
profileImageUrl
;
//是否转发
private
Integer
forward
;
//转发 源微博mid
...
...
@@ -110,7 +113,7 @@ public class WeiBoMassage implements Serializable {
public
WeiBoMassage
(
String
userId
,
String
text
,
String
userName
,
String
mid
,
Date
creatTime
,
Date
editTime
,
Integer
cardType
,
Integer
showType
,
Long
repostCount
,
Long
commentCount
,
Long
attitudeCount
,
String
source
,
String
type
,
String
topic
)
{
Long
commentCount
,
Long
attitudeCount
,
String
source
,
String
type
,
String
topic
,
String
profileImageUrl
)
{
this
.
id
=
mid
+
"_"
+
HotSearchType
.
微博热搜
.
name
()+
"_"
+
topic
;
this
.
userId
=
userId
;
this
.
text
=
text
;
...
...
@@ -126,6 +129,8 @@ public class WeiBoMassage implements Serializable {
this
.
source
=
source
;
this
.
type
=
type
;
this
.
topic
=
topic
;
this
.
profileImageUrl
=
profileImageUrl
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiBoUser.java
View file @
241bc05a
...
...
@@ -48,10 +48,15 @@ public class WeiBoUser implements Serializable {
* 粉丝数
*/
private
Long
followerCount
;
/**
* 头像地址
*/
private
String
profileImageUrl
;
public
WeiBoUser
()
{
}
public
WeiBoUser
(
String
userId
,
String
attestationMassage
,
String
userName
,
String
topic
,
Date
time
,
Long
followerCount
)
{
public
WeiBoUser
(
String
userId
,
String
attestationMassage
,
String
userName
,
String
topic
,
Date
time
,
Long
followerCount
,
String
profileImageUrl
)
{
this
.
id
=
userId
+
"_"
+
HotSearchType
.
微博热搜
.
name
()+
"_"
+
topic
;
this
.
userId
=
userId
;
...
...
@@ -60,6 +65,7 @@ public class WeiBoUser implements Serializable {
this
.
topic
=
topic
;
this
.
time
=
time
;
this
.
followerCount
=
followerCount
;
this
.
profileImageUrl
=
profileImageUrl
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
View file @
241bc05a
This diff is collapsed.
Click to expand it.
src/main/java/com/zhiwei/searchhotcrawler/crawler/KuaiShouHotSearchCrawler.java
0 → 100644
View file @
241bc05a
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.time.Duration
;
import
java.util.*
;
/**
* @author ll
* @ClassName:KuaiShouHotSearchCrawlerTeat
* @Description:快手采集
* @date 2021年6月10日 下午5:54:31
*/
@Log4j2
public
class
KuaiShouHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
* @return void 返回类型
* @Title: hotSearch36KrCrawler
* @author hero
* @Description: PC端36Kr人气榜采集
*/
public
static
List
<
HotSearchList
>
KuaiShouHotSearchCrawler
(
Date
date
)
{
String
url
=
"https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析快手热榜时出现解析错误,页面结构有问题"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"APOLLO_STATE"
))
{
return
ansysData
(
htmlBody
,
date
);
}
else
{
log
.
info
(
"解析快手热榜时出现解析错误,页面结构有问题"
);
}
return
Collections
.
emptyList
();
}
private
static
List
<
HotSearchList
>
ansysData
(
String
htmlBody
,
Date
date
)
{
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
JSONObject
jsonObject
=
null
;
try
{
String
substring
=
htmlBody
.
substring
(
htmlBody
.
indexOf
(
"homexxunknown"
)+
15
,
htmlBody
.
indexOf
(
"homexxfilmcomlist"
)+
18
);
String
sub
=
"{"
+
substring
.
substring
(
substring
.
indexOf
(
"VisionHotRankResult"
)
+
22
,
substring
.
indexOf
(
"llsid"
)
-
2
)+
"}}"
;
String
substring1
=
sub
.
substring
(
0
,
sub
.
indexOf
(
"$ROOT_QUERY.visionMovieRank"
)
-
2
)+
"}"
;
jsonObject
=
JSONObject
.
parseObject
(
substring1
);
//获取每个jsonObject对象的值
Collection
<
Object
>
values
=
jsonObject
.
values
();
for
(
Object
value
:
values
)
{
try
{
JSONObject
object
=
(
JSONObject
)
JSONObject
.
toJSON
(
value
);
//获取话题名
String
name
=
object
.
getString
(
"name"
);
//排名
Integer
rank
=
object
.
getInteger
(
"rank"
);
String
hotValue
=
object
.
getString
(
"hotValue"
);
String
[]
ws
=
hotValue
.
split
(
"w"
);
//热度
Double
d
=
Double
.
valueOf
(
ws
[
0
])*
10000
;
long
hot
=
d
.
longValue
();
//话题链接
String
url
=
object
.
getString
(
"poster"
);
//标签类型
String
tagType
=
null
;
if
(
object
.
containsKey
(
"tagType"
)){
tagType
=
object
.
getString
(
"tagType"
);
}
HotSearchList
hotSearchList
=
new
HotSearchList
(
url
,
name
,
hot
,
true
,
rank
,
HotSearchType
.
快手热榜
.
name
(),
tagType
,
date
);
list
.
add
(
hotSearchList
);
}
catch
(
NumberFormatException
e
)
{
log
.
error
(
"解析快手热榜时出现解析错误"
,
e
);
}
}
}
catch
(
NumberFormatException
e
)
{
log
.
error
(
"解析快手热榜时出现解析错误,数据不是json结构"
,
e
);
}
return
list
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
241bc05a
...
...
@@ -373,8 +373,9 @@ public class WeiboHotSearchCrawler {
String
[]
split
=
followers_count
.
split
(
"万"
);
followerCount
=
Long
.
valueOf
(
split
[
0
])*
10000
;
}
WeiBoUser
weiBoUser
=
new
WeiBoUser
(
userId
,
attestationMassage
,
userName
,
topic
,
date
,
followerCount
);
//用户头像地址
String
profileImageUrl
=
users
.
getJSONObject
(
i1
).
getString
(
"profile_image_url"
);
WeiBoUser
weiBoUser
=
new
WeiBoUser
(
userId
,
attestationMassage
,
userName
,
topic
,
date
,
followerCount
,
profileImageUrl
);
weiBoUserList
.
add
(
weiBoUser
);
}
}
...
...
@@ -397,8 +398,9 @@ public class WeiboHotSearchCrawler {
}
else
{
followerCount
=
Long
.
valueOf
(
followers_count
);
}
WeiBoUser
weiBoUser
=
new
WeiBoUser
(
userId
,
attestationMassage
,
userName
,
topic
,
date
,
followerCount
);
//用户头像地址
String
profileImageUrl
=
user
.
getString
(
"profile_image_url"
);
WeiBoUser
weiBoUser
=
new
WeiBoUser
(
userId
,
attestationMassage
,
userName
,
topic
,
date
,
followerCount
,
profileImageUrl
);
weiBoUserList
.
add
(
weiBoUser
);
}
return
weiBoUserList
;
...
...
@@ -476,6 +478,8 @@ public class WeiboHotSearchCrawler {
String
userName
=
mblog
.
getJSONObject
(
"user"
).
getString
(
"screen_name"
);
//来源
String
source
=
mblog
.
getString
(
"source"
);
//用户头像地址
String
profileImageUrl
=
mblog
.
getJSONObject
(
"user"
).
getString
(
"profile_image_url"
);
//内容
String
content
=
null
;
if
(
mblog
.
getString
(
"text"
).
contains
(
"<"
))
{
...
...
@@ -488,7 +492,7 @@ public class WeiboHotSearchCrawler {
}
WeiBoMassage
weiBoMassage
=
new
WeiBoMassage
(
userId
,
content
,
userName
,
mid
,
createTime
,
editTime
,
cardType
,
showType
,
repostCount
,
commentCount
,
attitudeCount
,
source
,
type
,
topic
);
repostCount
,
commentCount
,
attitudeCount
,
source
,
type
,
topic
,
profileImageUrl
);
//默认不转发为0
weiBoMassage
.
setForward
(
0
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
View file @
241bc05a
...
...
@@ -54,6 +54,9 @@ public class HotSearchCacheDAO {
if
(
"虎嗅热文推荐"
.
equals
(
hotSearch
.
getType
())){
document
.
put
(
"comment_count"
,
hotSearch
.
getCommentCount
());
}
if
(
"百度热搜"
.
equals
(
hotSearch
.
getType
())){
document
.
put
(
"topic_lead"
,
hotSearch
.
getTopicLead
());
}
if
(
"腾讯较真榜"
.
equals
(
hotSearch
.
getType
())){
document
.
put
(
"topic_result"
,
hotSearch
.
getTopicResult
());
...
...
@@ -65,6 +68,9 @@ public class HotSearchCacheDAO {
document
.
put
(
"pictureUrl"
,
hotSearch
.
getPictureUrl
());
}
addAndUpdateData
(
document
);
if
(
"百度热搜"
.
equals
(
hotSearch
.
getType
())){
document
.
remove
(
"topic_lead"
);
}
dataes
.
add
(
document
);
});
return
dataes
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiBoMassageDao.java
View file @
241bc05a
...
...
@@ -49,6 +49,7 @@ public class WeiBoMassageDao {
document
.
put
(
"repostCount"
,
weiBoMassage
.
getRepostCount
());
document
.
put
(
"commentCount"
,
weiBoMassage
.
getCommentCount
());
document
.
put
(
"attitudeCount"
,
weiBoMassage
.
getAttitudeCount
());
document
.
put
(
"profileImageUrl"
,
weiBoMassage
.
getProfileImageUrl
());
if
(
Objects
.
nonNull
(
weiBoMassage
.
getPlayCount
())){
document
.
put
(
"playCount"
,
weiBoMassage
.
getPlayCount
());
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiBoUserDao.java
View file @
241bc05a
...
...
@@ -43,6 +43,7 @@ public class WeiBoUserDao {
document
.
put
(
"topic"
,
weiBoUser
.
getTopic
());
document
.
put
(
"time"
,
weiBoUser
.
getTime
());
document
.
put
(
"followerCount"
,
weiBoUser
.
getFollowerCount
());
document
.
put
(
"profileImageUrl"
,
weiBoUser
.
getProfileImageUrl
());
try
{
mongoCollection
.
insertOne
(
document
);
}
catch
(
Exception
e
)
{
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchRunTest.java
0 → 100644
View file @
241bc05a
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
java.text.ParseException
;
/**
 * Manual launcher: initialises the proxy factory and, once one of the commented-out lines below
 * is re-enabled, starts the corresponding collection thread.
 */
public class HotSearchRunTest {

    /**
     * Builds the proxy configuration and hands it to {@link ProxyFactory}.
     *
     * @param args unused
     * @throws ParseException declared by the original signature; nothing in the body throws it
     */
    public static void main(String[] args) throws ParseException {
        SimpleConfig proxySettings = SimpleConfig.builder()
                .registry(ProxyConfig.registry)
                .group(ProxyConfig.group)
                .appId(10000013)
                .appName("hotsearch")
                .build();
        ProxyFactory.init(proxySettings);

        // Start the Weibo hot-search collection
//        new WeiboHotSearchRun().start();

        // Start the Kuaishou hot-rank collection
//        new KuaiShouHotSearchRun().start();
    }
}
src/main/java/com/zhiwei/searchhotcrawler/test/KuaiShouHotSearchCrawlerTest.java
0 → 100644
View file @
241bc05a
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.time.Duration
;
import
java.util.*
;
/**
* @author ll
* @ClassName:KuaiShouHotSearchCrawlerTeat
* @Description:
* @date 2021年6月10日 下午5:54:31
*/
@Log4j2
public
class
KuaiShouHotSearchCrawlerTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
* @return void 返回类型
* @Title: hotSearch36KrCrawler
* @author hero
* @Description: PC端36Kr人气榜采集
*/
public
static
List
<
HotSearchList
>
KuaiShouHotSearchCrawler
(
Date
date
)
{
String
url
=
"https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析快手热榜时出现解析错误,页面结构有问题"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"APOLLO_STATE"
))
{
return
ansysData
(
htmlBody
,
date
);
}
else
{
log
.
info
(
"解析快手热榜时出现解析错误,页面结构有问题"
);
}
return
Collections
.
emptyList
();
}
private
static
List
<
HotSearchList
>
ansysData
(
String
htmlBody
,
Date
date
)
{
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
JSONObject
jsonObject
=
null
;
try
{
String
substring
=
htmlBody
.
substring
(
htmlBody
.
indexOf
(
"homexxunknown"
)+
15
,
htmlBody
.
indexOf
(
"homexxfilmcomlist"
)+
18
);
String
sub
=
"{"
+
substring
.
substring
(
substring
.
indexOf
(
"VisionHotRankResult"
)
+
22
,
substring
.
indexOf
(
"llsid"
)
-
2
)+
"}}"
;
String
substring1
=
sub
.
substring
(
0
,
sub
.
indexOf
(
"$ROOT_QUERY.visionMovieRank"
)
-
2
)+
"}"
;
jsonObject
=
JSONObject
.
parseObject
(
substring1
);
//获取每个jsonObject对象的值
Collection
<
Object
>
values
=
jsonObject
.
values
();
for
(
Object
value
:
values
)
{
try
{
JSONObject
object
=
(
JSONObject
)
JSONObject
.
toJSON
(
value
);
//获取话题名
String
name
=
object
.
getString
(
"name"
);
//排名
Integer
rank
=
object
.
getInteger
(
"rank"
);
String
hotValue
=
object
.
getString
(
"hotValue"
);
String
[]
ws
=
hotValue
.
split
(
"w"
);
//热度
Double
d
=
Double
.
valueOf
(
ws
[
0
])*
10000
;
long
hot
=
d
.
longValue
();
//话题链接
String
url
=
object
.
getString
(
"poster"
);
//标签类型
String
tagType
=
null
;
if
(
object
.
containsKey
(
"tagType"
)){
tagType
=
object
.
getString
(
"tagType"
);
}
HotSearchList
hotSearchList
=
new
HotSearchList
(
url
,
name
,
hot
,
true
,
rank
,
HotSearchType
.
快手热榜
.
name
(),
tagType
,
date
);
list
.
add
(
hotSearchList
);
}
catch
(
NumberFormatException
e
)
{
log
.
error
(
"解析快手热榜时出现解析错误"
,
e
);
}
}
}
catch
(
NumberFormatException
e
)
{
log
.
error
(
"解析快手热榜时出现解析错误,数据不是json结构"
,
e
);
}
return
list
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/test/KuaiShouHotSearchRun.java
0 → 100644
View file @
241bc05a
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.crawler.ZhihuHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
@Log4j2
public
class
KuaiShouHotSearchRun
extends
Thread
{
@Override
public
void
run
()
{
boolean
f
=
true
;
while
(
f
)
{
try
{
getHotList
();
TimeUnit
.
MINUTES
.
sleep
(
1
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
}
ZhiWeiTools
.
sleep
(
50
);
}
}
private
void
getHotList
()
{
log
.
info
(
"快手热榜采集开始........"
);
List
<
HotSearchList
>
kuaiShouList
=
KuaiShouHotSearchCrawlerTest
.
KuaiShouHotSearchCrawler
(
new
Date
());
log
.
info
(
"{}, 此轮快手热榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
kuaiShouList
!=
null
?
kuaiShouList
.
size
()
:
0
));
TipsUtils
.
addHotList
(
"快手热榜"
,
kuaiShouList
);
log
.
info
(
"快手热榜采集结束........"
);
}
}
\ No newline at end of file
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
View file @
241bc05a
...
...
@@ -507,5 +507,18 @@ public class GatherTimer {
}
return
name
;
}
/**
 * Scheduled collection of the Kuaishou hot-rank list.
 *
 * <p>Triggered by the cron expression below (second 0 of every minute) on the
 * {@code myScheduler} executor; the crawled entries are handed to {@link TipsUtils#addHotList}.
 */
@Async("myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerKuaiShou() {
    logger.info("快手热榜开始采集...");

    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> rankedEntries = KuaiShouHotSearchCrawler.KuaiShouHotSearchCrawler(crawlTime);

    // A null result is reported as zero collected entries.
    int collected = 0;
    if (rankedEntries != null) {
        collected = rankedEntries.size();
    }
    logger.info("{}, 快手此轮采集到的数据量为:{}", new Date(), collected);

    TipsUtils.addHotList(HotSearchType.快手热榜.name(), rankedEntries);
    logger.info("快手热榜采集结束...");
}
}
src/test/java/hotSaerchTest/HotSearchTest.java
0 → 100644
View file @
241bc05a
package
hotSaerchTest
;
import
com.alibaba.fastjson.JSONObject
;
import
com.mongodb.client.MongoCollection
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.config.DBConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.searchhotcrawler.test.KuaiShouHotSearchCrawlerTest
;
import
com.zhiwei.searchhotcrawler.test.TaoBaoHotSearchCrawlerTest
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.bson.Document
;
import
org.junit.Test
;
import
org.junit.runner.RunWith
;
import
org.springframework.test.context.ContextConfiguration
;
import
org.springframework.test.context.junit4.SpringJUnit4ClassRunner
;
import
java.io.IOException
;
import
java.util.Date
;
import
java.util.List
;
import
static
com
.
ibm
.
icu
.
util
.
LocalePriorityList
.
add
;
import
static
java
.
util
.
Objects
.
nonNull
;
/**
* @author ll
* @date 2021/6/10 6:30
*/
@Log4j2
@RunWith
(
SpringJUnit4ClassRunner
.
class
)
@ContextConfiguration
(
locations
=
{
"classpath:applicationContext.xml"
})
public
class
HotSearchTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* 测试快手热榜采集
*/
@Test
public
void
kuaiShouTestCrawler
()
{
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
List
<
HotSearchList
>
hotSearchLists
=
KuaiShouHotSearchCrawlerTest
.
KuaiShouHotSearchCrawler
(
new
Date
());
System
.
out
.
println
(
hotSearchLists
);
System
.
out
.
println
(
hotSearchLists
.
size
());
}
@Test
public
void
WeiBoUpdate
()
{
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
Document
document
=
new
Document
();
//String url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26t%3D10%26q%3D%23我国新冠疫苗接种剂次超9亿%23";
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid=231522type%3D1%26q%3D%23可口可乐回应C罗拒绝与可乐同框%23"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONObject
(
"cardlistInfo"
);
if
(
json
.
containsKey
(
"desc"
))
{
String
topicLead
=
json
.
getString
(
"desc"
);
if
(!
""
.
equals
(
topicLead
))
{
document
.
put
(
"topicLead"
,
topicLead
);
}
}
if
(
json
.
containsKey
(
"cardlist_head_cards"
))
{
JSONObject
readJson
=
json
.
getJSONArray
(
"cardlist_head_cards"
).
getJSONObject
(
0
);
if
(
readJson
.
containsKey
(
"head_data"
))
{
String
midText
=
readJson
.
getJSONObject
(
"head_data"
).
getString
(
"midtext"
);
String
read
=
midText
.
replaceAll
(
"阅读"
,
""
).
replaceAll
(
"讨论.*"
,
""
).
trim
();
String
discussCount
=
midText
.
replaceAll
(
".*讨论"
,
""
).
replaceAll
(
"详情.*"
,
""
).
trim
();
String
pictureUrl
=
readJson
.
getJSONObject
(
"head_data"
).
getString
(
"portrait_url"
);
document
.
put
(
"readCount"
,
TipsUtils
.
getHotCount
(
read
));
document
.
put
(
"discussCount"
,
TipsUtils
.
getHotCount
(
discussCount
));
document
.
put
(
"pictureUrl"
,
pictureUrl
);
if
(
readJson
.
getJSONObject
(
"head_data"
).
containsKey
(
"downtext"
))
{
String
downtext
=
readJson
.
getJSONObject
(
"head_data"
).
getString
(
"downtext"
);
if
(!
""
.
equals
(
downtext
))
{
document
.
put
(
"downtext"
,
downtext
.
replaceAll
(
"主持人:"
,
""
));
}
}
}
}
}
ad
(
document
);
System
.
out
.
println
(
document
);
}
private
void
ad
(
Document
nowDoc
)
{
MongoCollection
collection
=
MongoDBTemplate
.
getCollection
(
DBConfig
.
dbName
,
DBConfig
.
searchCacheCollName
);
if
(
nowDoc
.
containsKey
(
"topicLead"
)){
nowDoc
.
put
(
"topicLead"
,
nowDoc
.
getString
(
"topicLead"
));
}
if
(
nowDoc
.
containsKey
(
"readCount"
)
&&
nowDoc
.
containsKey
(
"discussCount"
))
{
nowDoc
.
put
(
"readCount"
,
nonNull
(
nowDoc
.
get
(
"readCount"
))?
Long
.
valueOf
(
nowDoc
.
get
(
"readCount"
).
toString
()):
null
);
nowDoc
.
put
(
"discussCount"
,
nonNull
(
nowDoc
.
get
(
"discussCount"
))?
Long
.
valueOf
(
nowDoc
.
get
(
"discussCount"
).
toString
()):
null
);
}
if
(
nowDoc
.
containsKey
(
"pictureUrl"
))
{
nowDoc
.
put
(
"pictureUrl"
,
nowDoc
.
getString
(
"pictureUrl"
));
}
if
(
nowDoc
.
containsKey
(
"downtext"
))
{
nowDoc
.
put
(
"downtext"
,
nowDoc
.
getString
(
"downtext"
));
}
collection
.
insertOne
(
nowDoc
);
}
/**
* 测试淘宝热搜采集
*/
@Test
public
void
taoBaoTestCrawler
()
{
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
List
<
HotSearchList
>
hotSearchLists
=
TaoBaoHotSearchCrawlerTest
.
taoBaoHotSearch
(
new
Date
());
System
.
out
.
println
(
hotSearchLists
);
System
.
out
.
println
(
hotSearchLists
.
size
());
}
/**
* 测试百度热搜采集
*/
@Test
public
void
baiDuTestCrawler
()
{
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
List
<
HotSearchList
>
hotSearchLists
=
BaiDuHotSearchCrawler
.
baiduHotSearch
(
new
Date
());
System
.
out
.
println
(
hotSearchLists
);
System
.
out
.
println
(
hotSearchLists
.
size
());
}
}
src/test/java/weiboTest/WeiboHotSearchTest.java
View file @
241bc05a
...
...
@@ -333,7 +333,9 @@ public class WeiboHotSearchTest {
String
[]
split
=
followers_count
.
split
(
"万"
);
followerCount
=
Long
.
valueOf
(
split
[
0
])*
10000
;
}
WeiBoUser
weiBoUser
=
new
WeiBoUser
(
userId
,
attestationMassage
,
userName
,
topic
,
date
,
followerCount
);
//用户头像地址
String
profileImageUrl
=
users
.
getJSONObject
(
i1
).
getString
(
"profile_image_url"
);
WeiBoUser
weiBoUser
=
new
WeiBoUser
(
userId
,
attestationMassage
,
userName
,
topic
,
date
,
followerCount
,
profileImageUrl
);
weiBoUserList
.
add
(
weiBoUser
);
}
}
...
...
@@ -356,8 +358,9 @@ public class WeiboHotSearchTest {
}
else
{
followerCount
=
Long
.
valueOf
(
followers_count
);
}
WeiBoUser
weiBoUser
=
new
WeiBoUser
(
userId
,
attestationMassage
,
userName
,
topic
,
date
,
followerCount
);
//用户头像地址
String
profileImageUrl
=
user
.
getString
(
"profile_image_url"
);
WeiBoUser
weiBoUser
=
new
WeiBoUser
(
userId
,
attestationMassage
,
userName
,
topic
,
date
,
followerCount
,
profileImageUrl
);
weiBoUserList
.
add
(
weiBoUser
);
}
...
...
@@ -436,6 +439,8 @@ public class WeiboHotSearchTest {
String
userName
=
mblog
.
getJSONObject
(
"user"
).
getString
(
"screen_name"
);
//来源
String
source
=
mblog
.
getString
(
"source"
);
//用户头像地址
String
profileImageUrl
=
mblog
.
getJSONObject
(
"user"
).
getString
(
"profile_image_url"
);
//内容
String
content
=
null
;
if
(
mblog
.
getString
(
"text"
).
contains
(
"<"
))
{
...
...
@@ -448,7 +453,7 @@ public class WeiboHotSearchTest {
}
WeiBoMassage
weiBoMassage
=
new
WeiBoMassage
(
userId
,
content
,
userName
,
mid
,
createTime
,
editTime
,
cardType
,
showType
,
repostCount
,
commentCount
,
attitudeCount
,
source
,
type
,
topic
);
repostCount
,
commentCount
,
attitudeCount
,
source
,
type
,
topic
,
profileImageUrl
);
//默认不转发为0
weiBoMassage
.
setForward
(
0
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment