Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
d544547c
Commit
d544547c
authored
Jun 23, 2021
by
leiliangliang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
百度热搜的更新
parent
ac59b5ab
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
185 additions
and
15 deletions
+185
-15
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
+59
-15
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+6
-0
src/test/java/hotSaerchTest/HotSearchTest.java
+120
-0
No files found.
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
View file @
d544547c
...
...
@@ -12,6 +12,7 @@ import okhttp3.Response;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -21,9 +22,9 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
/**
* @author hero
* @ClassName:BaiDuHotSearch
* @Description: TODO(百度风云榜热搜采集)
* @author hero
* @date 2019年7月10日 上午10:54:31
*/
@Log4j2
...
...
@@ -32,35 +33,79 @@ public class BaiDuHotSearchCrawler {
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
* @return void 返回类型
* @Title: BaiDuHotSearchTest
* @author hero
* @Description: PC端百度风云榜采集
* @return void 返回类型
*/
public
static
List
<
HotSearchList
>
baiduHotSearch
(
Date
date
)
{
String
url
=
"http://top.baidu.com/buzz?b=1&fr=topindex"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"mainBody
"
))
{
return
ansysData
(
htmlBody
,
date
);
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"container-bg_lQ801
"
))
{
return
ansysNewData
(
htmlBody
,
date
);
}
else
{
log
.
info
(
"解析百度风云榜时出现解析错误,页面结构有问题"
);
}
return
Collections
.
emptyList
();
}
/**
 * Parses the redesigned Baidu hot-search board page.
 *
 * <p>Each {@code div.category-wrap_iQLoo} node is read as one entry: rank, title (the text
 * before the first space), link, lead text and search index. An entry whose fields cannot be
 * read or converted is logged and skipped; the remaining entries are still collected.
 *
 * @param htmlBody raw HTML of the board page
 * @param date     date handed to every created {@link HotSearchList}
 * @return the entries that could be parsed, in the order their nodes were selected;
 *         possibly empty
 */
private static List<HotSearchList> ansysNewData(String htmlBody, Date date) {
    List<HotSearchList> parsed = new ArrayList<>();
    try {
        Elements rows = Jsoup.parse(htmlBody).select("div.category-wrap_iQLoo");
        if (rows == null || rows.isEmpty()) {
            return parsed;
        }
        for (Element row : rows) {
            try {
                // Rank shown on the thumbnail badge
                String rankText = row.select("a.img-wrapper_29V76").select("div.index_1Ew5p").text();
                Integer rank = Integer.valueOf(rankText);

                // Title: keep only the part before the first space
                String title = row.select("a.title_dIF3B").text().split(" ")[0];

                // Link behind the title
                String link = row.select("div.content_1YWBm").select("a.title_dIF3B").attr("href");

                // Lead / summary text
                String lead = row.select("div.small_Uvkd3").text();

                // Search index
                Long searchIndex = Long.valueOf(row.select("div.hot-index_1Bl1a").text());

                HotSearchList entry =
                        new HotSearchList(link, title, searchIndex, rank, HotSearchType.百度热搜.name(), date);
                entry.setTopicLead(lead);
                parsed.add(entry);
            } catch (Exception e) {
                // A single malformed entry must not abort the whole board
                log.error("解析百度风云榜时出现解析错误", e);
            }
        }
    } catch (Exception e) {
        log.error("解析百度风云榜时出现解析错误,数据不是json结构", e);
    }
    return parsed;
}
/**
* 解析数据
*
* @param htmlBody
* @return
*/
private
static
List
<
HotSearchList
>
ansysData
(
String
htmlBody
,
Date
date
)
{
private
static
List
<
HotSearchList
>
ansysData
(
String
htmlBody
,
Date
date
)
{
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
try
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
...
...
@@ -88,12 +133,12 @@ public class BaiDuHotSearchCrawler {
String
kw
=
element
.
select
(
"td.keyword"
).
select
(
"a.list-title"
).
text
();
// logger.info("关键词:{}", kw);
//从连接中获取正确编码关键词
try
{
if
(!
everurl
.
isEmpty
())
{
kw
=
URLDecoder
.
decode
(
everurl
.
substring
(
everurl
.
indexOf
(
"&wd="
)+
4
).
split
(
"&"
)[
0
],
"GB2312"
);
try
{
if
(!
everurl
.
isEmpty
())
{
kw
=
URLDecoder
.
decode
(
everurl
.
substring
(
everurl
.
indexOf
(
"&wd="
)
+
4
).
split
(
"&"
)[
0
],
"GB2312"
);
}
}
catch
(
Exception
e1
)
{
log
.
error
(
"解析百度风云榜,地址"
,
e1
);
}
catch
(
Exception
e1
)
{
log
.
error
(
"解析百度风云榜,地址"
,
e1
);
}
// 获取搜索指数count(int)
String
hot
=
null
;
...
...
@@ -102,8 +147,7 @@ public class BaiDuHotSearchCrawler {
hot
=
element
.
select
(
"td.last"
).
select
(
"span.icon-fall"
).
text
();
}
else
if
(!
element
.
select
(
"td.last"
).
select
(
"span.icon-rise"
).
isEmpty
())
{
hot
=
element
.
select
(
"td.last"
).
select
(
"span.icon-rise"
).
text
();
}
else
if
(!
element
.
select
(
"td.last"
).
select
(
"span.icon-fair"
).
isEmpty
())
{
}
else
if
(!
element
.
select
(
"td.last"
).
select
(
"span.icon-fair"
).
isEmpty
())
{
hot
=
element
.
select
(
"td.last"
).
select
(
"span.icon-fair"
).
text
();
}
long
count
=
0
;
...
...
@@ -112,12 +156,12 @@ public class BaiDuHotSearchCrawler {
count
=
Integer
.
valueOf
(
hot
);
}
if
(
Objects
.
nonNull
(
rank
))
{
if
(
count
==
0
)
{
if
(
count
==
0
)
{
log
.
info
(
htmlBody
);
log
.
info
(
hot
);
log
.
info
(
element
);
}
else
{
HotSearchList
hotSearch
=
new
HotSearchList
(
everurl
,
kw
,
count
,
rank
,
HotSearchType
.
百度热搜
.
name
(),
date
);
HotSearchList
hotSearch
=
new
HotSearchList
(
everurl
,
kw
,
count
,
rank
,
HotSearchType
.
百度热搜
.
name
(),
date
);
list
.
add
(
hotSearch
);
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
View file @
d544547c
...
...
@@ -53,6 +53,9 @@ public class HotSearchCacheDAO {
if
(
"虎嗅热文推荐"
.
equals
(
hotSearch
.
getType
())){
document
.
put
(
"comment_count"
,
hotSearch
.
getCommentCount
());
}
if
(
"百度热搜"
.
equals
(
hotSearch
.
getType
())){
document
.
put
(
"topic_lead"
,
hotSearch
.
getTopicLead
());
}
if
(
"腾讯较真榜"
.
equals
(
hotSearch
.
getType
())){
document
.
put
(
"topic_result"
,
hotSearch
.
getTopicResult
());
...
...
@@ -64,6 +67,9 @@ public class HotSearchCacheDAO {
document
.
put
(
"pictureUrl"
,
hotSearch
.
getPictureUrl
());
}
addAndUpdateData
(
document
);
if
(
"百度热搜"
.
equals
(
hotSearch
.
getType
())){
document
.
remove
(
"topic_lead"
);
}
dataes
.
add
(
document
);
});
return
dataes
;
...
...
src/test/java/hotSaerchTest/HotSearchTest.java
View file @
d544547c
package
hotSaerchTest
;
import
com.alibaba.fastjson.JSONObject
;
import
com.mongodb.client.MongoCollection
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.config.DBConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.searchhotcrawler.test.KuaiShouHotSearchCrawlerTest
;
import
com.zhiwei.searchhotcrawler.test.TaoBaoHotSearchCrawlerTest
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.bson.Document
;
import
org.junit.Test
;
import
org.junit.runner.RunWith
;
import
org.springframework.test.context.ContextConfiguration
;
import
org.springframework.test.context.junit4.SpringJUnit4ClassRunner
;
import
java.io.IOException
;
import
java.util.Date
;
import
java.util.List
;
import
static
com
.
ibm
.
icu
.
util
.
LocalePriorityList
.
add
;
import
static
java
.
util
.
Objects
.
nonNull
;
/**
* @author ll
* @date 2021/6/10 6:30
...
...
@@ -24,6 +41,7 @@ import java.util.List;
{
"classpath:applicationContext.xml"
})
public
class
HotSearchTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* 测试快手热榜采集
...
...
@@ -40,4 +58,106 @@ public class HotSearchTest {
System
.
out
.
println
(
hotSearchLists
.
size
());
}
/**
 * Requests the Weibo topic detail API for one fixed hashtag through the NAT proxy, copies the
 * lead text, read/discussion counts, portrait URL and host line (when present) into a
 * document, hands it to {@code ad} and prints it.
 */
@Test
public void WeiBoUpdate() {
    SimpleConfig proxySettings = SimpleConfig.builder()
            .registry(ProxyConfig.registry)
            .group(ProxyConfig.group)
            .appId(10000013)
            .appName("hotsearch")
            .build();
    ProxyFactory.init(proxySettings);

    Document document = new Document();
    // Alternative sample topic:
    // https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26t%3D10%26q%3D%23我国新冠疫苗接种剂次超9亿%23
    String topicUrl = "https://m.weibo.cn/api/container/getIndex?containerid=231522type%3D1%26q%3D%23可口可乐回应C罗拒绝与可乐同框%23";
    String body = null;
    Request get = RequestUtils.wrapGet(topicUrl);
    try (Response response = httpBoot.syncCall(get, ProxyHolder.NAT_HEAVY_PROXY)) {
        body = response.body().string();
    } catch (IOException e) {
        log.error("解析微博热搜详情页面时出现连接失败", e);
    }

    if (body != null && body.contains("data")) {
        JSONObject cardInfo = JSONObject.parseObject(body)
                .getJSONObject("data")
                .getJSONObject("cardlistInfo");

        // Topic lead text
        if (cardInfo.containsKey("desc")) {
            String topicLead = cardInfo.getString("desc");
            if (!"".equals(topicLead)) {
                document.put("topicLead", topicLead);
            }
        }

        // Header card: counts, portrait and host line
        if (cardInfo.containsKey("cardlist_head_cards")) {
            JSONObject headCard = cardInfo.getJSONArray("cardlist_head_cards").getJSONObject(0);
            if (headCard.containsKey("head_data")) {
                JSONObject headData = headCard.getJSONObject("head_data");
                String midText = headData.getString("midtext");
                // midtext carries both counters; strip the labels around each one
                String readText = midText.replaceAll("阅读", "").replaceAll("讨论.*", "").trim();
                String discussText = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim();
                String portraitUrl = headData.getString("portrait_url");

                document.put("readCount", TipsUtils.getHotCount(readText));
                document.put("discussCount", TipsUtils.getHotCount(discussText));
                document.put("pictureUrl", portraitUrl);

                if (headData.containsKey("downtext")) {
                    String hostLine = headData.getString("downtext");
                    if (!"".equals(hostLine)) {
                        document.put("downtext", hostLine.replaceAll("主持人:", ""));
                    }
                }
            }
        }
    }

    ad(document);
    System.out.println(document);
}
/**
 * Re-stores the optional fields of {@code nowDoc} with their expected types (strings as
 * strings, both counters as {@code Long} or {@code null}) and inserts the document into the
 * collection named by {@code DBConfig.dbName} / {@code DBConfig.searchCacheCollName}.
 *
 * @param nowDoc document to normalize and insert; modified in place
 */
private void ad(Document nowDoc) {
    MongoCollection collection =
            MongoDBTemplate.getCollection(DBConfig.dbName, DBConfig.searchCacheCollName);

    if (nowDoc.containsKey("topicLead")) {
        String lead = nowDoc.getString("topicLead");
        nowDoc.put("topicLead", lead);
    }

    // Counters are only converted when both are present
    if (nowDoc.containsKey("readCount") && nowDoc.containsKey("discussCount")) {
        Long readCount = null;
        if (nowDoc.get("readCount") != null) {
            readCount = Long.valueOf(nowDoc.get("readCount").toString());
        }
        nowDoc.put("readCount", readCount);

        Long discussCount = null;
        if (nowDoc.get("discussCount") != null) {
            discussCount = Long.valueOf(nowDoc.get("discussCount").toString());
        }
        nowDoc.put("discussCount", discussCount);
    }

    if (nowDoc.containsKey("pictureUrl")) {
        String picture = nowDoc.getString("pictureUrl");
        nowDoc.put("pictureUrl", picture);
    }

    if (nowDoc.containsKey("downtext")) {
        String hostLine = nowDoc.getString("downtext");
        nowDoc.put("downtext", hostLine);
    }

    collection.insertOne(nowDoc);
}
/**
 * Initializes the proxy factory, runs the Taobao hot-search crawler once for the current
 * date and prints the returned list followed by its size.
 */
@Test
public void taoBaoTestCrawler() {
    ProxyFactory.init(
            SimpleConfig.builder()
                    .registry(ProxyConfig.registry)
                    .group(ProxyConfig.group)
                    .appId(10000013)
                    .appName("hotsearch")
                    .build());

    List<HotSearchList> results = TaoBaoHotSearchCrawlerTest.taoBaoHotSearch(new Date());
    System.out.println(results);
    System.out.println(results.size());
}
/**
 * Initializes the proxy factory, runs the Baidu hot-search crawler once for the current
 * date and prints the returned list followed by its size.
 */
@Test
public void baiDuTestCrawler() {
    ProxyFactory.init(
            SimpleConfig.builder()
                    .registry(ProxyConfig.registry)
                    .group(ProxyConfig.group)
                    .appId(10000013)
                    .appName("hotsearch")
                    .build());

    List<HotSearchList> results = BaiDuHotSearchCrawler.baiduHotSearch(new Date());
    System.out.println(results);
    System.out.println(results.size());
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment