Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
982e420e
Commit
982e420e
authored
May 16, 2022
by
leiliangliang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
增加知乎采集循环次数
parent
7151cb11
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
34 additions
and
29 deletions
+34
-29
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
+34
-29
No files found.
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
View file @
982e420e
...
@@ -12,7 +12,6 @@ import lombok.extern.log4j.Log4j2;
...
@@ -12,7 +12,6 @@ import lombok.extern.log4j.Log4j2;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
@@ -27,9 +26,9 @@ import org.jsoup.select.Elements;
...
@@ -27,9 +26,9 @@ import org.jsoup.select.Elements;
import
static
java
.
util
.
Objects
.
nonNull
;
import
static
java
.
util
.
Objects
.
nonNull
;
/**
/**
* @author hero
* @ClassName: ZhihuHotCrawler
* @ClassName: ZhihuHotCrawler
* @Description: 知乎热搜采集程序
* @Description: 知乎热搜采集程序
* @author hero
* @date 2017年9月15日 上午10:54:31
* @date 2017年9月15日 上午10:54:31
*/
*/
@Log4j2
@Log4j2
...
@@ -81,18 +80,17 @@ public class ZhihuHotSearchCrawler {
...
@@ -81,18 +80,17 @@ public class ZhihuHotSearchCrawler {
// }
// }
/**
/**
* @param @return 设定文件
* @return List<ZhihuHotSearch> 返回类型
* @Title: getMobileZhihuHotList
* @Title: getMobileZhihuHotList
* @author hero
* @author hero
* @Description: 移動端知乎熱搜榜
* @Description: 移動端知乎熱搜榜
* @param @return 设定文件
* @return List<ZhihuHotSearch> 返回类型
*/
*/
public
static
List
<
HotSearchList
>
getMobileZhihuHotList
(
Date
date
)
{
public
static
List
<
HotSearchList
>
getMobileZhihuHotList
(
Date
date
)
{
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
String
url
=
"https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0"
;
String
url
=
"https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0"
;
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"api.zhihu.com"
);
headerMap
.
put
(
"Host"
,
"api.zhihu.com"
);
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
);
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
);
...
@@ -100,13 +98,17 @@ public class ZhihuHotSearchCrawler {
...
@@ -100,13 +98,17 @@ public class ZhihuHotSearchCrawler {
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
x
=
0
;
x
<=
5
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
Throwable
cause
=
response
.
cause
();
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
cause
);
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
cause
);
return
list
;
continue
;
}
else
{
}
else
{
htmlBody
=
response
.
bodyString
();
htmlBody
=
response
.
bodyString
();
if
(!
htmlBody
.
contains
(
"author"
))
{
continue
;
}
}
}
try
{
try
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"author"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"author"
))
{
...
@@ -140,61 +142,64 @@ public class ZhihuHotSearchCrawler {
...
@@ -140,61 +142,64 @@ public class ZhihuHotSearchCrawler {
String
tog
=
nonNull
(
doc
.
get
(
"tag"
))
?
doc
.
getString
(
"tag"
)
:
null
;
String
tog
=
nonNull
(
doc
.
get
(
"tag"
))
?
doc
.
getString
(
"tag"
)
:
null
;
Long
view
=
nonNull
(
doc
.
get
(
"view"
))
?
Long
.
valueOf
(
doc
.
get
(
"view"
).
toString
())
:
null
;
Long
view
=
nonNull
(
doc
.
get
(
"view"
))
?
Long
.
valueOf
(
doc
.
get
(
"view"
).
toString
())
:
null
;
Long
fans
=
nonNull
(
doc
.
get
(
"fans"
))
?
Long
.
valueOf
(
doc
.
get
(
"fans"
).
toString
())
:
null
;
Long
fans
=
nonNull
(
doc
.
get
(
"fans"
))
?
Long
.
valueOf
(
doc
.
get
(
"fans"
).
toString
())
:
null
;
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
hotCount
,
i
+
1
,
HotSearchType
.
知乎热搜
.
name
(),
date
);
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
hotCount
,
i
+
1
,
HotSearchType
.
知乎热搜
.
name
(),
date
);
zhihu
.
setFans
(
fans
);
zhihu
.
setFans
(
fans
);
zhihu
.
setView
(
view
);
zhihu
.
setView
(
view
);
zhihu
.
setTag
(
tog
);
zhihu
.
setTag
(
tog
);
list
.
add
(
zhihu
);
list
.
add
(
zhihu
);
}
}
return
list
;
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
log
.
info
(
"知乎热搜解析异常"
,
e
);
log
.
info
(
"知乎热搜解析异常"
,
e
);
}
}
}
return
list
;
return
list
;
}
}
//访问pc端 获取标签及浏览量关注数
//访问pc端 获取标签及浏览量关注数
private
static
org
.
bson
.
Document
getTag
(
String
url
)
{
private
static
org
.
bson
.
Document
getTag
(
String
url
)
{
org
.
bson
.
Document
doc
=
new
org
.
bson
.
Document
();
org
.
bson
.
Document
doc
=
new
org
.
bson
.
Document
();
doc
.
put
(
"tag"
,
null
);
doc
.
put
(
"tag"
,
null
);
//浏览量
//浏览量
doc
.
put
(
"view"
,
null
);
doc
.
put
(
"view"
,
null
);
//粉丝
//粉丝
doc
.
put
(
"fans"
,
null
);
doc
.
put
(
"fans"
,
null
);
Map
<
String
,
String
>
Map
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
Map
=
HeaderTool
.
getCommonHead
();
Map
.
put
(
"cookie"
,
"_xsrf=7NFWM5qBcOutfs8MaW7bhQQH65t3Xia4"
);
Map
.
put
(
"cookie"
,
"_xsrf=7NFWM5qBcOutfs8MaW7bhQQH65t3Xia4"
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
Map
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
Map
);
try
{
try
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
Throwable
cause
=
response
.
cause
();
log
.
error
(
"单条知乎热搜数据页面连接失败"
,
cause
);
log
.
error
(
"单条知乎热搜数据页面连接失败"
,
cause
);
return
doc
;
return
doc
;
}
else
{
}
else
{
String
htmlBody
=
response
.
bodyString
();
String
htmlBody
=
response
.
bodyString
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"QuestionHeader"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"QuestionHeader"
))
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
//获取标签
//获取标签
String
label
=
""
;
String
label
=
""
;
Elements
select
=
document
.
select
(
"div.Tag"
);
Elements
select
=
document
.
select
(
"div.Tag"
);
for
(
Element
element
:
select
)
{
for
(
Element
element
:
select
)
{
String
text
=
"`"
+
element
.
select
(
"div.Popover"
).
text
()+
";"
;
String
text
=
"`"
+
element
.
select
(
"div.Popover"
).
text
()
+
";"
;
label
=
label
+
text
;
label
=
label
+
text
;
}
}
doc
.
put
(
"tag"
,
label
.
trim
());
doc
.
put
(
"tag"
,
label
.
trim
());
String
strong
=
document
.
select
(
"div.NumberBoard-itemInner"
).
select
(
"strong"
).
text
();
String
strong
=
document
.
select
(
"div.NumberBoard-itemInner"
).
select
(
"strong"
).
text
();
String
[]
count
=
strong
.
split
(
" "
);
String
[]
count
=
strong
.
split
(
" "
);
//获取关注数
//获取关注数
doc
.
put
(
"fans"
,
Long
.
valueOf
(
count
[
0
].
replaceAll
(
","
,
""
).
trim
()));
doc
.
put
(
"fans"
,
Long
.
valueOf
(
count
[
0
].
replaceAll
(
","
,
""
).
trim
()));
//获取浏览量
//获取浏览量
doc
.
put
(
"view"
,
Long
.
valueOf
(
count
[
1
].
replaceAll
(
","
,
""
).
trim
()));
doc
.
put
(
"view"
,
Long
.
valueOf
(
count
[
1
].
replaceAll
(
","
,
""
).
trim
()));
return
doc
;
return
doc
;
}
else
{
}
else
{
return
doc
;
return
doc
;
}
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
log
.
info
(
"知乎热搜标签解析异常"
,
e
);
log
.
info
(
"知乎热搜标签解析异常"
,
e
);
}
}
return
doc
;
return
doc
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment