Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
e800df88
Commit
e800df88
authored
May 16, 2022
by
chenweitao
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'working' into 'master'
增加知乎采集循环次数 See merge request
!191
parents
1ea78f29
982e420e
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
34 additions
and
29 deletions
+34
-29
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
+34
-29
No files found.
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
View file @
e800df88
...
...
@@ -12,7 +12,6 @@ import lombok.extern.log4j.Log4j2;
import
okhttp3.Request
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -27,9 +26,9 @@ import org.jsoup.select.Elements;
import
static
java
.
util
.
Objects
.
nonNull
;
/**
* @author hero
* @ClassName: ZhihuHotCrawler
* @Description: 知乎热搜采集程序
* @author hero
* @date 2017年9月15日 上午10:54:31
*/
@Log4j2
...
...
@@ -81,18 +80,17 @@ public class ZhihuHotSearchCrawler {
// }
/**
* @param @return 设定文件
* @return List<ZhihuHotSearch> 返回类型
* @Title: getMobileZhihuHotList
* @author hero
* @Description: 移動端知乎熱搜榜
* @param @return 设定文件
* @return List<ZhihuHotSearch> 返回类型
*/
public
static
List
<
HotSearchList
>
getMobileZhihuHotList
(
Date
date
)
{
public
static
List
<
HotSearchList
>
getMobileZhihuHotList
(
Date
date
)
{
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
String
url
=
"https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0"
;
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"api.zhihu.com"
);
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
);
...
...
@@ -100,13 +98,17 @@ public class ZhihuHotSearchCrawler {
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
x
=
0
;
x
<=
5
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
cause
);
return
list
;
}
else
{
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
cause
);
continue
;
}
else
{
htmlBody
=
response
.
bodyString
();
if
(!
htmlBody
.
contains
(
"author"
))
{
continue
;
}
}
try
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"author"
))
{
...
...
@@ -140,61 +142,64 @@ public class ZhihuHotSearchCrawler {
String
tog
=
nonNull
(
doc
.
get
(
"tag"
))
?
doc
.
getString
(
"tag"
)
:
null
;
Long
view
=
nonNull
(
doc
.
get
(
"view"
))
?
Long
.
valueOf
(
doc
.
get
(
"view"
).
toString
())
:
null
;
Long
fans
=
nonNull
(
doc
.
get
(
"fans"
))
?
Long
.
valueOf
(
doc
.
get
(
"fans"
).
toString
())
:
null
;
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
hotCount
,
i
+
1
,
HotSearchType
.
知乎热搜
.
name
(),
date
);
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
hotCount
,
i
+
1
,
HotSearchType
.
知乎热搜
.
name
(),
date
);
zhihu
.
setFans
(
fans
);
zhihu
.
setView
(
view
);
zhihu
.
setTag
(
tog
);
list
.
add
(
zhihu
);
}
return
list
;
}
}
catch
(
Exception
e
)
{
log
.
info
(
"知乎热搜解析异常"
,
e
);
log
.
info
(
"知乎热搜解析异常"
,
e
);
}
}
return
list
;
}
//访问pc端 获取标签及浏览量关注数
private
static
org
.
bson
.
Document
getTag
(
String
url
)
{
org
.
bson
.
Document
doc
=
new
org
.
bson
.
Document
();
doc
.
put
(
"tag"
,
null
);
doc
.
put
(
"tag"
,
null
);
//浏览量
doc
.
put
(
"view"
,
null
);
doc
.
put
(
"view"
,
null
);
//粉丝
doc
.
put
(
"fans"
,
null
);
Map
<
String
,
String
>
Map
=
HeaderTool
.
getCommonHead
();
doc
.
put
(
"fans"
,
null
);
Map
<
String
,
String
>
Map
=
HeaderTool
.
getCommonHead
();
Map
.
put
(
"cookie"
,
"_xsrf=7NFWM5qBcOutfs8MaW7bhQQH65t3Xia4"
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
Map
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
Map
);
try
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"单条知乎热搜数据页面连接失败"
,
cause
);
log
.
error
(
"单条知乎热搜数据页面连接失败"
,
cause
);
return
doc
;
}
else
{
}
else
{
String
htmlBody
=
response
.
bodyString
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"QuestionHeader"
))
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
//获取标签
String
label
=
""
;
String
label
=
""
;
Elements
select
=
document
.
select
(
"div.Tag"
);
for
(
Element
element
:
select
)
{
String
text
=
"`"
+
element
.
select
(
"div.Popover"
).
text
()+
";"
;
label
=
label
+
text
;
String
text
=
"`"
+
element
.
select
(
"div.Popover"
).
text
()
+
";"
;
label
=
label
+
text
;
}
doc
.
put
(
"tag"
,
label
.
trim
());
doc
.
put
(
"tag"
,
label
.
trim
());
String
strong
=
document
.
select
(
"div.NumberBoard-itemInner"
).
select
(
"strong"
).
text
();
String
[]
count
=
strong
.
split
(
" "
);
//获取关注数
doc
.
put
(
"fans"
,
Long
.
valueOf
(
count
[
0
].
replaceAll
(
","
,
""
).
trim
()));
doc
.
put
(
"fans"
,
Long
.
valueOf
(
count
[
0
].
replaceAll
(
","
,
""
).
trim
()));
//获取浏览量
doc
.
put
(
"view"
,
Long
.
valueOf
(
count
[
1
].
replaceAll
(
","
,
""
).
trim
()));
doc
.
put
(
"view"
,
Long
.
valueOf
(
count
[
1
].
replaceAll
(
","
,
""
).
trim
()));
return
doc
;
}
else
{
}
else
{
return
doc
;
}
}
}
catch
(
Exception
e
)
{
log
.
info
(
"知乎热搜标签解析异常"
,
e
);
log
.
info
(
"知乎热搜标签解析异常"
,
e
);
}
return
doc
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment