Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
8f07a0cc
Commit
8f07a0cc
authored
Nov 21, 2022
by
chenweitao
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'working' into 'master'
Working See merge request
!218
parents
3b1f63a6
ab4d9e51
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
61 additions
and
1 deletion
+61
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
+1
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/KuaiShouHotSearchCrawler.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
+10
-0
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+4
-0
src/main/java/com/zhiwei/searchhotcrawler/util/DelTagsUtil.java
+44
-0
No files found.
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
View file @
8f07a0cc
...
...
@@ -33,6 +33,7 @@ public class BaiDuHotSearchCrawler {
// private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/**
* @return void 返回类型
* @Title: BaiDuHotSearchTest
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/KuaiShouHotSearchCrawler.java
View file @
8f07a0cc
...
...
@@ -35,7 +35,8 @@ public class KuaiShouHotSearchCrawler {
* @Description: PC端36Kr人气榜采集
*/
public
static
List
<
HotSearchList
>
KuaiShouHotSearchCrawler
(
Date
date
)
{
String
url
=
"https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791"
;
//String url = "https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791";
String
url
=
"https://www.kuaishou.com/?isHome=1"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Response
response
=
HttpClientUtils
.
httpBoot
.
syncCall
(
request
,
ProxyServerSupplier
.
NAT_HEAVY_PROXY
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
View file @
8f07a0cc
...
...
@@ -8,6 +8,7 @@ import com.zhiwei.http.boot.Response;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.util.DelTagsUtil
;
import
com.zhiwei.searchhotcrawler.util.HttpClientUtils
;
import
io.netty.handler.ssl.SslProvider
;
import
lombok.extern.log4j.Log4j2
;
...
...
@@ -144,10 +145,12 @@ public class ZhihuHotSearchCrawler {
String
tog
=
nonNull
(
doc
.
get
(
"tag"
))
?
doc
.
getString
(
"tag"
)
:
null
;
Long
view
=
nonNull
(
doc
.
get
(
"view"
))
?
Long
.
valueOf
(
doc
.
get
(
"view"
).
toString
())
:
null
;
Long
fans
=
nonNull
(
doc
.
get
(
"fans"
))
?
Long
.
valueOf
(
doc
.
get
(
"fans"
).
toString
())
:
null
;
String
topicLead
=
nonNull
(
doc
.
get
(
"topicLead"
))
?
doc
.
getString
(
"topicLead"
)
:
null
;
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
hotCount
,
i
+
1
,
HotSearchType
.
知乎热搜
.
name
(),
date
);
zhihu
.
setFans
(
fans
);
zhihu
.
setView
(
view
);
zhihu
.
setTag
(
tog
);
zhihu
.
setTopicLead
(
topicLead
);
list
.
add
(
zhihu
);
}
return
list
;
...
...
@@ -178,12 +181,18 @@ public class ZhihuHotSearchCrawler {
log
.
error
(
"单条知乎热搜数据页面连接失败"
,
cause
);
return
doc
;
}
else
{
String
[]
split
=
url
.
split
(
"/"
);
String
id
=
split
[
4
];
String
htmlBody
=
response
.
bodyString
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"QuestionHeader"
))
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
//获取标签
String
label
=
""
;
Elements
select
=
document
.
select
(
"div.QuestionHeader-topics"
).
select
(
"div.css-1gomreu"
);
String
substring
=
htmlBody
.
substring
(
htmlBody
.
indexOf
(
"initialState"
)
-
2
,
htmlBody
.
indexOf
(
"subAppName"
)
+
19
);
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
substring
);
String
detail
=
jsonObject
.
getJSONObject
(
"initialState"
).
getJSONObject
(
"entities"
).
getJSONObject
(
"questions"
).
getJSONObject
(
id
).
getString
(
"detail"
);
String
topicLead
=
DelTagsUtil
.
getTextFromHtml
(
detail
);
for
(
Element
element
:
select
)
{
String
text
=
"`"
+
element
.
select
(
"div.css-1gomreu"
).
text
()
+
";"
;
label
=
label
+
text
;
...
...
@@ -195,6 +204,7 @@ public class ZhihuHotSearchCrawler {
doc
.
put
(
"fans"
,
Long
.
valueOf
(
count
[
0
].
replaceAll
(
","
,
""
).
trim
()));
//获取浏览量
doc
.
put
(
"view"
,
Long
.
valueOf
(
count
[
1
].
replaceAll
(
","
,
""
).
trim
()));
doc
.
put
(
"topicLead"
,
topicLead
);
return
doc
;
}
else
{
return
doc
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
View file @
8f07a0cc
...
...
@@ -114,6 +114,7 @@ public class HotSearchCacheDAO {
document
.
put
(
"tag"
,
hotSearch
.
getTag
());
document
.
put
(
"view"
,
hotSearch
.
getView
());
document
.
put
(
"fans"
,
hotSearch
.
getFans
());
document
.
put
(
"topic_lead"
,
hotSearch
.
getTopicLead
());
}
if
(
"微博出圈榜"
.
equals
(
hotSearch
.
getType
()))
{
...
...
@@ -143,6 +144,9 @@ public class HotSearchCacheDAO {
if
(
"B站排行榜"
.
equals
(
hotSearch
.
getType
()))
{
document
.
remove
(
"downtext"
);
}
if
(
"知乎热搜"
.
equals
(
hotSearch
.
getType
()))
{
document
.
remove
(
"topic_lead"
);
}
if
(
hotSearch
.
getType
().
contains
(
"微博品牌"
))
{
document
.
remove
(
"readCount"
);
document
.
remove
(
"discussCount"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/util/DelTagsUtil.java
0 → 100644
View file @
8f07a0cc
package
com
.
zhiwei
.
searchhotcrawler
.
util
;
/**
 * Strips HTML markup from page source so that only the visible text remains.
 *
 * <p>All methods are static and stateless. The regular expressions are compiled once and
 * shared, since {@link java.util.regex.Pattern} instances are immutable and thread-safe.
 */
public class DelTagsUtil {

    /**
     * Whole {@code <script>} elements including their body. Matched case-insensitively so that
     * {@code <SCRIPT>} blocks are removed too instead of leaking their JavaScript into the text.
     */
    private static final java.util.regex.Pattern SCRIPT_PATTERN = java.util.regex.Pattern.compile(
            "<script[^>]*?>[\\s\\S]*?</script>", java.util.regex.Pattern.CASE_INSENSITIVE);

    /**
     * Whole {@code <style>} elements including their body, so that CSS rules never end up in
     * the extracted text. Case-insensitive for the same reason as {@link #SCRIPT_PATTERN}.
     */
    private static final java.util.regex.Pattern STYLE_PATTERN = java.util.regex.Pattern.compile(
            "<style[^>]*?>[\\s\\S]*?</style>", java.util.regex.Pattern.CASE_INSENSITIVE);

    /** Any remaining tag: {@code <} followed by at least one non-{@code >} character and {@code >}. */
    private static final java.util.regex.Pattern TAG_PATTERN =
            java.util.regex.Pattern.compile("<[^>]+>");

    /**
     * Runs of whitespace (space, tab, CR, LF, form feed, vertical tab). {@code \s+} removes the
     * same characters as the previous {@code \s*|\t|\r|\n} without matching the empty string.
     */
    private static final java.util.regex.Pattern WHITESPACE_PATTERN =
            java.util.regex.Pattern.compile("\\s+");

    /**
     * Non-breaking spaces, either as the {@code &nbsp;} entity or as the raw U+00A0 character.
     * Neither is covered by {@code \s}, so they survive {@link #delHtmlTags(String)}.
     */
    private static final java.util.regex.Pattern NBSP_PATTERN =
            java.util.regex.Pattern.compile("&nbsp;|\\u00A0");

    /**
     * Removes script blocks, style blocks, all remaining tags and all whitespace from HTML.
     *
     * <p>Note that whitespace is removed entirely, not collapsed: words separated only by
     * whitespace are joined together in the result.
     *
     * @param htmlStr HTML source; may be {@code null}
     * @return the text with markup and whitespace removed; an empty string when
     *         {@code htmlStr} is {@code null}
     */
    public static String delHtmlTags(String htmlStr) {
        if (htmlStr == null) {
            // Callers pass values read from parsed JSON that may be absent.
            return "";
        }
        // Script and style elements must go first: once their tags are stripped by
        // TAG_PATTERN their bodies would be indistinguishable from ordinary text.
        String text = SCRIPT_PATTERN.matcher(htmlStr).replaceAll("");
        text = STYLE_PATTERN.matcher(text).replaceAll("");
        text = TAG_PATTERN.matcher(text).replaceAll("");
        text = WHITESPACE_PATTERN.matcher(text).replaceAll("");
        return text.trim();
    }

    /**
     * Extracts the plain text of an HTML fragment: everything {@link #delHtmlTags(String)}
     * removes, plus non-breaking spaces ({@code &nbsp;} and U+00A0).
     *
     * @param htmlStr HTML source; may be {@code null}
     * @return the extracted text; an empty string when {@code htmlStr} is {@code null}
     */
    public static String getTextFromHtml(String htmlStr) {
        String text = delHtmlTags(htmlStr);
        // Ordinary spaces are already gone at this point; only non-breaking ones remain.
        return NBSP_PATTERN.matcher(text).replaceAll("");
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment