Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
5ed24a98
Commit
5ed24a98
authored
Jun 16, 2021
by
leiliangliang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
新增快手热榜的采集
parent
d2e5b1cc
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
305 additions
and
0 deletions
+305
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+1
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/KuaiShouHotSearchCrawler.java
+92
-0
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchRunTest.java
+21
-0
src/main/java/com/zhiwei/searchhotcrawler/test/KuaiShouHotSearchCrawlerTest.java
+93
-0
src/main/java/com/zhiwei/searchhotcrawler/test/KuaiShouHotSearchRun.java
+42
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+13
-0
src/test/java/hotSaerchTest/HotSearchTest.java
+43
-0
No files found.
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
5ed24a98
...
...
@@ -25,4 +25,5 @@ public enum HotSearchType {
B
站热搜
,
人气榜
36
氪
,
虎嗅热文推荐
,
快手热榜
,
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/KuaiShouHotSearchCrawler.java
0 → 100644
View file @
5ed24a98
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.time.Duration
;
import
java.util.*
;
/**
* @author ll
* @ClassName:KuaiShouHotSearchCrawlerTeat
* @Description:
* @date 2021年6月10日 下午5:54:31
*/
@Log4j2
public
class
KuaiShouHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
* @return void 返回类型
* @Title: hotSearch36KrCrawler
* @author hero
* @Description: PC端36Kr人气榜采集
*/
public
static
List
<
HotSearchList
>
KuaiShouHotSearchCrawler
(
Date
date
)
{
String
url
=
"https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析快手热榜时出现解析错误,页面结构有问题"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"APOLLO_STATE"
))
{
return
ansysData
(
htmlBody
,
date
);
}
else
{
log
.
info
(
"解析快手热榜时出现解析错误,页面结构有问题"
);
}
return
Collections
.
emptyList
();
}
private
static
List
<
HotSearchList
>
ansysData
(
String
htmlBody
,
Date
date
)
{
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
JSONObject
jsonObject
=
null
;
try
{
String
substring
=
htmlBody
.
substring
(
htmlBody
.
indexOf
(
"homexxunknown"
)+
15
,
htmlBody
.
indexOf
(
"homexxfilmcomlist"
)+
18
);
String
sub
=
"{"
+
substring
.
substring
(
substring
.
indexOf
(
"VisionHotRankResult"
)
+
22
,
substring
.
indexOf
(
"llsid"
)
-
2
)+
"}}"
;
String
substring1
=
sub
.
substring
(
0
,
sub
.
indexOf
(
"$ROOT_QUERY.visionMovieRank"
)
-
2
)+
"}"
;
jsonObject
=
JSONObject
.
parseObject
(
substring1
);
//获取每个jsonObject对象的值
Collection
<
Object
>
values
=
jsonObject
.
values
();
for
(
Object
value
:
values
)
{
try
{
JSONObject
object
=
(
JSONObject
)
JSONObject
.
toJSON
(
value
);
//获取话题名
String
name
=
object
.
getString
(
"name"
);
//排名
Integer
rank
=
object
.
getInteger
(
"rank"
);
String
hotValue
=
object
.
getString
(
"hotValue"
);
String
[]
ws
=
hotValue
.
split
(
"w"
);
//热度
Double
d
=
Double
.
valueOf
(
ws
[
0
])*
10000
;
long
hot
=
d
.
longValue
();
//话题链接
String
url
=
object
.
getString
(
"poster"
);
//标签类型
String
tagType
=
null
;
if
(
object
.
containsKey
(
"tagType"
)){
tagType
=
object
.
getString
(
"tagType"
);
}
HotSearchList
hotSearchList
=
new
HotSearchList
(
url
,
name
,
hot
,
true
,
rank
,
HotSearchType
.
快手热榜
.
name
(),
tagType
,
date
);
list
.
add
(
hotSearchList
);
}
catch
(
NumberFormatException
e
)
{
log
.
error
(
"解析快手热榜时出现解析错误"
,
e
);
}
}
}
catch
(
NumberFormatException
e
)
{
log
.
error
(
"解析快手热榜时出现解析错误,数据不是json结构"
,
e
);
}
return
list
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchRunTest.java
0 → 100644
View file @
5ed24a98
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
java.text.ParseException
;
public class HotSearchRunTest {

    /**
     * Manual entry point: initialises the proxy factory with the hot-search application
     * settings. The crawler threads themselves are left disabled below.
     *
     * @param args unused
     * @throws ParseException declared by the original signature; nothing in the body throws it
     */
    public static void main(String[] args) throws ParseException {
        ProxyFactory.init(buildProxyConfig());
        // Start the Weibo hot-search crawl:
        // new WeiboHotSearchRun().start();
        // Start the Kuaishou hot-list crawl:
        // new KuaiShouHotSearchRun().start();
    }

    /** Assembles the proxy configuration from the registry and group held in {@code ProxyConfig}. */
    private static SimpleConfig buildProxyConfig() {
        return SimpleConfig.builder()
                .registry(ProxyConfig.registry)
                .group(ProxyConfig.group)
                .appId(10000013)
                .appName("hotsearch")
                .build();
    }
}
src/main/java/com/zhiwei/searchhotcrawler/test/KuaiShouHotSearchCrawlerTest.java
0 → 100644
View file @
5ed24a98
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.time.Duration
;
import
java.util.*
;
/**
* @author ll
* @ClassName:KuaiShouHotSearchCrawlerTeat
* @Description:
* @date 2021年6月10日 下午5:54:31
*/
@Log4j2
public
class
KuaiShouHotSearchCrawlerTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
* @return void 返回类型
* @Title: hotSearch36KrCrawler
* @author hero
* @Description: PC端36Kr人气榜采集
*/
public
static
List
<
HotSearchList
>
KuaiShouHotSearchCrawler
(
Date
date
)
{
String
url
=
"https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析快手热榜时出现解析错误,页面结构有问题"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"APOLLO_STATE"
))
{
return
ansysData
(
htmlBody
,
date
);
}
else
{
log
.
info
(
"解析快手热榜时出现解析错误,页面结构有问题"
);
}
return
Collections
.
emptyList
();
}
private
static
List
<
HotSearchList
>
ansysData
(
String
htmlBody
,
Date
date
)
{
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
JSONObject
jsonObject
=
null
;
try
{
String
substring
=
htmlBody
.
substring
(
htmlBody
.
indexOf
(
"homexxunknown"
)+
15
,
htmlBody
.
indexOf
(
"homexxfilmcomlist"
)+
18
);
String
sub
=
"{"
+
substring
.
substring
(
substring
.
indexOf
(
"VisionHotRankResult"
)
+
22
,
substring
.
indexOf
(
"llsid"
)
-
2
)+
"}}"
;
String
substring1
=
sub
.
substring
(
0
,
sub
.
indexOf
(
"$ROOT_QUERY.visionMovieRank"
)
-
2
)+
"}"
;
jsonObject
=
JSONObject
.
parseObject
(
substring1
);
//获取每个jsonObject对象的值
Collection
<
Object
>
values
=
jsonObject
.
values
();
for
(
Object
value
:
values
)
{
try
{
JSONObject
object
=
(
JSONObject
)
JSONObject
.
toJSON
(
value
);
//获取话题名
String
name
=
object
.
getString
(
"name"
);
//排名
Integer
rank
=
object
.
getInteger
(
"rank"
);
String
hotValue
=
object
.
getString
(
"hotValue"
);
String
[]
ws
=
hotValue
.
split
(
"w"
);
//热度
Double
d
=
Double
.
valueOf
(
ws
[
0
])*
10000
;
long
hot
=
d
.
longValue
();
//话题链接
String
url
=
object
.
getString
(
"poster"
);
//标签类型
String
tagType
=
null
;
if
(
object
.
containsKey
(
"tagType"
)){
tagType
=
object
.
getString
(
"tagType"
);
}
HotSearchList
hotSearchList
=
new
HotSearchList
(
url
,
name
,
hot
,
true
,
rank
,
HotSearchType
.
快手热榜
.
name
(),
tagType
,
date
);
list
.
add
(
hotSearchList
);
}
catch
(
NumberFormatException
e
)
{
log
.
error
(
"解析快手热榜时出现解析错误"
,
e
);
}
}
}
catch
(
NumberFormatException
e
)
{
log
.
error
(
"解析快手热榜时出现解析错误,数据不是json结构"
,
e
);
}
return
list
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/test/KuaiShouHotSearchRun.java
0 → 100644
View file @
5ed24a98
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.crawler.ZhihuHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
@Log4j2
public
class
KuaiShouHotSearchRun
extends
Thread
{
@Override
public
void
run
()
{
boolean
f
=
true
;
while
(
f
)
{
try
{
getHotList
();
TimeUnit
.
MINUTES
.
sleep
(
1
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
}
ZhiWeiTools
.
sleep
(
50
);
}
}
private
void
getHotList
()
{
log
.
info
(
"快手热榜采集开始........"
);
List
<
HotSearchList
>
kuaiShouList
=
KuaiShouHotSearchCrawlerTest
.
KuaiShouHotSearchCrawler
(
new
Date
());
log
.
info
(
"{}, 此轮快手热榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
kuaiShouList
!=
null
?
kuaiShouList
.
size
()
:
0
));
TipsUtils
.
addHotList
(
"快手热榜"
,
kuaiShouList
);
log
.
info
(
"快手热榜采集结束........"
);
}
}
\ No newline at end of file
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
View file @
5ed24a98
...
...
@@ -507,5 +507,18 @@ public class GatherTimer {
}
return
name
;
}
/**
 * Scheduled crawl of the Kuaishou hot list.
 *
 * <p>Triggered by the cron expression below on the {@code myScheduler} executor: fetches the
 * list, logs the number of collected entries and passes the list to
 * {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerKuaiShou() {
    logger.info("快手热榜开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> kuaiShouHotList = KuaiShouHotSearchCrawler.KuaiShouHotSearchCrawler(crawlTime);
    // A null list is reported as zero collected entries.
    int collected = 0;
    if (kuaiShouHotList != null) {
        collected = kuaiShouHotList.size();
    }
    logger.info("{}, 快手此轮采集到的数据量为:{}", new Date(), collected);
    TipsUtils.addHotList(HotSearchType.快手热榜.name(), kuaiShouHotList);
    logger.info("快手热榜采集结束...");
}
}
src/test/java/hotSaerchTest/HotSearchTest.java
0 → 100644
View file @
5ed24a98
package
hotSaerchTest
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.test.KuaiShouHotSearchCrawlerTest
;
import
lombok.extern.log4j.Log4j2
;
import
org.junit.Test
;
import
org.junit.runner.RunWith
;
import
org.springframework.test.context.ContextConfiguration
;
import
org.springframework.test.context.junit4.SpringJUnit4ClassRunner
;
import
java.util.Date
;
import
java.util.List
;
/**
 * Spring-context test harness for running the hot-search crawlers by hand.
 *
 * @author ll
 * @date 2021/6/10 6:30
 */
@Log4j2
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations = {"classpath:applicationContext.xml"})
public class HotSearchTest {

    /**
     * Runs the Kuaishou hot list crawl once and prints the collected entries followed by
     * their count. Nothing is asserted; the output is meant to be inspected manually.
     */
    @Test
    public void kuaiShouTestCrawler() {
        ProxyFactory.init(proxyConfig());
        final List<HotSearchList> crawled = KuaiShouHotSearchCrawlerTest.KuaiShouHotSearchCrawler(new Date());
        System.out.println(crawled);
        System.out.println(crawled.size());
    }

    /** Assembles the proxy configuration from the registry and group held in {@code ProxyConfig}. */
    private static SimpleConfig proxyConfig() {
        return SimpleConfig.builder()
                .registry(ProxyConfig.registry)
                .group(ProxyConfig.group)
                .appId(10000013)
                .appName("hotsearch")
                .build();
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment