Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
1d1a7503
Commit
1d1a7503
authored
Nov 08, 2022
by
leiliangliang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
暂停知乎数码,微博超话,微博热词定时任务及头条阅读量更新
parent
2450a48a
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
37 additions
and
27 deletions
+37
-27
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
+8
-13
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
+16
-8
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+10
-5
src/main/resources/proxyip.properties
+3
-1
No files found.
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
View file @
1d1a7503
...
@@ -166,7 +166,6 @@ public class ToutiaoHotSearchCrawler {
...
@@ -166,7 +166,6 @@ public class ToutiaoHotSearchCrawler {
String
htmlBody
=
null
;
String
htmlBody
=
null
;
String
url
=
hotSearchList
.
getUrl
();
String
url
=
hotSearchList
.
getUrl
();
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
i
=
0
;
i
<=
5
;
i
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyServerSupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyServerSupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
Throwable
cause
=
response
.
cause
();
...
@@ -174,26 +173,22 @@ public class ToutiaoHotSearchCrawler {
...
@@ -174,26 +173,22 @@ public class ToutiaoHotSearchCrawler {
}
else
{
}
else
{
htmlBody
=
response
.
bodyString
();
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)&&
htmlBody
.
contains
(
"data"
))
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
try
{
Elements
elements
=
document
.
select
(
".result-content .cs-view .cs-topone-tail .cs-view .margin-bottom-m .margin-left-m"
);
String
substring
=
htmlBody
.
substring
(
htmlBody
.
indexOf
(
"read_count"
)+
12
,
htmlBody
.
indexOf
(
"search_bar_controll"
));
if
(
Objects
.
nonNull
(
elements
)
&&
!
elements
.
isEmpty
())
{
String
s
=
substring
.
split
(
","
)[
0
];
Element
element
=
elements
.
first
();
Long
commentCount
=
Long
.
valueOf
(
s
);
String
readCount
=
element
.
text
().
replaceAll
(
"阅读"
,
""
);
hotSearchList
.
setCommentCount
(
commentCount
);
Long
count
=
TipsUtils
.
getHotCount
(
readCount
);
log
.
info
(
"{},阅读量:{}"
,
hotSearchList
.
getName
(),
count
);
hotSearchList
.
setCommentCount
(
count
);
hotSearchListDAO
.
updateTouTiaoReadCount
(
hotSearchList
);
hotSearchListDAO
.
updateTouTiaoReadCount
(
hotSearchList
);
return
hotSearchList
;
return
hotSearchList
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
}
}
}
ZhiWeiTools
.
sleep
(
1000L
);
}
}
}
return
hotSearchList
;
return
hotSearchList
;
}
}
/**
/**
* 热搜类型
* 热搜类型
*
*
...
...
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
View file @
1d1a7503
...
@@ -4,6 +4,7 @@ package com.zhiwei.searchhotcrawler.run;
...
@@ -4,6 +4,7 @@ package com.zhiwei.searchhotcrawler.run;
import
com.zhiwei.http.proxy.CynomysFactory
;
import
com.zhiwei.http.proxy.CynomysFactory
;
import
com.zhiwei.network.cynomys.consumer.CynomysConsumer
;
import
com.zhiwei.network.cynomys.consumer.CynomysConsumer
;
import
com.zhiwei.network.cynomys.consumer.CynomysConsumerFactory
;
import
com.zhiwei.network.cynomys.consumer.CynomysConsumerFactory
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.timer.*
;
import
com.zhiwei.searchhotcrawler.timer.*
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.dubbo.config.ApplicationConfig
;
import
org.apache.dubbo.config.ApplicationConfig
;
...
@@ -18,21 +19,28 @@ public class HotSearchRun {
...
@@ -18,21 +19,28 @@ public class HotSearchRun {
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
ApplicationContext
context
=
new
ClassPathXmlApplicationContext
(
"applicationContext.xml"
);
ApplicationContext
context
=
new
ClassPathXmlApplicationContext
(
"applicationContext.xml"
);
// SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
// SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
// .group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
// .group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
// ProxyFactory.init(simpleConfig);
// ProxyFactory.init(simpleConfig);
ApplicationConfig
applicationConfig
=
new
ApplicationConfig
();
ApplicationConfig
applicationConfig
=
new
ApplicationConfig
();
applicationConfig
.
setName
(
"
xxx
-project"
);
applicationConfig
.
setName
(
"
hot_search
-project"
);
RegistryConfig
registryConfig
=
new
RegistryConfig
();
RegistryConfig
registryConfig
=
new
RegistryConfig
();
registryConfig
.
setAddress
(
"zookeeper://192.168.0.30:2181?timeout=30000"
);
ConsumerConfig
consumerConfig
=
new
ConsumerConfig
();
ConsumerConfig
consumerConfig
=
new
ConsumerConfig
();
String
username
=
null
;
String
password
=
null
;
if
(
ProxyConfig
.
isLocal
)
{
registryConfig
.
setAddress
(
ProxyConfig
.
localRegistry
);
// 设置分组
// 设置分组
consumerConfig
.
setGroup
(
"test"
);
consumerConfig
.
setGroup
(
ProxyConfig
.
localGroup
);
String
username
=
"your cool username"
;
username
=
ProxyConfig
.
localUsername
;
String
password
=
"your cool password"
;
password
=
ProxyConfig
.
localPassword
;
}
else
{
registryConfig
.
setAddress
(
ProxyConfig
.
hangzhouRegistry
);
// 设置分组
consumerConfig
.
setGroup
(
ProxyConfig
.
hangzhouGroup
);
username
=
ProxyConfig
.
hangzhouUsername
;
password
=
ProxyConfig
.
hangzhouPassword
;
}
// 创建 consumer,applicationConfig 非必需参数
// 创建 consumer,applicationConfig 非必需参数
CynomysConsumer
consumer
=
CynomysConsumerFactory
.
create
(
applicationConfig
,
registryConfig
,
consumerConfig
,
username
,
password
);
CynomysConsumer
consumer
=
CynomysConsumerFactory
.
create
(
applicationConfig
,
registryConfig
,
consumerConfig
,
username
,
password
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
View file @
1d1a7503
...
@@ -113,8 +113,13 @@ public class GatherTimer {
...
@@ -113,8 +113,13 @@ public class GatherTimer {
log
.
info
(
"{}, 今日头条此轮采集到的数据量为:{}"
,
new
Date
(),
toutiaoList
!=
null
?
toutiaoList
.
size
()
:
0
);
log
.
info
(
"{}, 今日头条此轮采集到的数据量为:{}"
,
new
Date
(),
toutiaoList
!=
null
?
toutiaoList
.
size
()
:
0
);
TipsUtils
.
addHotList
(
HotSearchType
.
今日头条热搜
.
name
(),
toutiaoList
);
TipsUtils
.
addHotList
(
HotSearchType
.
今日头条热搜
.
name
(),
toutiaoList
);
log
.
info
(
"今日头条热搜采集结束..."
);
log
.
info
(
"今日头条热搜采集结束..."
);
log
.
info
(
"今日头条热搜详情趋势阅读量更新..."
);
//暂停今日头条阅读量更新
TouTiaoExecutor
.
countTouTiaoReadCount
(
toutiaoList
);
// log.info("今日头条热搜详情趋势阅读量更新开始...");
// //TouTiaoExecutor.countTouTiaoReadCount(toutiaoList);
// for (HotSearchList hotSearchList : toutiaoList) {
// ToutiaoHotSearchCrawler.toutiaoReadCount(hotSearchList);
// }
// log.info("今日头条热搜详情趋势阅读量更新结束...");
}
}
/**
/**
...
@@ -362,7 +367,7 @@ public class GatherTimer {
...
@@ -362,7 +367,7 @@ public class GatherTimer {
* 知乎热搜数码分类采集
* 知乎热搜数码分类采集
*/
*/
@Async
(
value
=
"myScheduler"
)
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"20 * * * * ? "
)
//
@Scheduled(cron = "20 * * * * ? ")
public
void
crawlerZhiHuDigital
(){
public
void
crawlerZhiHuDigital
(){
this
.
crawlerZhiHuChild
(
DIGITAL
);
this
.
crawlerZhiHuChild
(
DIGITAL
);
}
}
...
@@ -428,7 +433,7 @@ public class GatherTimer {
...
@@ -428,7 +433,7 @@ public class GatherTimer {
* 微博超话的采集
* 微博超话的采集
*/
*/
@Async
(
value
=
"myScheduler"
)
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"0 0 0/3 * * ? "
)
//
@Scheduled(cron = "0 0 0/3 * * ? ")
public
void
crawlerWeiBoSuperTopic
(){
public
void
crawlerWeiBoSuperTopic
(){
log
.
info
(
"微博超话采集开始........"
);
log
.
info
(
"微博超话采集开始........"
);
Date
date
=
DateUtils
.
getMillSecondTime
(
new
Date
());
Date
date
=
DateUtils
.
getMillSecondTime
(
new
Date
());
...
@@ -675,7 +680,7 @@ public class GatherTimer {
...
@@ -675,7 +680,7 @@ public class GatherTimer {
*微博热词采集
*微博热词采集
*/
*/
@Async
(
value
=
"myScheduler"
)
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"0 0 0/1 * * ? "
)
//
@Scheduled(cron = "0 0 0/1 * * ? ")
public
void
WeiBoSearchHotWordsCrawler
(){
public
void
WeiBoSearchHotWordsCrawler
(){
log
.
info
(
"微博热词采集开始........"
);
log
.
info
(
"微博热词采集开始........"
);
Date
date
=
DateUtils
.
getMillSecondTime
(
new
Date
());
Date
date
=
DateUtils
.
getMillSecondTime
(
new
Date
());
...
...
src/main/resources/proxyip.properties
View file @
1d1a7503
isLocal
=
tru
e
isLocal
=
fals
e
hangzhou.registry
=
zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182&timeout=60000
hangzhou.registry
=
zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182&timeout=60000
hangzhou.group
=
hangzhou
hangzhou.group
=
hangzhou
...
@@ -7,5 +7,7 @@ hangzhou.password=gRG9QJ6QghuLcCC9
...
@@ -7,5 +7,7 @@ hangzhou.password=gRG9QJ6QghuLcCC9
########################################################
########################################################
local.registry
=
zookeeper://192.168.0.35:2181?backup=192.168.0.30:2181,192.168.0.11:2181&timeout=60000
local.registry
=
zookeeper://192.168.0.35:2181?backup=192.168.0.30:2181,192.168.0.11:2181&timeout=60000
local.group
=
local
local.group
=
local
#local.username=15139460980
#local.password=lllq2w3e4r
local.username
=
15757871020
local.username
=
15757871020
local.password
=
Cwt1q2w3e4r@
local.password
=
Cwt1q2w3e4r@
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment