Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
1d1a7503
Commit
1d1a7503
authored
Nov 08, 2022
by
leiliangliang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
暂停知乎数码,微博超话,微博热词定时任务及头条阅读量更新
parent
2450a48a
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
47 additions
and
37 deletions
+47
-37
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
+17
-22
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
+17
-9
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+10
-5
src/main/resources/proxyip.properties
+3
-1
No files found.
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
View file @
1d1a7503
...
...
@@ -166,34 +166,29 @@ public class ToutiaoHotSearchCrawler {
String
htmlBody
=
null
;
String
url
=
hotSearchList
.
getUrl
();
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
i
=
0
;
i
<=
5
;
i
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyServerSupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析今日头条热搜详情页面出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Elements
elements
=
document
.
select
(
".result-content .cs-view .cs-topone-tail .cs-view .margin-bottom-m .margin-left-m"
);
if
(
Objects
.
nonNull
(
elements
)
&&
!
elements
.
isEmpty
())
{
Element
element
=
elements
.
first
();
String
readCount
=
element
.
text
().
replaceAll
(
"阅读"
,
""
);
Long
count
=
TipsUtils
.
getHotCount
(
readCount
);
log
.
info
(
"{},阅读量:{}"
,
hotSearchList
.
getName
(),
count
);
hotSearchList
.
setCommentCount
(
count
);
hotSearchListDAO
.
updateTouTiaoReadCount
(
hotSearchList
);
return
hotSearchList
;
}
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyServerSupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析今日头条热搜详情页面出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)&&
htmlBody
.
contains
(
"data"
))
{
try
{
String
substring
=
htmlBody
.
substring
(
htmlBody
.
indexOf
(
"read_count"
)+
12
,
htmlBody
.
indexOf
(
"search_bar_controll"
));
String
s
=
substring
.
split
(
","
)[
0
];
Long
commentCount
=
Long
.
valueOf
(
s
);
hotSearchList
.
setCommentCount
(
commentCount
);
hotSearchListDAO
.
updateTouTiaoReadCount
(
hotSearchList
);
return
hotSearchList
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
ZhiWeiTools
.
sleep
(
1000L
);
}
}
return
hotSearchList
;
}
/**
* 热搜类型
*
...
...
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
View file @
1d1a7503
...
...
@@ -4,6 +4,7 @@ package com.zhiwei.searchhotcrawler.run;
import
com.zhiwei.http.proxy.CynomysFactory
;
import
com.zhiwei.network.cynomys.consumer.CynomysConsumer
;
import
com.zhiwei.network.cynomys.consumer.CynomysConsumerFactory
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.timer.*
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.dubbo.config.ApplicationConfig
;
...
...
@@ -18,21 +19,28 @@ public class HotSearchRun {
public
static
void
main
(
String
[]
args
)
{
ApplicationContext
context
=
new
ClassPathXmlApplicationContext
(
"applicationContext.xml"
);
// SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
// .group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
// ProxyFactory.init(simpleConfig);
ApplicationConfig
applicationConfig
=
new
ApplicationConfig
();
applicationConfig
.
setName
(
"
xxx
-project"
);
applicationConfig
.
setName
(
"
hot_search
-project"
);
RegistryConfig
registryConfig
=
new
RegistryConfig
();
registryConfig
.
setAddress
(
"zookeeper://192.168.0.30:2181?timeout=30000"
);
ConsumerConfig
consumerConfig
=
new
ConsumerConfig
();
// 设置分组
consumerConfig
.
setGroup
(
"test"
);
String
username
=
"your cool username"
;
String
password
=
"your cool password"
;
String
username
=
null
;
String
password
=
null
;
if
(
ProxyConfig
.
isLocal
)
{
registryConfig
.
setAddress
(
ProxyConfig
.
localRegistry
);
// 设置分组
consumerConfig
.
setGroup
(
ProxyConfig
.
localGroup
);
username
=
ProxyConfig
.
localUsername
;
password
=
ProxyConfig
.
localPassword
;
}
else
{
registryConfig
.
setAddress
(
ProxyConfig
.
hangzhouRegistry
);
// 设置分组
consumerConfig
.
setGroup
(
ProxyConfig
.
hangzhouGroup
);
username
=
ProxyConfig
.
hangzhouUsername
;
password
=
ProxyConfig
.
hangzhouPassword
;
}
// 创建 consumer,applicationConfig 非必需参数
CynomysConsumer
consumer
=
CynomysConsumerFactory
.
create
(
applicationConfig
,
registryConfig
,
consumerConfig
,
username
,
password
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
View file @
1d1a7503
...
...
@@ -113,8 +113,13 @@ public class GatherTimer {
log
.
info
(
"{}, 今日头条此轮采集到的数据量为:{}"
,
new
Date
(),
toutiaoList
!=
null
?
toutiaoList
.
size
()
:
0
);
TipsUtils
.
addHotList
(
HotSearchType
.
今日头条热搜
.
name
(),
toutiaoList
);
log
.
info
(
"今日头条热搜采集结束..."
);
log
.
info
(
"今日头条热搜详情趋势阅读量更新..."
);
TouTiaoExecutor
.
countTouTiaoReadCount
(
toutiaoList
);
//暂停今日头条阅读量更新
// log.info("今日头条热搜详情趋势阅读量更新开始...");
// //TouTiaoExecutor.countTouTiaoReadCount(toutiaoList);
// for (HotSearchList hotSearchList : toutiaoList) {
// ToutiaoHotSearchCrawler.toutiaoReadCount(hotSearchList);
// }
// log.info("今日头条热搜详情趋势阅读量更新结束...");
}
/**
...
...
@@ -362,7 +367,7 @@ public class GatherTimer {
* 知乎热搜数码分类采集
*/
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"20 * * * * ? "
)
//
@Scheduled(cron = "20 * * * * ? ")
public
void
crawlerZhiHuDigital
(){
this
.
crawlerZhiHuChild
(
DIGITAL
);
}
...
...
@@ -428,7 +433,7 @@ public class GatherTimer {
* 微博超话的采集
*/
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"0 0 0/3 * * ? "
)
//
@Scheduled(cron = "0 0 0/3 * * ? ")
public
void
crawlerWeiBoSuperTopic
(){
log
.
info
(
"微博超话采集开始........"
);
Date
date
=
DateUtils
.
getMillSecondTime
(
new
Date
());
...
...
@@ -675,7 +680,7 @@ public class GatherTimer {
*微博热词采集
*/
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"0 0 0/1 * * ? "
)
//
@Scheduled(cron = "0 0 0/1 * * ? ")
public
void
WeiBoSearchHotWordsCrawler
(){
log
.
info
(
"微博热词采集开始........"
);
Date
date
=
DateUtils
.
getMillSecondTime
(
new
Date
());
...
...
src/main/resources/proxyip.properties
View file @
1d1a7503
isLocal
=
tru
e
isLocal
=
fals
e
hangzhou.registry
=
zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182&timeout=60000
hangzhou.group
=
hangzhou
...
...
@@ -7,5 +7,7 @@ hangzhou.password=gRG9QJ6QghuLcCC9
########################################################
local.registry
=
zookeeper://192.168.0.35:2181?backup=192.168.0.30:2181,192.168.0.11:2181&timeout=60000
local.group
=
local
#local.username=15139460980
#local.password=lllq2w3e4r
local.username
=
15757871020
local.password
=
Cwt1q2w3e4r@
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment