Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
f671bae7
Commit
f671bae7
authored
Aug 14, 2020
by
马黎滨
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'mlbWork' into 'master'
定时器采集 See merge request
!23
parents
d5791e24
f3e0b6c2
Hide whitespace changes
Inline
Side-by-side
Showing
33 changed files
with
1036 additions
and
409 deletions
+1036
-409
pom.xml
+57
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
+9
-9
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
+5
-8
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
+6
-3
src/main/java/com/zhiwei/searchhotcrawler/crawler/FengHuangSearchCrawler.java
+4
-4
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
+3
-7
src/main/java/com/zhiwei/searchhotcrawler/crawler/SouhuTopicCrawler.java
+10
-3
src/main/java/com/zhiwei/searchhotcrawler/crawler/TengXunCrawler.java
+2
-2
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
+2
-2
src/main/java/com/zhiwei/searchhotcrawler/crawler/WangYiHotSearchCrawler.java
+4
-4
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+56
-56
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
+4
-4
src/main/java/com/zhiwei/searchhotcrawler/crawler/XinLangHotSearchCrawler.java
+10
-4
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuChildHotSearchCrawler.java
+3
-6
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
+39
-38
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuTopicSearchCrawler.java
+4
-6
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+5
-2
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
+16
-13
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
+27
-26
src/main/java/com/zhiwei/searchhotcrawler/timer/DouYinUrlHotSearchRun.java
+19
-19
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
+17
-17
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
+13
-13
src/main/java/com/zhiwei/searchhotcrawler/timer/ThreadOneRun.java
+17
-17
src/main/java/com/zhiwei/searchhotcrawler/timer/ToutiaoHotSearchRun.java
+13
-13
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboHotSearchRun.java
+11
-11
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboSuperTopicRun.java
+22
-22
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboTopicRun.java
+29
-29
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuChildHotSearchRun.java
+44
-44
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
+14
-14
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuTopSearchRun.java
+13
-13
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+354
-0
src/main/java/com/zhiwei/searchhotcrawler/util/DateUtils.java
+180
-0
src/main/resources/applicationContext.xml
+24
-0
No files found.
pom.xml
View file @
f671bae7
...
@@ -10,6 +10,7 @@
...
@@ -10,6 +10,7 @@
<properties>
<properties>
<project.build.sourceEncoding>
UTF-8
</project.build.sourceEncoding>
<project.build.sourceEncoding>
UTF-8
</project.build.sourceEncoding>
<project.reporting.outputEncoding>
UTF-8
</project.reporting.outputEncoding>
<project.reporting.outputEncoding>
UTF-8
</project.reporting.outputEncoding>
<spring.version>
4.2.2.RELEASE
</spring.version>
</properties>
</properties>
<developers>
<developers>
...
@@ -55,6 +56,62 @@
...
@@ -55,6 +56,62 @@
<artifactId>
crawler-core
</artifactId>
<artifactId>
crawler-core
</artifactId>
<version>
0.6.0.4-RELEASE
</version>
<version>
0.6.0.4-RELEASE
</version>
</dependency>
</dependency>
<dependency>
<groupId>
org.quartz-scheduler
</groupId>
<artifactId>
quartz
</artifactId>
<version>
${quartz.version}
</version>
</dependency>
<dependency>
<groupId>
org.quartz-scheduler
</groupId>
<artifactId>
quartz-jobs
</artifactId>
<version>
${quartz.version}
</version>
</dependency>
<!-- Spring文件配置 -->
<dependency>
<groupId>
org.springframework
</groupId>
<artifactId>
spring-aop
</artifactId>
<version>
${spring.version}
</version>
</dependency>
<dependency>
<groupId>
org.springframework
</groupId>
<artifactId>
spring-beans
</artifactId>
<version>
${spring.version}
</version>
</dependency>
<dependency>
<groupId>
org.springframework
</groupId>
<artifactId>
spring-core
</artifactId>
<version>
${spring.version}
</version>
</dependency>
<dependency>
<groupId>
org.springframework
</groupId>
<artifactId>
spring-test
</artifactId>
<version>
${spring.version}
</version>
</dependency>
<dependency>
<groupId>
org.springframework
</groupId>
<artifactId>
spring-context
</artifactId>
<version>
${spring.version}
</version>
</dependency>
<dependency>
<groupId>
org.springframework
</groupId>
<artifactId>
spring-expression
</artifactId>
<version>
${spring.version}
</version>
</dependency>
<dependency>
<groupId>
org.springframework
</groupId>
<artifactId>
spring-context-support
</artifactId>
<version>
${spring.version}
</version>
</dependency>
<dependency>
<groupId>
org.springframework
</groupId>
<artifactId>
spring-web
</artifactId>
<version>
${spring.version}
</version>
</dependency>
<dependency>
<groupId>
org.springframework
</groupId>
<artifactId>
spring-tx
</artifactId>
<version>
${spring.version}
</version>
</dependency>
</dependencies>
</dependencies>
<build>
<build>
...
...
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
View file @
f671bae7
...
@@ -81,42 +81,42 @@ public class HotSearchList implements Serializable{
...
@@ -81,42 +81,42 @@ public class HotSearchList implements Serializable{
public
HotSearchList
(){}
public
HotSearchList
(){}
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
String
icon
){
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
String
icon
,
Date
date
){
this
.
id
=
name
+
"_"
+
new
Date
().
getTime
()
+
"_"
+
type
;
this
.
id
=
name
+
"_"
+
new
Date
().
getTime
()
+
"_"
+
type
;
this
.
url
=
url
;
this
.
url
=
url
;
this
.
name
=
name
;
this
.
name
=
name
;
this
.
count
=
count
;
this
.
count
=
count
;
this
.
hot
=
hot
;
this
.
hot
=
hot
;
this
.
rank
=
rank
;
this
.
rank
=
rank
;
this
.
time
=
new
Date
()
;
this
.
time
=
date
;
this
.
day
=
TimeParse
.
dateFormartString
(
new
Date
()
,
"yyyy-MM-dd"
);
this
.
day
=
TimeParse
.
dateFormartString
(
date
,
"yyyy-MM-dd"
);
this
.
type
=
type
;
this
.
type
=
type
;
this
.
icon
=
icon
;
this
.
icon
=
icon
;
}
}
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Integer
rank
,
String
type
){
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Integer
rank
,
String
type
,
Date
date
){
this
.
id
=
name
+
"_"
+
new
Date
().
getTime
()+
"_"
+
type
;
this
.
id
=
name
+
"_"
+
new
Date
().
getTime
()+
"_"
+
type
;
this
.
url
=
url
;
this
.
url
=
url
;
this
.
name
=
name
;
this
.
name
=
name
;
this
.
count
=
count
;
this
.
count
=
count
;
this
.
hot
=
true
;
this
.
hot
=
true
;
this
.
rank
=
rank
;
this
.
rank
=
rank
;
this
.
time
=
new
Date
()
;
this
.
time
=
date
;
this
.
day
=
TimeParse
.
dateFormartString
(
new
Date
()
,
"yyyy-MM-dd"
);
this
.
day
=
TimeParse
.
dateFormartString
(
date
,
"yyyy-MM-dd"
);
this
.
type
=
type
;
this
.
type
=
type
;
}
}
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Integer
rank
,
String
type
,
Integer
commentCount
,
String
topicLead
){
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Integer
rank
,
String
type
,
Integer
commentCount
,
String
topicLead
,
Date
date
){
this
.
id
=
name
+
"_"
+
new
Date
().
getTime
()+
"_"
+
type
;
this
.
id
=
name
+
"_"
+
new
Date
().
getTime
()+
"_"
+
type
;
this
.
url
=
url
;
this
.
url
=
url
;
this
.
name
=
name
;
this
.
name
=
name
;
this
.
count
=
count
;
this
.
count
=
count
;
this
.
hot
=
true
;
this
.
hot
=
true
;
this
.
rank
=
rank
;
this
.
rank
=
rank
;
this
.
time
=
new
Date
()
;
this
.
time
=
date
;
this
.
day
=
TimeParse
.
dateFormartString
(
new
Date
()
,
"yyyy-MM-dd"
);
this
.
day
=
TimeParse
.
dateFormartString
(
date
,
"yyyy-MM-dd"
);
this
.
type
=
type
;
this
.
type
=
type
;
this
.
commentCount
=
commentCount
;
this
.
commentCount
=
commentCount
;
this
.
topicLead
=
topicLead
;
this
.
topicLead
=
topicLead
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
View file @
f671bae7
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.time.Duration
;
import
java.time.Duration
;
import
java.util.ArrayList
;
import
java.util.*
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Objects
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
...
@@ -39,7 +36,7 @@ public class BaiDuHotSearchCrawler {
...
@@ -39,7 +36,7 @@ public class BaiDuHotSearchCrawler {
* @Description: PC端百度风云榜采集
* @Description: PC端百度风云榜采集
* @return void 返回类型
* @return void 返回类型
*/
*/
public
static
List
<
HotSearchList
>
baiduHotSearch
()
{
public
static
List
<
HotSearchList
>
baiduHotSearch
(
Date
date
)
{
String
url
=
"http://top.baidu.com/buzz?b=1&fr=topindex"
;
String
url
=
"http://top.baidu.com/buzz?b=1&fr=topindex"
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
...
@@ -49,7 +46,7 @@ public class BaiDuHotSearchCrawler {
...
@@ -49,7 +46,7 @@ public class BaiDuHotSearchCrawler {
log
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
e
);
log
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
e
);
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"mainBody"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"mainBody"
))
{
return
ansysData
(
htmlBody
);
return
ansysData
(
htmlBody
,
date
);
}
else
{
}
else
{
log
.
info
(
"解析百度风云榜时出现解析错误,页面结构有问题"
);
log
.
info
(
"解析百度风云榜时出现解析错误,页面结构有问题"
);
}
}
...
@@ -62,7 +59,7 @@ public class BaiDuHotSearchCrawler {
...
@@ -62,7 +59,7 @@ public class BaiDuHotSearchCrawler {
* @param htmlBody
* @param htmlBody
* @return
* @return
*/
*/
private
static
List
<
HotSearchList
>
ansysData
(
String
htmlBody
){
private
static
List
<
HotSearchList
>
ansysData
(
String
htmlBody
,
Date
date
){
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
try
{
try
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
...
@@ -110,7 +107,7 @@ public class BaiDuHotSearchCrawler {
...
@@ -110,7 +107,7 @@ public class BaiDuHotSearchCrawler {
log
.
info
(
hot
);
log
.
info
(
hot
);
log
.
info
(
element
);
log
.
info
(
element
);
}
else
{
}
else
{
HotSearchList
hotSearch
=
new
HotSearchList
(
everurl
,
kw
,
count
,
rank
,
HotSearchType
.
百度热搜
.
name
());
HotSearchList
hotSearch
=
new
HotSearchList
(
everurl
,
kw
,
count
,
rank
,
HotSearchType
.
百度热搜
.
name
()
,
date
);
list
.
add
(
hotSearch
);
list
.
add
(
hotSearch
);
}
}
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
View file @
f671bae7
...
@@ -2,6 +2,7 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -2,6 +2,7 @@ package com.zhiwei.searchhotcrawler.crawler;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
...
@@ -31,6 +32,8 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
...
@@ -31,6 +32,8 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
public
class
DouyinHotSearchCrawler
{
public
class
DouyinHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
build
();
public
static
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
/**
/**
* @Title: getMobileDouyinHotList
* @Title: getMobileDouyinHotList
...
@@ -39,8 +42,8 @@ public class DouyinHotSearchCrawler {
...
@@ -39,8 +42,8 @@ public class DouyinHotSearchCrawler {
* @param @return 设定文件
* @param @return 设定文件
* @return List<ZhihuHotSearch> 返回类型
* @return List<ZhihuHotSearch> 返回类型
*/
*/
public
static
List
<
HotSearchList
>
getMobileDouyinHotList
(){
public
static
List
<
HotSearchList
>
getMobileDouyinHotList
(
Date
date
){
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
//
List<HotSearchList> list = new ArrayList<>();
String
url
=
"https://api.amemv.com/aweme/v1/hot/search/list/"
;
String
url
=
"https://api.amemv.com/aweme/v1/hot/search/list/"
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
...
@@ -69,7 +72,7 @@ public class DouyinHotSearchCrawler {
...
@@ -69,7 +72,7 @@ public class DouyinHotSearchCrawler {
Integer
hotValue
=
null
;
Integer
hotValue
=
null
;
hotValue
=
Integer
.
valueOf
(
hotValueStr
);
hotValue
=
Integer
.
valueOf
(
hotValueStr
);
// logger.info("热度为:::{}", hot_value);
// logger.info("热度为:::{}", hot_value);
HotSearchList
douyin
=
new
HotSearchList
(
null
,
word
,
hotValue
,
position
,
HotSearchType
.
抖音热搜
.
name
());
HotSearchList
douyin
=
new
HotSearchList
(
null
,
word
,
hotValue
,
position
,
HotSearchType
.
抖音热搜
.
name
()
,
date
);
list
.
add
(
douyin
);
list
.
add
(
douyin
);
}
}
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/FengHuangSearchCrawler.java
View file @
f671bae7
...
@@ -26,7 +26,7 @@ public class FengHuangSearchCrawler {
...
@@ -26,7 +26,7 @@ public class FengHuangSearchCrawler {
* 获取凤凰新闻热榜
* 获取凤凰新闻热榜
* @return
* @return
*/
*/
public
static
List
<
HotSearchList
>
getFengHuangHot
List
(
){
public
static
List
<
HotSearchList
>
getFengHuangHot
Data
(
Date
date
){
log
.
info
(
"凤凰新闻热榜开始采集"
);
log
.
info
(
"凤凰新闻热榜开始采集"
);
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
for
(
int
page
=
1
;
page
<=
2
;
page
++)
{
for
(
int
page
=
1
;
page
<=
2
;
page
++)
{
...
@@ -50,7 +50,7 @@ public class FengHuangSearchCrawler {
...
@@ -50,7 +50,7 @@ public class FengHuangSearchCrawler {
Integer
count
=
TipsUtils
.
getHotCount
(
hotValue
);
Integer
count
=
TipsUtils
.
getHotCount
(
hotValue
);
Integer
commentCount
=
jsonArray
.
getJSONObject
(
i
).
getIntValue
(
"commentsall"
);
Integer
commentCount
=
jsonArray
.
getJSONObject
(
i
).
getIntValue
(
"commentsall"
);
HotSearchList
hotSearchList
=
new
HotSearchList
(
fenghuangUrl
,
name
,
count
,
HotSearchList
hotSearchList
=
new
HotSearchList
(
fenghuangUrl
,
name
,
count
,
rank
,
HotSearchType
.
凤凰新闻热榜
.
name
(),
commentCount
,
topicLead
);
rank
,
HotSearchType
.
凤凰新闻热榜
.
name
(),
commentCount
,
topicLead
,
date
);
list
.
add
(
hotSearchList
);
list
.
add
(
hotSearchList
);
}
}
}
}
...
@@ -65,7 +65,7 @@ public class FengHuangSearchCrawler {
...
@@ -65,7 +65,7 @@ public class FengHuangSearchCrawler {
* 获取凤凰新闻热搜
* 获取凤凰新闻热搜
* @return
* @return
*/
*/
public
static
List
<
HotSearchList
>
getFengHuangHot
Data
(
){
public
static
List
<
HotSearchList
>
getFengHuangHot
Search
(
Date
date
){
log
.
info
(
"凤凰新闻热搜开始采集"
);
log
.
info
(
"凤凰新闻热搜开始采集"
);
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
String
url
=
"https://shankapi.ifeng.com/autumn/sogouSearchHotword"
;
String
url
=
"https://shankapi.ifeng.com/autumn/sogouSearchHotword"
;
...
@@ -89,7 +89,7 @@ public class FengHuangSearchCrawler {
...
@@ -89,7 +89,7 @@ public class FengHuangSearchCrawler {
fenghuangUrl
=
"https://so.ifeng.com/?q="
+
id
;
fenghuangUrl
=
"https://so.ifeng.com/?q="
+
id
;
}
}
HotSearchList
hotSearchList
=
new
HotSearchList
(
fenghuangUrl
,
name
,
null
,
rank
,
HotSearchList
hotSearchList
=
new
HotSearchList
(
fenghuangUrl
,
name
,
null
,
rank
,
HotSearchType
.
凤凰新闻热搜
.
name
());
HotSearchType
.
凤凰新闻热搜
.
name
()
,
date
);
list
.
add
(
hotSearchList
);
list
.
add
(
hotSearchList
);
}
}
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
View file @
f671bae7
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.*
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
...
@@ -41,7 +37,7 @@ public class SougoHotSearchCrawler {
...
@@ -41,7 +37,7 @@ public class SougoHotSearchCrawler {
* @Description: TODO(PC端搜狗微信关键词采集)
* @Description: TODO(PC端搜狗微信关键词采集)
* @return void 返回类型
* @return void 返回类型
*/
*/
public
static
List
<
HotSearchList
>
sougoHotSearch
()
{
public
static
List
<
HotSearchList
>
sougoHotSearch
(
Date
date
)
{
String
url
=
"https://weixin.sogou.com"
;
String
url
=
"https://weixin.sogou.com"
;
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
Map
<
String
,
String
>
headMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headMap
=
HeaderTool
.
getCommonHead
();
...
@@ -75,7 +71,7 @@ public class SougoHotSearchCrawler {
...
@@ -75,7 +71,7 @@ public class SougoHotSearchCrawler {
String
everurl
=
element
.
select
(
"li"
).
select
(
"a"
).
attr
(
"href"
);
String
everurl
=
element
.
select
(
"li"
).
select
(
"a"
).
attr
(
"href"
);
HotSearchList
hotSearch
=
new
HotSearchList
(
everurl
,
kw
,
null
,
rank
,
HotSearchType
.
搜狗微信热搜
.
name
());
HotSearchList
hotSearch
=
new
HotSearchList
(
everurl
,
kw
,
null
,
rank
,
HotSearchType
.
搜狗微信热搜
.
name
()
,
date
);
if
(
Objects
.
nonNull
(
rank
))
{
if
(
Objects
.
nonNull
(
rank
))
{
list
.
add
(
hotSearch
);
list
.
add
(
hotSearch
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/SouhuTopicCrawler.java
View file @
f671bae7
...
@@ -15,6 +15,7 @@ import org.apache.commons.lang3.StringUtils;
...
@@ -15,6 +15,7 @@ import org.apache.commons.lang3.StringUtils;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
...
@@ -22,7 +23,7 @@ import java.util.List;
...
@@ -22,7 +23,7 @@ import java.util.List;
public
class
SouhuTopicCrawler
{
public
class
SouhuTopicCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
public
static
List
<
HotSearchList
>
getSouhuTopic
(){
public
static
List
<
HotSearchList
>
getSouhuTopic
(
Date
date
){
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
log
.
info
(
"搜狐话题榜开始采集..."
);
log
.
info
(
"搜狐话题榜开始采集..."
);
JSONArray
dataJson
=
null
;
JSONArray
dataJson
=
null
;
...
@@ -43,10 +44,16 @@ public class SouhuTopicCrawler {
...
@@ -43,10 +44,16 @@ public class SouhuTopicCrawler {
Integer
rank
=
i
+
1
;
Integer
rank
=
i
+
1
;
String
name
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"eventNewsInfo"
).
getString
(
"title"
);
String
name
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"eventNewsInfo"
).
getString
(
"title"
);
String
hotValue
=
dataJson
.
getJSONObject
(
i
).
getString
(
"value"
);
String
hotValue
=
dataJson
.
getJSONObject
(
i
).
getString
(
"value"
);
Integer
count
=
TipsUtils
.
getHotCount
(
hotValue
.
substring
(
0
,
hotValue
.
indexOf
(
"观点"
)));
Integer
count
=
0
;
if
(
hotValue
.
contains
(
"观点"
))
{
count
=
TipsUtils
.
getHotCount
(
hotValue
.
substring
(
0
,
hotValue
.
indexOf
(
"观点"
)));
}
else
{
log
.
error
(
"搜狐话题采集热度为空,采集结束"
);
return
Collections
.
emptyList
();
}
String
souguUrl
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"eventNewsInfo"
).
getString
(
"h5Link"
);
String
souguUrl
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"eventNewsInfo"
).
getString
(
"h5Link"
);
String
icon
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"attrInfo"
).
getString
(
"displayText"
);
String
icon
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"attrInfo"
).
getString
(
"displayText"
);
HotSearchList
hotSearchList
=
new
HotSearchList
(
souguUrl
,
name
,
count
,
true
,
rank
,
HotSearchType
.
搜狐话题
.
name
(),
icon
);
HotSearchList
hotSearchList
=
new
HotSearchList
(
souguUrl
,
name
,
count
,
true
,
rank
,
HotSearchType
.
搜狐话题
.
name
(),
icon
,
date
);
hotSearchLists
.
add
(
hotSearchList
);
hotSearchLists
.
add
(
hotSearchList
);
}
}
log
.
info
(
"{}, 此轮搜狐话题榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
hotSearchLists
!=
null
?
hotSearchLists
.
size
()
:
0
));
log
.
info
(
"{}, 此轮搜狐话题榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
hotSearchLists
!=
null
?
hotSearchLists
.
size
()
:
0
));
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/TengXunCrawler.java
View file @
f671bae7
...
@@ -26,7 +26,7 @@ public class TengXunCrawler {
...
@@ -26,7 +26,7 @@ public class TengXunCrawler {
* 腾讯热榜数据采集
* 腾讯热榜数据采集
* @return
* @return
*/
*/
public
static
List
<
HotSearchList
>
getTengXunHotList
()
{
public
static
List
<
HotSearchList
>
getTengXunHotList
(
Date
date
)
{
log
.
info
(
"腾讯新闻热榜开始采集..."
);
log
.
info
(
"腾讯新闻热榜开始采集..."
);
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
JSONArray
dataJson
=
null
;
JSONArray
dataJson
=
null
;
...
@@ -68,7 +68,7 @@ public class TengXunCrawler {
...
@@ -68,7 +68,7 @@ public class TengXunCrawler {
icon
=
"新"
;
icon
=
"新"
;
}
}
}
}
HotSearchList
hotSearchList
=
new
HotSearchList
(
tengxunUrl
,
name
,
count
,
false
,
rank
,
HotSearchType
.
腾讯新闻
.
name
(),
icon
);
HotSearchList
hotSearchList
=
new
HotSearchList
(
tengxunUrl
,
name
,
count
,
false
,
rank
,
HotSearchType
.
腾讯新闻
.
name
(),
icon
,
date
);
list
.
add
(
hotSearchList
);
list
.
add
(
hotSearchList
);
}
}
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
View file @
f671bae7
...
@@ -36,7 +36,7 @@ public class ToutiaoHotSearchCrawler {
...
@@ -36,7 +36,7 @@ public class ToutiaoHotSearchCrawler {
* @Description: TODO(手机端Iphone 微博热搜采集)
* @Description: TODO(手机端Iphone 微博热搜采集)
* @return void 返回类型
* @return void 返回类型
*/
*/
public
static
List
<
HotSearchList
>
toutiaoHotSearchByPhone
(){
public
static
List
<
HotSearchList
>
toutiaoHotSearchByPhone
(
Date
date
){
String
origin
=
"hot_board"
;
String
origin
=
"hot_board"
;
String
jsUrl
=
"https://s3.pstatp.com/toutiao/feoffline/hot_list/resource/hot_list/js/index.45f50250.chunk.js"
;
String
jsUrl
=
"https://s3.pstatp.com/toutiao/feoffline/hot_list/resource/hot_list/js/index.45f50250.chunk.js"
;
Request
jsRequest
=
RequestUtils
.
wrapGet
(
jsUrl
);
Request
jsRequest
=
RequestUtils
.
wrapGet
(
jsUrl
);
...
@@ -77,7 +77,7 @@ public class ToutiaoHotSearchCrawler {
...
@@ -77,7 +77,7 @@ public class ToutiaoHotSearchCrawler {
String
wordsType
=
word
.
getString
(
"Label"
);
String
wordsType
=
word
.
getString
(
"Label"
);
String
icon
=
getIcon
(
wordsType
);
String
icon
=
getIcon
(
wordsType
);
HotSearchList
hotSearch
=
new
HotSearchList
(
link
,
name
,
hotCount
,
true
,
rank
,
HotSearchType
.
今日头条热搜
.
name
(),
icon
);
HotSearchList
hotSearch
=
new
HotSearchList
(
link
,
name
,
hotCount
,
true
,
rank
,
HotSearchType
.
今日头条热搜
.
name
(),
icon
,
date
);
result
.
add
(
hotSearch
);
result
.
add
(
hotSearch
);
rank
++;
rank
++;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WangYiHotSearchCrawler.java
View file @
f671bae7
...
@@ -32,7 +32,7 @@ public class WangYiHotSearchCrawler {
...
@@ -32,7 +32,7 @@ public class WangYiHotSearchCrawler {
* 网易新闻实时热榜的采集
* 网易新闻实时热榜的采集
* @return
* @return
*/
*/
public
static
List
<
HotSearchList
>
getWangYiHotSearch
(){
public
static
List
<
HotSearchList
>
getWangYiHotSearch
(
Date
date
){
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
log
.
info
(
"网易新闻实时热榜开始采集"
);
log
.
info
(
"网易新闻实时热榜开始采集"
);
String
url
=
"https://v6-gw.m.163.com/nc-main/api/v1/hqc/no-repeat-hot-list"
;
String
url
=
"https://v6-gw.m.163.com/nc-main/api/v1/hqc/no-repeat-hot-list"
;
...
@@ -54,7 +54,7 @@ public class WangYiHotSearchCrawler {
...
@@ -54,7 +54,7 @@ public class WangYiHotSearchCrawler {
int
count
=
jsonObject
.
getJSONObject
(
i
).
getIntValue
(
"hotValue"
);
int
count
=
jsonObject
.
getJSONObject
(
i
).
getIntValue
(
"hotValue"
);
String
contentId
=
jsonObject
.
getJSONObject
(
i
).
getString
(
"contentId"
);
String
contentId
=
jsonObject
.
getJSONObject
(
i
).
getString
(
"contentId"
);
String
wangyiUrl
=
"https://c.m.163.com/news/a/"
+
contentId
+
".html"
;
String
wangyiUrl
=
"https://c.m.163.com/news/a/"
+
contentId
+
".html"
;
HotSearchList
hotSearchList
=
new
HotSearchList
(
wangyiUrl
,
name
,
count
,
rank
,
HotSearchType
.
网易热榜
.
name
());
HotSearchList
hotSearchList
=
new
HotSearchList
(
wangyiUrl
,
name
,
count
,
rank
,
HotSearchType
.
网易热榜
.
name
()
,
date
);
hotSearchLists
.
add
(
hotSearchList
);
hotSearchLists
.
add
(
hotSearchList
);
}
}
log
.
info
(
"{}, 此轮网易新闻热榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
hotSearchLists
!=
null
?
hotSearchLists
.
size
()
:
0
));
log
.
info
(
"{}, 此轮网易新闻热榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
hotSearchLists
!=
null
?
hotSearchLists
.
size
()
:
0
));
...
@@ -71,7 +71,7 @@ public class WangYiHotSearchCrawler {
...
@@ -71,7 +71,7 @@ public class WangYiHotSearchCrawler {
* 网易新闻跟帖热议的采集
* 网易新闻跟帖热议的采集
* @return
* @return
*/
*/
public
static
List
<
HotSearchList
>
getWangYicomment
(){
public
static
List
<
HotSearchList
>
getWangYicomment
(
Date
date
){
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
log
.
info
(
"网易新闻跟贴热议开始采集"
);
log
.
info
(
"网易新闻跟贴热议开始采集"
);
String
url
=
"https://v6-gw.m.163.com/gentie-web/api/v2/products/a2869674571f77b5a0867c3d71db5856/rankDocs/all/list?ibc=newsapph5&limit=30"
;
String
url
=
"https://v6-gw.m.163.com/gentie-web/api/v2/products/a2869674571f77b5a0867c3d71db5856/rankDocs/all/list?ibc=newsapph5&limit=30"
;
...
@@ -93,7 +93,7 @@ public class WangYiHotSearchCrawler {
...
@@ -93,7 +93,7 @@ public class WangYiHotSearchCrawler {
int
count
=
jsonObject
.
getJSONObject
(
i
).
getIntValue
(
"hotScore"
)*
10000
;
int
count
=
jsonObject
.
getJSONObject
(
i
).
getIntValue
(
"hotScore"
)*
10000
;
String
contentId
=
jsonObject
.
getJSONObject
(
i
).
getString
(
"docId"
);
String
contentId
=
jsonObject
.
getJSONObject
(
i
).
getString
(
"docId"
);
String
wangyiUrl
=
"https://c.m.163.com/news/a/"
+
contentId
+
".html"
;
String
wangyiUrl
=
"https://c.m.163.com/news/a/"
+
contentId
+
".html"
;
HotSearchList
hotSearchList
=
new
HotSearchList
(
wangyiUrl
,
name
,
count
,
rank
,
HotSearchType
.
网易跟帖热议
.
name
());
HotSearchList
hotSearchList
=
new
HotSearchList
(
wangyiUrl
,
name
,
count
,
rank
,
HotSearchType
.
网易跟帖热议
.
name
()
,
date
);
hotSearchLists
.
add
(
hotSearchList
);
hotSearchLists
.
add
(
hotSearchList
);
}
}
log
.
info
(
"{}, 此轮网易新闻跟贴热议采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
hotSearchLists
!=
null
?
hotSearchLists
.
size
()
:
0
));
log
.
info
(
"{}, 此轮网易新闻跟贴热议采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
hotSearchLists
!=
null
?
hotSearchLists
.
size
()
:
0
));
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
f671bae7
...
@@ -40,60 +40,60 @@ public class WeiboHotSearchCrawler {
...
@@ -40,60 +40,60 @@ public class WeiboHotSearchCrawler {
* @Description: TODO(PC端微博热搜采集)
* @Description: TODO(PC端微博热搜采集)
* @return void 返回类型
* @return void 返回类型
*/
*/
public
static
List
<
HotSearchList
>
weiboHotSearch
(){
//
public static List<HotSearchList> weiboHotSearch(){
String
url
=
"https://s.weibo.com/top/summary?cate=realtimehot"
;
//
String url = "https://s.weibo.com/top/summary?cate=realtimehot";
//
List
<
HotSearchList
>
list
=
new
ArrayList
<
HotSearchList
>();
//
List<HotSearchList> list = new ArrayList<HotSearchList>();
for
(
int
i
=
0
;
i
<
3
;
i
++){
//
for(int i =0; i<3; i++){
String
htmlBody
=
null
;
//
String htmlBody = null;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//
Request request = RequestUtils.wrapGet(url);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
//
try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody
=
response
.
body
().
string
();
//
htmlBody = response.body().string();
}
catch
(
Exception
e
)
{
//
} catch (Exception e) {
if
(
i
==
2
){
//
if(i==2){
return
list
;
//
return list;
}
else
{
//
}else{
continue
;
//
continue;
}
//
}
}
//
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"pl_top_realtimehot"
))
{
//
if (htmlBody != null && htmlBody.contains("pl_top_realtimehot")) {
try
{
//
try {
// String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0];
//
//
String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0];
// script = script.replace("(", "").replace(")", "");
//
//
script = script.replace("(", "").replace(")", "");
// JSONObject json = JSONObject.parseObject(script);
//
//
JSONObject json = JSONObject.parseObject(script);
// String html = json.getString("html");
//
//
String html = json.getString("html");
Document
document
=
Jsoup
.
parse
(
htmlBody
);
//
Document document = Jsoup.parse(htmlBody);
Elements
elements
=
document
.
select
(
"div#pl_top_realtimehot"
).
select
(
"tbody"
).
select
(
"tr"
);
//
Elements elements = document.select("div#pl_top_realtimehot").select("tbody").select("tr");
for
(
Element
element
:
elements
)
{
//
for (Element element : elements) {
try
{
//
try {
String
id
=
"http://s.weibo.com"
+
element
.
select
(
"td.td-02"
).
select
(
"a"
).
attr
(
"href"
);
//
String id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href");
String
name
=
element
.
select
(
"td.td-02"
).
select
(
"a"
).
text
();
//
String name = element.select("td.td-02").select("a").text();
String
num
=
!
element
.
select
(
"td.td-02"
).
select
(
"span"
).
text
().
equals
(
""
)
?
element
.
select
(
"td.td-02"
).
select
(
"span"
).
text
()
:
"0"
;
//
String num = !element.select("td.td-02").select("span").text().equals("") ? element.select("td.td-02").select("span").text() : "0";
String
rank
=
!
element
.
select
(
"td[class=\"td-01 ranktop\"]"
).
text
().
equals
(
""
)
?
element
.
select
(
"td[class=\"td-01 ranktop\"]"
).
text
()
:
"-1"
;
//
String rank = !element.select("td[class=\"td-01 ranktop\"]").text().equals("") ? element.select("td[class=\"td-01 ranktop\"]").text() : "-1";
//
int
hotCount
=
Integer
.
valueOf
(
num
);
//
int hotCount = Integer.valueOf(num);
int
rankCount
=
Integer
.
valueOf
(
rank
);
//
int rankCount = Integer.valueOf(rank);
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
true
,
rankCount
,
HotSearchType
.
微博热搜
.
name
(),
null
);
//
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, true, rankCount, HotSearchType.微博热搜.name(), null);
list
.
add
(
hotSearch
);
//
list.add(hotSearch);
}
catch
(
Exception
e
)
{
//
} catch (Exception e) {
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
//
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
log
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
//
log.error("解析微博时时热搜时出现解析错误", e);
continue
;
//
continue;
}
//
}
}
//
}
}
catch
(
Exception
e
)
{
//
} catch (Exception e) {
log
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
//
log.error("解析微博时时热搜时出现解析错误,数据不是json结构", e.fillInStackTrace());
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
//
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
return
null
;
//
return null;
}
//
}
}
else
{
//
} else {
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
//
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
log
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
//
log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
}
//
}
break
;
//
break;
}
//
}
return
list
;
//
return list;
}
//
}
...
@@ -104,7 +104,7 @@ public class WeiboHotSearchCrawler {
...
@@ -104,7 +104,7 @@ public class WeiboHotSearchCrawler {
* @Description: TODO(手机端Iphone 微博热搜采集)
* @Description: TODO(手机端Iphone 微博热搜采集)
* @return void 返回类型
* @return void 返回类型
*/
*/
public
static
List
<
HotSearchList
>
weiboHotSearchByPhone
(){
public
static
List
<
HotSearchList
>
weiboHotSearchByPhone
(
Date
date
){
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583"
;
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583"
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
);
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
);
...
@@ -142,7 +142,7 @@ public class WeiboHotSearchCrawler {
...
@@ -142,7 +142,7 @@ public class WeiboHotSearchCrawler {
icon
=
icon
.
split
(
"_"
)[
1
].
split
(
".png"
)[
0
];
icon
=
icon
.
split
(
"_"
)[
1
].
split
(
".png"
)[
0
];
}
}
String
id
=
"http://s.weibo.com/weibo/"
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&Refer=top"
;
String
id
=
"http://s.weibo.com/weibo/"
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&Refer=top"
;
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微博热搜
.
name
(),
icon
);
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微博热搜
.
name
(),
icon
,
date
);
result
.
add
(
hotSearch
);
result
.
add
(
hotSearch
);
rank
++;
rank
++;
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
View file @
f671bae7
...
@@ -130,7 +130,7 @@ public class WeiboTopicCrawler {
...
@@ -130,7 +130,7 @@ public class WeiboTopicCrawler {
/**
/**
* 微博平话题榜采集
* 微博平话题榜采集
*/
*/
public
static
List
<
HotSearchList
>
startCrawlerByPhone
(){
public
static
List
<
HotSearchList
>
startCrawlerByPhone
(
Date
date
){
List
<
HotSearchList
>
topicList
=
new
ArrayList
<>();
List
<
HotSearchList
>
topicList
=
new
ArrayList
<>();
for
(
int
page
=
1
;
page
<=
6
;
page
++){
for
(
int
page
=
1
;
page
<=
6
;
page
++){
String
pageUrl
=
"https://m.weibo.cn/api/container/getIndex?containerid=231648_-_2&page="
+
page
;
String
pageUrl
=
"https://m.weibo.cn/api/container/getIndex?containerid=231648_-_2&page="
+
page
;
...
@@ -146,7 +146,7 @@ public class WeiboTopicCrawler {
...
@@ -146,7 +146,7 @@ public class WeiboTopicCrawler {
continue
;
continue
;
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
topicList
.
addAll
(
parseTopicHtml
(
htmlBody
));
topicList
.
addAll
(
parseTopicHtml
(
htmlBody
,
date
));
break
;
break
;
}
else
{
}
else
{
log
.
info
(
"下载榜单列表页面时数据格式错误,页面为:{}"
,
htmlBody
);
log
.
info
(
"下载榜单列表页面时数据格式错误,页面为:{}"
,
htmlBody
);
...
@@ -157,7 +157,7 @@ public class WeiboTopicCrawler {
...
@@ -157,7 +157,7 @@ public class WeiboTopicCrawler {
}
}
private
static
List
<
HotSearchList
>
parseTopicHtml
(
String
htmlBody
)
{
private
static
List
<
HotSearchList
>
parseTopicHtml
(
String
htmlBody
,
Date
date
)
{
try
{
try
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONArray
(
"cards"
);
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONArray
(
"cards"
);
if
(
Objects
.
nonNull
(
cards
)
&&
!
cards
.
isEmpty
())
{
if
(
Objects
.
nonNull
(
cards
)
&&
!
cards
.
isEmpty
())
{
...
@@ -207,7 +207,7 @@ public class WeiboTopicCrawler {
...
@@ -207,7 +207,7 @@ public class WeiboTopicCrawler {
}
catch
(
Exception
e
){
}
catch
(
Exception
e
){
e
.
printStackTrace
();
e
.
printStackTrace
();
}
}
HotSearchList
topic
=
new
HotSearchList
(
url
,
topicName
,
readNum
,
rank
,
HotSearchType
.
微博话题
.
name
(),
commentNum
,
description
);
HotSearchList
topic
=
new
HotSearchList
(
url
,
topicName
,
readNum
,
rank
,
HotSearchType
.
微博话题
.
name
(),
commentNum
,
description
,
date
);
topicList
.
add
(
topic
);
topicList
.
add
(
topic
);
}
}
return
topicList
;
return
topicList
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/XinLangHotSearchCrawler.java
View file @
f671bae7
...
@@ -28,7 +28,7 @@ public class XinLangHotSearchCrawler {
...
@@ -28,7 +28,7 @@ public class XinLangHotSearchCrawler {
* 新浪热榜的采集
* 新浪热榜的采集
* @return
* @return
*/
*/
public
static
List
<
HotSearchList
>
getXinLangHotSearch
(){
public
static
List
<
HotSearchList
>
getXinLangHotSearch
(
Date
date
){
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
log
.
info
(
"新浪热榜开始采集"
);
log
.
info
(
"新浪热榜开始采集"
);
String
url
=
"https://sinanews.sina.cn/h5/top_news_list.d.html"
;
String
url
=
"https://sinanews.sina.cn/h5/top_news_list.d.html"
;
...
@@ -54,6 +54,12 @@ public class XinLangHotSearchCrawler {
...
@@ -54,6 +54,12 @@ public class XinLangHotSearchCrawler {
String
hotValue
=
jsonArray
.
getJSONObject
(
i
).
getString
(
"hotValue"
);
String
hotValue
=
jsonArray
.
getJSONObject
(
i
).
getString
(
"hotValue"
);
Integer
count
=
TipsUtils
.
getHotCount
(
hotValue
);
Integer
count
=
TipsUtils
.
getHotCount
(
hotValue
);
String
showTags
=
jsonArray
.
getJSONObject
(
i
).
getString
(
"showTags"
);
String
showTags
=
jsonArray
.
getJSONObject
(
i
).
getString
(
"showTags"
);
String
routeUri
=
jsonArray
.
getJSONObject
(
i
).
getString
(
"routeUri"
);
String
xinLangUrl
=
null
;
if
(
routeUri
.
contains
(
"groupId"
)){
xinLangUrl
=
"https://super.sina.cn/shequn/forum/detail_"
+
routeUri
.
substring
(
routeUri
.
indexOf
(
"groupId="
)+
8
)
+
".html"
;
}
String
icon
=
null
;
String
icon
=
null
;
if
(
showTags
.
contains
(
"新"
))
{
if
(
showTags
.
contains
(
"新"
))
{
icon
=
"新"
;
icon
=
"新"
;
...
@@ -62,7 +68,7 @@ public class XinLangHotSearchCrawler {
...
@@ -62,7 +68,7 @@ public class XinLangHotSearchCrawler {
}
else
if
(
showTags
.
contains
(
"沸"
))
{
}
else
if
(
showTags
.
contains
(
"沸"
))
{
icon
=
"沸"
;
icon
=
"沸"
;
}
}
HotSearchList
hotSearchList
=
new
HotSearchList
(
null
,
name
,
count
,
true
,
rank
,
HotSearchType
.
新浪热榜
.
name
(),
icon
);
HotSearchList
hotSearchList
=
new
HotSearchList
(
xinLangUrl
,
name
,
count
,
true
,
rank
,
HotSearchType
.
新浪热榜
.
name
(),
icon
,
date
);
hotSearchLists
.
add
(
hotSearchList
);
hotSearchLists
.
add
(
hotSearchList
);
}
}
log
.
info
(
"{}, 此轮新浪热榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
hotSearchLists
!=
null
?
hotSearchLists
.
size
()
:
0
));
log
.
info
(
"{}, 此轮新浪热榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
hotSearchLists
!=
null
?
hotSearchLists
.
size
()
:
0
));
...
@@ -80,7 +86,7 @@ public class XinLangHotSearchCrawler {
...
@@ -80,7 +86,7 @@ public class XinLangHotSearchCrawler {
* 新浪热点的采集
* 新浪热点的采集
* @return
* @return
*/
*/
public
static
List
<
HotSearchList
>
getXinLangHotSpot
(){
public
static
List
<
HotSearchList
>
getXinLangHotSpot
(
Date
date
){
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
log
.
info
(
"新浪热点开始采集"
);
log
.
info
(
"新浪热点开始采集"
);
String
url
=
"http://interface.sina.cn/wap_api/hot_rank_data.d.json"
;
String
url
=
"http://interface.sina.cn/wap_api/hot_rank_data.d.json"
;
...
@@ -102,7 +108,7 @@ public class XinLangHotSearchCrawler {
...
@@ -102,7 +108,7 @@ public class XinLangHotSearchCrawler {
String
name
=
dataJson
.
getJSONObject
(
i
).
getString
(
"title"
);
String
name
=
dataJson
.
getJSONObject
(
i
).
getString
(
"title"
);
String
xinlangUrl
=
dataJson
.
getJSONObject
(
i
).
getString
(
"wapurl"
);
String
xinlangUrl
=
dataJson
.
getJSONObject
(
i
).
getString
(
"wapurl"
);
Integer
hot
=
dataJson
.
getJSONObject
(
i
).
getIntValue
(
"hot_value"
);
Integer
hot
=
dataJson
.
getJSONObject
(
i
).
getIntValue
(
"hot_value"
);
HotSearchList
hotSearchList
=
new
HotSearchList
(
xinlangUrl
,
name
,
hot
,
rank
,
HotSearchType
.
新浪热点
.
name
());
HotSearchList
hotSearchList
=
new
HotSearchList
(
xinlangUrl
,
name
,
hot
,
rank
,
HotSearchType
.
新浪热点
.
name
()
,
date
);
hotSearchLists
.
add
(
hotSearchList
);
hotSearchLists
.
add
(
hotSearchList
);
}
}
log
.
info
(
"{}, 此轮新浪热点采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
hotSearchLists
!=
null
?
hotSearchLists
.
size
()
:
0
));
log
.
info
(
"{}, 此轮新浪热点采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
hotSearchLists
!=
null
?
hotSearchLists
.
size
()
:
0
));
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuChildHotSearchCrawler.java
View file @
f671bae7
...
@@ -16,10 +16,7 @@ import okhttp3.Request;
...
@@ -16,10 +16,7 @@ import okhttp3.Request;
import
okhttp3.Response
;
import
okhttp3.Response
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.*
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
@Log4j2
@Log4j2
public
class
ZhihuChildHotSearchCrawler
{
public
class
ZhihuChildHotSearchCrawler
{
...
@@ -32,7 +29,7 @@ public class ZhihuChildHotSearchCrawler {
...
@@ -32,7 +29,7 @@ public class ZhihuChildHotSearchCrawler {
* @param typeName
* @param typeName
* @return
* @return
*/
*/
public
static
List
<
HotSearchList
>
getZhihuTopicSearch
(
String
type
,
String
typeNam
e
)
{
public
static
List
<
HotSearchList
>
getZhihuTopicSearch
(
String
type
,
String
typeName
,
Date
dat
e
)
{
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
String
url
=
"https://www.zhihu.com/api/v3/feed/topstory/hot-lists/"
+
type
;
String
url
=
"https://www.zhihu.com/api/v3/feed/topstory/hot-lists/"
+
type
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
...
@@ -57,7 +54,7 @@ public class ZhihuChildHotSearchCrawler {
...
@@ -57,7 +54,7 @@ public class ZhihuChildHotSearchCrawler {
String
hotCountString
=
jsonObject
.
getJSONObject
(
"metrics_area"
).
getString
(
"text"
);
String
hotCountString
=
jsonObject
.
getJSONObject
(
"metrics_area"
).
getString
(
"text"
);
Integer
count
=
TipsUtils
.
getHotCount
(
hotCountString
.
substring
(
0
,
hotCountString
.
indexOf
(
"领域热度"
)));
Integer
count
=
TipsUtils
.
getHotCount
(
hotCountString
.
substring
(
0
,
hotCountString
.
indexOf
(
"领域热度"
)));
String
childUrl
=
jsonObject
.
getJSONObject
(
"link"
).
getString
(
"url"
);
String
childUrl
=
jsonObject
.
getJSONObject
(
"link"
).
getString
(
"url"
);
HotSearchList
hotSearchList
=
new
HotSearchList
(
childUrl
,
name
,
count
,
rank
,
HotSearchType
.
知乎热搜
.
name
()
+
typeName
+
"分类"
);
HotSearchList
hotSearchList
=
new
HotSearchList
(
childUrl
,
name
,
count
,
rank
,
HotSearchType
.
知乎热搜
.
name
()
+
typeName
+
"分类"
,
date
);
list
.
add
(
hotSearchList
);
list
.
add
(
hotSearchList
);
}
}
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
View file @
f671bae7
...
@@ -2,6 +2,7 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -2,6 +2,7 @@ package com.zhiwei.searchhotcrawler.crawler;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
...
@@ -37,42 +38,42 @@ public class ZhihuHotSearchCrawler {
...
@@ -37,42 +38,42 @@ public class ZhihuHotSearchCrawler {
* @Description: 知乎热搜采集程序
* @Description: 知乎热搜采集程序
* @return void 返回类型
* @return void 返回类型
*/
*/
public
static
List
<
HotSearchList
>
getZhihuHotList
(){
//
public static List<HotSearchList> getZhihuHotList(){
List
<
HotSearchList
>
list
=
null
;
//
List<HotSearchList> list = null;
String
url
=
"https://www.zhihu.com/api/v4/search/top_search"
;
//
String url = "https://www.zhihu.com/api/v4/search/top_search";
String
rerferer
=
"https://www.zhihu.com/search?type=content&q=%E5%BF%AB%E6%89%8B"
;
//
String rerferer = "https://www.zhihu.com/search?type=content&q=%E5%BF%AB%E6%89%8B";
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
//
Map<String,String> headerMap = HeaderTool.getCommonHead();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
);
//
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");
headerMap
.
put
(
"Host"
,
"www.zhihu.com"
);
//
headerMap.put("Host", "www.zhihu.com");
headerMap
.
put
(
"X-UDID"
,
"AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8="
);
//
headerMap.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8=");
headerMap
.
put
(
"accept"
,
"application/json, text/plain, */*"
);
//
headerMap.put("accept", "application/json, text/plain, */*");
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
//
headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
headerMap
.
put
(
"Referer"
,
rerferer
);
//
headerMap.put("Referer", rerferer);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
//
Request request = RequestUtils.wrapGet(url, headerMap);
String
htmlBody
=
null
;
//
String htmlBody = null;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
//
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody
=
response
.
body
().
string
();
//
htmlBody = response.body().string();
}
catch
(
IOException
e
)
{
//
}catch (IOException e) {
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
//
log.debug("获取知乎热搜时出现问题:{}", e);
}
//
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"words"
))
{
//
if (htmlBody != null && htmlBody.contains("words")) {
list
=
new
ArrayList
<>();
//
list = new ArrayList<>();
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
//
JSONObject topSearch = JSONObject.parseObject(htmlBody);
JSONArray
words
=
topSearch
.
getJSONObject
(
"top_search"
).
getJSONArray
(
"words"
);
//
JSONArray words = topSearch.getJSONObject("top_search").getJSONArray("words");
String
link
=
null
;
//
String link = null;
String
displayQuery
=
null
;
//
String displayQuery = null;
String
query
=
null
;
//
String query = null;
for
(
int
i
=
0
;
i
<
words
.
size
();
i
++)
{
//
for (int i = 0; i < words.size(); i++) {
JSONObject
word
=
words
.
getJSONObject
(
i
);
//
JSONObject word = words.getJSONObject(i);
query
=
word
.
getString
(
"query"
);
//
query = word.getString("query");
displayQuery
=
word
.
getString
(
"display_query"
);
//
displayQuery = word.getString("display_query");
link
=
"https://www.zhihu.com/search?q="
+
URLCodeUtil
.
getURLEncode
(
query
,
"utf-8"
)
+
"&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content"
;
//
link = "https://www.zhihu.com/search?q=" + URLCodeUtil.getURLEncode(query, "utf-8") + "&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content";
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
null
,
i
,
HotSearchType
.
知乎热搜
.
name
());
//
HotSearchList zhihu = new HotSearchList(link, displayQuery, null, i, HotSearchType.知乎热搜.name());
list
.
add
(
zhihu
);
//
list.add(zhihu);
}
//
}
}
//
}
return
list
;
//
return list;
}
//
}
...
@@ -83,7 +84,7 @@ public class ZhihuHotSearchCrawler {
...
@@ -83,7 +84,7 @@ public class ZhihuHotSearchCrawler {
* @param @return 设定文件
* @param @return 设定文件
* @return List<ZhihuHotSearch> 返回类型
* @return List<ZhihuHotSearch> 返回类型
*/
*/
public
static
List
<
HotSearchList
>
getMobileZhihuHotList
(){
public
static
List
<
HotSearchList
>
getMobileZhihuHotList
(
Date
date
){
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
String
url
=
"https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0"
;
String
url
=
"https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0"
;
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
...
@@ -127,7 +128,7 @@ public class ZhihuHotSearchCrawler {
...
@@ -127,7 +128,7 @@ public class ZhihuHotSearchCrawler {
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
printStackTrace
();
}
}
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
hotCount
,
i
+
1
,
HotSearchType
.
知乎热搜
.
name
());
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
hotCount
,
i
+
1
,
HotSearchType
.
知乎热搜
.
name
()
,
date
);
list
.
add
(
zhihu
);
list
.
add
(
zhihu
);
}
}
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuTopicSearchCrawler.java
View file @
f671bae7
...
@@ -9,6 +9,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
...
@@ -9,6 +9,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.Data
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
okhttp3.Response
;
...
@@ -19,16 +20,13 @@ import org.w3c.dom.Element;
...
@@ -19,16 +20,13 @@ import org.w3c.dom.Element;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.*
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Map
;
@Log4j2
@Log4j2
public
class
ZhihuTopicSearchCrawler
{
public
class
ZhihuTopicSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
public
static
List
<
HotSearchList
>
getZhihuTopicSearch
(){
public
static
List
<
HotSearchList
>
getZhihuTopicSearch
(
Date
date
){
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
String
url
=
"https://www.zhihu.com/topsearch"
;
String
url
=
"https://www.zhihu.com/topsearch"
;
JSONObject
jsonObject
=
null
;
JSONObject
jsonObject
=
null
;
...
@@ -52,7 +50,7 @@ public class ZhihuTopicSearchCrawler {
...
@@ -52,7 +50,7 @@ public class ZhihuTopicSearchCrawler {
String
name
=
data
.
getString
(
"queryDisplay"
);
String
name
=
data
.
getString
(
"queryDisplay"
);
String
realQuery
=
data
.
getString
(
"realQuery"
);
String
realQuery
=
data
.
getString
(
"realQuery"
);
String
zhihuUrl
=
"https://www.zhihu.com/search?q="
+
realQuery
+
"&utm_content=search_hot&type=content"
;
String
zhihuUrl
=
"https://www.zhihu.com/search?q="
+
realQuery
+
"&utm_content=search_hot&type=content"
;
HotSearchList
hotSearchList
=
new
HotSearchList
(
zhihuUrl
,
name
,
null
,
rank
,
HotSearchType
.
知乎热搜榜单
.
name
());
HotSearchList
hotSearchList
=
new
HotSearchList
(
zhihuUrl
,
name
,
null
,
rank
,
HotSearchType
.
知乎热搜榜单
.
name
()
,
date
);
list
.
add
(
hotSearchList
);
list
.
add
(
hotSearchList
);
}
}
return
list
;
return
list
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
View file @
f671bae7
...
@@ -40,7 +40,10 @@ public class HotSearchCacheDAO {
...
@@ -40,7 +40,10 @@ public class HotSearchCacheDAO {
document
.
put
(
"rank"
,
hotSearch
.
getRank
());
document
.
put
(
"rank"
,
hotSearch
.
getRank
());
document
.
put
(
"type"
,
hotSearch
.
getType
());
document
.
put
(
"type"
,
hotSearch
.
getType
());
document
.
put
(
"icon"
,
hotSearch
.
getIcon
());
document
.
put
(
"icon"
,
hotSearch
.
getIcon
());
if
(
"微博话题"
.
equals
(
hotSearch
.
getType
())){
document
.
put
(
"topic_lead"
,
hotSearch
.
getTopicLead
());
document
.
put
(
"comment_count"
,
hotSearch
.
getCommentCount
());
}
addAndUpdateData
(
document
);
addAndUpdateData
(
document
);
dataes
.
add
(
document
);
dataes
.
add
(
document
);
});
});
...
@@ -99,7 +102,7 @@ public class HotSearchCacheDAO {
...
@@ -99,7 +102,7 @@ public class HotSearchCacheDAO {
//计算热搜时长
//计算热搜时长
int
duration
=
nowDoc
.
getInteger
(
"duration"
);
int
duration
=
nowDoc
.
getInteger
(
"duration"
);
int
durationNow
=
getDuration
(
type
,
duration
);
int
durationNow
=
getDuration
(
type
,
duration
);
endTime
=
getEndTime
(
type
,
new
Date
());
//
endTime = getEndTime(type, new Date());
//更新相应信息
//更新相应信息
nowDoc
.
put
(
"endTime"
,
endTime
);
nowDoc
.
put
(
"endTime"
,
endTime
);
nowDoc
.
put
(
"lastRank"
,
lastRank
);
nowDoc
.
put
(
"lastRank"
,
lastRank
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
View file @
f671bae7
...
@@ -6,6 +6,8 @@ import com.zhiwei.searchhotcrawler.cache.CacheListener;
...
@@ -6,6 +6,8 @@ import com.zhiwei.searchhotcrawler.cache.CacheListener;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.timer.*
;
import
com.zhiwei.searchhotcrawler.timer.*
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.springframework.context.ApplicationContext
;
import
org.springframework.context.support.ClassPathXmlApplicationContext
;
import
java.util.concurrent.Executors
;
import
java.util.concurrent.Executors
;
import
java.util.concurrent.ScheduledExecutorService
;
import
java.util.concurrent.ScheduledExecutorService
;
...
@@ -16,6 +18,7 @@ public class HotSearchRun {
...
@@ -16,6 +18,7 @@ public class HotSearchRun {
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
ApplicationContext
context
=
new
ClassPathXmlApplicationContext
(
"applicationContext.xml"
);
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
ProxyFactory
.
init
(
simpleConfig
);
...
@@ -43,18 +46,18 @@ public class HotSearchRun {
...
@@ -43,18 +46,18 @@ public class HotSearchRun {
//采集程序启动
//采集程序启动
new
WeiboHotSearchRun
().
start
();
//
new WeiboHotSearchRun().start();
new
BaiduHotSearchRun
().
start
();
//
new BaiduHotSearchRun().start();
// new SougoHotSearchRun().start();
//
//
new SougoHotSearchRun().start();
new
DouyinHotSearchRun
().
start
();
//
new DouyinHotSearchRun().start();
// new ZhihuHotSearchRun().start();
//
//
new ZhihuHotSearchRun().start();
new
WeiboSuperTopicRun
().
start
();
//
new WeiboSuperTopicRun().start();
new
WeiboTopicRun
().
start
();
//
new WeiboTopicRun().start();
// new ToutiaoHotSearchRun().start();
//
//
new ToutiaoHotSearchRun().start();
// new ZhihuTopSearchRun().start();
//
//
new ZhihuTopSearchRun().start();
new
ZhihuChildHotSearchRun
().
start
();
//
new ZhihuChildHotSearchRun().start();
new
ThreadOneRun
().
start
();
//
new ThreadOneRun().start();
// //抖音链接更新
//
//
//抖音链接更新
new
DouYinUrlHotSearchRun
().
start
();
//
new DouYinUrlHotSearchRun().start();
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
View file @
f671bae7
...
@@ -42,32 +42,32 @@ public class BaiduHotSearchRun extends Thread{
...
@@ -42,32 +42,32 @@ public class BaiduHotSearchRun extends Thread{
private
void
getHotList
()
{
private
void
getHotList
()
{
log
.
info
(
"百度风云榜采集开始........"
);
//
log.info("百度风云榜采集开始........");
// HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
//
//
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
//
//
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
List
<
HotSearchList
>
baiduList
=
BaiDuHotSearchCrawler
.
baiduHotSearch
();
//
List<HotSearchList> baiduList = BaiDuHotSearchCrawler.baiduHotSearch();
log
.
info
(
"{}, 此轮百度风云榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
baiduList
!=
null
?
baiduList
.
size
()
:
0
));
//
log.info("{}, 此轮百度风云榜采集到的数据量为:{}", new Date(), Integer.valueOf(baiduList != null ? baiduList.size() : 0));
// if(Objects.nonNull(list) && !list.isEmpty()) {
//
//
if(Objects.nonNull(list) && !list.isEmpty()) {
// List<Document> data = hotSearchCacheDAO.addData(list);
//
//
List<Document> data = hotSearchCacheDAO.addData(list);
// hotSearchDAO.addHotSearchList(data);
//
//
hotSearchDAO.addHotSearchList(data);
// TipsUtils.recoveryTips("百度热搜",new Date());
//
//
TipsUtils.recoveryTips("百度热搜",new Date());
// } else {
//
//
} else {
// TipsUtils.sendTips("百度热搜",new Date());
//
//
TipsUtils.sendTips("百度热搜",new Date());
// }
//
//
}
TipsUtils
.
addHotList
(
"百度热搜"
,
baiduList
);
//
TipsUtils.addHotList("百度热搜",baiduList);
log
.
info
(
"百度风云榜采集结束........"
);
//
log.info("百度风云榜采集结束........");
ZhiWeiTools
.
sleep
(
2000L
);
//
ZhiWeiTools.sleep(2000L);
log
.
info
(
"搜狗微信采集开始........"
);
//
log.info("搜狗微信采集开始........");
List
<
HotSearchList
>
sougouList
=
SougoHotSearchCrawler
.
sougoHotSearch
();
//
List<HotSearchList> sougouList = SougoHotSearchCrawler.sougoHotSearch();
log
.
info
(
"{}, 此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
sougouList
!=
null
?
sougouList
.
size
()
:
0
));
//
log.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(sougouList != null ? sougouList.size() : 0));
TipsUtils
.
addHotList
(
"搜狗微信热搜"
,
sougouList
);
//
TipsUtils.addHotList("搜狗微信热搜",sougouList);
log
.
info
(
"搜狗微信采集结束........"
);
//
log.info("搜狗微信采集结束........");
ZhiWeiTools
.
sleep
(
2000L
);
//
ZhiWeiTools.sleep(2000L);
log
.
info
(
"知乎话题采集开始........"
);
//
log.info("知乎话题采集开始........");
List
<
HotSearchList
>
zhihuList
=
ZhihuHotSearchCrawler
.
getMobileZhihuHotList
();
//
List<HotSearchList> zhihuList = ZhihuHotSearchCrawler.getMobileZhihuHotList();
log
.
info
(
"{}, 知乎此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
zhihuList
!=
null
?
zhihuList
.
size
()
:
0
));
//
log.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(zhihuList != null ? zhihuList.size() : 0));
TipsUtils
.
addHotList
(
"知乎热搜"
,
zhihuList
);
//
TipsUtils.addHotList("知乎热搜",zhihuList);
log
.
info
(
"知乎话题采集结束........"
);
//
log.info("知乎话题采集结束........");
}
}
}
}
\ No newline at end of file
src/main/java/com/zhiwei/searchhotcrawler/timer/DouYinUrlHotSearchRun.java
View file @
f671bae7
...
@@ -38,24 +38,24 @@ public class DouYinUrlHotSearchRun extends Thread {
...
@@ -38,24 +38,24 @@ public class DouYinUrlHotSearchRun extends Thread {
* @return void
* @return void
*/
*/
private
void
getUrlList
()
{
private
void
getUrlList
()
{
log
.
info
(
"抖音链接更新开始........"
);
//
log.info("抖音链接更新开始........");
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
//
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
List
<
HotSearchList
>
list
=
DouyinHotSearchRun
.
list
;
//
List<HotSearchList> list = DouyinHotSearchRun.list;
if
(
list
!=
null
&&
list
.
size
()>
0
)
{
//
if(list != null && list.size()>0) {
for
(
int
i
=
0
;
i
<
list
.
size
();
i
++)
{
//
for (int i = 0; i < list.size(); i++) {
String
name
=
list
.
get
(
i
).
getName
();
//
String name = list.get(i).getName();
String
id
=
name
+
"_"
+
list
.
get
(
i
).
getType
();
//
String id = name+"_"+list.get(i).getType();
String
url
=
DouyinHotSearchCrawler
.
getDouyinUrl
(
"https://aweme-hl.snssdk.com/aweme/v1/hot/search/video/list/?hotword="
+
name
);
//
String url = DouyinHotSearchCrawler.getDouyinUrl("https://aweme-hl.snssdk.com/aweme/v1/hot/search/video/list/?hotword="+name);
if
(
url
!=
null
)
{
//
if(url != null) {
Document
document
=
new
Document
();
//
Document document = new Document();
document
.
put
(
"id"
,
id
);
//
document.put("id", id);
document
.
put
(
"url"
,
url
);
//
document.put("url", url);
hotSearchCacheDAO
.
updateDouyinUrl
(
document
);
//
hotSearchCacheDAO.updateDouyinUrl(document);
}
//
}
}
//
}
log
.
info
(
"抖音链接更新结束........"
);
//
log.info("抖音链接更新结束........");
}
else
{
//
}else{
log
.
info
(
"抖音链接更新失败,获取抖音数据为空"
);
//
log.info("抖音链接更新失败,获取抖音数据为空");
}
//
}
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
View file @
f671bae7
...
@@ -48,23 +48,23 @@ public class DouyinHotSearchRun extends Thread{
...
@@ -48,23 +48,23 @@ public class DouyinHotSearchRun extends Thread{
* @return void
* @return void
*/
*/
private
void
getHotList
()
{
private
void
getHotList
()
{
log
.
info
(
"抖音热搜榜采集开始........"
);
//
log.info("抖音热搜榜采集开始........");
list
=
DouyinHotSearchCrawler
.
getMobileDouyinHotList
();
//
list = DouyinHotSearchCrawler.getMobileDouyinHotList();
log
.
info
(
"{}, 抖音热搜榜此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
//
log.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
TipsUtils
.
addHotList
(
"抖音热搜"
,
list
);
//
TipsUtils.addHotList("抖音热搜",list);
log
.
info
(
"抖音热搜榜采集结束........"
);
//
log.info("抖音热搜榜采集结束........");
ZhiWeiTools
.
sleep
(
3000L
);
//
ZhiWeiTools.sleep(3000L);
log
.
info
(
"今日头条热搜采集开始........"
);
//
log.info("今日头条热搜采集开始........");
List
<
HotSearchList
>
toutiaoList
=
ToutiaoHotSearchCrawler
.
toutiaoHotSearchByPhone
();
//
List<HotSearchList> toutiaoList = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone();
log
.
info
(
"{}, 今日头条此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
toutiaoList
!=
null
?
toutiaoList
.
size
()
:
0
));
//
log.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), Integer.valueOf(toutiaoList != null ? toutiaoList.size() : 0));
TipsUtils
.
addHotList
(
HotSearchType
.
今日头条热搜
.
name
(),
toutiaoList
);
//
TipsUtils.addHotList(HotSearchType.今日头条热搜.name(),toutiaoList);
log
.
info
(
"今日头条热搜采集结束........"
);
//
log.info("今日头条热搜采集结束........");
ZhiWeiTools
.
sleep
(
3000L
);
//
ZhiWeiTools.sleep(3000L);
log
.
info
(
"知乎热搜榜单采集开始..."
);
//
log.info("知乎热搜榜单采集开始...");
List
<
HotSearchList
>
zhihuList
=
ZhihuTopicSearchCrawler
.
getZhihuTopicSearch
();
//
List<HotSearchList> zhihuList = ZhihuTopicSearchCrawler.getZhihuTopicSearch();
log
.
info
(
"{}, 知乎热搜榜单此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
zhihuList
!=
null
?
zhihuList
.
size
()
:
0
));
//
log.info("{}, 知乎热搜榜单此轮采集到的数据量为:{}", new Date(), Integer.valueOf(zhihuList != null ? zhihuList.size() : 0));
TipsUtils
.
addHotList
(
HotSearchType
.
知乎热搜榜单
.
name
(),
zhihuList
);
//
TipsUtils.addHotList(HotSearchType.知乎热搜榜单.name(),zhihuList);
log
.
info
(
"知乎热搜榜单采集结束........"
);
//
log.info("知乎热搜榜单采集结束........");
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
View file @
f671bae7
...
@@ -39,19 +39,19 @@ public class SougoHotSearchRun extends Thread {
...
@@ -39,19 +39,19 @@ public class SougoHotSearchRun extends Thread {
private
void
getHotList
()
{
private
void
getHotList
()
{
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
//
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
//
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
log
.
info
(
"搜狗微信采集开始........"
);
//
log.info("搜狗微信采集开始........");
List
<
HotSearchList
>
list
=
SougoHotSearchCrawler
.
sougoHotSearch
();
//
List<HotSearchList> list = SougoHotSearchCrawler.sougoHotSearch();
log
.
info
(
"{}, 此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
//
log.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if
(
list
==
null
||
list
.
size
()
==
0
){
//
if(list == null || list.size() == 0){
TipsUtils
.
sendTips
(
"搜狗微信热搜"
,
new
Date
());
//
TipsUtils.sendTips("搜狗微信热搜",new Date());
}
else
{
//
}else {
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
//
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO
.
addHotSearchList
(
data
);
//
hotSearchDAO.addHotSearchList(data);
TipsUtils
.
recoveryTips
(
"搜狗微信热搜"
,
new
Date
());
//
TipsUtils.recoveryTips("搜狗微信热搜",new Date());
}
//
}
log
.
info
(
"搜狗微信采集结束........"
);
//
log.info("搜狗微信采集结束........");
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/ThreadOneRun.java
View file @
f671bae7
...
@@ -31,22 +31,22 @@ public class ThreadOneRun extends Thread {
...
@@ -31,22 +31,22 @@ public class ThreadOneRun extends Thread {
}
}
private
void
getHotList
(){
private
void
getHotList
(){
List
<
HotSearchList
>
tengXunlist
=
TengXunCrawler
.
getTengXunHotList
();
//
List<HotSearchList> tengXunlist = TengXunCrawler.getTengXunHotList();
TipsUtils
.
addHotList
(
"腾讯新闻"
,
tengXunlist
);
//
TipsUtils.addHotList("腾讯新闻",tengXunlist);
ZhiWeiTools
.
sleep
(
1500L
);
//
ZhiWeiTools.sleep(1500L);
List
<
HotSearchList
>
xinLanglist
=
XinLangHotSearchCrawler
.
getXinLangHotSearch
();
//
List<HotSearchList> xinLanglist = XinLangHotSearchCrawler.getXinLangHotSearch();
TipsUtils
.
addHotList
(
"新浪热榜"
,
xinLanglist
);
//
TipsUtils.addHotList("新浪热榜",xinLanglist);
ZhiWeiTools
.
sleep
(
1500L
);
//
ZhiWeiTools.sleep(1500L);
List
<
HotSearchList
>
souhuList
=
SouhuTopicCrawler
.
getSouhuTopic
();
//
List<HotSearchList> souhuList = SouhuTopicCrawler.getSouhuTopic();
TipsUtils
.
addHotList
(
"搜狐话题"
,
souhuList
);
//
TipsUtils.addHotList("搜狐话题",souhuList);
ZhiWeiTools
.
sleep
(
1500L
);
//
ZhiWeiTools.sleep(1500L);
List
<
HotSearchList
>
xinLangHotList
=
XinLangHotSearchCrawler
.
getXinLangHotSpot
();
//
List<HotSearchList> xinLangHotList = XinLangHotSearchCrawler.getXinLangHotSpot();
TipsUtils
.
addHotList
(
"新浪热点"
,
xinLangHotList
);
//
TipsUtils.addHotList("新浪热点",xinLangHotList);
ZhiWeiTools
.
sleep
(
1500L
);
//
ZhiWeiTools.sleep(1500L);
List
<
HotSearchList
>
fengHuangHotList
=
FengHuangSearchCrawler
.
getFengHuangHotList
();
//
List<HotSearchList> fengHuangHotList = FengHuangSearchCrawler.getFengHuangHotList();
TipsUtils
.
addHotList
(
"凤凰新闻热榜"
,
fengHuangHotList
);
//
TipsUtils.addHotList("凤凰新闻热榜",fengHuangHotList);
ZhiWeiTools
.
sleep
(
1500L
);
//
ZhiWeiTools.sleep(1500L);
List
<
HotSearchList
>
fengHuangHotDataList
=
FengHuangSearchCrawler
.
getFengHuangHotData
();
//
List<HotSearchList> fengHuangHotDataList = FengHuangSearchCrawler.getFengHuangHotData();
TipsUtils
.
addHotList
(
"凤凰新闻热搜"
,
fengHuangHotDataList
);
//
TipsUtils.addHotList("凤凰新闻热搜",fengHuangHotDataList);
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/ToutiaoHotSearchRun.java
View file @
f671bae7
...
@@ -35,19 +35,19 @@ public class ToutiaoHotSearchRun extends Thread{
...
@@ -35,19 +35,19 @@ public class ToutiaoHotSearchRun extends Thread{
private
void
getHotList
()
{
private
void
getHotList
()
{
log
.
info
(
"今日头条热搜采集开始........"
);
//
log.info("今日头条热搜采集开始........");
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
//
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
//
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
List
<
HotSearchList
>
list
=
ToutiaoHotSearchCrawler
.
toutiaoHotSearchByPhone
();
//
List<HotSearchList> list = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone();
log
.
info
(
"{}, 今日头条此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
//
log.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if
(
list
==
null
||
list
.
size
()
==
0
){
//
if(list == null || list.size() == 0){
TipsUtils
.
sendTips
(
"今日头条热搜"
,
new
Date
());
//
TipsUtils.sendTips("今日头条热搜",new Date());
}
else
{
//
}else {
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
//
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO
.
addHotSearchList
(
data
);
//
hotSearchDAO.addHotSearchList(data);
TipsUtils
.
recoveryTips
(
"今日头条热搜"
,
new
Date
());
//
TipsUtils.recoveryTips("今日头条热搜",new Date());
}
//
}
log
.
info
(
"今日头条热搜采集结束........"
);
//
log.info("今日头条热搜采集结束........");
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboHotSearchRun.java
View file @
f671bae7
...
@@ -34,17 +34,17 @@ public class WeiboHotSearchRun extends Thread{
...
@@ -34,17 +34,17 @@ public class WeiboHotSearchRun extends Thread{
private
void
getHotList
()
{
private
void
getHotList
()
{
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
//
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
//
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
List
<
HotSearchList
>
list
=
WeiboHotSearchCrawler
.
weiboHotSearchByPhone
();
//
List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearchByPhone();
log
.
info
(
"{}, 微博此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
//
log.info("{}, 微博此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if
(
list
==
null
||
list
.
size
()
==
0
){
//
if(list == null || list.size() == 0){
TipsUtils
.
sendTips
(
"微博热搜"
,
new
Date
());
//
TipsUtils.sendTips("微博热搜",new Date());
}
else
{
//
}else {
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
//
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO
.
addHotSearchList
(
data
);
//
hotSearchDAO.addHotSearchList(data);
TipsUtils
.
recoveryTips
(
"微博热搜"
,
new
Date
());
//
TipsUtils.recoveryTips("微博热搜",new Date());
}
//
}
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboSuperTopicRun.java
View file @
f671bae7
...
@@ -32,28 +32,28 @@ public class WeiboSuperTopicRun extends Thread{
...
@@ -32,28 +32,28 @@ public class WeiboSuperTopicRun extends Thread{
private
void
getTopicList
()
{
private
void
getTopicList
()
{
WeiboSuperTopicDAO
weiboTopicDAO
=
new
WeiboSuperTopicDAO
();
//
WeiboSuperTopicDAO weiboTopicDAO = new WeiboSuperTopicDAO();
log
.
info
(
"微博超话采集开始........"
);
//
log.info("微博超话采集开始........");
List
<
WeiboSuperTopic
>
list
=
WeiboSuperTopicCrawler
.
startCrawler
();
//
List<WeiboSuperTopic> list = WeiboSuperTopicCrawler.startCrawler();
log
.
info
(
"{}, 微博超话此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
//
log.info("{}, 微博超话此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List
<
Document
>
data
=
new
ArrayList
<>();
//
List<Document> data = new ArrayList<>();
for
(
WeiboSuperTopic
topic
:
list
){
//
for(WeiboSuperTopic topic : list){
log
.
info
(
"topic::::{}"
,
topic
);
//
log.info("topic::::{}", topic);
Document
doc
=
new
Document
();
//
Document doc = new Document();
doc
.
put
(
"_id"
,
topic
.
getId
());
//
doc.put("_id", topic.getId());
doc
.
put
(
"name"
,
topic
.
getTopicName
());
//
doc.put("name", topic.getTopicName());
doc
.
put
(
"rank"
,
topic
.
getRank
());
//
doc.put("rank", topic.getRank());
doc
.
put
(
"score_num"
,
topic
.
getScore
());
//
doc.put("score_num", topic.getScore());
doc
.
put
(
"fensi_num"
,
topic
.
getFensi
());
//
doc.put("fensi_num", topic.getFensi());
doc
.
put
(
"post_num"
,
topic
.
getPostNum
());
//
doc.put("post_num", topic.getPostNum());
doc
.
put
(
"type"
,
topic
.
getType
());
//
doc.put("type", topic.getType());
doc
.
put
(
"day"
,
topic
.
getDay
());
//
doc.put("day", topic.getDay());
doc
.
put
(
"time"
,
topic
.
getTime
());
//
doc.put("time", topic.getTime());
doc
.
put
(
"url"
,
topic
.
getUrl
());
//
doc.put("url", topic.getUrl());
data
.
add
(
doc
);
//
data.add(doc);
}
//
}
weiboTopicDAO
.
addTopicList
(
data
);
//
weiboTopicDAO.addTopicList(data);
log
.
info
(
"微博话题采集结束........"
);
//
log.info("微博话题采集结束........");
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboTopicRun.java
View file @
f671bae7
...
@@ -33,35 +33,35 @@ public class WeiboTopicRun extends Thread{
...
@@ -33,35 +33,35 @@ public class WeiboTopicRun extends Thread{
private
void
getTopicList
()
{
private
void
getTopicList
()
{
HotSearchListDAO
weiboHotSearchDAO
=
new
HotSearchListDAO
();
//
HotSearchListDAO weiboHotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
//
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
log
.
info
(
"微博话题采集开始........"
);
//
log.info("微博话题采集开始........");
List
<
HotSearchList
>
list
=
WeiboTopicCrawler
.
startCrawlerByPhone
();
//
List<HotSearchList> list = WeiboTopicCrawler.startCrawlerByPhone();
log
.
info
(
"{}, 微博话题此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
//
log.info("{}, 微博话题此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if
(
list
==
null
||
list
.
size
()
==
0
){
//
if(list == null || list.size() == 0){
TipsUtils
.
sendTips
(
"微博话题"
,
new
Date
());
//
TipsUtils.sendTips("微博话题",new Date());
}
else
{
//
}else{
TipsUtils
.
recoveryTips
(
"微博话题"
,
new
Date
());
//
TipsUtils.recoveryTips("微博话题",new Date());
}
//
}
List
<
Document
>
data
=
new
ArrayList
<>();
//
List<Document> data = new ArrayList<>();
for
(
HotSearchList
topic
:
list
){
//
for(HotSearchList topic : list){
Document
doc
=
new
Document
();
//
Document doc = new Document();
doc
.
put
(
"_id"
,
topic
.
getId
());
//
doc.put("_id", topic.getId());
doc
.
put
(
"name"
,
topic
.
getName
());
//
doc.put("name", topic.getName());
doc
.
put
(
"url"
,
topic
.
getUrl
());
//
doc.put("url", topic.getUrl());
doc
.
put
(
"count"
,
topic
.
getCount
());
//
doc.put("count", topic.getCount());
doc
.
put
(
"hot"
,
topic
.
getHot
());
//
doc.put("hot", topic.getHot());
doc
.
put
(
"day"
,
topic
.
getDay
());
//
doc.put("day", topic.getDay());
doc
.
put
(
"time"
,
topic
.
getTime
());
//
doc.put("time", topic.getTime());
doc
.
put
(
"rank"
,
topic
.
getRank
());
//
doc.put("rank", topic.getRank());
doc
.
put
(
"type"
,
topic
.
getType
());
//
doc.put("type", topic.getType());
doc
.
put
(
"topic_lead"
,
topic
.
getTopicLead
());
//
doc.put("topic_lead", topic.getTopicLead());
doc
.
put
(
"comment_count"
,
topic
.
getCommentCount
());
//
doc.put("comment_count", topic.getCommentCount());
data
.
add
(
doc
);
//
data.add(doc);
hotSearchCacheDAO
.
addAndUpdateData
(
doc
);
//
hotSearchCacheDAO.addAndUpdateData(doc);
}
//
}
weiboHotSearchDAO
.
addHotSearchList
(
data
);
//
weiboHotSearchDAO.addHotSearchList(data);
log
.
info
(
"微博话题采集结束........"
);
//
log.info("微博话题采集结束........");
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuChildHotSearchRun.java
View file @
f671bae7
...
@@ -35,50 +35,50 @@ public class ZhihuChildHotSearchRun extends Thread {
...
@@ -35,50 +35,50 @@ public class ZhihuChildHotSearchRun extends Thread {
}
}
private
void
getHotList
()
{
private
void
getHotList
()
{
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
//
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
//
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
for
(
int
i
=
0
;
i
<
childType
.
size
();
i
++)
{
//
for (int i = 0; i < childType.size(); i++) {
String
name
=
this
.
getTypeName
(
childType
.
get
(
i
));
//
String name = this.getTypeName(childType.get(i));
if
(!
""
.
equals
(
name
))
{
//
if (!"".equals(name)) {
log
.
info
(
"知乎{}话题热榜采集开始..."
,
name
);
//
log.info("知乎{}话题热榜采集开始...", name);
List
<
HotSearchList
>
list
=
ZhihuChildHotSearchCrawler
.
getZhihuTopicSearch
(
childType
.
get
(
i
),
name
);
//
List<HotSearchList> list = ZhihuChildHotSearchCrawler.getZhihuTopicSearch(childType.get(i), name);
log
.
info
(
"{}, 知乎{}话题此轮采集到的数据量为:{}"
,
new
Date
(),
name
,
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
//
log.info("{}, 知乎{}话题此轮采集到的数据量为:{}", new Date(),name, Integer.valueOf(list != null ? list.size() : 0));
if
(
list
==
null
||
list
.
size
()
==
0
)
{
//
if (list == null || list.size() == 0) {
TipsUtils
.
sendTips
(
"知乎热搜"
+
name
+
"分类"
,
new
Date
());
//
TipsUtils.sendTips("知乎热搜"+name+"分类", new Date());
}
else
{
//
}else {
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
//
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO
.
addHotSearchList
(
data
);
//
hotSearchDAO.addHotSearchList(data);
TipsUtils
.
recoveryTips
(
"知乎热搜"
+
name
+
"分类"
,
new
Date
());
//
TipsUtils.recoveryTips("知乎热搜"+name+"分类",new Date());
}
//
}
log
.
info
(
"知乎{}话题热榜采集结束..."
,
name
);
//
log.info("知乎{}话题热榜采集结束...", name);
ZhiWeiTools
.
sleep
(
3000
);
//
ZhiWeiTools.sleep(3000);
}
//
}
}
//
}
//网易实时热榜采集
//
//网易实时热榜采集
ZhiWeiTools
.
sleep
(
3000L
);
//
ZhiWeiTools.sleep(3000L);
List
<
HotSearchList
>
wangyiHotSearchList
=
WangYiHotSearchCrawler
.
getWangYiHotSearch
();
//
List<HotSearchList> wangyiHotSearchList = WangYiHotSearchCrawler.getWangYiHotSearch();
TipsUtils
.
addHotList
(
"网易热榜"
,
wangyiHotSearchList
);
//
TipsUtils.addHotList("网易热榜",wangyiHotSearchList);
//网易跟帖热议采集
//
//网易跟帖热议采集
ZhiWeiTools
.
sleep
(
3000L
);
//
ZhiWeiTools.sleep(3000L);
List
<
HotSearchList
>
wangyiComment
=
WangYiHotSearchCrawler
.
getWangYicomment
();
//
List<HotSearchList> wangyiComment = WangYiHotSearchCrawler.getWangYicomment();
TipsUtils
.
addHotList
(
"网易跟帖热议"
,
wangyiComment
);
//
TipsUtils.addHotList("网易跟帖热议",wangyiComment);
}
}
private
String
getTypeName
(
String
type
){
//
private String getTypeName(String type){
String
name
;
//
String name;
switch
(
type
)
{
//
switch (type) {
case
"digital"
:
//
case "digital":
name
=
"数码"
;
//
name = "数码";
break
;
//
break;
case
"focus"
:
//
case "focus":
name
=
"国际"
;
//
name = "国际";
break
;
//
break;
case
"depth"
:
//
case "depth":
name
=
"时事"
;
//
name = "时事";
break
;
//
break;
default
:
//
default:
name
=
""
;
//
name = "";
}
//
}
return
name
;
//
return name;
}
//
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
View file @
f671bae7
...
@@ -39,20 +39,20 @@ public class ZhihuHotSearchRun extends Thread{
...
@@ -39,20 +39,20 @@ public class ZhihuHotSearchRun extends Thread{
private
void
getHotList
()
{
private
void
getHotList
()
{
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
//
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
//
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
log
.
info
(
"知乎话题采集开始...,当前线程名字:{}"
,
Thread
.
currentThread
().
getName
());
//
log.info("知乎话题采集开始...,当前线程名字:{}", Thread.currentThread().getName());
// List<HotSearchList> list = ZhihuHotSearchCrawler.getZhihuHotList();
//
//
List<HotSearchList> list = ZhihuHotSearchCrawler.getZhihuHotList();
List
<
HotSearchList
>
list
=
ZhihuHotSearchCrawler
.
getMobileZhihuHotList
();
//
List<HotSearchList> list = ZhihuHotSearchCrawler.getMobileZhihuHotList();
log
.
info
(
"{}, 知乎此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
//
log.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if
(
list
==
null
||
list
.
size
()
==
0
){
//
if(list == null || list.size() == 0){
TipsUtils
.
sendTips
(
"知乎热搜"
,
new
Date
());
//
TipsUtils.sendTips("知乎热搜",new Date());
}
else
{
//
}else {
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
//
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO
.
addHotSearchList
(
data
);
//
hotSearchDAO.addHotSearchList(data);
TipsUtils
.
recoveryTips
(
"知乎热搜"
,
new
Date
());
//
TipsUtils.recoveryTips("知乎热搜",new Date());
}
//
}
log
.
info
(
"知乎话题采集结束........"
);
//
log.info("知乎话题采集结束........");
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuTopSearchRun.java
View file @
f671bae7
...
@@ -32,18 +32,18 @@ public class ZhihuTopSearchRun extends Thread {
...
@@ -32,18 +32,18 @@ public class ZhihuTopSearchRun extends Thread {
}
}
public
void
getHotList
(){
public
void
getHotList
(){
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
//
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
//
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
log
.
info
(
"知乎热搜采集开始...,当前线程名字:{}"
,
Thread
.
currentThread
().
getName
());
//
log.info("知乎热搜采集开始...,当前线程名字:{}", Thread.currentThread().getName());
List
<
HotSearchList
>
list
=
ZhihuTopicSearchCrawler
.
getZhihuTopicSearch
();
//
List<HotSearchList> list = ZhihuTopicSearchCrawler.getZhihuTopicSearch();
log
.
info
(
"{}, 知乎热搜此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
//
log.info("{}, 知乎热搜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if
(
list
==
null
||
list
.
size
()
==
0
){
//
if(list == null || list.size() == 0){
TipsUtils
.
sendTips
(
"知乎热搜榜单"
,
new
Date
());
//
TipsUtils.sendTips("知乎热搜榜单",new Date());
}
else
{
//
}else {
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
//
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO
.
addHotSearchList
(
data
);
//
hotSearchDAO.addHotSearchList(data);
TipsUtils
.
recoveryTips
(
"知乎热搜榜单"
,
new
Date
());
//
TipsUtils.recoveryTips("知乎热搜榜单",new Date());
}
//
}
log
.
info
(
"知乎热搜话题采集结束........"
);
//
log.info("知乎热搜话题采集结束........");
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
0 → 100644
View file @
f671bae7
package
com
.
zhiwei
.
searchhotcrawler
.
timer
.
quartz
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
com.zhiwei.searchhotcrawler.crawler.*
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO
;
import
com.zhiwei.searchhotcrawler.util.DateUtils
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
org.bson.Document
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.springframework.scheduling.annotation.Async
;
import
org.springframework.scheduling.annotation.EnableAsync
;
import
org.springframework.scheduling.annotation.EnableScheduling
;
import
org.springframework.scheduling.annotation.Scheduled
;
import
org.springframework.stereotype.Component
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
/**
 * Spring-scheduled entry points that collect the hot-search lists of the
 * supported sites.
 *
 * <p>Every job is annotated with {@code @Async("myScheduler")} and a
 * {@code @Scheduled} cron expression. Most jobs take a timestamp with the
 * millisecond field cleared, pass it to the site's crawler and hand the
 * returned list to {@link TipsUtils#addHotList}.
 */
@Component
@EnableScheduling
@EnableAsync
public class GatherTimer {

    private static final Logger logger = LoggerFactory.getLogger(GatherTimer.class);

    /** Zhihu "digital" sub-category key. */
    private static final String DIGITAL = "digital";

    /** Zhihu "international focus" sub-category key. */
    private static final String FOCUS = "focus";

    /** Zhihu "current affairs" sub-category key. */
    private static final String DEPTH = "depth";

    /**
     * Collects the Weibo hot-search list.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "0 * * * * ? ")
    public void crawlerWeiBo() {
        logger.info("微博热搜开始采集...");
        Date date = DateUtils.getMillSecondTime(new Date());
        List<HotSearchList> weiboList = WeiboHotSearchCrawler.weiboHotSearchByPhone(date);
        logger.info("{}, 微博此轮采集到的数据量为:{}", new Date(), sizeOf(weiboList));
        TipsUtils.addHotList(HotSearchType.微博热搜.name(), weiboList);
        logger.info("微博热搜采集结束...");
    }

    /**
     * Collects the Toutiao hot-search list.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "0 * * * * ? ")
    public void crawlerTouTiao() {
        logger.info("今日头条热搜开始采集...");
        Date date = DateUtils.getMillSecondTime(new Date());
        List<HotSearchList> toutiaoList = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone(date);
        logger.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), sizeOf(toutiaoList));
        TipsUtils.addHotList(HotSearchType.今日头条热搜.name(), toutiaoList);
        logger.info("今日头条热搜采集结束...");
    }

    /**
     * Collects the Baidu hot-search list.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "0 * * * * ? ")
    public void crawlerBaiDu() {
        logger.info("百度热搜开始采集...");
        Date date = DateUtils.getMillSecondTime(new Date());
        List<HotSearchList> baiduList = BaiDuHotSearchCrawler.baiduHotSearch(date);
        logger.info("{}, 百度热搜此轮采集到的数据量为:{}", new Date(), sizeOf(baiduList));
        TipsUtils.addHotList(HotSearchType.百度热搜.name(), baiduList);
        logger.info("百度热搜采集结束...");
    }

    /**
     * Collects the Douyin hot-search list.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "0 * * * * ? ")
    public void crawlerDouYin() {
        logger.info("抖音热搜开始采集...");
        Date date = DateUtils.getMillSecondTime(new Date());
        List<HotSearchList> douyinList = DouyinHotSearchCrawler.getMobileDouyinHotList(date);
        logger.info("{}, 抖音热搜此轮采集到的数据量为:{}", new Date(), sizeOf(douyinList));
        TipsUtils.addHotList(HotSearchType.抖音热搜.name(), douyinList);
        logger.info("抖音热搜采集结束...");
    }

    /**
     * Refreshes the stored URL of every entry in
     * {@code DouyinHotSearchCrawler.list}.
     *
     * <p>For each entry a URL is requested for its name; when one is returned,
     * a document holding {@code id} ({@code name + "_" + type}) and
     * {@code url} is passed to {@link HotSearchCacheDAO#updateDouyinUrl}.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "0 0/5 * * * ? ")
    public void updateDouYinUrl() {
        logger.info("抖音链接更新开始...");
        HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
        List<HotSearchList> douyinList = DouyinHotSearchCrawler.list;
        if (douyinList == null || douyinList.isEmpty()) {
            logger.info("抖音链接更新失败,抖音热搜列表获取为空。");
            return;
        }
        // Index-based walk kept on purpose: the list is a static field of the
        // crawler, so an iterator could fail if it is modified meanwhile.
        for (int i = 0; i < douyinList.size(); i++) {
            String name = douyinList.get(i).getName();
            String id = name + "_" + douyinList.get(i).getType();
            // NOTE(review): the hot word is appended without URL encoding;
            // confirm getDouyinUrl encodes it before issuing the request.
            String url = DouyinHotSearchCrawler.getDouyinUrl(
                    "https://aweme-hl.snssdk.com/aweme/v1/hot/search/video/list/?hotword=" + name);
            if (url != null) {
                Document document = new Document();
                document.put("id", id);
                document.put("url", url);
                hotSearchCacheDAO.updateDouyinUrl(document);
            }
        }
        logger.info("抖音链接更新结束");
    }

    /**
     * Collects the Zhihu hot list.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "0 * * * * ? ")
    public void crawlerZhihu() {
        logger.info("知乎热搜开始采集...");
        Date date = DateUtils.getMillSecondTime(new Date());
        List<HotSearchList> zhihuList = ZhihuHotSearchCrawler.getMobileZhihuHotList(date);
        logger.info("{}, 知乎热搜此轮采集到的数据量为:{}", new Date(), sizeOf(zhihuList));
        TipsUtils.addHotList(HotSearchType.知乎热搜.name(), zhihuList);
        logger.info("知乎热搜采集结束...");
    }

    /**
     * Collects the Sogou WeChat hot-search list.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "0 * * * * ? ")
    public void crawlerWeChat() {
        logger.info("搜狗微信热搜开始采集...");
        Date date = DateUtils.getMillSecondTime(new Date());
        List<HotSearchList> list = SougoHotSearchCrawler.sougoHotSearch(date);
        logger.info("{}, 搜狗微信热搜此轮采集到的数据量为:{}", new Date(), sizeOf(list));
        TipsUtils.addHotList(HotSearchType.搜狗微信热搜.name(), list);
        logger.info("搜狗微信热搜采集结束...");
    }

    /**
     * Collects the Weibo topic list.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "0 * * * * ? ")
    public void crawlerWeiBoTopic() {
        logger.info("微博话题开始采集...");
        Date date = DateUtils.getMillSecondTime(new Date());
        List<HotSearchList> list = WeiboTopicCrawler.startCrawlerByPhone(date);
        logger.info("{}, 微博话题此轮采集到的数据量为:{}", new Date(), sizeOf(list));
        TipsUtils.addHotList(HotSearchType.微博话题.name(), list);
        logger.info("微博话题采集结束...");
    }

    /**
     * Collects the Tencent News hot list.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "10 * * * * ? ")
    public void crawlerTengXun() {
        Date date = DateUtils.getMillSecondTime(new Date());
        List<HotSearchList> list = TengXunCrawler.getTengXunHotList(date);
        TipsUtils.addHotList(HotSearchType.腾讯新闻.name(), list);
    }

    /**
     * Collects the Sina hot-spot list.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "10 * * * * ? ")
    public void crawlerXinLangHotSpot() {
        Date date = DateUtils.getMillSecondTime(new Date());
        List<HotSearchList> list = XinLangHotSearchCrawler.getXinLangHotSpot(date);
        TipsUtils.addHotList(HotSearchType.新浪热点.name(), list);
    }

    /**
     * Collects the Sina hot-ranking list.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "10 * * * * ? ")
    public void crawlerXinLangHotSearch() {
        Date date = DateUtils.getMillSecondTime(new Date());
        List<HotSearchList> list = XinLangHotSearchCrawler.getXinLangHotSearch(date);
        TipsUtils.addHotList(HotSearchType.新浪热榜.name(), list);
    }

    /**
     * Collects the NetEase News hot-ranking list.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "10 * * * * ? ")
    public void crawlerWangYiHotSearch() {
        Date date = DateUtils.getMillSecondTime(new Date());
        List<HotSearchList> list = WangYiHotSearchCrawler.getWangYiHotSearch(date);
        TipsUtils.addHotList(HotSearchType.网易热榜.name(), list);
    }

    /**
     * Collects the NetEase News hot-comment list.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "10 * * * * ? ")
    public void crawlerWangYiHotComment() {
        Date date = DateUtils.getMillSecondTime(new Date());
        List<HotSearchList> list = WangYiHotSearchCrawler.getWangYicomment(date);
        TipsUtils.addHotList(HotSearchType.网易跟帖热议.name(), list);
    }

    /**
     * Collects the Ifeng News hot-ranking list.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "10 * * * * ? ")
    public void crawlerFengHuangHotData() {
        Date date = DateUtils.getMillSecondTime(new Date());
        // NOTE(review): the result of getFengHuangHotData is stored under the
        // hot-ranking type here; confirm that pairing against the crawler.
        List<HotSearchList> list = FengHuangSearchCrawler.getFengHuangHotData(date);
        TipsUtils.addHotList(HotSearchType.凤凰新闻热榜.name(), list);
    }

    /**
     * Collects the Ifeng News hot-search list.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "10 * * * * ? ")
    public void crawlerFengHuangHotSearch() {
        Date date = DateUtils.getMillSecondTime(new Date());
        List<HotSearchList> list = FengHuangSearchCrawler.getFengHuangHotSearch(date);
        TipsUtils.addHotList(HotSearchType.凤凰新闻热搜.name(), list);
    }

    /**
     * Collects the Sohu topic list.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "20 * * * * ? ")
    public void crawlerSouHuTopic() {
        Date date = DateUtils.getMillSecondTime(new Date());
        List<HotSearchList> list = SouhuTopicCrawler.getSouhuTopic(date);
        TipsUtils.addHotList(HotSearchType.搜狐话题.name(), list);
    }

    /**
     * Collects the Zhihu hot-search topic list.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "20 * * * * ? ")
    public void crawlerZhihuHotTopic() {
        logger.info("知乎热搜话题开始采集...");
        Date date = DateUtils.getMillSecondTime(new Date());
        List<HotSearchList> list = ZhihuTopicSearchCrawler.getZhihuTopicSearch(date);
        logger.info("{}, 知乎热搜话题此轮采集到的数据量为:{}", new Date(), sizeOf(list));
        TipsUtils.addHotList(HotSearchType.知乎热搜榜单.name(), list);
        logger.info("知乎热搜话题采集结束...");
    }

    /**
     * Collects the Zhihu hot list of the "digital" sub-category.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "20 * * * * ? ")
    public void crawlerZhiHuDigital() {
        this.crawlerZhiHuChild(DIGITAL);
    }

    /**
     * Collects the Zhihu hot list of the "international focus" sub-category.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "20 * * * * ? ")
    public void crawlerZhiHuFocus() {
        this.crawlerZhiHuChild(FOCUS);
    }

    /**
     * Collects the Zhihu hot list of the "current affairs" sub-category.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "20 * * * * ? ")
    public void crawlerZhiHuDepth() {
        this.crawlerZhiHuChild(DEPTH);
    }

    /**
     * Collects the Weibo super-topic list and stores it through
     * {@link WeiboSuperTopicDAO#addTopicList}.
     *
     * <p>When the crawler returns {@code null} the run is logged and
     * abandoned; nothing is written.
     */
    @Async(value = "myScheduler")
    @Scheduled(cron = "0 0 0/3 * * ? ")
    public void crawlerWeiBoSuperTopic() {
        logger.info("微博超话采集开始........");
        WeiboSuperTopicDAO weiboTopicDAO = new WeiboSuperTopicDAO();
        List<WeiboSuperTopic> list = WeiboSuperTopicCrawler.startCrawler();
        logger.info("{}, 微博超话此轮采集到的数据量为:{}", new Date(), sizeOf(list));
        if (list == null) {
            // The size log above already tolerates null; iterating it would
            // throw a NullPointerException and kill the scheduled run.
            logger.warn("微博超话采集结果为空,跳过入库");
            return;
        }
        List<Document> data = new ArrayList<>();
        for (WeiboSuperTopic topic : list) {
            logger.info("topic::::{}", topic);
            Document doc = new Document();
            doc.put("_id", topic.getId());
            doc.put("name", topic.getTopicName());
            doc.put("rank", topic.getRank());
            doc.put("score_num", topic.getScore());
            doc.put("fensi_num", topic.getFensi());
            doc.put("post_num", topic.getPostNum());
            doc.put("type", topic.getType());
            doc.put("day", topic.getDay());
            doc.put("time", topic.getTime());
            doc.put("url", topic.getUrl());
            data.add(doc);
        }
        weiboTopicDAO.addTopicList(data);
        logger.info("微博超话采集结束........");
    }

    /**
     * Shared implementation of the three Zhihu sub-category jobs.
     *
     * @param type sub-category key, one of {@code digital}, {@code focus},
     *             {@code depth}
     */
    private void crawlerZhiHuChild(String type) {
        Date date = DateUtils.getMillSecondTime(new Date());
        String name = this.getTypeName(type);
        logger.info("知乎{}话题热榜采集开始...", name);
        List<HotSearchList> list = ZhihuChildHotSearchCrawler.getZhihuTopicSearch(type, name, date);
        logger.info("{}, 知乎{}话题此轮采集到的数据量为:{}", new Date(), name, sizeOf(list));
        // NOTE(review): the list is registered under the bare display name,
        // whereas every other job passes a HotSearchType name; confirm the
        // key TipsUtils.addHotList expects for these sub-categories.
        TipsUtils.addHotList(name, list);
        logger.info("知乎{}话题热榜采集结束...", name);
    }

    /**
     * Maps a Zhihu sub-category key to its display name.
     *
     * @param type sub-category key
     * @return the display name, or an empty string for an unknown key
     */
    private String getTypeName(String type) {
        switch (type) {
            case "digital":
                return "数码";
            case "focus":
                return "国际";
            case "depth":
                return "时事";
            default:
                return "";
        }
    }

    /**
     * Returns the number of elements in {@code list}, treating {@code null}
     * as empty.
     */
    private static int sizeOf(List<?> list) {
        return list != null ? list.size() : 0;
    }
}
src/main/java/com/zhiwei/searchhotcrawler/util/DateUtils.java
0 → 100644
View file @
f671bae7
package
com
.
zhiwei
.
searchhotcrawler
.
util
;
import
java.text.SimpleDateFormat
;
import
java.util.ArrayList
;
import
java.util.Calendar
;
import
java.util.Date
;
import
java.util.List
;
public
class
DateUtils
{
/**
* 取得指定月份后的时间
* @param date
* @return
*/
public
static
Date
getMonthByMonth
(
Date
date
,
Integer
month
){
Calendar
calendar
=
Calendar
.
getInstance
();
calendar
.
setTime
(
date
);
calendar
.
add
(
Calendar
.
MONTH
,
month
);
return
calendar
.
getTime
();
}
/**
* 获取下一个星期(七天之后)的时间
* @param date
* @return
*/
public
static
Date
getNextWeek
(
Date
date
){
Calendar
calendar
=
Calendar
.
getInstance
();
calendar
.
setTime
(
date
);
calendar
.
add
(
Calendar
.
WEEK_OF_MONTH
,
1
);
return
calendar
.
getTime
();
}
/**
* 获取规定小时前后的时间
* @param date
* @param hour
* @return
*/
public
static
Date
getDateByHour
(
Date
date
,
Integer
hour
){
Calendar
calendar
=
Calendar
.
getInstance
();
calendar
.
setTime
(
date
);
calendar
.
add
(
Calendar
.
HOUR
,
hour
);
return
calendar
.
getTime
();
}
/**
* 获取规定天数前后的时间
* @param date
* @param days
* @return
*/
public
static
Date
getDateByDays
(
Date
date
,
Integer
days
){
Calendar
calendar
=
Calendar
.
getInstance
();
calendar
.
setTime
(
date
);
calendar
.
add
(
Calendar
.
DATE
,
days
);
return
calendar
.
getTime
();
}
/**
* 获取规定分钟前后的时间
* @param date
* @param minutes
* @return
*/
public
static
Date
getDateByMinutes
(
Date
date
,
Integer
minutes
){
Calendar
calendar
=
Calendar
.
getInstance
();
calendar
.
setTime
(
date
);
calendar
.
add
(
Calendar
.
MINUTE
,
minutes
);
return
calendar
.
getTime
();
}
/**
* 返回下一年的时间
* @param date
* @return
*/
public
static
Date
getNextYear
(
Date
date
){
Calendar
calendar
=
Calendar
.
getInstance
();
calendar
.
setTime
(
date
);
calendar
.
add
(
Calendar
.
YEAR
,
1
);
return
calendar
.
getTime
();
}
/**
* 返回上一年的时间
* @param date
* @return
*/
public
static
Date
getLastYear
(
Date
date
){
Calendar
calendar
=
Calendar
.
getInstance
();
calendar
.
setTime
(
date
);
calendar
.
add
(
Calendar
.
YEAR
,-
1
);
return
calendar
.
getTime
();
}
/**
* 时间精确到小时
* @param date
* @return
*/
public
static
Date
getDateToAccurateHour
(
Date
date
){
Calendar
calendar
=
Calendar
.
getInstance
();
calendar
.
setTime
(
date
);
calendar
.
set
(
Calendar
.
MINUTE
,
0
);
calendar
.
set
(
Calendar
.
SECOND
,
0
);
return
calendar
.
getTime
();
}
/**
* 返回年
* @param date
* @return
*/
public
static
String
getYearFormat
(
Date
date
){
SimpleDateFormat
format
=
new
SimpleDateFormat
(
"yyyy"
);
return
format
.
format
(
date
);
}
/**
* 返回年月
* @param date
* @return
*/
public
static
String
getMonthFormat
(
Date
date
){
SimpleDateFormat
format
=
new
SimpleDateFormat
(
"yyyy年MM月"
);
return
format
.
format
(
date
);
}
/**
* 返回年月日
* @param date
* @return
*/
public
static
String
getDayFormat
(
Date
date
){
SimpleDateFormat
format
=
new
SimpleDateFormat
(
"yyyy-MM-dd"
);
return
format
.
format
(
date
);
}
/**
* 返回年月日时
* @param date
* @return
*/
public
static
String
getHourFormat
(
Date
date
){
SimpleDateFormat
format
=
new
SimpleDateFormat
(
"yyyy-MM-dd HH"
);
return
format
.
format
(
date
);
}
/**
* 返回年月日时分秒
* @param date
* @return
*/
public
static
String
getTimeFormat
(
Date
date
){
SimpleDateFormat
format
=
new
SimpleDateFormat
(
"yyyy-MM-dd HH:mm:ss"
);
return
format
.
format
(
date
);
}
/**
 * Builds the formatted dates for the most recent week.
 *
 * <p>The list starts with {@code date} itself and holds seven entries, each
 * produced by {@code getDayFormat}. Every following entry is derived from the
 * previous one through {@code getDateByDays(previous, -1)} (presumably one
 * day earlier; confirm against that helper).
 *
 * @param date the first (most recent) day of the list
 * @return seven day strings, starting at {@code date}
 */
public static List<String> getWeekTimeFormat(Date date) {
    final int daysInWeek = 7;
    List<String> formattedDays = new ArrayList<>();
    // Walk with a local cursor instead of reassigning the parameter.
    Date cursor = date;
    while (formattedDays.size() < daysInWeek) {
        formattedDays.add(getDayFormat(cursor));
        cursor = getDateByDays(cursor, -1);
    }
    return formattedDays;
}
/**
 * Clears the millisecond part of a point in time.
 *
 * @param date the instant to truncate; it is read but not modified
 * @return a new {@code Date} equal to {@code date} with its millisecond
 *         field set to zero
 */
public static Date getMillSecondTime(Date date) {
    final Calendar wholeSecond = Calendar.getInstance();
    wholeSecond.setTime(date);
    wholeSecond.set(Calendar.MILLISECOND, 0);
    return wholeSecond.getTime();
}
}
src/main/resources/applicationContext.xml
0 → 100644
View file @
f671bae7
<?xml version="1.0" encoding="UTF-8"?>
<!-- Spring application context: annotation processing, annotation-driven scheduling and component scanning. -->
<beans
xmlns=
"http://www.springframework.org/schema/beans"
xmlns:xsi=
"http://www.w3.org/2001/XMLSchema-instance"
xmlns:context=
"http://www.springframework.org/schema/context"
xmlns:aop=
"http://www.springframework.org/schema/aop"
xmlns:tx=
"http://www.springframework.org/schema/tx"
xmlns:task=
"http://www.springframework.org/schema/task"
xsi:schemaLocation=
"http://www.springframework.org/schema/context
http://www.springframework.org/schema/context/spring-context-4.2.xsd
http://www.springframework.org/schema/beans
http://www.springframework.org/schema/beans/spring-beans-4.2.xsd
http://www.springframework.org/schema/tx
http://www.springframework.org/schema/tx/spring-tx-4.2.xsd
http://www.springframework.org/schema/aop
http://www.springframework.org/schema/aop/spring-aop-4.2.xsd
http://www.springframework.org/schema/task
http://www.springframework.org/schema/task/spring-task.xsd"
>
<!-- Enable the annotation processors -->
<context:annotation-config
/>
<!-- Annotation-based timers; they use the "myScheduler" scheduler declared below (pool-size 50) -->
<task:annotation-driven
scheduler=
"myScheduler"
/>
<task:scheduler
id=
"myScheduler"
pool-size=
"50"
/>
<!-- Enable component auto-scanning; the package to scan is given by the base-package attribute -->
<context:component-scan
base-package=
"com.zhiwei.searchhotcrawler"
/>
</beans>
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment