Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
cb6bcd76
Commit
cb6bcd76
authored
Mar 06, 2020
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加微博话题采集,并添加lombok
parent
a9966f9d
Hide whitespace changes
Inline
Side-by-side
Showing
30 changed files
with
2012 additions
and
1780 deletions
+2012
-1780
pom.xml
+120
-114
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
+76
-134
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+10
-9
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboSuperTopic.java
+90
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboTopic.java
+0
-158
src/main/java/com/zhiwei/searchhotcrawler/cache/CacheListener.java
+32
-29
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
+114
-113
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
+75
-74
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
+96
-96
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+11
-10
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboSuperTopicCrawler.java
+164
-163
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
+219
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
+114
-114
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchListDAO.java
+153
-137
src/main/java/com/zhiwei/searchhotcrawler/dao/WechatUserDao.java
+70
-69
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiboSuperTopicDAO.java
+85
-71
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
+7
-12
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchListTest.java
+142
-107
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
+6
-5
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
+5
-5
src/main/java/com/zhiwei/searchhotcrawler/timer/SendWeiboHotSearchRun.java
+125
-124
src/main/java/com/zhiwei/searchhotcrawler/timer/SendZhihuHotSearchRun.java
+127
-126
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
+6
-5
src/main/java/com/zhiwei/searchhotcrawler/timer/UpdateWechatUserRun.java
+50
-49
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboHotSearchRun.java
+6
-6
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboSuperTopicRun.java
+63
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboTopicRun.java
+25
-29
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
+7
-9
src/main/resources/db.properties
+12
-11
src/main/resources/proxyip.properties
+2
-1
No files found.
pom.xml
View file @
cb6bcd76
<project
xmlns=
"http://maven.apache.org/POM/4.0.0"
xmlns:xsi=
"http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation=
"http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
searchhotcrawler
</artifactId>
<version>
0.0.6-SNAPSHOT
</version>
<name>
各平台热搜榜单采集程序
</name>
<description>
各平台热搜榜单采集程序
目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序
</description>
<properties>
<project.build.sourceEncoding>
UTF-8
</project.build.sourceEncoding>
<project.reporting.outputEncoding>
UTF-8
</project.reporting.outputEncoding>
</properties>
<developers>
<developer>
<id>
Bewilder
</id>
<name>
zhiwei zhang
</name>
<email>
zhangzhiwei@zhiweidata.com
</email>
</developer>
</developers>
<dependencies>
<!-- 数据解析jar -->
<dependency>
<groupId>
org.mongodb
</groupId>
<artifactId>
mongo-java-driver
</artifactId>
<version>
3.6.3
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei
</groupId>
<artifactId>
sendmail
</artifactId>
<version>
0.0.1-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.1.5-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.5.5.6-SNAPSHOT
</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>
org.apache.maven.plugins
</groupId>
<artifactId>
maven-shade-plugin
</artifactId>
<version>
2.4.2
</version>
<executions>
<execution>
<phase>
package
</phase>
<goals>
<goal>
shade
</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>
*:*
</artifact>
<excludes>
<exclude>
META-INF/*.SF
</exclude>
<exclude>
META-INF/*.DSA
</exclude>
<exclude>
META-INF/*.RSA
</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer
implementation=
"org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"
>
<mainClass>
com.zhiwei.searchhotcrawler.run.HotSearchRun
</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>
maven-source-plugin
</artifactId>
<version>
2.4
</version>
<configuration>
<attach>
true
</attach>
</configuration>
<executions>
<execution>
<phase>
compile
</phase>
<goals>
<goal>
jar
</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Fixes garbled Chinese characters in the console output when running "mvn test" -->
<plugin>
<groupId>
org.apache.maven.plugins
</groupId>
<artifactId>
maven-surefire-plugin
</artifactId>
<version>
2.19.1
</version>
<configuration>
<forkMode>
once
</forkMode>
<argLine>
-Dfile.encoding=UTF-8
</argLine>
<skipTests>
true
</skipTests>
</configuration>
</plugin>
</plugins>
</build>
<project
xmlns=
"http://maven.apache.org/POM/4.0.0"
xmlns:xsi=
"http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation=
"http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
searchhotcrawler
</artifactId>
<version>
0.0.6-SNAPSHOT
</version>
<name>
各平台热搜榜单采集程序
</name>
<description>
各平台热搜榜单采集程序
目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序
</description>
<properties>
<project.build.sourceEncoding>
UTF-8
</project.build.sourceEncoding>
<project.reporting.outputEncoding>
UTF-8
</project.reporting.outputEncoding>
</properties>
<developers>
<developer>
<id>
Bewilder
</id>
<name>
zhiwei zhang
</name>
<email>
zhangzhiwei@zhiweidata.com
</email>
</developer>
</developers>
<dependencies>
<!-- 数据解析jar -->
<dependency>
<groupId>
org.mongodb
</groupId>
<artifactId>
mongo-java-driver
</artifactId>
<version>
3.6.3
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei
</groupId>
<artifactId>
sendmail
</artifactId>
<version>
0.0.1-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.1.6-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.6.0.4-RELEASE
</version>
</dependency>
<dependency>
<groupId>
org.projectlombok
</groupId>
<artifactId>
lombok
</artifactId>
<version>
1.18.8
</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>
org.apache.maven.plugins
</groupId>
<artifactId>
maven-shade-plugin
</artifactId>
<version>
2.4.2
</version>
<executions>
<execution>
<phase>
package
</phase>
<goals>
<goal>
shade
</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>
*:*
</artifact>
<excludes>
<exclude>
META-INF/*.SF
</exclude>
<exclude>
META-INF/*.DSA
</exclude>
<exclude>
META-INF/*.RSA
</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer
implementation=
"org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"
>
<mainClass>
com.zhiwei.searchhotcrawler.run.HotSearchRun
</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>
maven-source-plugin
</artifactId>
<version>
2.4
</version>
<configuration>
<attach>
true
</attach>
</configuration>
<executions>
<execution>
<phase>
compile
</phase>
<goals>
<goal>
jar
</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Fixes garbled Chinese characters in the console output when running "mvn test" -->
<plugin>
<groupId>
org.apache.maven.plugins
</groupId>
<artifactId>
maven-surefire-plugin
</artifactId>
<version>
2.19.1
</version>
<configuration>
<forkMode>
once
</forkMode>
<argLine>
-Dfile.encoding=UTF-8
</argLine>
<skipTests>
true
</skipTests>
</configuration>
</plugin>
</plugins>
</build>
</project>
\ No newline at end of file
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
View file @
cb6bcd76
...
...
@@ -10,36 +10,80 @@ import java.io.Serializable;
import
java.util.Date
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
lombok.Data
;
import
lombok.ToString
;
@Data
@ToString
public
class
HotSearchList
implements
Serializable
{
private
static
final
long
serialVersionUID
=
2076919584659821600L
;
private
String
id
;
//主键
private
String
url
;
//消息链接
private
String
name
;
//热搜关键词
private
Integer
count
;
//时时热搜量
private
Boolean
hot
;
//状态(true 为热搜; false为时时上升)
private
String
day
;
//天
private
Date
time
;
//时间
private
Integer
changeCount
;
//据上分钟变化量
private
Integer
rank
;
//排名
private
String
type
;
//分类
private
String
icon
;
//热搜类型
/**
* 主键
*/
private
String
id
;
/**
* 消息链接
*/
private
String
url
;
/**
* 热搜关键词
*/
private
String
name
;
/**
* 热搜或话题导语
*/
private
String
topicLead
;
/**
* 时时热搜量
*/
private
Integer
count
;
/**
* 状态(true 为热搜; false为时时上升)
*/
private
Boolean
hot
;
/**
* 天
*/
private
String
day
;
/**
* 时间
*/
private
Date
time
;
/**
* 据上分钟变化量
*/
private
Integer
changeCount
;
/**
* 排名
*/
private
Integer
rank
;
/**
* 分类
*/
private
String
type
;
/**
* 热搜类型
*/
private
String
icon
;
/**
* 话题讨论量
*/
private
Integer
commentCount
;
public
HotSearchList
(){}
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
String
icon
){
...
...
@@ -67,122 +111,20 @@ public class HotSearchList implements Serializable{
this
.
day
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
this
.
type
=
type
;
}
@Override
public
String
toString
(){
return
"new HotSearchList["
+
"id = "
+
id
+
", url = "
+
url
+
", name = "
+
name
+
", count = "
+
count
+
", time = "
+
time
+
", hot = "
+
hot
+
", rank = "
+
rank
+
", day = "
+
day
+
", changeCount = "
+
changeCount
+
", type = "
+
type
+
", icon = "
+
icon
+
"]"
;
}
public
String
getId
()
{
return
id
;
}
public
void
setId
(
String
id
)
{
this
.
id
=
id
;
}
public
String
getUrl
()
{
return
url
;
}
public
void
setUrl
(
String
url
)
{
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Integer
rank
,
String
type
,
Integer
commentCount
,
String
topicLead
){
this
.
id
=
name
+
"_"
+
new
Date
().
getTime
();
this
.
url
=
url
;
}
public
String
getName
()
{
return
name
;
}
public
void
setName
(
String
name
)
{
this
.
name
=
name
;
}
public
Integer
getCount
()
{
return
count
;
}
public
void
setCount
(
Integer
count
)
{
this
.
count
=
count
;
}
public
Date
getTime
()
{
return
time
;
}
public
void
setTime
(
Date
time
)
{
this
.
time
=
time
;
}
public
Integer
getChangeCount
()
{
return
changeCount
;
}
public
void
setChangeCount
(
Integer
changeCount
)
{
this
.
changeCount
=
changeCount
;
}
public
static
long
getSerialversionuid
()
{
return
serialVersionUID
;
}
public
Boolean
isHot
()
{
return
hot
;
}
public
void
setHot
(
Boolean
hot
)
{
this
.
hot
=
hot
;
}
public
Boolean
getHot
()
{
return
hot
;
}
public
String
getIcon
()
{
return
icon
;
}
public
void
setIcon
(
String
icon
)
{
this
.
icon
=
icon
;
}
public
String
getDay
()
{
return
day
;
}
public
void
setDay
(
String
day
)
{
this
.
day
=
day
;
}
public
Integer
getRank
()
{
return
rank
;
}
public
void
setRank
(
Integer
rank
)
{
this
.
hot
=
true
;
this
.
rank
=
rank
;
}
public
String
getType
()
{
return
type
;
}
public
void
setType
(
String
type
)
{
this
.
time
=
new
Date
();
this
.
day
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
this
.
type
=
type
;
this
.
commentCount
=
commentCount
;
this
.
topicLead
=
topicLead
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
bean
;
/**
 * Ranking-list sources known to the crawler.
 * The constant names are Chinese on purpose: crawler code passes
 * {@code name()} as the "type" string of a hot-search record, so
 * renaming a constant changes the value that is passed on.
 */
public
enum
HotSearchType
{
// Baidu hot search
百度热搜
,
// Weibo hot search
微博热搜
,
// Zhihu hot search
知乎热搜
,
// Douyin hot search
抖音热搜
,
// Sogou WeChat hot search
搜狗微信热搜
}
package
com
.
zhiwei
.
searchhotcrawler
.
bean
;
/**
 * Ranking-list sources known to the crawler.
 * The constant names are Chinese on purpose: crawler code passes
 * {@code name()} as the "type" string of a hot-search record, so
 * renaming a constant changes the value that is passed on.
 */
public
enum
HotSearchType
{
// Baidu hot search
百度热搜
,
// Weibo hot search
微博热搜
,
// Zhihu hot search
知乎热搜
,
// Douyin hot search
抖音热搜
,
// Sogou WeChat hot search
搜狗微信热搜
,
// Weibo topic
微博话题
}
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboSuperTopic.java
0 → 100644
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
bean
;
import
java.util.Date
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
lombok.Data
;
import
lombok.ToString
;
/**
 * Data bean describing one Weibo topic entry of a ranking list.
 *
 * <p>Getters, setters, {@code equals}/{@code hashCode} and {@code toString} are generated by
 * Lombok. Several fields are declared {@code public}; that visibility is kept as-is so code
 * that accesses them directly keeps compiling.
 *
 * @author Bewilder ZW
 */
@Data
@ToString
public class WeiboSuperTopic {

    /** Primary key, built by the constructor as {@code topicName_type_day}. */
    private String id;

    /** Link to the topic. */
    public String url;

    /** Topic name. */
    public String topicName;

    /** Position of the topic in the ranking. */
    public Integer rank;

    /** Influence score of the topic. */
    public String score;

    /** Number of followers of the topic. */
    public String fensi;

    /** Number of reads of the topic. */
    public String readNum;

    /** Number of posts in the topic. */
    public String postNum;

    /** Type of the ranking list the entry belongs to. */
    public String type;

    /** Creation day formatted as {@code yyyy-MM-dd}. */
    private String day;

    /** Creation timestamp. */
    private Date time;

    /** Creates an empty bean; all fields stay {@code null}. */
    public WeiboSuperTopic() {
    }

    /**
     * Creates an entry stamped with the current time.
     *
     * @param url       link to the topic
     * @param topicName topic name (also part of the generated id)
     * @param rank      position in the ranking
     * @param score     influence score
     * @param fensi     follower count
     * @param type      ranking list type (also part of the generated id)
     */
    public WeiboSuperTopic(String url, String topicName, Integer rank, String score, String fensi, String type) {
        // Read the clock exactly once: with two separate "new Date()" calls, time and
        // day (and therefore id) could land on different days around midnight.
        Date now = new Date();

        this.url = url;
        this.topicName = topicName;
        this.rank = rank;
        this.score = score;
        this.fensi = fensi;
        this.type = type;
        this.time = now;
        this.day = TimeParse.dateFormartString(now, "yyyy-MM-dd");
        this.id = topicName + "_" + type + "_" + this.day;
    }
}
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboTopic.java
deleted
100644 → 0
View file @
a9966f9d
package
com
.
zhiwei
.
searchhotcrawler
.
bean
;
import
java.util.Date
;
import
com.zhiwei.tools.timeparse.TimeParse
;
/**
 * Data bean describing one Weibo topic entry of a ranking list.
 *
 * <p>Several fields are declared {@code public}; that visibility is kept as-is so code that
 * accesses them directly keeps compiling.
 *
 * @author Bewilder ZW
 */
public class WeiboTopic {

    /** Primary key, built by the constructor as {@code topicName_type_day}. */
    private String id;

    /** Link to the topic. */
    public String url;

    /** Topic name. */
    public String topicName;

    /** Position of the topic in the ranking. */
    public Integer rank;

    /** Influence score of the topic. */
    public String score;

    /** Number of followers of the topic. */
    public String fensi;

    /** Number of reads of the topic. */
    public String readNum;

    /** Number of posts in the topic. */
    public String postNum;

    /** Type of the ranking list the entry belongs to. */
    public String type;

    /** Creation day formatted as {@code yyyy-MM-dd}. */
    private String day;

    /** Creation timestamp. */
    private Date time;

    /** Creates an empty bean; all fields stay {@code null}. */
    public WeiboTopic() {
    }

    /**
     * Creates an entry stamped with the current time.
     *
     * @param url       link to the topic
     * @param topicName topic name (also part of the generated id)
     * @param rank      position in the ranking
     * @param score     influence score
     * @param fensi     follower count
     * @param type      ranking list type (also part of the generated id)
     */
    public WeiboTopic(String url, String topicName, Integer rank, String score, String fensi, String type) {
        // Read the clock exactly once: with two separate "new Date()" calls, time and
        // day (and therefore id) could land on different days around midnight.
        Date now = new Date();

        this.url = url;
        this.topicName = topicName;
        this.rank = rank;
        this.score = score;
        this.fensi = fensi;
        this.type = type;
        this.time = now;
        this.day = TimeParse.dateFormartString(now, "yyyy-MM-dd");
        this.id = topicName + "_" + type + "_" + this.day;
    }

    /** Returns a debug representation; id, day and time are not part of it. */
    @Override
    public String toString() {
        return "new WeiboTopic["
                + "topicName = " + topicName
                + ", rank = " + rank
                + ", score = " + score
                + ", fensi = " + fensi
                + ", type = " + type
                + ", readNum = " + readNum
                + ", postNum = " + postNum
                + ", url = " + url
                + "]";
    }

    public String getUrl() {
        return url;
    }

    public String getTopicName() {
        return topicName;
    }

    public Integer getRank() {
        return rank;
    }

    public String getScore() {
        return score;
    }

    public String getFensi() {
        return fensi;
    }

    public String getReadNum() {
        return readNum;
    }

    public String getPostNum() {
        return postNum;
    }

    public String getType() {
        return type;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public void setTopicName(String topicName) {
        this.topicName = topicName;
    }

    public void setRank(Integer rank) {
        this.rank = rank;
    }

    public void setScore(String score) {
        this.score = score;
    }

    public void setFensi(String fensi) {
        this.fensi = fensi;
    }

    public void setReadNum(String readNum) {
        this.readNum = readNum;
    }

    public void setPostNum(String postNum) {
        this.postNum = postNum;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getId() {
        return id;
    }

    public String getDay() {
        return day;
    }

    public Date getTime() {
        return time;
    }

    public void setId(String id) {
        this.id = id;
    }

    public void setDay(String day) {
        this.day = day;
    }

    public void setTime(Date time) {
        this.time = time;
    }
}
src/main/java/com/zhiwei/searchhotcrawler/cache/CacheListener.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
cache
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Background sweeper that removes timed-out entries from {@code CacheManager}.
 */
public class CacheListener {

    // Visibility and instance scope are kept exactly as before so nothing in the package breaks.
    Logger logger = LoggerFactory.getLogger(CacheListener.class);

    /**
     * Starts a new thread that checks every cache key in an endless loop, clears the keys that
     * {@code CacheManager.isTimeOut} reports as expired, and then pauses via
     * {@code ZhiWeiTools.sleep(500)} before the next sweep.
     *
     * <p>Every call starts an additional thread; the thread is not a daemon and never terminates
     * on its own.
     */
    public void startListen() {
        Thread sweeper = new Thread(() -> {
            while (true) {
                try {
                    evictTimedOutEntries();
                } catch (RuntimeException e) {
                    // A failing sweep must not end the loop: without this catch a single
                    // exception would stop cache eviction for the rest of the JVM's life.
                    logger.error("缓存清理出现异常", e);
                }
                ZhiWeiTools.sleep(500);
            }
        }, "cache-listener");
        sweeper.start();
    }

    /** Runs one sweep: clears and logs every key that has timed out. */
    private void evictTimedOutEntries() {
        if (CacheManager.caches != null && CacheManager.caches.size() > 0) {
            for (String key : CacheManager.getAllKeys()) {
                if (CacheManager.isTimeOut(key)) {
                    CacheManager.clearByKey(key);
                    logger.info(key + "缓存被清除");
                }
            }
        }
    }
}
package
com
.
zhiwei
.
searchhotcrawler
.
cache
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
CacheListener
{
/**
* 开启缓存监听
*/
public
void
startListen
()
{
new
Thread
(){
public
void
run
()
{
while
(
true
)
{
if
(
CacheManager
.
caches
!=
null
&&
CacheManager
.
caches
.
size
()>
0
){
for
(
String
key
:
CacheManager
.
getAllKeys
())
{
if
(
CacheManager
.
isTimeOut
(
key
))
{
CacheManager
.
clearByKey
(
key
);
log
.
info
(
key
+
"缓存被清除"
);
}
}
}
ZhiWeiTools
.
sleep
(
500
);
}
}
}.
start
();
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Objects
;
import
org.apache.commons.lang3.StringUtils
;
import
org.
jsoup.Jsoup
;
import
org.jsoup.
nodes.Document
;
import
org.jsoup.
select.Elements
;
import
org.
slf4j.Logger
;
import
org.slf4j.Logger
Factory
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.
proxy.ProxyHolder
;
import
com.zhiwei.crawler.
utils.RequestUtils
;
import
com.zhiwei.
searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearch
Type
;
/**
* @ClassName:BaiDuHotSearch
* @
Description: TODO(百度风云榜热搜采集)
* @
author hero
* @
date 2019年7月10日 上午10:54:31
*
/
public
class
BaiDuHotSearchCrawler
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaiDuHotSearchCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: BaiDuHotSearchTest
* @
author hero
* @
Description: TODO(PC端百度风云榜采集)
* @
param 设定文件
* @return void 返回类型
*/
public
static
List
<
HotSearchList
>
baiduHotSearch
()
{
String
url
=
"http://top.baidu.com/buzz?b=1&fr=topindex"
;
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"mainBody"
))
{
return
ansysData
(
htmlBody
);
}
else
{
log
ger
.
info
(
"解析百度风云榜时出现解析错误,页面结构有问题"
);
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
e
);
}
return
Collections
.
emptyList
();
}
/**
* 解析数据
* @param htmlBody
* @return
*/
private
static
List
<
HotSearchList
>
ansysData
(
String
htmlBody
){
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
try
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Elements
elements
=
document
.
select
(
"table.list-table"
).
select
(
"tr"
);
if
(
Objects
.
nonNull
(
elements
)
&&
!
elements
.
isEmpty
())
{
elements
.
forEach
(
element
->
{
try
{
// 获取排名rank
String
rankStr
=
null
;
// 根据网页标签,给rankStr做判断
if
(!
element
.
select
(
"td.first"
).
select
(
"span.num-top"
).
isEmpty
())
{
rankStr
=
element
.
select
(
"td.first"
).
select
(
"span.num-top"
).
text
();
}
else
if
(!
element
.
select
(
"td.first"
).
select
(
"span.num-normal"
).
isEmpty
())
{
rankStr
=
element
.
select
(
"td.first"
).
select
(
"span.num-normal"
).
text
();
}
Integer
rank
=
null
;
// 判断rankStr是否为空
if
(
StringUtils
.
isNoneBlank
(
rankStr
))
{
rank
=
Integer
.
valueOf
(
rankStr
);
}
// 获取关键词(String)
String
kw
=
element
.
select
(
"td.keyword"
).
select
(
"a.list-title"
).
text
();
// logger.info("关键词:{}", kw);
// 获取关键词相关链接everurl(String)
String
everurl
=
element
.
select
(
"td.keyword"
).
select
(
"a.list-title"
).
attr
(
"href"
);
// 获取搜索指数count(int)
String
hot
=
null
;
// 判断热度值所在的规则是否为null
if
(!
element
.
select
(
"td.last"
).
select
(
"span.icon-fall"
).
isEmpty
())
{
hot
=
element
.
select
(
"td.last"
).
select
(
"span.icon-fall"
).
text
();
}
else
if
(!
element
.
select
(
"td.last"
).
select
(
"span.icon-rise"
).
isEmpty
())
{
hot
=
element
.
select
(
"td.last"
).
select
(
"span.icon-rise"
).
text
();
}
int
count
=
0
;
// 判断hot是否为空
if
(
StringUtils
.
isNotBlank
(
hot
))
{
count
=
Integer
.
valueOf
(
hot
);
}
if
(
Objects
.
nonNull
(
rank
))
{
HotSearchList
hotSearch
=
new
HotSearchList
(
everurl
,
kw
,
count
,
rank
,
HotSearchType
.
百度热搜
.
name
());
list
.
add
(
hotSearch
);
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析百度风云榜时出现解析错误"
,
e
);
}
});
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析百度风云榜时出现解析错误,数据不是json结构"
,
e
);
}
return
list
;
}
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Objects
;
import
lombok.extern.log4j.Log4j2
;
import
org.
apache.commons.lang3.StringUtils
;
import
org.jsoup.
Jsoup
;
import
org.jsoup.
nodes.Document
;
import
org.
jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.crawler.
core.HttpBoot
;
import
com.zhiwei.crawler.
proxy.ProxyHolder
;
import
com.zhiwei.
crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearch
List
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
/**
* @
ClassName:BaiDuHotSearch
* @
Description: TODO(百度风云榜热搜采集)
* @
author hero
*
@date 2019年7月10日 上午10:54:31
*/
@Log4j2
public
class
BaiDuHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
 * Downloads the PC version of the Baidu ranking page and turns it into hot-search entries.
 *
 * @author hero
 * @return the entries produced by {@code ansysData}; an empty, immutable list when the request
 *         throws or the page does not contain the {@code mainBody} marker
 */
public static List<HotSearchList> baiduHotSearch() {
    final String rankingUrl = "http://top.baidu.com/buzz?b=1&fr=topindex";
    List<HotSearchList> result = Collections.emptyList();
    try {
        String page = httpBoot
                .syncCall(RequestUtils.wrapGet(rankingUrl), ProxyHolder.NAT_HEAVY_PROXY)
                .body()
                .string();
        boolean hasExpectedLayout = page != null && page.contains("mainBody");
        if (!hasExpectedLayout) {
            log.info("解析百度风云榜时出现解析错误,页面结构有问题");
        } else {
            result = ansysData(page);
        }
    } catch (Exception e) {
        log.error("解析百度风云榜时出现解析错误,页面结构有问题", e);
    }
    return result;
}
/**
 * Extracts the hot-search entries from the Baidu ranking page.
 *
 * <p>Every {@code tr} of {@code table.list-table} is inspected. A row contributes an entry only
 * when it carries a rank; rows without one (presumably header or spacer rows) are skipped.
 * A row that fails to parse is logged and skipped without affecting the other rows.
 *
 * @param htmlBody HTML of the ranking page
 * @return the parsed entries in page order; empty when nothing could be parsed
 */
private static List<HotSearchList> ansysData(String htmlBody) {
    List<HotSearchList> list = new ArrayList<>();
    try {
        Elements rows = Jsoup.parse(htmlBody).select("table.list-table").select("tr");
        rows.forEach(row -> {
            try {
                // The rank is rendered either as "num-top" or as "num-normal".
                String rankStr = textOfFirstMatch(row.select("td.first"), "span.num-top", "span.num-normal");
                if (StringUtils.isBlank(rankStr)) {
                    // No rank means no entry; bail out before parsing anything else of the row.
                    return;
                }
                Integer rank = Integer.valueOf(rankStr);

                // Keyword and its link live in the same anchor.
                Elements title = row.select("td.keyword").select("a.list-title");
                String kw = title.text();
                String everurl = title.attr("href");

                // The search index is rendered with either a "fall" or a "rise" icon.
                String hot = textOfFirstMatch(row.select("td.last"), "span.icon-fall", "span.icon-rise");
                int count = 0;
                if (StringUtils.isNotBlank(hot)) {
                    count = Integer.valueOf(hot);
                }

                list.add(new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name()));
            } catch (Exception e) {
                log.error("解析百度风云榜时出现解析错误", e);
            }
        });
    } catch (Exception e) {
        // The input is HTML, so the former "not JSON" wording was misleading.
        log.error("解析百度风云榜时出现解析错误,HTML解析失败", e);
    }
    return list;
}

/**
 * Returns the text of the first selector that matches at least one element inside {@code cell},
 * or {@code null} when none of the selectors matches.
 */
private static String textOfFirstMatch(Elements cell, String... selectors) {
    for (String selector : selectors) {
        Elements hit = cell.select(selector);
        if (!hit.isEmpty()) {
            return hit.text();
        }
    }
    return null;
}
}
\ No newline at end of file
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.List
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
/**
 * Crawler for the Douyin hot-search ranking.
 *
 * @author win 10
 */
public class DouyinHotSearchCrawler {

    // Bound to this class; it used to log under ZhihuHotSearchCrawler by mistake.
    private static final Logger logger = LoggerFactory.getLogger(DouyinHotSearchCrawler.class);

    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /**
     * Fetches the mobile Douyin hot-search list and converts each {@code word_list} item into a
     * {@link HotSearchList}.
     *
     * <p>An item whose {@code position} or {@code hot_value} is missing or not an integer is
     * logged and skipped instead of aborting the whole list.
     *
     * @author hero
     * @return the parsed entries; {@code null} when the request fails with an
     *         {@link IOException} or the response does not contain {@code word_list}
     */
    public static List<HotSearchList> getMobileDouyinHotList() {
        List<HotSearchList> list = null;
        String url = "https://api.amemv.com/aweme/v1/hot/search/list/";
        try {
            String htmlBody = httpBoot
                    .syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY)
                    .body()
                    .string();
            if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("word_list")) {
                list = new ArrayList<>();
                JSONObject data = JSONObject.parseObject(htmlBody);
                JSONArray wordList = data.getJSONObject("data").getJSONArray("word_list");
                for (int i = 0; i < wordList.size(); i++) {
                    JSONObject wl = wordList.getJSONObject(i);
                    try {
                        // Rank of the keyword.
                        Integer position = Integer.valueOf(wl.getString("position"));
                        // The keyword itself.
                        String word = wl.getString("word");
                        // Heat value of the keyword.
                        Integer hotValue = Integer.valueOf(wl.getString("hot_value"));
                        list.add(new HotSearchList(null, word, hotValue, position, HotSearchType.抖音热搜.name()));
                    } catch (NumberFormatException e) {
                        // Integer.valueOf rejects both null and non-numeric text.
                        logger.warn("抖音热搜条目解析失败,已跳过: " + wl, e);
                    }
                }
            }
        } catch (IOException e) {
            // Logged as an error with the exception as throwable; the former debug call
            // hid the failure and left its "{}" placeholder unfilled.
            logger.error("获取抖音热搜榜时出现问题", e);
        }
        return list;
    }
}
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.List
;
import
lombok.extern.log4j.Log4j2
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
/**
* @className DouyinHotSearchCrawler
* @Description:抖音热搜榜采集程序
* @author win 10
* @date:2019年07月11日 上午10:26:21
*/
@Log4j2
public
class
DouyinHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: getMobileDouyinHotList
* @author hero
* @Description: 移动端抖音热搜榜
* @param @return 设定文件
* @return List<ZhihuHotSearch> 返回类型
*/
public
static
List
<
HotSearchList
>
getMobileDouyinHotList
(){
List
<
HotSearchList
>
list
=
null
;
String
url
=
"https://api.amemv.com/aweme/v1/hot/search/list/"
;
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"word_list"
)){
list
=
new
ArrayList
<>();
JSONObject
data
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
wordList
=
data
.
getJSONObject
(
"data"
).
getJSONArray
(
"word_list"
);
String
positionStr
=
null
;
String
word
=
null
;
String
hotValueStr
=
null
;
for
(
int
i
=
0
;
i
<
wordList
.
size
();
i
++)
{
JSONObject
wl
=
wordList
.
getJSONObject
(
i
);
//获取排名
positionStr
=
wl
.
getString
(
"position"
);
Integer
position
=
null
;
position
=
Integer
.
valueOf
(
positionStr
);
//获取关键词
word
=
wl
.
getString
(
"word"
);
//获取热度值
hotValueStr
=
wl
.
getString
(
"hot_value"
);
Integer
hotValue
=
null
;
hotValue
=
Integer
.
valueOf
(
hotValueStr
);
// logger.info("热度为:::{}", hot_value);
HotSearchList
douyin
=
new
HotSearchList
(
null
,
word
,
hotValue
,
position
,
HotSearchType
.
抖音热搜
.
name
());
list
.
add
(
douyin
);
}
}
}
catch
(
IOException
e
)
{
log
.
debug
(
"获取抖音热搜榜时出现问题:{}"
,
e
);
}
return
list
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
org.apache.commons.lang3.StringUtils
;
import
org.
jsoup.Jsoup
;
import
org.jsoup.
nodes.Document
;
import
org.jsoup.nodes.
Element
;
import
org.jsoup.
select.Elements
;
import
org.
slf4j.Logger
;
import
org.slf4j.Logger
Factory
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.
proxy.ProxyHolder
;
import
com.zhiwei.crawler.
utils.RequestUtils
;
import
com.zhiwei.
searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearch
Type
;
import
com.zhiwei.
tools.httpclient.HeaderTool
;
/**
* @ClassName:SougoHotSearch
* @
Description: TODO(搜狗微信关键词采集)
* @
author hero
* @
date 2019年7月10日 上午10:54:31
*
/
public
class
SougoHotSearchCrawler
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SougoHotSearchCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: SougoHotSearchTest
* @
author hero
* @
Description: TODO(PC端搜狗微信关键词采集)
* @
param 设定文件
* @return void 返回类型
*/
public
static
List
<
HotSearchList
>
sougoHotSearch
()
{
String
url
=
"https://weixin.sogou.com"
;
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
String
htmlBody
=
null
;
try
{
Map
<
String
,
String
>
headMap
=
HeaderTool
.
getCommonHead
();
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"topwords"
))
{
try
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Elements
elements
=
document
.
select
(
"ol#topwords"
).
select
(
"li"
);
for
(
Element
element
:
elements
)
{
try
{
// 获取排名rank
String
rankStr
=
null
;
if
(!
element
.
select
(
"li"
).
select
(
"i"
).
isEmpty
())
{
rankStr
=
element
.
select
(
"li"
).
select
(
"i"
).
text
();
}
Integer
rank
=
null
;
if
(
StringUtils
.
isNoneBlank
(
rankStr
))
{
rank
=
Integer
.
valueOf
(
rankStr
);
}
// 获取关键词(String)
String
kw
=
element
.
select
(
"li"
).
select
(
"a"
).
text
();
// logger.info("关键词:{}", kw);
String
everurl
=
element
.
select
(
"li"
).
select
(
"a"
).
attr
(
"href"
);
HotSearchList
hotSearch
=
new
HotSearchList
(
everurl
,
kw
,
null
,
rank
,
HotSearchType
.
搜狗微信热搜
.
name
());
if
(
Objects
.
nonNull
(
rank
))
{
list
.
add
(
hotSearch
);
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析搜狗微信时出现解析错误"
,
e
);
}
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析搜狗微信时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
return
Collections
.
emptyList
();
}
}
else
{
log
ger
.
info
(
"解析搜狗微信时出现解析错误,页面结构有问题"
);
}
break
;
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
e
);
}
}
return
list
;
}
}
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
lombok.extern.log4j.Log4j2
;
import
org.
apache.commons.lang3.StringUtils
;
import
org.jsoup.
Jsoup
;
import
org.jsoup.nodes.
Document
;
import
org.jsoup.
nodes.Element
;
import
org.
jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.crawler.
core.HttpBoot
;
import
com.zhiwei.crawler.
proxy.ProxyHolder
;
import
com.zhiwei.
crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearch
List
;
import
com.zhiwei.
searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
/**
* @
ClassName:SougoHotSearch
* @
Description: TODO(搜狗微信关键词采集)
* @
author hero
*
@date 2019年7月10日 上午10:54:31
*/
@Log4j2
public
class
SougoHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @
Title: SougoHotSearchTest
* @
author hero
* @
Description: TODO(PC端搜狗微信关键词采集)
* @return void 返回类型
*/
public
static
List
<
HotSearchList
>
sougoHotSearch
()
{
String
url
=
"https://weixin.sogou.com"
;
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
String
htmlBody
=
null
;
try
{
Map
<
String
,
String
>
headMap
=
HeaderTool
.
getCommonHead
();
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"topwords"
))
{
try
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Elements
elements
=
document
.
select
(
"ol#topwords"
).
select
(
"li"
);
for
(
Element
element
:
elements
)
{
try
{
// 获取排名rank
String
rankStr
=
null
;
if
(!
element
.
select
(
"li"
).
select
(
"i"
).
isEmpty
())
{
rankStr
=
element
.
select
(
"li"
).
select
(
"i"
).
text
();
}
Integer
rank
=
null
;
if
(
StringUtils
.
isNoneBlank
(
rankStr
))
{
rank
=
Integer
.
valueOf
(
rankStr
);
}
// 获取关键词(String)
String
kw
=
element
.
select
(
"li"
).
select
(
"a"
).
text
();
// logger.info("关键词:{}", kw);
String
everurl
=
element
.
select
(
"li"
).
select
(
"a"
).
attr
(
"href"
);
HotSearchList
hotSearch
=
new
HotSearchList
(
everurl
,
kw
,
null
,
rank
,
HotSearchType
.
搜狗微信热搜
.
name
());
if
(
Objects
.
nonNull
(
rank
))
{
list
.
add
(
hotSearch
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析搜狗微信时出现解析错误"
,
e
);
}
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析搜狗微信时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
return
Collections
.
emptyList
();
}
}
else
{
log
.
info
(
"解析搜狗微信时出现解析错误,页面结构有问题"
);
}
break
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
e
);
}
}
return
list
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
cb6bcd76
...
...
@@ -7,6 +7,7 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
lombok.extern.log4j.Log4j2
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -27,13 +28,13 @@ import com.zhiwei.tools.tools.URLCodeUtil;
/**
* @ClassName: WeiboHotSearch
* @Description:
TODO(微博实时热搜采集)
* @Description:
微博实时热搜采集
* @author hero
* @date 2017年9月15日 上午10:54:31
*/
@Log4j2
public
class
WeiboHotSearchCrawler
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WeiboHotSearchCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: weiboHotSearchTest
...
...
@@ -70,18 +71,18 @@ public class WeiboHotSearchCrawler {
list
.
add
(
hotSearch
);
}
catch
(
Exception
e
)
{
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
log
ger
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
log
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
continue
;
}
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
log
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
return
null
;
}
}
else
{
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
log
ger
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
log
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
}
break
;
}
catch
(
Exception
e
)
{
...
...
@@ -138,25 +139,25 @@ public class WeiboHotSearchCrawler {
}
String
id
=
"http://s.weibo.com/weibo/"
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&Refer=top"
;
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微博热搜
.
name
(),
icon
);
log
ger
.
info
(
"采集到的数据:::{}"
,
hotSearch
);
log
.
info
(
"采集到的数据:::{}"
,
hotSearch
);
result
.
add
(
hotSearch
);
rank
++;
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
log
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
continue
;
}
}
return
result
;
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
);
log
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
);
return
Collections
.
emptyList
();
}
}
else
{
log
ger
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
log
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
}
}
catch
(
IOException
e1
)
{
log
ger
.
error
(
"解析微博时时热搜时出现连接失败"
,
e1
);
log
.
error
(
"解析微博时时热搜时出现连接失败"
,
e1
);
return
Collections
.
emptyList
();
}
return
Collections
.
emptyList
();
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/Weibo
Huati
Crawler.java
→
src/main/java/com/zhiwei/searchhotcrawler/crawler/Weibo
SuperTopic
Crawler.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
java.util.Objects
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.WeiboTopic
;
/**
 * Crawls the Weibo celebrity topic ranking lists served by huati.weibo.cn and
 * enriches every entry with the read/post counters of its detail page.
 *
 * @author Bewilder ZW
 * @date 2019-09-27 15:01:34
 */
public class WeiboHuatiCrawler {

    private static Logger logger = LoggerFactory.getLogger(WeiboHuatiCrawler.class);

    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /** Request headers sent with every ranking-list request. */
    private static Map<String, String> headMap = new HashMap<>();

    static {
        headMap.put("X-Requested-With", "XMLHttpRequest");
        headMap.put("Referer", "https://huati.weibo.cn/discovery/super?extparam=ctg1_2%7Cscorll_1&luicode=10000011&lfid=100803_-_super&sourceType=weixin");
        headMap.put("Host", "huati.weibo.cn");
    }

    /**
     * Downloads pages 1 to 5 of each of the three celebrity rankings and parses
     * them into topics.
     *
     * <p>Each page is attempted up to three times; an attempt counts as failed
     * when the request throws or the body lacks the {@code desc1} marker.
     *
     * @return every topic that could be parsed; never {@code null}
     */
    public static List<WeiboTopic> startCrawler() {
        Map<String, String> urlMap = new HashMap<>();
        urlMap.put("明星", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=star&from=&wm=");
        urlMap.put("明星潜力", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=potential&from=&wm=");
        urlMap.put("明星上升", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=up&from=&wm=");

        List<WeiboTopic> topicList = new ArrayList<>();
        for (Entry<String, String> entry : urlMap.entrySet()) {
            String type = entry.getKey();
            for (int page = 1; page <= 5; page++) {
                String pageUrl = entry.getValue() + "&page=" + page;
                for (int retryTimes = 1; retryTimes <= 3; retryTimes++) {
                    try {
                        // Routed through the logger instead of System.out so the
                        // output follows the application's logging configuration.
                        logger.info("pageUrl::{}", pageUrl);
                        String htmlBody = httpBoot
                                .syncCall(RequestUtils.wrapGet(pageUrl, headMap), ProxyHolder.NAT_HEAVY_PROXY)
                                .body()
                                .string();
                        if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc1")) {
                            topicList.addAll(parseTopicRankHtml(page, htmlBody, type));
                            break;
                        }
                        logger.error("获取榜单列表页面时数据格式错误,页面为:{}", htmlBody);
                    } catch (Exception e) {
                        // e.toString() fills the placeholder; the trailing throwable
                        // still makes the logger print the stack trace.
                        logger.error("获取榜单列表页面时出现错误,错误为:{}", e.toString(), e);
                    }
                }
            }
        }
        return topicList;
    }

    /**
     * Parses one ranking page ({@code data.list} of the JSON body).
     *
     * <p>Every entry is handled in isolation: an entry that cannot be parsed is
     * logged and skipped, so the remaining entries of the page are still returned.
     *
     * @param page     1-based page number; {@code (page - 1) * 20} is added to each
     *                 entry's {@code toprank} value
     * @param htmlBody JSON body of the ranking page
     * @param type     ranking name stored on every topic
     * @return the parsed topics; an empty list when the body has no usable list
     */
    private static List<WeiboTopic> parseTopicRankHtml(int page, String htmlBody, String type) {
        try {
            JSONArray list = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("list");
            if (Objects.nonNull(list) && !list.isEmpty()) {
                int rankOffset = (page - 1) * 20;
                List<WeiboTopic> topicList = new ArrayList<>();
                for (int i = 0; i < list.size(); i++) {
                    try {
                        JSONObject data = list.getJSONObject(i);
                        Integer toprank = rankOffset + data.getInteger("toprank");
                        String topicName = data.getString("display_name");
                        String id = data.getString("page_id");
                        String score = data.getString("score");
                        String desc1 = data.getString("desc1");
                        // Strips everything up to "影响力" and the "粉丝" label from desc1.
                        String fensi = desc1.replaceAll(".*影响力|粉丝", "").trim();
                        String url = data.getString("link");

                        WeiboTopic topic = new WeiboTopic(url, topicName, toprank, score, fensi, type);
                        topic = getTopicInfo(id, topic);
                        logger.info("topic::::{}", topic);
                        topicList.add(topic);
                    } catch (Exception e) {
                        logger.error("解析榜单列表页面时出现错误,错误为:{}", e.toString(), e);
                    }
                }
                return topicList;
            }
        } catch (Exception e) {
            logger.error("解析榜单列表页面时出现错误,错误为:{}", e.toString(), e);
        }
        return Collections.emptyList();
    }

    /**
     * Loads the detail page of a single topic and copies its read and post
     * counters onto {@code topic}. Up to three attempts are made.
     *
     * @param id    container id of the topic (the entry's {@code page_id})
     * @param topic topic to enrich
     * @return the same {@code topic} instance, with the counters set when the
     *         detail page could be read and left unchanged otherwise
     */
    private static WeiboTopic getTopicInfo(String id, WeiboTopic topic) {
        String url = "https://m.weibo.cn/api/container/getIndex?containerid=" + id;
        for (int retryTimes = 1; retryTimes <= 3; retryTimes++) {
            try {
                String htmlBody = httpBoot
                        .syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY)
                        .body()
                        .string();
                if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("desc_more")) {
                    continue;
                }
                String descMore = JSONObject.parseObject(htmlBody)
                        .getJSONObject("data")
                        .getJSONObject("pageInfo")
                        .getJSONArray("desc_more")
                        .getString(0);
                if (StringUtils.isNotBlank(descMore)) {
                    // readNum: text between the "阅读" label and "帖子";
                    // postNum: text between "帖子" and "粉丝".
                    String readNum = descMore.replaceAll("阅读|帖子.*", "").trim();
                    String postNum = descMore.replaceAll(".*帖子|粉丝.*", "").trim();
                    topic.setPostNum(postNum);
                    topic.setReadNum(readNum);
                    return topic;
                }
            } catch (Exception e) {
                logger.error("解析榜单详情页面时出现错误,错误为:{}", e.toString(), e);
            }
        }
        return topic;
    }
}
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
java.util.Objects
;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
lombok.extern.log4j.Log4j2
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
/**
*
* @ClassName: WeiboSuperTopicCrawler
* @Description: 微博超话榜单采集(明星)
* @author Bewilder ZW
* @date 2019年9月27日 下午3:01:34
*/
@Log4j2
public
class
WeiboSuperTopicCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
Map
<
String
,
String
>
headMap
=
new
HashMap
<>();
static
{
headMap
.
put
(
"X-Requested-With"
,
"XMLHttpRequest"
);
headMap
.
put
(
"Referer"
,
"https://huati.weibo.cn/discovery/super?extparam=ctg1_2%7Cscorll_1&luicode=10000011&lfid=100803_-_super&sourceType=weixin"
);
headMap
.
put
(
"Host"
,
"huati.weibo.cn"
);
}
/**
*
* 开始采集明星话题
* @return void
*/
public
static
List
<
WeiboSuperTopic
>
startCrawler
()
{
Map
<
String
,
String
>
urlMap
=
new
HashMap
<>();
urlMap
.
put
(
"明星"
,
"https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=star&from=&wm="
);
urlMap
.
put
(
"明星潜力"
,
"https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=potential&from=&wm="
);
urlMap
.
put
(
"明星上升"
,
"https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=up&from=&wm="
);
List
<
WeiboSuperTopic
>
topicList
=
new
ArrayList
<>();
for
(
Entry
<
String
,
String
>
entry
:
urlMap
.
entrySet
())
{
String
url
=
entry
.
getValue
();
String
type
=
entry
.
getKey
();
for
(
int
page
=
1
;
page
<=
5
;
page
++)
{
String
pageUrl
=
url
+
"&page="
+
page
;
//重试三次
for
(
int
retryTimes
=
1
;
retryTimes
<=
3
;
retryTimes
++)
{
try
{
System
.
out
.
println
(
"pageUrl=========="
+
pageUrl
);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
pageUrl
,
headMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"desc1"
))
{
topicList
.
addAll
(
parseTopicRankHtml
(
page
,
htmlBody
,
type
));
break
;
}
else
{
log
.
error
(
"获取榜单列表页面时数据格式错误,页面为:{}"
,
htmlBody
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"获取榜单列表页面时出现错误,错误为:{}"
,
e
);
continue
;
}
}
}
}
return
topicList
;
}
/**
*
* 解析话题榜单
* @param htmlBody
* @param type
* @return void
*/
private
static
List
<
WeiboSuperTopic
>
parseTopicRankHtml
(
int
page
,
String
htmlBody
,
String
type
)
{
try
{
JSONArray
list
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONArray
(
"list"
);
if
(
Objects
.
nonNull
(
list
)
&&
!
list
.
isEmpty
())
{
page
=
(
page
-
1
)*
20
;
List
<
WeiboSuperTopic
>
topicList
=
new
ArrayList
<>();
Integer
toprank
=
null
;
String
topicName
=
null
;
String
id
=
null
;
String
score
=
null
;
String
desc1
=
null
;
String
fensi
=
null
;
String
url
=
null
;
for
(
int
i
=
0
;
i
<
list
.
size
();
i
++)
{
JSONObject
data
=
list
.
getJSONObject
(
i
);
toprank
=
page
+
data
.
getInteger
(
"toprank"
);
topicName
=
data
.
getString
(
"display_name"
);
id
=
data
.
getString
(
"page_id"
);
score
=
data
.
getString
(
"score"
);
desc1
=
data
.
getString
(
"desc1"
);
fensi
=
desc1
.
replaceAll
(
".*影响力|粉丝"
,
""
).
trim
();
url
=
data
.
getString
(
"link"
);
WeiboSuperTopic
topic
=
new
WeiboSuperTopic
(
url
,
topicName
,
toprank
,
score
,
fensi
,
type
);
topic
=
getTopicInfo
(
id
,
topic
);
System
.
out
.
println
(
"topic====="
+
topic
);
topicList
.
add
(
topic
);
}
return
topicList
;
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析榜单列表页面时出现错误,错误为:{}"
,
e
);
}
return
Collections
.
emptyList
();
}
/**
*
* 根据单一话题id获取话题阅读数及发帖数
* @param id
* @param topic
* @return
* @return WeiboTopic
*/
private
static
WeiboSuperTopic
getTopicInfo
(
String
id
,
WeiboSuperTopic
topic
)
{
for
(
int
retryTimes
=
1
;
retryTimes
<=
3
;
retryTimes
++)
{
try
{
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid="
+
id
;
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"desc_more"
))
{
String
descMore
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONObject
(
"pageInfo"
).
getJSONArray
(
"desc_more"
).
getString
(
0
);
if
(
StringUtils
.
isNotBlank
(
descMore
))
{
String
readNum
=
descMore
.
replaceAll
(
"阅读|帖子.*"
,
""
).
trim
();
String
postNum
=
descMore
.
replaceAll
(
".*帖子|粉丝.*"
,
""
).
trim
();
topic
.
setPostNum
(
postNum
);
topic
.
setReadNum
(
readNum
);
return
topic
;
}
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析榜单详情页面时出现错误,错误为:{}"
,
e
);
}
}
return
topic
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
0 → 100644
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
lombok.extern.log4j.Log4j2
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
java.util.*
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
/**
*
* @ClassName: WeiboTopicCrawler
* @Description: 微博话题榜单采集
* @author Bewilder ZW
*/
@Log4j2
public
class
WeiboTopicCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
Map
<
String
,
String
>
headMap
=
new
HashMap
<>();
static
{
headMap
.
put
(
"Host"
,
"simg.s.weibo.com"
);
headMap
.
put
(
"User-Agent"
,
"Weibo/40651 CFNetwork/978.0.7 Darwin/18.6.0"
);
}
// /**
// *
// * 开始采集明星话题
// * @return void
// */
// public static List<HotSearchList> startCrawler() {
// List<HotSearchList> topicList = new ArrayList<>();
// for(int page=1; page<=7; page++){
// String pageUrl = "https://d.weibo.com/231650_ctg1_-_all?pids=Pl_Discover_Pt6Rank__4&cfs=920&Pl_Discover_Pt6Rank__4_filter=&Pl_Discover_Pt6Rank__4_page=" + page;
// //重试三次
// for(int retryTimes = 1; retryTimes<=3; retryTimes++) {
// try {
// String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
// if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("pl.content.miniTab.index")) {
// log.info("pageUrl::{}", pageUrl);
// topicList.addAll(parseTopicRankHtml(htmlBody));
// break;
// }else {
// log.error("获取榜单列表页面时数据格式错误,页面为:{}", htmlBody);
// }
// } catch (Exception e) {
// log.error("获取榜单列表页面时出现错误,错误为:{}", e);
// continue;
// }
// }
// }
// return topicList;
// }
//
// /**
// *
// * 解析话题榜单
// * @param htmlBody
// * @return void
// */
// private static List<HotSearchList> parseTopicRankHtml(String htmlBody) {
// try {
// String script = "{\"ns\":\"pl.content.miniTab.index\""+ htmlBody.split("FM.view\\(\\{\"ns\":\"pl.content.miniTab.index\"")[1].split("\\)<\\/script>")[0];
// JSONObject json = JSONObject.parseObject(script);
// String html = json.getString("html");
//
// Elements elements = Jsoup.parse(html).select("div.text_box");
// if(Objects.nonNull(elements) && !elements.isEmpty()) {
// List<HotSearchList> topicList = new ArrayList<>();
// String rankString;
// Integer rank = null;
// String topicName = null;
// String url = null;
// String topicType = null;
// String description = null;
// Integer readNum = null;
// String author = null;
//
// for(Element element : elements) {
// rankString = element.select("div[class=\"title W_autocut\"]").text();
// Matcher matcher = Pattern.compile("\\d+").matcher(rankString);
// while (matcher.find()){
// rank = Integer.valueOf(matcher.group());
// }
// topicName = element.select("div[class=\"title W_autocut\"]").select("a.S_txt1").text();
// url = element.select("div[class=\"title W_autocut\"]").select("a.S_txt1").attr("href");
// topicType = element.select("a[class=\"W_btn_b W_btn_tag\"]").text();
// description = element.select("div.subtitle").text();
// String readNumString = element.select("span.number").text();
// if(readNumString.contains("万")){
// readNumString = readNumString.split("万")[0];
// readNum = Integer.valueOf(readNumString.split("万")[0])*10000;;
// }
// if(readNumString.contains("亿")){
// readNum = Integer.valueOf(readNumString.split("亿")[0])*100000000;
// }
// author = element.select("a[class=\"tlink S_txt1\"]").text();
// HotSearchList topic = new HotSearchList(url, topicName, readNum, rank, HotSearchType.微博话题.name(), author, topicType, description);
// log.info("topic::::" + topic);
// topicList.add(topic);
// }
// return topicList;
// }else{
// log.info("html:{}",html);
// }
// } catch (Exception e) {
// log.error("解析榜单列表页面时出现错误,错误为:{}", e);
// }
// return Collections.emptyList();
// }
/**
* 微博平话题榜采集
*/
public
static
List
<
HotSearchList
>
startCrawlerByPhone
(){
List
<
HotSearchList
>
topicList
=
new
ArrayList
<>();
for
(
int
page
=
1
;
page
<=
7
;
page
++){
String
pageUrl
=
"https://api.weibo.cn/2/page?gsid=_2A25zJX_EDeRxGedH71YS8CzKzzmIHXVuc_QMrDV6PUJbkdANLXPbkWpNUK3OyitGCJsX8exvua-vfubUqCiaA4lb&from=10A1193010&c=iphone&s=2827eebe&count=20&containerid=106003type%253D25%2526t%253D3%2526disable_hot%253D1%2526filter_type%253Dtopicscene&page="
+
page
;
//重试三次
for
(
int
retryTimes
=
1
;
retryTimes
<=
5
;
retryTimes
++)
{
try
{
log
.
info
(
"pageUrl::{}"
,
pageUrl
);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
pageUrl
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"top_mark_text"
))
{
topicList
.
addAll
(
parseTopicHtml
(
htmlBody
));
break
;
}
else
{
log
.
info
(
"下载榜单列表页面时数据格式错误,页面为:{}"
,
htmlBody
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"下载榜单列表页面时出现错误,错误为:{}"
,
e
);
continue
;
}
}
}
return
topicList
;
}
private
static
List
<
HotSearchList
>
parseTopicHtml
(
String
htmlBody
)
{
try
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
if
(
Objects
.
nonNull
(
cards
)
&&
!
cards
.
isEmpty
())
{
List
<
HotSearchList
>
topicList
=
new
ArrayList
<>();
Integer
rank
=
null
;
String
topicName
=
null
;
String
url
=
null
;
String
description
=
null
;
Integer
commentNum
=
null
;
Integer
readNum
=
null
;
String
desc2
=
null
;
for
(
int
i
=
0
;
i
<
cards
.
size
();
i
++)
{
JSONObject
cardGroup
=
cards
.
getJSONObject
(
i
).
getJSONArray
(
"card_group"
).
getJSONObject
(
0
);
rank
=
cardGroup
.
getInteger
(
"top_mark_text"
);
topicName
=
cardGroup
.
getString
(
"title_sub"
);
url
=
"https://s.weibo.com/weibo?q="
+
URLCodeUtil
.
getURLEncode
(
topicName
,
"utf-8"
);
description
=
cardGroup
.
getString
(
"desc1"
);
desc2
=
cardGroup
.
getString
(
"desc2"
);
String
commentNumStr
=
desc2
.
replaceAll
(
"讨论.*"
,
""
).
trim
();
String
readNumStr
=
desc2
.
replaceAll
(
".*讨论|阅读"
,
""
).
trim
();
try
{
if
(
commentNumStr
.
contains
(
"万"
)){
commentNumStr
=
commentNumStr
.
replaceAll
(
"万"
,
""
);
commentNum
=
(
int
)(
Double
.
parseDouble
(
commentNumStr
)*
10000
);
}
else
if
(
commentNumStr
.
contains
(
"亿"
)){
commentNumStr
=
commentNumStr
.
replaceAll
(
"亿"
,
""
);
commentNum
=
(
int
)(
Double
.
parseDouble
(
commentNumStr
)*
10000000
);
}
else
{
commentNum
=
Integer
.
getInteger
(
commentNumStr
);
}
if
(
readNumStr
.
contains
(
"万"
)){
readNumStr
=
readNumStr
.
replaceAll
(
"万"
,
""
);
readNum
=
(
int
)(
Double
.
parseDouble
(
readNumStr
)*
10000
);
}
else
if
(
readNumStr
.
contains
(
"亿"
)){
readNumStr
=
readNumStr
.
replaceAll
(
"亿"
,
""
);
readNum
=
(
int
)(
Double
.
parseDouble
(
readNumStr
)*
10000000
);
}
else
{
readNum
=
Integer
.
getInteger
(
readNumStr
);
}
}
catch
(
Exception
e
){
e
.
printStackTrace
();
}
HotSearchList
topic
=
new
HotSearchList
(
url
,
topicName
,
readNum
,
rank
,
HotSearchType
.
微博热搜
.
name
(),
commentNum
,
description
);
log
.
info
(
"topic::::"
+
topic
);
topicList
.
add
(
topic
);
}
return
topicList
;
}
else
{
log
.
info
(
"html:{}"
,
htmlBody
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析榜单列表页面时出现错误,错误为:{}"
,
e
);
}
return
Collections
.
emptyList
();
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.Logger
Factory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSON
Object
;
import
com.
zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.
proxy.ProxyHolder
;
import
com.zhiwei.crawler.
utils.RequestUtils
;
import
com.zhiwei.
searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearch
Type
;
import
com.zhiwei.
tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.
tools.URLCodeUtil
;
/**
* @ClassName: ZhihuHotCrawler
* @
Description: TODO(知乎热搜采集程序)
* @
author hero
* @
date 2017年9月15日 上午10:54:31
*
/
public
class
ZhihuHotSearchCrawler
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
ZhihuHotSearchCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: getZhihuHotList
* @
author hero
* @
Description: 知乎热搜采集程序
* @
param 设定文件
* @return void 返回类型
*/
public
static
List
<
HotSearchList
>
getZhihuHotList
(){
List
<
HotSearchList
>
list
=
null
;
String
url
=
"https://www.zhihu.com/api/v4/search/top_search"
;
String
rerferer
=
"https://www.zhihu.com/search?type=content&q=%E5%BF%AB%E6%89%8B"
;
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
);
headerMap
.
put
(
"Host"
,
"www.zhihu.com"
);
headerMap
.
put
(
"X-UDID"
,
"AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8="
);
headerMap
.
put
(
"accept"
,
"application/json, text/plain, */*"
);
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
headerMap
.
put
(
"Referer"
,
rerferer
);
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"words"
)){
list
=
new
ArrayList
<>();
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
words
=
topSearch
.
getJSONObject
(
"top_search"
).
getJSONArray
(
"words"
);
String
link
=
null
;
String
displayQuery
=
null
;
String
query
=
null
;
for
(
int
i
=
0
;
i
<
words
.
size
();
i
++)
{
JSONObject
word
=
words
.
getJSONObject
(
i
);
query
=
word
.
getString
(
"query"
);
displayQuery
=
word
.
getString
(
"display_query"
);
link
=
"https://www.zhihu.com/search?q="
+
URLCodeUtil
.
getURLEncode
(
query
,
"utf-8"
)+
"&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content"
;
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
null
,
i
,
HotSearchType
.
知乎热搜
.
name
());
list
.
add
(
zhihu
);
}
}
}
catch
(
IOException
e
)
{
log
ger
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
return
list
;
}
return
list
;
}
/**
* @Title: getMobileZhihuHotList
* @author hero
* @Description: 移動端知乎熱搜榜
* @param @return 设定文件
* @return List<ZhihuHotSearch> 返回类型
*/
public
static
List
<
HotSearchList
>
getMobileZhihuHotList
(){
List
<
HotSearchList
>
list
=
new
ArrayList
<>();;
String
url
=
"https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0"
;
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"api.zhihu.com"
);
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
);
headerMap
.
put
(
"X-UDID"
,
"AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8="
);
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"author"
)){
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
words
=
topSearch
.
getJSONArray
(
"data"
);
String
link
=
null
;
String
displayQuery
=
null
;
for
(
int
i
=
0
;
i
<
words
.
size
();
i
++)
{
JSONObject
word
=
words
.
getJSONObject
(
i
).
getJSONObject
(
"target"
);
displayQuery
=
word
.
getString
(
"title"
);
link
=
"https://www.zhihu.com/question/"
+
word
.
getLongValue
(
"id"
);
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
null
,
i
,
HotSearchType
.
知乎热搜
.
name
());
list
.
add
(
zhihu
);
}
}
}
catch
(
IOException
e
)
{
log
ger
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
return
list
;
}
return
list
;
}
}
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSON
Array
;
import
com.
alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.
core.HttpBoot
;
import
com.zhiwei.crawler.
proxy.ProxyHolder
;
import
com.zhiwei.
crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearch
List
;
import
com.zhiwei.
searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.
httpclient.HeaderTool
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
/**
 * Crawler for Zhihu hot-search data.
 *
 * <p>Both entry points issue a single GET request through the shared
 * {@link HttpBoot} client (built with {@code retryTimes(3)}) using
 * {@code ProxyHolder.NAT_HEAVY_PROXY}, and convert the JSON response into
 * {@link HotSearchList} entries.
 *
 * <p>NOTE(review): the {@code X-UDID} and {@code authorization} header values
 * are hard-coded in this class.
 *
 * @author hero
 * @date 2017-09-15 10:54:31
 */
@Log4j2
public class ZhihuHotSearchCrawler {

    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /**
     * Collects the Zhihu web "top search" words.
     *
     * <p>Each word becomes one entry whose link is a Zhihu search URL for the
     * word's {@code query} value, whose name is {@code display_query}, and
     * whose rank is the zero-based position in the response array.
     *
     * @author hero
     * @return the collected entries; {@code null} when the request fails with
     *         an {@link IOException} before any parsing happened, or when the
     *         response body does not contain the text {@code "words"}
     */
    public static List<HotSearchList> getZhihuHotList() {
        String apiUrl = "https://www.zhihu.com/api/v4/search/top_search";
        String refererUrl = "https://www.zhihu.com/search?type=content&q=%E5%BF%AB%E6%89%8B";

        Map<String, String> headers = HeaderTool.getCommonHead();
        headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");
        headers.put("Host", "www.zhihu.com");
        headers.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8=");
        headers.put("accept", "application/json, text/plain, */*");
        headers.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
        headers.put("Referer", refererUrl);

        // Stays null unless a response containing "words" was received.
        List<HotSearchList> result = null;
        try {
            String responseText = httpBoot
                    .syncCall(RequestUtils.wrapGet(apiUrl, headers), ProxyHolder.NAT_HEAVY_PROXY)
                    .body()
                    .string();
            if (responseText != null && responseText.contains("words")) {
                result = new ArrayList<>();
                JSONObject root = JSONObject.parseObject(responseText);
                JSONArray entries = root.getJSONObject("top_search").getJSONArray("words");
                for (int rank = 0; rank < entries.size(); rank++) {
                    JSONObject entry = entries.getJSONObject(rank);
                    String keyword = entry.getString("query");
                    String title = entry.getString("display_query");
                    String searchLink = "https://www.zhihu.com/search?q="
                            + URLCodeUtil.getURLEncode(keyword, "utf-8")
                            + "&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content";
                    result.add(new HotSearchList(searchLink, title, null, rank, HotSearchType.知乎热搜.name()));
                }
            }
        } catch (IOException e) {
            log.debug("获取知乎热搜时出现问题:{}", e);
        }
        return result;
    }

    /**
     * Collects the Zhihu mobile hot list (top stories).
     *
     * <p>Each item's {@code target} object supplies the entry: the name is the
     * target's {@code title}, the link is the question URL built from the
     * target's {@code id}, and the rank is the zero-based position in the
     * {@code data} array.
     *
     * @author hero
     * @return the collected entries; empty when the request fails with an
     *         {@link IOException} or the response body does not contain the
     *         text {@code "author"}
     */
    public static List<HotSearchList> getMobileZhihuHotList() {
        List<HotSearchList> result = new ArrayList<>();
        String apiUrl = "https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0";

        Map<String, String> headers = HeaderTool.getCommonHead();
        headers.put("Host", "api.zhihu.com");
        headers.put("Referer", apiUrl);
        headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36");
        headers.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8=");
        headers.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");

        try {
            String responseText = httpBoot
                    .syncCall(RequestUtils.wrapGet(apiUrl, headers), ProxyHolder.NAT_HEAVY_PROXY)
                    .body()
                    .string();
            if (responseText != null && responseText.contains("author")) {
                JSONArray items = JSONObject.parseObject(responseText).getJSONArray("data");
                for (int rank = 0; rank < items.size(); rank++) {
                    JSONObject target = items.getJSONObject(rank).getJSONObject("target");
                    String title = target.getString("title");
                    String questionLink = "https://www.zhihu.com/question/" + target.getLongValue("id");
                    result.add(new HotSearchList(questionLink, title, null, rank, HotSearchType.知乎热搜.name()));
                }
            }
        } catch (IOException e) {
            log.debug("获取知乎热搜时出现问题:{}", e);
        }
        return result;
    }
}
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchListDAO.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBCursor
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.cache.CacheManager
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.tools.timeparse.TimeParse
;
/**
 * DAO for the monthly hot-search collection.
 *
 * <p>The collection name is {@code Config.searchCollName + yyyy + "_" + MM},
 * derived from the date on which the DAO instance is constructed.
 */
public class HotSearchListDAO extends MongoDBTemplate {

    private static Logger logger = LoggerFactory.getLogger(HotSearchListDAO.class);

    /** {indexed field, index name} pairs; every index is created descending (-1). */
    private static final String[][] DESC_INDEXES = {
        {"count", "count_desc"},
        {"time", "time_desc"},
        {"rank", "rank_desc"},
        {"name", "name_desc"},
        {"type", "type_desc"},
    };

    /**
     * Selects the database and the current month's collection, then requests
     * the descending single-field indexes. An index failure is logged and
     * does not prevent construction.
     */
    public HotSearchListDAO() {
        super();
        super.setDbName(Config.dbName);
        String time = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
        String year = time.substring(0, 4);
        String month = time.substring(5, 7);
        String collName = Config.searchCollName + year + "_" + month;
        super.setCollName(collName);

        try {
            for (String[] index : DESC_INDEXES) {
                DBObject keys = new BasicDBObject(index[0], -1);
                DBObject options = new BasicDBObject("name", index[1]);
                super.getReadColl().createIndex(keys, options);
            }
        } catch (Exception e) {
            // Route the failure through the class logger instead of stderr.
            logger.error("创建索引时出错", e);
        }
    }

    /**
     * Inserts a batch of documents into the collection.
     * Failures are logged, not rethrown.
     *
     * @param list documents to insert
     */
    public void addHotSearchList(List<DBObject> list) {
        try {
            this.getReadColl().insert(list);
        } catch (Exception e) {
            logger.error("存储数据时出错,错误为:{}", e);
        }
    }

    /**
     * Inserts a single document into the collection.
     * Failures are logged, not rethrown.
     *
     * @param doc document to insert
     */
    public void addHotSearch(DBObject doc) {
        try {
            this.getReadColl().insert(doc);
        } catch (Exception e) {
            logger.error("存储数据时出错,错误为:{}", e);
        }
    }

    /**
     * Computes how much the count changed since the most recent stored record
     * with the same name.
     *
     * @author hero
     * @param weiboHotSearch the freshly crawled entry
     * @return the entry's count minus the stored {@code count} of the latest
     *         document (by {@code time}) with the same {@code name}; 0 when no
     *         such document exists, it has no {@code count}, or the query fails
     */
    public int getChangeCount(HotSearchList weiboHotSearch) {
        int result = 0;
        DBObject query = new BasicDBObject();
        query.put("name", weiboHotSearch.getName());
        DBObject sort = new BasicDBObject();
        sort.put("time", -1);

        DBCursor cur = null;
        try {
            cur = this.getReadColl().find(query).sort(sort).limit(1);
            while (cur.hasNext()) {
                DBObject doc = cur.next();
                if (doc.get("count") != null) {
                    result = weiboHotSearch.getCount() - Integer.valueOf(doc.get("count").toString());
                    break;
                }
            }
        } catch (Exception e) {
            logger.error("存储数据时出错,错误为:{}", e);
        } finally {
            // Close in finally so the cursor is released even when the loop throws.
            releaseCursor(cur);
        }
        return result;
    }

    /**
     * Finds entries of the given type recorded within the last hour whose
     * {@code changeCount} is 0, skipping names already present in
     * {@link CacheManager}.
     *
     * <p>Every returned document is also put into the cache under its name
     * with a value of {@code 48*60*60*1000} (presumably a 48-hour TTL in
     * milliseconds; confirm against {@code CacheManager.putCache}).
     *
     * @author hero
     * @param type value matched against the {@code type} field
     * @return matching documents that were not cached yet; the documents
     *         gathered so far when the query fails part-way
     */
    public List<DBObject> getHotOneHour(String type) {
        List<DBObject> list = new ArrayList<>();
        Date date = new Date((new Date().getTime() - 60 * 60 * 1000));
        DBObject query = new BasicDBObject();
        query.put("time", new BasicDBObject("$gte", date));
        query.put("changeCount", 0);
        query.put("type", type);

        DBCursor cur = null;
        try {
            cur = this.getReadColl().find(query);
            while (cur.hasNext()) {
                DBObject doc = cur.next();
                String name = doc.get("name").toString();
                if (CacheManager.getCacheByKey(name) == null) {
                    CacheManager.putCache(name, doc, 48 * 60 * 60 * 1000);
                    list.add(doc);
                }
            }
        } catch (Exception e) {
            logger.error("存储数据时出错,错误为:{}", e);
        } finally {
            // Close in finally so the cursor is released even when the loop throws.
            releaseCursor(cur);
        }
        return list;
    }

    /** Closes the cursor if one was opened; a failure while closing is logged. */
    private static void releaseCursor(DBCursor cursor) {
        if (cursor == null) {
            return;
        }
        try {
            cursor.close();
        } catch (Exception e) {
            logger.error("存储数据时出错,错误为:{}", e);
        }
    }
}
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Objects
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBCursor
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.cache.CacheManager
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.tools.timeparse.TimeParse
;
@Log4j2
public
class
HotSearchListDAO
extends
MongoDBTemplate
{
public
HotSearchListDAO
()
{
super
();
super
.
setDbName
(
Config
.
dbName
);
String
time
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
String
year
=
time
.
substring
(
0
,
4
);
String
month
=
time
.
substring
(
5
,
7
);
String
collName
=
Config
.
searchCollName
+
year
+
"_"
+
month
;
super
.
setCollName
(
collName
);
//给数据表创建索引
createIndex
();
}
/**
* 初次创建表及创建相应的索引
*/
private
void
createIndex
(){
List
<
DBObject
>
indexList
=
this
.
getReadColl
().
getIndexInfo
();
if
(
Objects
.
isNull
(
indexList
)
&&
indexList
.
isEmpty
()){
DBObject
countIndexDoc
=
new
BasicDBObject
();
countIndexDoc
.
put
(
"count"
,
-
1
);
DBObject
timeIndexDoc
=
new
BasicDBObject
();
timeIndexDoc
.
put
(
"time"
,
-
1
);
DBObject
rankIndexDoc
=
new
BasicDBObject
();
rankIndexDoc
.
put
(
"rank"
,
-
1
);
DBObject
nameIndexDoc
=
new
BasicDBObject
();
nameIndexDoc
.
put
(
"name"
,
-
1
);
DBObject
typeIndexDoc
=
new
BasicDBObject
();
typeIndexDoc
.
put
(
"type"
,
-
1
);
try
{
super
.
getReadColl
().
createIndex
(
countIndexDoc
,
new
BasicDBObject
(
"name"
,
"count_desc"
));
super
.
getReadColl
().
createIndex
(
timeIndexDoc
,
new
BasicDBObject
(
"name"
,
"time_desc"
));
super
.
getReadColl
().
createIndex
(
rankIndexDoc
,
new
BasicDBObject
(
"name"
,
"rank_desc"
));
super
.
getReadColl
().
createIndex
(
nameIndexDoc
,
new
BasicDBObject
(
"name"
,
"name_desc"
));
super
.
getReadColl
().
createIndex
(
typeIndexDoc
,
new
BasicDBObject
(
"name"
,
"type_desc"
));
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
}
}
/**
* 添加数据入库
* @param list
*/
public
void
addHotSearchList
(
List
<
DBObject
>
list
){
try
{
this
.
getReadColl
().
insert
(
list
);
}
catch
(
Exception
e
)
{
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
}
public
void
addHotSearch
(
DBObject
doc
){
try
{
this
.
getReadColl
().
insert
(
doc
);
}
catch
(
Exception
e
)
{
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
}
/**
* 查询据上次变化量
* @Title: getChangeCount
* @author hero
* @param @param weiboHotSearch
* @param @return 设定文件
* @return int 返回类型
*/
public
int
getChangeCount
(
HotSearchList
weiboHotSearch
){
int
result
=
0
;
DBObject
query
=
new
BasicDBObject
();
query
.
put
(
"name"
,
weiboHotSearch
.
getName
());
DBObject
sort
=
new
BasicDBObject
();
sort
.
put
(
"time"
,
-
1
);
try
{
DBCursor
cur
=
this
.
getReadColl
().
find
(
query
).
sort
(
sort
).
limit
(
1
);
while
(
cur
.
hasNext
()){
DBObject
doc
=
cur
.
next
();
if
(
doc
.
get
(
"count"
)!=
null
)
{
result
=
weiboHotSearch
.
getCount
()
-
Integer
.
valueOf
(
doc
.
get
(
"count"
).
toString
());
break
;
}
}
cur
.
close
();
}
catch
(
Exception
e
)
{
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
return
result
;
}
return
result
;
}
/**
* @Title: getWeiboHotOneHour
* @author hero
* @Description: 查询最近1小时内新增的微博热搜
* @param @return 设定文件
* @return List<DBObject> 返回类型
*/
public
List
<
DBObject
>
getHotOneHour
(
String
type
){
List
<
DBObject
>
list
=
new
ArrayList
<>();
Date
date
=
new
Date
((
new
Date
().
getTime
()-
60
*
60
*
1000
));
DBObject
query
=
new
BasicDBObject
();
query
.
put
(
"time"
,
new
BasicDBObject
(
"$gte"
,
date
));
query
.
put
(
"changeCount"
,
0
);
query
.
put
(
"type"
,
type
);
try
{
DBCursor
cur
=
this
.
getReadColl
().
find
(
query
);
while
(
cur
.
hasNext
()){
DBObject
doc
=
cur
.
next
();
String
name
=
doc
.
get
(
"name"
).
toString
();
if
(
CacheManager
.
getCacheByKey
(
name
)==
null
){
CacheManager
.
putCache
(
name
,
doc
,
48
*
60
*
60
*
1000
);
list
.
add
(
doc
);
}
}
cur
.
close
();
}
catch
(
Exception
e
)
{
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
return
list
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/dao/WechatUserDao.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
java.util.Collections
;
import
java.util.List
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
/**
 * DAO for WeChat user groups stored in the collection named by
 * {@code Config.collWechatUserName}.
 */
public class WechatUserDao extends MongoDBTemplate {

    // Was created for BaiDuHotSearchCrawler.class, which attributed this
    // DAO's log lines to the wrong class.
    private static Logger logger = LoggerFactory.getLogger(WechatUserDao.class);

    /** Selects the configured database and WeChat-user collection. */
    public WechatUserDao() {
        super();
        super.setDbName(Config.dbName);
        super.setCollName(Config.collWechatUserName);
    }

    /**
     * Saves the users of a group under the id {@code groupId + "-" + groupName}.
     * The save is attempted up to three times; each failure is logged.
     *
     * @param userlist users belonging to the group
     * @param groupName name of the group
     * @param groupId id of the group
     */
    public void addWechatUser(List<String> userlist, String groupName, Integer groupId) {
        for (int i = 0; i < 3; i++) {
            try {
                DBObject doc = new BasicDBObject();
                doc.put("_id", groupId + "-" + groupName);
                doc.put("groupId", groupId);
                doc.put("groupName", groupName);
                doc.put("user", userlist);
                this.getReadColl().save(doc);
                break;
            } catch (Exception e) {
                logger.error("存储数据时出错,错误为:{}", e);
            }
        }
    }

    /**
     * Looks up the users of a group by group name.
     *
     * @param group value matched against the {@code groupName} field
     * @return the {@code user} field of the first matching document; an empty
     *         list when no document matches or the lookup fails
     */
    @SuppressWarnings("unchecked")
    public List<String> getWechatUserByGroup(String group) {
        try {
            DBObject query = new BasicDBObject();
            query.put("groupName", group);
            DBObject doc = this.getReadColl().findOne(query);
            if (doc != null) {
                return (List<String>) doc.get("user");
            }
        } catch (Exception e) {
            logger.error("存储数据时出错,错误为:{}", e);
        }
        return Collections.emptyList();
    }
}
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
java.util.Collections
;
import
java.util.List
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
@Log4j2
public
class
WechatUserDao
extends
MongoDBTemplate
{
public
WechatUserDao
()
{
super
();
super
.
setDbName
(
Config
.
dbName
);
super
.
setCollName
(
Config
.
collWechatUserName
);
}
/**
* 添加分组用户
* @param userlist
* @param groupName
* @param groupId
*/
public
void
addWechatUser
(
List
<
String
>
userlist
,
String
groupName
,
Integer
groupId
){
for
(
int
i
=
0
;
i
<
3
;
i
++){
try
{
DBObject
doc
=
new
BasicDBObject
();
doc
.
put
(
"_id"
,
groupId
+
"-"
+
groupName
);
doc
.
put
(
"groupId"
,
groupId
);
doc
.
put
(
"groupName"
,
groupName
);
doc
.
put
(
"user"
,
userlist
);
this
.
getReadColl
().
save
(
doc
);
break
;
}
catch
(
Exception
e
)
{
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
}
}
/**
* 根据分组名称查询分组用户
* @param group
* @return
*/
@SuppressWarnings
(
"unchecked"
)
public
List
<
String
>
getWechatUserByGroup
(
String
group
){
try
{
DBObject
query
=
new
BasicDBObject
();
query
.
put
(
"groupName"
,
group
);
DBObject
doc
=
this
.
getReadColl
().
findOne
(
query
);
if
(
doc
!=
null
){
return
(
List
<
String
>)
doc
.
get
(
"user"
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
return
Collections
.
emptyList
();
}
}
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiboTopicDAO.java
→
src/main/java/com/zhiwei/searchhotcrawler/dao/Weibo
Super
TopicDAO.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
java.util.Date
;
import
java.util.List
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.tools.timeparse.TimeParse
;
/**
 * DAO for the monthly Weibo topic collection.
 *
 * <p>The collection name is {@code Config.topicCollName + yyyy + "_" + MM},
 * derived from the date on which the DAO instance is constructed.
 */
public class WeiboTopicDAO extends MongoDBTemplate {

    private static Logger logger = LoggerFactory.getLogger(WeiboTopicDAO.class);

    /** {indexed field, index name} pairs; every index is created descending (-1). */
    private static final String[][] DESC_INDEXES = {
        {"score_num", "score_desc"},
        {"time", "time_desc"},
        {"rank", "rank_desc"},
        {"name", "name_desc"},
        {"type", "type_desc"},
    };

    /**
     * Selects the database and the current month's collection, then requests
     * the descending single-field indexes. An index failure is logged and
     * does not prevent construction.
     */
    public WeiboTopicDAO() {
        super();
        super.setDbName(Config.dbName);
        String time = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
        String year = time.substring(0, 4);
        String month = time.substring(5, 7);
        String collName = Config.topicCollName + year + "_" + month;
        super.setCollName(collName);

        try {
            for (String[] index : DESC_INDEXES) {
                DBObject keys = new BasicDBObject(index[0], -1);
                DBObject options = new BasicDBObject("name", index[1]);
                super.getReadColl().createIndex(keys, options);
            }
        } catch (Exception e) {
            // Route the failure through the class logger instead of stderr.
            logger.error("创建索引时出错", e);
        }
    }

    /**
     * Inserts a batch of documents into the collection.
     * Failures are logged, not rethrown.
     *
     * @param list documents to insert
     */
    public void addTopicList(List<DBObject> list) {
        try {
            this.getReadColl().insert(list);
        } catch (Exception e) {
            logger.error("存储数据时出错,错误为:{}", e);
        }
    }

    /**
     * Inserts a single document into the collection.
     * Failures are logged, not rethrown.
     *
     * @param doc document to insert
     */
    public void addTopic(DBObject doc) {
        try {
            this.getReadColl().insert(doc);
        } catch (Exception e) {
            logger.error("存储数据时出错,错误为:{}", e);
        }
    }
}
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Objects
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.tools.timeparse.TimeParse
;
@Log4j2
public
class
WeiboSuperTopicDAO
extends
MongoDBTemplate
{
public
WeiboSuperTopicDAO
()
{
super
();
super
.
setDbName
(
Config
.
dbName
);
String
time
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
String
year
=
time
.
substring
(
0
,
4
);
String
month
=
time
.
substring
(
5
,
7
);
String
collName
=
Config
.
topicCollName
+
year
+
"_"
+
month
;
super
.
setCollName
(
collName
);
createIndex
();
}
/**
* 初次创建表及创建相应的索引
*/
private
void
createIndex
(){
List
<
DBObject
>
indexList
=
this
.
getReadColl
().
getIndexInfo
();
if
(
Objects
.
isNull
(
indexList
)
&&
indexList
.
isEmpty
()){
DBObject
countIndexDoc
=
new
BasicDBObject
();
countIndexDoc
.
put
(
"score_num"
,
-
1
);
DBObject
timeIndexDoc
=
new
BasicDBObject
();
timeIndexDoc
.
put
(
"time"
,
-
1
);
DBObject
rankIndexDoc
=
new
BasicDBObject
();
rankIndexDoc
.
put
(
"rank"
,
-
1
);
DBObject
nameIndexDoc
=
new
BasicDBObject
();
nameIndexDoc
.
put
(
"name"
,
-
1
);
DBObject
typeIndexDoc
=
new
BasicDBObject
();
typeIndexDoc
.
put
(
"type"
,
-
1
);
try
{
super
.
getReadColl
().
createIndex
(
countIndexDoc
,
new
BasicDBObject
(
"name"
,
"score_desc"
));
super
.
getReadColl
().
createIndex
(
timeIndexDoc
,
new
BasicDBObject
(
"name"
,
"time_desc"
));
super
.
getReadColl
().
createIndex
(
rankIndexDoc
,
new
BasicDBObject
(
"name"
,
"rank_desc"
));
super
.
getReadColl
().
createIndex
(
nameIndexDoc
,
new
BasicDBObject
(
"name"
,
"name_desc"
));
super
.
getReadColl
().
createIndex
(
typeIndexDoc
,
new
BasicDBObject
(
"name"
,
"type_desc"
));
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
}
}
/**
* 添加数据入库
* @param list
*/
public
void
addTopicList
(
List
<
DBObject
>
list
){
try
{
this
.
getReadColl
().
insert
(
list
);
}
catch
(
Exception
e
)
{
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
}
public
void
addTopic
(
DBObject
doc
){
try
{
this
.
getReadColl
().
insert
(
doc
);
}
catch
(
Exception
e
)
{
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
run
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.cache.CacheListener
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.DouyinHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.SendWeiboHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.SendZhihuHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.SougoHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.UpdateWechatUserRun
;
import
com.zhiwei.searchhotcrawler.timer.WeiboHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.WeiboTopicRun
;
import
com.zhiwei.searchhotcrawler.timer.ZhihuHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.*
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
java.util.concurrent.Executors
;
...
...
@@ -23,8 +15,10 @@ import java.util.concurrent.TimeUnit;
public
class
HotSearchRun
{
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
ProxyConfig
.
registry
,
ProxyConfig
.
group
,
GroupType
.
PROVIDER
,
10000013
);
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
new
UpdateWechatUserRun
().
start
();
ZhiWeiTools
.
sleep
(
10000
);
...
...
@@ -51,6 +45,7 @@ public class HotSearchRun {
new
SougoHotSearchRun
().
start
();
new
DouyinHotSearchRun
().
start
();
new
ZhihuHotSearchRun
().
start
();
new
WeiboSuperTopicRun
().
start
();
new
WeiboTopicRun
().
start
();
//推送程序启动
new
SendWeiboHotSearchRun
().
start
();
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchListTest.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Map
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DB
;
import
com.mongodb.DBCollection
;
import
com.mongodb.DBCursor
;
import
com.mongodb.DBObject
;
import
com.mongodb.Mongo
;
import
com.mongodb.MongoClient
;
import
com.mongodb.MongoCredential
;
import
com.mongodb.ServerAddress
;
import
com.mongodb.WriteResult
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.tools.timeparse.TimeParse
;
/**
 * Manual maintenance script, not an automated test.
 *
 * <p>WARNING: running {@link #main(String[])} deletes documents from the
 * {@code hot_search_list2019_09} collection of the {@code hot_search_list}
 * database.
 */
public class HotSearchListTest {

    /**
     * Walks the time windows returned by {@code TimeParse.getTimeMap} for
     * 2019-10-01 through 2019-10-09 and, for each window, removes every
     * document whose {@code time} lies inside it, printing the derived
     * collection name, the query and the number of removed documents.
     *
     * <p>Earlier experiments that copied documents into a second database and
     * created indexes there were kept only as commented-out code and are
     * omitted here.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        MongoCredential credential = MongoCredential.createCredential(
                Config.userName, Config.authDB, Config.userPwd.toCharArray());
        Mongo mongo = new MongoClient(
                new ServerAddress(Config.mongoIp, Config.mongoPort), Arrays.asList(credential));

        DBCollection coll = mongo.getDB("hot_search_list").getCollection("hot_search_list2019_09");

        Map<String, String> windows =
                TimeParse.getTimeMap("2019-10-01 00:00:00", "2019-10-09 23:59:59", "dd", 1);

        for (Map.Entry<String, String> window : windows.entrySet()) {
            String windowStart = window.getKey();
            String windowEnd = window.getValue();

            String yearPart = windowEnd.substring(0, 4);
            String monthPart = windowEnd.substring(5, 7);
            Date from = TimeParse.stringFormartDate(windowStart);
            Date to = TimeParse.stringFormartDate(windowEnd);

            // Only printed; the removal below always targets the fixed collection above.
            String collName = "hot_search_list" + yearPart + "_" + monthPart;
            System.out.println("collName==========" + collName);

            DBObject query = new BasicDBObject(
                    new BasicDBObject("time", new BasicDBObject("$gte", from).append("$lte", to)));
            System.out.println(query);

            WriteResult removed = coll.remove(query);
            System.out.println("========" + removed.getN());
        }

        mongo.close();
    }
}
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Map
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DB
;
import
com.mongodb.DBCollection
;
import
com.mongodb.DBCursor
;
import
com.mongodb.DBObject
;
import
com.mongodb.Mongo
;
import
com.mongodb.MongoClient
;
import
com.mongodb.MongoCredential
;
import
com.mongodb.ServerAddress
;
import
com.mongodb.WriteResult
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
/**
 * Manual scratch program, not an automated test.
 *
 * <p>It initialises the proxy factory, downloads the ZAKER news app page and
 * prints every channel link found there together with the article links of
 * each channel.
 */
public class HotSearchListTest {

    /**
     * Fetches {@code http://app.myzaker.com/news/app.php?f=}, follows each
     * {@code div.titlebar>a} link and prints the {@code href} and text of
     * every {@code div#infinite_scroll>a} element on the linked page. Any
     * exception ends the crawl and is printed to stderr.
     *
     * <p>An older MongoDB clean-up routine that used to live here was kept
     * only as commented-out code and is omitted.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SimpleConfig proxySettings = SimpleConfig.builder()
                .registry(ProxyConfig.registry)
                .group(ProxyConfig.group)
                .appId(10000013)
                .appName("zzw")
                .build();
        ProxyFactory.init(proxySettings);

        String indexUrl = "http://app.myzaker.com/news/app.php?f=";
        HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

        try {
            Elements channelLinks = Jsoup.parse(download(httpBoot, indexUrl)).select("div.titlebar>a");
            for (Element channelLink : channelLinks) {
                String channelUrl = "http://app.myzaker.com/news/app.php" + channelLink.attr("href");
                System.out.println("lableUrl=========" + channelUrl);

                Elements articleLinks =
                        Jsoup.parse(download(httpBoot, channelUrl)).select("div#infinite_scroll>a");
                for (Element articleLink : articleLinks) {
                    System.out.println(articleLink.attr("href") + "=============" + articleLink.text());
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Performs a GET through the heavy NAT proxy and returns the response body as text. */
    private static String download(HttpBoot httpBoot, String url) throws Exception {
        return httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
    }
}
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
View file @
cb6bcd76
...
...
@@ -6,6 +6,7 @@ import java.util.List;
import
java.util.Objects
;
import
java.util.concurrent.TimeUnit
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -16,10 +17,9 @@ import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
BaiduHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaiduHotSearchRun
.
class
);
@Override
public
void
run
()
{
boolean
f
=
true
;
...
...
@@ -37,10 +37,10 @@ public class BaiduHotSearchRun extends Thread{
private
void
getHotList
()
{
log
ger
.
info
(
"百度风云榜采集开始........"
);
log
.
info
(
"百度风云榜采集开始........"
);
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
List
<
HotSearchList
>
list
=
BaiDuHotSearchCrawler
.
baiduHotSearch
();
logger
.
info
(
"{}, 此轮百度风云榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"{}, 此轮百度风云榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
saveDataList
=
new
ArrayList
<>();
if
(
Objects
.
nonNull
(
list
)
&&
!
list
.
isEmpty
())
{
list
.
forEach
(
baiduHotSearch
->{
...
...
@@ -59,7 +59,7 @@ public class BaiduHotSearchRun extends Thread{
});
}
hotSearchDAO
.
addHotSearchList
(
saveDataList
);
log
ger
.
info
(
"百度风云榜采集结束........"
);
log
.
info
(
"百度风云榜采集结束........"
);
}
}
\ No newline at end of file
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
View file @
cb6bcd76
...
...
@@ -5,6 +5,7 @@ import java.util.Date;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -15,10 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.DouyinHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
DouyinHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
DouyinHotSearchRun
.
class
);
@Override
public
void
run
()
{
boolean
f
=
true
;
...
...
@@ -40,10 +40,10 @@ public class DouyinHotSearchRun extends Thread{
* @return void
*/
private
void
getHotList
()
{
log
ger
.
info
(
"抖音热搜榜采集开始........"
);
log
.
info
(
"抖音热搜榜采集开始........"
);
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
List
<
HotSearchList
>
list
=
DouyinHotSearchCrawler
.
getMobileDouyinHotList
();
log
ger
.
info
(
"{}, 抖音热搜榜此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"{}, 抖音热搜榜此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
HotSearchList
douyinHotSearch
:
list
){
int
changeCount
=
hotSearchDAO
.
getChangeCount
(
douyinHotSearch
);
...
...
@@ -60,7 +60,7 @@ public class DouyinHotSearchRun extends Thread{
data
.
add
(
douyin
);
hotSearchDAO
.
addHotSearch
(
douyin
);
}
log
ger
.
info
(
"抖音热搜榜采集结束........"
);
log
.
info
(
"抖音热搜榜采集结束........"
);
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/SendWeiboHotSearchRun.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
java.util.Calendar
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.dao.WechatUserDao
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.util.Template
;
import
com.zhiwei.searchhotcrawler.util.WechatCodeUtil
;
import
com.zhiwei.searchhotcrawler.util.WechatConstant
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Background thread that pushes Weibo hot-search alerts to WeChat users about once an hour.
 *
 * <p>Each cycle asks {@code HotSearchListDAO.getHotOneHour} for the Weibo hot-search entries
 * (presumably those of the last hour -- confirm against the DAO) and sends one WeChat template
 * message per entry to every member of the "weibohot" group. Messages are only sent while the
 * local hour of day is between 7 and 22; outside that window the thread only sleeps.
 */
public class SendWeiboHotSearchRun extends Thread {

    private static final Logger logger = LoggerFactory.getLogger(SendWeiboHotSearchRun.class);
    private static final WechatUserDao wechatUserDao = new WechatUserDao();

    /** Timestamp layout used in the pushed messages. */
    private static final String TIME_PATTERN = "yyyy-MM-dd HH:mm:ss";
    /** Pause between two push cycles, in milliseconds. */
    private static final int CYCLE_PAUSE_MILLIS = 60 * 60 * 1000;
    /** WeChat user group that receives the Weibo alerts. */
    private static final String USER_GROUP = "weibohot";
    /** Colour applied to the keyword fields of the template. */
    private static final String KEYWORD_COLOR = "#173177";

    private final HotSearchListDAO hotSearchDAO = new HotSearchListDAO();

    /** Runs the hourly push loop; never returns normally. */
    @Override
    public void run() {
        while (true) {
            try {
                int hour = Calendar.getInstance().get(Calendar.HOUR_OF_DAY);
                logger.info("微博推送,当前系统时间为:{}", hour);
                if (hour > 6 && hour < 23) {
                    pushLatestHotSearches();
                }
                ZhiWeiTools.sleep(CYCLE_PAUSE_MILLIS);
            } catch (Exception e) {
                // Log at ERROR and pass the throwable itself so the original stack trace is
                // kept (fillInStackTrace() would overwrite it with this catch site).
                logger.error("微博热搜推送出现问题,问题为:::{}", e.toString(), e);
                ZhiWeiTools.sleep(CYCLE_PAUSE_MILLIS);
            }
        }
    }

    /** Sends one alert per hot-search entry, or a single "no data" notice when there is none. */
    private void pushLatestHotSearches() {
        List<DBObject> list = hotSearchDAO.getHotOneHour(HotSearchType.微博热搜.name());
        if (list == null || list.isEmpty()) {
            logger.info("微博最近一小时无数据");
            sendTemplateByUserIds("最近一小时无数据",
                    TimeParse.dateFormartString(new Date(), TIME_PATTERN), null);
            return;
        }
        for (DBObject weibo : list) {
            Object name = weibo.get("name");
            if (name == null) {
                // A missing field used to throw an NPE that aborted the rest of the batch.
                logger.warn("微博热搜记录缺少name字段,已跳过:{}", weibo);
                continue;
            }
            Object rawUrl = weibo.get("url");
            // sendTemplateByUserIds already treats a null url as "no link".
            String url = rawUrl != null ? rawUrl.toString() : null;
            String time = TimeParse.dateFormartString((Date) weibo.get("time"), TIME_PATTERN);
            sendTemplateByUserIds(name.toString(), time, url);
        }
    }

    /**
     * Sends the Weibo hot-search alert template to every user returned by {@link #getUserList()}.
     *
     * @param title text placed in the template's {@code keyword1} field
     * @param time  formatted timestamp placed in the {@code keyword3} field
     * @param url   link attached to the message; skipped when {@code null}
     */
    public static void sendTemplateByUserIds(String title, String time, String url) {
        Map<String, Object> dataMap = new HashMap<>();
        dataMap.put("first", templateField("您好,有一条来自微博热搜榜的预警通知。", null));
        dataMap.put("keyword1", templateField(title, KEYWORD_COLOR));
        dataMap.put("keyword2", templateField("微博热搜榜", KEYWORD_COLOR));
        dataMap.put("keyword3", templateField(time, KEYWORD_COLOR));
        dataMap.put("remark", templateField("知微情报监测服务", null));

        List<String> userList = getUserList();
        if (userList == null || userList.isEmpty()) {
            logger.info("拉取微博用户列表失败");
            return;
        }
        for (String openId : userList) {
            Template template = new Template();
            template.setTouser(openId);
            if (url != null) {
                template.setUrl(url);
            }
            template.setTemplate_id(WechatConstant.WECHAT_TEMPLATEID_EARLY_IT);
            template.setData(dataMap);
            WechatCodeUtil.sendDataJson((JSONObject) JSONObject.toJSON(template));
        }
    }

    /** Builds one template field; the {@code color} key is left out when {@code color} is null. */
    private static JSONObject templateField(String value, String color) {
        JSONObject field = new JSONObject();
        field.put("value", value);
        if (color != null) {
            field.put("color", color);
        }
        return field;
    }

    /**
     * Looks up the recipients of the Weibo alerts.
     *
     * @return the users of the "weibohot" group from the local store, or, when that lookup
     *         returns {@code null}, the result of querying WeChat by group name
     */
    public static List<String> getUserList() {
        List<String> userList = wechatUserDao.getWechatUserByGroup(USER_GROUP);
        if (userList == null) {
            userList = WechatCodeUtil.getUserListByGroupName(USER_GROUP);
        }
        return userList;
    }
}
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
java.util.Calendar
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.dao.WechatUserDao
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.util.Template
;
import
com.zhiwei.searchhotcrawler.util.WechatCodeUtil
;
import
com.zhiwei.searchhotcrawler.util.WechatConstant
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
SendWeiboHotSearchRun
extends
Thread
{
private
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
private
static
WechatUserDao
wechatUserDao
=
new
WechatUserDao
();
@Override
public
void
run
()
{
while
(
true
)
{
try
{
Calendar
calendar
=
Calendar
.
getInstance
();
int
hour
=
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
);
log
.
info
(
"微博推送,当前系统时间为:"
+
hour
);
if
(
hour
>
6
&&
hour
<
23
)
{
List
<
DBObject
>
list
=
hotSearchDAO
.
getHotOneHour
(
HotSearchType
.
微博热搜
.
name
());
if
(
list
!=
null
&&
!
list
.
isEmpty
())
{
for
(
DBObject
weibo
:
list
)
{
String
title
=
weibo
.
get
(
"name"
).
toString
();
String
time
=
TimeParse
.
dateFormartString
((
Date
)
weibo
.
get
(
"time"
),
"yyyy-MM-dd HH:mm:ss"
);
String
url
=
weibo
.
get
(
"url"
).
toString
();
sendTemplateByUserIds
(
title
,
time
,
url
);
}
}
else
{
log
.
info
(
"微博最近一小时无数据"
);
sendTemplateByUserIds
(
"最近一小时无数据"
,
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd HH:mm:ss"
),
null
);
}
}
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
}
catch
(
Exception
e
)
{
log
.
debug
(
"微博热搜推送出现问题,问题为:::{}"
,
e
.
fillInStackTrace
());
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
continue
;
}
}
}
/**
* @Title: sendTemplateByUserIds
* @author hero
* @Description: 发送模版消息
* @param @param
* microTouTiao
* @param @param
* userList 设定文件
* @return void 返回类型
*/
public
static
void
sendTemplateByUserIds
(
String
title
,
String
time
,
String
url
)
{
Map
<
String
,
Object
>
dataMap
=
new
HashMap
<
String
,
Object
>();
JSONObject
first
=
new
JSONObject
();
first
.
put
(
"value"
,
"您好,有一条来自微博热搜榜的预警通知。"
);
dataMap
.
put
(
"first"
,
first
);
JSONObject
keyword1
=
new
JSONObject
();
keyword1
.
put
(
"value"
,
title
);
keyword1
.
put
(
"color"
,
"#173177"
);
dataMap
.
put
(
"keyword1"
,
keyword1
);
JSONObject
keyword2
=
new
JSONObject
();
keyword2
.
put
(
"value"
,
"微博热搜榜"
);
keyword2
.
put
(
"color"
,
"#173177"
);
dataMap
.
put
(
"keyword2"
,
keyword2
);
JSONObject
keyword3
=
new
JSONObject
();
keyword3
.
put
(
"value"
,
time
);
keyword3
.
put
(
"color"
,
"#173177"
);
dataMap
.
put
(
"keyword3"
,
keyword3
);
JSONObject
remark
=
new
JSONObject
();
remark
.
put
(
"value"
,
"知微情报监测服务"
);
dataMap
.
put
(
"remark"
,
remark
);
List
<
String
>
userList
=
getUserList
();
if
(
userList
!=
null
&&
userList
.
size
()
>
0
)
{
for
(
String
openId
:
userList
)
{
Template
template
=
new
Template
();
template
.
setTouser
(
openId
);
if
(
url
!=
null
)
{
template
.
setUrl
(
url
);
}
template
.
setTemplate_id
(
WechatConstant
.
WECHAT_TEMPLATEID_EARLY_IT
);
template
.
setData
(
dataMap
);
JSONObject
templateJson
=
(
JSONObject
)
JSONObject
.
toJSON
(
template
);
WechatCodeUtil
.
sendDataJson
(
templateJson
);
}
}
else
{
log
.
info
(
"拉取微博用户列表失败"
);
}
}
/**
* @Title: getUserList
* @author hero
* @Description: 用户列表
* @param @param
* projectName
* @param @return
* 设定文件
* @return List<String> 返回类型
*/
public
static
List
<
String
>
getUserList
()
{
List
<
String
>
userList
=
wechatUserDao
.
getWechatUserByGroup
(
"weibohot"
);
if
(
userList
==
null
){
userList
=
WechatCodeUtil
.
getUserListByGroupName
(
"weibohot"
);
}
return
userList
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/SendZhihuHotSearchRun.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
java.util.Calendar
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.dao.WechatUserDao
;
import
com.zhiwei.searchhotcrawler.util.Template
;
import
com.zhiwei.searchhotcrawler.util.WechatCodeUtil
;
import
com.zhiwei.searchhotcrawler.util.WechatConstant
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Background thread that pushes Zhihu hot-search alerts to WeChat users about once an hour.
 *
 * <p>Each cycle asks {@code HotSearchListDAO.getHotOneHour} for the Zhihu hot-search entries
 * (presumably those of the last hour -- confirm against the DAO) and sends one WeChat template
 * message per entry to every member of the "LP组" group. Messages are only sent while the
 * local hour of day is between 7 and 22; outside that window the thread only sleeps.
 */
public class SendZhihuHotSearchRun extends Thread {

    private static final Logger logger = LoggerFactory.getLogger(SendZhihuHotSearchRun.class);
    private static final WechatUserDao wechatUserDao = new WechatUserDao();

    /** Timestamp layout used in the pushed messages. */
    private static final String TIME_PATTERN = "yyyy-MM-dd HH:mm:ss";
    /** Pause between two push cycles, in milliseconds. */
    private static final int CYCLE_PAUSE_MILLIS = 60 * 60 * 1000;
    /** WeChat user group that receives the Zhihu alerts. */
    private static final String USER_GROUP = "LP组";
    /** Colour applied to the keyword fields of the template. */
    private static final String KEYWORD_COLOR = "#173177";

    private final HotSearchListDAO hotSearchDAO = new HotSearchListDAO();

    /** Runs the hourly push loop; never returns normally. */
    @Override
    public void run() {
        while (true) {
            try {
                int hour = Calendar.getInstance().get(Calendar.HOUR_OF_DAY);
                logger.info("知乎推送,当前系统时间为:{}", hour);
                if (hour > 6 && hour < 23) {
                    pushLatestHotSearches();
                }
                ZhiWeiTools.sleep(CYCLE_PAUSE_MILLIS);
            } catch (Exception e) {
                // Log at ERROR and pass the throwable itself so the original stack trace is
                // kept (fillInStackTrace() would overwrite it with this catch site).
                logger.error("知乎热搜推送出现问题,问题为:::{}", e.toString(), e);
                ZhiWeiTools.sleep(CYCLE_PAUSE_MILLIS);
            }
        }
    }

    /** Sends one alert per hot-search entry, or a single "no data" notice when there is none. */
    private void pushLatestHotSearches() {
        List<DBObject> list = hotSearchDAO.getHotOneHour(HotSearchType.知乎热搜.name());
        if (list == null || list.isEmpty()) {
            logger.info("知乎最近一小时无数据");
            sendTemplateByUserIds("最近一小时无数据",
                    TimeParse.dateFormartString(new Date(), TIME_PATTERN), null);
            return;
        }
        // The per-entry hour re-check was dropped: it read the same Calendar snapshot as the
        // check in run() and therefore could never be false here.
        for (DBObject zhihu : list) {
            Object title = zhihu.get("display_query");
            if (title == null) {
                // A missing field used to throw an NPE that aborted the rest of the batch.
                logger.warn("知乎热搜记录缺少display_query字段,已跳过:{}", zhihu);
                continue;
            }
            // NOTE(review): the document "_id" is sent as the link target -- confirm that the
            // id really is a URL for these records.
            Object rawId = zhihu.get("_id");
            String url = rawId != null ? rawId.toString() : null;
            String time = TimeParse.dateFormartString((Date) zhihu.get("time"), TIME_PATTERN);
            sendTemplateByUserIds(title.toString(), time, url);
        }
    }

    /**
     * Sends the Zhihu hot-search alert template to every user of the "LP组" group.
     *
     * @param title text placed in the template's {@code keyword1} field
     * @param time  formatted timestamp placed in the {@code keyword3} field
     * @param url   link attached to the message; skipped when {@code null}
     */
    public static void sendTemplateByUserIds(String title, String time, String url) {
        Map<String, Object> dataMap = new HashMap<>();
        dataMap.put("first", templateField("您好,有一条来自知乎热搜榜的预警通知。", null));
        dataMap.put("keyword1", templateField(title, KEYWORD_COLOR));
        dataMap.put("keyword2", templateField("知乎热搜榜", KEYWORD_COLOR));
        dataMap.put("keyword3", templateField(time, KEYWORD_COLOR));
        dataMap.put("remark", templateField("知微情报监测服务", null));

        List<String> userList = getUserList();
        if (userList == null || userList.isEmpty()) {
            logger.info("知乎推送拉取用户列表失败");
            return;
        }
        for (String openId : userList) {
            Template template = new Template();
            template.setTouser(openId);
            if (url != null) {
                template.setUrl(url);
            }
            template.setTemplate_id(WechatConstant.WECHAT_TEMPLATEID_EARLY_IT);
            template.setData(dataMap);
            WechatCodeUtil.sendDataJson((JSONObject) JSONObject.toJSON(template));
        }
    }

    /** Builds one template field; the {@code color} key is left out when {@code color} is null. */
    private static JSONObject templateField(String value, String color) {
        JSONObject field = new JSONObject();
        field.put("value", value);
        if (color != null) {
            field.put("color", color);
        }
        return field;
    }

    /**
     * Looks up the recipients of the Zhihu alerts.
     *
     * @return the users of the "LP组" group from the local store, or, when that lookup
     *         returns {@code null}, the result of querying WeChat by group name
     */
    private static List<String> getUserList() {
        List<String> userList = wechatUserDao.getWechatUserByGroup(USER_GROUP);
        if (userList == null) {
            userList = WechatCodeUtil.getUserListByGroupName(USER_GROUP);
        }
        return userList;
    }
}
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
java.util.Calendar
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.dao.WechatUserDao
;
import
com.zhiwei.searchhotcrawler.util.Template
;
import
com.zhiwei.searchhotcrawler.util.WechatCodeUtil
;
import
com.zhiwei.searchhotcrawler.util.WechatConstant
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
SendZhihuHotSearchRun
extends
Thread
{
private
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
private
static
WechatUserDao
wechatUserDao
=
new
WechatUserDao
();
@Override
public
void
run
()
{
while
(
true
)
{
try
{
Calendar
calendar
=
Calendar
.
getInstance
();
int
hour
=
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
);
log
.
info
(
"知乎推送,当前系统时间为:"
+
hour
);
if
(
hour
>
6
&&
hour
<
23
){
List
<
DBObject
>
list
=
hotSearchDAO
.
getHotOneHour
(
HotSearchType
.
知乎热搜
.
name
());
if
(
list
!=
null
&&
!
list
.
isEmpty
()){
for
(
DBObject
zhihu
:
list
){
String
title
=
zhihu
.
get
(
"display_query"
).
toString
();
String
time
=
TimeParse
.
dateFormartString
((
Date
)
zhihu
.
get
(
"time"
),
"yyyy-MM-dd HH:mm:ss"
);
String
url
=
zhihu
.
get
(
"_id"
).
toString
();
if
(
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
)
>
6
&&
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
)
<
23
){
sendTemplateByUserIds
(
title
,
time
,
url
);
}
}
}
else
{
log
.
info
(
"知乎最近一小时无数据"
);
sendTemplateByUserIds
(
"最近一小时无数据"
,
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd HH:mm:ss"
),
null
);
}
}
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
}
catch
(
Exception
e
)
{
log
.
debug
(
"知乎热搜推送出现问题,问题为:::{}"
,
e
.
fillInStackTrace
());
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
}
}
}
/**
* @Title: sendTemplateByUserIds
* @author hero
* @Description: 发送模版消息
* @param @param microTouTiao
* @param @param userList 设定文件
* @return void 返回类型
*/
public
static
void
sendTemplateByUserIds
(
String
title
,
String
time
,
String
url
)
{
Map
<
String
,
Object
>
dataMap
=
new
HashMap
<>();
JSONObject
first
=
new
JSONObject
();
first
.
put
(
"value"
,
"您好,有一条来自知乎热搜榜的预警通知。"
);
dataMap
.
put
(
"first"
,
first
);
JSONObject
keyword1
=
new
JSONObject
();
keyword1
.
put
(
"value"
,
title
);
keyword1
.
put
(
"color"
,
"#173177"
);
dataMap
.
put
(
"keyword1"
,
keyword1
);
JSONObject
keyword2
=
new
JSONObject
();
keyword2
.
put
(
"value"
,
"知乎热搜榜"
);
keyword2
.
put
(
"color"
,
"#173177"
);
dataMap
.
put
(
"keyword2"
,
keyword2
);
JSONObject
keyword3
=
new
JSONObject
();
keyword3
.
put
(
"value"
,
time
);
keyword3
.
put
(
"color"
,
"#173177"
);
dataMap
.
put
(
"keyword3"
,
keyword3
);
JSONObject
remark
=
new
JSONObject
();
remark
.
put
(
"value"
,
"知微情报监测服务"
);
dataMap
.
put
(
"remark"
,
remark
);
List
<
String
>
userList
=
getUserList
();
if
(
userList
!=
null
&&
!
userList
.
isEmpty
())
{
for
(
String
openId
:
userList
)
{
Template
template
=
new
Template
();
template
.
setTouser
(
openId
);
if
(
url
!=
null
){
template
.
setUrl
(
url
);
}
template
.
setTemplate_id
(
WechatConstant
.
WECHAT_TEMPLATEID_EARLY_IT
);
template
.
setData
(
dataMap
);
JSONObject
templateJson
=
(
JSONObject
)
JSONObject
.
toJSON
(
template
);
WechatCodeUtil
.
sendDataJson
(
templateJson
);
}
}
else
{
log
.
info
(
"知乎推送拉取用户列表失败"
);
}
}
/**
* @Title: getUserList
* @author hero
* @Description: 用户列表
* @param @param projectName
* @param @return 设定文件
* @return List<String> 返回类型
*/
private
static
List
<
String
>
getUserList
()
{
List
<
String
>
userList
=
wechatUserDao
.
getWechatUserByGroup
(
"LP组"
);
if
(
userList
==
null
){
userList
=
WechatCodeUtil
.
getUserListByGroupName
(
"LP组"
);
}
return
userList
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
View file @
cb6bcd76
...
...
@@ -5,6 +5,7 @@ import java.util.Date;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -15,9 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
SougoHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SougoHotSearchRun
.
class
);
@Override
public
void
run
()
{
boolean
f
=
true
;
...
...
@@ -36,9 +37,9 @@ public class SougoHotSearchRun extends Thread {
private
void
getHotList
()
{
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
log
ger
.
info
(
"搜狗微信采集开始........"
);
log
.
info
(
"搜狗微信采集开始........"
);
List
<
HotSearchList
>
list
=
SougoHotSearchCrawler
.
sougoHotSearch
();
logger
.
info
(
"{}, 此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"{}, 此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
HotSearchList
sougoHotSearch
:
list
){
DBObject
doc
=
new
BasicDBObject
();
...
...
@@ -52,7 +53,7 @@ public class SougoHotSearchRun extends Thread {
data
.
add
(
doc
);
}
hotSearchDAO
.
addHotSearchList
(
data
);
log
ger
.
info
(
"搜狗微信采集结束........"
);
log
.
info
(
"搜狗微信采集结束........"
);
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/UpdateWechatUserRun.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
java.util.Calendar
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.searchhotcrawler.dao.WechatUserDao
;
import
com.zhiwei.searchhotcrawler.util.WechatCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Background thread that refreshes the stored WeChat users of every group about once an hour.
 *
 * <p>Each cycle (only when the local hour of day is after 6) it fetches the group map from
 * {@code WechatCodeUtil.getAllGroupIp()}, fetches the users of each group and hands every
 * non-empty user list to {@code WechatUserDao.addWechatUser}.
 */
public class UpdateWechatUserRun extends Thread {

    private static final Logger logger = LoggerFactory.getLogger(UpdateWechatUserRun.class);

    /** Pause between two refresh cycles, in milliseconds. */
    private static final int CYCLE_PAUSE_MILLIS = 60 * 60 * 1000;

    private final WechatUserDao wechatUserDao = new WechatUserDao();

    /** Runs the hourly refresh loop; never returns normally. */
    @Override
    public void run() {
        logger.info("开始更新用户数据");
        while (true) {
            try {
                if (Calendar.getInstance().get(Calendar.HOUR_OF_DAY) > 6) {
                    refreshAllGroups();
                }
                ZhiWeiTools.sleep(CYCLE_PAUSE_MILLIS);
            } catch (Exception e) {
                // The old message was copy-pasted from the Zhihu push job and logged at DEBUG;
                // report the real task at ERROR together with the original stack trace.
                logger.error("更新微信用户数据出现问题,问题为:::{}", e.toString(), e);
                ZhiWeiTools.sleep(CYCLE_PAUSE_MILLIS);
            }
        }
    }

    /** Fetches every group and stores the users of each group that has any. */
    private void refreshAllGroups() {
        Map<String, Integer> groupMap = WechatCodeUtil.getAllGroupIp();
        // Null must be checked before size()/isEmpty(); the old code dereferenced first.
        int groupCount = groupMap == null ? 0 : groupMap.size();
        logger.info("此公众号的分组数量为:::{}", groupCount);
        if (groupCount == 0) {
            return;
        }
        for (Entry<String, Integer> group : groupMap.entrySet()) {
            logger.info("此公众号的分组名称及IP为:::{},{}", group.getKey(), group.getValue());
            List<String> userList = WechatCodeUtil.getUserListByGroupId(group.getValue());
            // Same ordering issue as above: count defensively before logging it.
            int userCount = userList == null ? 0 : userList.size();
            logger.info("{},此分组下的用户数量为::{}", group.getKey(), userCount);
            if (userCount > 0) {
                wechatUserDao.addWechatUser(userList, group.getKey(), group.getValue());
            }
        }
    }
}
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
java.util.Calendar
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.searchhotcrawler.dao.WechatUserDao
;
import
com.zhiwei.searchhotcrawler.util.WechatCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
UpdateWechatUserRun
extends
Thread
{
private
WechatUserDao
wechatUserDao
=
new
WechatUserDao
();
@Override
public
void
run
()
{
log
.
info
(
"开始更新用户数据"
);
while
(
true
)
{
try
{
Calendar
calendar
=
Calendar
.
getInstance
();
int
hour
=
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
);
if
(
hour
>
6
){
Map
<
String
,
Integer
>
groupMap
=
WechatCodeUtil
.
getAllGroupIp
();
log
.
info
(
"此公众号的分组数量为:::{}"
,
groupMap
.
size
());
if
(!
groupMap
.
isEmpty
()
&&
groupMap
!=
null
){
for
(
Entry
<
String
,
Integer
>
group
:
groupMap
.
entrySet
()){
log
.
info
(
"此公众号的分组名称及IP为:::{},{}"
,
group
.
getKey
(),
group
.
getValue
());
List
<
String
>
userList
=
WechatCodeUtil
.
getUserListByGroupId
(
group
.
getValue
());
log
.
info
(
"{},此分组下的用户数量为::{}"
,
group
.
getKey
(),
userList
.
size
());
if
(
userList
!=
null
&&
!
userList
.
isEmpty
()){
wechatUserDao
.
addWechatUser
(
userList
,
group
.
getKey
(),
group
.
getValue
());
}
}
}
}
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
}
catch
(
Exception
e
)
{
log
.
debug
(
"知乎热搜推送出现问题,问题为:::{}"
,
e
.
fillInStackTrace
());
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
continue
;
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboHotSearchRun.java
View file @
cb6bcd76
...
...
@@ -5,6 +5,7 @@ import java.util.Date;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -15,10 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
WeiboHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WeiboHotSearchRun
.
class
);
@Override
public
void
run
()
{
boolean
f
=
true
;
...
...
@@ -36,11 +36,11 @@ public class WeiboHotSearchRun extends Thread{
private
void
getHotList
()
{
log
ger
.
info
(
"微博话题采集开始........"
);
log
.
info
(
"微博话题采集开始........"
);
HotSearchListDAO
weiboHotSearchDAO
=
new
HotSearchListDAO
();
// List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearch();
List
<
HotSearchList
>
list
=
WeiboHotSearchCrawler
.
weiboHotSearchByPhone
();
logger
.
info
(
"{}, 微博此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"{}, 微博此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
HotSearchList
weiboHotSearch
:
list
){
int
changeCount
=
weiboHotSearchDAO
.
getChangeCount
(
weiboHotSearch
);
...
...
@@ -49,7 +49,7 @@ public class WeiboHotSearchRun extends Thread{
doc
.
put
(
"name"
,
weiboHotSearch
.
getName
());
doc
.
put
(
"url"
,
weiboHotSearch
.
getUrl
());
doc
.
put
(
"count"
,
weiboHotSearch
.
getCount
());
doc
.
put
(
"hot"
,
weiboHotSearch
.
is
Hot
());
doc
.
put
(
"hot"
,
weiboHotSearch
.
get
Hot
());
doc
.
put
(
"day"
,
weiboHotSearch
.
getDay
());
doc
.
put
(
"time"
,
weiboHotSearch
.
getTime
());
doc
.
put
(
"changeCount"
,
changeCount
);
...
...
@@ -59,7 +59,7 @@ public class WeiboHotSearchRun extends Thread{
data
.
add
(
doc
);
}
weiboHotSearchDAO
.
addHotSearchList
(
data
);
log
ger
.
info
(
"微博话题采集结束........"
);
log
.
info
(
"微博话题采集结束........"
);
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboSuperTopicRun.java
0 → 100644
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
com.zhiwei.searchhotcrawler.crawler.WeiboSuperTopicCrawler
;
import
com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
WeiboSuperTopicRun
extends
Thread
{
@Override
public
void
run
()
{
boolean
f
=
true
;
while
(
f
)
{
try
{
getTopicList
();
TimeUnit
.
HOURS
.
sleep
(
3
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
}
ZhiWeiTools
.
sleep
(
50
);
}
}
private
void
getTopicList
()
{
WeiboSuperTopicDAO
weiboTopicDAO
=
new
WeiboSuperTopicDAO
();
log
.
info
(
"微博超话采集开始........"
);
List
<
WeiboSuperTopic
>
list
=
WeiboSuperTopicCrawler
.
startCrawler
();
log
.
info
(
"{}, 微博超话此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
WeiboSuperTopic
topic
:
list
){
log
.
info
(
"topic::::{}"
,
topic
);
DBObject
doc
=
new
BasicDBObject
();
doc
.
put
(
"_id"
,
topic
.
getId
());
doc
.
put
(
"name"
,
topic
.
getTopicName
());
doc
.
put
(
"rank"
,
topic
.
getRank
());
doc
.
put
(
"score_num"
,
topic
.
getScore
());
doc
.
put
(
"fensi_num"
,
topic
.
getFensi
());
doc
.
put
(
"post_num"
,
topic
.
getPostNum
());
doc
.
put
(
"type"
,
topic
.
getType
());
doc
.
put
(
"day"
,
topic
.
getDay
());
doc
.
put
(
"time"
,
topic
.
getTime
());
doc
.
put
(
"url"
,
topic
.
getUrl
());
data
.
add
(
doc
);
}
weiboTopicDAO
.
addTopicList
(
data
);
log
.
info
(
"微博话题采集结束........"
);
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboTopicRun.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.crawler.WeiboTopicCrawler
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.WeiboTopic
;
import
com.zhiwei.searchhotcrawler.crawler.WeiboHuatiCrawler
;
import
com.zhiwei.searchhotcrawler.dao.WeiboTopicDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
WeiboTopicRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WeiboTopicRun
.
class
);
@Override
public
void
run
()
{
boolean
f
=
true
;
while
(
f
)
{
try
{
getTopicList
();
TimeUnit
.
HOUR
S
.
sleep
(
3
);
TimeUnit
.
MINUTE
S
.
sleep
(
3
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
...
...
@@ -36,28 +31,29 @@ public class WeiboTopicRun extends Thread{
private
void
getTopicList
()
{
WeiboTopicDAO
weiboTopicDAO
=
new
WeiboTopic
DAO
();
log
ger
.
info
(
"微博超话
采集开始........"
);
List
<
WeiboTopic
>
list
=
WeiboHuatiCrawler
.
startCrawler
();
logger
.
info
(
"{}, 微博超话
此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
HotSearchListDAO
weiboHotSearchDAO
=
new
HotSearchList
DAO
();
log
.
info
(
"微博话题
采集开始........"
);
List
<
HotSearchList
>
list
=
WeiboTopicCrawler
.
startCrawlerByPhone
();
log
.
info
(
"{}, 微博话题
此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
WeiboTopic
topic
:
list
){
logger
.
info
(
"topic::::{}"
,
topic
);
for
(
HotSearchList
topic
:
list
){
log
.
info
(
"topic::::{}"
,
topic
);
DBObject
doc
=
new
BasicDBObject
();
doc
.
put
(
"_id"
,
topic
.
getId
());
doc
.
put
(
"name"
,
topic
.
getTopicName
());
doc
.
put
(
"rank"
,
topic
.
getRank
());
doc
.
put
(
"score_num"
,
topic
.
getScore
());
doc
.
put
(
"fensi_num"
,
topic
.
getFensi
());
doc
.
put
(
"post_num"
,
topic
.
getPostNum
());
doc
.
put
(
"type"
,
topic
.
getType
());
doc
.
put
(
"name"
,
topic
.
getName
());
doc
.
put
(
"url"
,
topic
.
getUrl
());
doc
.
put
(
"count"
,
topic
.
getCount
());
doc
.
put
(
"hot"
,
topic
.
getHot
());
doc
.
put
(
"day"
,
topic
.
getDay
());
doc
.
put
(
"time"
,
topic
.
getTime
());
doc
.
put
(
"url"
,
topic
.
getUrl
());
doc
.
put
(
"rank"
,
topic
.
getRank
());
doc
.
put
(
"type"
,
topic
.
getType
());
doc
.
put
(
"topic_lead"
,
topic
.
getTopicLead
());
doc
.
put
(
"comment_count"
,
topic
.
getCommentCount
());
data
.
add
(
doc
);
}
weibo
TopicDAO
.
addTopic
List
(
data
);
log
ger
.
info
(
"微博话题采集结束........"
);
weibo
HotSearchDAO
.
addHotSearch
List
(
data
);
log
.
info
(
"微博话题采集结束........"
);
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
View file @
cb6bcd76
...
...
@@ -4,6 +4,7 @@ import java.util.Date;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -14,10 +15,9 @@ import com.zhiwei.searchhotcrawler.crawler.ZhihuHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
ZhihuHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
ZhihuHotSearchRun
.
class
);
@Override
public
void
run
()
{
boolean
f
=
true
;
...
...
@@ -32,24 +32,22 @@ public class ZhihuHotSearchRun extends Thread{
ZhiWeiTools
.
sleep
(
50
);
}
}
private
void
getHotList
()
{
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
log
ger
.
info
(
"知乎话题采集开始...,当前线程名字:{}"
,
Thread
.
currentThread
().
getName
());
log
.
info
(
"知乎话题采集开始...,当前线程名字:{}"
,
Thread
.
currentThread
().
getName
());
List
<
HotSearchList
>
list
=
ZhihuHotSearchCrawler
.
getZhihuHotList
();
List
<
HotSearchList
>
mobilelist
=
ZhihuHotSearchCrawler
.
getMobileZhihuHotList
();
list
.
addAll
(
mobilelist
);
log
ger
.
info
(
"{}, 知乎此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"{}, 知乎此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
for
(
HotSearchList
zhihuHotSearch
:
list
){
DBObject
zhihu
=
new
BasicDBObject
();
zhihu
.
put
(
"_id"
,
zhihuHotSearch
.
getId
());
zhihu
.
put
(
"name"
,
zhihuHotSearch
.
getName
());
zhihu
.
put
(
"url"
,
zhihuHotSearch
.
getUrl
());
zhihu
.
put
(
"count"
,
zhihuHotSearch
.
getCount
());
zhihu
.
put
(
"hot"
,
zhihuHotSearch
.
is
Hot
());
zhihu
.
put
(
"hot"
,
zhihuHotSearch
.
get
Hot
());
zhihu
.
put
(
"day"
,
zhihuHotSearch
.
getDay
());
zhihu
.
put
(
"time"
,
zhihuHotSearch
.
getTime
());
zhihu
.
put
(
"changeCount"
,
0
);
...
...
@@ -57,7 +55,7 @@ public class ZhihuHotSearchRun extends Thread{
zhihu
.
put
(
"type"
,
zhihuHotSearch
.
getType
());
hotSearchDAO
.
addHotSearch
(
zhihu
);
}
log
ger
.
info
(
"知乎话题采集结束........"
);
log
.
info
(
"知乎话题采集结束........"
);
}
}
src/main/resources/db.properties
View file @
cb6bcd76
#mongoIp=202.107.192.94
mongoIp
=
192.168.0.101
mongoPort
=
30000
#mongoIp=192.168.0.81
#mongoPort=27017
db.username
=
datapush
db.paasword
=
4d8ce5c42073c
db.certifiedDB
=
admin
dbName
=
hot_search_list
searchCollName
=
hot_search_list
topicCollName
=
topic_list
#mongoIp=202.107.192.94
mongoIp
=
192.168.0.101
mongoPort
=
30000
#mongoIp=192.168.0.81
#mongoPort=27017
db.username
=
searchhotcrawleruser
db.paasword
=
searchhotcrawler1q2w3e4r
db.certifiedDB
=
admin
dbName
=
hot_search_list
searchCollName
=
hot_search_list
topicCollName
=
topic_list
collWechatUserName
=
wechat_user
\ No newline at end of file
src/main/resources/proxyip.properties
View file @
cb6bcd76
registry
=
zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182
group
=
hangzhou
########################################################
#registry=zookeeper://192.168.0.
36
:2181
#registry=zookeeper://192.168.0.
11:2181?backup=192.168.0.30:2181,192.168.0.35
:2181
#
group
=
local
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment