Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
cb6bcd76
Commit
cb6bcd76
authored
Mar 06, 2020
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加微博话题采集,并添加lombok
parent
a9966f9d
Hide whitespace changes
Inline
Side-by-side
Showing
30 changed files
with
2012 additions
and
1780 deletions
+2012
-1780
pom.xml
+120
-114
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
+76
-134
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+10
-9
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboSuperTopic.java
+90
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboTopic.java
+0
-158
src/main/java/com/zhiwei/searchhotcrawler/cache/CacheListener.java
+32
-29
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
+114
-113
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
+75
-74
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
+96
-96
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+11
-10
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboSuperTopicCrawler.java
+164
-163
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
+219
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
+114
-114
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchListDAO.java
+153
-137
src/main/java/com/zhiwei/searchhotcrawler/dao/WechatUserDao.java
+70
-69
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiboSuperTopicDAO.java
+85
-71
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
+7
-12
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchListTest.java
+142
-107
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
+6
-5
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
+5
-5
src/main/java/com/zhiwei/searchhotcrawler/timer/SendWeiboHotSearchRun.java
+125
-124
src/main/java/com/zhiwei/searchhotcrawler/timer/SendZhihuHotSearchRun.java
+127
-126
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
+6
-5
src/main/java/com/zhiwei/searchhotcrawler/timer/UpdateWechatUserRun.java
+50
-49
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboHotSearchRun.java
+6
-6
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboSuperTopicRun.java
+63
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboTopicRun.java
+25
-29
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
+7
-9
src/main/resources/db.properties
+12
-11
src/main/resources/proxyip.properties
+2
-1
No files found.
pom.xml
View file @
cb6bcd76
<project
xmlns=
"http://maven.apache.org/POM/4.0.0"
xmlns:xsi=
"http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation=
"http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
>
<project
xmlns=
"http://maven.apache.org/POM/4.0.0"
xmlns:xsi=
"http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation=
"http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
>
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
searchhotcrawler
</artifactId>
<artifactId>
searchhotcrawler
</artifactId>
<version>
0.0.6-SNAPSHOT
</version>
<version>
0.0.6-SNAPSHOT
</version>
<name>
各平台热搜榜单采集程序
</name>
<name>
各平台热搜榜单采集程序
</name>
<description>
各平台热搜榜单采集程序
<description>
各平台热搜榜单采集程序
目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序
</description>
目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序
</description>
<properties>
<properties>
<project.build.sourceEncoding>
UTF-8
</project.build.sourceEncoding>
<project.build.sourceEncoding>
UTF-8
</project.build.sourceEncoding>
<project.reporting.outputEncoding>
UTF-8
</project.reporting.outputEncoding>
<project.reporting.outputEncoding>
UTF-8
</project.reporting.outputEncoding>
</properties>
</properties>
<developers>
<developers>
<developer>
<developer>
<id>
Bewilder
</id>
<id>
Bewilder
</id>
<name>
zhiwei zhang
</name>
<name>
zhiwei zhang
</name>
<email>
zhangzhiwei@zhiweidata.com
</email>
<email>
zhangzhiwei@zhiweidata.com
</email>
</developer>
</developer>
</developers>
</developers>
<dependencies>
<dependencies>
<!-- 数据解析jar -->
<!-- 数据解析jar -->
<dependency>
<dependency>
<groupId>
org.mongodb
</groupId>
<groupId>
org.mongodb
</groupId>
<artifactId>
mongo-java-driver
</artifactId>
<artifactId>
mongo-java-driver
</artifactId>
<version>
3.6.3
</version>
<version>
3.6.3
</version>
</dependency>
</dependency>
<dependency>
<dependency>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
sendmail
</artifactId>
<artifactId>
sendmail
</artifactId>
<version>
0.0.1-SNAPSHOT
</version>
<version>
0.0.1-SNAPSHOT
</version>
</dependency>
</dependency>
<dependency>
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.1.5-SNAPSHOT
</version>
<version>
0.1.6-SNAPSHOT
</version>
</dependency>
</dependency>
<dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<artifactId>
crawler-core
</artifactId>
<version>
0.5.5.6-SNAPSHOT
</version>
<version>
0.6.0.4-RELEASE
</version>
</dependency>
</dependency>
</dependencies>
<dependency>
<groupId>
org.projectlombok
</groupId>
<build>
<artifactId>
lombok
</artifactId>
<plugins>
<version>
1.18.8
</version>
<plugin>
</dependency>
<groupId>
org.apache.maven.plugins
</groupId>
</dependencies>
<artifactId>
maven-shade-plugin
</artifactId>
<version>
2.4.2
</version>
<build>
<executions>
<plugins>
<execution>
<plugin>
<phase>
package
</phase>
<groupId>
org.apache.maven.plugins
</groupId>
<goals>
<artifactId>
maven-shade-plugin
</artifactId>
<goal>
shade
</goal>
<version>
2.4.2
</version>
</goals>
<executions>
<configuration>
<execution>
<filters>
<phase>
package
</phase>
<filter>
<goals>
<artifact>
*:*
</artifact>
<goal>
shade
</goal>
<excludes>
</goals>
<exclude>
META-INF/*.SF
</exclude>
<configuration>
<exclude>
META-INF/*.DSA
</exclude>
<filters>
<exclude>
META-INF/*.RSA
</exclude>
<filter>
</excludes>
<artifact>
*:*
</artifact>
</filter>
<excludes>
</filters>
<exclude>
META-INF/*.SF
</exclude>
<exclude>
META-INF/*.DSA
</exclude>
<transformers>
<exclude>
META-INF/*.RSA
</exclude>
<transformer
</excludes>
implementation=
"org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"
>
</filter>
<mainClass>
com.zhiwei.searchhotcrawler.run.HotSearchRun
</mainClass>
</filters>
</transformer>
</transformers>
<transformers>
</configuration>
<transformer
</execution>
implementation=
"org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"
>
</executions>
<mainClass>
com.zhiwei.searchhotcrawler.run.HotSearchRun
</mainClass>
</plugin>
</transformer>
</transformers>
<plugin>
</configuration>
<artifactId>
maven-source-plugin
</artifactId>
</execution>
<version>
2.4
</version>
</executions>
<configuration>
</plugin>
<attach>
true
</attach>
</configuration>
<plugin>
<executions>
<artifactId>
maven-source-plugin
</artifactId>
<execution>
<version>
2.4
</version>
<phase>
compile
</phase>
<configuration>
<goals>
<attach>
true
</attach>
<goal>
jar
</goal>
</configuration>
</goals>
<executions>
</execution>
<execution>
</executions>
<phase>
compile
</phase>
</plugin>
<goals>
<goal>
jar
</goal>
<!-- 解决maven test命令时console出现中文乱码乱码 -->
</goals>
<plugin>
</execution>
<groupId>
org.apache.maven.plugins
</groupId>
</executions>
<artifactId>
maven-surefire-plugin
</artifactId>
</plugin>
<version>
2.19.1
</version>
<configuration>
<!-- 解决maven test命令时console出现中文乱码乱码 -->
<forkMode>
once
</forkMode>
<plugin>
<argLine>
-Dfile.encoding=UTF-8
</argLine>
<groupId>
org.apache.maven.plugins
</groupId>
<skipTests>
true
</skipTests>
<artifactId>
maven-surefire-plugin
</artifactId>
</configuration>
<version>
2.19.1
</version>
</plugin>
<configuration>
</plugins>
<forkMode>
once
</forkMode>
</build>
<argLine>
-Dfile.encoding=UTF-8
</argLine>
<skipTests>
true
</skipTests>
</configuration>
</plugin>
</plugins>
</build>
</project>
</project>
\ No newline at end of file
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
View file @
cb6bcd76
...
@@ -10,36 +10,80 @@ import java.io.Serializable;
...
@@ -10,36 +10,80 @@ import java.io.Serializable;
import
java.util.Date
;
import
java.util.Date
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
lombok.Data
;
import
lombok.ToString
;
@Data
@ToString
public
class
HotSearchList
implements
Serializable
{
public
class
HotSearchList
implements
Serializable
{
private
static
final
long
serialVersionUID
=
2076919584659821600L
;
private
static
final
long
serialVersionUID
=
2076919584659821600L
;
private
String
id
;
//主键
private
String
url
;
//消息链接
private
String
name
;
//热搜关键词
private
Integer
count
;
//时时热搜量
private
Boolean
hot
;
//状态(true 为热搜; false为时时上升)
private
String
day
;
//天
private
Date
time
;
//时间
private
Integer
changeCount
;
//据上分钟变化量
private
Integer
rank
;
//排名
private
String
type
;
//分类
private
String
icon
;
//热搜类型
/**
* 主键
*/
private
String
id
;
/**
* 消息链接
*/
private
String
url
;
/**
* 热搜关键词
*/
private
String
name
;
/**
* 热搜或话题导语
*/
private
String
topicLead
;
/**
* 时时热搜量
*/
private
Integer
count
;
/**
* 状态(true 为热搜; false为时时上升)
*/
private
Boolean
hot
;
/**
* 天
*/
private
String
day
;
/**
* 时间
*/
private
Date
time
;
/**
* 据上分钟变化量
*/
private
Integer
changeCount
;
/**
* 排名
*/
private
Integer
rank
;
/**
* 分类
*/
private
String
type
;
/**
* 热搜类型
*/
private
String
icon
;
/**
* 话题讨论量
*/
private
Integer
commentCount
;
public
HotSearchList
(){}
public
HotSearchList
(){}
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
String
icon
){
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
String
icon
){
...
@@ -67,122 +111,20 @@ public class HotSearchList implements Serializable{
...
@@ -67,122 +111,20 @@ public class HotSearchList implements Serializable{
this
.
day
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
this
.
day
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
this
.
type
=
type
;
this
.
type
=
type
;
}
}
@Override
public
String
toString
(){
return
"new HotSearchList["
+
"id = "
+
id
+
", url = "
+
url
+
", name = "
+
name
+
", count = "
+
count
+
", time = "
+
time
+
", hot = "
+
hot
+
", rank = "
+
rank
+
", day = "
+
day
+
", changeCount = "
+
changeCount
+
", type = "
+
type
+
", icon = "
+
icon
+
"]"
;
}
public
String
getId
()
{
return
id
;
}
public
void
setId
(
String
id
)
{
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Integer
rank
,
String
type
,
Integer
commentCount
,
String
topicLead
){
this
.
id
=
id
;
this
.
id
=
name
+
"_"
+
new
Date
().
getTime
();
}
public
String
getUrl
()
{
return
url
;
}
public
void
setUrl
(
String
url
)
{
this
.
url
=
url
;
this
.
url
=
url
;
}
public
String
getName
()
{
return
name
;
}
public
void
setName
(
String
name
)
{
this
.
name
=
name
;
this
.
name
=
name
;
}
public
Integer
getCount
()
{
return
count
;
}
public
void
setCount
(
Integer
count
)
{
this
.
count
=
count
;
this
.
count
=
count
;
}
this
.
hot
=
true
;
public
Date
getTime
()
{
return
time
;
}
public
void
setTime
(
Date
time
)
{
this
.
time
=
time
;
}
public
Integer
getChangeCount
()
{
return
changeCount
;
}
public
void
setChangeCount
(
Integer
changeCount
)
{
this
.
changeCount
=
changeCount
;
}
public
static
long
getSerialversionuid
()
{
return
serialVersionUID
;
}
public
Boolean
isHot
()
{
return
hot
;
}
public
void
setHot
(
Boolean
hot
)
{
this
.
hot
=
hot
;
}
public
Boolean
getHot
()
{
return
hot
;
}
public
String
getIcon
()
{
return
icon
;
}
public
void
setIcon
(
String
icon
)
{
this
.
icon
=
icon
;
}
public
String
getDay
()
{
return
day
;
}
public
void
setDay
(
String
day
)
{
this
.
day
=
day
;
}
public
Integer
getRank
()
{
return
rank
;
}
public
void
setRank
(
Integer
rank
)
{
this
.
rank
=
rank
;
this
.
rank
=
rank
;
}
this
.
time
=
new
Date
();
this
.
day
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
public
String
getType
()
{
return
type
;
}
public
void
setType
(
String
type
)
{
this
.
type
=
type
;
this
.
type
=
type
;
this
.
commentCount
=
commentCount
;
this
.
topicLead
=
topicLead
;
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
bean
;
package
com
.
zhiwei
.
searchhotcrawler
.
bean
;
public
enum
HotSearchType
{
public
enum
HotSearchType
{
百度热搜
,
百度热搜
,
微博热搜
,
微博热搜
,
知乎热搜
,
知乎热搜
,
抖音热搜
,
抖音热搜
,
搜狗微信热搜
搜狗微信热搜
,
}
微博话题
}
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboSuperTopic.java
0 → 100644
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
bean
;
import
java.util.Date
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
lombok.Data
;
import
lombok.ToString
;
/**
 * One entry of a Weibo super-topic ranking list.
 *
 * <p>Lombok's {@code @Data} generates getters, setters, {@code equals},
 * {@code hashCode} and {@code toString} for the fields below.
 *
 * @author Bewilder ZW
 * @date 2019-09-27 15:33:08
 */
@Data
@ToString
public class WeiboSuperTopic {

    /** Primary key; the six-argument constructor builds it as {@code topicName_type_day}. */
    private String id;

    /** Topic link. */
    public String url;

    /** Topic name. */
    public String topicName;

    /** Topic rank. */
    public Integer rank;

    /** Topic influence score. */
    public String score;

    /** Topic follower (fan) count. */
    public String fensi;

    /** Topic read count. */
    public String readNum;

    /** Topic post count. */
    public String postNum;

    /** Ranking list type. */
    public String type;

    /** Day of capture, formatted as {@code yyyy-MM-dd}. */
    private String day;

    /** Capture time. */
    private Date time;

    /** Creates an empty instance; every field is left {@code null}. */
    public WeiboSuperTopic() {
    }

    /**
     * Creates a topic entry stamped with the current time.
     *
     * @param url       topic link
     * @param topicName topic name
     * @param rank      topic rank
     * @param score     topic influence score
     * @param fensi     topic follower count
     * @param type      ranking list type
     */
    public WeiboSuperTopic(String url, String topicName, Integer rank, String score, String fensi, String type) {
        // Take ONE timestamp: with two separate "new Date()" calls, time and day
        // (and therefore the day-based id) could land on different sides of midnight.
        Date now = new Date();

        this.url = url;
        this.topicName = topicName;
        this.rank = rank;
        this.score = score;
        this.fensi = fensi;
        this.type = type;
        this.time = now;
        this.day = TimeParse.dateFormartString(now, "yyyy-MM-dd");
        this.id = topicName + "_" + type + "_" + this.day;
    }
}
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboTopic.java
deleted
100644 → 0
View file @
a9966f9d
package
com
.
zhiwei
.
searchhotcrawler
.
bean
;
import
java.util.Date
;
import
com.zhiwei.tools.timeparse.TimeParse
;
/**
 * One entry of a Weibo topic ranking list.
 *
 * @author Bewilder ZW
 * @date 2019-09-27 15:33:08
 */
public class WeiboTopic {

    /** Primary key; the six-argument constructor builds it as {@code topicName_type_day}. */
    private String id;

    /** Topic link. */
    public String url;

    /** Topic name. */
    public String topicName;

    /** Topic rank. */
    public Integer rank;

    /** Topic influence score. */
    public String score;

    /** Topic follower (fan) count. */
    public String fensi;

    /** Topic read count. */
    public String readNum;

    /** Topic post count. */
    public String postNum;

    /** Ranking list type. */
    public String type;

    /** Day of capture, formatted as {@code yyyy-MM-dd}. */
    private String day;

    /** Capture time. */
    private Date time;

    /** Creates an empty instance; every field is left {@code null}. */
    public WeiboTopic() {
    }

    /**
     * Creates a topic entry stamped with the current time.
     *
     * @param url       topic link
     * @param topicName topic name
     * @param rank      topic rank
     * @param score     topic influence score
     * @param fensi     topic follower count
     * @param type      ranking list type
     */
    public WeiboTopic(String url, String topicName, Integer rank, String score, String fensi, String type) {
        // Take ONE timestamp: with two separate "new Date()" calls, time and day
        // (and therefore the day-based id) could land on different sides of midnight.
        Date now = new Date();

        this.url = url;
        this.topicName = topicName;
        this.rank = rank;
        this.score = score;
        this.fensi = fensi;
        this.type = type;
        this.time = now;
        this.day = TimeParse.dateFormartString(now, "yyyy-MM-dd");
        this.id = topicName + "_" + type + "_" + this.day;
    }

    /** Renders the ranking-related fields; id, day and time are not included. */
    @Override
    public String toString() {
        return "new WeiboTopic["
                + "topicName = " + topicName
                + ", rank = " + rank
                + ", score = " + score
                + ", fensi = " + fensi
                + ", type = " + type
                + ", readNum = " + readNum
                + ", postNum = " + postNum
                + ", url = " + url
                + "]";
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTopicName() {
        return topicName;
    }

    public void setTopicName(String topicName) {
        this.topicName = topicName;
    }

    public Integer getRank() {
        return rank;
    }

    public void setRank(Integer rank) {
        this.rank = rank;
    }

    public String getScore() {
        return score;
    }

    public void setScore(String score) {
        this.score = score;
    }

    public String getFensi() {
        return fensi;
    }

    public void setFensi(String fensi) {
        this.fensi = fensi;
    }

    public String getReadNum() {
        return readNum;
    }

    public void setReadNum(String readNum) {
        this.readNum = readNum;
    }

    public String getPostNum() {
        return postNum;
    }

    public void setPostNum(String postNum) {
        this.postNum = postNum;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getDay() {
        return day;
    }

    public void setDay(String day) {
        this.day = day;
    }

    public Date getTime() {
        return time;
    }

    public void setTime(Date time) {
        this.time = time;
    }
}
src/main/java/com/zhiwei/searchhotcrawler/cache/CacheListener.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
cache
;
package
com
.
zhiwei
.
searchhotcrawler
.
cache
;
import
org.slf4j.Logger
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public
class
CacheListener
{
@Log4j2
Logger
logger
=
LoggerFactory
.
getLogger
(
CacheListener
.
class
);
public
class
CacheListener
{
public
void
startListen
()
{
/**
new
Thread
(){
* 开启缓存监听
public
void
run
()
{
*/
while
(
true
)
{
public
void
startListen
()
{
if
(
CacheManager
.
caches
!=
null
&&
CacheManager
.
caches
.
size
()>
0
){
new
Thread
(){
for
(
String
key
:
CacheManager
.
getAllKeys
())
{
public
void
run
()
{
if
(
CacheManager
.
isTimeOut
(
key
))
{
while
(
true
)
{
CacheManager
.
clearByKey
(
key
);
if
(
CacheManager
.
caches
!=
null
&&
CacheManager
.
caches
.
size
()>
0
){
logger
.
info
(
key
+
"缓存被清除"
);
for
(
String
key
:
CacheManager
.
getAllKeys
())
{
}
if
(
CacheManager
.
isTimeOut
(
key
))
{
}
CacheManager
.
clearByKey
(
key
);
}
log
.
info
(
key
+
"缓存被清除"
);
ZhiWeiTools
.
sleep
(
500
);
}
}
}
}
}
}.
start
();
ZhiWeiTools
.
sleep
(
500
);
}
}
}
}
}.
start
();
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Objects
;
import
java.util.Objects
;
import
org.apache.commons.lang3.StringUtils
;
import
lombok.extern.log4j.Log4j2
;
import
org.
jsoup.Jsoup
;
import
org.
apache.commons.lang3.StringUtils
;
import
org.jsoup.
nodes.Document
;
import
org.jsoup.
Jsoup
;
import
org.jsoup.
select.Elements
;
import
org.jsoup.
nodes.Document
;
import
org.
slf4j.Logger
;
import
org.
jsoup.select.Elements
;
import
org.slf4j.Logger
Factory
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.
proxy.ProxyHolder
;
import
com.zhiwei.crawler.
core.HttpBoot
;
import
com.zhiwei.crawler.
utils.RequestUtils
;
import
com.zhiwei.crawler.
proxy.ProxyHolder
;
import
com.zhiwei.
searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.
crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearch
Type
;
import
com.zhiwei.searchhotcrawler.bean.HotSearch
List
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
/**
* @ClassName:BaiDuHotSearch
/**
* @
Description: TODO(百度风云榜热搜采集)
* @
ClassName:BaiDuHotSearch
* @
author hero
* @
Description: TODO(百度风云榜热搜采集)
* @
date 2019年7月10日 上午10:54:31
* @
author hero
*
/
*
@date 2019年7月10日 上午10:54:31
public
class
BaiDuHotSearchCrawler
{
*/
@Log4j2
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaiDuHotSearchCrawler
.
class
);
public
class
BaiDuHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: BaiDuHotSearchTest
/**
* @
author hero
* @
Title: BaiDuHotSearchTest
* @
Description: TODO(PC端百度风云榜采集)
* @
author hero
* @
param 设定文件
* @
Description: PC端百度风云榜采集
* @return void 返回类型
* @return void 返回类型
*/
*/
public
static
List
<
HotSearchList
>
baiduHotSearch
()
{
public
static
List
<
HotSearchList
>
baiduHotSearch
()
{
String
url
=
"http://top.baidu.com/buzz?b=1&fr=topindex"
;
String
url
=
"http://top.baidu.com/buzz?b=1&fr=topindex"
;
try
{
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"mainBody"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"mainBody"
))
{
return
ansysData
(
htmlBody
);
return
ansysData
(
htmlBody
);
}
else
{
}
else
{
log
ger
.
info
(
"解析百度风云榜时出现解析错误,页面结构有问题"
);
log
.
info
(
"解析百度风云榜时出现解析错误,页面结构有问题"
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
e
);
log
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
e
);
}
}
return
Collections
.
emptyList
();
return
Collections
.
emptyList
();
}
}
/**
/**
* 解析数据
* 解析数据
* @param htmlBody
* @param htmlBody
* @return
* @return
*/
*/
private
static
List
<
HotSearchList
>
ansysData
(
String
htmlBody
){
private
static
List
<
HotSearchList
>
ansysData
(
String
htmlBody
){
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
try
{
try
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Elements
elements
=
document
.
select
(
"table.list-table"
).
select
(
"tr"
);
Elements
elements
=
document
.
select
(
"table.list-table"
).
select
(
"tr"
);
if
(
Objects
.
nonNull
(
elements
)
&&
!
elements
.
isEmpty
())
{
if
(
Objects
.
nonNull
(
elements
)
&&
!
elements
.
isEmpty
())
{
elements
.
forEach
(
element
->
{
elements
.
forEach
(
element
->
{
try
{
try
{
// 获取排名rank
// 获取排名rank
String
rankStr
=
null
;
String
rankStr
=
null
;
// 根据网页标签,给rankStr做判断
// 根据网页标签,给rankStr做判断
if
(!
element
.
select
(
"td.first"
).
select
(
"span.num-top"
).
isEmpty
())
{
if
(!
element
.
select
(
"td.first"
).
select
(
"span.num-top"
).
isEmpty
())
{
rankStr
=
element
.
select
(
"td.first"
).
select
(
"span.num-top"
).
text
();
rankStr
=
element
.
select
(
"td.first"
).
select
(
"span.num-top"
).
text
();
}
else
if
(!
element
.
select
(
"td.first"
).
select
(
"span.num-normal"
).
isEmpty
())
{
}
else
if
(!
element
.
select
(
"td.first"
).
select
(
"span.num-normal"
).
isEmpty
())
{
rankStr
=
element
.
select
(
"td.first"
).
select
(
"span.num-normal"
).
text
();
rankStr
=
element
.
select
(
"td.first"
).
select
(
"span.num-normal"
).
text
();
}
}
Integer
rank
=
null
;
Integer
rank
=
null
;
// 判断rankStr是否为空
// 判断rankStr是否为空
if
(
StringUtils
.
isNoneBlank
(
rankStr
))
{
if
(
StringUtils
.
isNoneBlank
(
rankStr
))
{
rank
=
Integer
.
valueOf
(
rankStr
);
rank
=
Integer
.
valueOf
(
rankStr
);
}
}
// 获取关键词(String)
// 获取关键词(String)
String
kw
=
element
.
select
(
"td.keyword"
).
select
(
"a.list-title"
).
text
();
String
kw
=
element
.
select
(
"td.keyword"
).
select
(
"a.list-title"
).
text
();
// logger.info("关键词:{}", kw);
// logger.info("关键词:{}", kw);
// 获取关键词相关链接everurl(String)
// 获取关键词相关链接everurl(String)
String
everurl
=
element
.
select
(
"td.keyword"
).
select
(
"a.list-title"
).
attr
(
"href"
);
String
everurl
=
element
.
select
(
"td.keyword"
).
select
(
"a.list-title"
).
attr
(
"href"
);
// 获取搜索指数count(int)
// 获取搜索指数count(int)
String
hot
=
null
;
String
hot
=
null
;
// 判断热度值所在的规则是否为null
// 判断热度值所在的规则是否为null
if
(!
element
.
select
(
"td.last"
).
select
(
"span.icon-fall"
).
isEmpty
())
{
if
(!
element
.
select
(
"td.last"
).
select
(
"span.icon-fall"
).
isEmpty
())
{
hot
=
element
.
select
(
"td.last"
).
select
(
"span.icon-fall"
).
text
();
hot
=
element
.
select
(
"td.last"
).
select
(
"span.icon-fall"
).
text
();
}
else
if
(!
element
.
select
(
"td.last"
).
select
(
"span.icon-rise"
).
isEmpty
())
{
}
else
if
(!
element
.
select
(
"td.last"
).
select
(
"span.icon-rise"
).
isEmpty
())
{
hot
=
element
.
select
(
"td.last"
).
select
(
"span.icon-rise"
).
text
();
hot
=
element
.
select
(
"td.last"
).
select
(
"span.icon-rise"
).
text
();
}
}
int
count
=
0
;
int
count
=
0
;
// 判断hot是否为空
// 判断hot是否为空
if
(
StringUtils
.
isNotBlank
(
hot
))
{
if
(
StringUtils
.
isNotBlank
(
hot
))
{
count
=
Integer
.
valueOf
(
hot
);
count
=
Integer
.
valueOf
(
hot
);
}
}
if
(
Objects
.
nonNull
(
rank
))
{
if
(
Objects
.
nonNull
(
rank
))
{
HotSearchList
hotSearch
=
new
HotSearchList
(
everurl
,
kw
,
count
,
rank
,
HotSearchType
.
百度热搜
.
name
());
HotSearchList
hotSearch
=
new
HotSearchList
(
everurl
,
kw
,
count
,
rank
,
HotSearchType
.
百度热搜
.
name
());
list
.
add
(
hotSearch
);
list
.
add
(
hotSearch
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析百度风云榜时出现解析错误"
,
e
);
log
.
error
(
"解析百度风云榜时出现解析错误"
,
e
);
}
}
});
});
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析百度风云榜时出现解析错误,数据不是json结构"
,
e
);
log
.
error
(
"解析百度风云榜时出现解析错误,数据不是json结构"
,
e
);
}
}
return
list
;
return
list
;
}
}
}
}
\ No newline at end of file
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.List
;
import
org.apache.commons.lang3.StringUtils
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONArray
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
/**
* @className DouyinHotSearchCrawler
/**
* @Description:抖音热搜榜采集程序
* @className DouyinHotSearchCrawler
* @author win 10
* @Description:抖音热搜榜采集程序
* @date:2019年07月11日 上午10:26:21
* @author win 10
*/
* @date:2019年07月11日 上午10:26:21
public
class
DouyinHotSearchCrawler
{
*/
@Log4j2
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
ZhihuHotSearchCrawler
.
class
);
public
class
DouyinHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: getMobileDouyinHotList
/**
* @author hero
* @Title: getMobileDouyinHotList
* @Description: 移动端抖音热搜榜
* @author hero
* @param @return 设定文件
* @Description: 移动端抖音热搜榜
* @return List<ZhihuHotSearch> 返回类型
* @param @return 设定文件
*/
* @return List<ZhihuHotSearch> 返回类型
public
static
List
<
HotSearchList
>
getMobileDouyinHotList
(){
*/
List
<
HotSearchList
>
list
=
null
;
public
static
List
<
HotSearchList
>
getMobileDouyinHotList
(){
String
url
=
"https://api.amemv.com/aweme/v1/hot/search/list/"
;
List
<
HotSearchList
>
list
=
null
;
try
{
String
url
=
"https://api.amemv.com/aweme/v1/hot/search/list/"
;
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
try
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"word_list"
)){
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
list
=
new
ArrayList
<>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"word_list"
)){
JSONObject
data
=
JSONObject
.
parseObject
(
htmlBody
);
list
=
new
ArrayList
<>();
JSONArray
wordList
=
data
.
getJSONObject
(
"data"
).
getJSONArray
(
"word_list"
);
JSONObject
data
=
JSONObject
.
parseObject
(
htmlBody
);
String
positionStr
=
null
;
JSONArray
wordList
=
data
.
getJSONObject
(
"data"
).
getJSONArray
(
"word_list"
);
String
word
=
null
;
String
positionStr
=
null
;
String
hotValueStr
=
null
;
String
word
=
null
;
for
(
int
i
=
0
;
i
<
wordList
.
size
();
i
++)
{
String
hotValueStr
=
null
;
JSONObject
wl
=
wordList
.
getJSONObject
(
i
);
for
(
int
i
=
0
;
i
<
wordList
.
size
();
i
++)
{
//获取排名
JSONObject
wl
=
wordList
.
getJSONObject
(
i
);
positionStr
=
wl
.
getString
(
"position"
);
//获取排名
Integer
position
=
null
;
positionStr
=
wl
.
getString
(
"position"
);
position
=
Integer
.
valueOf
(
positionStr
);
Integer
position
=
null
;
//获取关键词
position
=
Integer
.
valueOf
(
positionStr
);
word
=
wl
.
getString
(
"word"
);
//获取关键词
//获取热度值
word
=
wl
.
getString
(
"word"
);
hotValueStr
=
wl
.
getString
(
"hot_value"
);
//获取热度值
Integer
hotValue
=
null
;
hotValueStr
=
wl
.
getString
(
"hot_value"
);
hotValue
=
Integer
.
valueOf
(
hotValueStr
);
Integer
hotValue
=
null
;
// logger.info("热度为:::{}", hot_value);
hotValue
=
Integer
.
valueOf
(
hotValueStr
);
HotSearchList
douyin
=
new
HotSearchList
(
null
,
word
,
hotValue
,
position
,
HotSearchType
.
抖音热搜
.
name
());
// logger.info("热度为:::{}", hot_value);
list
.
add
(
douyin
);
HotSearchList
douyin
=
new
HotSearchList
(
null
,
word
,
hotValue
,
position
,
HotSearchType
.
抖音热搜
.
name
());
}
list
.
add
(
douyin
);
}
}
}
catch
(
IOException
e
)
{
}
logger
.
debug
(
"获取抖音热搜榜时出现问题:{}"
,
e
);
}
catch
(
IOException
e
)
{
}
log
.
debug
(
"获取抖音热搜榜时出现问题:{}"
,
e
);
return
list
;
}
}
return
list
;
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.Objects
;
import
org.apache.commons.lang3.StringUtils
;
import
lombok.extern.log4j.Log4j2
;
import
org.
jsoup.Jsoup
;
import
org.
apache.commons.lang3.StringUtils
;
import
org.jsoup.
nodes.Document
;
import
org.jsoup.
Jsoup
;
import
org.jsoup.nodes.
Element
;
import
org.jsoup.nodes.
Document
;
import
org.jsoup.
select.Elements
;
import
org.jsoup.
nodes.Element
;
import
org.
slf4j.Logger
;
import
org.
jsoup.select.Elements
;
import
org.slf4j.Logger
Factory
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.
proxy.ProxyHolder
;
import
com.zhiwei.crawler.
core.HttpBoot
;
import
com.zhiwei.crawler.
utils.RequestUtils
;
import
com.zhiwei.crawler.
proxy.ProxyHolder
;
import
com.zhiwei.
searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.
crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearch
Type
;
import
com.zhiwei.searchhotcrawler.bean.HotSearch
List
;
import
com.zhiwei.
tools.httpclient.HeaderTool
;
import
com.zhiwei.
searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
/**
* @ClassName:SougoHotSearch
/**
* @
Description: TODO(搜狗微信关键词采集)
* @
ClassName:SougoHotSearch
* @
author hero
* @
Description: TODO(搜狗微信关键词采集)
* @
date 2019年7月10日 上午10:54:31
* @
author hero
*
/
*
@date 2019年7月10日 上午10:54:31
public
class
SougoHotSearchCrawler
{
*/
@Log4j2
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SougoHotSearchCrawler
.
class
);
public
class
SougoHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: SougoHotSearchTest
/**
* @
author hero
* @
Title: SougoHotSearchTest
* @
Description: TODO(PC端搜狗微信关键词采集)
* @
author hero
* @
param 设定文件
* @
Description: TODO(PC端搜狗微信关键词采集)
* @return void 返回类型
* @return void 返回类型
*/
*/
public
static
List
<
HotSearchList
>
sougoHotSearch
()
{
public
static
List
<
HotSearchList
>
sougoHotSearch
()
{
String
url
=
"https://weixin.sogou.com"
;
String
url
=
"https://weixin.sogou.com"
;
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
String
htmlBody
=
null
;
String
htmlBody
=
null
;
try
{
try
{
Map
<
String
,
String
>
headMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headMap
=
HeaderTool
.
getCommonHead
();
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"topwords"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"topwords"
))
{
try
{
try
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Elements
elements
=
document
.
select
(
"ol#topwords"
).
select
(
"li"
);
Elements
elements
=
document
.
select
(
"ol#topwords"
).
select
(
"li"
);
for
(
Element
element
:
elements
)
{
for
(
Element
element
:
elements
)
{
try
{
try
{
// 获取排名rank
// 获取排名rank
String
rankStr
=
null
;
String
rankStr
=
null
;
if
(!
element
.
select
(
"li"
).
select
(
"i"
).
isEmpty
())
{
if
(!
element
.
select
(
"li"
).
select
(
"i"
).
isEmpty
())
{
rankStr
=
element
.
select
(
"li"
).
select
(
"i"
).
text
();
rankStr
=
element
.
select
(
"li"
).
select
(
"i"
).
text
();
}
}
Integer
rank
=
null
;
Integer
rank
=
null
;
if
(
StringUtils
.
isNoneBlank
(
rankStr
))
{
if
(
StringUtils
.
isNoneBlank
(
rankStr
))
{
rank
=
Integer
.
valueOf
(
rankStr
);
rank
=
Integer
.
valueOf
(
rankStr
);
}
}
// 获取关键词(String)
// 获取关键词(String)
String
kw
=
element
.
select
(
"li"
).
select
(
"a"
).
text
();
String
kw
=
element
.
select
(
"li"
).
select
(
"a"
).
text
();
// logger.info("关键词:{}", kw);
// logger.info("关键词:{}", kw);
String
everurl
=
element
.
select
(
"li"
).
select
(
"a"
).
attr
(
"href"
);
String
everurl
=
element
.
select
(
"li"
).
select
(
"a"
).
attr
(
"href"
);
HotSearchList
hotSearch
=
new
HotSearchList
(
everurl
,
kw
,
null
,
rank
,
HotSearchType
.
搜狗微信热搜
.
name
());
HotSearchList
hotSearch
=
new
HotSearchList
(
everurl
,
kw
,
null
,
rank
,
HotSearchType
.
搜狗微信热搜
.
name
());
if
(
Objects
.
nonNull
(
rank
))
{
if
(
Objects
.
nonNull
(
rank
))
{
list
.
add
(
hotSearch
);
list
.
add
(
hotSearch
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析搜狗微信时出现解析错误"
,
e
);
log
.
error
(
"解析搜狗微信时出现解析错误"
,
e
);
}
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析搜狗微信时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
log
.
error
(
"解析搜狗微信时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
return
Collections
.
emptyList
();
return
Collections
.
emptyList
();
}
}
}
else
{
}
else
{
log
ger
.
info
(
"解析搜狗微信时出现解析错误,页面结构有问题"
);
log
.
info
(
"解析搜狗微信时出现解析错误,页面结构有问题"
);
}
}
break
;
break
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
e
);
log
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
e
);
}
}
}
}
return
list
;
return
list
;
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
cb6bcd76
...
@@ -7,6 +7,7 @@ import java.util.HashMap;
...
@@ -7,6 +7,7 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
lombok.extern.log4j.Log4j2
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
...
@@ -27,13 +28,13 @@ import com.zhiwei.tools.tools.URLCodeUtil;
...
@@ -27,13 +28,13 @@ import com.zhiwei.tools.tools.URLCodeUtil;
/**
/**
* @ClassName: WeiboHotSearch
* @ClassName: WeiboHotSearch
* @Description:
TODO(微博实时热搜采集)
* @Description:
微博实时热搜采集
* @author hero
* @author hero
* @date 2017年9月15日 上午10:54:31
* @date 2017年9月15日 上午10:54:31
*/
*/
@Log4j2
public
class
WeiboHotSearchCrawler
{
public
class
WeiboHotSearchCrawler
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WeiboHotSearchCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
/**
* @Title: weiboHotSearchTest
* @Title: weiboHotSearchTest
...
@@ -70,18 +71,18 @@ public class WeiboHotSearchCrawler {
...
@@ -70,18 +71,18 @@ public class WeiboHotSearchCrawler {
list
.
add
(
hotSearch
);
list
.
add
(
hotSearch
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
log
ger
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
log
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
continue
;
continue
;
}
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
log
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
return
null
;
return
null
;
}
}
}
else
{
}
else
{
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
log
ger
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
log
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
}
}
break
;
break
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -138,25 +139,25 @@ public class WeiboHotSearchCrawler {
...
@@ -138,25 +139,25 @@ public class WeiboHotSearchCrawler {
}
}
String
id
=
"http://s.weibo.com/weibo/"
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&Refer=top"
;
String
id
=
"http://s.weibo.com/weibo/"
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&Refer=top"
;
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微博热搜
.
name
(),
icon
);
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微博热搜
.
name
(),
icon
);
log
ger
.
info
(
"采集到的数据:::{}"
,
hotSearch
);
log
.
info
(
"采集到的数据:::{}"
,
hotSearch
);
result
.
add
(
hotSearch
);
result
.
add
(
hotSearch
);
rank
++;
rank
++;
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
log
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
continue
;
continue
;
}
}
}
}
return
result
;
return
result
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
);
log
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
);
return
Collections
.
emptyList
();
return
Collections
.
emptyList
();
}
}
}
else
{
}
else
{
log
ger
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
log
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
}
}
}
catch
(
IOException
e1
)
{
}
catch
(
IOException
e1
)
{
log
ger
.
error
(
"解析微博时时热搜时出现连接失败"
,
e1
);
log
.
error
(
"解析微博时时热搜时出现连接失败"
,
e1
);
return
Collections
.
emptyList
();
return
Collections
.
emptyList
();
}
}
return
Collections
.
emptyList
();
return
Collections
.
emptyList
();
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/Weibo
Huati
Crawler.java
→
src/main/java/com/zhiwei/searchhotcrawler/crawler/Weibo
SuperTopic
Crawler.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
java.util.Map.Entry
;
import
java.util.Objects
;
import
java.util.Objects
;
import
org.apache.commons.lang3.StringUtils
;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
org.slf4j.Logger
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.LoggerFactory
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
com.alibaba.fastjson.JSONArray
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.alibaba.fastjson.JSONArray
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.searchhotcrawler.bean.WeiboTopic
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
/**
*
/**
* @ClassName: WeiboHuatiCrawler
*
* @Description: 微博话题榜单采集(明星)
* @ClassName: WeiboSuperTopicCrawler
* @author Bewilder ZW
* @Description: 微博超话榜单采集(明星)
* @date 2019年9月27日 下午3:01:34
* @author Bewilder ZW
*/
* @date 2019年9月27日 下午3:01:34
public
class
WeiboHuatiCrawler
{
*/
@Log4j2
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WeiboHuatiCrawler
.
class
);
public
class
WeiboSuperTopicCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
Map
<
String
,
String
>
headMap
=
new
HashMap
<>();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
Map
<
String
,
String
>
headMap
=
new
HashMap
<>();
static
{
headMap
.
put
(
"X-Requested-With"
,
"XMLHttpRequest"
);
static
{
headMap
.
put
(
"Referer"
,
"https://huati.weibo.cn/discovery/super?extparam=ctg1_2%7Cscorll_1&luicode=10000011&lfid=100803_-_super&sourceType=weixin"
);
headMap
.
put
(
"X-Requested-With"
,
"XMLHttpRequest"
);
headMap
.
put
(
"Host"
,
"huati.weibo.cn"
);
headMap
.
put
(
"Referer"
,
"https://huati.weibo.cn/discovery/super?extparam=ctg1_2%7Cscorll_1&luicode=10000011&lfid=100803_-_super&sourceType=weixin"
);
headMap
.
put
(
"Host"
,
"huati.weibo.cn"
);
}
}
/**
*
/**
* 开始采集明星话题
*
* @return void
* 开始采集明星话题
*/
* @return void
public
static
List
<
WeiboTopic
>
startCrawler
()
{
*/
Map
<
String
,
String
>
urlMap
=
new
HashMap
<>();
public
static
List
<
WeiboSuperTopic
>
startCrawler
()
{
urlMap
.
put
(
"明星"
,
"https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=star&from=&wm="
);
Map
<
String
,
String
>
urlMap
=
new
HashMap
<>();
urlMap
.
put
(
"明星潜力"
,
"https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=potential&from=&wm="
);
urlMap
.
put
(
"明星"
,
"https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=star&from=&wm="
);
urlMap
.
put
(
"明星上升"
,
"https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=up&from=&wm="
);
urlMap
.
put
(
"明星潜力"
,
"https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=potential&from=&wm="
);
urlMap
.
put
(
"明星上升"
,
"https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=up&from=&wm="
);
List
<
WeiboTopic
>
topicList
=
new
ArrayList
<>();
List
<
WeiboSuperTopic
>
topicList
=
new
ArrayList
<>();
for
(
Entry
<
String
,
String
>
entry
:
urlMap
.
entrySet
())
{
String
url
=
entry
.
getValue
();
for
(
Entry
<
String
,
String
>
entry
:
urlMap
.
entrySet
())
{
String
type
=
entry
.
getKey
();
String
url
=
entry
.
getValue
();
for
(
int
page
=
1
;
page
<=
5
;
page
++)
{
String
type
=
entry
.
getKey
();
String
pageUrl
=
url
+
"&page="
+
page
;
for
(
int
page
=
1
;
page
<=
5
;
page
++)
{
//重试三次
String
pageUrl
=
url
+
"&page="
+
page
;
for
(
int
retryTimes
=
1
;
retryTimes
<=
3
;
retryTimes
++)
{
//重试三次
try
{
for
(
int
retryTimes
=
1
;
retryTimes
<=
3
;
retryTimes
++)
{
System
.
out
.
println
(
"pageUrl=========="
+
pageUrl
);
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
pageUrl
,
headMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
System
.
out
.
println
(
"pageUrl=========="
+
pageUrl
);
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"desc1"
))
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
pageUrl
,
headMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
topicList
.
addAll
(
parseTopicRankHtml
(
page
,
htmlBody
,
type
));
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"desc1"
))
{
break
;
topicList
.
addAll
(
parseTopicRankHtml
(
page
,
htmlBody
,
type
));
}
else
{
break
;
logger
.
error
(
"获取榜单列表页面时数据格式错误,页面为:{}"
,
htmlBody
);
}
else
{
}
log
.
error
(
"获取榜单列表页面时数据格式错误,页面为:{}"
,
htmlBody
);
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"获取榜单列表页面时出现错误,错误为:{}"
,
e
);
}
catch
(
Exception
e
)
{
continue
;
log
.
error
(
"获取榜单列表页面时出现错误,错误为:{}"
,
e
);
}
continue
;
}
}
}
}
}
}
return
topicList
;
}
}
return
topicList
;
}
/**
*
/**
* 解析话题榜单
*
* @param htmlBody
* 解析话题榜单
* @param type
* @param htmlBody
* @return void
* @param type
*/
* @return void
private
static
List
<
WeiboTopic
>
parseTopicRankHtml
(
int
page
,
String
htmlBody
,
String
type
)
{
*/
try
{
private
static
List
<
WeiboSuperTopic
>
parseTopicRankHtml
(
int
page
,
String
htmlBody
,
String
type
)
{
JSONArray
list
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONArray
(
"list"
);
try
{
if
(
Objects
.
nonNull
(
list
)
&&
!
list
.
isEmpty
())
{
JSONArray
list
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONArray
(
"list"
);
page
=
(
page
-
1
)*
20
;
if
(
Objects
.
nonNull
(
list
)
&&
!
list
.
isEmpty
())
{
page
=
(
page
-
1
)*
20
;
List
<
WeiboTopic
>
topicList
=
new
ArrayList
<>();
Integer
toprank
=
null
;
List
<
WeiboSuperTopic
>
topicList
=
new
ArrayList
<>();
String
topicName
=
null
;
Integer
toprank
=
null
;
String
id
=
null
;
String
topicName
=
null
;
String
score
=
null
;
String
id
=
null
;
String
desc1
=
null
;
String
score
=
null
;
String
fensi
=
null
;
String
desc1
=
null
;
String
url
=
null
;
String
fensi
=
null
;
for
(
int
i
=
0
;
i
<
list
.
size
();
i
++)
{
String
url
=
null
;
JSONObject
data
=
list
.
getJSONObject
(
i
);
for
(
int
i
=
0
;
i
<
list
.
size
();
i
++)
{
toprank
=
page
+
data
.
getInteger
(
"toprank"
);
JSONObject
data
=
list
.
getJSONObject
(
i
);
topicName
=
data
.
getString
(
"display_name"
);
toprank
=
page
+
data
.
getInteger
(
"toprank"
);
id
=
data
.
getString
(
"page_id"
);
topicName
=
data
.
getString
(
"display_name"
);
score
=
data
.
getString
(
"score"
);
id
=
data
.
getString
(
"page_id"
);
desc1
=
data
.
getString
(
"desc1"
);
score
=
data
.
getString
(
"score"
);
fensi
=
desc1
.
replaceAll
(
".*影响力|粉丝"
,
""
).
trim
();
desc1
=
data
.
getString
(
"desc1"
);
url
=
data
.
getString
(
"link"
);
fensi
=
desc1
.
replaceAll
(
".*影响力|粉丝"
,
""
).
trim
();
url
=
data
.
getString
(
"link"
);
WeiboTopic
topic
=
new
WeiboTopic
(
url
,
topicName
,
toprank
,
score
,
fensi
,
type
);
topic
=
getTopicInfo
(
id
,
topic
);
WeiboSuperTopic
topic
=
new
WeiboSuperTopic
(
url
,
topicName
,
toprank
,
score
,
fensi
,
type
);
System
.
out
.
println
(
"topic====="
+
topic
);
topic
=
getTopicInfo
(
id
,
topic
);
topicList
.
add
(
topic
);
System
.
out
.
println
(
"topic====="
+
topic
);
}
topicList
.
add
(
topic
);
return
topicList
;
}
}
return
topicList
;
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"解析榜单列表页面时出现错误,错误为:{}"
,
e
);
}
catch
(
Exception
e
)
{
}
log
.
error
(
"解析榜单列表页面时出现错误,错误为:{}"
,
e
);
return
Collections
.
emptyList
();
}
}
return
Collections
.
emptyList
();
}
/**
*
/**
* 根据单一话题id获取话题阅读数及发帖数
*
* @param id
* 根据单一话题id获取话题阅读数及发帖数
* @param topic
* @param id
* @return
* @param topic
* @return WeiboTopic
* @return
*/
* @return WeiboTopic
private
static
WeiboTopic
getTopicInfo
(
String
id
,
WeiboTopic
topic
)
{
*/
for
(
int
retryTimes
=
1
;
retryTimes
<=
3
;
retryTimes
++)
{
private
static
WeiboSuperTopic
getTopicInfo
(
String
id
,
WeiboSuperTopic
topic
)
{
try
{
for
(
int
retryTimes
=
1
;
retryTimes
<=
3
;
retryTimes
++)
{
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid="
+
id
;
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid="
+
id
;
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"desc_more"
))
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
String
descMore
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONObject
(
"pageInfo"
).
getJSONArray
(
"desc_more"
).
getString
(
0
);
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"desc_more"
))
{
if
(
StringUtils
.
isNotBlank
(
descMore
))
{
String
descMore
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONObject
(
"pageInfo"
).
getJSONArray
(
"desc_more"
).
getString
(
0
);
String
readNum
=
descMore
.
replaceAll
(
"阅读|帖子.*"
,
""
).
trim
();
if
(
StringUtils
.
isNotBlank
(
descMore
))
{
String
postNum
=
descMore
.
replaceAll
(
".*帖子|粉丝.*"
,
""
).
trim
();
String
readNum
=
descMore
.
replaceAll
(
"阅读|帖子.*"
,
""
).
trim
();
String
postNum
=
descMore
.
replaceAll
(
".*帖子|粉丝.*"
,
""
).
trim
();
topic
.
setPostNum
(
postNum
);
topic
.
setReadNum
(
readNum
);
topic
.
setPostNum
(
postNum
);
return
topic
;
topic
.
setReadNum
(
readNum
);
}
return
topic
;
}
}
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"解析榜单详情页面时出现错误,错误为:{}"
,
e
);
}
catch
(
Exception
e
)
{
}
log
.
error
(
"解析榜单详情页面时出现错误,错误为:{}"
,
e
);
}
}
return
topic
;
}
}
return
topic
;
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
0 → 100644
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
lombok.extern.log4j.Log4j2
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
java.util.*
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
/**
*
* @ClassName: WeiboTopicCrawler
* @Description: 微博话题榜单采集
* @author Bewilder ZW
*/
@Log4j2
public
class
WeiboTopicCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
Map
<
String
,
String
>
headMap
=
new
HashMap
<>();
static
{
headMap
.
put
(
"Host"
,
"simg.s.weibo.com"
);
headMap
.
put
(
"User-Agent"
,
"Weibo/40651 CFNetwork/978.0.7 Darwin/18.6.0"
);
}
// /**
// *
// * 开始采集明星话题
// * @return void
// */
// public static List<HotSearchList> startCrawler() {
// List<HotSearchList> topicList = new ArrayList<>();
// for(int page=1; page<=7; page++){
// String pageUrl = "https://d.weibo.com/231650_ctg1_-_all?pids=Pl_Discover_Pt6Rank__4&cfs=920&Pl_Discover_Pt6Rank__4_filter=&Pl_Discover_Pt6Rank__4_page=" + page;
// //重试三次
// for(int retryTimes = 1; retryTimes<=3; retryTimes++) {
// try {
// String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
// if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("pl.content.miniTab.index")) {
// log.info("pageUrl::{}", pageUrl);
// topicList.addAll(parseTopicRankHtml(htmlBody));
// break;
// }else {
// log.error("获取榜单列表页面时数据格式错误,页面为:{}", htmlBody);
// }
// } catch (Exception e) {
// log.error("获取榜单列表页面时出现错误,错误为:{}", e);
// continue;
// }
// }
// }
// return topicList;
// }
//
// /**
// *
// * 解析话题榜单
// * @param htmlBody
// * @return void
// */
// private static List<HotSearchList> parseTopicRankHtml(String htmlBody) {
// try {
// String script = "{\"ns\":\"pl.content.miniTab.index\""+ htmlBody.split("FM.view\\(\\{\"ns\":\"pl.content.miniTab.index\"")[1].split("\\)<\\/script>")[0];
// JSONObject json = JSONObject.parseObject(script);
// String html = json.getString("html");
//
// Elements elements = Jsoup.parse(html).select("div.text_box");
// if(Objects.nonNull(elements) && !elements.isEmpty()) {
// List<HotSearchList> topicList = new ArrayList<>();
// String rankString;
// Integer rank = null;
// String topicName = null;
// String url = null;
// String topicType = null;
// String description = null;
// Integer readNum = null;
// String author = null;
//
// for(Element element : elements) {
// rankString = element.select("div[class=\"title W_autocut\"]").text();
// Matcher matcher = Pattern.compile("\\d+").matcher(rankString);
// while (matcher.find()){
// rank = Integer.valueOf(matcher.group());
// }
// topicName = element.select("div[class=\"title W_autocut\"]").select("a.S_txt1").text();
// url = element.select("div[class=\"title W_autocut\"]").select("a.S_txt1").attr("href");
// topicType = element.select("a[class=\"W_btn_b W_btn_tag\"]").text();
// description = element.select("div.subtitle").text();
// String readNumString = element.select("span.number").text();
// if(readNumString.contains("万")){
// readNumString = readNumString.split("万")[0];
// readNum = Integer.valueOf(readNumString.split("万")[0])*10000;;
// }
// if(readNumString.contains("亿")){
// readNum = Integer.valueOf(readNumString.split("亿")[0])*100000000;
// }
// author = element.select("a[class=\"tlink S_txt1\"]").text();
// HotSearchList topic = new HotSearchList(url, topicName, readNum, rank, HotSearchType.微博话题.name(), author, topicType, description);
// log.info("topic::::" + topic);
// topicList.add(topic);
// }
// return topicList;
// }else{
// log.info("html:{}",html);
// }
// } catch (Exception e) {
// log.error("解析榜单列表页面时出现错误,错误为:{}", e);
// }
// return Collections.emptyList();
// }
/**
* 微博平话题榜采集
*/
public
static
List
<
HotSearchList
>
startCrawlerByPhone
(){
List
<
HotSearchList
>
topicList
=
new
ArrayList
<>();
for
(
int
page
=
1
;
page
<=
7
;
page
++){
String
pageUrl
=
"https://api.weibo.cn/2/page?gsid=_2A25zJX_EDeRxGedH71YS8CzKzzmIHXVuc_QMrDV6PUJbkdANLXPbkWpNUK3OyitGCJsX8exvua-vfubUqCiaA4lb&from=10A1193010&c=iphone&s=2827eebe&count=20&containerid=106003type%253D25%2526t%253D3%2526disable_hot%253D1%2526filter_type%253Dtopicscene&page="
+
page
;
//重试三次
for
(
int
retryTimes
=
1
;
retryTimes
<=
5
;
retryTimes
++)
{
try
{
log
.
info
(
"pageUrl::{}"
,
pageUrl
);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
pageUrl
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"top_mark_text"
))
{
topicList
.
addAll
(
parseTopicHtml
(
htmlBody
));
break
;
}
else
{
log
.
info
(
"下载榜单列表页面时数据格式错误,页面为:{}"
,
htmlBody
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"下载榜单列表页面时出现错误,错误为:{}"
,
e
);
continue
;
}
}
}
return
topicList
;
}
private
static
List
<
HotSearchList
>
parseTopicHtml
(
String
htmlBody
)
{
try
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
if
(
Objects
.
nonNull
(
cards
)
&&
!
cards
.
isEmpty
())
{
List
<
HotSearchList
>
topicList
=
new
ArrayList
<>();
Integer
rank
=
null
;
String
topicName
=
null
;
String
url
=
null
;
String
description
=
null
;
Integer
commentNum
=
null
;
Integer
readNum
=
null
;
String
desc2
=
null
;
for
(
int
i
=
0
;
i
<
cards
.
size
();
i
++)
{
JSONObject
cardGroup
=
cards
.
getJSONObject
(
i
).
getJSONArray
(
"card_group"
).
getJSONObject
(
0
);
rank
=
cardGroup
.
getInteger
(
"top_mark_text"
);
topicName
=
cardGroup
.
getString
(
"title_sub"
);
url
=
"https://s.weibo.com/weibo?q="
+
URLCodeUtil
.
getURLEncode
(
topicName
,
"utf-8"
);
description
=
cardGroup
.
getString
(
"desc1"
);
desc2
=
cardGroup
.
getString
(
"desc2"
);
String
commentNumStr
=
desc2
.
replaceAll
(
"讨论.*"
,
""
).
trim
();
String
readNumStr
=
desc2
.
replaceAll
(
".*讨论|阅读"
,
""
).
trim
();
try
{
if
(
commentNumStr
.
contains
(
"万"
)){
commentNumStr
=
commentNumStr
.
replaceAll
(
"万"
,
""
);
commentNum
=
(
int
)(
Double
.
parseDouble
(
commentNumStr
)*
10000
);
}
else
if
(
commentNumStr
.
contains
(
"亿"
)){
commentNumStr
=
commentNumStr
.
replaceAll
(
"亿"
,
""
);
commentNum
=
(
int
)(
Double
.
parseDouble
(
commentNumStr
)*
10000000
);
}
else
{
commentNum
=
Integer
.
getInteger
(
commentNumStr
);
}
if
(
readNumStr
.
contains
(
"万"
)){
readNumStr
=
readNumStr
.
replaceAll
(
"万"
,
""
);
readNum
=
(
int
)(
Double
.
parseDouble
(
readNumStr
)*
10000
);
}
else
if
(
readNumStr
.
contains
(
"亿"
)){
readNumStr
=
readNumStr
.
replaceAll
(
"亿"
,
""
);
readNum
=
(
int
)(
Double
.
parseDouble
(
readNumStr
)*
10000000
);
}
else
{
readNum
=
Integer
.
getInteger
(
readNumStr
);
}
}
catch
(
Exception
e
){
e
.
printStackTrace
();
}
HotSearchList
topic
=
new
HotSearchList
(
url
,
topicName
,
readNum
,
rank
,
HotSearchType
.
微博热搜
.
name
(),
commentNum
,
description
);
log
.
info
(
"topic::::"
+
topic
);
topicList
.
add
(
topic
);
}
return
topicList
;
}
else
{
log
.
info
(
"html:{}"
,
htmlBody
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析榜单列表页面时出现错误,错误为:{}"
,
e
);
}
return
Collections
.
emptyList
();
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
Factory
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSON
Object
;
import
com.alibaba.fastjson.JSON
Array
;
import
com.
zhiwei.crawler.core.HttpBoot
;
import
com.
alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.
proxy.ProxyHolder
;
import
com.zhiwei.crawler.
core.HttpBoot
;
import
com.zhiwei.crawler.
utils.RequestUtils
;
import
com.zhiwei.crawler.
proxy.ProxyHolder
;
import
com.zhiwei.
searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.
crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearch
Type
;
import
com.zhiwei.searchhotcrawler.bean.HotSearch
List
;
import
com.zhiwei.
tools.httpclient.HeaderTool
;
import
com.zhiwei.
searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.
tools.URLCodeUtil
;
import
com.zhiwei.tools.
httpclient.HeaderTool
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
/**
* @ClassName: ZhihuHotCrawler
/**
* @
Description: TODO(知乎热搜采集程序)
* @
ClassName: ZhihuHotCrawler
* @
author hero
* @
Description: TODO(知乎热搜采集程序)
* @
date 2017年9月15日 上午10:54:31
* @
author hero
*
/
*
@date 2017年9月15日 上午10:54:31
public
class
ZhihuHotSearchCrawler
{
*/
@Log4j2
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
ZhihuHotSearchCrawler
.
class
);
public
class
ZhihuHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
* @Title: getZhihuHotList
/**
* @
author hero
* @
Title: getZhihuHotList
* @
Description: 知乎热搜采集程序
* @
author hero
* @
param 设定文件
* @
Description: 知乎热搜采集程序
* @return void 返回类型
* @return void 返回类型
*/
*/
public
static
List
<
HotSearchList
>
getZhihuHotList
(){
public
static
List
<
HotSearchList
>
getZhihuHotList
(){
List
<
HotSearchList
>
list
=
null
;
List
<
HotSearchList
>
list
=
null
;
String
url
=
"https://www.zhihu.com/api/v4/search/top_search"
;
String
url
=
"https://www.zhihu.com/api/v4/search/top_search"
;
String
rerferer
=
"https://www.zhihu.com/search?type=content&q=%E5%BF%AB%E6%89%8B"
;
String
rerferer
=
"https://www.zhihu.com/search?type=content&q=%E5%BF%AB%E6%89%8B"
;
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
);
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
);
headerMap
.
put
(
"Host"
,
"www.zhihu.com"
);
headerMap
.
put
(
"Host"
,
"www.zhihu.com"
);
headerMap
.
put
(
"X-UDID"
,
"AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8="
);
headerMap
.
put
(
"X-UDID"
,
"AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8="
);
headerMap
.
put
(
"accept"
,
"application/json, text/plain, */*"
);
headerMap
.
put
(
"accept"
,
"application/json, text/plain, */*"
);
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
headerMap
.
put
(
"Referer"
,
rerferer
);
headerMap
.
put
(
"Referer"
,
rerferer
);
try
{
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"words"
)){
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"words"
)){
list
=
new
ArrayList
<>();
list
=
new
ArrayList
<>();
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
words
=
topSearch
.
getJSONObject
(
"top_search"
).
getJSONArray
(
"words"
);
JSONArray
words
=
topSearch
.
getJSONObject
(
"top_search"
).
getJSONArray
(
"words"
);
String
link
=
null
;
String
link
=
null
;
String
displayQuery
=
null
;
String
displayQuery
=
null
;
String
query
=
null
;
String
query
=
null
;
for
(
int
i
=
0
;
i
<
words
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
words
.
size
();
i
++)
{
JSONObject
word
=
words
.
getJSONObject
(
i
);
JSONObject
word
=
words
.
getJSONObject
(
i
);
query
=
word
.
getString
(
"query"
);
query
=
word
.
getString
(
"query"
);
displayQuery
=
word
.
getString
(
"display_query"
);
displayQuery
=
word
.
getString
(
"display_query"
);
link
=
"https://www.zhihu.com/search?q="
+
URLCodeUtil
.
getURLEncode
(
query
,
"utf-8"
)+
"&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content"
;
link
=
"https://www.zhihu.com/search?q="
+
URLCodeUtil
.
getURLEncode
(
query
,
"utf-8"
)+
"&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content"
;
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
null
,
i
,
HotSearchType
.
知乎热搜
.
name
());
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
null
,
i
,
HotSearchType
.
知乎热搜
.
name
());
list
.
add
(
zhihu
);
list
.
add
(
zhihu
);
}
}
}
}
}
catch
(
IOException
e
)
{
}
catch
(
IOException
e
)
{
log
ger
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
return
list
;
return
list
;
}
}
return
list
;
return
list
;
}
}
/**
/**
* @Title: getMobileZhihuHotList
* @Title: getMobileZhihuHotList
* @author hero
* @author hero
* @Description: 移動端知乎熱搜榜
* @Description: 移動端知乎熱搜榜
* @param @return 设定文件
* @param @return 设定文件
* @return List<ZhihuHotSearch> 返回类型
* @return List<ZhihuHotSearch> 返回类型
*/
*/
public
static
List
<
HotSearchList
>
getMobileZhihuHotList
(){
public
static
List
<
HotSearchList
>
getMobileZhihuHotList
(){
List
<
HotSearchList
>
list
=
new
ArrayList
<>();;
List
<
HotSearchList
>
list
=
new
ArrayList
<>();;
String
url
=
"https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0"
;
String
url
=
"https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0"
;
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"api.zhihu.com"
);
headerMap
.
put
(
"Host"
,
"api.zhihu.com"
);
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
);
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
);
headerMap
.
put
(
"X-UDID"
,
"AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8="
);
headerMap
.
put
(
"X-UDID"
,
"AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8="
);
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
try
{
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"author"
)){
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"author"
)){
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
words
=
topSearch
.
getJSONArray
(
"data"
);
JSONArray
words
=
topSearch
.
getJSONArray
(
"data"
);
String
link
=
null
;
String
link
=
null
;
String
displayQuery
=
null
;
String
displayQuery
=
null
;
for
(
int
i
=
0
;
i
<
words
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
words
.
size
();
i
++)
{
JSONObject
word
=
words
.
getJSONObject
(
i
).
getJSONObject
(
"target"
);
JSONObject
word
=
words
.
getJSONObject
(
i
).
getJSONObject
(
"target"
);
displayQuery
=
word
.
getString
(
"title"
);
displayQuery
=
word
.
getString
(
"title"
);
link
=
"https://www.zhihu.com/question/"
+
word
.
getLongValue
(
"id"
);
link
=
"https://www.zhihu.com/question/"
+
word
.
getLongValue
(
"id"
);
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
null
,
i
,
HotSearchType
.
知乎热搜
.
name
());
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
null
,
i
,
HotSearchType
.
知乎热搜
.
name
());
list
.
add
(
zhihu
);
list
.
add
(
zhihu
);
}
}
}
}
}
catch
(
IOException
e
)
{
}
catch
(
IOException
e
)
{
log
ger
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
return
list
;
return
list
;
}
}
return
list
;
return
list
;
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchListDAO.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Objects
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
com.mongodb.BasicDBObject
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.DBCursor
;
import
com.mongodb.DBObject
;
import
com.mongodb.BasicDBObject
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.mongodb.DBCursor
;
import
com.zhiwei.searchhotcrawler.cache.CacheManager
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.searchhotcrawler.cache.CacheManager
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
public
class
HotSearchListDAO
extends
MongoDBTemplate
{
import
com.zhiwei.tools.timeparse.TimeParse
;
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
HotSearchListDAO
.
class
);
@Log4j2
public
HotSearchListDAO
()
{
public
class
HotSearchListDAO
extends
MongoDBTemplate
{
super
();
super
.
setDbName
(
Config
.
dbName
);
public
HotSearchListDAO
()
{
String
time
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
super
();
String
year
=
time
.
substring
(
0
,
4
);
super
.
setDbName
(
Config
.
dbName
);
String
month
=
time
.
substring
(
5
,
7
);
String
time
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
String
collName
=
Config
.
searchCollName
+
year
+
"_"
+
month
;
String
year
=
time
.
substring
(
0
,
4
);
super
.
setCollName
(
collName
);
String
month
=
time
.
substring
(
5
,
7
);
DBObject
countIndexDoc
=
new
BasicDBObject
();
String
collName
=
Config
.
searchCollName
+
year
+
"_"
+
month
;
countIndexDoc
.
put
(
"count"
,
-
1
);
super
.
setCollName
(
collName
);
DBObject
timeIndexDoc
=
new
BasicDBObject
();
timeIndexDoc
.
put
(
"time"
,
-
1
);
//给数据表创建索引
DBObject
rankIndexDoc
=
new
BasicDBObject
();
createIndex
();
rankIndexDoc
.
put
(
"rank"
,
-
1
);
DBObject
nameIndexDoc
=
new
BasicDBObject
();
}
nameIndexDoc
.
put
(
"name"
,
-
1
);
DBObject
typeIndexDoc
=
new
BasicDBObject
();
typeIndexDoc
.
put
(
"type"
,
-
1
);
/**
try
{
* 初次创建表及创建相应的索引
super
.
getReadColl
().
createIndex
(
countIndexDoc
,
new
BasicDBObject
(
"name"
,
"count_desc"
));
*/
super
.
getReadColl
().
createIndex
(
timeIndexDoc
,
new
BasicDBObject
(
"name"
,
"time_desc"
));
private
void
createIndex
(){
super
.
getReadColl
().
createIndex
(
rankIndexDoc
,
new
BasicDBObject
(
"name"
,
"rank_desc"
));
List
<
DBObject
>
indexList
=
this
.
getReadColl
().
getIndexInfo
();
super
.
getReadColl
().
createIndex
(
nameIndexDoc
,
new
BasicDBObject
(
"name"
,
"name_desc"
));
if
(
Objects
.
isNull
(
indexList
)
&&
indexList
.
isEmpty
()){
super
.
getReadColl
().
createIndex
(
typeIndexDoc
,
new
BasicDBObject
(
"name"
,
"type_desc"
));
DBObject
countIndexDoc
=
new
BasicDBObject
();
}
catch
(
Exception
e
)
{
countIndexDoc
.
put
(
"count"
,
-
1
);
e
.
printStackTrace
();
DBObject
timeIndexDoc
=
new
BasicDBObject
();
}
timeIndexDoc
.
put
(
"time"
,
-
1
);
}
DBObject
rankIndexDoc
=
new
BasicDBObject
();
rankIndexDoc
.
put
(
"rank"
,
-
1
);
/**
DBObject
nameIndexDoc
=
new
BasicDBObject
();
* 添加数据入库
nameIndexDoc
.
put
(
"name"
,
-
1
);
* @param list
DBObject
typeIndexDoc
=
new
BasicDBObject
();
*/
typeIndexDoc
.
put
(
"type"
,
-
1
);
public
void
addHotSearchList
(
List
<
DBObject
>
list
){
try
{
try
{
super
.
getReadColl
().
createIndex
(
countIndexDoc
,
new
BasicDBObject
(
"name"
,
"count_desc"
));
this
.
getReadColl
().
insert
(
list
);
super
.
getReadColl
().
createIndex
(
timeIndexDoc
,
new
BasicDBObject
(
"name"
,
"time_desc"
));
}
catch
(
Exception
e
)
{
super
.
getReadColl
().
createIndex
(
rankIndexDoc
,
new
BasicDBObject
(
"name"
,
"rank_desc"
));
logger
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
super
.
getReadColl
().
createIndex
(
nameIndexDoc
,
new
BasicDBObject
(
"name"
,
"name_desc"
));
}
super
.
getReadColl
().
createIndex
(
typeIndexDoc
,
new
BasicDBObject
(
"name"
,
"type_desc"
));
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
public
void
addHotSearch
(
DBObject
doc
){
}
try
{
}
this
.
getReadColl
().
insert
(
doc
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
/**
}
* 添加数据入库
}
* @param list
*/
/**
public
void
addHotSearchList
(
List
<
DBObject
>
list
){
* 查询据上次变化量
try
{
* @Title: getChangeCount
this
.
getReadColl
().
insert
(
list
);
* @author hero
}
catch
(
Exception
e
)
{
* @param @param weiboHotSearch
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
* @param @return 设定文件
}
* @return int 返回类型
}
*/
public
int
getChangeCount
(
HotSearchList
weiboHotSearch
){
public
void
addHotSearch
(
DBObject
doc
){
int
result
=
0
;
try
{
DBObject
query
=
new
BasicDBObject
();
this
.
getReadColl
().
insert
(
doc
);
query
.
put
(
"name"
,
weiboHotSearch
.
getName
());
}
catch
(
Exception
e
)
{
DBObject
sort
=
new
BasicDBObject
();
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
sort
.
put
(
"time"
,
-
1
);
}
try
{
}
DBCursor
cur
=
this
.
getReadColl
().
find
(
query
).
sort
(
sort
).
limit
(
1
);
while
(
cur
.
hasNext
()){
/**
DBObject
doc
=
cur
.
next
();
* 查询据上次变化量
if
(
doc
.
get
(
"count"
)!=
null
)
{
* @Title: getChangeCount
result
=
weiboHotSearch
.
getCount
()
-
Integer
.
valueOf
(
doc
.
get
(
"count"
).
toString
());
* @author hero
break
;
* @param @param weiboHotSearch
}
* @param @return 设定文件
}
* @return int 返回类型
cur
.
close
();
*/
}
catch
(
Exception
e
)
{
public
int
getChangeCount
(
HotSearchList
weiboHotSearch
){
logger
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
int
result
=
0
;
return
result
;
DBObject
query
=
new
BasicDBObject
();
}
query
.
put
(
"name"
,
weiboHotSearch
.
getName
());
return
result
;
DBObject
sort
=
new
BasicDBObject
();
}
sort
.
put
(
"time"
,
-
1
);
try
{
DBCursor
cur
=
this
.
getReadColl
().
find
(
query
).
sort
(
sort
).
limit
(
1
);
/**
while
(
cur
.
hasNext
()){
* @Title: getWeiboHotOneHour
DBObject
doc
=
cur
.
next
();
* @author hero
if
(
doc
.
get
(
"count"
)!=
null
)
{
* @Description: 查询最近1小时内新增的微博热搜
result
=
weiboHotSearch
.
getCount
()
-
Integer
.
valueOf
(
doc
.
get
(
"count"
).
toString
());
* @param @return 设定文件
break
;
* @return List<DBObject> 返回类型
}
*/
}
public
List
<
DBObject
>
getHotOneHour
(
String
type
){
cur
.
close
();
List
<
DBObject
>
list
=
new
ArrayList
<>();
}
catch
(
Exception
e
)
{
Date
date
=
new
Date
((
new
Date
().
getTime
()-
60
*
60
*
1000
));
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
DBObject
query
=
new
BasicDBObject
();
return
result
;
query
.
put
(
"time"
,
new
BasicDBObject
(
"$gte"
,
date
));
}
query
.
put
(
"changeCount"
,
0
);
return
result
;
query
.
put
(
"type"
,
type
);
}
try
{
DBCursor
cur
=
this
.
getReadColl
().
find
(
query
);
/**
while
(
cur
.
hasNext
()){
* @Title: getWeiboHotOneHour
DBObject
doc
=
cur
.
next
();
* @author hero
String
name
=
doc
.
get
(
"name"
).
toString
();
* @Description: 查询最近1小时内新增的微博热搜
if
(
CacheManager
.
getCacheByKey
(
name
)==
null
){
* @param @return 设定文件
CacheManager
.
putCache
(
name
,
doc
,
48
*
60
*
60
*
1000
);
* @return List<DBObject> 返回类型
list
.
add
(
doc
);
*/
}
public
List
<
DBObject
>
getHotOneHour
(
String
type
){
}
List
<
DBObject
>
list
=
new
ArrayList
<>();
cur
.
close
();
Date
date
=
new
Date
((
new
Date
().
getTime
()-
60
*
60
*
1000
));
}
catch
(
Exception
e
)
{
DBObject
query
=
new
BasicDBObject
();
logger
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
query
.
put
(
"time"
,
new
BasicDBObject
(
"$gte"
,
date
));
}
query
.
put
(
"changeCount"
,
0
);
return
list
;
query
.
put
(
"type"
,
type
);
}
try
{
DBCursor
cur
=
this
.
getReadColl
().
find
(
query
);
}
while
(
cur
.
hasNext
()){
DBObject
doc
=
cur
.
next
();
String
name
=
doc
.
get
(
"name"
).
toString
();
if
(
CacheManager
.
getCacheByKey
(
name
)==
null
){
CacheManager
.
putCache
(
name
,
doc
,
48
*
60
*
60
*
1000
);
list
.
add
(
doc
);
}
}
cur
.
close
();
}
catch
(
Exception
e
)
{
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
return
list
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/dao/WechatUserDao.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
java.util.Collections
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.List
;
import
org.slf4j.Logger
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.mongodb.BasicDBObject
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
public
class
WechatUserDao
extends
MongoDBTemplate
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaiDuHotSearchCrawler
.
class
);
@Log4j2
public
class
WechatUserDao
extends
MongoDBTemplate
{
public
WechatUserDao
()
{
super
();
public
WechatUserDao
()
{
super
.
setDbName
(
Config
.
dbName
);
super
();
super
.
setCollName
(
Config
.
collWechatUserName
);
super
.
setDbName
(
Config
.
dbName
);
}
super
.
setCollName
(
Config
.
collWechatUserName
);
}
/**
* 添加分组用户
/**
* @param userlist
* 添加分组用户
* @param groupName
* @param userlist
* @param groupId
* @param groupName
*/
* @param groupId
public
void
addWechatUser
(
List
<
String
>
userlist
,
String
groupName
,
Integer
groupId
){
*/
for
(
int
i
=
0
;
i
<
3
;
i
++){
public
void
addWechatUser
(
List
<
String
>
userlist
,
String
groupName
,
Integer
groupId
){
try
{
for
(
int
i
=
0
;
i
<
3
;
i
++){
DBObject
doc
=
new
BasicDBObject
();
try
{
doc
.
put
(
"_id"
,
groupId
+
"-"
+
groupName
);
DBObject
doc
=
new
BasicDBObject
();
doc
.
put
(
"groupId"
,
groupId
);
doc
.
put
(
"_id"
,
groupId
+
"-"
+
groupName
);
doc
.
put
(
"groupName"
,
groupName
);
doc
.
put
(
"groupId"
,
groupId
);
doc
.
put
(
"user"
,
userlist
);
doc
.
put
(
"groupName"
,
groupName
);
this
.
getReadColl
().
save
(
doc
);
doc
.
put
(
"user"
,
userlist
);
break
;
this
.
getReadColl
().
save
(
doc
);
}
catch
(
Exception
e
)
{
break
;
logger
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
catch
(
Exception
e
)
{
}
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
}
}
}
}
/**
* 根据分组名称查询分组用户
/**
* @param group
* 根据分组名称查询分组用户
* @return
* @param group
*/
* @return
@SuppressWarnings
(
"unchecked"
)
*/
public
List
<
String
>
getWechatUserByGroup
(
String
group
){
@SuppressWarnings
(
"unchecked"
)
try
{
public
List
<
String
>
getWechatUserByGroup
(
String
group
){
DBObject
query
=
new
BasicDBObject
();
try
{
query
.
put
(
"groupName"
,
group
);
DBObject
query
=
new
BasicDBObject
();
DBObject
doc
=
this
.
getReadColl
().
findOne
(
query
);
query
.
put
(
"groupName"
,
group
);
if
(
doc
!=
null
){
DBObject
doc
=
this
.
getReadColl
().
findOne
(
query
);
return
(
List
<
String
>)
doc
.
get
(
"user"
);
if
(
doc
!=
null
){
}
return
(
List
<
String
>)
doc
.
get
(
"user"
);
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
catch
(
Exception
e
)
{
}
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
return
Collections
.
emptyList
();
}
}
return
Collections
.
emptyList
();
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiboTopicDAO.java
→
src/main/java/com/zhiwei/searchhotcrawler/dao/Weibo
Super
TopicDAO.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Objects
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
com.mongodb.BasicDBObject
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.mongodb.BasicDBObject
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.mongodb.DBObject
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
public
class
WeiboTopicDAO
extends
MongoDBTemplate
{
import
com.zhiwei.tools.timeparse.TimeParse
;
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WeiboTopicDAO
.
class
);
@Log4j2
public
WeiboTopicDAO
()
{
public
class
WeiboSuperTopicDAO
extends
MongoDBTemplate
{
super
();
super
.
setDbName
(
Config
.
dbName
);
public
WeiboSuperTopicDAO
()
{
String
time
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
super
();
String
year
=
time
.
substring
(
0
,
4
);
super
.
setDbName
(
Config
.
dbName
);
String
month
=
time
.
substring
(
5
,
7
);
String
time
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
String
collName
=
Config
.
topicCollName
+
year
+
"_"
+
month
;
String
year
=
time
.
substring
(
0
,
4
);
super
.
setCollName
(
collName
);
String
month
=
time
.
substring
(
5
,
7
);
String
collName
=
Config
.
topicCollName
+
year
+
"_"
+
month
;
DBObject
countIndexDoc
=
new
BasicDBObject
();
super
.
setCollName
(
collName
);
countIndexDoc
.
put
(
"score_num"
,
-
1
);
DBObject
timeIndexDoc
=
new
BasicDBObject
();
createIndex
();
timeIndexDoc
.
put
(
"time"
,
-
1
);
}
DBObject
rankIndexDoc
=
new
BasicDBObject
();
rankIndexDoc
.
put
(
"rank"
,
-
1
);
DBObject
nameIndexDoc
=
new
BasicDBObject
();
/**
nameIndexDoc
.
put
(
"name"
,
-
1
);
* 初次创建表及创建相应的索引
DBObject
typeIndexDoc
=
new
BasicDBObject
();
*/
typeIndexDoc
.
put
(
"type"
,
-
1
);
private
void
createIndex
(){
try
{
List
<
DBObject
>
indexList
=
this
.
getReadColl
().
getIndexInfo
();
super
.
getReadColl
().
createIndex
(
countIndexDoc
,
new
BasicDBObject
(
"name"
,
"score_desc"
));
if
(
Objects
.
isNull
(
indexList
)
&&
indexList
.
isEmpty
()){
super
.
getReadColl
().
createIndex
(
timeIndexDoc
,
new
BasicDBObject
(
"name"
,
"time_desc"
));
DBObject
countIndexDoc
=
new
BasicDBObject
();
super
.
getReadColl
().
createIndex
(
rankIndexDoc
,
new
BasicDBObject
(
"name"
,
"rank_desc"
));
countIndexDoc
.
put
(
"score_num"
,
-
1
);
super
.
getReadColl
().
createIndex
(
nameIndexDoc
,
new
BasicDBObject
(
"name"
,
"name_desc"
));
DBObject
timeIndexDoc
=
new
BasicDBObject
();
super
.
getReadColl
().
createIndex
(
typeIndexDoc
,
new
BasicDBObject
(
"name"
,
"type_desc"
));
timeIndexDoc
.
put
(
"time"
,
-
1
);
}
catch
(
Exception
e
)
{
DBObject
rankIndexDoc
=
new
BasicDBObject
();
e
.
printStackTrace
();
rankIndexDoc
.
put
(
"rank"
,
-
1
);
}
DBObject
nameIndexDoc
=
new
BasicDBObject
();
}
nameIndexDoc
.
put
(
"name"
,
-
1
);
DBObject
typeIndexDoc
=
new
BasicDBObject
();
/**
typeIndexDoc
.
put
(
"type"
,
-
1
);
* 添加数据入库
try
{
* @param list
super
.
getReadColl
().
createIndex
(
countIndexDoc
,
new
BasicDBObject
(
"name"
,
"score_desc"
));
*/
super
.
getReadColl
().
createIndex
(
timeIndexDoc
,
new
BasicDBObject
(
"name"
,
"time_desc"
));
public
void
addTopicList
(
List
<
DBObject
>
list
){
super
.
getReadColl
().
createIndex
(
rankIndexDoc
,
new
BasicDBObject
(
"name"
,
"rank_desc"
));
try
{
super
.
getReadColl
().
createIndex
(
nameIndexDoc
,
new
BasicDBObject
(
"name"
,
"name_desc"
));
this
.
getReadColl
().
insert
(
list
);
super
.
getReadColl
().
createIndex
(
typeIndexDoc
,
new
BasicDBObject
(
"name"
,
"type_desc"
));
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
e
.
printStackTrace
();
}
}
}
}
}
public
void
addTopic
(
DBObject
doc
){
try
{
this
.
getReadColl
().
insert
(
doc
);
/**
}
catch
(
Exception
e
)
{
* 添加数据入库
logger
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
* @param list
}
*/
}
public
void
addTopicList
(
List
<
DBObject
>
list
){
try
{
this
.
getReadColl
().
insert
(
list
);
}
catch
(
Exception
e
)
{
}
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
}
public
void
addTopic
(
DBObject
doc
){
try
{
this
.
getReadColl
().
insert
(
doc
);
}
catch
(
Exception
e
)
{
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
run
;
package
com
.
zhiwei
.
searchhotcrawler
.
run
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.cache.CacheListener
;
import
com.zhiwei.searchhotcrawler.cache.CacheListener
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.*
;
import
com.zhiwei.searchhotcrawler.timer.DouyinHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.SendWeiboHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.SendZhihuHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.SougoHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.UpdateWechatUserRun
;
import
com.zhiwei.searchhotcrawler.timer.WeiboHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.WeiboTopicRun
;
import
com.zhiwei.searchhotcrawler.timer.ZhihuHotSearchRun
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
java.util.concurrent.Executors
;
import
java.util.concurrent.Executors
;
...
@@ -23,8 +15,10 @@ import java.util.concurrent.TimeUnit;
...
@@ -23,8 +15,10 @@ import java.util.concurrent.TimeUnit;
public
class
HotSearchRun
{
public
class
HotSearchRun
{
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
ProxyConfig
.
registry
,
ProxyConfig
.
group
,
GroupType
.
PROVIDER
,
10000013
);
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
new
UpdateWechatUserRun
().
start
();
new
UpdateWechatUserRun
().
start
();
ZhiWeiTools
.
sleep
(
10000
);
ZhiWeiTools
.
sleep
(
10000
);
...
@@ -51,6 +45,7 @@ public class HotSearchRun {
...
@@ -51,6 +45,7 @@ public class HotSearchRun {
new
SougoHotSearchRun
().
start
();
new
SougoHotSearchRun
().
start
();
new
DouyinHotSearchRun
().
start
();
new
DouyinHotSearchRun
().
start
();
new
ZhihuHotSearchRun
().
start
();
new
ZhihuHotSearchRun
().
start
();
new
WeiboSuperTopicRun
().
start
();
new
WeiboTopicRun
().
start
();
new
WeiboTopicRun
().
start
();
//推送程序启动
//推送程序启动
new
SendWeiboHotSearchRun
().
start
();
new
SendWeiboHotSearchRun
().
start
();
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchListTest.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.Arrays
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DB
;
import
com.mongodb.DB
;
import
com.mongodb.DBCollection
;
import
com.mongodb.DBCollection
;
import
com.mongodb.DBCursor
;
import
com.mongodb.DBCursor
;
import
com.mongodb.DBObject
;
import
com.mongodb.DBObject
;
import
com.mongodb.Mongo
;
import
com.mongodb.Mongo
;
import
com.mongodb.MongoClient
;
import
com.mongodb.MongoClient
;
import
com.mongodb.MongoCredential
;
import
com.mongodb.MongoCredential
;
import
com.mongodb.ServerAddress
;
import
com.mongodb.ServerAddress
;
import
com.mongodb.WriteResult
;
import
com.mongodb.WriteResult
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
public
class
HotSearchListTest
{
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.config.Config
;
public
static
void
main
(
String
[]
args
)
{
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.tools.timeparse.TimeParse
;
MongoCredential
credential
=
MongoCredential
.
createCredential
(
Config
.
userName
,
Config
.
authDB
,
Config
.
userPwd
.
toCharArray
());
import
org.jsoup.Jsoup
;
ServerAddress
address
=
new
ServerAddress
(
Config
.
mongoIp
,
Config
.
mongoPort
);
import
org.jsoup.nodes.Element
;
Mongo
mongo
=
new
MongoClient
(
address
,
Arrays
.
asList
(
credential
));
import
org.jsoup.select.Elements
;
DB
db
=
mongo
.
getDB
(
"hot_search_list"
);
public
class
HotSearchListTest
{
DBCollection
coll
=
db
.
getCollection
(
"hot_search_list2019_09"
);
// MongoCredential credentialNew = MongoCredential.createCredential("datapush", "admin", "4d8ce5c42073c".toCharArray());
public
static
void
main
(
String
[]
args
)
{
// ServerAddress addressNew = new ServerAddress(Config.mongoIp, Config.mongoPort);
// Mongo mongoNew = new MongoClient(address, Arrays.asList(credentialNew));
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
// DB dbNew = mongoNew.getDB("hot_search_list");
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"zzw"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
Map
<
String
,
String
>
timLine
=
TimeParse
.
getTimeMap
(
"2019-10-01 00:00:00"
,
"2019-10-09 23:59:59"
,
"dd"
,
1
);
String
url
=
"http://app.myzaker.com/news/app.php?f="
;
timLine
.
forEach
((
start
,
end
)
->{
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
try
{
String
year
=
end
.
substring
(
0
,
4
);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
String
month
=
end
.
substring
(
5
,
7
);
Elements
elements
=
Jsoup
.
parse
(
htmlBody
).
select
(
"div.titlebar>a"
);
Date
startDate
=
TimeParse
.
stringFormartDate
(
start
);
for
(
Element
element
:
elements
){
Date
endDate
=
TimeParse
.
stringFormartDate
(
end
);
String
lableUrl
=
"http://app.myzaker.com/news/app.php"
+
element
.
attr
(
"href"
);
System
.
out
.
println
(
"lableUrl========="
+
lableUrl
);
String
collName
=
"hot_search_list"
+
year
+
"_"
+
month
;
String
htmlBodyLable
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
lableUrl
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
System
.
out
.
println
(
"collName=========="
+
collName
);
Elements
elementsLable
=
Jsoup
.
parse
(
htmlBodyLable
).
select
(
"div#infinite_scroll>a"
);
// DBCollection collNew = dbNew.getCollection(collName);
// DBObject countIndexDoc = new BasicDBObject();
for
(
Element
elementLable
:
elementsLable
){
// countIndexDoc.put("count", -1);
System
.
out
.
println
(
elementLable
.
attr
(
"href"
)
+
"============="
+
elementLable
.
text
());
// DBObject timeIndexDoc = new BasicDBObject();
}
// timeIndexDoc.put("time", -1);
}
// DBObject rankIndexDoc = new BasicDBObject();
// rankIndexDoc.put("rank", -1);
}
catch
(
Exception
e
){
// DBObject nameIndexDoc = new BasicDBObject();
e
.
printStackTrace
();
// nameIndexDoc.put("name", -1);
}
// DBObject typeIndexDoc = new BasicDBObject();
// typeIndexDoc.put("type", -1);
// try {
// collNew.createIndex(countIndexDoc, new BasicDBObject("name", "count_desc"));
// MongoCredential credential = MongoCredential.createCredential(Config.userName, Config.authDB, Config.userPwd.toCharArray());
// collNew.createIndex(timeIndexDoc, new BasicDBObject("name", "time_desc"));
// ServerAddress address = new ServerAddress(Config.mongoIp, Config.mongoPort);
// collNew.createIndex(rankIndexDoc, new BasicDBObject("name", "rank_desc"));
// Mongo mongo = new MongoClient(address, Arrays.asList(credential));
// collNew.createIndex(nameIndexDoc, new BasicDBObject("name", "name_desc"));
//
// collNew.createIndex(typeIndexDoc, new BasicDBObject("name", "type_desc"));
// DB db = mongo.getDB("hot_search_list");
// } catch (Exception e) {
// DBCollection coll = db.getCollection("hot_search_list2019_09");
// e.printStackTrace();
//
// }
//// MongoCredential credentialNew = MongoCredential.createCredential("datapush", "admin", "4d8ce5c42073c".toCharArray());
//// ServerAddress addressNew = new ServerAddress(Config.mongoIp, Config.mongoPort);
DBObject
query
=
new
BasicDBObject
(
new
BasicDBObject
(
"time"
,
//// Mongo mongoNew = new MongoClient(address, Arrays.asList(credentialNew));
new
BasicDBObject
(
"$gte"
,
startDate
).
append
(
"$lte"
,
endDate
)));
//// DB dbNew = mongoNew.getDB("hot_search_list");
System
.
out
.
println
(
query
);
//
WriteResult
wr
=
coll
.
remove
(
query
);
// Map<String,String> timLine = TimeParse.getTimeMap("2019-10-01 00:00:00", "2019-10-09 23:59:59", "dd", 1);
System
.
out
.
println
(
"========"
+
wr
.
getN
());
//
// int i = 0;
// timLine.forEach((start, end) ->{
// DBCursor cur = coll.remove(query);
//
// System.out.println(query +"======="+ cur.count());
// String year = end.substring(0,4);
// List<DBObject> dataList = new ArrayList<>();
// String month = end.substring(5,7);
// while(cur.hasNext()) {
// Date startDate = TimeParse.stringFormartDate(start);
// DBObject doc = cur.next();
// Date endDate = TimeParse.stringFormartDate(end);
// try {
//
//// collNew.save(doc);
// String collName = "hot_search_list"+year+"_"+month;
// i++;
// System.out.println("collName=========="+collName);
// coll.remove(doc);
//// DBCollection collNew = dbNew.getCollection(collName);
// } catch (Exception e2) {
//// DBObject countIndexDoc = new BasicDBObject();
// e2.printStackTrace();
//// countIndexDoc.put("count", -1);
// }
//// DBObject timeIndexDoc = new BasicDBObject();
// dataList.add(doc);
//// timeIndexDoc.put("time", -1);
// }
//// DBObject rankIndexDoc = new BasicDBObject();
// System.out.println(collName +"数据量大小" +dataList.size());
//// rankIndexDoc.put("rank", -1);
// cur.close();
//// DBObject nameIndexDoc = new BasicDBObject();
// if(!dataList.isEmpty()) {
//// nameIndexDoc.put("name", -1);
// try {
//// DBObject typeIndexDoc = new BasicDBObject();
// collNew.insert(dataList);
//// typeIndexDoc.put("type", -1);
// } catch (Exception e) {
//// try {
// e.printStackTrace();
//// collNew.createIndex(countIndexDoc, new BasicDBObject("name", "count_desc"));
// }
//// collNew.createIndex(timeIndexDoc, new BasicDBObject("name", "time_desc"));
// }
//// collNew.createIndex(rankIndexDoc, new BasicDBObject("name", "rank_desc"));
});
//// collNew.createIndex(nameIndexDoc, new BasicDBObject("name", "name_desc"));
mongo
.
close
();
//// collNew.createIndex(typeIndexDoc, new BasicDBObject("name", "type_desc"));
}
//// } catch (Exception e) {
//// e.printStackTrace();
//// }
//
}
// DBObject query = new BasicDBObject(new BasicDBObject("time",
// new BasicDBObject("$gte",startDate).append("$lte", endDate)));
// System.out.println(query);
// WriteResult wr = coll.remove(query);
// System.out.println("========"+wr.getN());
//// int i = 0;
//// DBCursor cur = coll.remove(query);
//// System.out.println(query +"======="+ cur.count());
//// List<DBObject> dataList = new ArrayList<>();
//// while(cur.hasNext()) {
//// DBObject doc = cur.next();
//// try {
////// collNew.save(doc);
//// i++;
//// coll.remove(doc);
//// } catch (Exception e2) {
//// e2.printStackTrace();
//// }
//// dataList.add(doc);
//// }
//// System.out.println(collName +"数据量大小" +dataList.size());
//// cur.close();
//// if(!dataList.isEmpty()) {
//// try {
//// collNew.insert(dataList);
//// } catch (Exception e) {
//// e.printStackTrace();
//// }
//// }
// });
// mongo.close();
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
View file @
cb6bcd76
...
@@ -6,6 +6,7 @@ import java.util.List;
...
@@ -6,6 +6,7 @@ import java.util.List;
import
java.util.Objects
;
import
java.util.Objects
;
import
java.util.concurrent.TimeUnit
;
import
java.util.concurrent.TimeUnit
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
...
@@ -16,10 +17,9 @@ import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
...
@@ -16,10 +17,9 @@ import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
BaiduHotSearchRun
extends
Thread
{
public
class
BaiduHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaiduHotSearchRun
.
class
);
@Override
@Override
public
void
run
()
{
public
void
run
()
{
boolean
f
=
true
;
boolean
f
=
true
;
...
@@ -37,10 +37,10 @@ public class BaiduHotSearchRun extends Thread{
...
@@ -37,10 +37,10 @@ public class BaiduHotSearchRun extends Thread{
private
void
getHotList
()
{
private
void
getHotList
()
{
log
ger
.
info
(
"百度风云榜采集开始........"
);
log
.
info
(
"百度风云榜采集开始........"
);
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
List
<
HotSearchList
>
list
=
BaiDuHotSearchCrawler
.
baiduHotSearch
();
List
<
HotSearchList
>
list
=
BaiDuHotSearchCrawler
.
baiduHotSearch
();
logger
.
info
(
"{}, 此轮百度风云榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"{}, 此轮百度风云榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
saveDataList
=
new
ArrayList
<>();
List
<
DBObject
>
saveDataList
=
new
ArrayList
<>();
if
(
Objects
.
nonNull
(
list
)
&&
!
list
.
isEmpty
())
{
if
(
Objects
.
nonNull
(
list
)
&&
!
list
.
isEmpty
())
{
list
.
forEach
(
baiduHotSearch
->{
list
.
forEach
(
baiduHotSearch
->{
...
@@ -59,7 +59,7 @@ public class BaiduHotSearchRun extends Thread{
...
@@ -59,7 +59,7 @@ public class BaiduHotSearchRun extends Thread{
});
});
}
}
hotSearchDAO
.
addHotSearchList
(
saveDataList
);
hotSearchDAO
.
addHotSearchList
(
saveDataList
);
log
ger
.
info
(
"百度风云榜采集结束........"
);
log
.
info
(
"百度风云榜采集结束........"
);
}
}
}
}
\ No newline at end of file
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
View file @
cb6bcd76
...
@@ -5,6 +5,7 @@ import java.util.Date;
...
@@ -5,6 +5,7 @@ import java.util.Date;
import
java.util.List
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
java.util.concurrent.TimeUnit
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
...
@@ -15,10 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.DouyinHotSearchCrawler;
...
@@ -15,10 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.DouyinHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
DouyinHotSearchRun
extends
Thread
{
public
class
DouyinHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
DouyinHotSearchRun
.
class
);
@Override
@Override
public
void
run
()
{
public
void
run
()
{
boolean
f
=
true
;
boolean
f
=
true
;
...
@@ -40,10 +40,10 @@ public class DouyinHotSearchRun extends Thread{
...
@@ -40,10 +40,10 @@ public class DouyinHotSearchRun extends Thread{
* @return void
* @return void
*/
*/
private
void
getHotList
()
{
private
void
getHotList
()
{
log
ger
.
info
(
"抖音热搜榜采集开始........"
);
log
.
info
(
"抖音热搜榜采集开始........"
);
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
List
<
HotSearchList
>
list
=
DouyinHotSearchCrawler
.
getMobileDouyinHotList
();
List
<
HotSearchList
>
list
=
DouyinHotSearchCrawler
.
getMobileDouyinHotList
();
log
ger
.
info
(
"{}, 抖音热搜榜此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"{}, 抖音热搜榜此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<>();
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
HotSearchList
douyinHotSearch
:
list
){
for
(
HotSearchList
douyinHotSearch
:
list
){
int
changeCount
=
hotSearchDAO
.
getChangeCount
(
douyinHotSearch
);
int
changeCount
=
hotSearchDAO
.
getChangeCount
(
douyinHotSearch
);
...
@@ -60,7 +60,7 @@ public class DouyinHotSearchRun extends Thread{
...
@@ -60,7 +60,7 @@ public class DouyinHotSearchRun extends Thread{
data
.
add
(
douyin
);
data
.
add
(
douyin
);
hotSearchDAO
.
addHotSearch
(
douyin
);
hotSearchDAO
.
addHotSearch
(
douyin
);
}
}
log
ger
.
info
(
"抖音热搜榜采集结束........"
);
log
.
info
(
"抖音热搜榜采集结束........"
);
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/SendWeiboHotSearchRun.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
java.util.Calendar
;
import
java.util.Calendar
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.mongodb.DBObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.searchhotcrawler.dao.WechatUserDao
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.dao.WechatUserDao
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.Template
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.util.WechatCodeUtil
;
import
com.zhiwei.searchhotcrawler.util.Template
;
import
com.zhiwei.searchhotcrawler.util.WechatConstant
;
import
com.zhiwei.searchhotcrawler.util.WechatCodeUtil
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.searchhotcrawler.util.WechatConstant
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public
class
SendWeiboHotSearchRun
extends
Thread
{
private
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
@Log4j2
private
static
WechatUserDao
wechatUserDao
=
new
WechatUserDao
();
public
class
SendWeiboHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SendWeiboHotSearchRun
.
class
);
private
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
@Override
private
static
WechatUserDao
wechatUserDao
=
new
WechatUserDao
();
public
void
run
()
{
@Override
while
(
true
)
{
public
void
run
()
{
try
{
while
(
true
)
{
Calendar
calendar
=
Calendar
.
getInstance
();
try
{
int
hour
=
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
);
Calendar
calendar
=
Calendar
.
getInstance
();
logger
.
info
(
"微博推送,当前系统时间为:"
+
hour
);
int
hour
=
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
);
if
(
hour
>
6
&&
hour
<
23
)
{
log
.
info
(
"微博推送,当前系统时间为:"
+
hour
);
List
<
DBObject
>
list
=
hotSearchDAO
.
getHotOneHour
(
HotSearchType
.
微博热搜
.
name
());
if
(
hour
>
6
&&
hour
<
23
)
{
if
(
list
!=
null
&&
!
list
.
isEmpty
())
{
List
<
DBObject
>
list
=
hotSearchDAO
.
getHotOneHour
(
HotSearchType
.
微博热搜
.
name
());
for
(
DBObject
weibo
:
list
)
{
if
(
list
!=
null
&&
!
list
.
isEmpty
())
{
String
title
=
weibo
.
get
(
"name"
).
toString
();
for
(
DBObject
weibo
:
list
)
{
String
time
=
TimeParse
.
dateFormartString
((
Date
)
weibo
.
get
(
"time"
),
"yyyy-MM-dd HH:mm:ss"
);
String
title
=
weibo
.
get
(
"name"
).
toString
();
String
url
=
weibo
.
get
(
"url"
).
toString
();
String
time
=
TimeParse
.
dateFormartString
((
Date
)
weibo
.
get
(
"time"
),
"yyyy-MM-dd HH:mm:ss"
);
sendTemplateByUserIds
(
title
,
time
,
url
);
String
url
=
weibo
.
get
(
"url"
).
toString
();
}
sendTemplateByUserIds
(
title
,
time
,
url
);
}
else
{
}
logger
.
info
(
"微博最近一小时无数据"
);
}
else
{
sendTemplateByUserIds
(
"最近一小时无数据"
,
log
.
info
(
"微博最近一小时无数据"
);
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd HH:mm:ss"
),
null
);
sendTemplateByUserIds
(
"最近一小时无数据"
,
}
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd HH:mm:ss"
),
null
);
}
}
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
}
}
catch
(
Exception
e
)
{
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
logger
.
debug
(
"微博热搜推送出现问题,问题为:::{}"
,
e
.
fillInStackTrace
());
}
catch
(
Exception
e
)
{
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
log
.
debug
(
"微博热搜推送出现问题,问题为:::{}"
,
e
.
fillInStackTrace
());
continue
;
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
}
continue
;
}
}
}
}
}
/**
* @Title: sendTemplateByUserIds
/**
* @author hero
* @Title: sendTemplateByUserIds
* @Description: 发送模版消息
* @author hero
* @param @param
* @Description: 发送模版消息
* microTouTiao
* @param @param
* @param @param
* microTouTiao
* userList 设定文件
* @param @param
* @return void 返回类型
* userList 设定文件
*/
* @return void 返回类型
public
static
void
sendTemplateByUserIds
(
String
title
,
String
time
,
String
url
)
{
*/
Map
<
String
,
Object
>
dataMap
=
new
HashMap
<
String
,
Object
>();
public
static
void
sendTemplateByUserIds
(
String
title
,
String
time
,
String
url
)
{
JSONObject
first
=
new
JSONObject
();
Map
<
String
,
Object
>
dataMap
=
new
HashMap
<
String
,
Object
>();
first
.
put
(
"value"
,
"您好,有一条来自微博热搜榜的预警通知。"
);
JSONObject
first
=
new
JSONObject
();
dataMap
.
put
(
"first"
,
first
);
first
.
put
(
"value"
,
"您好,有一条来自微博热搜榜的预警通知。"
);
JSONObject
keyword1
=
new
JSONObject
();
dataMap
.
put
(
"first"
,
first
);
keyword1
.
put
(
"value"
,
title
);
JSONObject
keyword1
=
new
JSONObject
();
keyword1
.
put
(
"color"
,
"#173177"
);
keyword1
.
put
(
"value"
,
title
);
dataMap
.
put
(
"keyword1"
,
keyword1
);
keyword1
.
put
(
"color"
,
"#173177"
);
JSONObject
keyword2
=
new
JSONObject
();
dataMap
.
put
(
"keyword1"
,
keyword1
);
keyword2
.
put
(
"value"
,
"微博热搜榜"
);
JSONObject
keyword2
=
new
JSONObject
();
keyword2
.
put
(
"color"
,
"#173177"
);
keyword2
.
put
(
"value"
,
"微博热搜榜"
);
dataMap
.
put
(
"keyword2"
,
keyword2
);
keyword2
.
put
(
"color"
,
"#173177"
);
JSONObject
keyword3
=
new
JSONObject
();
dataMap
.
put
(
"keyword2"
,
keyword2
);
keyword3
.
put
(
"value"
,
time
);
JSONObject
keyword3
=
new
JSONObject
();
keyword3
.
put
(
"color"
,
"#173177"
);
keyword3
.
put
(
"value"
,
time
);
dataMap
.
put
(
"keyword3"
,
keyword3
);
keyword3
.
put
(
"color"
,
"#173177"
);
JSONObject
remark
=
new
JSONObject
();
dataMap
.
put
(
"keyword3"
,
keyword3
);
remark
.
put
(
"value"
,
"知微情报监测服务"
);
JSONObject
remark
=
new
JSONObject
();
dataMap
.
put
(
"remark"
,
remark
);
remark
.
put
(
"value"
,
"知微情报监测服务"
);
List
<
String
>
userList
=
getUserList
();
dataMap
.
put
(
"remark"
,
remark
);
if
(
userList
!=
null
&&
userList
.
size
()
>
0
)
{
List
<
String
>
userList
=
getUserList
();
for
(
String
openId
:
userList
)
{
if
(
userList
!=
null
&&
userList
.
size
()
>
0
)
{
Template
template
=
new
Template
();
for
(
String
openId
:
userList
)
{
template
.
setTouser
(
openId
);
Template
template
=
new
Template
();
if
(
url
!=
null
)
{
template
.
setTouser
(
openId
);
template
.
setUrl
(
url
);
if
(
url
!=
null
)
{
}
template
.
setUrl
(
url
);
template
.
setTemplate_id
(
WechatConstant
.
WECHAT_TEMPLATEID_EARLY_IT
);
}
template
.
setData
(
dataMap
);
template
.
setTemplate_id
(
WechatConstant
.
WECHAT_TEMPLATEID_EARLY_IT
);
template
.
setData
(
dataMap
);
JSONObject
templateJson
=
(
JSONObject
)
JSONObject
.
toJSON
(
template
);
WechatCodeUtil
.
sendDataJson
(
templateJson
);
JSONObject
templateJson
=
(
JSONObject
)
JSONObject
.
toJSON
(
template
);
}
WechatCodeUtil
.
sendDataJson
(
templateJson
);
}
else
{
}
logger
.
info
(
"拉取微博用户列表失败"
);
}
else
{
}
log
.
info
(
"拉取微博用户列表失败"
);
}
}
}
/**
* @Title: getUserList
/**
* @author hero
* @Title: getUserList
* @Description: 用户列表
* @author hero
* @param @param
* @Description: 用户列表
* projectName
* @param @param
* @param @return
* projectName
* 设定文件
* @param @return
* @return List<String> 返回类型
* 设定文件
*/
* @return List<String> 返回类型
public
static
List
<
String
>
getUserList
()
{
*/
List
<
String
>
userList
=
wechatUserDao
.
getWechatUserByGroup
(
"weibohot"
);
public
static
List
<
String
>
getUserList
()
{
if
(
userList
==
null
){
List
<
String
>
userList
=
wechatUserDao
.
getWechatUserByGroup
(
"weibohot"
);
userList
=
WechatCodeUtil
.
getUserListByGroupName
(
"weibohot"
);
if
(
userList
==
null
){
}
userList
=
WechatCodeUtil
.
getUserListByGroupName
(
"weibohot"
);
return
userList
;
}
}
return
userList
;
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/SendZhihuHotSearchRun.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
java.util.Calendar
;
import
java.util.Calendar
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.mongodb.DBObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.dao.WechatUserDao
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.util.Template
;
import
com.zhiwei.searchhotcrawler.dao.WechatUserDao
;
import
com.zhiwei.searchhotcrawler.util.WechatCodeUtil
;
import
com.zhiwei.searchhotcrawler.util.Template
;
import
com.zhiwei.searchhotcrawler.util.WechatConstant
;
import
com.zhiwei.searchhotcrawler.util.WechatCodeUtil
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.searchhotcrawler.util.WechatConstant
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public
class
SendZhihuHotSearchRun
extends
Thread
{
private
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
@Log4j2
private
static
WechatUserDao
wechatUserDao
=
new
WechatUserDao
();
public
class
SendZhihuHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SendZhihuHotSearchRun
.
class
);
private
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
@Override
private
static
WechatUserDao
wechatUserDao
=
new
WechatUserDao
();
public
void
run
()
{
@Override
public
void
run
()
{
while
(
true
)
{
try
{
while
(
true
)
{
Calendar
calendar
=
Calendar
.
getInstance
();
try
{
int
hour
=
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
);
Calendar
calendar
=
Calendar
.
getInstance
();
logger
.
info
(
"知乎推送,当前系统时间为:"
+
hour
);
int
hour
=
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
);
if
(
hour
>
6
&&
hour
<
23
){
log
.
info
(
"知乎推送,当前系统时间为:"
+
hour
);
List
<
DBObject
>
list
=
hotSearchDAO
.
getHotOneHour
(
HotSearchType
.
知乎热搜
.
name
());
if
(
hour
>
6
&&
hour
<
23
){
if
(
list
!=
null
&&
!
list
.
isEmpty
()){
List
<
DBObject
>
list
=
hotSearchDAO
.
getHotOneHour
(
HotSearchType
.
知乎热搜
.
name
());
for
(
DBObject
zhihu
:
list
){
if
(
list
!=
null
&&
!
list
.
isEmpty
()){
String
title
=
zhihu
.
get
(
"display_query"
).
toString
();
for
(
DBObject
zhihu
:
list
){
String
time
=
TimeParse
.
dateFormartString
((
Date
)
zhihu
.
get
(
"time"
),
"yyyy-MM-dd HH:mm:ss"
);
String
title
=
zhihu
.
get
(
"display_query"
).
toString
();
String
url
=
zhihu
.
get
(
"_id"
).
toString
();
String
time
=
TimeParse
.
dateFormartString
((
Date
)
zhihu
.
get
(
"time"
),
"yyyy-MM-dd HH:mm:ss"
);
if
(
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
)
>
6
&&
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
)
<
23
){
String
url
=
zhihu
.
get
(
"_id"
).
toString
();
sendTemplateByUserIds
(
title
,
time
,
url
);
if
(
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
)
>
6
&&
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
)
<
23
){
}
sendTemplateByUserIds
(
title
,
time
,
url
);
}
}
}
else
{
}
logger
.
info
(
"知乎最近一小时无数据"
);
}
else
{
sendTemplateByUserIds
(
"最近一小时无数据"
,
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd HH:mm:ss"
),
null
);
log
.
info
(
"知乎最近一小时无数据"
);
}
sendTemplateByUserIds
(
"最近一小时无数据"
,
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd HH:mm:ss"
),
null
);
}
}
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
}
}
catch
(
Exception
e
)
{
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
logger
.
debug
(
"知乎热搜推送出现问题,问题为:::{}"
,
e
.
fillInStackTrace
());
}
catch
(
Exception
e
)
{
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
log
.
debug
(
"知乎热搜推送出现问题,问题为:::{}"
,
e
.
fillInStackTrace
());
}
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
}
}
}
}
}
/**
* @Title: sendTemplateByUserIds
/**
* @author hero
* @Title: sendTemplateByUserIds
* @Description: 发送模版消息
* @author hero
* @param @param microTouTiao
* @Description: 发送模版消息
* @param @param userList 设定文件
* @param @param microTouTiao
* @return void 返回类型
* @param @param userList 设定文件
*/
* @return void 返回类型
public
static
void
sendTemplateByUserIds
(
String
title
,
String
time
,
String
url
)
{
*/
public
static
void
sendTemplateByUserIds
(
String
title
,
String
time
,
String
url
)
{
Map
<
String
,
Object
>
dataMap
=
new
HashMap
<>();
JSONObject
first
=
new
JSONObject
();
Map
<
String
,
Object
>
dataMap
=
new
HashMap
<>();
first
.
put
(
"value"
,
"您好,有一条来自知乎热搜榜的预警通知。"
);
JSONObject
first
=
new
JSONObject
();
dataMap
.
put
(
"first"
,
first
);
first
.
put
(
"value"
,
"您好,有一条来自知乎热搜榜的预警通知。"
);
JSONObject
keyword1
=
new
JSONObject
();
dataMap
.
put
(
"first"
,
first
);
keyword1
.
put
(
"value"
,
title
);
JSONObject
keyword1
=
new
JSONObject
();
keyword1
.
put
(
"color"
,
"#173177"
);
keyword1
.
put
(
"value"
,
title
);
dataMap
.
put
(
"keyword1"
,
keyword1
);
keyword1
.
put
(
"color"
,
"#173177"
);
JSONObject
keyword2
=
new
JSONObject
();
dataMap
.
put
(
"keyword1"
,
keyword1
);
keyword2
.
put
(
"value"
,
"知乎热搜榜"
);
JSONObject
keyword2
=
new
JSONObject
();
keyword2
.
put
(
"color"
,
"#173177"
);
keyword2
.
put
(
"value"
,
"知乎热搜榜"
);
dataMap
.
put
(
"keyword2"
,
keyword2
);
keyword2
.
put
(
"color"
,
"#173177"
);
JSONObject
keyword3
=
new
JSONObject
();
dataMap
.
put
(
"keyword2"
,
keyword2
);
keyword3
.
put
(
"value"
,
time
);
JSONObject
keyword3
=
new
JSONObject
();
keyword3
.
put
(
"color"
,
"#173177"
);
keyword3
.
put
(
"value"
,
time
);
dataMap
.
put
(
"keyword3"
,
keyword3
);
keyword3
.
put
(
"color"
,
"#173177"
);
JSONObject
remark
=
new
JSONObject
();
dataMap
.
put
(
"keyword3"
,
keyword3
);
remark
.
put
(
"value"
,
"知微情报监测服务"
);
JSONObject
remark
=
new
JSONObject
();
dataMap
.
put
(
"remark"
,
remark
);
remark
.
put
(
"value"
,
"知微情报监测服务"
);
dataMap
.
put
(
"remark"
,
remark
);
List
<
String
>
userList
=
getUserList
();
if
(
userList
!=
null
&&
!
userList
.
isEmpty
())
{
List
<
String
>
userList
=
getUserList
();
for
(
String
openId
:
userList
)
{
if
(
userList
!=
null
&&
!
userList
.
isEmpty
())
{
Template
template
=
new
Template
();
for
(
String
openId
:
userList
)
{
template
.
setTouser
(
openId
);
Template
template
=
new
Template
();
if
(
url
!=
null
){
template
.
setTouser
(
openId
);
template
.
setUrl
(
url
);
if
(
url
!=
null
){
}
template
.
setUrl
(
url
);
template
.
setTemplate_id
(
WechatConstant
.
WECHAT_TEMPLATEID_EARLY_IT
);
}
template
.
setData
(
dataMap
);
template
.
setTemplate_id
(
WechatConstant
.
WECHAT_TEMPLATEID_EARLY_IT
);
template
.
setData
(
dataMap
);
JSONObject
templateJson
=
(
JSONObject
)
JSONObject
.
toJSON
(
template
);
WechatCodeUtil
.
sendDataJson
(
templateJson
);
JSONObject
templateJson
=
(
JSONObject
)
JSONObject
.
toJSON
(
template
);
}
WechatCodeUtil
.
sendDataJson
(
templateJson
);
}
else
{
}
logger
.
info
(
"知乎推送拉取用户列表失败"
);
}
else
{
}
log
.
info
(
"知乎推送拉取用户列表失败"
);
}
}
}
/**
* @Title: getUserList
/**
* @author hero
* @Title: getUserList
* @Description: 用户列表
* @author hero
* @param @param projectName
* @Description: 用户列表
* @param @return 设定文件
* @param @param projectName
* @return List<String> 返回类型
* @param @return 设定文件
*/
* @return List<String> 返回类型
private
static
List
<
String
>
getUserList
()
*/
{
private
static
List
<
String
>
getUserList
()
List
<
String
>
userList
=
wechatUserDao
.
getWechatUserByGroup
(
"LP组"
);
{
if
(
userList
==
null
){
List
<
String
>
userList
=
wechatUserDao
.
getWechatUserByGroup
(
"LP组"
);
userList
=
WechatCodeUtil
.
getUserListByGroupName
(
"LP组"
);
if
(
userList
==
null
){
}
userList
=
WechatCodeUtil
.
getUserListByGroupName
(
"LP组"
);
return
userList
;
}
}
return
userList
;
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
View file @
cb6bcd76
...
@@ -5,6 +5,7 @@ import java.util.Date;
...
@@ -5,6 +5,7 @@ import java.util.Date;
import
java.util.List
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
java.util.concurrent.TimeUnit
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
...
@@ -15,9 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler;
...
@@ -15,9 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
SougoHotSearchRun
extends
Thread
{
public
class
SougoHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SougoHotSearchRun
.
class
);
@Override
@Override
public
void
run
()
{
public
void
run
()
{
boolean
f
=
true
;
boolean
f
=
true
;
...
@@ -36,9 +37,9 @@ public class SougoHotSearchRun extends Thread {
...
@@ -36,9 +37,9 @@ public class SougoHotSearchRun extends Thread {
private
void
getHotList
()
{
private
void
getHotList
()
{
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
log
ger
.
info
(
"搜狗微信采集开始........"
);
log
.
info
(
"搜狗微信采集开始........"
);
List
<
HotSearchList
>
list
=
SougoHotSearchCrawler
.
sougoHotSearch
();
List
<
HotSearchList
>
list
=
SougoHotSearchCrawler
.
sougoHotSearch
();
logger
.
info
(
"{}, 此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"{}, 此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<>();
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
HotSearchList
sougoHotSearch
:
list
){
for
(
HotSearchList
sougoHotSearch
:
list
){
DBObject
doc
=
new
BasicDBObject
();
DBObject
doc
=
new
BasicDBObject
();
...
@@ -52,7 +53,7 @@ public class SougoHotSearchRun extends Thread {
...
@@ -52,7 +53,7 @@ public class SougoHotSearchRun extends Thread {
data
.
add
(
doc
);
data
.
add
(
doc
);
}
}
hotSearchDAO
.
addHotSearchList
(
data
);
hotSearchDAO
.
addHotSearchList
(
data
);
log
ger
.
info
(
"搜狗微信采集结束........"
);
log
.
info
(
"搜狗微信采集结束........"
);
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/UpdateWechatUserRun.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
java.util.Calendar
;
import
java.util.Calendar
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
java.util.Map.Entry
;
import
org.slf4j.Logger
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.searchhotcrawler.dao.WechatUserDao
;
import
com.zhiwei.searchhotcrawler.util.WechatCodeUtil
;
import
com.zhiwei.searchhotcrawler.dao.WechatUserDao
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.searchhotcrawler.util.WechatCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public
class
UpdateWechatUserRun
extends
Thread
{
private
WechatUserDao
wechatUserDao
=
new
WechatUserDao
();
@Log4j2
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
UpdateWechatUserRun
.
class
);
public
class
UpdateWechatUserRun
extends
Thread
{
@Override
private
WechatUserDao
wechatUserDao
=
new
WechatUserDao
();
public
void
run
()
{
@Override
logger
.
info
(
"开始更新用户数据"
);
public
void
run
()
{
while
(
true
)
{
log
.
info
(
"开始更新用户数据"
);
try
{
while
(
true
)
{
Calendar
calendar
=
Calendar
.
getInstance
();
try
{
int
hour
=
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
);
Calendar
calendar
=
Calendar
.
getInstance
();
if
(
hour
>
6
){
int
hour
=
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
);
Map
<
String
,
Integer
>
groupMap
=
WechatCodeUtil
.
getAllGroupIp
();
if
(
hour
>
6
){
logger
.
info
(
"此公众号的分组数量为:::{}"
,
groupMap
.
size
());
Map
<
String
,
Integer
>
groupMap
=
WechatCodeUtil
.
getAllGroupIp
();
if
(!
groupMap
.
isEmpty
()
&&
groupMap
!=
null
){
log
.
info
(
"此公众号的分组数量为:::{}"
,
groupMap
.
size
());
for
(
Entry
<
String
,
Integer
>
group
:
groupMap
.
entrySet
()){
if
(!
groupMap
.
isEmpty
()
&&
groupMap
!=
null
){
logger
.
info
(
"此公众号的分组名称及IP为:::{},{}"
,
group
.
getKey
(),
group
.
getValue
());
for
(
Entry
<
String
,
Integer
>
group
:
groupMap
.
entrySet
()){
List
<
String
>
userList
=
WechatCodeUtil
.
getUserListByGroupId
(
group
.
getValue
());
log
.
info
(
"此公众号的分组名称及IP为:::{},{}"
,
group
.
getKey
(),
group
.
getValue
());
logger
.
info
(
"{},此分组下的用户数量为::{}"
,
group
.
getKey
(),
userList
.
size
());
List
<
String
>
userList
=
WechatCodeUtil
.
getUserListByGroupId
(
group
.
getValue
());
if
(
userList
!=
null
&&
!
userList
.
isEmpty
()){
log
.
info
(
"{},此分组下的用户数量为::{}"
,
group
.
getKey
(),
userList
.
size
());
wechatUserDao
.
addWechatUser
(
userList
,
group
.
getKey
(),
group
.
getValue
());
if
(
userList
!=
null
&&
!
userList
.
isEmpty
()){
}
wechatUserDao
.
addWechatUser
(
userList
,
group
.
getKey
(),
group
.
getValue
());
}
}
}
}
}
}
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
}
}
catch
(
Exception
e
)
{
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
logger
.
debug
(
"知乎热搜推送出现问题,问题为:::{}"
,
e
.
fillInStackTrace
());
}
catch
(
Exception
e
)
{
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
log
.
debug
(
"知乎热搜推送出现问题,问题为:::{}"
,
e
.
fillInStackTrace
());
continue
;
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
}
continue
;
}
}
}
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboHotSearchRun.java
View file @
cb6bcd76
...
@@ -5,6 +5,7 @@ import java.util.Date;
...
@@ -5,6 +5,7 @@ import java.util.Date;
import
java.util.List
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
java.util.concurrent.TimeUnit
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
...
@@ -15,10 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler;
...
@@ -15,10 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
WeiboHotSearchRun
extends
Thread
{
public
class
WeiboHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WeiboHotSearchRun
.
class
);
@Override
@Override
public
void
run
()
{
public
void
run
()
{
boolean
f
=
true
;
boolean
f
=
true
;
...
@@ -36,11 +36,11 @@ public class WeiboHotSearchRun extends Thread{
...
@@ -36,11 +36,11 @@ public class WeiboHotSearchRun extends Thread{
private
void
getHotList
()
{
private
void
getHotList
()
{
log
ger
.
info
(
"微博话题采集开始........"
);
log
.
info
(
"微博话题采集开始........"
);
HotSearchListDAO
weiboHotSearchDAO
=
new
HotSearchListDAO
();
HotSearchListDAO
weiboHotSearchDAO
=
new
HotSearchListDAO
();
// List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearch();
// List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearch();
List
<
HotSearchList
>
list
=
WeiboHotSearchCrawler
.
weiboHotSearchByPhone
();
List
<
HotSearchList
>
list
=
WeiboHotSearchCrawler
.
weiboHotSearchByPhone
();
logger
.
info
(
"{}, 微博此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"{}, 微博此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<>();
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
HotSearchList
weiboHotSearch
:
list
){
for
(
HotSearchList
weiboHotSearch
:
list
){
int
changeCount
=
weiboHotSearchDAO
.
getChangeCount
(
weiboHotSearch
);
int
changeCount
=
weiboHotSearchDAO
.
getChangeCount
(
weiboHotSearch
);
...
@@ -49,7 +49,7 @@ public class WeiboHotSearchRun extends Thread{
...
@@ -49,7 +49,7 @@ public class WeiboHotSearchRun extends Thread{
doc
.
put
(
"name"
,
weiboHotSearch
.
getName
());
doc
.
put
(
"name"
,
weiboHotSearch
.
getName
());
doc
.
put
(
"url"
,
weiboHotSearch
.
getUrl
());
doc
.
put
(
"url"
,
weiboHotSearch
.
getUrl
());
doc
.
put
(
"count"
,
weiboHotSearch
.
getCount
());
doc
.
put
(
"count"
,
weiboHotSearch
.
getCount
());
doc
.
put
(
"hot"
,
weiboHotSearch
.
is
Hot
());
doc
.
put
(
"hot"
,
weiboHotSearch
.
get
Hot
());
doc
.
put
(
"day"
,
weiboHotSearch
.
getDay
());
doc
.
put
(
"day"
,
weiboHotSearch
.
getDay
());
doc
.
put
(
"time"
,
weiboHotSearch
.
getTime
());
doc
.
put
(
"time"
,
weiboHotSearch
.
getTime
());
doc
.
put
(
"changeCount"
,
changeCount
);
doc
.
put
(
"changeCount"
,
changeCount
);
...
@@ -59,7 +59,7 @@ public class WeiboHotSearchRun extends Thread{
...
@@ -59,7 +59,7 @@ public class WeiboHotSearchRun extends Thread{
data
.
add
(
doc
);
data
.
add
(
doc
);
}
}
weiboHotSearchDAO
.
addHotSearchList
(
data
);
weiboHotSearchDAO
.
addHotSearchList
(
data
);
log
ger
.
info
(
"微博话题采集结束........"
);
log
.
info
(
"微博话题采集结束........"
);
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboSuperTopicRun.java
0 → 100644
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
com.zhiwei.searchhotcrawler.crawler.WeiboSuperTopicCrawler
;
import
com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
WeiboSuperTopicRun
extends
Thread
{
@Override
public
void
run
()
{
boolean
f
=
true
;
while
(
f
)
{
try
{
getTopicList
();
TimeUnit
.
HOURS
.
sleep
(
3
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
}
ZhiWeiTools
.
sleep
(
50
);
}
}
private
void
getTopicList
()
{
WeiboSuperTopicDAO
weiboTopicDAO
=
new
WeiboSuperTopicDAO
();
log
.
info
(
"微博超话采集开始........"
);
List
<
WeiboSuperTopic
>
list
=
WeiboSuperTopicCrawler
.
startCrawler
();
log
.
info
(
"{}, 微博超话此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
WeiboSuperTopic
topic
:
list
){
log
.
info
(
"topic::::{}"
,
topic
);
DBObject
doc
=
new
BasicDBObject
();
doc
.
put
(
"_id"
,
topic
.
getId
());
doc
.
put
(
"name"
,
topic
.
getTopicName
());
doc
.
put
(
"rank"
,
topic
.
getRank
());
doc
.
put
(
"score_num"
,
topic
.
getScore
());
doc
.
put
(
"fensi_num"
,
topic
.
getFensi
());
doc
.
put
(
"post_num"
,
topic
.
getPostNum
());
doc
.
put
(
"type"
,
topic
.
getType
());
doc
.
put
(
"day"
,
topic
.
getDay
());
doc
.
put
(
"time"
,
topic
.
getTime
());
doc
.
put
(
"url"
,
topic
.
getUrl
());
data
.
add
(
doc
);
}
weiboTopicDAO
.
addTopicList
(
data
);
log
.
info
(
"微博话题采集结束........"
);
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboTopicRun.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.crawler.WeiboTopicCrawler
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
java.util.concurrent.TimeUnit
;
import
org.slf4j.Logger
;
@Log4j2
import
org.slf4j.LoggerFactory
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.WeiboTopic
;
import
com.zhiwei.searchhotcrawler.crawler.WeiboHuatiCrawler
;
import
com.zhiwei.searchhotcrawler.dao.WeiboTopicDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public
class
WeiboTopicRun
extends
Thread
{
public
class
WeiboTopicRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WeiboTopicRun
.
class
);
@Override
@Override
public
void
run
()
{
public
void
run
()
{
boolean
f
=
true
;
boolean
f
=
true
;
while
(
f
)
{
while
(
f
)
{
try
{
try
{
getTopicList
();
getTopicList
();
TimeUnit
.
HOUR
S
.
sleep
(
3
);
TimeUnit
.
MINUTE
S
.
sleep
(
3
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
...
@@ -36,28 +31,29 @@ public class WeiboTopicRun extends Thread{
...
@@ -36,28 +31,29 @@ public class WeiboTopicRun extends Thread{
private
void
getTopicList
()
{
private
void
getTopicList
()
{
WeiboTopicDAO
weiboTopicDAO
=
new
WeiboTopic
DAO
();
HotSearchListDAO
weiboHotSearchDAO
=
new
HotSearchList
DAO
();
log
ger
.
info
(
"微博超话
采集开始........"
);
log
.
info
(
"微博话题
采集开始........"
);
List
<
WeiboTopic
>
list
=
WeiboHuatiCrawler
.
startCrawler
();
List
<
HotSearchList
>
list
=
WeiboTopicCrawler
.
startCrawlerByPhone
();
logger
.
info
(
"{}, 微博超话
此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"{}, 微博话题
此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<>();
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
WeiboTopic
topic
:
list
){
for
(
HotSearchList
topic
:
list
){
logger
.
info
(
"topic::::{}"
,
topic
);
log
.
info
(
"topic::::{}"
,
topic
);
DBObject
doc
=
new
BasicDBObject
();
DBObject
doc
=
new
BasicDBObject
();
doc
.
put
(
"_id"
,
topic
.
getId
());
doc
.
put
(
"_id"
,
topic
.
getId
());
doc
.
put
(
"name"
,
topic
.
getTopicName
());
doc
.
put
(
"name"
,
topic
.
getName
());
doc
.
put
(
"rank"
,
topic
.
getRank
());
doc
.
put
(
"url"
,
topic
.
getUrl
());
doc
.
put
(
"score_num"
,
topic
.
getScore
());
doc
.
put
(
"count"
,
topic
.
getCount
());
doc
.
put
(
"fensi_num"
,
topic
.
getFensi
());
doc
.
put
(
"hot"
,
topic
.
getHot
());
doc
.
put
(
"post_num"
,
topic
.
getPostNum
());
doc
.
put
(
"type"
,
topic
.
getType
());
doc
.
put
(
"day"
,
topic
.
getDay
());
doc
.
put
(
"day"
,
topic
.
getDay
());
doc
.
put
(
"time"
,
topic
.
getTime
());
doc
.
put
(
"time"
,
topic
.
getTime
());
doc
.
put
(
"url"
,
topic
.
getUrl
());
doc
.
put
(
"rank"
,
topic
.
getRank
());
doc
.
put
(
"type"
,
topic
.
getType
());
doc
.
put
(
"topic_lead"
,
topic
.
getTopicLead
());
doc
.
put
(
"comment_count"
,
topic
.
getCommentCount
());
data
.
add
(
doc
);
data
.
add
(
doc
);
}
}
weibo
TopicDAO
.
addTopic
List
(
data
);
weibo
HotSearchDAO
.
addHotSearch
List
(
data
);
log
ger
.
info
(
"微博话题采集结束........"
);
log
.
info
(
"微博话题采集结束........"
);
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
View file @
cb6bcd76
...
@@ -4,6 +4,7 @@ import java.util.Date;
...
@@ -4,6 +4,7 @@ import java.util.Date;
import
java.util.List
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
java.util.concurrent.TimeUnit
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
...
@@ -14,10 +15,9 @@ import com.zhiwei.searchhotcrawler.crawler.ZhihuHotSearchCrawler;
...
@@ -14,10 +15,9 @@ import com.zhiwei.searchhotcrawler.crawler.ZhihuHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
ZhihuHotSearchRun
extends
Thread
{
public
class
ZhihuHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
ZhihuHotSearchRun
.
class
);
@Override
@Override
public
void
run
()
{
public
void
run
()
{
boolean
f
=
true
;
boolean
f
=
true
;
...
@@ -32,24 +32,22 @@ public class ZhihuHotSearchRun extends Thread{
...
@@ -32,24 +32,22 @@ public class ZhihuHotSearchRun extends Thread{
ZhiWeiTools
.
sleep
(
50
);
ZhiWeiTools
.
sleep
(
50
);
}
}
}
}
private
void
getHotList
()
{
private
void
getHotList
()
{
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
log
ger
.
info
(
"知乎话题采集开始...,当前线程名字:{}"
,
Thread
.
currentThread
().
getName
());
log
.
info
(
"知乎话题采集开始...,当前线程名字:{}"
,
Thread
.
currentThread
().
getName
());
List
<
HotSearchList
>
list
=
ZhihuHotSearchCrawler
.
getZhihuHotList
();
List
<
HotSearchList
>
list
=
ZhihuHotSearchCrawler
.
getZhihuHotList
();
List
<
HotSearchList
>
mobilelist
=
ZhihuHotSearchCrawler
.
getMobileZhihuHotList
();
List
<
HotSearchList
>
mobilelist
=
ZhihuHotSearchCrawler
.
getMobileZhihuHotList
();
list
.
addAll
(
mobilelist
);
list
.
addAll
(
mobilelist
);
log
ger
.
info
(
"{}, 知乎此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"{}, 知乎此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
for
(
HotSearchList
zhihuHotSearch
:
list
){
for
(
HotSearchList
zhihuHotSearch
:
list
){
DBObject
zhihu
=
new
BasicDBObject
();
DBObject
zhihu
=
new
BasicDBObject
();
zhihu
.
put
(
"_id"
,
zhihuHotSearch
.
getId
());
zhihu
.
put
(
"_id"
,
zhihuHotSearch
.
getId
());
zhihu
.
put
(
"name"
,
zhihuHotSearch
.
getName
());
zhihu
.
put
(
"name"
,
zhihuHotSearch
.
getName
());
zhihu
.
put
(
"url"
,
zhihuHotSearch
.
getUrl
());
zhihu
.
put
(
"url"
,
zhihuHotSearch
.
getUrl
());
zhihu
.
put
(
"count"
,
zhihuHotSearch
.
getCount
());
zhihu
.
put
(
"count"
,
zhihuHotSearch
.
getCount
());
zhihu
.
put
(
"hot"
,
zhihuHotSearch
.
is
Hot
());
zhihu
.
put
(
"hot"
,
zhihuHotSearch
.
get
Hot
());
zhihu
.
put
(
"day"
,
zhihuHotSearch
.
getDay
());
zhihu
.
put
(
"day"
,
zhihuHotSearch
.
getDay
());
zhihu
.
put
(
"time"
,
zhihuHotSearch
.
getTime
());
zhihu
.
put
(
"time"
,
zhihuHotSearch
.
getTime
());
zhihu
.
put
(
"changeCount"
,
0
);
zhihu
.
put
(
"changeCount"
,
0
);
...
@@ -57,7 +55,7 @@ public class ZhihuHotSearchRun extends Thread{
...
@@ -57,7 +55,7 @@ public class ZhihuHotSearchRun extends Thread{
zhihu
.
put
(
"type"
,
zhihuHotSearch
.
getType
());
zhihu
.
put
(
"type"
,
zhihuHotSearch
.
getType
());
hotSearchDAO
.
addHotSearch
(
zhihu
);
hotSearchDAO
.
addHotSearch
(
zhihu
);
}
}
log
ger
.
info
(
"知乎话题采集结束........"
);
log
.
info
(
"知乎话题采集结束........"
);
}
}
}
}
src/main/resources/db.properties
View file @
cb6bcd76
#mongoIp=202.107.192.94
#mongoIp=202.107.192.94
mongoIp
=
192.168.0.101
mongoIp
=
192.168.0.101
mongoPort
=
30000
mongoPort
=
30000
#mongoIp=192.168.0.81
#mongoIp=192.168.0.81
#mongoPort=27017
#mongoPort=27017
db.username
=
datapush
db.username
=
searchhotcrawleruser
db.paasword
=
4d8ce5c42073c
db.paasword
=
searchhotcrawler1q2w3e4r
db.certifiedDB
=
admin
db.certifiedDB
=
admin
dbName
=
hot_search_list
dbName
=
hot_search_list
searchCollName
=
hot_search_list
searchCollName
=
hot_search_list
topicCollName
=
topic_list
topicCollName
=
topic_list
collWechatUserName
=
wechat_user
collWechatUserName
=
wechat_user
\ No newline at end of file
src/main/resources/proxyip.properties
View file @
cb6bcd76
registry
=
zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182
registry
=
zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182
group
=
hangzhou
group
=
hangzhou
########################################################
########################################################
#registry=zookeeper://192.168.0.
36
:2181
#registry=zookeeper://192.168.0.
11:2181?backup=192.168.0.30:2181,192.168.0.35
:2181
#
group
=
local
#
group
=
local
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment