Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
cb6bcd76
Commit
cb6bcd76
authored
Mar 06, 2020
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加微博话题采集,并添加lombok
parent
a9966f9d
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
30 changed files
with
729 additions
and
500 deletions
+729
-500
pom.xml
+7
-2
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
+63
-121
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboSuperTopic.java
+90
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboTopic.java
+0
-158
src/main/java/com/zhiwei/searchhotcrawler/cache/CacheListener.java
+6
-3
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
+7
-7
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
+3
-2
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
+6
-6
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+11
-10
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboSuperTopicCrawler.java
+16
-15
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
+219
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
+4
-4
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchListDAO.java
+21
-5
src/main/java/com/zhiwei/searchhotcrawler/dao/WechatUserDao.java
+4
-3
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiboSuperTopicDAO.java
+19
-5
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
+6
-11
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchListTest.java
+106
-71
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
+6
-5
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
+5
-5
src/main/java/com/zhiwei/searchhotcrawler/timer/SendWeiboHotSearchRun.java
+6
-5
src/main/java/com/zhiwei/searchhotcrawler/timer/SendZhihuHotSearchRun.java
+6
-5
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
+5
-4
src/main/java/com/zhiwei/searchhotcrawler/timer/UpdateWechatUserRun.java
+7
-6
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboHotSearchRun.java
+6
-6
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboSuperTopicRun.java
+63
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboTopicRun.java
+25
-29
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
+6
-8
src/main/resources/db.properties
+2
-2
src/main/resources/proxyip.properties
+2
-1
No files found.
pom.xml
View file @
cb6bcd76
...
...
@@ -38,12 +38,17 @@
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.1.
5-SNAPSHOT
</version>
<version>
0.1.
6-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.5.5.6-SNAPSHOT
</version>
<version>
0.6.0.4-RELEASE
</version>
</dependency>
<dependency>
<groupId>
org.projectlombok
</groupId>
<artifactId>
lombok
</artifactId>
<version>
1.18.8
</version>
</dependency>
</dependencies>
...
...
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
View file @
cb6bcd76
...
...
@@ -10,35 +10,79 @@ import java.io.Serializable;
import
java.util.Date
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
lombok.Data
;
import
lombok.ToString
;
@Data
@ToString
public
class
HotSearchList
implements
Serializable
{
private
static
final
long
serialVersionUID
=
2076919584659821600L
;
private
String
id
;
//主键
/**
* 主键
*/
private
String
id
;
private
String
url
;
//消息链接
/**
* 消息链接
*/
private
String
url
;
private
String
name
;
//热搜关键词
/**
* 热搜关键词
*/
private
String
name
;
private
Integer
count
;
//时时热搜量
/**
* 热搜或话题导语
*/
private
String
topicLead
;
private
Boolean
hot
;
//状态(true 为热搜; false为时时上升)
/**
* 时时热搜量
*/
private
Integer
count
;
private
String
day
;
//天
/**
* 状态(true 为热搜; false为时时上升)
*/
private
Boolean
hot
;
private
Date
time
;
//时间
/**
* 天
*/
private
String
day
;
private
Integer
changeCount
;
//据上分钟变化量
/**
* 时间
*/
private
Date
time
;
private
Integer
rank
;
//排名
/**
* 据上分钟变化量
*/
private
Integer
changeCount
;
private
String
type
;
//分类
/**
* 排名
*/
private
Integer
rank
;
private
String
icon
;
//热搜类型
/**
* 分类
*/
private
String
type
;
/**
* 热搜类型
*/
private
String
icon
;
/**
* 话题讨论量
*/
private
Integer
commentCount
;
public
HotSearchList
(){}
...
...
@@ -69,120 +113,18 @@ public class HotSearchList implements Serializable{
}
@Override
public
String
toString
(){
return
"new HotSearchList["
+
"id = "
+
id
+
", url = "
+
url
+
", name = "
+
name
+
", count = "
+
count
+
", time = "
+
time
+
", hot = "
+
hot
+
", rank = "
+
rank
+
", day = "
+
day
+
", changeCount = "
+
changeCount
+
", type = "
+
type
+
", icon = "
+
icon
+
"]"
;
}
public
String
getId
()
{
return
id
;
}
public
void
setId
(
String
id
)
{
this
.
id
=
id
;
}
public
String
getUrl
()
{
return
url
;
}
public
void
setUrl
(
String
url
)
{
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Integer
rank
,
String
type
,
Integer
commentCount
,
String
topicLead
){
this
.
id
=
name
+
"_"
+
new
Date
().
getTime
();
this
.
url
=
url
;
}
public
String
getName
()
{
return
name
;
}
public
void
setName
(
String
name
)
{
this
.
name
=
name
;
}
public
Integer
getCount
()
{
return
count
;
}
public
void
setCount
(
Integer
count
)
{
this
.
count
=
count
;
}
public
Date
getTime
()
{
return
time
;
}
public
void
setTime
(
Date
time
)
{
this
.
time
=
time
;
}
public
Integer
getChangeCount
()
{
return
changeCount
;
}
public
void
setChangeCount
(
Integer
changeCount
)
{
this
.
changeCount
=
changeCount
;
}
public
static
long
getSerialversionuid
()
{
return
serialVersionUID
;
}
public
Boolean
isHot
()
{
return
hot
;
}
public
void
setHot
(
Boolean
hot
)
{
this
.
hot
=
hot
;
}
public
Boolean
getHot
()
{
return
hot
;
}
public
String
getIcon
()
{
return
icon
;
}
public
void
setIcon
(
String
icon
)
{
this
.
icon
=
icon
;
}
public
String
getDay
()
{
return
day
;
}
public
void
setDay
(
String
day
)
{
this
.
day
=
day
;
}
public
Integer
getRank
()
{
return
rank
;
}
public
void
setRank
(
Integer
rank
)
{
this
.
hot
=
true
;
this
.
rank
=
rank
;
}
public
String
getType
()
{
return
type
;
}
public
void
setType
(
String
type
)
{
this
.
time
=
new
Date
();
this
.
day
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
this
.
type
=
type
;
this
.
commentCount
=
commentCount
;
this
.
topicLead
=
topicLead
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
cb6bcd76
...
...
@@ -5,5 +5,6 @@ public enum HotSearchType {
微博热搜
,
知乎热搜
,
抖音热搜
,
搜狗微信热搜
搜狗微信热搜
,
微博话题
}
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboSuperTopic.java
0 → 100644
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
bean
;
import
java.util.Date
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
lombok.Data
;
import
lombok.ToString
;
/**
 * One entry of a Weibo super-topic ranking list.
 *
 * <p>Lombok's {@code @Data} generates the getters, setters, {@code equals},
 * {@code hashCode} and {@code toString} for every field.
 *
 * @author Bewilder ZW
 * @date 2019-09-27 15:33:08
 */
@Data
@ToString
public class WeiboSuperTopic {

    /** Primary key, built by the full constructor as {@code topicName_type_day}. */
    private String id;

    // NOTE(review): the fields below are public in the original; they are kept
    // public so that any direct field access elsewhere keeps compiling.

    /** Topic link. */
    public String url;

    /** Topic name. */
    public String topicName;

    /** Topic rank within the list. */
    public Integer rank;

    /** Topic influence score (kept as text). */
    public String score;

    /** Topic follower count (kept as text). */
    public String fensi;

    /** Topic read count (kept as text). */
    public String readNum;

    /** Topic post count (kept as text). */
    public String postNum;

    /** Ranking-list type. */
    public String type;

    /** Creation day formatted as {@code yyyy-MM-dd}. */
    private String day;

    /** Creation timestamp. */
    private Date time;

    /** Creates an empty instance; every field is left {@code null}. */
    public WeiboSuperTopic() {
    }

    /**
     * Creates an entry stamped with the current time.
     *
     * @param url       topic link
     * @param topicName topic name
     * @param rank      topic rank within the list
     * @param score     topic influence score
     * @param fensi     topic follower count
     * @param type      ranking-list type
     */
    public WeiboSuperTopic(String url, String topicName, Integer rank, String score, String fensi, String type) {
        this.url = url;
        this.topicName = topicName;
        this.rank = rank;
        this.score = score;
        this.fensi = fensi;
        this.type = type;
        // Capture "now" once so that time, day and the day-based id always agree,
        // even when the object is created right around midnight.
        Date now = new Date();
        this.time = now;
        this.day = TimeParse.dateFormartString(now, "yyyy-MM-dd");
        this.id = topicName + "_" + type + "_" + day;
    }
}
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboTopic.java
deleted
100644 → 0
View file @
a9966f9d
package
com
.
zhiwei
.
searchhotcrawler
.
bean
;
import
java.util.Date
;
import
com.zhiwei.tools.timeparse.TimeParse
;
/**
 * One entry of a Weibo topic ranking list.
 *
 * @author Bewilder ZW
 * @date 2019-09-27 15:33:08
 */
public class WeiboTopic {

    /** Primary key, built by the full constructor as {@code topicName_type_day}. */
    private String id;

    // NOTE(review): the fields below are public in the original; they are kept
    // public so that any direct field access elsewhere keeps compiling.

    /** Topic link. */
    public String url;

    /** Topic name. */
    public String topicName;

    /** Topic rank within the list. */
    public Integer rank;

    /** Topic influence score (kept as text). */
    public String score;

    /** Topic follower count (kept as text). */
    public String fensi;

    /** Topic read count (kept as text). */
    public String readNum;

    /** Topic post count (kept as text). */
    public String postNum;

    /** Ranking-list type. */
    public String type;

    /** Creation day formatted as {@code yyyy-MM-dd}. */
    private String day;

    /** Creation timestamp. */
    private Date time;

    /** Creates an empty instance; every field is left {@code null}. */
    public WeiboTopic() {
    }

    /**
     * Creates an entry stamped with the current time.
     *
     * @param url       topic link
     * @param topicName topic name
     * @param rank      topic rank within the list
     * @param score     topic influence score
     * @param fensi     topic follower count
     * @param type      ranking-list type
     */
    public WeiboTopic(String url, String topicName, Integer rank, String score, String fensi, String type) {
        this.url = url;
        this.topicName = topicName;
        this.rank = rank;
        this.score = score;
        this.fensi = fensi;
        this.type = type;
        // Capture "now" once so that time, day and the day-based id always agree,
        // even when the object is created right around midnight.
        Date now = new Date();
        this.time = now;
        this.day = TimeParse.dateFormartString(now, "yyyy-MM-dd");
        this.id = topicName + "_" + type + "_" + day;
    }

    /** Renders the descriptive fields; id, day and time are not included. */
    @Override
    public String toString() {
        return "new WeiboTopic[" + "topicName = " + topicName + ", rank = " + rank + ", score = " + score
                + ", fensi = " + fensi + ", type = " + type + ", readNum = " + readNum
                + ", postNum = " + postNum + ", url = " + url + "]";
    }

    /** @return the topic link */
    public String getUrl() {
        return url;
    }

    /** @return the topic name */
    public String getTopicName() {
        return topicName;
    }

    /** @return the topic rank */
    public Integer getRank() {
        return rank;
    }

    /** @return the topic influence score */
    public String getScore() {
        return score;
    }

    /** @return the topic follower count */
    public String getFensi() {
        return fensi;
    }

    /** @return the topic read count */
    public String getReadNum() {
        return readNum;
    }

    /** @return the topic post count */
    public String getPostNum() {
        return postNum;
    }

    /** @return the ranking-list type */
    public String getType() {
        return type;
    }

    /** @param url the topic link */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @param topicName the topic name */
    public void setTopicName(String topicName) {
        this.topicName = topicName;
    }

    /** @param rank the topic rank */
    public void setRank(Integer rank) {
        this.rank = rank;
    }

    /** @param score the topic influence score */
    public void setScore(String score) {
        this.score = score;
    }

    /** @param fensi the topic follower count */
    public void setFensi(String fensi) {
        this.fensi = fensi;
    }

    /** @param readNum the topic read count */
    public void setReadNum(String readNum) {
        this.readNum = readNum;
    }

    /** @param postNum the topic post count */
    public void setPostNum(String postNum) {
        this.postNum = postNum;
    }

    /** @param type the ranking-list type */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the primary key */
    public String getId() {
        return id;
    }

    /** @return the creation day as {@code yyyy-MM-dd} */
    public String getDay() {
        return day;
    }

    /** @return the creation timestamp */
    public Date getTime() {
        return time;
    }

    /** @param id the primary key */
    public void setId(String id) {
        this.id = id;
    }

    /** @param day the creation day as {@code yyyy-MM-dd} */
    public void setDay(String day) {
        this.day = day;
    }

    /** @param time the creation timestamp */
    public void setTime(Date time) {
        this.time = time;
    }
}
src/main/java/com/zhiwei/searchhotcrawler/cache/CacheListener.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
cache
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
CacheListener
{
Logger
logger
=
LoggerFactory
.
getLogger
(
CacheListener
.
class
);
/**
* 开启缓存监听
*/
public
void
startListen
()
{
new
Thread
(){
public
void
run
()
{
...
...
@@ -17,7 +20,7 @@ public class CacheListener {
for
(
String
key
:
CacheManager
.
getAllKeys
())
{
if
(
CacheManager
.
isTimeOut
(
key
))
{
CacheManager
.
clearByKey
(
key
);
log
ger
.
info
(
key
+
"缓存被清除"
);
log
.
info
(
key
+
"缓存被清除"
);
}
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
View file @
cb6bcd76
...
...
@@ -5,6 +5,7 @@ import java.util.Collections;
import
java.util.List
;
import
java.util.Objects
;
import
lombok.extern.log4j.Log4j2
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -24,16 +25,15 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
* @author hero
* @date 2019年7月10日 上午10:54:31
*/
@Log4j2
public
class
BaiDuHotSearchCrawler
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaiDuHotSearchCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: BaiDuHotSearchTest
* @author hero
* @Description: TODO(PC端百度风云榜采集)
* @param 设定文件
* @Description: PC端百度风云榜采集
* @return void 返回类型
*/
public
static
List
<
HotSearchList
>
baiduHotSearch
()
{
...
...
@@ -43,10 +43,10 @@ public class BaiDuHotSearchCrawler {
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"mainBody"
))
{
return
ansysData
(
htmlBody
);
}
else
{
log
ger
.
info
(
"解析百度风云榜时出现解析错误,页面结构有问题"
);
log
.
info
(
"解析百度风云榜时出现解析错误,页面结构有问题"
);
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
e
);
log
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
e
);
}
return
Collections
.
emptyList
();
}
...
...
@@ -101,12 +101,12 @@ public class BaiDuHotSearchCrawler {
list
.
add
(
hotSearch
);
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析百度风云榜时出现解析错误"
,
e
);
log
.
error
(
"解析百度风云榜时出现解析错误"
,
e
);
}
});
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析百度风云榜时出现解析错误,数据不是json结构"
,
e
);
log
.
error
(
"解析百度风云榜时出现解析错误,数据不是json结构"
,
e
);
}
return
list
;
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
View file @
cb6bcd76
...
...
@@ -4,6 +4,7 @@ import java.io.IOException;
import
java.util.ArrayList
;
import
java.util.List
;
import
lombok.extern.log4j.Log4j2
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -24,9 +25,9 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
* @author win 10
* @date:2019年07月11日 上午10:26:21
*/
@Log4j2
public
class
DouyinHotSearchCrawler
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
ZhihuHotSearchCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
...
...
@@ -66,7 +67,7 @@ public class DouyinHotSearchCrawler {
}
}
}
catch
(
IOException
e
)
{
log
ger
.
debug
(
"获取抖音热搜榜时出现问题:{}"
,
e
);
log
.
debug
(
"获取抖音热搜榜时出现问题:{}"
,
e
);
}
return
list
;
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
View file @
cb6bcd76
...
...
@@ -6,6 +6,7 @@ import java.util.List;
import
java.util.Map
;
import
java.util.Objects
;
import
lombok.extern.log4j.Log4j2
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -27,16 +28,15 @@ import com.zhiwei.tools.httpclient.HeaderTool;
* @author hero
* @date 2019年7月10日 上午10:54:31
*/
@Log4j2
public
class
SougoHotSearchCrawler
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SougoHotSearchCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: SougoHotSearchTest
* @author hero
* @Description: TODO(PC端搜狗微信关键词采集)
* @param 设定文件
* @return void 返回类型
*/
public
static
List
<
HotSearchList
>
sougoHotSearch
()
{
...
...
@@ -76,19 +76,19 @@ public class SougoHotSearchCrawler {
list
.
add
(
hotSearch
);
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析搜狗微信时出现解析错误"
,
e
);
log
.
error
(
"解析搜狗微信时出现解析错误"
,
e
);
}
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析搜狗微信时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
log
.
error
(
"解析搜狗微信时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
return
Collections
.
emptyList
();
}
}
else
{
log
ger
.
info
(
"解析搜狗微信时出现解析错误,页面结构有问题"
);
log
.
info
(
"解析搜狗微信时出现解析错误,页面结构有问题"
);
}
break
;
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
e
);
log
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
e
);
}
}
return
list
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
cb6bcd76
...
...
@@ -7,6 +7,7 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
lombok.extern.log4j.Log4j2
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -27,13 +28,13 @@ import com.zhiwei.tools.tools.URLCodeUtil;
/**
* @ClassName: WeiboHotSearch
* @Description:
TODO(微博实时热搜采集)
* @Description:
微博实时热搜采集
* @author hero
* @date 2017年9月15日 上午10:54:31
*/
@Log4j2
public
class
WeiboHotSearchCrawler
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WeiboHotSearchCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: weiboHotSearchTest
...
...
@@ -70,18 +71,18 @@ public class WeiboHotSearchCrawler {
list
.
add
(
hotSearch
);
}
catch
(
Exception
e
)
{
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
log
ger
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
log
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
continue
;
}
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
log
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
return
null
;
}
}
else
{
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
log
ger
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
log
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
}
break
;
}
catch
(
Exception
e
)
{
...
...
@@ -138,25 +139,25 @@ public class WeiboHotSearchCrawler {
}
String
id
=
"http://s.weibo.com/weibo/"
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&Refer=top"
;
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微博热搜
.
name
(),
icon
);
log
ger
.
info
(
"采集到的数据:::{}"
,
hotSearch
);
log
.
info
(
"采集到的数据:::{}"
,
hotSearch
);
result
.
add
(
hotSearch
);
rank
++;
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
log
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
continue
;
}
}
return
result
;
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
);
log
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
);
return
Collections
.
emptyList
();
}
}
else
{
log
ger
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
log
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
}
}
catch
(
IOException
e1
)
{
log
ger
.
error
(
"解析微博时时热搜时出现连接失败"
,
e1
);
log
.
error
(
"解析微博时时热搜时出现连接失败"
,
e1
);
return
Collections
.
emptyList
();
}
return
Collections
.
emptyList
();
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/Weibo
Huati
Crawler.java
→
src/main/java/com/zhiwei/searchhotcrawler/crawler/Weibo
SuperTopic
Crawler.java
View file @
cb6bcd76
...
...
@@ -8,6 +8,8 @@ import java.util.Map;
import
java.util.Map.Entry
;
import
java.util.Objects
;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
lombok.extern.log4j.Log4j2
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -17,18 +19,17 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.WeiboTopic
;
/**
*
* @ClassName: Weibo
HuatiCrawler
* @Description: 微博
话题榜单采集(明星)
* @ClassName: Weibo
SuperTopicCrawler
* @Description: 微博
超话榜单采集(明星)
* @author Bewilder ZW
* @date 2019年9月27日 下午3:01:34
*/
public
class
WeiboHuatiCrawler
{
@Log4j2
public
class
WeiboSuperTopicCrawler
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WeiboHuatiCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
Map
<
String
,
String
>
headMap
=
new
HashMap
<>();
...
...
@@ -44,13 +45,13 @@ public class WeiboHuatiCrawler {
* 开始采集明星话题
* @return void
*/
public
static
List
<
Weibo
Topic
>
startCrawler
()
{
public
static
List
<
Weibo
SuperTopic
>
startCrawler
()
{
Map
<
String
,
String
>
urlMap
=
new
HashMap
<>();
urlMap
.
put
(
"明星"
,
"https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=star&from=&wm="
);
urlMap
.
put
(
"明星潜力"
,
"https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=potential&from=&wm="
);
urlMap
.
put
(
"明星上升"
,
"https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=up&from=&wm="
);
List
<
Weibo
Topic
>
topicList
=
new
ArrayList
<>();
List
<
Weibo
SuperTopic
>
topicList
=
new
ArrayList
<>();
for
(
Entry
<
String
,
String
>
entry
:
urlMap
.
entrySet
())
{
String
url
=
entry
.
getValue
();
...
...
@@ -66,10 +67,10 @@ public class WeiboHuatiCrawler {
topicList
.
addAll
(
parseTopicRankHtml
(
page
,
htmlBody
,
type
));
break
;
}
else
{
log
ger
.
error
(
"获取榜单列表页面时数据格式错误,页面为:{}"
,
htmlBody
);
log
.
error
(
"获取榜单列表页面时数据格式错误,页面为:{}"
,
htmlBody
);
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"获取榜单列表页面时出现错误,错误为:{}"
,
e
);
log
.
error
(
"获取榜单列表页面时出现错误,错误为:{}"
,
e
);
continue
;
}
}
...
...
@@ -87,13 +88,13 @@ public class WeiboHuatiCrawler {
* @param type
* @return void
*/
private
static
List
<
Weibo
Topic
>
parseTopicRankHtml
(
int
page
,
String
htmlBody
,
String
type
)
{
private
static
List
<
Weibo
SuperTopic
>
parseTopicRankHtml
(
int
page
,
String
htmlBody
,
String
type
)
{
try
{
JSONArray
list
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONArray
(
"list"
);
if
(
Objects
.
nonNull
(
list
)
&&
!
list
.
isEmpty
())
{
page
=
(
page
-
1
)*
20
;
List
<
Weibo
Topic
>
topicList
=
new
ArrayList
<>();
List
<
Weibo
SuperTopic
>
topicList
=
new
ArrayList
<>();
Integer
toprank
=
null
;
String
topicName
=
null
;
String
id
=
null
;
...
...
@@ -111,7 +112,7 @@ public class WeiboHuatiCrawler {
fensi
=
desc1
.
replaceAll
(
".*影响力|粉丝"
,
""
).
trim
();
url
=
data
.
getString
(
"link"
);
Weibo
Topic
topic
=
new
WeiboTopic
(
url
,
topicName
,
toprank
,
score
,
fensi
,
type
);
Weibo
SuperTopic
topic
=
new
WeiboSuperTopic
(
url
,
topicName
,
toprank
,
score
,
fensi
,
type
);
topic
=
getTopicInfo
(
id
,
topic
);
System
.
out
.
println
(
"topic====="
+
topic
);
topicList
.
add
(
topic
);
...
...
@@ -119,7 +120,7 @@ public class WeiboHuatiCrawler {
return
topicList
;
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析榜单列表页面时出现错误,错误为:{}"
,
e
);
log
.
error
(
"解析榜单列表页面时出现错误,错误为:{}"
,
e
);
}
return
Collections
.
emptyList
();
}
...
...
@@ -134,7 +135,7 @@ public class WeiboHuatiCrawler {
* @return
* @return WeiboTopic
*/
private
static
Weibo
Topic
getTopicInfo
(
String
id
,
WeiboTopic
topic
)
{
private
static
Weibo
SuperTopic
getTopicInfo
(
String
id
,
WeiboSuperTopic
topic
)
{
for
(
int
retryTimes
=
1
;
retryTimes
<=
3
;
retryTimes
++)
{
try
{
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid="
+
id
;
...
...
@@ -151,7 +152,7 @@ public class WeiboHuatiCrawler {
}
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"解析榜单详情页面时出现错误,错误为:{}"
,
e
);
log
.
error
(
"解析榜单详情页面时出现错误,错误为:{}"
,
e
);
}
}
return
topic
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
0 → 100644
View file @
cb6bcd76
This diff is collapsed.
Click to expand it.
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
View file @
cb6bcd76
...
...
@@ -5,6 +5,7 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.Map
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -24,15 +25,14 @@ import com.zhiwei.tools.tools.URLCodeUtil;
* @author hero
* @date 2017年9月15日 上午10:54:31
*/
@Log4j2
public
class
ZhihuHotSearchCrawler
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
ZhihuHotSearchCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: getZhihuHotList
* @author hero
* @Description: 知乎热搜采集程序
* @param 设定文件
* @return void 返回类型
*/
public
static
List
<
HotSearchList
>
getZhihuHotList
(){
...
...
@@ -65,7 +65,7 @@ public class ZhihuHotSearchCrawler {
}
}
}
catch
(
IOException
e
)
{
log
ger
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
return
list
;
}
return
list
;
...
...
@@ -106,7 +106,7 @@ public class ZhihuHotSearchCrawler {
}
}
}
catch
(
IOException
e
)
{
log
ger
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
return
list
;
}
return
list
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchListDAO.java
View file @
cb6bcd76
...
...
@@ -4,7 +4,9 @@ package com.zhiwei.searchhotcrawler.dao;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Objects
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -17,8 +19,8 @@ import com.zhiwei.searchhotcrawler.config.Config;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.tools.timeparse.TimeParse
;
@Log4j2
public
class
HotSearchListDAO
extends
MongoDBTemplate
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
HotSearchListDAO
.
class
);
public
HotSearchListDAO
()
{
super
();
...
...
@@ -28,6 +30,19 @@ public class HotSearchListDAO extends MongoDBTemplate{
String
month
=
time
.
substring
(
5
,
7
);
String
collName
=
Config
.
searchCollName
+
year
+
"_"
+
month
;
super
.
setCollName
(
collName
);
//给数据表创建索引
createIndex
();
}
/**
* 初次创建表及创建相应的索引
*/
private
void
createIndex
(){
List
<
DBObject
>
indexList
=
this
.
getReadColl
().
getIndexInfo
();
if
(
Objects
.
isNull
(
indexList
)
&&
indexList
.
isEmpty
()){
DBObject
countIndexDoc
=
new
BasicDBObject
();
countIndexDoc
.
put
(
"count"
,
-
1
);
DBObject
timeIndexDoc
=
new
BasicDBObject
();
...
...
@@ -48,6 +63,7 @@ public class HotSearchListDAO extends MongoDBTemplate{
e
.
printStackTrace
();
}
}
}
/**
* 添加数据入库
...
...
@@ -57,7 +73,7 @@ public class HotSearchListDAO extends MongoDBTemplate{
try
{
this
.
getReadColl
().
insert
(
list
);
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
}
...
...
@@ -65,7 +81,7 @@ public class HotSearchListDAO extends MongoDBTemplate{
try
{
this
.
getReadColl
().
insert
(
doc
);
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
}
...
...
@@ -94,7 +110,7 @@ public class HotSearchListDAO extends MongoDBTemplate{
}
cur
.
close
();
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
return
result
;
}
return
result
;
...
...
@@ -128,7 +144,7 @@ public class HotSearchListDAO extends MongoDBTemplate{
}
cur
.
close
();
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
return
list
;
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/WechatUserDao.java
View file @
cb6bcd76
...
...
@@ -3,6 +3,7 @@ package com.zhiwei.searchhotcrawler.dao;
import
java.util.Collections
;
import
java.util.List
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -12,8 +13,8 @@ import com.zhiwei.searchhotcrawler.config.Config;
import
com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
@Log4j2
public
class
WechatUserDao
extends
MongoDBTemplate
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaiDuHotSearchCrawler
.
class
);
public
WechatUserDao
()
{
super
();
...
...
@@ -39,7 +40,7 @@ public class WechatUserDao extends MongoDBTemplate{
this
.
getReadColl
().
save
(
doc
);
break
;
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
}
}
...
...
@@ -61,7 +62,7 @@ public class WechatUserDao extends MongoDBTemplate{
return
(
List
<
String
>)
doc
.
get
(
"user"
);
}
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
return
Collections
.
emptyList
();
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiboTopicDAO.java
→
src/main/java/com/zhiwei/searchhotcrawler/dao/Weibo
Super
TopicDAO.java
View file @
cb6bcd76
...
...
@@ -3,7 +3,9 @@ package com.zhiwei.searchhotcrawler.dao;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Objects
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -13,10 +15,10 @@ import com.zhiwei.searchhotcrawler.config.Config;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.tools.timeparse.TimeParse
;
public
class
WeiboTopicDAO
extends
MongoDBTemplate
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WeiboTopicDAO
.
class
);
@Log4j2
public
class
WeiboSuperTopicDAO
extends
MongoDBTemplate
{
public
Weibo
TopicDAO
()
{
public
Weibo
SuperTopicDAO
()
{
super
();
super
.
setDbName
(
Config
.
dbName
);
String
time
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
...
...
@@ -25,6 +27,16 @@ public class WeiboTopicDAO extends MongoDBTemplate{
String
collName
=
Config
.
topicCollName
+
year
+
"_"
+
month
;
super
.
setCollName
(
collName
);
createIndex
();
}
/**
* 初次创建表及创建相应的索引
*/
private
void
createIndex
(){
List
<
DBObject
>
indexList
=
this
.
getReadColl
().
getIndexInfo
();
if
(
Objects
.
isNull
(
indexList
)
&&
indexList
.
isEmpty
()){
DBObject
countIndexDoc
=
new
BasicDBObject
();
countIndexDoc
.
put
(
"score_num"
,
-
1
);
DBObject
timeIndexDoc
=
new
BasicDBObject
();
...
...
@@ -45,6 +57,8 @@ public class WeiboTopicDAO extends MongoDBTemplate{
e
.
printStackTrace
();
}
}
}
/**
* 添加数据入库
...
...
@@ -54,7 +68,7 @@ public class WeiboTopicDAO extends MongoDBTemplate{
try
{
this
.
getReadColl
().
insert
(
list
);
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
}
...
...
@@ -62,7 +76,7 @@ public class WeiboTopicDAO extends MongoDBTemplate{
try
{
this
.
getReadColl
().
insert
(
doc
);
}
catch
(
Exception
e
)
{
log
ger
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
run
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.cache.CacheListener
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.DouyinHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.SendWeiboHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.SendZhihuHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.SougoHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.UpdateWechatUserRun
;
import
com.zhiwei.searchhotcrawler.timer.WeiboHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.WeiboTopicRun
;
import
com.zhiwei.searchhotcrawler.timer.ZhihuHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.*
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
java.util.concurrent.Executors
;
...
...
@@ -24,7 +16,9 @@ public class HotSearchRun {
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
ProxyConfig
.
registry
,
ProxyConfig
.
group
,
GroupType
.
PROVIDER
,
10000013
);
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
new
UpdateWechatUserRun
().
start
();
ZhiWeiTools
.
sleep
(
10000
);
...
...
@@ -51,6 +45,7 @@ public class HotSearchRun {
new
SougoHotSearchRun
().
start
();
new
DouyinHotSearchRun
().
start
();
new
ZhihuHotSearchRun
().
start
();
new
WeiboSuperTopicRun
().
start
();
new
WeiboTopicRun
().
start
();
//推送程序启动
new
SendWeiboHotSearchRun
().
start
();
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchListTest.java
View file @
cb6bcd76
...
...
@@ -16,90 +16,125 @@ import com.mongodb.MongoClient;
import
com.mongodb.MongoCredential
;
import
com.mongodb.ServerAddress
;
import
com.mongodb.WriteResult
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
public
class
HotSearchListTest
{
public
static
void
main
(
String
[]
args
)
{
MongoCredential
credential
=
MongoCredential
.
createCredential
(
Config
.
userName
,
Config
.
authDB
,
Config
.
userPwd
.
toCharArray
());
ServerAddress
address
=
new
ServerAddress
(
Config
.
mongoIp
,
Config
.
mongoPort
);
Mongo
mongo
=
new
MongoClient
(
address
,
Arrays
.
asList
(
credential
));
DB
db
=
mongo
.
getDB
(
"hot_search_list"
);
DBCollection
coll
=
db
.
getCollection
(
"hot_search_list2019_09"
);
// MongoCredential credentialNew = MongoCredential.createCredential("datapush", "admin", "4d8ce5c42073c".toCharArray());
// ServerAddress addressNew = new ServerAddress(Config.mongoIp, Config.mongoPort);
// Mongo mongoNew = new MongoClient(address, Arrays.asList(credentialNew));
// DB dbNew = mongoNew.getDB("hot_search_list");
Map
<
String
,
String
>
timLine
=
TimeParse
.
getTimeMap
(
"2019-10-01 00:00:00"
,
"2019-10-09 23:59:59"
,
"dd"
,
1
);
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"zzw"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
String
url
=
"http://app.myzaker.com/news/app.php?f="
;
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
Elements
elements
=
Jsoup
.
parse
(
htmlBody
).
select
(
"div.titlebar>a"
);
for
(
Element
element
:
elements
){
String
lableUrl
=
"http://app.myzaker.com/news/app.php"
+
element
.
attr
(
"href"
);
System
.
out
.
println
(
"lableUrl========="
+
lableUrl
);
String
htmlBodyLable
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
lableUrl
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
Elements
elementsLable
=
Jsoup
.
parse
(
htmlBodyLable
).
select
(
"div#infinite_scroll>a"
);
for
(
Element
elementLable
:
elementsLable
){
System
.
out
.
println
(
elementLable
.
attr
(
"href"
)
+
"============="
+
elementLable
.
text
());
}
}
timLine
.
forEach
((
start
,
end
)
->{
}
catch
(
Exception
e
){
e
.
printStackTrace
();
}
String
year
=
end
.
substring
(
0
,
4
);
String
month
=
end
.
substring
(
5
,
7
);
Date
startDate
=
TimeParse
.
stringFormartDate
(
start
);
Date
endDate
=
TimeParse
.
stringFormartDate
(
end
);
String
collName
=
"hot_search_list"
+
year
+
"_"
+
month
;
System
.
out
.
println
(
"collName=========="
+
collName
);
// DBCollection collNew = dbNew.getCollection(collName);
// DBObject countIndexDoc = new BasicDBObject();
// countIndexDoc.put("count", -1);
// DBObject timeIndexDoc = new BasicDBObject();
// timeIndexDoc.put("time", -1);
// DBObject rankIndexDoc = new BasicDBObject();
// rankIndexDoc.put("rank", -1);
// DBObject nameIndexDoc = new BasicDBObject();
// nameIndexDoc.put("name", -1);
// DBObject typeIndexDoc = new BasicDBObject();
// typeIndexDoc.put("type", -1);
// try {
// collNew.createIndex(countIndexDoc, new BasicDBObject("name", "count_desc"));
// collNew.createIndex(timeIndexDoc, new BasicDBObject("name", "time_desc"));
// collNew.createIndex(rankIndexDoc, new BasicDBObject("name", "rank_desc"));
// collNew.createIndex(nameIndexDoc, new BasicDBObject("name", "name_desc"));
// collNew.createIndex(typeIndexDoc, new BasicDBObject("name", "type_desc"));
// } catch (Exception e) {
// e.printStackTrace();
// }
DBObject
query
=
new
BasicDBObject
(
new
BasicDBObject
(
"time"
,
new
BasicDBObject
(
"$gte"
,
startDate
).
append
(
"$lte"
,
endDate
)));
System
.
out
.
println
(
query
);
WriteResult
wr
=
coll
.
remove
(
query
);
System
.
out
.
println
(
"========"
+
wr
.
getN
());
// int i = 0;
// DBCursor cur = coll.remove(query);
// System.out.println(query +"======="+ cur.count());
// List<DBObject> dataList = new ArrayList<>();
// while(cur.hasNext()) {
// DBObject doc = cur.next();
// try {
//// collNew.save(doc);
// i++;
// coll.remove(doc);
// } catch (Exception e2) {
// e2.printStackTrace();
// }
// dataList.add(doc);
// }
// System.out.println(collName +"数据量大小" +dataList.size());
// cur.close();
// if(!dataList.isEmpty()) {
// try {
// collNew.insert(dataList);
// } catch (Exception e) {
// e.printStackTrace();
// }
// }
});
mongo
.
close
();
// MongoCredential credential = MongoCredential.createCredential(Config.userName, Config.authDB, Config.userPwd.toCharArray());
// ServerAddress address = new ServerAddress(Config.mongoIp, Config.mongoPort);
// Mongo mongo = new MongoClient(address, Arrays.asList(credential));
//
// DB db = mongo.getDB("hot_search_list");
// DBCollection coll = db.getCollection("hot_search_list2019_09");
//
//// MongoCredential credentialNew = MongoCredential.createCredential("datapush", "admin", "4d8ce5c42073c".toCharArray());
//// ServerAddress addressNew = new ServerAddress(Config.mongoIp, Config.mongoPort);
//// Mongo mongoNew = new MongoClient(address, Arrays.asList(credentialNew));
//// DB dbNew = mongoNew.getDB("hot_search_list");
//
// Map<String,String> timLine = TimeParse.getTimeMap("2019-10-01 00:00:00", "2019-10-09 23:59:59", "dd", 1);
//
// timLine.forEach((start, end) ->{
//
// String year = end.substring(0,4);
// String month = end.substring(5,7);
// Date startDate = TimeParse.stringFormartDate(start);
// Date endDate = TimeParse.stringFormartDate(end);
//
// String collName = "hot_search_list"+year+"_"+month;
// System.out.println("collName=========="+collName);
//// DBCollection collNew = dbNew.getCollection(collName);
//// DBObject countIndexDoc = new BasicDBObject();
//// countIndexDoc.put("count", -1);
//// DBObject timeIndexDoc = new BasicDBObject();
//// timeIndexDoc.put("time", -1);
//// DBObject rankIndexDoc = new BasicDBObject();
//// rankIndexDoc.put("rank", -1);
//// DBObject nameIndexDoc = new BasicDBObject();
//// nameIndexDoc.put("name", -1);
//// DBObject typeIndexDoc = new BasicDBObject();
//// typeIndexDoc.put("type", -1);
//// try {
//// collNew.createIndex(countIndexDoc, new BasicDBObject("name", "count_desc"));
//// collNew.createIndex(timeIndexDoc, new BasicDBObject("name", "time_desc"));
//// collNew.createIndex(rankIndexDoc, new BasicDBObject("name", "rank_desc"));
//// collNew.createIndex(nameIndexDoc, new BasicDBObject("name", "name_desc"));
//// collNew.createIndex(typeIndexDoc, new BasicDBObject("name", "type_desc"));
//// } catch (Exception e) {
//// e.printStackTrace();
//// }
//
// DBObject query = new BasicDBObject(new BasicDBObject("time",
// new BasicDBObject("$gte",startDate).append("$lte", endDate)));
// System.out.println(query);
// WriteResult wr = coll.remove(query);
// System.out.println("========"+wr.getN());
//// int i = 0;
//// DBCursor cur = coll.remove(query);
//// System.out.println(query +"======="+ cur.count());
//// List<DBObject> dataList = new ArrayList<>();
//// while(cur.hasNext()) {
//// DBObject doc = cur.next();
//// try {
////// collNew.save(doc);
//// i++;
//// coll.remove(doc);
//// } catch (Exception e2) {
//// e2.printStackTrace();
//// }
//// dataList.add(doc);
//// }
//// System.out.println(collName +"数据量大小" +dataList.size());
//// cur.close();
//// if(!dataList.isEmpty()) {
//// try {
//// collNew.insert(dataList);
//// } catch (Exception e) {
//// e.printStackTrace();
//// }
//// }
// });
// mongo.close();
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
View file @
cb6bcd76
...
...
@@ -6,6 +6,7 @@ import java.util.List;
import
java.util.Objects
;
import
java.util.concurrent.TimeUnit
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -16,10 +17,9 @@ import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
BaiduHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaiduHotSearchRun
.
class
);
@Override
public
void
run
()
{
boolean
f
=
true
;
...
...
@@ -37,10 +37,10 @@ public class BaiduHotSearchRun extends Thread{
private
void
getHotList
()
{
log
ger
.
info
(
"百度风云榜采集开始........"
);
log
.
info
(
"百度风云榜采集开始........"
);
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
List
<
HotSearchList
>
list
=
BaiDuHotSearchCrawler
.
baiduHotSearch
();
logger
.
info
(
"{}, 此轮百度风云榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"{}, 此轮百度风云榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
saveDataList
=
new
ArrayList
<>();
if
(
Objects
.
nonNull
(
list
)
&&
!
list
.
isEmpty
())
{
list
.
forEach
(
baiduHotSearch
->{
...
...
@@ -59,7 +59,7 @@ public class BaiduHotSearchRun extends Thread{
});
}
hotSearchDAO
.
addHotSearchList
(
saveDataList
);
log
ger
.
info
(
"百度风云榜采集结束........"
);
log
.
info
(
"百度风云榜采集结束........"
);
}
}
\ No newline at end of file
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
View file @
cb6bcd76
...
...
@@ -5,6 +5,7 @@ import java.util.Date;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -15,10 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.DouyinHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
DouyinHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
DouyinHotSearchRun
.
class
);
@Override
public
void
run
()
{
boolean
f
=
true
;
...
...
@@ -40,10 +40,10 @@ public class DouyinHotSearchRun extends Thread{
* @return void
*/
private
void
getHotList
()
{
log
ger
.
info
(
"抖音热搜榜采集开始........"
);
log
.
info
(
"抖音热搜榜采集开始........"
);
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
List
<
HotSearchList
>
list
=
DouyinHotSearchCrawler
.
getMobileDouyinHotList
();
log
ger
.
info
(
"{}, 抖音热搜榜此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"{}, 抖音热搜榜此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
HotSearchList
douyinHotSearch
:
list
){
int
changeCount
=
hotSearchDAO
.
getChangeCount
(
douyinHotSearch
);
...
...
@@ -60,7 +60,7 @@ public class DouyinHotSearchRun extends Thread{
data
.
add
(
douyin
);
hotSearchDAO
.
addHotSearch
(
douyin
);
}
log
ger
.
info
(
"抖音热搜榜采集结束........"
);
log
.
info
(
"抖音热搜榜采集结束........"
);
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/SendWeiboHotSearchRun.java
View file @
cb6bcd76
...
...
@@ -6,6 +6,7 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -20,17 +21,17 @@ import com.zhiwei.searchhotcrawler.util.WechatConstant;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
SendWeiboHotSearchRun
extends
Thread
{
private
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
private
static
WechatUserDao
wechatUserDao
=
new
WechatUserDao
();
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SendWeiboHotSearchRun
.
class
);
@Override
public
void
run
()
{
while
(
true
)
{
try
{
Calendar
calendar
=
Calendar
.
getInstance
();
int
hour
=
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
);
log
ger
.
info
(
"微博推送,当前系统时间为:"
+
hour
);
log
.
info
(
"微博推送,当前系统时间为:"
+
hour
);
if
(
hour
>
6
&&
hour
<
23
)
{
List
<
DBObject
>
list
=
hotSearchDAO
.
getHotOneHour
(
HotSearchType
.
微博热搜
.
name
());
if
(
list
!=
null
&&
!
list
.
isEmpty
())
{
...
...
@@ -41,14 +42,14 @@ public class SendWeiboHotSearchRun extends Thread {
sendTemplateByUserIds
(
title
,
time
,
url
);
}
}
else
{
log
ger
.
info
(
"微博最近一小时无数据"
);
log
.
info
(
"微博最近一小时无数据"
);
sendTemplateByUserIds
(
"最近一小时无数据"
,
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd HH:mm:ss"
),
null
);
}
}
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
}
catch
(
Exception
e
)
{
log
ger
.
debug
(
"微博热搜推送出现问题,问题为:::{}"
,
e
.
fillInStackTrace
());
log
.
debug
(
"微博热搜推送出现问题,问题为:::{}"
,
e
.
fillInStackTrace
());
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
continue
;
}
...
...
@@ -100,7 +101,7 @@ public class SendWeiboHotSearchRun extends Thread {
WechatCodeUtil
.
sendDataJson
(
templateJson
);
}
}
else
{
log
ger
.
info
(
"拉取微博用户列表失败"
);
log
.
info
(
"拉取微博用户列表失败"
);
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/SendZhihuHotSearchRun.java
View file @
cb6bcd76
...
...
@@ -6,6 +6,7 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -20,10 +21,10 @@ import com.zhiwei.searchhotcrawler.util.WechatConstant;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
SendZhihuHotSearchRun
extends
Thread
{
private
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
private
static
WechatUserDao
wechatUserDao
=
new
WechatUserDao
();
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SendZhihuHotSearchRun
.
class
);
@Override
public
void
run
()
{
...
...
@@ -31,7 +32,7 @@ public class SendZhihuHotSearchRun extends Thread{
try
{
Calendar
calendar
=
Calendar
.
getInstance
();
int
hour
=
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
);
log
ger
.
info
(
"知乎推送,当前系统时间为:"
+
hour
);
log
.
info
(
"知乎推送,当前系统时间为:"
+
hour
);
if
(
hour
>
6
&&
hour
<
23
){
List
<
DBObject
>
list
=
hotSearchDAO
.
getHotOneHour
(
HotSearchType
.
知乎热搜
.
name
());
if
(
list
!=
null
&&
!
list
.
isEmpty
()){
...
...
@@ -44,13 +45,13 @@ public class SendZhihuHotSearchRun extends Thread{
}
}
}
else
{
log
ger
.
info
(
"知乎最近一小时无数据"
);
log
.
info
(
"知乎最近一小时无数据"
);
sendTemplateByUserIds
(
"最近一小时无数据"
,
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd HH:mm:ss"
),
null
);
}
}
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
}
catch
(
Exception
e
)
{
log
ger
.
debug
(
"知乎热搜推送出现问题,问题为:::{}"
,
e
.
fillInStackTrace
());
log
.
debug
(
"知乎热搜推送出现问题,问题为:::{}"
,
e
.
fillInStackTrace
());
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
}
}
...
...
@@ -101,7 +102,7 @@ public class SendZhihuHotSearchRun extends Thread{
WechatCodeUtil
.
sendDataJson
(
templateJson
);
}
}
else
{
log
ger
.
info
(
"知乎推送拉取用户列表失败"
);
log
.
info
(
"知乎推送拉取用户列表失败"
);
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
View file @
cb6bcd76
...
...
@@ -5,6 +5,7 @@ import java.util.Date;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -15,8 +16,8 @@ import com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
SougoHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SougoHotSearchRun
.
class
);
@Override
public
void
run
()
{
...
...
@@ -36,9 +37,9 @@ public class SougoHotSearchRun extends Thread {
private
void
getHotList
()
{
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
log
ger
.
info
(
"搜狗微信采集开始........"
);
log
.
info
(
"搜狗微信采集开始........"
);
List
<
HotSearchList
>
list
=
SougoHotSearchCrawler
.
sougoHotSearch
();
logger
.
info
(
"{}, 此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"{}, 此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
HotSearchList
sougoHotSearch
:
list
){
DBObject
doc
=
new
BasicDBObject
();
...
...
@@ -52,7 +53,7 @@ public class SougoHotSearchRun extends Thread {
data
.
add
(
doc
);
}
hotSearchDAO
.
addHotSearchList
(
data
);
log
ger
.
info
(
"搜狗微信采集结束........"
);
log
.
info
(
"搜狗微信采集结束........"
);
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/UpdateWechatUserRun.java
View file @
cb6bcd76
...
...
@@ -5,6 +5,7 @@ import java.util.List;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -12,24 +13,24 @@ import com.zhiwei.searchhotcrawler.dao.WechatUserDao;
import
com.zhiwei.searchhotcrawler.util.WechatCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
UpdateWechatUserRun
extends
Thread
{
private
WechatUserDao
wechatUserDao
=
new
WechatUserDao
();
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
UpdateWechatUserRun
.
class
);
@Override
public
void
run
()
{
log
ger
.
info
(
"开始更新用户数据"
);
log
.
info
(
"开始更新用户数据"
);
while
(
true
)
{
try
{
Calendar
calendar
=
Calendar
.
getInstance
();
int
hour
=
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
);
if
(
hour
>
6
){
Map
<
String
,
Integer
>
groupMap
=
WechatCodeUtil
.
getAllGroupIp
();
log
ger
.
info
(
"此公众号的分组数量为:::{}"
,
groupMap
.
size
());
log
.
info
(
"此公众号的分组数量为:::{}"
,
groupMap
.
size
());
if
(!
groupMap
.
isEmpty
()
&&
groupMap
!=
null
){
for
(
Entry
<
String
,
Integer
>
group
:
groupMap
.
entrySet
()){
log
ger
.
info
(
"此公众号的分组名称及IP为:::{},{}"
,
group
.
getKey
(),
group
.
getValue
());
log
.
info
(
"此公众号的分组名称及IP为:::{},{}"
,
group
.
getKey
(),
group
.
getValue
());
List
<
String
>
userList
=
WechatCodeUtil
.
getUserListByGroupId
(
group
.
getValue
());
log
ger
.
info
(
"{},此分组下的用户数量为::{}"
,
group
.
getKey
(),
userList
.
size
());
log
.
info
(
"{},此分组下的用户数量为::{}"
,
group
.
getKey
(),
userList
.
size
());
if
(
userList
!=
null
&&
!
userList
.
isEmpty
()){
wechatUserDao
.
addWechatUser
(
userList
,
group
.
getKey
(),
group
.
getValue
());
}
...
...
@@ -38,7 +39,7 @@ public class UpdateWechatUserRun extends Thread{
}
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
}
catch
(
Exception
e
)
{
log
ger
.
debug
(
"知乎热搜推送出现问题,问题为:::{}"
,
e
.
fillInStackTrace
());
log
.
debug
(
"知乎热搜推送出现问题,问题为:::{}"
,
e
.
fillInStackTrace
());
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
continue
;
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboHotSearchRun.java
View file @
cb6bcd76
...
...
@@ -5,6 +5,7 @@ import java.util.Date;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -15,10 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
WeiboHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WeiboHotSearchRun
.
class
);
@Override
public
void
run
()
{
boolean
f
=
true
;
...
...
@@ -36,11 +36,11 @@ public class WeiboHotSearchRun extends Thread{
private
void
getHotList
()
{
log
ger
.
info
(
"微博话题采集开始........"
);
log
.
info
(
"微博话题采集开始........"
);
HotSearchListDAO
weiboHotSearchDAO
=
new
HotSearchListDAO
();
// List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearch();
List
<
HotSearchList
>
list
=
WeiboHotSearchCrawler
.
weiboHotSearchByPhone
();
logger
.
info
(
"{}, 微博此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"{}, 微博此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
HotSearchList
weiboHotSearch
:
list
){
int
changeCount
=
weiboHotSearchDAO
.
getChangeCount
(
weiboHotSearch
);
...
...
@@ -49,7 +49,7 @@ public class WeiboHotSearchRun extends Thread{
doc
.
put
(
"name"
,
weiboHotSearch
.
getName
());
doc
.
put
(
"url"
,
weiboHotSearch
.
getUrl
());
doc
.
put
(
"count"
,
weiboHotSearch
.
getCount
());
doc
.
put
(
"hot"
,
weiboHotSearch
.
is
Hot
());
doc
.
put
(
"hot"
,
weiboHotSearch
.
get
Hot
());
doc
.
put
(
"day"
,
weiboHotSearch
.
getDay
());
doc
.
put
(
"time"
,
weiboHotSearch
.
getTime
());
doc
.
put
(
"changeCount"
,
changeCount
);
...
...
@@ -59,7 +59,7 @@ public class WeiboHotSearchRun extends Thread{
data
.
add
(
doc
);
}
weiboHotSearchDAO
.
addHotSearchList
(
data
);
log
ger
.
info
(
"微博话题采集结束........"
);
log
.
info
(
"微博话题采集结束........"
);
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboSuperTopicRun.java
0 → 100644
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
com.zhiwei.searchhotcrawler.crawler.WeiboSuperTopicCrawler
;
import
com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
WeiboSuperTopicRun
extends
Thread
{
@Override
public
void
run
()
{
boolean
f
=
true
;
while
(
f
)
{
try
{
getTopicList
();
TimeUnit
.
HOURS
.
sleep
(
3
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
}
ZhiWeiTools
.
sleep
(
50
);
}
}
private
void
getTopicList
()
{
WeiboSuperTopicDAO
weiboTopicDAO
=
new
WeiboSuperTopicDAO
();
log
.
info
(
"微博超话采集开始........"
);
List
<
WeiboSuperTopic
>
list
=
WeiboSuperTopicCrawler
.
startCrawler
();
log
.
info
(
"{}, 微博超话此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
WeiboSuperTopic
topic
:
list
){
log
.
info
(
"topic::::{}"
,
topic
);
DBObject
doc
=
new
BasicDBObject
();
doc
.
put
(
"_id"
,
topic
.
getId
());
doc
.
put
(
"name"
,
topic
.
getTopicName
());
doc
.
put
(
"rank"
,
topic
.
getRank
());
doc
.
put
(
"score_num"
,
topic
.
getScore
());
doc
.
put
(
"fensi_num"
,
topic
.
getFensi
());
doc
.
put
(
"post_num"
,
topic
.
getPostNum
());
doc
.
put
(
"type"
,
topic
.
getType
());
doc
.
put
(
"day"
,
topic
.
getDay
());
doc
.
put
(
"time"
,
topic
.
getTime
());
doc
.
put
(
"url"
,
topic
.
getUrl
());
data
.
add
(
doc
);
}
weiboTopicDAO
.
addTopicList
(
data
);
log
.
info
(
"微博话题采集结束........"
);
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboTopicRun.java
View file @
cb6bcd76
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.crawler.WeiboTopicCrawler
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.WeiboTopic
;
import
com.zhiwei.searchhotcrawler.crawler.WeiboHuatiCrawler
;
import
com.zhiwei.searchhotcrawler.dao.WeiboTopicDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
WeiboTopicRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WeiboTopicRun
.
class
);
@Override
public
void
run
()
{
boolean
f
=
true
;
while
(
f
)
{
try
{
getTopicList
();
TimeUnit
.
HOUR
S
.
sleep
(
3
);
TimeUnit
.
MINUTE
S
.
sleep
(
3
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
...
...
@@ -36,28 +31,29 @@ public class WeiboTopicRun extends Thread{
private
void
getTopicList
()
{
WeiboTopicDAO
weiboTopicDAO
=
new
WeiboTopic
DAO
();
log
ger
.
info
(
"微博超话
采集开始........"
);
List
<
WeiboTopic
>
list
=
WeiboHuatiCrawler
.
startCrawler
();
logger
.
info
(
"{}, 微博超话
此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
HotSearchListDAO
weiboHotSearchDAO
=
new
HotSearchList
DAO
();
log
.
info
(
"微博话题
采集开始........"
);
List
<
HotSearchList
>
list
=
WeiboTopicCrawler
.
startCrawlerByPhone
();
log
.
info
(
"{}, 微博话题
此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
WeiboTopic
topic
:
list
){
logger
.
info
(
"topic::::{}"
,
topic
);
for
(
HotSearchList
topic
:
list
){
log
.
info
(
"topic::::{}"
,
topic
);
DBObject
doc
=
new
BasicDBObject
();
doc
.
put
(
"_id"
,
topic
.
getId
());
doc
.
put
(
"name"
,
topic
.
getTopicName
());
doc
.
put
(
"rank"
,
topic
.
getRank
());
doc
.
put
(
"score_num"
,
topic
.
getScore
());
doc
.
put
(
"fensi_num"
,
topic
.
getFensi
());
doc
.
put
(
"post_num"
,
topic
.
getPostNum
());
doc
.
put
(
"type"
,
topic
.
getType
());
doc
.
put
(
"name"
,
topic
.
getName
());
doc
.
put
(
"url"
,
topic
.
getUrl
());
doc
.
put
(
"count"
,
topic
.
getCount
());
doc
.
put
(
"hot"
,
topic
.
getHot
());
doc
.
put
(
"day"
,
topic
.
getDay
());
doc
.
put
(
"time"
,
topic
.
getTime
());
doc
.
put
(
"url"
,
topic
.
getUrl
());
doc
.
put
(
"rank"
,
topic
.
getRank
());
doc
.
put
(
"type"
,
topic
.
getType
());
doc
.
put
(
"topic_lead"
,
topic
.
getTopicLead
());
doc
.
put
(
"comment_count"
,
topic
.
getCommentCount
());
data
.
add
(
doc
);
}
weibo
TopicDAO
.
addTopic
List
(
data
);
log
ger
.
info
(
"微博话题采集结束........"
);
weibo
HotSearchDAO
.
addHotSearch
List
(
data
);
log
.
info
(
"微博话题采集结束........"
);
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
View file @
cb6bcd76
...
...
@@ -4,6 +4,7 @@ import java.util.Date;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
lombok.extern.log4j.Log4j2
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -14,10 +15,9 @@ import com.zhiwei.searchhotcrawler.crawler.ZhihuHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
public
class
ZhihuHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
ZhihuHotSearchRun
.
class
);
@Override
public
void
run
()
{
boolean
f
=
true
;
...
...
@@ -34,22 +34,20 @@ public class ZhihuHotSearchRun extends Thread{
}
private
void
getHotList
()
{
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
log
ger
.
info
(
"知乎话题采集开始...,当前线程名字:{}"
,
Thread
.
currentThread
().
getName
());
log
.
info
(
"知乎话题采集开始...,当前线程名字:{}"
,
Thread
.
currentThread
().
getName
());
List
<
HotSearchList
>
list
=
ZhihuHotSearchCrawler
.
getZhihuHotList
();
List
<
HotSearchList
>
mobilelist
=
ZhihuHotSearchCrawler
.
getMobileZhihuHotList
();
list
.
addAll
(
mobilelist
);
log
ger
.
info
(
"{}, 知乎此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"{}, 知乎此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
for
(
HotSearchList
zhihuHotSearch
:
list
){
DBObject
zhihu
=
new
BasicDBObject
();
zhihu
.
put
(
"_id"
,
zhihuHotSearch
.
getId
());
zhihu
.
put
(
"name"
,
zhihuHotSearch
.
getName
());
zhihu
.
put
(
"url"
,
zhihuHotSearch
.
getUrl
());
zhihu
.
put
(
"count"
,
zhihuHotSearch
.
getCount
());
zhihu
.
put
(
"hot"
,
zhihuHotSearch
.
is
Hot
());
zhihu
.
put
(
"hot"
,
zhihuHotSearch
.
get
Hot
());
zhihu
.
put
(
"day"
,
zhihuHotSearch
.
getDay
());
zhihu
.
put
(
"time"
,
zhihuHotSearch
.
getTime
());
zhihu
.
put
(
"changeCount"
,
0
);
...
...
@@ -57,7 +55,7 @@ public class ZhihuHotSearchRun extends Thread{
zhihu
.
put
(
"type"
,
zhihuHotSearch
.
getType
());
hotSearchDAO
.
addHotSearch
(
zhihu
);
}
log
ger
.
info
(
"知乎话题采集结束........"
);
log
.
info
(
"知乎话题采集结束........"
);
}
}
src/main/resources/db.properties
View file @
cb6bcd76
...
...
@@ -3,8 +3,8 @@ mongoIp=192.168.0.101
mongoPort
=
30000
#mongoIp=192.168.0.81
#mongoPort=27017
db.username
=
datapush
db.paasword
=
4d8ce5c42073c
db.username
=
searchhotcrawleruser
db.paasword
=
searchhotcrawler1q2w3e4r
db.certifiedDB
=
admin
dbName
=
hot_search_list
searchCollName
=
hot_search_list
...
...
src/main/resources/proxyip.properties
View file @
cb6bcd76
registry
=
zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182
group
=
hangzhou
########################################################
#registry=zookeeper://192.168.0.
36
:2181
#registry=zookeeper://192.168.0.
11:2181?backup=192.168.0.30:2181,192.168.0.35
:2181
#
group
=
local
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment