Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
4fce8f43
Commit
4fce8f43
authored
Sep 27, 2019
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加明星超话榜单采集
parent
21d9c6dc
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
431 additions
and
23 deletions
+431
-23
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboTopic.java
+158
-0
src/main/java/com/zhiwei/searchhotcrawler/config/Config.java
+4
-2
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHuatiCrawler.java
+151
-0
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchListDAO.java
+2
-5
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiboTopicDAO.java
+50
-0
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
+2
-13
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
+0
-1
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboTopicRun.java
+61
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
+0
-1
src/main/resources/db.properties
+3
-1
No files found.
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboTopic.java
0 → 100644
View file @
4fce8f43
package
com
.
zhiwei
.
searchhotcrawler
.
bean
;
import
java.util.Date
;
import
com.zhiwei.tools.timeparse.TimeParse
;
/**
 * One entry of a Weibo topic ranking list.
 *
 * <p>Plain mutable data holder. Most fields are {@code public} and also exposed through
 * getters/setters; the public visibility is kept for backward compatibility, new code
 * should prefer the accessors.
 *
 * @author Bewilder ZW
 * @date 2019-09-27 15:33:08
 */
public class WeiboTopic {

    /** Primary key; the full constructor sets it to {@code topicName + "_" + day}. */
    private String id;
    /** Topic link. */
    public String url;
    /** Topic name. */
    public String topicName;
    /** Topic rank. */
    public Integer rank;
    /** Topic influence score. */
    public String score;
    /** Topic follower count. */
    public String fensi;
    /** Topic read count. */
    public String readNum;
    /** Topic post count. */
    public String postNum;
    /** Ranking list type. */
    public String type;
    /** Day, formatted as {@code yyyy-MM-dd} by the full constructor. */
    private String day;
    /** Creation timestamp. */
    private Date time;

    /** Creates an empty topic; every field, including {@code id}, {@code day} and {@code time}, stays {@code null}. */
    public WeiboTopic() {
    }

    /**
     * Creates a topic stamped with the current time.
     *
     * @param url       topic link
     * @param topicName topic name, also used as the first part of the id
     * @param rank      topic rank
     * @param score     topic influence score
     * @param fensi     topic follower count
     * @param type      ranking list type
     */
    public WeiboTopic(String url, String topicName, Integer rank, String score, String fensi, String type) {
        this.url = url;
        this.topicName = topicName;
        this.rank = rank;
        this.score = score;
        this.fensi = fensi;
        this.type = type;
        // Use one timestamp for both fields: two separate new Date() calls could
        // straddle midnight and leave `time` and `day` (and thus `id`) inconsistent.
        Date now = new Date();
        this.time = now;
        this.day = TimeParse.dateFormartString(now, "yyyy-MM-dd");
        this.id = topicName + "_" + day;
    }

    /** Returns a debug representation; {@code id}, {@code day} and {@code time} are not included. */
    @Override
    public String toString() {
        return "new WeiboTopic[" + "topicName = " + topicName + ", rank = " + rank + ", score = " + score
                + ", fensi = " + fensi + ", type = " + type + ", readNum = " + readNum + ", postNum = " + postNum
                + ", url = " + url + "]";
    }

    public String getUrl() {
        return url;
    }

    public String getTopicName() {
        return topicName;
    }

    public Integer getRank() {
        return rank;
    }

    public String getScore() {
        return score;
    }

    public String getFensi() {
        return fensi;
    }

    public String getReadNum() {
        return readNum;
    }

    public String getPostNum() {
        return postNum;
    }

    public String getType() {
        return type;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    /** Sets the topic name only; {@code id} is not recomputed. */
    public void setTopicName(String topicName) {
        this.topicName = topicName;
    }

    public void setRank(Integer rank) {
        this.rank = rank;
    }

    public void setScore(String score) {
        this.score = score;
    }

    public void setFensi(String fensi) {
        this.fensi = fensi;
    }

    public void setReadNum(String readNum) {
        this.readNum = readNum;
    }

    public void setPostNum(String postNum) {
        this.postNum = postNum;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getId() {
        return id;
    }

    public String getDay() {
        return day;
    }

    public Date getTime() {
        return time;
    }

    public void setId(String id) {
        this.id = id;
    }

    /** Sets the day only; {@code id} is not recomputed. */
    public void setDay(String day) {
        this.day = day;
    }

    public void setTime(Date time) {
        this.time = time;
    }
}
src/main/java/com/zhiwei/searchhotcrawler/config/Config.java
View file @
4fce8f43
...
...
@@ -18,7 +18,8 @@ public class Config {
userPwd
=
conf
.
getProperty
(
"db.paasword"
);
authDB
=
conf
.
getProperty
(
"db.certifiedDB"
);
dbName
=
conf
.
getProperty
(
"dbName"
);
collName
=
conf
.
getProperty
(
"collName"
);
searchCollName
=
conf
.
getProperty
(
"searchCollName"
);
topicCollName
=
conf
.
getProperty
(
"topicCollName"
);
collWechatUserName
=
conf
.
getProperty
(
"collWechatUserName"
);
}
catch
(
Exception
e
)
{
...
...
@@ -33,6 +34,7 @@ public class Config {
public
static
String
userPwd
;
public
static
String
authDB
;
public
static
String
dbName
;
public
static
String
collName
;
public
static
String
searchCollName
;
public
static
String
topicCollName
;
public
static
String
collWechatUserName
;
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHuatiCrawler.java
0 → 100644
View file @
4fce8f43
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
java.util.Objects
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.WeiboTopic
;
/**
 * Crawler for the Weibo topic ranking lists (celebrity category).
 *
 * <p>Fetches three ranking lists page by page, converts every entry into a
 * {@link WeiboTopic}, and enriches it with read/post counts taken from the
 * topic's detail endpoint.
 *
 * @author Bewilder ZW
 * @date 2019-09-27 15:01:34
 */
public class WeiboHuatiCrawler {

    private static final Logger logger = LoggerFactory.getLogger(WeiboHuatiCrawler.class);

    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /** Number of pages requested per ranking list. */
    private static final int MAX_PAGE = 5;

    /** Maximum attempts for the per-topic detail request. */
    private static final int DETAIL_ATTEMPTS = 3;

    /** Request headers sent with every ranking-list request. */
    private static final Map<String, String> headMap = new HashMap<>();

    static {
        headMap.put("X-Requested-With", "XMLHttpRequest");
        headMap.put("Referer", "https://huati.weibo.cn/discovery/super?extparam=ctg1_2%7Cscorll_1&luicode=10000011&lfid=100803_-_super&sourceType=weixin");
        headMap.put("Host", "huati.weibo.cn");
    }

    /**
     * Crawls pages 1..5 of each celebrity ranking list.
     *
     * <p>A failure on one page is logged and does not stop the remaining pages.
     *
     * @return all topics collected; never {@code null}, possibly empty
     */
    public static List<WeiboTopic> startCrawler() {
        Map<String, String> urlMap = new HashMap<>();
        urlMap.put("明星", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=star&from=&wm=");
        urlMap.put("明星潜力", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=potential&from=&wm=");
        urlMap.put("明星上升", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=up&from=&wm=");

        List<WeiboTopic> topicList = new ArrayList<>();
        for (Entry<String, String> entry : urlMap.entrySet()) {
            String url = entry.getValue();
            String type = entry.getKey();
            for (int page = 1; page <= MAX_PAGE; page++) {
                String pageUrl = url + "&page=" + page;
                try {
                    String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
                    // Only bodies that mention "desc1" are treated as ranking data.
                    if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc1")) {
                        topicList.addAll(parseTopicRankHtml(htmlBody, type));
                    }
                } catch (Exception e) {
                    // Pass the message for the placeholder and the exception last so the stack trace is kept.
                    logger.error("获取榜单列表页面时出现错误,错误为:{}", e.toString(), e);
                }
            }
        }
        return topicList;
    }

    /**
     * Parses one ranking-list response (JSON with the entries under {@code data.list}).
     *
     * <p>Entries are parsed independently: a malformed entry is logged and skipped
     * instead of discarding the topics already parsed from the same page.
     *
     * @param htmlBody raw response body
     * @param type     ranking list type stored on every resulting topic
     * @return parsed topics; an empty list when the body cannot be parsed or has no entries
     */
    private static List<WeiboTopic> parseTopicRankHtml(String htmlBody, String type) {
        JSONArray list;
        try {
            list = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("list");
        } catch (Exception e) {
            logger.error("解析榜单列表页面时出现错误,错误为:{}", e.toString(), e);
            return Collections.emptyList();
        }
        if (Objects.isNull(list) || list.isEmpty()) {
            return Collections.emptyList();
        }

        List<WeiboTopic> topicList = new ArrayList<>();
        for (int i = 0; i < list.size(); i++) {
            try {
                topicList.add(parseTopic(list.getJSONObject(i), type));
            } catch (Exception e) {
                logger.error("解析榜单列表页面时出现错误,错误为:{}", e.toString(), e);
            }
        }
        return topicList;
    }

    /**
     * Converts a single ranking entry into a topic and fetches its read/post counts.
     *
     * @param data one element of {@code data.list}
     * @param type ranking list type
     * @return the populated topic
     */
    private static WeiboTopic parseTopic(JSONObject data, String type) {
        Integer toprank = data.getInteger("toprank");
        String topicName = data.getString("display_name");
        String id = data.getString("page_id");
        String score = data.getString("score");
        String desc1 = data.getString("desc1");
        // The follower count is the part of desc1 that precedes "影响力"; desc1 may be absent.
        String fensi = desc1 == null ? null : desc1.replaceAll("影响力.*", "");
        String url = data.getString("link");
        WeiboTopic topic = new WeiboTopic(url, topicName, toprank, score, fensi, type);
        return getTopicInfo(id, topic);
    }

    /**
     * Fetches the read count and post count of a single topic.
     *
     * <p>Makes up to three attempts; if none yields a usable {@code desc_more}
     * value, the topic is returned without read/post counts.
     *
     * @param id    topic page id, used as {@code containerid}
     * @param topic topic to enrich
     * @return the same {@code topic} instance
     */
    private static WeiboTopic getTopicInfo(String id, WeiboTopic topic) {
        String url = "https://m.weibo.cn/api/container/getIndex?containerid=" + id;
        for (int i = 1; i <= DETAIL_ATTEMPTS; i++) {
            try {
                String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
                if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc_more")) {
                    String descMore = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("pageInfo").getJSONArray("desc_more").getString(0);
                    if (StringUtils.isNotBlank(descMore)) {
                        // Read count: drop the "阅读" label and everything from "帖子" on.
                        String readNum = descMore.replaceAll("阅读|帖子.*", "").trim();
                        // Post count: drop everything up to "帖子" and everything from "粉丝" on.
                        String postNum = descMore.replaceAll(".*帖子|粉丝.*", "").trim();
                        topic.setPostNum(postNum);
                        topic.setReadNum(readNum);
                        return topic;
                    }
                }
            } catch (Exception e) {
                logger.error("解析榜单详情页面时出现错误,错误为:{}", e.toString(), e);
            }
        }
        return topic;
    }
}
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchListDAO.java
View file @
4fce8f43
...
...
@@ -14,22 +14,19 @@ import com.mongodb.DBObject;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.cache.CacheManager
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public
class
HotSearchListDAO
extends
MongoDBTemplate
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaiDuHotSearchCrawler
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
HotSearchListDAO
.
class
);
@SuppressWarnings
(
"unused"
)
public
HotSearchListDAO
()
{
super
();
super
.
setDbName
(
Config
.
dbName
);
String
time
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
String
year
=
time
.
substring
(
0
,
4
);
String
month
=
time
.
substring
(
5
,
7
);
String
collName
=
Config
.
c
ollName
+
year
+
"_"
+
month
;
String
collName
=
Config
.
searchC
ollName
+
year
+
"_"
+
month
;
super
.
setCollName
(
collName
);
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiboTopicDAO.java
0 → 100644
View file @
4fce8f43
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
java.util.Date
;
import
java.util.List
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.tools.timeparse.TimeParse
;
/**
 * DAO that stores Weibo topic ranking documents.
 *
 * <p>The target collection name is {@code Config.topicCollName + yyyy + "_" + MM},
 * computed from the current date when the instance is constructed.
 *
 * <p>NOTE(review): the collection name is only set in the constructor, so a long-lived
 * instance presumably keeps writing to the month it was created in — confirm whether
 * monthly roll-over is expected. Writes also go through {@code getReadColl()}; verify
 * against {@code MongoDBTemplate} that this is the intended handle for inserts.
 */
public class WeiboTopicDAO extends MongoDBTemplate {

    private static final Logger logger = LoggerFactory.getLogger(WeiboTopicDAO.class);

    /** Configures the database name and the month-suffixed collection name. */
    public WeiboTopicDAO() {
        super();
        super.setDbName(Config.dbName);
        String time = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
        String year = time.substring(0, 4);
        String month = time.substring(5, 7);
        String collName = Config.topicCollName + year + "_" + month;
        super.setCollName(collName);
    }

    /**
     * Inserts a batch of documents. Failures are logged, not propagated.
     *
     * @param list documents to insert; {@code null} or empty is a no-op
     */
    public void addTopicList(List<DBObject> list) {
        // Nothing to store: do not hand a null/empty batch to the driver.
        if (list == null || list.isEmpty()) {
            return;
        }
        try {
            this.getReadColl().insert(list);
        } catch (Exception e) {
            // Message fills the placeholder; the trailing exception keeps the stack trace.
            logger.error("存储数据时出错,错误为:{}", e.toString(), e);
        }
    }

    /**
     * Inserts a single document. Failures are logged, not propagated.
     *
     * @param doc document to insert; {@code null} is a no-op
     */
    public void addTopic(DBObject doc) {
        if (doc == null) {
            return;
        }
        try {
            this.getReadColl().insert(doc);
        } catch (Exception e) {
            logger.error("存储数据时出错,错误为:{}", e.toString(), e);
        }
    }
}
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
View file @
4fce8f43
...
...
@@ -11,25 +11,13 @@ import com.zhiwei.searchhotcrawler.timer.SendZhihuHotSearchRun;
import
com.zhiwei.searchhotcrawler.timer.SougoHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.UpdateWechatUserRun
;
import
com.zhiwei.searchhotcrawler.timer.WeiboHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.WeiboTopicRun
;
import
com.zhiwei.searchhotcrawler.timer.ZhihuHotSearchRun
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public
class
HotSearchRun
{
// private ScheduledExecutorService scheduExec;
//
// public HotSearchRun() {
// this.scheduExec = Executors.newScheduledThreadPool(5);
// }
// public void showTimer() {
// scheduExec.scheduleAtFixedRate(new WeiboHotSearchRun(), 0, 1, TimeUnit.MINUTES);
// scheduExec.scheduleAtFixedRate(new ZhihuHotSearchRun(), 0, 10 , TimeUnit.MINUTES);
// scheduExec.scheduleAtFixedRate(new BaiduHotSearchRun(), 0, 5 , TimeUnit.MINUTES);
// scheduExec.scheduleAtFixedRate(new SougoHotSearchRun(), 0, 5 , TimeUnit.MINUTES);
// scheduExec.scheduleAtFixedRate(new DouyinHotSearchRun(), 0, 10 , TimeUnit.MINUTES);
// }
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
ProxyConfig
.
registry
,
ProxyConfig
.
group
,
GroupType
.
PROVIDER
);
...
...
@@ -43,6 +31,7 @@ public class HotSearchRun {
new
SougoHotSearchRun
().
start
();
new
DouyinHotSearchRun
().
start
();
new
ZhihuHotSearchRun
().
start
();
new
WeiboTopicRun
().
start
();
//推送程序启动
new
SendWeiboHotSearchRun
().
start
();
new
SendZhihuHotSearchRun
().
start
();
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
View file @
4fce8f43
...
...
@@ -12,7 +12,6 @@ import org.slf4j.LoggerFactory;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboTopicRun.java
0 → 100644
View file @
4fce8f43
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.WeiboTopic
;
import
com.zhiwei.searchhotcrawler.crawler.WeiboHuatiCrawler
;
import
com.zhiwei.searchhotcrawler.dao.WeiboTopicDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Background thread that crawls the Weibo topic rankings once a day and stores the result.
 *
 * <p>NOTE(review): the DAO is created once per thread instance and derives its collection
 * name from the date at construction time — confirm whether a new DAO is needed when the
 * month changes.
 */
public class WeiboTopicRun extends Thread {

    private static final Logger logger = LoggerFactory.getLogger(WeiboTopicRun.class);

    private WeiboTopicDAO weiboTopicDAO = new WeiboTopicDAO();

    /**
     * Runs one crawl, then sleeps one day, repeatedly. Unexpected errors are logged and the
     * loop continues; an interrupt restores the interrupt flag and ends the thread.
     */
    @Override
    public void run() {
        while (!Thread.currentThread().isInterrupted()) {
            try {
                getTopicList();
                TimeUnit.DAYS.sleep(1);
            } catch (InterruptedException e) {
                // Restore the flag and stop instead of swallowing the interrupt and re-crawling at once.
                Thread.currentThread().interrupt();
                logger.warn("微博超话采集线程被中断,退出");
                break;
            } catch (Exception e) {
                logger.error("微博超话采集出现异常,错误为:{}", e.toString(), e);
            }
            ZhiWeiTools.sleep(50);
        }
    }

    /**
     * Crawls the rankings, converts every topic to a MongoDB document, and stores the batch.
     *
     * <p>NOTE(review): {@code _id} comes from {@code WeiboTopic.getId()} (topic name + day),
     * so a topic listed in more than one ranking on the same day would presumably produce a
     * duplicate key in the batch — verify against the collection's data.
     */
    private void getTopicList() {
        logger.info("微博超话采集开始........");
        List<WeiboTopic> list = WeiboHuatiCrawler.startCrawler();
        int size = list != null ? list.size() : 0;
        logger.info("{}, 微博此轮采集到的数据量为:{}", new Date(), Integer.valueOf(size));

        // Skip conversion and storage when nothing was crawled.
        if (size > 0) {
            List<DBObject> data = new ArrayList<>(size);
            for (WeiboTopic topic : list) {
                DBObject doc = new BasicDBObject();
                doc.put("_id", topic.getId());
                doc.put("name", topic.getTopicName());
                doc.put("rank", topic.getRank());
                doc.put("score_num", topic.getScore());
                doc.put("fensi_num", topic.getFensi());
                doc.put("post_num", topic.getPostNum());
                // The read count is crawled alongside the post count; persist it as well.
                doc.put("read_num", topic.getReadNum());
                doc.put("type", topic.getType());
                doc.put("day", topic.getDay());
                doc.put("time", topic.getTime());
                doc.put("url", topic.getUrl());
                data.add(doc);
            }
            weiboTopicDAO.addTopicList(data);
        }
        logger.info("微博话题采集结束........");
    }
}
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
View file @
4fce8f43
...
...
@@ -10,7 +10,6 @@ import org.slf4j.LoggerFactory;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.crawler.ZhihuHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
...
...
src/main/resources/db.properties
View file @
4fce8f43
...
...
@@ -7,5 +7,6 @@ db.username=datapush
db.paasword
=
4d8ce5c42073c
db.certifiedDB
=
admin
dbName
=
hot_search_list
collName
=
hot_search_list
searchCollName
=
hot_search_list
topicCollName
=
topic_list
collWechatUserName
=
wechat_user
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment