Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
96aca677
Commit
96aca677
authored
Aug 10, 2020
by
马黎滨
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'mlbWork' into 'master'
网易新闻实时热榜和跟贴热议采集 See merge request
!18
parents
21bf95d3
cfb1f13a
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
145 additions
and
35 deletions
+145
-35
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+3
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/WangYiHotSearchCrawler.java
+108
-0
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
+3
-3
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
+16
-9
src/main/java/com/zhiwei/searchhotcrawler/timer/ThreadOneRun.java
+6
-22
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuChildHotSearchRun.java
+9
-0
No files found.
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
96aca677
...
...
@@ -13,5 +13,7 @@ public enum HotSearchType {
新浪热榜
,
新浪热点
,
搜狐话题
,
凤凰新闻热榜
凤凰新闻热榜
,
网易热榜
,
网易跟帖热议
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WangYiHotSearchCrawler.java
0 → 100644
View file @
96aca677
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
/**
* 网易新闻采集
*/
@Log4j2
public
class
WangYiHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
build
();
/**
* 网易新闻实时热榜的采集
* @return
*/
public
static
List
<
HotSearchList
>
getWangYiHotSearch
(){
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
log
.
info
(
"网易新闻实时热榜开始采集"
);
String
url
=
"https://v6-gw.m.163.com/nc-main/api/v1/hqc/no-repeat-hot-list"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
String
htmlBody
=
null
;
for
(
int
t
=
0
;
t
<
3
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"网易新闻实时热榜页面连接异常..."
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
bodyObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
JSONArray
jsonObject
=
bodyObject
.
getJSONArray
(
"items"
);
if
(
jsonObject
!=
null
)
{
for
(
int
i
=
0
;
i
<
jsonObject
.
size
();
i
++)
{
int
rank
=
i
+
1
;
String
name
=
jsonObject
.
getJSONObject
(
i
).
getString
(
"title"
);
int
count
=
jsonObject
.
getJSONObject
(
i
).
getIntValue
(
"hotValue"
);
String
contentId
=
jsonObject
.
getJSONObject
(
i
).
getString
(
"contentId"
);
String
wangyiUrl
=
"https://c.m.163.com/news/a/"
+
contentId
+
".html"
;
HotSearchList
hotSearchList
=
new
HotSearchList
(
wangyiUrl
,
name
,
count
,
rank
,
HotSearchType
.
网易热榜
.
name
());
hotSearchLists
.
add
(
hotSearchList
);
}
log
.
info
(
"{}, 此轮网易新闻热榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
hotSearchLists
!=
null
?
hotSearchLists
.
size
()
:
0
));
log
.
info
(
"网易新闻热榜采集结束"
);
return
hotSearchLists
;
}
}
ZhiWeiTools
.
sleep
(
3000L
);
}
return
hotSearchLists
;
}
/**
* 网易新闻跟帖热议的采集
* @return
*/
public
static
List
<
HotSearchList
>
getWangYicomment
(){
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
log
.
info
(
"网易新闻跟贴热议开始采集"
);
String
url
=
"https://v6-gw.m.163.com/gentie-web/api/v2/products/a2869674571f77b5a0867c3d71db5856/rankDocs/all/list?ibc=newsapph5&limit=30"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
String
htmlBody
=
null
;
for
(
int
t
=
0
;
t
<
3
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"网易新闻跟贴热议页面连接异常..."
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
bodyObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
JSONArray
jsonObject
=
bodyObject
.
getJSONArray
(
"cmtDocs"
);
if
(
jsonObject
!=
null
)
{
for
(
int
i
=
0
;
i
<
jsonObject
.
size
();
i
++)
{
int
rank
=
i
+
1
;
String
name
=
jsonObject
.
getJSONObject
(
i
).
getString
(
"doc_title"
);
int
count
=
jsonObject
.
getJSONObject
(
i
).
getIntValue
(
"hotScore"
)*
10000
;
String
contentId
=
jsonObject
.
getJSONObject
(
i
).
getString
(
"docId"
);
String
wangyiUrl
=
"https://c.m.163.com/news/a/"
+
contentId
+
".html"
;
HotSearchList
hotSearchList
=
new
HotSearchList
(
wangyiUrl
,
name
,
count
,
rank
,
HotSearchType
.
网易跟帖热议
.
name
());
hotSearchLists
.
add
(
hotSearchList
);
}
log
.
info
(
"{}, 此轮网易新闻跟贴热议采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
hotSearchLists
!=
null
?
hotSearchLists
.
size
()
:
0
));
log
.
info
(
"网易新闻跟贴热议采集结束"
);
return
hotSearchLists
;
}
}
ZhiWeiTools
.
sleep
(
3000L
);
}
return
hotSearchLists
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
View file @
96aca677
...
...
@@ -50,11 +50,11 @@ public class HotSearchRun {
// new ZhihuHotSearchRun().start();
new
WeiboSuperTopicRun
().
start
();
new
WeiboTopicRun
().
start
();
new
ToutiaoHotSearchRun
().
start
();
new
ZhihuTopSearchRun
().
start
();
//
new ToutiaoHotSearchRun().start();
//
new ZhihuTopSearchRun().start();
new
ZhihuChildHotSearchRun
().
start
();
new
ThreadOneRun
().
start
();
//抖音链接更新
//
//抖音链接更新
new
DouYinUrlHotSearchRun
().
start
();
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
View file @
96aca677
...
...
@@ -5,6 +5,9 @@ import java.util.Date;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.crawler.ToutiaoHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.crawler.ZhihuTopicSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
...
...
@@ -46,18 +49,22 @@ public class DouyinHotSearchRun extends Thread{
*/
private
void
getHotList
()
{
log
.
info
(
"抖音热搜榜采集开始........"
);
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
list
=
DouyinHotSearchCrawler
.
getMobileDouyinHotList
();
log
.
info
(
"{}, 抖音热搜榜此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
if
(
list
==
null
||
list
.
size
()
==
0
){
TipsUtils
.
sendTips
(
"抖音热搜"
,
new
Date
());
}
else
{
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
TipsUtils
.
recoveryTips
(
"抖音热搜"
,
new
Date
());
}
TipsUtils
.
addHotList
(
"抖音热搜"
,
list
);
log
.
info
(
"抖音热搜榜采集结束........"
);
ZhiWeiTools
.
sleep
(
3000L
);
log
.
info
(
"今日头条热搜采集开始........"
);
List
<
HotSearchList
>
toutiaoList
=
ToutiaoHotSearchCrawler
.
toutiaoHotSearchByPhone
();
log
.
info
(
"{}, 今日头条此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
toutiaoList
!=
null
?
toutiaoList
.
size
()
:
0
));
TipsUtils
.
addHotList
(
HotSearchType
.
今日头条热搜
.
name
(),
toutiaoList
);
log
.
info
(
"今日头条热搜采集结束........"
);
ZhiWeiTools
.
sleep
(
3000L
);
log
.
info
(
"知乎热搜榜单采集开始..."
);
List
<
HotSearchList
>
zhihuList
=
ZhihuTopicSearchCrawler
.
getZhihuTopicSearch
();
log
.
info
(
"{}, 知乎热搜榜单此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
zhihuList
!=
null
?
zhihuList
.
size
()
:
0
));
TipsUtils
.
addHotList
(
HotSearchType
.
知乎热搜榜单
.
name
(),
zhihuList
);
log
.
info
(
"知乎热搜榜单采集结束........"
);
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/ThreadOneRun.java
View file @
96aca677
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.crawler.FengHuangSearchCrawler
;
import
com.zhiwei.searchhotcrawler.crawler.SouhuTopicCrawler
;
import
com.zhiwei.searchhotcrawler.crawler.TengXunCrawler
;
import
com.zhiwei.searchhotcrawler.crawler.XinLangHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.crawler.*
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
...
...
@@ -35,31 +32,18 @@ public class ThreadOneRun extends Thread {
private
void
getHotList
(){
List
<
HotSearchList
>
tengXunlist
=
TengXunCrawler
.
getTengXunHotList
();
addHotList
(
"腾讯新闻"
,
tengXunlist
);
TipsUtils
.
addHotList
(
"腾讯新闻"
,
tengXunlist
);
ZhiWeiTools
.
sleep
(
3000L
);
List
<
HotSearchList
>
xinLanglist
=
XinLangHotSearchCrawler
.
getXinLangHotSearch
();
addHotList
(
"新浪热榜"
,
xinLanglist
);
TipsUtils
.
addHotList
(
"新浪热榜"
,
xinLanglist
);
ZhiWeiTools
.
sleep
(
3000L
);
List
<
HotSearchList
>
souhuList
=
SouhuTopicCrawler
.
getSouhuTopic
();
addHotList
(
"搜狐话题"
,
souhuList
);
TipsUtils
.
addHotList
(
"搜狐话题"
,
souhuList
);
ZhiWeiTools
.
sleep
(
3000L
);
List
<
HotSearchList
>
xinLangHotList
=
XinLangHotSearchCrawler
.
getXinLangHotSpot
();
addHotList
(
"新浪热点"
,
xinLangHotList
);
TipsUtils
.
addHotList
(
"新浪热点"
,
xinLangHotList
);
ZhiWeiTools
.
sleep
(
3000L
);
List
<
HotSearchList
>
fengHuangHotList
=
FengHuangSearchCrawler
.
getFengHuangHotList
();
addHotList
(
"凤凰新闻热榜"
,
fengHuangHotList
);
}
private
void
addHotList
(
String
type
,
List
<
HotSearchList
>
list
){
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
if
(
list
==
null
||
list
.
size
()
==
0
){
TipsUtils
.
sendTips
(
type
,
new
Date
());
}
else
{
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
TipsUtils
.
recoveryTips
(
type
,
new
Date
());
}
TipsUtils
.
addHotList
(
"凤凰新闻热榜"
,
fengHuangHotList
);
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuChildHotSearchRun.java
View file @
96aca677
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.crawler.WangYiHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.crawler.ZhihuChildHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
...
...
@@ -53,6 +54,14 @@ public class ZhihuChildHotSearchRun extends Thread {
ZhiWeiTools
.
sleep
(
3000
);
}
}
//网易实时热榜采集
ZhiWeiTools
.
sleep
(
3000L
);
List
<
HotSearchList
>
wangyiHotSearchList
=
WangYiHotSearchCrawler
.
getWangYiHotSearch
();
TipsUtils
.
addHotList
(
"网易热榜"
,
wangyiHotSearchList
);
//网易跟帖热议采集
ZhiWeiTools
.
sleep
(
3000L
);
List
<
HotSearchList
>
wangyiComment
=
WangYiHotSearchCrawler
.
getWangYicomment
();
TipsUtils
.
addHotList
(
"网易跟帖热议"
,
wangyiComment
);
}
private
String
getTypeName
(
String
type
){
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment