Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
d00d9860
Commit
d00d9860
authored
Jul 30, 2020
by
马黎滨
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'mlbWork' into 'master'
Mlb work See merge request
!7
parents
d767f59c
c209c204
Show whitespace changes
Inline
Side-by-side
Showing
29 changed files
with
782 additions
and
184 deletions
+782
-184
pom.xml
+5
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
+10
-6
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
+13
-9
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
+13
-9
src/main/java/com/zhiwei/searchhotcrawler/crawler/TengXunCrawler.java
+80
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
+26
-20
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+39
-37
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboSuperTopicCrawler.java
+21
-18
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
+12
-8
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuChildHotSearchCrawler.java
+86
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
+28
-24
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuTopicSearchCrawler.java
+15
-14
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchListDAO.java
+17
-3
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
+2
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
+3
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
+4
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
+4
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/ThreadOneRun.java
+44
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/ToutiaoHotSearchRun.java
+4
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboHotSearchRun.java
+4
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboTopicRun.java
+4
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuChildHotSearchRun.java
+73
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
+4
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuTopSearchRun.java
+4
-0
src/main/java/com/zhiwei/searchhotcrawler/util/HttpClientUtils.java
+50
-0
src/main/java/com/zhiwei/searchhotcrawler/util/QYWechatUtil.java
+125
-0
src/main/java/com/zhiwei/searchhotcrawler/util/TipsUtils.java
+32
-0
src/main/java/com/zhiwei/searchhotcrawler/util/WechatCodeUtil.java
+58
-35
No files found.
pom.xml
View file @
d00d9860
...
...
@@ -50,6 +50,11 @@
<artifactId>
lombok
</artifactId>
<version>
1.18.8
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.6.0.4-RELEASE
</version>
</dependency>
</dependencies>
<build>
...
...
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
d00d9860
...
...
@@ -8,5 +8,6 @@ public enum HotSearchType {
搜狗微信热搜
,
微博话题
,
今日头条热搜
,
知乎热搜榜单
知乎热搜榜单
,
腾讯新闻
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
View file @
d00d9860
...
...
@@ -7,6 +7,8 @@ import java.util.List;
import
java.util.Objects
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -29,7 +31,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
@Log4j2
public
class
BaiDuHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
* @Title: BaiDuHotSearchTest
...
...
@@ -39,16 +41,18 @@ public class BaiDuHotSearchCrawler {
*/
public
static
List
<
HotSearchList
>
baiduHotSearch
()
{
String
url
=
"http://top.baidu.com/buzz?b=1&fr=topindex"
;
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"mainBody"
))
{
return
ansysData
(
htmlBody
);
}
else
{
log
.
info
(
"解析百度风云榜时出现解析错误,页面结构有问题"
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
e
);
}
return
Collections
.
emptyList
();
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
View file @
d00d9860
...
...
@@ -5,6 +5,8 @@ import java.util.ArrayList;
import
java.util.List
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -28,7 +30,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
@Log4j2
public
class
DouyinHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
build
();
/**
* @Title: getMobileDouyinHotList
...
...
@@ -40,9 +42,14 @@ public class DouyinHotSearchCrawler {
public
static
List
<
HotSearchList
>
getMobileDouyinHotList
(){
List
<
HotSearchList
>
list
=
null
;
String
url
=
"https://api.amemv.com/aweme/v1/hot/search/list/"
;
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"word_list"
)){
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
debug
(
"获取抖音热搜榜时出现问题:{}"
,
e
);
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"word_list"
))
{
list
=
new
ArrayList
<>();
JSONObject
data
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
wordList
=
data
.
getJSONObject
(
"data"
).
getJSONArray
(
"word_list"
);
...
...
@@ -58,17 +65,14 @@ public class DouyinHotSearchCrawler {
//获取关键词
word
=
wl
.
getString
(
"word"
);
//获取热度值
hotValueStr
=
wl
.
getString
(
"hot_value"
);
hotValueStr
=
wl
.
getString
(
"hot_value"
);
Integer
hotValue
=
null
;
hotValue
=
Integer
.
valueOf
(
hotValueStr
);
// logger.info("热度为:::{}", hot_value);
HotSearchList
douyin
=
new
HotSearchList
(
null
,
word
,
hotValue
,
position
,
HotSearchType
.
抖音热搜
.
name
());
HotSearchList
douyin
=
new
HotSearchList
(
null
,
word
,
hotValue
,
position
,
HotSearchType
.
抖音热搜
.
name
());
list
.
add
(
douyin
);
}
}
}
catch
(
IOException
e
)
{
log
.
debug
(
"获取抖音热搜榜时出现问题:{}"
,
e
);
}
return
list
;
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
View file @
d00d9860
...
...
@@ -7,6 +7,8 @@ import java.util.Map;
import
java.util.Objects
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -31,7 +33,7 @@ import com.zhiwei.tools.httpclient.HeaderTool;
@Log4j2
public
class
SougoHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
build
();
/**
* @Title: SougoHotSearchTest
...
...
@@ -41,13 +43,16 @@ public class SougoHotSearchCrawler {
*/
public
static
List
<
HotSearchList
>
sougoHotSearch
()
{
String
url
=
"https://weixin.sogou.com"
;
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
Map
<
String
,
String
>
headMap
=
HeaderTool
.
getCommonHead
();
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headMap
);
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
String
htmlBody
=
null
;
try
{
Map
<
String
,
String
>
headMap
=
HeaderTool
.
getCommonHead
();
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"topwords"
))
{
try
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
...
...
@@ -75,10 +80,12 @@ public class SougoHotSearchCrawler {
if
(
Objects
.
nonNull
(
rank
))
{
list
.
add
(
hotSearch
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析搜狗微信时出现解析错误"
,
e
);
}
}
break
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析搜狗微信时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
return
Collections
.
emptyList
();
...
...
@@ -86,10 +93,7 @@ public class SougoHotSearchCrawler {
}
else
{
log
.
info
(
"解析搜狗微信时出现解析错误,页面结构有问题"
);
}
break
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
e
);
}
}
return
list
;
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/TengXunCrawler.java
0 → 100644
View file @
d00d9860
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
@Log4j2
public
class
TengXunCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* 腾讯热榜数据采集
* @return
*/
public
static
List
<
HotSearchList
>
getTengXunHotList
()
{
log
.
info
(
"腾讯新闻热榜开始采集..."
);
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
JSONArray
dataJson
=
null
;
String
htmlBody
=
null
;
String
url
=
"https://r.inews.qq.com/getWeiboRankingList?chlid=news_recommend_hot&appver=28_android_4.2.40&devid=&qn-rid=&qn-sig=f690e21095559203e3f55c42a04f8f15"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//采集为空最多重试3次
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"idlist"
))
{
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
dataJson
=
topSearch
.
getJSONArray
(
"idlist"
).
getJSONObject
(
0
).
getJSONArray
(
"newslist"
);
for
(
int
i
=
1
;
i
<
dataJson
.
size
();
i
++)
{
Integer
rank
=
i
;
String
name
=
dataJson
.
getJSONObject
(
i
).
getString
(
"title"
);
String
tengxunUrl
=
"https://view.inews.qq.com/topic/"
+
dataJson
.
getJSONObject
(
i
).
getString
(
"id"
);
Integer
count
=
0
;
String
icon
=
null
;
if
(
dataJson
.
getJSONObject
(
i
).
containsKey
(
"topic"
))
{
count
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"topic"
).
getIntValue
(
"ranking_score"
);
if
(
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"topic"
).
containsKey
(
"rec_icon"
))
{
icon
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"topic"
).
getString
(
"rec_icon"
);
}
}
else
if
(
dataJson
.
getJSONObject
(
i
).
containsKey
(
"hotEvent"
))
{
count
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"hotEvent"
).
getIntValue
(
"hotScore"
);
if
(
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"hotEvent"
).
containsKey
(
"rec_icon"
))
{
icon
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"hotEvent"
).
getString
(
"rec_icon"
);
}
}
if
(
icon
!=
null
)
{
if
(
icon
.
contains
(
"11918331890"
))
{
icon
=
"热"
;
}
else
if
(
icon
.
contains
(
"11918332271"
))
{
icon
=
"新"
;
}
}
HotSearchList
hotSearchList
=
new
HotSearchList
(
tengxunUrl
,
name
,
count
,
false
,
rank
,
HotSearchType
.
腾讯新闻
.
name
(),
icon
);
list
.
add
(
hotSearchList
);
}
}
ZhiWeiTools
.
sleep
(
3000L
);
}
log
.
info
(
"{}, 此轮腾讯新闻热榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"腾讯新闻采集结束"
);
return
list
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
View file @
d00d9860
...
...
@@ -9,6 +9,8 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -37,34 +39,40 @@ public class ToutiaoHotSearchCrawler {
public
static
List
<
HotSearchList
>
toutiaoHotSearchByPhone
(){
String
origin
=
"hot_board"
;
String
jsUrl
=
"https://s3.pstatp.com/toutiao/feoffline/hot_list/resource/hot_list/js/index.45f50250.chunk.js"
;
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
jsUrl
)).
body
().
string
();
if
(
htmlBody
.
contains
(
"origin"
)){
String
s
=
htmlBody
.
substring
(
htmlBody
.
indexOf
(
"origin:"
)+
"origin:"
.
length
());
origin
=
s
.
substring
(
1
,
s
.
indexOf
(
"}"
)-
1
);
}
Request
jsRequest
=
RequestUtils
.
wrapGet
(
jsUrl
);
String
jsBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
jsRequest
))
{
jsBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"获取今日头条实时热搜头部信息标识失败"
,
e
);
}
for
(
int
count
=
0
;
count
<=
5
;
count
++){
if
(
jsBody
!=
null
&&
jsBody
.
contains
(
"origin"
)){
String
s
=
jsBody
.
substring
(
jsBody
.
indexOf
(
"origin:"
)+
"origin:"
.
length
());
origin
=
s
.
substring
(
1
,
s
.
indexOf
(
"}"
)-
1
);
}
//采集头条内容
String
url
=
"https://i.snssdk.com/hot-event/hot-board/?origin="
+
origin
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
);
headerMap
.
put
(
"referer"
,
"https://ib.snssdk.com/rogue/aladdin_landingpage/template/aladdin_landingpage/hot_words.html?isBrowser=true&traffic_source="
);
String
htmlBody
;
try
{
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
count
=
0
;
count
<=
5
;
count
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e1
)
{
log
.
error
(
"解析今日头条实时热搜时出现连接失败"
,
e1
);
}
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
)).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
try
{
JSONArray
words
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"data"
);
int
rank
=
1
;
for
(
int
i
=
0
;
i
<
words
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
words
.
size
();
i
++)
{
try
{
JSONObject
word
=
words
.
getJSONObject
(
i
);
String
name
=
word
.
getString
(
"Title"
);
String
link
=
"https://ib.snssdk.com/search/?keyword="
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&pd=synthesis&source=trending_list&traffic_source="
;
String
link
=
"https://ib.snssdk.com/search/?keyword="
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&pd=synthesis&source=trending_list&traffic_source="
;
Integer
hotCount
=
word
.
getInteger
(
"HotValue"
);
String
wordsType
=
word
.
getString
(
"Label"
);
String
icon
=
getIcon
(
wordsType
);
...
...
@@ -73,20 +81,18 @@ public class ToutiaoHotSearchCrawler {
result
.
add
(
hotSearch
);
rank
++;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析今日头条实时热搜时出现解析错误"
,
e
);
log
.
error
(
"解析今日头条实时热搜时出现解析错误"
,
e
);
continue
;
}
}
return
result
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析今日头条实时热搜时出现解析错误,数据不是json结构"
,
e
);
log
.
error
(
"解析今日头条实时热搜时出现解析错误,数据不是json结构"
,
e
);
}
}
else
{
}
else
{
log
.
info
(
"解析今日头条实时热搜时出现解析错误,页面结构有问题"
);
}
}
catch
(
IOException
e1
)
{
log
.
error
(
"解析今日头条实时热搜时出现连接失败"
,
e1
);
}
}
return
Collections
.
emptyList
();
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
d00d9860
...
...
@@ -4,6 +4,8 @@ import java.io.IOException;
import
java.util.*
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -44,9 +46,17 @@ public class WeiboHotSearchCrawler {
List
<
HotSearchList
>
list
=
new
ArrayList
<
HotSearchList
>();
for
(
int
i
=
0
;
i
<
3
;
i
++){
String
htmlBody
=
null
;
try
{
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"pl_top_realtimehot"
)){
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
if
(
i
==
2
){
return
list
;
}
else
{
continue
;
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"pl_top_realtimehot"
))
{
try
{
// String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0];
// script = script.replace("(", "").replace(")", "");
...
...
@@ -54,16 +64,16 @@ public class WeiboHotSearchCrawler {
// String html = json.getString("html");
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Elements
elements
=
document
.
select
(
"div#pl_top_realtimehot"
).
select
(
"tbody"
).
select
(
"tr"
);
for
(
Element
element
:
elements
)
{
for
(
Element
element
:
elements
)
{
try
{
String
id
=
"http://s.weibo.com"
+
element
.
select
(
"td.td-02"
).
select
(
"a"
).
attr
(
"href"
);
String
id
=
"http://s.weibo.com"
+
element
.
select
(
"td.td-02"
).
select
(
"a"
).
attr
(
"href"
);
String
name
=
element
.
select
(
"td.td-02"
).
select
(
"a"
).
text
();
String
num
=
!
element
.
select
(
"td.td-02"
).
select
(
"span"
).
text
().
equals
(
""
)?
element
.
select
(
"td.td-02"
).
select
(
"span"
).
text
():
"0"
;
String
rank
=
!
element
.
select
(
"td[class=\"td-01 ranktop\"]"
).
text
().
equals
(
""
)?
element
.
select
(
"td[class=\"td-01 ranktop\"]"
).
text
():
"-1"
;
String
num
=
!
element
.
select
(
"td.td-02"
).
select
(
"span"
).
text
().
equals
(
""
)
?
element
.
select
(
"td.td-02"
).
select
(
"span"
).
text
()
:
"0"
;
String
rank
=
!
element
.
select
(
"td[class=\"td-01 ranktop\"]"
).
text
().
equals
(
""
)
?
element
.
select
(
"td[class=\"td-01 ranktop\"]"
).
text
()
:
"-1"
;
int
hotCount
=
Integer
.
valueOf
(
num
);
int
rankCount
=
Integer
.
valueOf
(
rank
);
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
true
,
rankCount
,
HotSearchType
.
微博热搜
.
name
(),
null
);
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
true
,
rankCount
,
HotSearchType
.
微博热搜
.
name
(),
null
);
list
.
add
(
hotSearch
);
}
catch
(
Exception
e
)
{
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
...
...
@@ -72,24 +82,16 @@ public class WeiboHotSearchCrawler {
}
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
log
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
return
null
;
}
}
else
{
}
else
{
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
log
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
}
break
;
}
catch
(
Exception
e
)
{
if
(
i
==
2
){
return
list
;
}
else
{
continue
;
}
}
}
return
list
;
}
...
...
@@ -103,62 +105,62 @@ public class WeiboHotSearchCrawler {
* @return void 返回类型
*/
public
static
List
<
HotSearchList
>
weiboHotSearchByPhone
(){
for
(
int
count
=
0
;
count
<=
5
;
count
++){
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583"
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
);
String
htmlBody
;
try
{
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
count
=
0
;
count
<=
5
;
count
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e1
)
{
log
.
error
(
"解析微博时时热搜时出现连接失败"
,
e1
);
}
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
)).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
)){
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
JSONArray
cards
=
json
.
getJSONArray
(
"cards"
);
int
rank
=
0
;
for
(
int
i
=
0
;
i
<
cards
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
cards
.
size
();
i
++)
{
try
{
JSONObject
card
=
cards
.
getJSONObject
(
i
);
JSONArray
cardGroup
=
card
.
getJSONArray
(
"card_group"
);
if
(
Objects
.
nonNull
(
cardGroup
)
&&
!
cardGroup
.
isEmpty
())
{
if
(
Objects
.
nonNull
(
cardGroup
)
&&
!
cardGroup
.
isEmpty
())
{
String
title
=
card
.
getString
(
"title"
);
boolean
hot
=
true
;
if
(
Objects
.
nonNull
(
title
)
&&
title
.
contains
(
"实时上升热点"
))
{
if
(
Objects
.
nonNull
(
title
)
&&
title
.
contains
(
"实时上升热点"
))
{
hot
=
false
;
rank
=
51
;
}
for
(
int
j
=
0
;
j
<
cardGroup
.
size
();
j
++)
{
for
(
int
j
=
0
;
j
<
cardGroup
.
size
();
j
++)
{
JSONObject
cardInfo
=
cardGroup
.
getJSONObject
(
j
);
String
name
=
cardInfo
.
getString
(
"desc"
);
int
hotCount
=
cardInfo
.
getIntValue
(
"desc_extr"
);
String
icon
=
cardInfo
.
getString
(
"icon"
);
if
(
StringUtils
.
isNotBlank
(
icon
))
{
if
(
StringUtils
.
isNotBlank
(
icon
))
{
icon
=
icon
.
split
(
"_"
)[
1
].
split
(
".png"
)[
0
];
}
String
id
=
"http://s.weibo.com/weibo/"
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&Refer=top"
;
String
id
=
"http://s.weibo.com/weibo/"
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&Refer=top"
;
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微博热搜
.
name
(),
icon
);
result
.
add
(
hotSearch
);
rank
++;
}
}
else
{
}
else
{
log
.
info
(
"card 数据结构为:{}"
,
card
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
log
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
continue
;
}
}
return
result
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
);
log
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
);
}
}
else
{
}
else
{
log
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
}
}
catch
(
IOException
e1
)
{
log
.
error
(
"解析微博时时热搜时出现连接失败"
,
e1
);
}
}
return
Collections
.
emptyList
();
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboSuperTopicCrawler.java
View file @
d00d9860
...
...
@@ -10,6 +10,8 @@ import java.util.Objects;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -58,24 +60,24 @@ public class WeiboSuperTopicCrawler {
String
type
=
entry
.
getKey
();
for
(
int
page
=
1
;
page
<=
5
;
page
++)
{
String
pageUrl
=
url
+
"&page="
+
page
;
Request
request
=
RequestUtils
.
wrapGet
(
pageUrl
,
headMap
);
String
htmlBody
=
null
;
//重试三次
for
(
int
retryTimes
=
1
;
retryTimes
<=
3
;
retryTimes
++)
{
try
{
// System.out.println("pageUrl=========="+pageUrl);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
pageUrl
,
headMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"desc1"
))
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"获取榜单列表页面时出现错误,错误为:{}"
,
e
);
continue
;
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"desc1"
))
{
topicList
.
addAll
(
parseTopicRankHtml
(
page
,
htmlBody
,
type
));
break
;
}
else
{
}
else
{
log
.
error
(
"获取榜单列表页面时数据格式错误,页面为:{}"
,
htmlBody
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"获取榜单列表页面时出现错误,错误为:{}"
,
e
);
continue
;
}
}
}
}
return
topicList
;
}
...
...
@@ -136,23 +138,24 @@ public class WeiboSuperTopicCrawler {
*/
private
static
WeiboSuperTopic
getTopicInfo
(
String
id
,
WeiboSuperTopic
topic
)
{
for
(
int
retryTimes
=
1
;
retryTimes
<=
3
;
retryTimes
++)
{
try
{
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid="
+
id
;
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"desc_more"
))
{
Request
request
=
RequestUtils
.
wrapGet
(
url
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析榜单详情页面时出现错误,错误为:{}"
,
e
);
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"desc_more"
))
{
String
descMore
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONObject
(
"pageInfo"
).
getJSONArray
(
"desc_more"
).
getString
(
0
);
if
(
StringUtils
.
isNotBlank
(
descMore
))
{
if
(
StringUtils
.
isNotBlank
(
descMore
))
{
String
readNum
=
descMore
.
replaceAll
(
"阅读|帖子.*"
,
""
).
trim
();
String
postNum
=
descMore
.
replaceAll
(
".*帖子|粉丝.*"
,
""
).
trim
();
topic
.
setPostNum
(
postNum
);
topic
.
setReadNum
(
readNum
);
return
topic
;
}
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析榜单详情页面时出现错误,错误为:{}"
,
e
);
}
}
return
topic
;
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
View file @
d00d9860
...
...
@@ -11,6 +11,8 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Element
;
...
...
@@ -131,21 +133,23 @@ public class WeiboTopicCrawler {
List
<
HotSearchList
>
topicList
=
new
ArrayList
<>();
for
(
int
page
=
1
;
page
<=
6
;
page
++){
String
pageUrl
=
"https://api.weibo.cn/2/page?gsid=_2A25zJX_EDeRxGedH71YS8CzKzzmIHXVuc_QMrDV6PUJbkdANLXPbkWpNUK3OyitGCJsX8exvua-vfubUqCiaA4lb&from=10A1193010&c=iphone&s=2827eebe&count=20&containerid=106003type%253D25%2526t%253D3%2526disable_hot%253D1%2526filter_type%253Dtopicscene&page="
+
page
;
Request
request
=
RequestUtils
.
wrapGet
(
pageUrl
);
String
htmlBody
=
null
;
//重试三次
for
(
int
retryTimes
=
1
;
retryTimes
<=
5
;
retryTimes
++)
{
try
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
// log.info("pageUrl::{}", pageUrl);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
pageUrl
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
topicList
.
addAll
(
parseTopicHtml
(
htmlBody
));
break
;
}
else
{
log
.
info
(
"下载榜单列表页面时数据格式错误,页面为:{}"
,
htmlBody
);
}
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"下载榜单列表页面时出现错误,错误为:{}"
,
e
);
continue
;
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
topicList
.
addAll
(
parseTopicHtml
(
htmlBody
));
break
;
}
else
{
log
.
info
(
"下载榜单列表页面时数据格式错误,页面为:{}"
,
htmlBody
);
}
}
}
return
topicList
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuChildHotSearchCrawler.java
0 → 100644
View file @
d00d9860
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
@Log4j2
public
class
ZhihuChildHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* 知乎子级分类数据采集
* @param type
* @param typeName
* @return
*/
public
static
List
<
HotSearchList
>
getZhihuTopicSearch
(
String
type
,
String
typeName
)
{
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
String
url
=
"https://www.zhihu.com/api/v3/feed/topstory/hot-lists/"
+
type
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"x-api-version"
,
"3.0.76"
);
JSONArray
dataJson
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
//采集为空最多重试3次
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
dataJson
=
topSearch
.
getJSONArray
(
"data"
);
for
(
int
i
=
0
;
i
<
dataJson
.
size
();
i
++)
{
JSONObject
jsonObject
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"target"
);
Integer
rank
=
i
+
1
;
String
name
=
jsonObject
.
getJSONObject
(
"title_area"
).
getString
(
"text"
);
String
hotCountString
=
jsonObject
.
getJSONObject
(
"metrics_area"
).
getString
(
"text"
);
Integer
count
=
getHotCount
(
hotCountString
);
String
childUrl
=
jsonObject
.
getJSONObject
(
"link"
).
getString
(
"url"
);
HotSearchList
hotSearchList
=
new
HotSearchList
(
childUrl
,
name
,
count
,
rank
,
HotSearchType
.
知乎热搜
.
name
()
+
typeName
+
"分类"
);
list
.
add
(
hotSearchList
);
}
}
ZhiWeiTools
.
sleep
(
3000L
);
}
return
list
;
}
/**
* 截取出热度值
* @param hotCountString
* @return
*/
private
static
Integer
getHotCount
(
String
hotCountString
){
Integer
count
;
if
(
hotCountString
.
contains
(
"万"
)){
hotCountString
=
hotCountString
.
replaceAll
(
"万.*"
,
""
).
trim
();
count
=
(
int
)(
Double
.
parseDouble
(
hotCountString
)*
10000
);
}
else
if
(
hotCountString
.
contains
(
"亿"
)){
hotCountString
=
hotCountString
.
replaceAll
(
"亿.*"
,
""
).
trim
();
count
=
(
int
)(
Double
.
parseDouble
(
hotCountString
)*
10000000
);
}
else
{
count
=
Integer
.
getInteger
(
hotCountString
.
substring
(
0
,
hotCountString
.
indexOf
(
"领域热度"
)));
}
return
count
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
View file @
d00d9860
...
...
@@ -6,6 +6,8 @@ import java.util.List;
import
java.util.Map
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -46,9 +48,14 @@ public class ZhihuHotSearchCrawler {
headerMap
.
put
(
"accept"
,
"application/json, text/plain, */*"
);
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
headerMap
.
put
(
"Referer"
,
rerferer
);
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"words"
)){
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"words"
))
{
list
=
new
ArrayList
<>();
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
words
=
topSearch
.
getJSONObject
(
"top_search"
).
getJSONArray
(
"words"
);
...
...
@@ -59,15 +66,11 @@ public class ZhihuHotSearchCrawler {
JSONObject
word
=
words
.
getJSONObject
(
i
);
query
=
word
.
getString
(
"query"
);
displayQuery
=
word
.
getString
(
"display_query"
);
link
=
"https://www.zhihu.com/search?q="
+
URLCodeUtil
.
getURLEncode
(
query
,
"utf-8"
)+
"&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content"
;
link
=
"https://www.zhihu.com/search?q="
+
URLCodeUtil
.
getURLEncode
(
query
,
"utf-8"
)
+
"&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content"
;
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
null
,
i
,
HotSearchType
.
知乎热搜
.
name
());
list
.
add
(
zhihu
);
}
}
}
catch
(
IOException
e
)
{
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
return
list
;
}
return
list
;
}
...
...
@@ -81,7 +84,7 @@ public class ZhihuHotSearchCrawler {
* @return List<ZhihuHotSearch> 返回类型
*/
public
static
List
<
HotSearchList
>
getMobileZhihuHotList
(){
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
;
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
String
url
=
"https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0"
;
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"api.zhihu.com"
);
...
...
@@ -89,10 +92,15 @@ public class ZhihuHotSearchCrawler {
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
);
headerMap
.
put
(
"X-UDID"
,
"AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8="
);
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"author"
)){
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
return
list
;
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"author"
))
{
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
dataJson
=
topSearch
.
getJSONArray
(
"data"
);
String
link
=
null
;
...
...
@@ -107,26 +115,22 @@ public class ZhihuHotSearchCrawler {
//计算热度
try
{
if
(
hotText
.
contains
(
"万"
))
{
if
(
hotText
.
contains
(
"万"
))
{
hotText
=
hotText
.
replaceAll
(
"万.*"
,
""
).
trim
();
hotCount
=
(
int
)(
Double
.
parseDouble
(
hotText
)*
10000
);
}
else
if
(
hotText
.
contains
(
"亿"
))
{
hotCount
=
(
int
)
(
Double
.
parseDouble
(
hotText
)
*
10000
);
}
else
if
(
hotText
.
contains
(
"亿"
))
{
hotText
=
hotText
.
replaceAll
(
"亿.*"
,
""
).
trim
();
hotCount
=
(
int
)(
Double
.
parseDouble
(
hotText
)*
10000000
);
}
else
{
hotCount
=
(
int
)
(
Double
.
parseDouble
(
hotText
)
*
10000000
);
}
else
{
hotCount
=
Integer
.
getInteger
(
hotText
);
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
hotCount
,
i
+
1
,
HotSearchType
.
知乎热搜
.
name
());
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
hotCount
,
i
+
1
,
HotSearchType
.
知乎热搜
.
name
());
list
.
add
(
zhihu
);
}
}
}
catch
(
IOException
e
)
{
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
return
list
;
}
return
list
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuTopicSearchCrawler.java
View file @
d00d9860
...
...
@@ -10,6 +10,8 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.select.Elements
;
...
...
@@ -30,18 +32,19 @@ public class ZhihuTopicSearchCrawler {
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
String
url
=
"https://www.zhihu.com/topsearch"
;
JSONObject
jsonObject
=
null
;
try
{
for
(
int
t
=
0
;
t
<
3
&&
jsonObject
==
null
;
t
++)
{
// ZhiWeiTools.sleep(10000L);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
// log.info("页面内容获取:{}",htmlBody);
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
t
=
0
;
t
<
3
&&
jsonObject
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"知乎热搜页面连接异常"
,
e
);
}
if
(
htmlBody
!=
null
)
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
String
html
=
document
.
getElementsByTag
(
"script"
).
select
(
"#js-initialData"
).
html
();
jsonObject
=
JSONObject
.
parseObject
(
html
);
}
if
(
jsonObject
!=
null
)
{
if
(
jsonObject
!=
null
)
{
JSONArray
dataJson
=
jsonObject
.
getJSONObject
(
"initialState"
).
getJSONObject
(
"topsearch"
).
getJSONArray
(
"data"
);
for
(
int
i
=
0
;
i
<
dataJson
.
size
();
i
++)
{
Integer
rank
=
i
+
1
;
...
...
@@ -53,12 +56,10 @@ public class ZhihuTopicSearchCrawler {
list
.
add
(
hotSearchList
);
}
return
list
;
}
else
{
log
.
error
(
"知乎热搜榜单页面获取异常,404"
);
log
.
error
(
jsonObject
);
}
}
catch
(
IOException
e
)
{
log
.
error
(
"知乎热搜获取异常"
,
e
);
}
else
{
log
.
error
(
"知乎热搜榜单页面获取异常"
);
}
}
return
Collections
.
emptyList
();
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchListDAO.java
View file @
d00d9860
...
...
@@ -6,10 +6,9 @@ import java.util.Date;
import
java.util.List
;
import
java.util.Objects
;
import
com.mongodb.client.ListIndexesIterable
;
import
com.mongodb.client.MongoCollection
;
import
com.mongodb.client.MongoDatabase
;
import
com.mongodb.client.*
;
import
com.mongodb.client.model.IndexOptions
;
import
com.mongodb.client.model.Sorts
;
import
com.zhiwei.searchhotcrawler.config.DBConfig
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
lombok.extern.log4j.Log4j2
;
...
...
@@ -65,4 +64,19 @@ public class HotSearchListDAO{
}
}
/**
 * Looks up the {@code time} field of the newest document stored for a type.
 *
 * <p>Documents are filtered by their {@code type} field and sorted by
 * {@code time} descending; only the first one is read.
 *
 * @param type value matched against the {@code type} field
 * @return the {@code time} of the newest matching document, or {@code null}
 *         when nothing matches or the query fails (the failure is logged)
 */
public Date getLastTimeByType(String type) {
    BasicDBObject query = new BasicDBObject();
    query.put("type", type);
    // try-with-resources closes the cursor on every path; it was previously leaked.
    try (MongoCursor<Document> cursor = mongoCollection.find(query)
            .sort(Sorts.orderBy(Sorts.descending("time")))
            .limit(1)
            .iterator()) {
        if (cursor.hasNext()) {
            return (Date) cursor.next().get("time");
        }
    } catch (Exception e) {
        log.error("查询数据时出错,错误为:{}", e);
    }
    return null;
}
}
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
View file @
d00d9860
...
...
@@ -52,5 +52,7 @@ public class HotSearchRun {
new
WeiboTopicRun
().
start
();
new
ToutiaoHotSearchRun
().
start
();
new
ZhihuTopSearchRun
().
start
();
new
ZhihuChildHotSearchRun
().
start
();
new
ThreadOneRun
().
start
();
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
View file @
d00d9860
...
...
@@ -7,6 +7,7 @@ import java.util.Objects;
import
java.util.concurrent.TimeUnit
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
import
org.slf4j.Logger
;
...
...
@@ -47,6 +48,8 @@ public class BaiduHotSearchRun extends Thread{
if
(
Objects
.
nonNull
(
list
)
&&
!
list
.
isEmpty
())
{
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
}
else
{
TipsUtils
.
sendTips
(
"百度热搜"
,
new
Date
());
}
log
.
info
(
"百度风云榜采集结束........"
);
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
View file @
d00d9860
...
...
@@ -6,6 +6,7 @@ import java.util.List;
import
java.util.concurrent.TimeUnit
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
import
org.slf4j.Logger
;
...
...
@@ -47,6 +48,9 @@ public class DouyinHotSearchRun extends Thread{
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
List
<
HotSearchList
>
list
=
DouyinHotSearchCrawler
.
getMobileDouyinHotList
();
log
.
info
(
"{}, 抖音热搜榜此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
if
(
list
==
null
||
list
.
size
()
==
0
){
TipsUtils
.
sendTips
(
"抖音热搜"
,
new
Date
());
}
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
log
.
info
(
"抖音热搜榜采集结束........"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
View file @
d00d9860
...
...
@@ -7,6 +7,7 @@ import java.util.concurrent.TimeUnit;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
import
org.slf4j.Logger
;
...
...
@@ -43,6 +44,9 @@ public class SougoHotSearchRun extends Thread {
log
.
info
(
"搜狗微信采集开始........"
);
List
<
HotSearchList
>
list
=
SougoHotSearchCrawler
.
sougoHotSearch
();
log
.
info
(
"{}, 此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
if
(
list
==
null
||
list
.
size
()
==
0
){
TipsUtils
.
sendTips
(
"搜狗微信热搜"
,
new
Date
());
}
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
log
.
info
(
"搜狗微信采集结束........"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/ThreadOneRun.java
0 → 100644
View file @
d00d9860
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.crawler.TengXunCrawler
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
@Log4j2
public
class
ThreadOneRun
extends
Thread
{
@Override
public
void
run
()
{
boolean
f
=
true
;
while
(
f
)
{
try
{
getHotList
();
TimeUnit
.
MINUTES
.
sleep
(
1
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
1000
);
}
ZhiWeiTools
.
sleep
(
50
);
}
}
private
void
getHotList
(){
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
List
<
HotSearchList
>
list
=
TengXunCrawler
.
getTengXunHotList
();
if
(
list
==
null
||
list
.
size
()
==
0
){
TipsUtils
.
sendTips
(
"腾讯新闻"
,
new
Date
());
}
else
{
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/ToutiaoHotSearchRun.java
View file @
d00d9860
...
...
@@ -5,6 +5,7 @@ import com.zhiwei.searchhotcrawler.crawler.ToutiaoHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
...
...
@@ -39,6 +40,9 @@ public class ToutiaoHotSearchRun extends Thread{
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
List
<
HotSearchList
>
list
=
ToutiaoHotSearchCrawler
.
toutiaoHotSearchByPhone
();
log
.
info
(
"{}, 今日头条此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
if
(
list
==
null
||
list
.
size
()
==
0
){
TipsUtils
.
sendTips
(
"今日头条热搜"
,
new
Date
());
}
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
log
.
info
(
"今日头条热搜采集结束........"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboHotSearchRun.java
View file @
d00d9860
...
...
@@ -6,6 +6,7 @@ import java.util.List;
import
java.util.concurrent.TimeUnit
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler
;
...
...
@@ -37,6 +38,9 @@ public class WeiboHotSearchRun extends Thread{
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
List
<
HotSearchList
>
list
=
WeiboHotSearchCrawler
.
weiboHotSearchByPhone
();
log
.
info
(
"{}, 微博此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
if
(
list
==
null
||
list
.
size
()
==
0
){
TipsUtils
.
sendTips
(
"微博热搜"
,
new
Date
());
}
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboTopicRun.java
View file @
d00d9860
...
...
@@ -4,6 +4,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import
com.zhiwei.searchhotcrawler.crawler.WeiboTopicCrawler
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
...
...
@@ -37,6 +38,9 @@ public class WeiboTopicRun extends Thread{
log
.
info
(
"微博话题采集开始........"
);
List
<
HotSearchList
>
list
=
WeiboTopicCrawler
.
startCrawlerByPhone
();
log
.
info
(
"{}, 微博话题此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
if
(
list
==
null
||
list
.
size
()
==
0
){
TipsUtils
.
sendTips
(
"微博话题"
,
new
Date
());
}
List
<
Document
>
data
=
new
ArrayList
<>();
for
(
HotSearchList
topic
:
list
){
Document
doc
=
new
Document
();
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuChildHotSearchRun.java
0 → 100644
View file @
d00d9860
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.crawler.ZhihuChildHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
import
java.util.Arrays
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
@Log4j2
public
class
ZhihuChildHotSearchRun
extends
Thread
{
private
List
<
String
>
childType
=
Arrays
.
asList
(
"digital"
,
"focus"
,
"depth"
);
@Override
public
void
run
()
{
boolean
f
=
true
;
while
(
f
)
{
try
{
getHotList
();
TimeUnit
.
MINUTES
.
sleep
(
1
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
}
ZhiWeiTools
.
sleep
(
50
);
}
}
private
void
getHotList
()
{
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
for
(
int
i
=
0
;
i
<
childType
.
size
();
i
++)
{
String
name
=
this
.
getTypeName
(
childType
.
get
(
i
));
if
(!
""
.
equals
(
name
))
{
log
.
info
(
"知乎{}话题热榜采集开始..."
,
name
);
List
<
HotSearchList
>
list
=
ZhihuChildHotSearchCrawler
.
getZhihuTopicSearch
(
childType
.
get
(
i
),
name
);
log
.
info
(
"{}, 知乎{}话题此轮采集到的数据量为:{}"
,
new
Date
(),
name
,
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
if
(
list
==
null
||
list
.
size
()
==
0
)
{
TipsUtils
.
sendTips
(
"知乎热搜"
+
name
+
"分类"
,
new
Date
());
}
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
log
.
info
(
"知乎{}话题热榜采集结束..."
,
name
);
ZhiWeiTools
.
sleep
(
3000
);
}
}
}
private
String
getTypeName
(
String
type
){
String
name
;
switch
(
type
)
{
case
"digital"
:
name
=
"数码"
;
break
;
case
"focus"
:
name
=
"国际"
;
break
;
case
"depth"
:
name
=
"时事"
;
break
;
default
:
name
=
""
;
}
return
name
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
View file @
d00d9860
...
...
@@ -6,6 +6,7 @@ import java.util.List;
import
java.util.concurrent.TimeUnit
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
import
org.slf4j.Logger
;
...
...
@@ -44,6 +45,9 @@ public class ZhihuHotSearchRun extends Thread{
// List<HotSearchList> list = ZhihuHotSearchCrawler.getZhihuHotList();
List
<
HotSearchList
>
list
=
ZhihuHotSearchCrawler
.
getMobileZhihuHotList
();
log
.
info
(
"{}, 知乎此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
if
(
list
==
null
||
list
.
size
()
==
0
){
TipsUtils
.
sendTips
(
"知乎热搜"
,
new
Date
());
}
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
log
.
info
(
"知乎话题采集结束........"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuTopSearchRun.java
View file @
d00d9860
...
...
@@ -4,6 +4,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import
com.zhiwei.searchhotcrawler.crawler.ZhihuTopicSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
...
...
@@ -36,6 +37,9 @@ public class ZhihuTopSearchRun extends Thread {
log
.
info
(
"知乎热搜采集开始...,当前线程名字:{}"
,
Thread
.
currentThread
().
getName
());
List
<
HotSearchList
>
list
=
ZhihuTopicSearchCrawler
.
getZhihuTopicSearch
();
log
.
info
(
"{}, 知乎热搜此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
if
(
list
==
null
||
list
.
size
()
==
0
){
TipsUtils
.
sendTips
(
"知乎热搜榜单"
,
new
Date
());
}
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
log
.
info
(
"知乎热搜话题采集结束........"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/util/HttpClientUtils.java
0 → 100644
View file @
d00d9860
package
com
.
zhiwei
.
searchhotcrawler
.
util
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
okhttp3.MediaType
;
import
okhttp3.Request
;
import
okhttp3.RequestBody
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
java.io.IOException
;
import
java.nio.charset.Charset
;
import
java.util.Map
;
/**
 * Helper for issuing HTTP requests through a shared {@link HttpBoot} client
 * (built with {@code throwException(false)} and {@code retryTimes(2)}).
 */
public final class HttpClientUtils {

    private static final Logger LOGGER = LogManager.getLogger(HttpClientUtils.class);

    private static final String NAME_VALUE_SEPARATOR = "=";

    private static final String QUERY_PARAM_SEP = "&";

    private static final String URL_QUERY_PARAM_SEPARATOR = "?";

    private static final HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(2).build();

    /**
     * Posts a JSON body without extra headers.
     *
     * @param url       target URL
     * @param jsonParam JSON text sent as the request body
     * @return the response body, or {@code null} when the request could not be made
     */
    public static String sendPost(String url, String jsonParam) {
        return sendPost(url, jsonParam, null, Charset.forName("UTF-8"));
    }

    /**
     * Posts a JSON body with the media type {@code application/json}.
     *
     * @param url       target URL; an empty or {@code null} value is rejected
     * @param jsonParam JSON text sent as the request body
     * @param headers   extra request headers, may be {@code null}
     * @param charset   currently not used when building the request
     * @return the response body, or {@code null} when the URL is empty, no
     *         response/body is available, or an I/O error occurs (logged)
     */
    public static String sendPost(String url, String jsonParam, Map<String, String> headers, final Charset charset) {
        if (StringUtils.isEmpty(url)) {
            LOGGER.error("URL can not be empty or null.");
            // Previously execution continued and tried to build a request anyway.
            return null;
        }
        LOGGER.debug("Post Request:{}", url);

        Request request = RequestUtils.wrapPost(url, headers,
                RequestBody.create(MediaType.get("application/json"), jsonParam));
        try (Response response = httpBoot.syncCall(request)) {
            // NOTE(review): with throwException(false) the client presumably may
            // return null instead of throwing - confirm; guarded either way.
            if (response == null || response.body() == null) {
                LOGGER.error("http response is empty, url:{}", url);
                return null;
            }
            return response.body().string();
        } catch (IOException e) {
            LOGGER.error("http connection error :" + e.getMessage(), e);
            return null;
        }
    }
}
src/main/java/com/zhiwei/searchhotcrawler/util/QYWechatUtil.java
0 → 100644
View file @
d00d9860
package
com
.
zhiwei
.
searchhotcrawler
.
util
;
import
com.alibaba.fastjson.JSONObject
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
/**
 * Push utility for WeCom (Enterprise WeChat) group robots.
 *
 * @ClassName: QYWechatUtil
 * @author: 陈炜涛
 * @date: 2019-07-17 14:33:12
 *
 * @Copyright: 2019 www.zhiweidata.com
 */
public class QYWechatUtil {

    /** Webhook endpoint; the robot key is appended to it. */
    private static final String SEND_URL = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=";

    /** Markdown message type. */
    public static final String MSGTYPE_MARKDOWN = "markdown";
    /** Plain text message type. */
    public static final String MSGTYPE_TEXT = "text";
    /** Image message type; needs its own payload wrapper. */
    public static final String MSGTYPE_IMAGE = "image";
    /** News (rich link) message type; needs its own payload wrapper. */
    public static final String MSGTYPE_NEWS = "news";

    /**
     * Sends a message to the robot identified by {@code key}.
     *
     * <p>The message body is built as a map so that the mention fields are
     * emitted under the snake_case names {@code mentioned_list} and
     * {@code mentioned_mobile_list}; serializing a bean with camelCase getters
     * produced {@code mentionedList}/{@code mentionedMobileList} instead.
     *
     * @param key                 robot key appended to the webhook URL
     * @param msgtype             message type; {@code null} or empty falls back to {@link #MSGTYPE_TEXT}
     * @param content             message content
     * @param mentionedList       ids of the members to mention, may be {@code null}
     * @param mentionedMobileList mobile numbers of the members to mention, may be {@code null}
     * @return whatever {@link HttpClientUtils#sendPost(String, String)} returns
     */
    public static String send(String key, String msgtype, String content, List<String> mentionedList,
            List<String> mentionedMobileList) {
        String effectiveType = (msgtype == null || msgtype.isEmpty()) ? MSGTYPE_TEXT : msgtype;

        // Absent values are left out, as they were when null bean properties were skipped.
        Map<String, Object> body = new HashMap<>();
        if (content != null) {
            body.put("content", content);
        }
        if (mentionedList != null) {
            body.put("mentioned_list", mentionedList);
        }
        if (mentionedMobileList != null) {
            body.put("mentioned_mobile_list", mentionedMobileList);
        }

        Map<String, Object> dataMap = new HashMap<>();
        dataMap.put("msgtype", effectiveType);
        dataMap.put(effectiveType, body);
        return HttpClientUtils.sendPost(SEND_URL + key, JSONObject.toJSONString(dataMap));
    }
}
/**
 * Package-private holder for a text message body: the content plus the
 * members to mention.
 *
 * @ClassName: Body
 * @author: 陈炜涛
 * @date: 2019-07-17 14:50:19
 *
 * @Copyright: 2019 www.zhiweidata.com
 */
class TextBody {

    /** Message content. */
    private String content;

    /** Ids of the members to notify. */
    private List<String> mentionedList;

    /** Mobile numbers of the members to notify. */
    private List<String> mentionedMobileList;

    /** Creates an empty body; every field starts as {@code null}. */
    public TextBody() {
    }

    /**
     * Creates a fully populated body.
     *
     * @param content             message content
     * @param mentionedList       ids of the members to notify
     * @param mentionedMobileList mobile numbers of the members to notify
     */
    public TextBody(String content, List<String> mentionedList, List<String> mentionedMobileList) {
        setContent(content);
        setMentionedList(mentionedList);
        setMentionedMobileList(mentionedMobileList);
    }

    public String getContent() {
        return this.content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public List<String> getMentionedList() {
        return this.mentionedList;
    }

    public void setMentionedList(List<String> mentionedList) {
        this.mentionedList = mentionedList;
    }

    public List<String> getMentionedMobileList() {
        return this.mentionedMobileList;
    }

    public void setMentionedMobileList(List<String> mentionedMobileList) {
        this.mentionedMobileList = mentionedMobileList;
    }

    /** Renders all three fields in a bracketed, comma-separated form. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("TextBody [content=");
        text.append(content);
        text.append(", mentionedList=").append(mentionedList);
        text.append(", mentionedMobileList=").append(mentionedMobileList);
        return text.append("]").toString();
    }
}
src/main/java/com/zhiwei/searchhotcrawler/util/TipsUtils.java
0 → 100644
View file @
d00d9860
package
com
.
zhiwei
.
searchhotcrawler
.
util
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
java.util.Date
;
/**
 * Sends an alert when a crawler has returned no data for too long.
 */
public class TipsUtils {

    /** Gap in milliseconds (5 minutes) that must be exceeded before alerting. */
    private static final long TIME_DIFFERENCE = 5 * 60 * 1000L;

    // NOTE(review): the robot key is hard-coded in source; consider moving it to configuration.
    private static final String KEY = "a8e26ce3-8aaa-4d3e-bcf6-30b81526050b";

    private static final Logger logger = LoggerFactory.getLogger(TipsUtils.class);

    /**
     * Alerts when the newest stored record of {@code type} is more than five
     * minutes older than {@code time}.
     *
     * <p>The newest record time comes from
     * {@link HotSearchListDAO#getLastTimeByType(String)}. When that lookup
     * yields {@code null} the elapsed time cannot be computed, so a warning is
     * logged and no alert is sent (this used to fail with a
     * {@link NullPointerException}).
     *
     * @param type source name, used both for the lookup and in the alert text
     * @param time moment the empty crawl was observed
     */
    public static void sendTips(String type, Date time) {
        HotSearchListDAO hotSearchListDAO = new HotSearchListDAO();
        Date lastTime = hotSearchListDAO.getLastTimeByType(type);
        if (lastTime == null) {
            logger.warn("未查询到{}的最近采集时间,跳过预警", type);
            return;
        }
        long elapsedMillis = time.getTime() - lastTime.getTime();
        if (elapsedMillis > TIME_DIFFERENCE) {
            String crawlerContent = String.format("%s已经连续%s分钟未采集到数据", type, elapsedMillis / 1000 / 60);
            QYWechatUtil.send(KEY, QYWechatUtil.MSGTYPE_TEXT, crawlerContent, null, null);
        }
    }
}
src/main/java/com/zhiwei/searchhotcrawler/util/WechatCodeUtil.java
View file @
d00d9860
...
...
@@ -5,6 +5,8 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -35,20 +37,22 @@ public class WechatCodeUtil {
String
jmAppId
=
AESUtils
.
encrypt
(
"wechat"
,
appId
);
String
url
=
"http://yuqing.zhiweidata.com/WechatPublic/common/getToken?appId="
+
jmAppId
;
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
try
{
String
result
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
)).
body
().
string
();
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
result
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
result
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
logger
.
error
(
"获取微信公众号推送token失败,问题为:::{}"
,
e
.
fillInStackTrace
());
return
null
;
}
if
(
result
!=
null
)
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
result
);
if
(
jsonObject
.
containsKey
(
"data"
))
{
if
(
jsonObject
.
containsKey
(
"data"
))
{
JSONObject
inJson
=
JSONObject
.
parseObject
(
jsonObject
.
getString
(
"data"
));
token
=
inJson
.
getString
(
"accessToken"
);
}
}
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
logger
.
error
(
"获取微信公众号推送token失败,问题为:::{}"
,
e
.
fillInStackTrace
());
return
null
;
}
return
token
;
}
...
...
@@ -65,24 +69,26 @@ public class WechatCodeUtil {
public
static
int
sendDataJson
(
JSONObject
templateJson
)
{
int
msgid
=
0
;
String
url
=
WechatConstant
.
WECHAT_TEMPLET_SEND_URL
.
replace
(
"ACCESS_TOKEN"
,
getToken
());
try
{
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
templateJson
.
toJSONString
());
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
requestBody
)).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
Request
request
=
RequestUtils
.
wrapPost
(
url
,
requestBody
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"消息推送失败,错误为::{}"
,
e
.
fillInStackTrace
());
msgid
=
0
;
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
if
(
null
!=
jsonObject
)
{
if
(
"ok"
.
equals
(
jsonObject
.
getString
(
"errmsg"
)))
{
msgid
=
jsonObject
.
getIntValue
(
"msgid"
);
}
else
{
}
else
{
msgid
=
0
;
logger
.
info
(
"消息推送失败,错误为::{}"
,
jsonObject
.
toString
());
logger
.
info
(
"消息推送失败,错误为::{}"
,
jsonObject
.
toString
());
}
}
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"消息推送失败,错误为::{}"
,
e
.
fillInStackTrace
());
msgid
=
0
;
}
return
msgid
;
}
...
...
@@ -106,7 +112,14 @@ public class WechatCodeUtil {
postData
.
put
(
"tagid"
,
getGroupIp
(
groupName
));
postData
.
put
(
"next_openid"
,
""
);
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
postData
.
toJSONString
());
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
requestBody
)).
body
().
string
();
Request
request
=
RequestUtils
.
wrapPost
(
url
,
requestBody
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
){
logger
.
error
(
"页面连接获取失败"
,
e
);
return
null
;
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
if
(
null
!=
jsonObject
)
{
...
...
@@ -120,7 +133,6 @@ public class WechatCodeUtil {
}
else
{
logger
.
info
(
"token 获取失败"
);
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
null
;
...
...
@@ -139,7 +151,14 @@ public class WechatCodeUtil {
postData
.
put
(
"tagid"
,
groupId
);
postData
.
put
(
"next_openid"
,
""
);
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
postData
.
toJSONString
());
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
requestBody
)).
body
().
string
();
Request
request
=
RequestUtils
.
wrapPost
(
url
,
requestBody
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
)){
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
){
logger
.
error
(
"页面链接获取失败"
,
e
);
return
null
;
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
if
(
null
!=
jsonObject
)
{
...
...
@@ -175,10 +194,16 @@ public class WechatCodeUtil {
String
url
=
"https://api.weixin.qq.com/cgi-bin/tags/get?access_token="
+
getToken
();
Integer
groupId
=
null
;
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
)).
body
().
string
();
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
logger
.
error
(
"获取分组id时出现错误"
,
e
.
fillInStackTrace
());
return
null
;
}
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
.
contains
(
"tags"
))
{
if
(
htmlBody
.
contains
(
"tags"
))
{
JSONArray
jsonArry
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"tags"
);
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
...
...
@@ -191,10 +216,6 @@ public class WechatCodeUtil {
}
}
}
}
catch
(
IOException
e
)
{
logger
.
error
(
"获取分组id时出现错误"
,
e
.
fillInStackTrace
());
return
null
;
}
return
groupId
;
}
...
...
@@ -206,10 +227,16 @@ public class WechatCodeUtil {
String
url
=
"https://api.weixin.qq.com/cgi-bin/tags/get?access_token="
+
getToken
();
Map
<
String
,
Integer
>
resultMap
=
new
HashMap
<
String
,
Integer
>();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
)).
body
().
string
();
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
logger
.
error
(
"获取分组id时出现错误"
,
e
.
fillInStackTrace
());
return
null
;
}
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
.
contains
(
"tags"
))
{
if
(
htmlBody
.
contains
(
"tags"
))
{
JSONArray
jsonArry
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"tags"
);
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
...
...
@@ -217,14 +244,10 @@ public class WechatCodeUtil {
String
name
=
data
.
getString
(
"name"
);
resultMap
.
put
(
name
,
id
);
}
}
else
{
}
else
{
logger
.
info
(
"获取分组id时出现错误,数据为:::{}"
,
htmlBody
);
}
}
}
catch
(
IOException
e
)
{
logger
.
error
(
"获取分组id时出现错误"
,
e
.
fillInStackTrace
());
return
null
;
}
return
resultMap
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment