Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
c209c204
Commit
c209c204
authored
Jul 30, 2020
by
马黎滨
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
HttpBoot采集修改和腾讯新闻热榜,知乎子级分类热榜采集
parent
fb89c7b3
Show whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
525 additions
and
185 deletions
+525
-185
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
+10
-6
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
+13
-9
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
+13
-9
src/main/java/com/zhiwei/searchhotcrawler/crawler/TengXunCrawler.java
+80
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
+26
-20
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+39
-37
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboSuperTopicCrawler.java
+21
-18
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
+12
-8
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuChildHotSearchCrawler.java
+86
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
+28
-24
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuTopicSearchCrawler.java
+15
-14
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
+2
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/ThreadOneRun.java
+44
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuChildHotSearchRun.java
+73
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
+1
-1
src/main/java/com/zhiwei/searchhotcrawler/util/TipsUtils.java
+2
-3
src/main/java/com/zhiwei/searchhotcrawler/util/WechatCodeUtil.java
+58
-35
No files found.
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
c209c204
...
...
@@ -8,5 +8,6 @@ public enum HotSearchType {
搜狗微信热搜
,
微博话题
,
今日头条热搜
,
知乎热搜榜单
知乎热搜榜单
,
腾讯新闻
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
View file @
c209c204
...
...
@@ -7,6 +7,8 @@ import java.util.List;
import
java.util.Objects
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -29,7 +31,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
@Log4j2
public
class
BaiDuHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
* @Title: BaiDuHotSearchTest
...
...
@@ -39,16 +41,18 @@ public class BaiDuHotSearchCrawler {
*/
public
static
List
<
HotSearchList
>
baiduHotSearch
()
{
String
url
=
"http://top.baidu.com/buzz?b=1&fr=topindex"
;
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"mainBody"
))
{
return
ansysData
(
htmlBody
);
}
else
{
log
.
info
(
"解析百度风云榜时出现解析错误,页面结构有问题"
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
e
);
}
return
Collections
.
emptyList
();
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
View file @
c209c204
...
...
@@ -5,6 +5,8 @@ import java.util.ArrayList;
import
java.util.List
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -28,7 +30,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
@Log4j2
public
class
DouyinHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
build
();
/**
* @Title: getMobileDouyinHotList
...
...
@@ -40,9 +42,14 @@ public class DouyinHotSearchCrawler {
public
static
List
<
HotSearchList
>
getMobileDouyinHotList
(){
List
<
HotSearchList
>
list
=
null
;
String
url
=
"https://api.amemv.com/aweme/v1/hot/search/list/"
;
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"word_list"
)){
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
debug
(
"获取抖音热搜榜时出现问题:{}"
,
e
);
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"word_list"
))
{
list
=
new
ArrayList
<>();
JSONObject
data
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
wordList
=
data
.
getJSONObject
(
"data"
).
getJSONArray
(
"word_list"
);
...
...
@@ -58,17 +65,14 @@ public class DouyinHotSearchCrawler {
//获取关键词
word
=
wl
.
getString
(
"word"
);
//获取热度值
hotValueStr
=
wl
.
getString
(
"hot_value"
);
hotValueStr
=
wl
.
getString
(
"hot_value"
);
Integer
hotValue
=
null
;
hotValue
=
Integer
.
valueOf
(
hotValueStr
);
// logger.info("热度为:::{}", hot_value);
HotSearchList
douyin
=
new
HotSearchList
(
null
,
word
,
hotValue
,
position
,
HotSearchType
.
抖音热搜
.
name
());
HotSearchList
douyin
=
new
HotSearchList
(
null
,
word
,
hotValue
,
position
,
HotSearchType
.
抖音热搜
.
name
());
list
.
add
(
douyin
);
}
}
}
catch
(
IOException
e
)
{
log
.
debug
(
"获取抖音热搜榜时出现问题:{}"
,
e
);
}
return
list
;
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
View file @
c209c204
...
...
@@ -7,6 +7,8 @@ import java.util.Map;
import
java.util.Objects
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -31,7 +33,7 @@ import com.zhiwei.tools.httpclient.HeaderTool;
@Log4j2
public
class
SougoHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
build
();
/**
* @Title: SougoHotSearchTest
...
...
@@ -41,13 +43,16 @@ public class SougoHotSearchCrawler {
*/
public
static
List
<
HotSearchList
>
sougoHotSearch
()
{
String
url
=
"https://weixin.sogou.com"
;
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
Map
<
String
,
String
>
headMap
=
HeaderTool
.
getCommonHead
();
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headMap
);
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
String
htmlBody
=
null
;
try
{
Map
<
String
,
String
>
headMap
=
HeaderTool
.
getCommonHead
();
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"topwords"
))
{
try
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
...
...
@@ -75,10 +80,12 @@ public class SougoHotSearchCrawler {
if
(
Objects
.
nonNull
(
rank
))
{
list
.
add
(
hotSearch
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析搜狗微信时出现解析错误"
,
e
);
}
}
break
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析搜狗微信时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
return
Collections
.
emptyList
();
...
...
@@ -86,10 +93,7 @@ public class SougoHotSearchCrawler {
}
else
{
log
.
info
(
"解析搜狗微信时出现解析错误,页面结构有问题"
);
}
break
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
e
);
}
}
return
list
;
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/TengXunCrawler.java
0 → 100644
View file @
c209c204
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
@Log4j2
public
class
TengXunCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* 腾讯热榜数据采集
* @return
*/
public
static
List
<
HotSearchList
>
getTengXunHotList
()
{
log
.
info
(
"腾讯新闻热榜开始采集..."
);
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
JSONArray
dataJson
=
null
;
String
htmlBody
=
null
;
String
url
=
"https://r.inews.qq.com/getWeiboRankingList?chlid=news_recommend_hot&appver=28_android_4.2.40&devid=&qn-rid=&qn-sig=f690e21095559203e3f55c42a04f8f15"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//采集为空最多重试3次
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"idlist"
))
{
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
dataJson
=
topSearch
.
getJSONArray
(
"idlist"
).
getJSONObject
(
0
).
getJSONArray
(
"newslist"
);
for
(
int
i
=
1
;
i
<
dataJson
.
size
();
i
++)
{
Integer
rank
=
i
;
String
name
=
dataJson
.
getJSONObject
(
i
).
getString
(
"title"
);
String
tengxunUrl
=
"https://view.inews.qq.com/topic/"
+
dataJson
.
getJSONObject
(
i
).
getString
(
"id"
);
Integer
count
=
0
;
String
icon
=
null
;
if
(
dataJson
.
getJSONObject
(
i
).
containsKey
(
"topic"
))
{
count
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"topic"
).
getIntValue
(
"ranking_score"
);
if
(
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"topic"
).
containsKey
(
"rec_icon"
))
{
icon
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"topic"
).
getString
(
"rec_icon"
);
}
}
else
if
(
dataJson
.
getJSONObject
(
i
).
containsKey
(
"hotEvent"
))
{
count
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"hotEvent"
).
getIntValue
(
"hotScore"
);
if
(
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"hotEvent"
).
containsKey
(
"rec_icon"
))
{
icon
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"hotEvent"
).
getString
(
"rec_icon"
);
}
}
if
(
icon
!=
null
)
{
if
(
icon
.
contains
(
"11918331890"
))
{
icon
=
"热"
;
}
else
if
(
icon
.
contains
(
"11918332271"
))
{
icon
=
"新"
;
}
}
HotSearchList
hotSearchList
=
new
HotSearchList
(
tengxunUrl
,
name
,
count
,
false
,
rank
,
HotSearchType
.
腾讯新闻
.
name
(),
icon
);
list
.
add
(
hotSearchList
);
}
}
ZhiWeiTools
.
sleep
(
3000L
);
}
log
.
info
(
"{}, 此轮腾讯新闻热榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
log
.
info
(
"腾讯新闻采集结束"
);
return
list
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
View file @
c209c204
...
...
@@ -9,6 +9,8 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -37,34 +39,40 @@ public class ToutiaoHotSearchCrawler {
public
static
List
<
HotSearchList
>
toutiaoHotSearchByPhone
(){
String
origin
=
"hot_board"
;
String
jsUrl
=
"https://s3.pstatp.com/toutiao/feoffline/hot_list/resource/hot_list/js/index.45f50250.chunk.js"
;
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
jsUrl
)).
body
().
string
();
if
(
htmlBody
.
contains
(
"origin"
)){
String
s
=
htmlBody
.
substring
(
htmlBody
.
indexOf
(
"origin:"
)+
"origin:"
.
length
());
origin
=
s
.
substring
(
1
,
s
.
indexOf
(
"}"
)-
1
);
}
Request
jsRequest
=
RequestUtils
.
wrapGet
(
jsUrl
);
String
jsBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
jsRequest
))
{
jsBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"获取今日头条实时热搜头部信息标识失败"
,
e
);
}
for
(
int
count
=
0
;
count
<=
5
;
count
++){
if
(
jsBody
!=
null
&&
jsBody
.
contains
(
"origin"
)){
String
s
=
jsBody
.
substring
(
jsBody
.
indexOf
(
"origin:"
)+
"origin:"
.
length
());
origin
=
s
.
substring
(
1
,
s
.
indexOf
(
"}"
)-
1
);
}
//采集头条内容
String
url
=
"https://i.snssdk.com/hot-event/hot-board/?origin="
+
origin
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
);
headerMap
.
put
(
"referer"
,
"https://ib.snssdk.com/rogue/aladdin_landingpage/template/aladdin_landingpage/hot_words.html?isBrowser=true&traffic_source="
);
String
htmlBody
;
try
{
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
count
=
0
;
count
<=
5
;
count
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e1
)
{
log
.
error
(
"解析今日头条实时热搜时出现连接失败"
,
e1
);
}
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
)).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
try
{
JSONArray
words
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"data"
);
int
rank
=
1
;
for
(
int
i
=
0
;
i
<
words
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
words
.
size
();
i
++)
{
try
{
JSONObject
word
=
words
.
getJSONObject
(
i
);
String
name
=
word
.
getString
(
"Title"
);
String
link
=
"https://ib.snssdk.com/search/?keyword="
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&pd=synthesis&source=trending_list&traffic_source="
;
String
link
=
"https://ib.snssdk.com/search/?keyword="
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&pd=synthesis&source=trending_list&traffic_source="
;
Integer
hotCount
=
word
.
getInteger
(
"HotValue"
);
String
wordsType
=
word
.
getString
(
"Label"
);
String
icon
=
getIcon
(
wordsType
);
...
...
@@ -73,20 +81,18 @@ public class ToutiaoHotSearchCrawler {
result
.
add
(
hotSearch
);
rank
++;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析今日头条实时热搜时出现解析错误"
,
e
);
log
.
error
(
"解析今日头条实时热搜时出现解析错误"
,
e
);
continue
;
}
}
return
result
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析今日头条实时热搜时出现解析错误,数据不是json结构"
,
e
);
log
.
error
(
"解析今日头条实时热搜时出现解析错误,数据不是json结构"
,
e
);
}
}
else
{
}
else
{
log
.
info
(
"解析今日头条实时热搜时出现解析错误,页面结构有问题"
);
}
}
catch
(
IOException
e1
)
{
log
.
error
(
"解析今日头条实时热搜时出现连接失败"
,
e1
);
}
}
return
Collections
.
emptyList
();
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
c209c204
...
...
@@ -4,6 +4,8 @@ import java.io.IOException;
import
java.util.*
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -44,9 +46,17 @@ public class WeiboHotSearchCrawler {
List
<
HotSearchList
>
list
=
new
ArrayList
<
HotSearchList
>();
for
(
int
i
=
0
;
i
<
3
;
i
++){
String
htmlBody
=
null
;
try
{
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"pl_top_realtimehot"
)){
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
if
(
i
==
2
){
return
list
;
}
else
{
continue
;
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"pl_top_realtimehot"
))
{
try
{
// String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0];
// script = script.replace("(", "").replace(")", "");
...
...
@@ -54,16 +64,16 @@ public class WeiboHotSearchCrawler {
// String html = json.getString("html");
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Elements
elements
=
document
.
select
(
"div#pl_top_realtimehot"
).
select
(
"tbody"
).
select
(
"tr"
);
for
(
Element
element
:
elements
)
{
for
(
Element
element
:
elements
)
{
try
{
String
id
=
"http://s.weibo.com"
+
element
.
select
(
"td.td-02"
).
select
(
"a"
).
attr
(
"href"
);
String
id
=
"http://s.weibo.com"
+
element
.
select
(
"td.td-02"
).
select
(
"a"
).
attr
(
"href"
);
String
name
=
element
.
select
(
"td.td-02"
).
select
(
"a"
).
text
();
String
num
=
!
element
.
select
(
"td.td-02"
).
select
(
"span"
).
text
().
equals
(
""
)?
element
.
select
(
"td.td-02"
).
select
(
"span"
).
text
():
"0"
;
String
rank
=
!
element
.
select
(
"td[class=\"td-01 ranktop\"]"
).
text
().
equals
(
""
)?
element
.
select
(
"td[class=\"td-01 ranktop\"]"
).
text
():
"-1"
;
String
num
=
!
element
.
select
(
"td.td-02"
).
select
(
"span"
).
text
().
equals
(
""
)
?
element
.
select
(
"td.td-02"
).
select
(
"span"
).
text
()
:
"0"
;
String
rank
=
!
element
.
select
(
"td[class=\"td-01 ranktop\"]"
).
text
().
equals
(
""
)
?
element
.
select
(
"td[class=\"td-01 ranktop\"]"
).
text
()
:
"-1"
;
int
hotCount
=
Integer
.
valueOf
(
num
);
int
rankCount
=
Integer
.
valueOf
(
rank
);
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
true
,
rankCount
,
HotSearchType
.
微博热搜
.
name
(),
null
);
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
true
,
rankCount
,
HotSearchType
.
微博热搜
.
name
(),
null
);
list
.
add
(
hotSearch
);
}
catch
(
Exception
e
)
{
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
...
...
@@ -72,24 +82,16 @@ public class WeiboHotSearchCrawler {
}
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
log
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
return
null
;
}
}
else
{
}
else
{
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
log
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
}
break
;
}
catch
(
Exception
e
)
{
if
(
i
==
2
){
return
list
;
}
else
{
continue
;
}
}
}
return
list
;
}
...
...
@@ -103,62 +105,62 @@ public class WeiboHotSearchCrawler {
* @return void 返回类型
*/
public
static
List
<
HotSearchList
>
weiboHotSearchByPhone
(){
for
(
int
count
=
0
;
count
<=
5
;
count
++){
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583"
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
);
String
htmlBody
;
try
{
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
count
=
0
;
count
<=
5
;
count
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e1
)
{
log
.
error
(
"解析微博时时热搜时出现连接失败"
,
e1
);
}
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
)).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
)){
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
JSONArray
cards
=
json
.
getJSONArray
(
"cards"
);
int
rank
=
0
;
for
(
int
i
=
0
;
i
<
cards
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
cards
.
size
();
i
++)
{
try
{
JSONObject
card
=
cards
.
getJSONObject
(
i
);
JSONArray
cardGroup
=
card
.
getJSONArray
(
"card_group"
);
if
(
Objects
.
nonNull
(
cardGroup
)
&&
!
cardGroup
.
isEmpty
())
{
if
(
Objects
.
nonNull
(
cardGroup
)
&&
!
cardGroup
.
isEmpty
())
{
String
title
=
card
.
getString
(
"title"
);
boolean
hot
=
true
;
if
(
Objects
.
nonNull
(
title
)
&&
title
.
contains
(
"实时上升热点"
))
{
if
(
Objects
.
nonNull
(
title
)
&&
title
.
contains
(
"实时上升热点"
))
{
hot
=
false
;
rank
=
51
;
}
for
(
int
j
=
0
;
j
<
cardGroup
.
size
();
j
++)
{
for
(
int
j
=
0
;
j
<
cardGroup
.
size
();
j
++)
{
JSONObject
cardInfo
=
cardGroup
.
getJSONObject
(
j
);
String
name
=
cardInfo
.
getString
(
"desc"
);
int
hotCount
=
cardInfo
.
getIntValue
(
"desc_extr"
);
String
icon
=
cardInfo
.
getString
(
"icon"
);
if
(
StringUtils
.
isNotBlank
(
icon
))
{
if
(
StringUtils
.
isNotBlank
(
icon
))
{
icon
=
icon
.
split
(
"_"
)[
1
].
split
(
".png"
)[
0
];
}
String
id
=
"http://s.weibo.com/weibo/"
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&Refer=top"
;
String
id
=
"http://s.weibo.com/weibo/"
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&Refer=top"
;
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微博热搜
.
name
(),
icon
);
result
.
add
(
hotSearch
);
rank
++;
}
}
else
{
}
else
{
log
.
info
(
"card 数据结构为:{}"
,
card
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
log
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
continue
;
}
}
return
result
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
);
log
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
);
}
}
else
{
}
else
{
log
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
}
}
catch
(
IOException
e1
)
{
log
.
error
(
"解析微博时时热搜时出现连接失败"
,
e1
);
}
}
return
Collections
.
emptyList
();
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboSuperTopicCrawler.java
View file @
c209c204
...
...
@@ -10,6 +10,8 @@ import java.util.Objects;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -58,24 +60,24 @@ public class WeiboSuperTopicCrawler {
String
type
=
entry
.
getKey
();
for
(
int
page
=
1
;
page
<=
5
;
page
++)
{
String
pageUrl
=
url
+
"&page="
+
page
;
Request
request
=
RequestUtils
.
wrapGet
(
pageUrl
,
headMap
);
String
htmlBody
=
null
;
//重试三次
for
(
int
retryTimes
=
1
;
retryTimes
<=
3
;
retryTimes
++)
{
try
{
// System.out.println("pageUrl=========="+pageUrl);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
pageUrl
,
headMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"desc1"
))
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"获取榜单列表页面时出现错误,错误为:{}"
,
e
);
continue
;
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"desc1"
))
{
topicList
.
addAll
(
parseTopicRankHtml
(
page
,
htmlBody
,
type
));
break
;
}
else
{
}
else
{
log
.
error
(
"获取榜单列表页面时数据格式错误,页面为:{}"
,
htmlBody
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"获取榜单列表页面时出现错误,错误为:{}"
,
e
);
continue
;
}
}
}
}
return
topicList
;
}
...
...
@@ -136,23 +138,24 @@ public class WeiboSuperTopicCrawler {
*/
private
static
WeiboSuperTopic
getTopicInfo
(
String
id
,
WeiboSuperTopic
topic
)
{
for
(
int
retryTimes
=
1
;
retryTimes
<=
3
;
retryTimes
++)
{
try
{
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid="
+
id
;
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"desc_more"
))
{
Request
request
=
RequestUtils
.
wrapGet
(
url
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析榜单详情页面时出现错误,错误为:{}"
,
e
);
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"desc_more"
))
{
String
descMore
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONObject
(
"pageInfo"
).
getJSONArray
(
"desc_more"
).
getString
(
0
);
if
(
StringUtils
.
isNotBlank
(
descMore
))
{
if
(
StringUtils
.
isNotBlank
(
descMore
))
{
String
readNum
=
descMore
.
replaceAll
(
"阅读|帖子.*"
,
""
).
trim
();
String
postNum
=
descMore
.
replaceAll
(
".*帖子|粉丝.*"
,
""
).
trim
();
topic
.
setPostNum
(
postNum
);
topic
.
setReadNum
(
readNum
);
return
topic
;
}
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析榜单详情页面时出现错误,错误为:{}"
,
e
);
}
}
return
topic
;
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
View file @
c209c204
...
...
@@ -11,6 +11,8 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Element
;
...
...
@@ -131,21 +133,23 @@ public class WeiboTopicCrawler {
List
<
HotSearchList
>
topicList
=
new
ArrayList
<>();
for
(
int
page
=
1
;
page
<=
6
;
page
++){
String
pageUrl
=
"https://api.weibo.cn/2/page?gsid=_2A25zJX_EDeRxGedH71YS8CzKzzmIHXVuc_QMrDV6PUJbkdANLXPbkWpNUK3OyitGCJsX8exvua-vfubUqCiaA4lb&from=10A1193010&c=iphone&s=2827eebe&count=20&containerid=106003type%253D25%2526t%253D3%2526disable_hot%253D1%2526filter_type%253Dtopicscene&page="
+
page
;
Request
request
=
RequestUtils
.
wrapGet
(
pageUrl
);
String
htmlBody
=
null
;
//重试三次
for
(
int
retryTimes
=
1
;
retryTimes
<=
5
;
retryTimes
++)
{
try
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
// log.info("pageUrl::{}", pageUrl);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
pageUrl
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
topicList
.
addAll
(
parseTopicHtml
(
htmlBody
));
break
;
}
else
{
log
.
info
(
"下载榜单列表页面时数据格式错误,页面为:{}"
,
htmlBody
);
}
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"下载榜单列表页面时出现错误,错误为:{}"
,
e
);
continue
;
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
topicList
.
addAll
(
parseTopicHtml
(
htmlBody
));
break
;
}
else
{
log
.
info
(
"下载榜单列表页面时数据格式错误,页面为:{}"
,
htmlBody
);
}
}
}
return
topicList
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuChildHotSearchCrawler.java
0 → 100644
View file @
c209c204
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
@Log4j2
public
class
ZhihuChildHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* 知乎子级分类数据采集
* @param type
* @param typeName
* @return
*/
public
static
List
<
HotSearchList
>
getZhihuTopicSearch
(
String
type
,
String
typeName
)
{
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
String
url
=
"https://www.zhihu.com/api/v3/feed/topstory/hot-lists/"
+
type
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"x-api-version"
,
"3.0.76"
);
JSONArray
dataJson
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
//采集为空最多重试3次
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
dataJson
=
topSearch
.
getJSONArray
(
"data"
);
for
(
int
i
=
0
;
i
<
dataJson
.
size
();
i
++)
{
JSONObject
jsonObject
=
dataJson
.
getJSONObject
(
i
).
getJSONObject
(
"target"
);
Integer
rank
=
i
+
1
;
String
name
=
jsonObject
.
getJSONObject
(
"title_area"
).
getString
(
"text"
);
String
hotCountString
=
jsonObject
.
getJSONObject
(
"metrics_area"
).
getString
(
"text"
);
Integer
count
=
getHotCount
(
hotCountString
);
String
childUrl
=
jsonObject
.
getJSONObject
(
"link"
).
getString
(
"url"
);
HotSearchList
hotSearchList
=
new
HotSearchList
(
childUrl
,
name
,
count
,
rank
,
HotSearchType
.
知乎热搜
.
name
()
+
typeName
+
"分类"
);
list
.
add
(
hotSearchList
);
}
}
ZhiWeiTools
.
sleep
(
3000L
);
}
return
list
;
}
/**
* 截取出热度值
* @param hotCountString
* @return
*/
private
static
Integer
getHotCount
(
String
hotCountString
){
Integer
count
;
if
(
hotCountString
.
contains
(
"万"
)){
hotCountString
=
hotCountString
.
replaceAll
(
"万.*"
,
""
).
trim
();
count
=
(
int
)(
Double
.
parseDouble
(
hotCountString
)*
10000
);
}
else
if
(
hotCountString
.
contains
(
"亿"
)){
hotCountString
=
hotCountString
.
replaceAll
(
"亿.*"
,
""
).
trim
();
count
=
(
int
)(
Double
.
parseDouble
(
hotCountString
)*
10000000
);
}
else
{
count
=
Integer
.
getInteger
(
hotCountString
.
substring
(
0
,
hotCountString
.
indexOf
(
"领域热度"
)));
}
return
count
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
View file @
c209c204
...
...
@@ -6,6 +6,8 @@ import java.util.List;
import
java.util.Map
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -46,9 +48,14 @@ public class ZhihuHotSearchCrawler {
headerMap
.
put
(
"accept"
,
"application/json, text/plain, */*"
);
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
headerMap
.
put
(
"Referer"
,
rerferer
);
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"words"
)){
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"words"
))
{
list
=
new
ArrayList
<>();
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
words
=
topSearch
.
getJSONObject
(
"top_search"
).
getJSONArray
(
"words"
);
...
...
@@ -59,15 +66,11 @@ public class ZhihuHotSearchCrawler {
JSONObject
word
=
words
.
getJSONObject
(
i
);
query
=
word
.
getString
(
"query"
);
displayQuery
=
word
.
getString
(
"display_query"
);
link
=
"https://www.zhihu.com/search?q="
+
URLCodeUtil
.
getURLEncode
(
query
,
"utf-8"
)+
"&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content"
;
link
=
"https://www.zhihu.com/search?q="
+
URLCodeUtil
.
getURLEncode
(
query
,
"utf-8"
)
+
"&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content"
;
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
null
,
i
,
HotSearchType
.
知乎热搜
.
name
());
list
.
add
(
zhihu
);
}
}
}
catch
(
IOException
e
)
{
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
return
list
;
}
return
list
;
}
...
...
@@ -81,7 +84,7 @@ public class ZhihuHotSearchCrawler {
* @return List<ZhihuHotSearch> 返回类型
*/
public
static
List
<
HotSearchList
>
getMobileZhihuHotList
(){
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
;
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
String
url
=
"https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0"
;
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"api.zhihu.com"
);
...
...
@@ -89,10 +92,15 @@ public class ZhihuHotSearchCrawler {
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
);
headerMap
.
put
(
"X-UDID"
,
"AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8="
);
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"author"
)){
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
return
list
;
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"author"
))
{
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
dataJson
=
topSearch
.
getJSONArray
(
"data"
);
String
link
=
null
;
...
...
@@ -107,26 +115,22 @@ public class ZhihuHotSearchCrawler {
//计算热度
try
{
if
(
hotText
.
contains
(
"万"
))
{
if
(
hotText
.
contains
(
"万"
))
{
hotText
=
hotText
.
replaceAll
(
"万.*"
,
""
).
trim
();
hotCount
=
(
int
)(
Double
.
parseDouble
(
hotText
)*
10000
);
}
else
if
(
hotText
.
contains
(
"亿"
))
{
hotCount
=
(
int
)
(
Double
.
parseDouble
(
hotText
)
*
10000
);
}
else
if
(
hotText
.
contains
(
"亿"
))
{
hotText
=
hotText
.
replaceAll
(
"亿.*"
,
""
).
trim
();
hotCount
=
(
int
)(
Double
.
parseDouble
(
hotText
)*
10000000
);
}
else
{
hotCount
=
(
int
)
(
Double
.
parseDouble
(
hotText
)
*
10000000
);
}
else
{
hotCount
=
Integer
.
getInteger
(
hotText
);
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
hotCount
,
i
+
1
,
HotSearchType
.
知乎热搜
.
name
());
HotSearchList
zhihu
=
new
HotSearchList
(
link
,
displayQuery
,
hotCount
,
i
+
1
,
HotSearchType
.
知乎热搜
.
name
());
list
.
add
(
zhihu
);
}
}
}
catch
(
IOException
e
)
{
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
return
list
;
}
return
list
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuTopicSearchCrawler.java
View file @
c209c204
...
...
@@ -10,6 +10,8 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.select.Elements
;
...
...
@@ -30,18 +32,19 @@ public class ZhihuTopicSearchCrawler {
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
String
url
=
"https://www.zhihu.com/topsearch"
;
JSONObject
jsonObject
=
null
;
try
{
for
(
int
t
=
0
;
t
<
3
&&
jsonObject
==
null
;
t
++)
{
// ZhiWeiTools.sleep(10000L);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
// log.info("页面内容获取:{}",htmlBody);
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
t
=
0
;
t
<
3
&&
jsonObject
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"知乎热搜页面连接异常"
,
e
);
}
if
(
htmlBody
!=
null
)
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
String
html
=
document
.
getElementsByTag
(
"script"
).
select
(
"#js-initialData"
).
html
();
jsonObject
=
JSONObject
.
parseObject
(
html
);
}
if
(
jsonObject
!=
null
)
{
if
(
jsonObject
!=
null
)
{
JSONArray
dataJson
=
jsonObject
.
getJSONObject
(
"initialState"
).
getJSONObject
(
"topsearch"
).
getJSONArray
(
"data"
);
for
(
int
i
=
0
;
i
<
dataJson
.
size
();
i
++)
{
Integer
rank
=
i
+
1
;
...
...
@@ -53,12 +56,10 @@ public class ZhihuTopicSearchCrawler {
list
.
add
(
hotSearchList
);
}
return
list
;
}
else
{
log
.
error
(
"知乎热搜榜单页面获取异常,404"
);
log
.
error
(
jsonObject
);
}
}
catch
(
IOException
e
)
{
log
.
error
(
"知乎热搜获取异常"
,
e
);
}
else
{
log
.
error
(
"知乎热搜榜单页面获取异常"
);
}
}
return
Collections
.
emptyList
();
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
View file @
c209c204
...
...
@@ -52,5 +52,7 @@ public class HotSearchRun {
new
WeiboTopicRun
().
start
();
new
ToutiaoHotSearchRun
().
start
();
new
ZhihuTopSearchRun
().
start
();
new
ZhihuChildHotSearchRun
().
start
();
new
ThreadOneRun
().
start
();
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/ThreadOneRun.java
0 → 100644
View file @
c209c204
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.crawler.TengXunCrawler
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
@Log4j2
public
class
ThreadOneRun
extends
Thread
{
@Override
public
void
run
()
{
boolean
f
=
true
;
while
(
f
)
{
try
{
getHotList
();
TimeUnit
.
MINUTES
.
sleep
(
1
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
1000
);
}
ZhiWeiTools
.
sleep
(
50
);
}
}
private
void
getHotList
(){
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
List
<
HotSearchList
>
list
=
TengXunCrawler
.
getTengXunHotList
();
if
(
list
==
null
||
list
.
size
()
==
0
){
TipsUtils
.
sendTips
(
"腾讯新闻"
,
new
Date
());
}
else
{
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuChildHotSearchRun.java
0 → 100644
View file @
c209c204
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.crawler.ZhihuChildHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
import
java.util.Arrays
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
@Log4j2
public
class
ZhihuChildHotSearchRun
extends
Thread
{
private
List
<
String
>
childType
=
Arrays
.
asList
(
"digital"
,
"focus"
,
"depth"
);
@Override
public
void
run
()
{
boolean
f
=
true
;
while
(
f
)
{
try
{
getHotList
();
TimeUnit
.
MINUTES
.
sleep
(
1
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
}
ZhiWeiTools
.
sleep
(
50
);
}
}
private
void
getHotList
()
{
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
for
(
int
i
=
0
;
i
<
childType
.
size
();
i
++)
{
String
name
=
this
.
getTypeName
(
childType
.
get
(
i
));
if
(!
""
.
equals
(
name
))
{
log
.
info
(
"知乎{}话题热榜采集开始..."
,
name
);
List
<
HotSearchList
>
list
=
ZhihuChildHotSearchCrawler
.
getZhihuTopicSearch
(
childType
.
get
(
i
),
name
);
log
.
info
(
"{}, 知乎{}话题此轮采集到的数据量为:{}"
,
new
Date
(),
name
,
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
if
(
list
==
null
||
list
.
size
()
==
0
)
{
TipsUtils
.
sendTips
(
"知乎热搜"
+
name
+
"分类"
,
new
Date
());
}
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
log
.
info
(
"知乎{}话题热榜采集结束..."
,
name
);
ZhiWeiTools
.
sleep
(
3000
);
}
}
}
private
String
getTypeName
(
String
type
){
String
name
;
switch
(
type
)
{
case
"digital"
:
name
=
"数码"
;
break
;
case
"focus"
:
name
=
"国际"
;
break
;
case
"depth"
:
name
=
"时事"
;
break
;
default
:
name
=
""
;
}
return
name
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
View file @
c209c204
...
...
@@ -46,7 +46,7 @@ public class ZhihuHotSearchRun extends Thread{
List
<
HotSearchList
>
list
=
ZhihuHotSearchCrawler
.
getMobileZhihuHotList
();
log
.
info
(
"{}, 知乎此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
if
(
list
==
null
||
list
.
size
()
==
0
){
TipsUtils
.
sendTips
(
"知乎
话题
"
,
new
Date
());
TipsUtils
.
sendTips
(
"知乎
热搜
"
,
new
Date
());
}
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/util/TipsUtils.java
View file @
c209c204
...
...
@@ -19,12 +19,11 @@ public class TipsUtils {
//未采集到数据发送预警信息
public
static
void
sendTips
(
String
type
,
Date
time
){
//1.未采集到的程序触发
//2.获取数据库最后一条数据判断该程序几分钟没有采集到数据
//3.符合条件发送预警
HotSearchListDAO
hotSearchListDAO
=
new
HotSearchListDAO
();
//获取数据库最后一条数据判断该程序几分钟没有采集到数据
Date
lastTime
=
hotSearchListDAO
.
getLastTimeByType
(
type
);
if
(
time
.
getTime
()
-
lastTime
.
getTime
()
>
timeDifference
){
//发送预警
String
crawlerContent
=
String
.
format
(
"%s已经连续%s分钟未采集到数据"
,
type
,(
time
.
getTime
()
-
lastTime
.
getTime
())/
1000
/
60
);
QYWechatUtil
.
send
(
key
,
QYWechatUtil
.
MSGTYPE_TEXT
,
crawlerContent
,
null
,
null
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/util/WechatCodeUtil.java
View file @
c209c204
...
...
@@ -5,6 +5,8 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -35,20 +37,22 @@ public class WechatCodeUtil {
String
jmAppId
=
AESUtils
.
encrypt
(
"wechat"
,
appId
);
String
url
=
"http://yuqing.zhiweidata.com/WechatPublic/common/getToken?appId="
+
jmAppId
;
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
try
{
String
result
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
)).
body
().
string
();
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
result
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
result
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
logger
.
error
(
"获取微信公众号推送token失败,问题为:::{}"
,
e
.
fillInStackTrace
());
return
null
;
}
if
(
result
!=
null
)
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
result
);
if
(
jsonObject
.
containsKey
(
"data"
))
{
if
(
jsonObject
.
containsKey
(
"data"
))
{
JSONObject
inJson
=
JSONObject
.
parseObject
(
jsonObject
.
getString
(
"data"
));
token
=
inJson
.
getString
(
"accessToken"
);
}
}
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
logger
.
error
(
"获取微信公众号推送token失败,问题为:::{}"
,
e
.
fillInStackTrace
());
return
null
;
}
return
token
;
}
...
...
@@ -65,24 +69,26 @@ public class WechatCodeUtil {
public
static
int
sendDataJson
(
JSONObject
templateJson
)
{
int
msgid
=
0
;
String
url
=
WechatConstant
.
WECHAT_TEMPLET_SEND_URL
.
replace
(
"ACCESS_TOKEN"
,
getToken
());
try
{
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
templateJson
.
toJSONString
());
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
requestBody
)).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
Request
request
=
RequestUtils
.
wrapPost
(
url
,
requestBody
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"消息推送失败,错误为::{}"
,
e
.
fillInStackTrace
());
msgid
=
0
;
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
if
(
null
!=
jsonObject
)
{
if
(
"ok"
.
equals
(
jsonObject
.
getString
(
"errmsg"
)))
{
msgid
=
jsonObject
.
getIntValue
(
"msgid"
);
}
else
{
}
else
{
msgid
=
0
;
logger
.
info
(
"消息推送失败,错误为::{}"
,
jsonObject
.
toString
());
logger
.
info
(
"消息推送失败,错误为::{}"
,
jsonObject
.
toString
());
}
}
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"消息推送失败,错误为::{}"
,
e
.
fillInStackTrace
());
msgid
=
0
;
}
return
msgid
;
}
...
...
@@ -106,7 +112,14 @@ public class WechatCodeUtil {
postData
.
put
(
"tagid"
,
getGroupIp
(
groupName
));
postData
.
put
(
"next_openid"
,
""
);
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
postData
.
toJSONString
());
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
requestBody
)).
body
().
string
();
Request
request
=
RequestUtils
.
wrapPost
(
url
,
requestBody
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
){
logger
.
error
(
"页面连接获取失败"
,
e
);
return
null
;
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
if
(
null
!=
jsonObject
)
{
...
...
@@ -120,7 +133,6 @@ public class WechatCodeUtil {
}
else
{
logger
.
info
(
"token 获取失败"
);
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
null
;
...
...
@@ -139,7 +151,14 @@ public class WechatCodeUtil {
postData
.
put
(
"tagid"
,
groupId
);
postData
.
put
(
"next_openid"
,
""
);
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
postData
.
toJSONString
());
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
requestBody
)).
body
().
string
();
Request
request
=
RequestUtils
.
wrapPost
(
url
,
requestBody
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
)){
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
){
logger
.
error
(
"页面链接获取失败"
,
e
);
return
null
;
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
if
(
null
!=
jsonObject
)
{
...
...
@@ -175,10 +194,16 @@ public class WechatCodeUtil {
String
url
=
"https://api.weixin.qq.com/cgi-bin/tags/get?access_token="
+
getToken
();
Integer
groupId
=
null
;
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
)).
body
().
string
();
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
logger
.
error
(
"获取分组id时出现错误"
,
e
.
fillInStackTrace
());
return
null
;
}
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
.
contains
(
"tags"
))
{
if
(
htmlBody
.
contains
(
"tags"
))
{
JSONArray
jsonArry
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"tags"
);
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
...
...
@@ -191,10 +216,6 @@ public class WechatCodeUtil {
}
}
}
}
catch
(
IOException
e
)
{
logger
.
error
(
"获取分组id时出现错误"
,
e
.
fillInStackTrace
());
return
null
;
}
return
groupId
;
}
...
...
@@ -206,10 +227,16 @@ public class WechatCodeUtil {
String
url
=
"https://api.weixin.qq.com/cgi-bin/tags/get?access_token="
+
getToken
();
Map
<
String
,
Integer
>
resultMap
=
new
HashMap
<
String
,
Integer
>();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
)).
body
().
string
();
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
logger
.
error
(
"获取分组id时出现错误"
,
e
.
fillInStackTrace
());
return
null
;
}
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
.
contains
(
"tags"
))
{
if
(
htmlBody
.
contains
(
"tags"
))
{
JSONArray
jsonArry
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"tags"
);
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
...
...
@@ -217,14 +244,10 @@ public class WechatCodeUtil {
String
name
=
data
.
getString
(
"name"
);
resultMap
.
put
(
name
,
id
);
}
}
else
{
}
else
{
logger
.
info
(
"获取分组id时出现错误,数据为:::{}"
,
htmlBody
);
}
}
}
catch
(
IOException
e
)
{
logger
.
error
(
"获取分组id时出现错误"
,
e
.
fillInStackTrace
());
return
null
;
}
return
resultMap
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment