Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
e03ea262
Commit
e03ea262
authored
Sep 03, 2020
by
马黎滨
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
采集添加代理ip
parent
a98a48ca
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
77 additions
and
28 deletions
+77
-28
src/main/java/com/zhiwei/searchhotcrawler/crawler/SouhuTopicCrawler.java
+4
-3
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
+2
-2
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+63
-14
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
+4
-5
src/main/java/com/zhiwei/searchhotcrawler/util/TipsUtils.java
+4
-4
No files found.
src/main/java/com/zhiwei/searchhotcrawler/crawler/SouhuTopicCrawler.java
View file @
e03ea262
...
...
@@ -3,6 +3,7 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
...
...
@@ -30,11 +31,11 @@ public class SouhuTopicCrawler {
String
htmlBody
=
null
;
String
url
=
"https://api.k.sohu.com/api/news/moment/v2/list.go?pageSize=50"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"搜狐话题页面连接失败"
,
e
);
log
.
error
(
"搜狐话题页面连接失败"
,
e
.
fillInStackTrace
()
);
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
View file @
e03ea262
...
...
@@ -41,7 +41,7 @@ public class ToutiaoHotSearchCrawler {
String
jsUrl
=
"https://s3.pstatp.com/toutiao/feoffline/hot_list/resource/hot_list/js/index.45f50250.chunk.js"
;
Request
jsRequest
=
RequestUtils
.
wrapGet
(
jsUrl
);
String
jsBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
jsRequest
))
{
try
(
Response
response
=
httpBoot
.
syncCall
(
jsRequest
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
jsBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"获取今日头条实时热搜头部信息标识失败"
,
e
);
...
...
@@ -55,7 +55,7 @@ public class ToutiaoHotSearchCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e1
)
{
log
.
error
(
"解析今日头条实时热搜时出现连接失败"
,
e1
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
e03ea262
...
...
@@ -2,7 +2,9 @@ package com.zhiwei.searchhotcrawler.crawler;
import
java.io.IOException
;
import
java.util.*
;
import
java.util.stream.Collectors
;
import
com.alibaba.fastjson.JSON
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
...
...
@@ -111,10 +113,10 @@ public class WeiboHotSearchCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
count
=
0
;
count
<=
5
;
count
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
1
)
{
log
.
error
(
"解析微博时
时热搜时出现连接失败"
,
e1
);
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博时
热搜时出现连接失败"
,
e
);
}
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
...
...
@@ -122,17 +124,21 @@ public class WeiboHotSearchCrawler {
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
JSONArray
cards
=
json
.
getJSONArray
(
"cards"
);
int
rank
=
0
;
for
(
int
i
=
0
;
i
<
cards
.
size
();
i
++)
{
//
for (int i = 0; i < cards.size(); i++) {
try
{
JSONObject
card
=
cards
.
getJSONObject
(
i
);
JSONObject
card
=
cards
.
getJSONObject
(
0
);
JSONArray
cardGroup
=
card
.
getJSONArray
(
"card_group"
);
JSONObject
topCard
=
cardGroup
.
getJSONObject
(
0
);
if
(!
topCard
.
containsKey
(
"pic"
)){
rank
=
1
;
}
if
(
Objects
.
nonNull
(
cardGroup
)
&&
!
cardGroup
.
isEmpty
())
{
String
title
=
card
.
getString
(
"title"
);
//
String title = card.getString("title");
boolean
hot
=
true
;
if
(
Objects
.
nonNull
(
title
)
&&
title
.
contains
(
"实时上升热点"
))
{
hot
=
false
;
rank
=
51
;
}
//
if (Objects.nonNull(title) && title.contains("实时上升热点")) {
//
hot = false;
//
rank = 51;
//
}
for
(
int
j
=
0
;
j
<
cardGroup
.
size
();
j
++)
{
JSONObject
cardInfo
=
cardGroup
.
getJSONObject
(
j
);
String
name
=
cardInfo
.
getString
(
"desc"
);
...
...
@@ -150,19 +156,62 @@ public class WeiboHotSearchCrawler {
log
.
info
(
"card 数据结构为:{}"
,
card
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博时
时
热搜时出现解析错误"
,
e
);
log
.
error
(
"解析微博时热搜时出现解析错误"
,
e
);
continue
;
}
}
//
}
return
result
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博时
时
热搜时出现解析错误,数据不是json结构"
,
e
);
log
.
error
(
"解析微博时热搜时出现解析错误,数据不是json结构"
,
e
);
}
}
else
{
log
.
info
(
"解析微博时
时
热搜时出现解析错误,页面结构有问题"
);
log
.
info
(
"解析微博时热搜时出现解析错误,页面结构有问题"
);
}
}
return
Collections
.
emptyList
();
}
/**
* 微博预热榜(实时上升热点采集)
* @param date
* @return
*/
public
static
List
<
HotSearchList
>
weiboPreheatSearch
(
Date
date
){
String
url
=
"https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博热搜时出现连接失败"
,
e
);
}
List
<
HotSearchList
>
result
=
new
ArrayList
<>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
)){
JSONArray
cardArray
=
JSON
.
parseObject
(
htmlBody
).
getJSONArray
(
"cards"
);
if
(
cardArray
.
size
()
>
1
)
{
JSONObject
jsonObject
=
cardArray
.
getJSONObject
(
1
);
if
(
"实时上升热点"
.
equals
(
jsonObject
.
getString
(
"title"
))
&&
jsonObject
.
containsKey
(
"card_group"
))
{
JSONArray
jsonArray
=
jsonObject
.
getJSONArray
(
"card_group"
);
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++){
JSONObject
cardInfo
=
jsonArray
.
getJSONObject
(
i
);
String
name
=
cardInfo
.
getString
(
"desc"
);
int
hotCount
=
cardInfo
.
getIntValue
(
"desc_extr"
);
String
weiboUrl
=
"http://s.weibo.com/weibo/"
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&Refer=top"
;
HotSearchList
hotSearchList
=
new
HotSearchList
(
weiboUrl
,
name
,
hotCount
,
null
,
HotSearchType
.
微博预热榜
.
name
(),
date
);
result
.
add
(
hotSearchList
);
}
//根据热度排序,赋值排名
result
=
result
.
stream
().
sorted
(
Comparator
.
comparing
(
HotSearchList:
:
getCount
).
reversed
()).
collect
(
Collectors
.
toList
());
int
rank
=
1
;
for
(
HotSearchList
hotSearchList
:
result
){
hotSearchList
.
setRank
(
rank
);
rank
++;
}
}
}
}
return
result
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
View file @
e03ea262
...
...
@@ -139,7 +139,6 @@ public class WeiboTopicCrawler {
//重试三次
for
(
int
retryTimes
=
1
;
retryTimes
<=
5
;
retryTimes
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
// log.info("pageUrl::{}", pageUrl);
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"下载榜单列表页面时出现错误,错误为:{}"
,
e
);
...
...
@@ -159,8 +158,9 @@ public class WeiboTopicCrawler {
private
static
List
<
HotSearchList
>
parseTopicHtml
(
String
htmlBody
,
Date
date
)
{
try
{
JSONArray
cards
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONArray
(
"cards"
);
if
(
Objects
.
nonNull
(
cards
)
&&
!
cards
.
isEmpty
())
{
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONArray
(
"cards"
);
if
(
Objects
.
nonNull
(
jsonArray
)
&&
!
jsonArray
.
isEmpty
())
{
JSONArray
cards
=
jsonArray
.
getJSONObject
(
0
).
getJSONArray
(
"card_group"
);
List
<
HotSearchList
>
topicList
=
new
ArrayList
<>();
Integer
rank
=
null
;
String
topicName
=
null
;
...
...
@@ -169,9 +169,8 @@ public class WeiboTopicCrawler {
Integer
commentNum
=
null
;
Integer
readNum
=
null
;
String
desc2
=
null
;
for
(
int
i
=
0
;
i
<
cards
.
size
();
i
++)
{
JSONObject
cardGroup
=
cards
.
getJSONObject
(
i
)
.
getJSONArray
(
"card_group"
).
getJSONObject
(
0
)
;
JSONObject
cardGroup
=
cards
.
getJSONObject
(
i
);
rank
=
cardGroup
.
getInteger
(
"top_mark_text"
);
topicName
=
cardGroup
.
getString
(
"title_sub"
);
url
=
"https://s.weibo.com/weibo?q="
+
URLCodeUtil
.
getURLEncode
(
topicName
,
"utf-8"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/util/TipsUtils.java
View file @
e03ea262
...
...
@@ -33,8 +33,8 @@ public class TipsUtils {
if
(!
typeTips
.
containsKey
(
type
))
{
//发送预警
String
crawlerContent
=
String
.
format
(
"%s数据采集异常"
,
type
);
//
QYWechatUtil.send(key, QYWechatUtil.MSGTYPE_TEXT, crawlerContent,
//
null, null);
QYWechatUtil
.
send
(
key
,
QYWechatUtil
.
MSGTYPE_TEXT
,
crawlerContent
,
null
,
null
);
}
typeTips
.
put
(
type
,
time
);
}
...
...
@@ -52,8 +52,8 @@ public class TipsUtils {
typeTips
.
remove
(
type
);
//发送恢复通知
String
crawlerContent
=
String
.
format
(
"%s数据采集恢复正常"
,
type
);
//
QYWechatUtil.send(key, QYWechatUtil.MSGTYPE_TEXT, crawlerContent,
//
null, null);
QYWechatUtil
.
send
(
key
,
QYWechatUtil
.
MSGTYPE_TEXT
,
crawlerContent
,
null
,
null
);
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment