Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
172e5b3c
Commit
172e5b3c
authored
Sep 26, 2021
by
chenweitao
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'working' into 'master'
Working See merge request
!135
parents
139ff5af
8ec17aa9
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
127 additions
and
95 deletions
+127
-95
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+127
-95
No files found.
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
172e5b3c
...
@@ -6,12 +6,9 @@ import java.net.URLEncoder;
...
@@ -6,12 +6,9 @@ import java.net.URLEncoder;
import
java.text.ParseException
;
import
java.text.ParseException
;
import
java.text.SimpleDateFormat
;
import
java.text.SimpleDateFormat
;
import
java.util.*
;
import
java.util.*
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
java.util.stream.Collectors
;
import
java.util.stream.Collectors
;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSON
;
import
com.mongodb.client.result.UpdateResult
;
import
com.zhiwei.searchhotcrawler.bean.*
;
import
com.zhiwei.searchhotcrawler.bean.*
;
import
com.zhiwei.searchhotcrawler.config.RedisConfig
;
import
com.zhiwei.searchhotcrawler.config.RedisConfig
;
import
com.zhiwei.searchhotcrawler.dao.RedisDao
;
import
com.zhiwei.searchhotcrawler.dao.RedisDao
;
...
@@ -22,19 +19,10 @@ import lombok.extern.log4j.Log4j2;
...
@@ -22,19 +19,10 @@ import lombok.extern.log4j.Log4j2;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpGet
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.HttpClients
;
import
org.apache.http.util.EntityUtils
;
import
org.bson.Document
;
import
org.bson.Document
;
import
org.checkerframework.checker.units.qual.C
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
...
@@ -43,86 +31,126 @@ import com.zhiwei.crawler.core.proxy.ProxyHolder;
...
@@ -43,86 +31,126 @@ import com.zhiwei.crawler.core.proxy.ProxyHolder;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.mail.SendMailWeibo
;
import
com.zhiwei.searchhotcrawler.mail.SendMailWeibo
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
org.springframework.beans.factory.annotation.Autowired
;
import
static
java
.
util
.
Objects
.
isNull
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
static
java
.
util
.
Objects
.
nonNull
;
/**
/**
* @author hero
* @author hero
* @author hero
* @ClassName: WeiboHotSearch
* @ClassName: WeiboHotSearch
* @Description: 微博实时热搜采集
* @Description: 微博实时热搜采集
* @author hero
* @date 2017年9月15日 上午10:54:31
* @date 2017年9月15日 上午10:54:31
*/
*/
@Log4j2
@Log4j2
public
class
WeiboHotSearchCrawler
{
public
class
WeiboHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
RedisDao
redisDao
=
new
RedisDao
();
private
static
RedisDao
redisDao
=
new
RedisDao
();
static
WeiBoUserDao
weiBoUserDao
=
new
WeiBoUserDao
();
static
WeiBoUserDao
weiBoUserDao
=
new
WeiBoUserDao
();
static
WeiBoMassageDao
weiBoMassageDao
=
new
WeiBoMassageDao
();
static
WeiBoMassageDao
weiBoMassageDao
=
new
WeiBoMassageDao
();
/**
* @Title: weiboHotSearchTest
/**
* @author hero
* @return void 返回类型
* @Description: TODO(PC端微博热搜采集)
* @Title: weiboHotSearchTest
* @return void 返回类型
* @author hero
*/
* @Description: TODO(PC端微博热搜采集)
// public static List<HotSearchList> weiboHotSearch(){
*/
// String url = "https://s.weibo.com/top/summary?cate=realtimehot";
public
static
List
<
HotSearchList
>
weiboHotSearch
()
{
//
String
url
=
"https://s.weibo.com/top/summary?cate=realtimehot"
;
// List<HotSearchList> list = new ArrayList<HotSearchList>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
// for(int i =0; i<3; i++){
headerMap
.
put
(
"Cookie"
,
"SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN"
);
// String htmlBody = null;
List
<
HotSearchList
>
list
=
new
ArrayList
<
HotSearchList
>();
// Request request = RequestUtils.wrapGet(url);
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
// try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
String
htmlBody
=
null
;
// htmlBody = response.body().string();
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
// } catch (Exception e) {
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
// if(i==2){
htmlBody
=
response
.
body
().
string
();
// return list;
}
catch
(
Exception
e
)
{
// }else{
if
(
i
==
2
)
{
// continue;
return
list
;
// }
}
else
{
// }
continue
;
// if (htmlBody != null && htmlBody.contains("pl_top_realtimehot")) {
}
// try {
}
//// String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0];
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"pl_top_realtimehot"
))
{
//// script = script.replace("(", "").replace(")", "");
try
{
//// JSONObject json = JSONObject.parseObject(script);
Date
date
=
new
Date
();
//// String html = json.getString("html");
org
.
jsoup
.
nodes
.
Document
document
=
Jsoup
.
parse
(
htmlBody
);
// Document document = Jsoup.parse(htmlBody);
Elements
elements
=
document
.
select
(
"div#pl_top_realtimehot"
).
select
(
"tbody"
).
select
(
"tr"
);
// Elements elements = document.select("div#pl_top_realtimehot").select("tbody").select("tr");
for
(
Element
element
:
elements
)
{
// for (Element element : elements) {
try
{
// try {
//获取链接
// String id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href");
String
id
=
"http://s.weibo.com"
+
element
.
select
(
"td.td-02"
).
select
(
"a"
).
attr
(
"href"
);
// String name = element.select("td.td-02").select("a").text();
//获取标题
// String num = !element.select("td.td-02").select("span").text().equals("") ? element.select("td.td-02").select("span").text() : "0";
String
name
=
element
.
select
(
"td.td-02"
).
select
(
"a"
).
text
();
// String rank = !element.select("td[class=\"td-01 ranktop\"]").text().equals("") ? element.select("td[class=\"td-01 ranktop\"]").text() : "-1";
//获取热度值
//
String
num
=
element
.
select
(
"td.td-02"
).
select
(
"span"
).
text
();
// int hotCount = Integer.valueOf(num);
//获取排名
// int rankCount = Integer.valueOf(rank);
String
rank
=
element
.
select
(
"td.td-01"
).
text
();
// HotSearchList hotSearch = new HotSearchList(id, name, hotCount, true, rankCount, HotSearchType.微博热搜.name(), null);
Integer
rankCount
=
null
;
// list.add(hotSearch);
//默认推荐位排名为0 置顶为-1
// } catch (Exception e) {
if
(
"•"
.
equals
(
rank
))
{
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
rankCount
=
0
;
// log.error("解析微博时时热搜时出现解析错误", e);
id
=
"http://s.weibo.com"
+
element
.
select
(
"td.td-02"
).
select
(
"a"
).
attr
(
"href_to"
);
// continue;
}
else
if
(
StringUtils
.
isEmpty
(
rank
))
{
// }
rankCount
=
-
1
;
// }
}
else
{
// } catch (Exception e) {
rankCount
=
Integer
.
valueOf
(
rank
);
// log.error("解析微博时时热搜时出现解析错误,数据不是json结构", e.fillInStackTrace());
}
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
//获取icon
// return null;
String
text
=
element
.
select
(
"td.td-03"
).
text
();
// }
String
icon
=
null
;
// } else {
if
(
StringUtils
.
isNotEmpty
(
text
)
&&
nonNull
(
text
))
{
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
if
(
"商"
.
equals
(
text
))
{
// log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
icon
=
"jian"
;
// }
}
else
if
(
"新"
.
equals
(
text
))
{
// break;
icon
=
"new"
;
// }
}
else
if
(
"热"
.
equals
(
text
))
{
// return list;
icon
=
"hot"
;
// }
}
else
if
(
"沸"
.
equals
(
text
))
{
icon
=
"fei"
;
}
else
if
(
"爆"
.
equals
(
text
))
{
icon
=
"boom"
;
}
}
//获取热度标签
String
heatLabel
=
null
;
//获取热度值 置顶 推荐位 默认值为0
Long
hotCount
=
0L
;
if
(
StringUtils
.
isNotEmpty
(
num
)
&&
Objects
.
nonNull
(
num
))
{
String
[]
split
=
num
.
split
(
" "
);
if
(
split
.
length
>
1
)
{
heatLabel
=
split
[
0
].
trim
();
hotCount
=
Long
.
valueOf
(
split
[
1
].
trim
());
}
else
{
hotCount
=
Long
.
valueOf
(
num
);
}
}
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
true
,
rankCount
,
HotSearchType
.
微博热搜
.
name
(),
icon
,
date
);
hotSearch
.
setHeatLabel
(
heatLabel
);
list
.
add
(
hotSearch
);
}
catch
(
Exception
e
)
{
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
log
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
continue
;
}
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
return
null
;
}
}
else
{
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
log
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
}
break
;
}
return
list
;
}
/**
/**
...
@@ -263,20 +291,20 @@ public class WeiboHotSearchCrawler {
...
@@ -263,20 +291,20 @@ public class WeiboHotSearchCrawler {
JSONObject
cardInfo
=
cardGroup
.
getJSONObject
(
j
);
JSONObject
cardInfo
=
cardGroup
.
getJSONObject
(
j
);
String
name
=
cardInfo
.
getString
(
"desc"
);
String
name
=
cardInfo
.
getString
(
"desc"
);
String
desc_extr
=
cardInfo
.
getString
(
"desc_extr"
);
String
desc_extr
=
cardInfo
.
getString
(
"desc_extr"
);
String
heatLabel
=
null
;
String
heatLabel
=
null
;
Long
hotCount
=
null
;
Long
hotCount
=
null
;
if
(
Objects
.
nonNull
(
desc_extr
)){
if
(
Objects
.
nonNull
(
desc_extr
))
{
String
[]
split
=
desc_extr
.
split
(
" "
);
String
[]
split
=
desc_extr
.
split
(
" "
);
if
(
split
.
length
>
1
)
{
if
(
split
.
length
>
1
)
{
heatLabel
=
split
[
0
].
trim
();
heatLabel
=
split
[
0
].
trim
();
hotCount
=
Long
.
valueOf
(
split
[
1
].
trim
());
hotCount
=
Long
.
valueOf
(
split
[
1
].
trim
());
}
else
{
}
else
{
hotCount
=
cardInfo
.
getLongValue
(
"desc_extr"
);
hotCount
=
cardInfo
.
getLongValue
(
"desc_extr"
);
}
}
}
}
String
iconUrl
=
cardInfo
.
getString
(
"icon"
);
String
iconUrl
=
cardInfo
.
getString
(
"icon"
);
String
icon
=
null
;
String
icon
=
null
;
if
(
StringUtils
.
isNotBlank
(
iconUrl
))
{
if
(
StringUtils
.
isNotBlank
(
iconUrl
))
{
icon
=
iconUrl
.
split
(
"_"
)[
1
].
split
(
".png"
)[
0
];
icon
=
iconUrl
.
split
(
"_"
)[
1
].
split
(
".png"
)[
0
];
}
}
...
@@ -284,7 +312,9 @@ public class WeiboHotSearchCrawler {
...
@@ -284,7 +312,9 @@ public class WeiboHotSearchCrawler {
String
id
=
cardInfo
.
getString
(
"scheme"
);
String
id
=
cardInfo
.
getString
(
"scheme"
);
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微博热搜
.
name
(),
icon
,
date
);
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微博热搜
.
name
(),
icon
,
date
);
hotSearch
.
setHeatLabel
(
heatLabel
);
hotSearch
.
setHeatLabel
(
heatLabel
);
if
(
Objects
.
nonNull
(
iconUrl
)){
hotSearch
.
setIconUrl
(
iconUrl
);}
if
(
Objects
.
nonNull
(
iconUrl
))
{
hotSearch
.
setIconUrl
(
iconUrl
);
}
result
.
add
(
hotSearch
);
result
.
add
(
hotSearch
);
rank
++;
rank
++;
redisDao
.
addDataToSet
(
RedisConfig
.
WEIBO_HOTSEARCHIDS
,
name
+
"_微博热搜"
);
redisDao
.
addDataToSet
(
RedisConfig
.
WEIBO_HOTSEARCHIDS
,
name
+
"_微博热搜"
);
...
@@ -371,7 +401,7 @@ public class WeiboHotSearchCrawler {
...
@@ -371,7 +401,7 @@ public class WeiboHotSearchCrawler {
}
catch
(
UnsupportedEncodingException
e
)
{
}
catch
(
UnsupportedEncodingException
e
)
{
log
.
error
(
"更新导语时字符解析成URl模式异常"
,
e
);
log
.
error
(
"更新导语时字符解析成URl模式异常"
,
e
);
}
}
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid=100103type"
+
encode
;
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid=100103type"
+
encode
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
...
@@ -389,7 +419,7 @@ public class WeiboHotSearchCrawler {
...
@@ -389,7 +419,7 @@ public class WeiboHotSearchCrawler {
document
.
put
(
"topicLead"
,
topicLead
);
document
.
put
(
"topicLead"
,
topicLead
);
}
}
}
}
if
(
json
.
containsKey
(
"cardlist_head_cards"
)
&&
!
json
.
getJSONArray
(
"cardlist_head_cards"
).
isEmpty
())
{
if
(
json
.
containsKey
(
"cardlist_head_cards"
)
&&
!
json
.
getJSONArray
(
"cardlist_head_cards"
).
isEmpty
())
{
JSONObject
readJson
=
json
.
getJSONArray
(
"cardlist_head_cards"
).
getJSONObject
(
0
);
JSONObject
readJson
=
json
.
getJSONArray
(
"cardlist_head_cards"
).
getJSONObject
(
0
);
if
(
readJson
.
containsKey
(
"head_data"
))
{
if
(
readJson
.
containsKey
(
"head_data"
))
{
String
midText
=
readJson
.
getJSONObject
(
"head_data"
).
getString
(
"midtext"
);
String
midText
=
readJson
.
getJSONObject
(
"head_data"
).
getString
(
"midtext"
);
...
@@ -465,9 +495,11 @@ public class WeiboHotSearchCrawler {
...
@@ -465,9 +495,11 @@ public class WeiboHotSearchCrawler {
}
catch
(
UnsupportedEncodingException
e
)
{
}
catch
(
UnsupportedEncodingException
e
)
{
log
.
error
(
"字符解析成URl模式异常"
,
e
);
log
.
error
(
"字符解析成URl模式异常"
,
e
);
}
}
String
url
=
"https://s.weibo.com/weibo?q="
+
encode
+
"&Refer=top"
;
String
url
=
"https://s.weibo.com/weibo?q="
+
encode
+
"&Refer=top"
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"Cookie"
,
"SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN"
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
}
catch
(
IOException
e
)
{
...
@@ -634,13 +666,13 @@ public class WeiboHotSearchCrawler {
...
@@ -634,13 +666,13 @@ public class WeiboHotSearchCrawler {
Long
followerCount
=
null
;
Long
followerCount
=
null
;
if
(
followers_count
.
contains
(
"万"
))
{
if
(
followers_count
.
contains
(
"万"
))
{
String
[]
split
=
followers_count
.
split
(
"万"
);
String
[]
split
=
followers_count
.
split
(
"万"
);
Double
aDouble
=
Double
.
valueOf
(
split
[
0
])
*
10000
;
Double
aDouble
=
Double
.
valueOf
(
split
[
0
])
*
10000
;
followerCount
=
new
Double
(
aDouble
).
longValue
();
followerCount
=
new
Double
(
aDouble
).
longValue
();
}
else
if
(
followers_count
.
contains
(
"亿"
)){
}
else
if
(
followers_count
.
contains
(
"亿"
))
{
String
[]
split
=
followers_count
.
split
(
"亿"
);
String
[]
split
=
followers_count
.
split
(
"亿"
);
Double
aDouble
=
Double
.
valueOf
(
split
[
0
])
*
100000000
;
Double
aDouble
=
Double
.
valueOf
(
split
[
0
])
*
100000000
;
followerCount
=
new
Double
(
aDouble
).
longValue
();
followerCount
=
new
Double
(
aDouble
).
longValue
();
}
else
{
}
else
{
followerCount
=
Long
.
valueOf
(
followers_count
);
followerCount
=
Long
.
valueOf
(
followers_count
);
}
}
//用户头像地址
//用户头像地址
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment