Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
8ec17aa9
Commit
8ec17aa9
authored
Sep 26, 2021
by
leiliangliang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
更新微博pc端采集程序
parent
5ab03924
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
99 additions
and
103 deletions
+99
-103
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+99
-103
No files found.
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
8ec17aa9
...
@@ -52,109 +52,105 @@ public class WeiboHotSearchCrawler {
...
@@ -52,109 +52,105 @@ public class WeiboHotSearchCrawler {
static
WeiBoUserDao
weiBoUserDao
=
new
WeiBoUserDao
();
static
WeiBoUserDao
weiBoUserDao
=
new
WeiBoUserDao
();
static
WeiBoMassageDao
weiBoMassageDao
=
new
WeiBoMassageDao
();
static
WeiBoMassageDao
weiBoMassageDao
=
new
WeiBoMassageDao
();
// /**
/**
// * @return void 返回类型
* @return void 返回类型
// * @Title: weiboHotSearchTest
* @Title: weiboHotSearchTest
// * @author hero
* @author hero
// * @Description: TODO(PC端微博热搜采集)
* @Description: TODO(PC端微博热搜采集)
// */
*/
// public static List<HotSearchList> weiboHotSearch() {
public
static
List
<
HotSearchList
>
weiboHotSearch
()
{
// String url = "https://s.weibo.com/top/summary?cate=realtimehot";
String
url
=
"https://s.weibo.com/top/summary?cate=realtimehot"
;
// Map<String, String> headerMap = new HashMap<>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
// headerMap.put("Cookie", "SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN");
headerMap
.
put
(
"Cookie"
,
"SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN"
);
// List<HotSearchList> list = new ArrayList<HotSearchList>();
List
<
HotSearchList
>
list
=
new
ArrayList
<
HotSearchList
>();
// for (int i = 0; i < 3; i++) {
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
// String htmlBody = null;
String
htmlBody
=
null
;
// Request request = RequestUtils.wrapGet(url, headerMap);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
// try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
// htmlBody = response.body().string();
htmlBody
=
response
.
body
().
string
();
// } catch (Exception e) {
}
catch
(
Exception
e
)
{
// if (i == 2) {
if
(
i
==
2
)
{
// return list;
return
list
;
// } else {
}
else
{
// continue;
continue
;
// }
}
// }
}
// if (htmlBody != null && htmlBody.contains("pl_top_realtimehot")) {
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"pl_top_realtimehot"
))
{
// try {
try
{
//// String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0];
Date
date
=
new
Date
();
//// script = script.replace("(", "").replace(")", "");
org
.
jsoup
.
nodes
.
Document
document
=
Jsoup
.
parse
(
htmlBody
);
//// JSONObject json = JSONObject.parseObject(script);
Elements
elements
=
document
.
select
(
"div#pl_top_realtimehot"
).
select
(
"tbody"
).
select
(
"tr"
);
//// String html =
for
(
Element
element
:
elements
)
{
// Date date = new Date();
try
{
// org.jsoup.nodes.Document document = Jsoup.parse(htmlBody);
//获取链接
// Elements elements = document.select("div#pl_top_realtimehot").select("tbody").select("tr");
String
id
=
"http://s.weibo.com"
+
element
.
select
(
"td.td-02"
).
select
(
"a"
).
attr
(
"href"
);
// for (Element element : elements) {
//获取标题
// try {
String
name
=
element
.
select
(
"td.td-02"
).
select
(
"a"
).
text
();
// String id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href");
//获取热度值
// String name = element.select("td.td-02").select("a").text();
String
num
=
element
.
select
(
"td.td-02"
).
select
(
"span"
).
text
();
// //String num = !element.select("td.td-02").select("span").text().equals("") ? element.select("td.td-02").select("span").text() : "0";
//获取排名
// String num = element.select("td.td-02").select("span").text();
String
rank
=
element
.
select
(
"td.td-01"
).
text
();
// //String rank = !element.select("td[class=\"td-01 ranktop\"]").text().equals("") ? element.select("td[class=\"td-01 ranktop\"]").text() : "-1";
Integer
rankCount
=
null
;
// //获取排名
//默认推荐位排名为0 置顶为-1
// String rank = element.select("td.td-01").text();
if
(
"•"
.
equals
(
rank
))
{
// Integer rankCount = null;
rankCount
=
0
;
// //默认推荐位排名为0 置顶为-1
id
=
"http://s.weibo.com"
+
element
.
select
(
"td.td-02"
).
select
(
"a"
).
attr
(
"href_to"
);
// if ("•".equals(rank)) {
}
else
if
(
StringUtils
.
isEmpty
(
rank
))
{
// rankCount = 0;
rankCount
=
-
1
;
// id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href_to");
}
else
{
// } else if (StringUtils.isEmpty(rank)) {
rankCount
=
Integer
.
valueOf
(
rank
);
// rankCount = -1;
}
// } else {
//获取icon
// rankCount = Integer.valueOf(rank);
String
text
=
element
.
select
(
"td.td-03"
).
text
();
// }
String
icon
=
null
;
// //获取icon
if
(
StringUtils
.
isNotEmpty
(
text
)
&&
nonNull
(
text
))
{
// String text = element.select("td.td-03").text();
if
(
"商"
.
equals
(
text
))
{
// String icon = null;
icon
=
"jian"
;
// if (StringUtils.isNotEmpty(text) && nonNull(text)) {
}
else
if
(
"新"
.
equals
(
text
))
{
// if ("商".equals(text)) {
icon
=
"new"
;
// icon = "jian";
}
else
if
(
"热"
.
equals
(
text
))
{
// } else if ("新".equals(text)) {
icon
=
"hot"
;
// icon = "new";
}
else
if
(
"沸"
.
equals
(
text
))
{
// } else if ("热".equals(text)) {
icon
=
"fei"
;
// icon = "hot";
}
else
if
(
"爆"
.
equals
(
text
))
{
// } else if ("沸".equals(text)) {
icon
=
"boom"
;
// icon = "fei";
}
// } else if ("爆".equals(text)) {
}
// icon = "boom";
//获取热度标签
// }
String
heatLabel
=
null
;
// }
//获取热度值 置顶 推荐位 默认值为0
// //获取热度标签
Long
hotCount
=
0L
;
// String heatLabel = null;
if
(
StringUtils
.
isNotEmpty
(
num
)
&&
Objects
.
nonNull
(
num
))
{
// //获取热度值 置顶 推荐位 默认值为0
String
[]
split
=
num
.
split
(
" "
);
// Long hotCount =0L;
if
(
split
.
length
>
1
)
{
// if (StringUtils.isNotEmpty(num) && Objects.nonNull(num)) {
heatLabel
=
split
[
0
].
trim
();
// String[] split = num.split(" ");
hotCount
=
Long
.
valueOf
(
split
[
1
].
trim
());
// if (split.length > 1) {
}
else
{
// heatLabel = split[0].trim();
hotCount
=
Long
.
valueOf
(
num
);
// hotCount = Long.valueOf(split[1].trim());
}
// }else {
}
// hotCount = Long.valueOf(num);
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
true
,
rankCount
,
HotSearchType
.
微博热搜
.
name
(),
icon
,
date
);
// }
hotSearch
.
setHeatLabel
(
heatLabel
);
// }
list
.
add
(
hotSearch
);
// // Long hotCount = Long.valueOf(num);
}
catch
(
Exception
e
)
{
// HotSearchList hotSearch = new HotSearchList(id, name, hotCount, true, rankCount, HotSearchType.微博热搜.name(), icon, date);
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
// hotSearch.setHeatLabel(heatLabel);
log
.
error
(
"解析微博时时热搜时出现解析错误"
,
e
);
// list.add(hotSearch);
continue
;
// } catch (Exception e) {
}
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
}
// log.error("解析微博时时热搜时出现解析错误", e);
}
catch
(
Exception
e
)
{
// continue;
log
.
error
(
"解析微博时时热搜时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
// }
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
// }
return
null
;
// } catch (Exception e) {
}
// log.error("解析微博时时热搜时出现解析错误,数据不是json结构", e.fillInStackTrace());
}
else
{
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
SendMailWeibo
.
sendMail
(
"微博热搜采集出现问题"
,
"859548429@qq.com"
);
// return null;
log
.
info
(
"解析微博时时热搜时出现解析错误,页面结构有问题"
);
// }
}
// } else {
break
;
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
}
// log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
return
list
;
// }
}
// break;
// }
// return list;
// }
/**
/**
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment