Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
2c471a78
Commit
2c471a78
authored
Sep 29, 2018
by
[zhangzhiwei]
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改微博热搜解析规则
parent
c071c8fd
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
60 additions
and
39 deletions
+60
-39
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+9
-9
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
+3
-3
src/main/java/com/zhiwei/searchhotcrawler/timer/SendWeiboHotSearchRun.java
+23
-13
src/main/java/com/zhiwei/searchhotcrawler/timer/SendZhihuHotSearchRun.java
+25
-14
No files found.
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
2c471a78
...
...
@@ -38,10 +38,10 @@ public class WeiboHotSearchCrawler {
* @return void 返回类型
*/
public
static
List
<
WeiboHotSearch
>
weiboHotSearch
(){
String
url
=
"http://s.weibo.com/top/summary?cate=realtimehot"
;
String
url
=
"http
s
://s.weibo.com/top/summary?cate=realtimehot"
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
headerMap
.
put
(
"Referer"
,
"http
://s.weibo.com/top/summary?cate=realtimehot"
);
// headerMap.put("Referer", "https
://s.weibo.com/top/summary?cate=realtimehot");
headerMap
.
put
(
"Host"
,
"s.weibo.com"
);
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
);
headerMap
.
put
(
"Upgrade-Insecure-Requests"
,
"1"
);
...
...
@@ -54,12 +54,12 @@ public class WeiboHotSearchCrawler {
htmlBody
=
HttpClientTemplateOK
.
get
(
url
,
null
,
headerMap
);
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"pl_top_realtimehot"
)){
try
{
String
script
=
htmlBody
.
split
(
"<script>STK && STK.pageletM && STK.pageletM.view"
)[
5
].
split
(
"<\\/script>"
)[
0
];
script
=
script
.
replace
(
"("
,
""
).
replace
(
")"
,
""
);
JSONObject
json
=
JSONObject
.
parseObject
(
script
);
String
html
=
json
.
getString
(
"html"
);
Document
document
=
Jsoup
.
parse
(
html
);
Elements
elements
=
document
.
select
(
"tbody"
).
select
(
"tr"
);
//
String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0];
//
script = script.replace("(", "").replace(")", "");
//
JSONObject json = JSONObject.parseObject(script);
//
String html = json.getString("html");
Document
document
=
Jsoup
.
parse
(
html
Body
);
Elements
elements
=
document
.
select
(
"
div#pl_top_realtimehot"
).
select
(
"
tbody"
).
select
(
"tr"
);
for
(
Element
element
:
elements
){
try
{
String
id
=
"http://s.weibo.com"
+
element
.
select
(
"p.star_name"
).
select
(
"a"
).
attr
(
"href"
);
...
...
@@ -97,7 +97,7 @@ public class WeiboHotSearchCrawler {
return
list
;
}
/**
...
...
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
View file @
2c471a78
...
...
@@ -16,18 +16,18 @@ public class HotSearchRun {
private
ScheduledExecutorService
scheduExec
;
public
HotSearchRun
()
{
this
.
scheduExec
=
Executors
.
newScheduledThreadPool
(
3
);
this
.
scheduExec
=
Executors
.
newScheduledThreadPool
(
2
);
}
public
void
showTimer
()
{
scheduExec
.
scheduleAtFixedRate
(
new
WeiboHotSearchRun
(),
0
,
1
,
TimeUnit
.
MINUTES
);
scheduExec
.
scheduleAtFixedRate
(
new
ZhihuHotSearchRun
(),
0
,
1
,
TimeUnit
.
MINUTES
);
scheduExec
.
scheduleAtFixedRate
(
new
SendZhihuHotSearchRun
(),
0
,
60
,
TimeUnit
.
MINUTES
);
scheduExec
.
scheduleAtFixedRate
(
new
SendWeiboHotSearchRun
(),
0
,
60
,
TimeUnit
.
MINUTES
);
}
public
static
void
main
(
String
[]
args
)
{
new
HotSearchRun
().
showTimer
();
new
CacheListener
().
startListen
();
new
SendWeiboHotSearchRun
().
start
();
new
SendZhihuHotSearchRun
().
start
();
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/SendWeiboHotSearchRun.java
View file @
2c471a78
...
...
@@ -14,6 +14,7 @@ import com.zhiwei.searchhotcrawler.util.Template;
import
com.zhiwei.searchhotcrawler.util.WechatCodeUtil
;
import
com.zhiwei.searchhotcrawler.util.WechatConstant
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
public
class
SendWeiboHotSearchRun
extends
Thread
{
private
WeiboHotSearchDAO
weiboHotSearchDAO
=
new
WeiboHotSearchDAO
();
...
...
@@ -21,20 +22,29 @@ public class SendWeiboHotSearchRun extends Thread{
@Override
public
void
run
()
{
Calendar
calendar
=
Calendar
.
getInstance
();
int
hour
=
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
);
logger
.
info
(
"微博推送,当前系统时间为:"
+
hour
);
if
(
hour
>
6
&&
hour
<
23
){
List
<
DBObject
>
list
=
weiboHotSearchDAO
.
getWeiboHotOneHour
();
if
(
list
!=
null
&&
list
.
size
()>
0
){
for
(
DBObject
weibo
:
list
){
String
title
=
weibo
.
get
(
"name"
).
toString
();
String
time
=
TimeParse
.
dateFormartString
((
Date
)
weibo
.
get
(
"time"
),
"yyyy-MM-dd HH:mm:ss"
);
String
url
=
weibo
.
get
(
"url"
).
toString
();
sendTemplateByUserIds
(
title
,
time
,
url
);
while
(
true
)
{
try
{
Calendar
calendar
=
Calendar
.
getInstance
();
int
hour
=
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
);
logger
.
info
(
"微博推送,当前系统时间为:"
+
hour
);
if
(
hour
>
6
&&
hour
<
23
){
List
<
DBObject
>
list
=
weiboHotSearchDAO
.
getWeiboHotOneHour
();
if
(
list
!=
null
&&
list
.
size
()>
0
){
for
(
DBObject
weibo
:
list
){
String
title
=
weibo
.
get
(
"name"
).
toString
();
String
time
=
TimeParse
.
dateFormartString
((
Date
)
weibo
.
get
(
"time"
),
"yyyy-MM-dd HH:mm:ss"
);
String
url
=
weibo
.
get
(
"url"
).
toString
();
sendTemplateByUserIds
(
title
,
time
,
url
);
}
}
else
{
sendTemplateByUserIds
(
"最近一小时无数据"
,
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd HH:mm:ss"
),
null
);
}
}
}
else
{
sendTemplateByUserIds
(
"最近一小时无数据"
,
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd HH:mm:ss"
),
null
);
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
}
catch
(
Exception
e
)
{
logger
.
debug
(
"微博热搜推送出现问题,问题为:::{}"
,
e
.
fillInStackTrace
());
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
continue
;
}
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/SendZhihuHotSearchRun.java
View file @
2c471a78
...
...
@@ -17,28 +17,39 @@ import com.zhiwei.searchhotcrawler.util.Template;
import
com.zhiwei.searchhotcrawler.util.WechatCodeUtil
;
import
com.zhiwei.searchhotcrawler.util.WechatConstant
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
public
class
SendZhihuHotSearchRun
extends
Thread
{
private
ZhihuHotSearchDAO
zhihuHotSearchDAO
=
new
ZhihuHotSearchDAO
();
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SendZhihuHotSearchRun
.
class
);
@Override
public
void
run
()
{
Calendar
calendar
=
Calendar
.
getInstance
();
int
hour
=
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
);
logger
.
info
(
"知乎推送,当前系统时间为:"
+
hour
);
if
(
hour
>
6
&&
hour
<
23
){
List
<
DBObject
>
list
=
zhihuHotSearchDAO
.
getZhiHuHotSearch
();
if
(
list
!=
null
&&
list
.
size
()>
0
){
for
(
DBObject
zhihu
:
list
){
String
title
=
zhihu
.
get
(
"display_query"
).
toString
();
String
time
=
TimeParse
.
dateFormartString
((
Date
)
zhihu
.
get
(
"time"
),
"yyyy-MM-dd HH:mm:ss"
);
String
url
=
zhihu
.
get
(
"_id"
).
toString
();
if
(
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
)
>
6
&&
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
)
<
23
){
sendTemplateByUserIds
(
title
,
time
,
url
);
while
(
true
)
{
try
{
Calendar
calendar
=
Calendar
.
getInstance
();
int
hour
=
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
);
logger
.
info
(
"知乎推送,当前系统时间为:"
+
hour
);
if
(
hour
>
6
&&
hour
<
23
){
List
<
DBObject
>
list
=
zhihuHotSearchDAO
.
getZhiHuHotSearch
();
if
(
list
!=
null
&&
list
.
size
()>
0
){
for
(
DBObject
zhihu
:
list
){
String
title
=
zhihu
.
get
(
"display_query"
).
toString
();
String
time
=
TimeParse
.
dateFormartString
((
Date
)
zhihu
.
get
(
"time"
),
"yyyy-MM-dd HH:mm:ss"
);
String
url
=
zhihu
.
get
(
"_id"
).
toString
();
if
(
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
)
>
6
&&
calendar
.
get
(
Calendar
.
HOUR_OF_DAY
)
<
23
){
sendTemplateByUserIds
(
title
,
time
,
url
);
}
}
}
else
{
sendTemplateByUserIds
(
"最近一小时无数据"
,
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd HH:mm:ss"
),
null
);
}
}
}
else
{
sendTemplateByUserIds
(
"最近一小时无数据"
,
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd HH:mm:ss"
),
null
);
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
}
catch
(
Exception
e
)
{
logger
.
debug
(
"知乎热搜推送出现问题,问题为:::{}"
,
e
.
fillInStackTrace
());
ZhiWeiTools
.
sleep
(
1
*
60
*
60
*
1000
);
continue
;
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment