Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
df8ce8d3
Commit
df8ce8d3
authored
Jun 15, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加休眠时间自配
parent
1325c572
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
48 additions
and
32 deletions
+48
-32
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
+11
-6
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
+4
-2
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
+8
-4
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
+4
-2
src/main/java/com/zhiwei/media_data_crawler/crawler/TianYaCrawlerParse.java
+5
-2
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
+16
-16
No files found.
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
View file @
df8ce8d3
...
...
@@ -25,7 +25,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaiduNewsCrawlerParse
.
class
);
private
static
final
String
pt
=
"百度新闻"
;
/**
* @Title: getBaiduNewsData
* @author hero
...
...
@@ -44,7 +44,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
* @throws Exception
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getBaiduNewsData
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
)
throws
Exception
{
public
static
List
<
NewsData
>
getBaiduNewsData
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
,
Long
sleepTime
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
0
;
boolean
more
=
true
;
...
...
@@ -63,7 +63,9 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
more
=
false
;
}
page
++;
ZhiWeiTools
.
sleep
(
3000
);
if
(
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
3000
);
}
}
return
list
;
}
...
...
@@ -108,7 +110,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
* @throws Exception
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getBaiduNewsDataByTitle
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
)
throws
Exception
{
public
static
List
<
NewsData
>
getBaiduNewsDataByTitle
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
,
Long
sleepTime
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
0
;
boolean
more
=
true
;
...
...
@@ -127,7 +129,9 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
more
=
false
;
}
page
++;
ZhiWeiTools
.
sleep
(
3000
);
if
(
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
3000
);
}
}
return
list
;
}
...
...
@@ -252,9 +256,10 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
time
=
soureAndtimes
[
1
];
source
=
soureAndtimes
[
0
];
}
else
{
time
=
element
.
select
(
"div.c-row"
).
select
(
"p.c-author"
).
text
();
time
=
element
.
select
(
"div.c-row"
).
select
(
"p.c-author"
).
text
()
.
trim
()
;
}
/** 文章发布时间处理 **/
time
=
time
.
replaceAll
(
" "
,
""
);
time
=
TimeParse
.
dateFormartString
(
TimeParse
.
stringFormartDate
(
time
),
"yyyy-MM-dd HH:mm:ss"
);
// 处理文章简介
if
(
element
.
select
(
"div.c-row"
)
!=
null
)
{
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
View file @
df8ce8d3
...
...
@@ -33,7 +33,7 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
* @return List<TiebaData> 返回类型
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
TiebaData
>
getBaiduTiebaData
(
String
word
,
Proxy
proxy
,
String
tiebaName
)
throws
Exception
{
public
static
List
<
TiebaData
>
getBaiduTiebaData
(
String
word
,
Proxy
proxy
,
String
tiebaName
,
Long
sleepTime
)
throws
Exception
{
List
<
TiebaData
>
list
=
new
ArrayList
<
TiebaData
>();
int
page
=
0
;
boolean
more
=
true
;
...
...
@@ -52,7 +52,9 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
more
=
false
;
}
page
++;
ZhiWeiTools
.
sleep
(
3000
);
if
(
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
3000
);
}
}
return
list
;
}
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
View file @
df8ce8d3
...
...
@@ -38,7 +38,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @throws Exception
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getSougouNewsData
(
String
word
,
Proxy
proxy
)
throws
Exception
{
public
static
List
<
NewsData
>
getSougouNewsData
(
String
word
,
Proxy
proxy
,
Long
sleepTime
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
1
;
boolean
more
=
true
;
...
...
@@ -56,8 +56,10 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
}
else
{
more
=
false
;
}
ZhiWeiTools
.
sleep
(
5000
);
page
++;
if
(
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
5000
);
}
}
return
list
;
}
...
...
@@ -74,7 +76,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getSougouNewsDataByTitle
(
String
word
,
Proxy
proxy
)
throws
Exception
{
public
static
List
<
NewsData
>
getSougouNewsDataByTitle
(
String
word
,
Proxy
proxy
,
Long
sleepTime
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
0
;
boolean
more
=
true
;
...
...
@@ -93,7 +95,9 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
more
=
false
;
}
page
++;
ZhiWeiTools
.
sleep
(
5000
);
if
(
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
5000
);
}
}
return
list
;
}
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
View file @
df8ce8d3
...
...
@@ -41,7 +41,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @throws Exception
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
ZhiHuData
>
getSougouZhihuData
(
String
word
,
Proxy
proxy
)
throws
Exception
{
public
static
List
<
ZhiHuData
>
getSougouZhihuData
(
String
word
,
Proxy
proxy
,
Long
sleepTime
)
throws
Exception
{
List
<
ZhiHuData
>
list
=
new
ArrayList
<
ZhiHuData
>();
int
page
=
1
;
boolean
more
=
true
;
...
...
@@ -60,7 +60,9 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
}
else
{
more
=
false
;
}
ZhiWeiTools
.
sleep
(
5000
);
if
(
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
5000
);
}
page
++;
}
return
list
;
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/TianYaCrawlerParse.java
View file @
df8ce8d3
...
...
@@ -37,7 +37,7 @@ public class TianYaCrawlerParse extends HttpClientTemplateOK {
* @return List<TiebaData> 返回类型
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
LunTanData
>
getLunTanData
(
String
word
,
Proxy
proxy
,
String
endTime
)
throws
Exception
{
public
static
List
<
LunTanData
>
getLunTanData
(
String
word
,
Proxy
proxy
,
String
endTime
,
Long
sleepTime
)
throws
Exception
{
List
<
LunTanData
>
list
=
new
ArrayList
<
LunTanData
>();
int
page
=
0
;
boolean
more
=
true
;
...
...
@@ -56,7 +56,10 @@ public class TianYaCrawlerParse extends HttpClientTemplateOK {
more
=
false
;
}
page
++;
ZhiWeiTools
.
sleep
(
3000
);
if
(
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
3000
);
}
}
return
list
;
}
...
...
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
View file @
df8ce8d3
...
...
@@ -32,9 +32,9 @@ public class DataCrawler {
* @param @return 设定文件
* @return List<NewsData> 返回类型
*/
public
static
List
<
NewsData
>
getBaiduNewsData
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
){
public
static
List
<
NewsData
>
getBaiduNewsData
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
,
Long
sleepTime
){
try
{
return
BaiduNewsCrawlerParse
.
getBaiduNewsData
(
word
,
startTime
,
endTime
,
proxy
);
return
BaiduNewsCrawlerParse
.
getBaiduNewsData
(
word
,
startTime
,
endTime
,
proxy
,
sleepTime
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
null
;
...
...
@@ -53,9 +53,9 @@ public class DataCrawler {
* @param @return 设定文件
* @return List<NewsData> 返回类型
*/
public
static
List
<
NewsData
>
getBaiduNewsDataByTitle
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
){
public
static
List
<
NewsData
>
getBaiduNewsDataByTitle
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
,
Long
sleepTime
){
try
{
return
BaiduNewsCrawlerParse
.
getBaiduNewsDataByTitle
(
word
,
startTime
,
endTime
,
proxy
);
return
BaiduNewsCrawlerParse
.
getBaiduNewsDataByTitle
(
word
,
startTime
,
endTime
,
proxy
,
sleepTime
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
null
;
...
...
@@ -112,9 +112,9 @@ public class DataCrawler {
* @param @return 设定文件
* @return List<NewsData> 返回类型
*/
public
static
List
<
NewsData
>
getSougouNewsData
(
String
word
,
Proxy
proxy
){
public
static
List
<
NewsData
>
getSougouNewsData
(
String
word
,
Proxy
proxy
,
Long
sleepTime
){
try
{
return
SougouNewsCrawlerParse
.
getSougouNewsData
(
word
,
proxy
);
return
SougouNewsCrawlerParse
.
getSougouNewsData
(
word
,
proxy
,
sleepTime
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
null
;
...
...
@@ -131,9 +131,9 @@ public class DataCrawler {
* @param @return 设定文件
* @return List<NewsData> 返回类型
*/
public
static
List
<
NewsData
>
getSougouNewsDataByTitle
(
String
word
,
Proxy
proxy
){
public
static
List
<
NewsData
>
getSougouNewsDataByTitle
(
String
word
,
Proxy
proxy
,
Long
sleepTime
){
try
{
return
SougouNewsCrawlerParse
.
getSougouNewsDataByTitle
(
word
,
proxy
);
return
SougouNewsCrawlerParse
.
getSougouNewsDataByTitle
(
word
,
proxy
,
sleepTime
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
null
;
...
...
@@ -149,9 +149,9 @@ public class DataCrawler {
* @param @return 设定文件
* @return List<ZhiHuData> 返回类型
*/
public
static
List
<
ZhiHuData
>
getSougouZhihuData
(
String
word
,
Proxy
proxy
){
public
static
List
<
ZhiHuData
>
getSougouZhihuData
(
String
word
,
Proxy
proxy
,
Long
sleepTime
){
try
{
return
SougouZhihuCrawlerParse
.
getSougouZhihuData
(
word
,
proxy
);
return
SougouZhihuCrawlerParse
.
getSougouZhihuData
(
word
,
proxy
,
sleepTime
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
null
;
...
...
@@ -167,9 +167,9 @@ public class DataCrawler {
* @param @return 设定文件
* @return List<TiebaData> 返回类型
*/
public
static
List
<
TiebaData
>
getBaiduTiebaData
(
String
word
,
Proxy
proxy
){
public
static
List
<
TiebaData
>
getBaiduTiebaData
(
String
word
,
Proxy
proxy
,
Long
sleepTime
){
try
{
return
BaiduTiebaCrawlerParse
.
getBaiduTiebaData
(
word
,
proxy
,
null
);
return
BaiduTiebaCrawlerParse
.
getBaiduTiebaData
(
word
,
proxy
,
null
,
sleepTime
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
null
;
...
...
@@ -186,9 +186,9 @@ public class DataCrawler {
* @param @return 设定文件
* @return List<TiebaData> 返回类型
*/
public
static
List
<
TiebaData
>
getBaiduTiebaData
(
String
word
,
Proxy
proxy
,
String
tiebaName
){
public
static
List
<
TiebaData
>
getBaiduTiebaData
(
String
word
,
Proxy
proxy
,
String
tiebaName
,
Long
sleepTime
){
try
{
return
BaiduTiebaCrawlerParse
.
getBaiduTiebaData
(
word
,
proxy
,
tiebaName
);
return
BaiduTiebaCrawlerParse
.
getBaiduTiebaData
(
word
,
proxy
,
tiebaName
,
sleepTime
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
null
;
...
...
@@ -205,9 +205,9 @@ public class DataCrawler {
* @param @return 设定文件
* @return List<LunTanData> 返回类型
*/
public
static
List
<
LunTanData
>
getLunTanData
(
String
word
,
Proxy
proxy
,
String
endTime
){
public
static
List
<
LunTanData
>
getLunTanData
(
String
word
,
Proxy
proxy
,
String
endTime
,
Long
sleepTime
){
try
{
return
TianYaCrawlerParse
.
getLunTanData
(
word
,
proxy
,
endTime
);
return
TianYaCrawlerParse
.
getLunTanData
(
word
,
proxy
,
endTime
,
sleepTime
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
null
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment