Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
1b78ab01
Commit
1b78ab01
authored
Jun 15, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改添加休眠时间方式,修改为在DataCrawler中统一设置
parent
df8ce8d3
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
39 additions
and
18 deletions
+39
-18
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
+7
-4
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
+4
-2
src/main/java/com/zhiwei/media_data_crawler/crawler/DoubanCrawlerParse.java
+7
-2
src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
+8
-2
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
+7
-4
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
+3
-2
src/main/java/com/zhiwei/media_data_crawler/crawler/TianYaCrawlerParse.java
+3
-2
No files found.
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
View file @
1b78ab01
...
...
@@ -8,12 +8,15 @@ import java.util.List;
import
java.util.Map
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.zhiweiTools.httpClient.HeaderTool
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
...
...
@@ -44,7 +47,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
* @throws Exception
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getBaiduNewsData
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
,
Long
sleepTime
)
throws
Exception
{
public
static
List
<
NewsData
>
getBaiduNewsData
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
0
;
boolean
more
=
true
;
...
...
@@ -63,7 +66,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
more
=
false
;
}
page
++;
if
(
sleepTime
==
null
){
if
(
DataCrawler
.
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
3000
);
}
}
...
...
@@ -110,7 +113,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
* @throws Exception
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getBaiduNewsDataByTitle
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
,
Long
sleepTime
)
throws
Exception
{
public
static
List
<
NewsData
>
getBaiduNewsDataByTitle
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
0
;
boolean
more
=
true
;
...
...
@@ -129,7 +132,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
more
=
false
;
}
page
++;
if
(
sleepTime
==
null
){
if
(
DataCrawler
.
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
3000
);
}
}
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
View file @
1b78ab01
...
...
@@ -12,6 +12,8 @@ import org.jsoup.nodes.Element;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.TiebaData
;
import
com.zhiwei.zhiweiTools.httpClient.HeaderTool
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
...
...
@@ -33,7 +35,7 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
* @return List<TiebaData> 返回类型
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
TiebaData
>
getBaiduTiebaData
(
String
word
,
Proxy
proxy
,
String
tiebaName
,
Long
sleepTime
)
throws
Exception
{
public
static
List
<
TiebaData
>
getBaiduTiebaData
(
String
word
,
Proxy
proxy
,
String
tiebaName
)
throws
Exception
{
List
<
TiebaData
>
list
=
new
ArrayList
<
TiebaData
>();
int
page
=
0
;
boolean
more
=
true
;
...
...
@@ -52,7 +54,7 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
more
=
false
;
}
page
++;
if
(
sleepTime
==
null
){
if
(
DataCrawler
.
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
3000
);
}
}
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/DoubanCrawlerParse.java
View file @
1b78ab01
...
...
@@ -14,6 +14,7 @@ import org.jsoup.select.Elements;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.DouBanData
;
import
com.zhiwei.zhiweiTools.httpClient.HeaderTool
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
...
...
@@ -189,7 +190,9 @@ public class DoubanCrawlerParse extends HttpClientTemplateOK {
String
content
=
document
.
select
(
"div.topic-doc"
).
select
(
"div#link-report"
).
select
(
"div.topic-content"
).
text
();
douban
.
setContent
(
content
);
}
ZhiWeiTools
.
sleep
(
1000
);
if
(
DataCrawler
.
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
5000
);
}
return
douban
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
...
...
@@ -267,7 +270,9 @@ public class DoubanCrawlerParse extends HttpClientTemplateOK {
douban
.
setContent
(
content
);
}
}
ZhiWeiTools
.
sleep
(
1000
);
if
(
DataCrawler
.
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
5000
);
}
return
douban
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
View file @
1b78ab01
...
...
@@ -12,6 +12,8 @@ import org.jsoup.nodes.Element;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.zhiweiTools.httpClient.HeaderTool
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
...
...
@@ -55,7 +57,9 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
more
=
false
;
}
page
++;
ZhiWeiTools
.
sleep
(
5000
);
if
(
DataCrawler
.
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
5000
);
}
}
return
list
;
}
...
...
@@ -102,7 +106,9 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
more
=
false
;
}
page
++;
ZhiWeiTools
.
sleep
(
5000
);
if
(
DataCrawler
.
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
5000
);
}
}
return
list
;
}
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
View file @
1b78ab01
...
...
@@ -6,12 +6,15 @@ import java.util.ArrayList;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.zhiweiTools.httpClient.HeaderTool
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
...
...
@@ -38,7 +41,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @throws Exception
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getSougouNewsData
(
String
word
,
Proxy
proxy
,
Long
sleepTime
)
throws
Exception
{
public
static
List
<
NewsData
>
getSougouNewsData
(
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
1
;
boolean
more
=
true
;
...
...
@@ -57,7 +60,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
more
=
false
;
}
page
++;
if
(
sleepTime
==
null
){
if
(
DataCrawler
.
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
5000
);
}
}
...
...
@@ -76,7 +79,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getSougouNewsDataByTitle
(
String
word
,
Proxy
proxy
,
Long
sleepTime
)
throws
Exception
{
public
static
List
<
NewsData
>
getSougouNewsDataByTitle
(
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
0
;
boolean
more
=
true
;
...
...
@@ -95,7 +98,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
more
=
false
;
}
page
++;
if
(
sleepTime
==
null
){
if
(
DataCrawler
.
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
5000
);
}
}
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
View file @
1b78ab01
...
...
@@ -17,6 +17,7 @@ import org.jsoup.select.Elements;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.ZhiHuData
;
import
com.zhiwei.zhiweiTools.httpClient.HeaderTool
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
...
...
@@ -41,7 +42,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @throws Exception
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
ZhiHuData
>
getSougouZhihuData
(
String
word
,
Proxy
proxy
,
Long
sleepTime
)
throws
Exception
{
public
static
List
<
ZhiHuData
>
getSougouZhihuData
(
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
ZhiHuData
>
list
=
new
ArrayList
<
ZhiHuData
>();
int
page
=
1
;
boolean
more
=
true
;
...
...
@@ -60,7 +61,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
}
else
{
more
=
false
;
}
if
(
sleepTime
==
null
){
if
(
DataCrawler
.
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
5000
);
}
page
++;
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/TianYaCrawlerParse.java
View file @
1b78ab01
...
...
@@ -15,6 +15,7 @@ import org.jsoup.select.Elements;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.LunTanData
;
import
com.zhiwei.zhiweiTools.httpClient.HeaderTool
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
...
...
@@ -37,7 +38,7 @@ public class TianYaCrawlerParse extends HttpClientTemplateOK {
* @return List<LunTanData> 返回类型
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
LunTanData
>
getLunTanData
(
String
word
,
Proxy
proxy
,
String
endTime
,
Long
sleepTime
)
throws
Exception
{
public
static
List
<
LunTanData
>
getLunTanData
(
String
word
,
Proxy
proxy
,
String
endTime
)
throws
Exception
{
List
<
LunTanData
>
list
=
new
ArrayList
<
LunTanData
>();
int
page
=
0
;
boolean
more
=
true
;
...
...
@@ -56,7 +57,7 @@ public class TianYaCrawlerParse extends HttpClientTemplateOK {
more
=
false
;
}
page
++;
if
(
sleepTime
==
null
){
if
(
DataCrawler
.
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
3000
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment