Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
f4b4b293
Commit
f4b4b293
authored
Feb 20, 2019
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
升级采集核心包版本并默认代理使用晋豪提供的NAT代理
parent
f0484148
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
121 additions
and
42 deletions
+121
-42
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
+21
-5
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
+16
-6
src/main/java/com/zhiwei/media_data_crawler/crawler/DoubanCrawlerParse.java
+17
-8
src/main/java/com/zhiwei/media_data_crawler/crawler/SoCrawlerParse.java
+9
-3
src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
+11
-5
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
+20
-7
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
+18
-5
src/main/java/com/zhiwei/media_data_crawler/crawler/TianYaCrawlerParse.java
+9
-3
No files found.
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
View file @
f4b4b293
...
...
@@ -2,6 +2,7 @@ package com.zhiwei.media_data_crawler.crawler;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
...
...
@@ -68,8 +69,8 @@ public class BaiduNewsCrawlerParse {
more
=
false
;
}
page
++;
if
(
DataCrawler
.
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
3000
);
if
(
DataCrawler
.
sleepTime
!=
null
){
ZhiWeiTools
.
sleep
(
DataCrawler
.
sleepTime
);
}
}
return
list
;
...
...
@@ -201,7 +202,12 @@ public class BaiduNewsCrawlerParse {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
));
Response
response
=
null
;
if
(
proxy
!=
null
)
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
);
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_PROXY
);
}
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
...
...
@@ -247,7 +253,12 @@ public class BaiduNewsCrawlerParse {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
Response
response
=
null
;
if
(
proxy
!=
null
)
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
);
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_PROXY
);
}
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
...
...
@@ -271,7 +282,12 @@ public class BaiduNewsCrawlerParse {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
Response
response
=
null
;
if
(
proxy
!=
null
)
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
);
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_PROXY
);
}
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
View file @
f4b4b293
...
...
@@ -2,6 +2,7 @@ package com.zhiwei.media_data_crawler.crawler;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.TiebaData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
...
...
@@ -57,8 +58,8 @@ public class BaiduTiebaCrawlerParse {
more
=
false
;
}
page
++;
if
(
DataCrawler
.
sleepTime
=
=
null
){
ZhiWeiTools
.
sleep
(
3000
);
if
(
DataCrawler
.
sleepTime
!
=
null
){
ZhiWeiTools
.
sleep
(
DataCrawler
.
sleepTime
);
}
}
return
list
;
...
...
@@ -217,8 +218,13 @@ public class BaiduTiebaCrawlerParse {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
return
response
.
body
().
toString
();
Response
response
=
null
;
if
(
proxy
!=
null
)
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
);
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_PROXY
);
}
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
...
...
@@ -255,8 +261,12 @@ public class BaiduTiebaCrawlerParse {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
return
response
.
body
().
string
();
Response
response
=
null
;
if
(
proxy
!=
null
)
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
);
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_PROXY
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/DoubanCrawlerParse.java
View file @
f4b4b293
...
...
@@ -3,6 +3,7 @@ package com.zhiwei.media_data_crawler.crawler;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.DouBanData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
...
...
@@ -94,8 +95,12 @@ public class DoubanCrawlerParse {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
return
response
.
body
().
string
();
Response
response
=
null
;
if
(
proxy
!=
null
)
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
);
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_PROXY
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
...
...
@@ -116,8 +121,12 @@ public class DoubanCrawlerParse {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
return
response
.
body
().
toString
();
Response
response
=
null
;
if
(
proxy
!=
null
)
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
);
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_PROXY
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
...
...
@@ -193,8 +202,8 @@ public class DoubanCrawlerParse {
String
content
=
document
.
select
(
"div.topic-doc"
).
select
(
"div#link-report"
).
select
(
"div.topic-content"
).
text
();
douban
.
setContent
(
content
);
}
if
(
DataCrawler
.
sleepTime
=
=
null
){
ZhiWeiTools
.
sleep
(
5000
);
if
(
DataCrawler
.
sleepTime
!
=
null
){
ZhiWeiTools
.
sleep
(
DataCrawler
.
sleepTime
);
}
return
douban
;
}
catch
(
Exception
e
)
{
...
...
@@ -273,8 +282,8 @@ public class DoubanCrawlerParse {
douban
.
setContent
(
content
);
}
}
if
(
DataCrawler
.
sleepTime
=
=
null
){
ZhiWeiTools
.
sleep
(
5000
);
if
(
DataCrawler
.
sleepTime
!
=
null
){
ZhiWeiTools
.
sleep
(
DataCrawler
.
sleepTime
);
}
return
douban
;
}
catch
(
Exception
e
)
{
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SoCrawlerParse.java
View file @
f4b4b293
...
...
@@ -3,6 +3,7 @@ package com.zhiwei.media_data_crawler.crawler;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
...
...
@@ -62,8 +63,8 @@ public class SoCrawlerParse {
more
=
false
;
}
page
++;
if
(
DataCrawler
.
sleepTime
=
=
null
){
ZhiWeiTools
.
sleep
(
5000
);
if
(
DataCrawler
.
sleepTime
!
=
null
){
ZhiWeiTools
.
sleep
(
DataCrawler
.
sleepTime
);
}
}
return
list
;
...
...
@@ -104,7 +105,12 @@ public class SoCrawlerParse {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
Response
response
=
null
;
if
(
proxy
!=
null
)
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
);
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_PROXY
);
}
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取360新闻数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
View file @
f4b4b293
...
...
@@ -2,6 +2,7 @@ package com.zhiwei.media_data_crawler.crawler;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
...
...
@@ -59,8 +60,8 @@ public class SoNewsCrawlerParse {
more
=
false
;
}
page
++;
if
(
DataCrawler
.
sleepTime
=
=
null
){
ZhiWeiTools
.
sleep
(
5000
);
if
(
DataCrawler
.
sleepTime
!
=
null
){
ZhiWeiTools
.
sleep
(
DataCrawler
.
sleepTime
);
}
}
return
list
;
...
...
@@ -108,8 +109,8 @@ public class SoNewsCrawlerParse {
more
=
false
;
}
page
++;
if
(
DataCrawler
.
sleepTime
=
=
null
){
ZhiWeiTools
.
sleep
(
5000
);
if
(
DataCrawler
.
sleepTime
!
=
null
){
ZhiWeiTools
.
sleep
(
DataCrawler
.
sleepTime
);
}
}
return
list
;
...
...
@@ -136,7 +137,12 @@ public class SoNewsCrawlerParse {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
Response
response
=
null
;
if
(
proxy
!=
null
)
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
);
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_PROXY
);
}
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取360新闻数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
View file @
f4b4b293
...
...
@@ -2,6 +2,7 @@ package com.zhiwei.media_data_crawler.crawler;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
...
...
@@ -9,6 +10,8 @@ import com.zhiwei.tools.timeparse.TimeParse;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
...
...
@@ -63,8 +66,8 @@ public class SougouNewsCrawlerParse {
}
page
++;
logger
.
info
(
"采集到 {} 页 采集的数据量为 {}"
,
page
,
list
.
size
());
if
(
DataCrawler
.
sleepTime
=
=
null
){
ZhiWeiTools
.
sleep
(
5000
);
if
(
DataCrawler
.
sleepTime
!
=
null
){
ZhiWeiTools
.
sleep
(
DataCrawler
.
sleepTime
);
}
}
return
list
;
...
...
@@ -74,7 +77,7 @@ public class SougouNewsCrawlerParse {
public
static
Map
<
String
,
Object
>
getSougouNewsData
(
String
word
,
Proxy
proxy
,
int
page
)
throws
Exception
{
String
htmlBody
=
downloadHtml
(
word
,
1
,
proxy
,
page
);
if
(
htmlBody
!=
null
&&
!
htmlBody
.
equals
(
""
)){
if
(
StringUtils
.
isBlank
(
htmlBody
)){
return
analysisData
(
htmlBody
,
proxy
,
word
,
"normal"
);
}
return
null
;
...
...
@@ -101,8 +104,8 @@ public class SougouNewsCrawlerParse {
more
=
false
;
}
page
++;
if
(
DataCrawler
.
sleepTime
=
=
null
){
ZhiWeiTools
.
sleep
(
5000
);
if
(
DataCrawler
.
sleepTime
!
=
null
){
ZhiWeiTools
.
sleep
(
DataCrawler
.
sleepTime
);
}
}
return
list
;
...
...
@@ -130,7 +133,12 @@ public class SougouNewsCrawlerParse {
//下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++){
try
{
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
Response
response
=
null
;
if
(
proxy
!=
null
)
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
);
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_PROXY
);
}
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取搜狗新闻数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
...
...
@@ -154,7 +162,12 @@ public class SougouNewsCrawlerParse {
//下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++){
try
{
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
Response
response
=
null
;
if
(
proxy
!=
null
)
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
);
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_PROXY
);
}
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取搜狗新闻数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
View file @
f4b4b293
...
...
@@ -2,6 +2,7 @@ package com.zhiwei.media_data_crawler.crawler;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.ZhiHuData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
...
...
@@ -9,6 +10,8 @@ import com.zhiwei.tools.timeparse.TimeParse;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
...
...
@@ -59,8 +62,8 @@ public class SougouZhihuCrawlerParse{
}
else
{
more
=
false
;
}
if
(
DataCrawler
.
sleepTime
=
=
null
){
ZhiWeiTools
.
sleep
(
5000
);
if
(
DataCrawler
.
sleepTime
!
=
null
){
ZhiWeiTools
.
sleep
(
DataCrawler
.
sleepTime
);
}
page
++;
}
...
...
@@ -70,7 +73,7 @@ public class SougouZhihuCrawlerParse{
public
static
Map
<
String
,
Object
>
getSougouZhihuData
(
String
word
,
Proxy
proxy
,
int
page
)
throws
Exception
{
String
htmlBody
=
downloadHtml
(
word
,
proxy
,
page
);
if
(
htmlBody
!=
null
&&
!
htmlBody
.
equals
(
""
)){
if
(
StringUtils
.
isBlank
(
htmlBody
)){
return
analysisData
(
htmlBody
,
proxy
,
word
);
}
return
null
;
...
...
@@ -97,7 +100,12 @@ public class SougouZhihuCrawlerParse{
//下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++){
try
{
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
Response
response
=
null
;
if
(
proxy
!=
null
)
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
);
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_PROXY
);
}
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取搜狗新闻数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
...
...
@@ -124,7 +132,12 @@ public class SougouZhihuCrawlerParse{
//下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++){
try
{
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
Response
response
=
null
;
if
(
proxy
!=
null
)
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
);
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_PROXY
);
}
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取搜狗新闻数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/TianYaCrawlerParse.java
View file @
f4b4b293
...
...
@@ -2,6 +2,7 @@ package com.zhiwei.media_data_crawler.crawler;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.LunTanData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
...
...
@@ -55,8 +56,8 @@ public class TianYaCrawlerParse {
more
=
false
;
}
page
++;
if
(
DataCrawler
.
sleepTime
=
=
null
){
ZhiWeiTools
.
sleep
(
3000
);
if
(
DataCrawler
.
sleepTime
!
=
null
){
ZhiWeiTools
.
sleep
(
DataCrawler
.
sleepTime
);
}
}
...
...
@@ -87,7 +88,12 @@ public class TianYaCrawlerParse {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
Response
response
=
null
;
if
(
proxy
!=
null
)
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
);
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_PROXY
);
}
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment