Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
soubao_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
soubao_crawler
Commits
95487743
Commit
95487743
authored
Oct 18, 2018
by
[zhangzhiwei]
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
因修改采集核心包版本,修改相应的方法
parent
f09faf1a
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
203 additions
and
212 deletions
+203
-212
pom.xml
+2
-7
src/main/java/com/zhiwei/crawler/run/MainRun.java
+2
-3
src/main/java/com/zhiwei/crawler/soubao/Crawler.java
+197
-197
src/main/java/com/zhiwei/crawler/soubao/SouBaoCrawlerThread.java
+2
-5
No files found.
pom.xml
View file @
95487743
...
...
@@ -25,22 +25,17 @@
<version>
3.8.1
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.middleware
</groupId>
<artifactId>
proxy-client
</artifactId>
<version>
0.0.2-RELEASE
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.0.
5
-SNAPSHOT
</version>
<version>
0.0.
8
-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.middleware
</groupId>
<artifactId>
cleaner-unified-urlfilter
</artifactId>
<version>
1.0.
0
.RELEASE
</version>
<version>
1.0.
6
.RELEASE
</version>
</dependency>
<dependency>
...
...
src/main/java/com/zhiwei/crawler/run/MainRun.java
View file @
95487743
...
...
@@ -4,9 +4,9 @@ import java.util.concurrent.Executors;
import
java.util.concurrent.ScheduledExecutorService
;
import
java.util.concurrent.TimeUnit
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.config.ProxyConfig
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.proxy.common.Definition.GroupType
;
public
class
MainRun
{
private
ScheduledExecutorService
scheduExec
;
...
...
@@ -21,8 +21,7 @@ public class MainRun {
public
static
void
main
(
String
[]
args
)
{
/** 初始化代理IP **/
ProxyFactory
.
init
(
ProxyConfig
.
registry
,
ProxyConfig
.
group
,
GroupType
.
PROVIDER
,
ProxyFactory:
:
getNatProxy
);
ProxyFactory
.
init
(
ProxyConfig
.
registry
,
ProxyConfig
.
group
,
GroupType
.
PROVIDER
);
new
MainRun
().
showTimer
();
}
...
...
src/main/java/com/zhiwei/crawler/soubao/Crawler.java
View file @
95487743
/**
* @Title: Crawler.java
* @Package com.zhiwei.crawler.soubao
* @author 0xff
* @date 2018年6月28日 上午9:49:32
*/
package
com
.
zhiwei
.
crawler
.
soubao
;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.text.SimpleDateFormat
;
import
java.util.Calendar
;
import
java.util.HashMap
;
import
java.util.Map
;
import
java.util.concurrent.TimeUnit
;
import
java.util.regex.Pattern
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
com.zhiwei.crawler.download.HttpClientBuilder
;
import
com.zhiwei.crawler.download.HttpRequestBuilder
;
import
com.zhiwei.crawler.util.TreatData
;
import
okhttp3.FormBody
;
import
okhttp3.Headers
;
import
okhttp3.OkHttpClient
;
import
okhttp3.Request
;
import
okhttp3.Response
;
/**
 * Crawler for the soubao.net newspaper search site.
 *
 * <p>Runs a keyword search over a date window, walks the result pager by
 * re-posting the hidden form fields of the previously fetched page, resolves
 * every hit's intermediate page to the article's real URL and hands the
 * extracted record to {@code TreatData.treatDataAccount}.
 *
 * @author 0xff
 */
public class Crawler {

    private static final Logger logger = LogManager.getLogger(Crawler.class);

    /** Scheme and host every relative result link is appended to. */
    private static final String SITE_ROOT = "http://www.soubao.net";

    /** Search endpoint; also used as the Referer and as the paging POST target. */
    private static final String SEARCH_URL = "http://www.soubao.net/search/searchList.aspx";

    /** Text that precedes the real article URL inside the intermediate page. */
    private static final String REDIRECT_MARKER = "window.location='";

    /** Text that follows the real article URL inside the intermediate page. */
    private static final String REDIRECT_END = "'</script>";

    /**
     * Searches soubao.net for {@code keyword} over the last {@code days} days
     * (today inclusive) and processes every result page.
     *
     * <p>Failures raised while searching are logged, not rethrown, so one bad
     * keyword does not abort the caller's loop. An interrupt is logged as well,
     * but the thread's interrupt flag is restored so the caller can observe it.
     *
     * @param days    size of the search window in days; must not be negative
     * @param keyword search keyword; must not be {@code null}
     * @param proxy   proxy passed to the HTTP client factory; the self-test
     *                entry point of this class passes {@code null}
     * @throws IllegalArgumentException if {@code days} is negative or
     *                                  {@code keyword} is {@code null}
     * @throws Exception                if the keyword cannot be URL-encoded
     */
    public static void start(int days, String keyword, Proxy proxy) throws Exception {
        if (days < 0) {
            throw new IllegalArgumentException("搜索天数不能小于 0");
        }
        if (keyword == null) {
            throw new IllegalArgumentException("关键词不能为空");
        }

        // The formatter is a local, so SimpleDateFormat's lack of thread-safety is not an issue.
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        Calendar c = Calendar.getInstance();
        String endDate = sdf.format(c.getTime());
        c.add(Calendar.DAY_OF_YEAR, -days);
        String startDate = sdf.format(c.getTime());

        String url = new StringBuilder(SEARCH_URL)
                .append("?timesel=custom&checkNum=")
                .append("&startdate=").append(startDate)
                .append("&enddate=").append(endDate)
                .append("&keyword=").append(URLEncoder.encode(keyword, "UTF-8"))
                .toString();

        try {
            logger.info("关键词 {} 搜索链接 {}", keyword, url);
            search(url, keyword, startDate, endDate, proxy);
        } catch (InterruptedException e) {
            // Restore the flag: swallowing it would hide a shutdown request from the worker thread.
            Thread.currentThread().interrupt();
            logger.error("关键词 {} 采集被中断", keyword, e);
        } catch (Exception e) {
            logger.error("关键词 {} 采集出错", keyword, e);
        }
    }

    /**
     * Fetches the first result page, then posts the pager form once per page
     * and parses each returned page.
     *
     * @param url       fully built search URL for the initial GET
     * @param keyword   search keyword, echoed in the paging form and in logs
     * @param startDate window start, formatted {@code yyyy-MM-dd}
     * @param endDate   window end, formatted {@code yyyy-MM-dd}
     * @param proxy     proxy passed to the HTTP client factory
     * @throws IllegalStateException if a required hidden form field or the page
     *                               count cannot be read from the response
     * @throws Exception             on I/O failure or interruption
     */
    private static void search(String url, String keyword, String startDate, String endDate, Proxy proxy)
            throws Exception {
        OkHttpClient client = HttpClientBuilder.newInstanceWithCookieJar(proxy);

        Map<String, String> headerMap = new HashMap<>();
        headerMap.put("Referer", SEARCH_URL);
        headerMap.put("Cookie", DevKit.buildSoubaoCookie());
        headerMap.put("Host", "www.soubao.net");
        headerMap.put("Origin", SITE_ROOT);
        headerMap.put("Content-Type", "application/x-www-form-urlencoded");

        Request request = HttpRequestBuilder.newGetRequest(url, Headers.of(headerMap));
        String body = fetchBody(client, request);
        logger.info("关键词 {} 搜索成功", keyword);
        Document html = Jsoup.parse(body);

        boolean pageCountUnknown = true;
        int page = 1;
        int count = 0;

        // Force paging: the real page count is only known after the first POST.
        for (int i = 1; i <= page; i++) {
            // Each POST must echo the hidden state of the page fetched just before it.
            FormBody formBody = new FormBody.Builder()
                    .add("__VIEWSTATE", hiddenValue(html, "__VIEWSTATE"))
                    .add("__VIEWSTATEGENERATOR", hiddenValue(html, "__VIEWSTATEGENERATOR"))
                    .add("__EVENTTARGET", "AspNetPager1")
                    .add("__EVENTARGUMENT", i + "")
                    .add("__EVENTVALIDATION", hiddenValue(html, "__EVENTVALIDATION"))
                    .add("HidTimeSelect", hiddenValue(html, "HidTimeSelect"))
                    .add("HiddenMsg", hiddenValue(html, "HiddenMsg"))
                    .add("txtKeyword", keyword)
                    .add("checkNum", "")
                    .add("timesel", "on")
                    .add("txtStartDate", startDate)
                    .add("txtEndDate", endDate)
                    .build();

            request = HttpRequestBuilder.newPostRequest(SEARCH_URL, request.headers(), formBody);
            body = fetchBody(client, request);

            // The id of the first result link is used as the "page has data" marker.
            if (!body.contains("rptRetList_ctl01_HLinkBT")) {
                page = 0;
                logger.info("关键词 {} 无数据,退出搜索", keyword);
                break;
            }
            html = Jsoup.parse(body);

            if (pageCountUnknown) {
                // Correct the page count from the pager label of the first result page.
                try {
                    String label = html.getElementById("LbKeyword").select("span").get(2).text();
                    page = Integer.parseInt(label.replaceAll(".*?/|页", ""));
                    logger.info("关键词 {} 搜索结果页数: {}", keyword, page);
                    pageCountUnknown = false;
                } catch (Exception e) {
                    // Keep the cause: without it the log cannot tell a layout change from a parse error.
                    throw new IllegalStateException("关键词 " + keyword + " 获取搜索结果页数失败", e);
                }
            }

            logger.info("关键词 {} 翻页页数: {} 访问成功, 页面长度:{}", keyword, i, body.length());
            // The final log line reports records, so add up what parse() actually stored.
            count += parse(client, request.headers(), html);
            TimeUnit.SECONDS.sleep(2);
        }
        logger.info("关键词 {} 爬取完毕,总页数: {},数据条数: {}", keyword, page, count);
    }

    /**
     * Executes {@code request} and returns the response body as text, closing
     * the response even when reading the body fails.
     */
    private static String fetchBody(OkHttpClient client, Request request) throws Exception {
        try (Response response = client.newCall(request).execute()) {
            return response.body().string();
        }
    }

    /**
     * Returns the {@code value} attribute of the element with the given id.
     *
     * @throws IllegalStateException if the page has no such element
     */
    private static String hiddenValue(Document html, String id) {
        Element field = html.getElementById(id);
        if (field == null) {
            throw new IllegalStateException("页面缺少表单字段 " + id);
        }
        return field.attr("value");
    }

    /**
     * Extracts every hit from one result page and hands it to
     * {@code TreatData.treatDataAccount}.
     *
     * <p>A failure on one hit is logged and does not stop the remaining hits.
     *
     * @param client  HTTP client used to resolve each hit's real URL
     * @param headers headers of the paging request, forwarded to {@link #matchRealUrl}
     * @param html    parsed result page
     * @return number of records handed to {@code TreatData.treatDataAccount}
     */
    private static int parse(OkHttpClient client, Headers headers, Document html) {
        int stored = 0;
        try {
            Elements elements = html.select("ul.newList").select("li");
            logger.info("数据大小:::{}", elements.size());
            for (Element element : elements) {
                if (Thread.currentThread().isInterrupted()) {
                    // matchRealUrl restores the flag on interrupt; stop instead of failing every remaining hit.
                    break;
                }
                try {
                    Elements titleLink = element.select("h2").select("a");
                    String link = SITE_ROOT + titleLink.attr("href");
                    String realUrl = matchRealUrl(client, headers, link);
                    if (realUrl == null) {
                        logger.info("链接为:{},真实地址解析出现错误", link);
                        continue;
                    }

                    Elements info = element.select("p.newsInfo");
                    Map<String, Object> dataMap = new HashMap<>();
                    dataMap.put("title", titleLink.text());
                    dataMap.put("content", element.select("p.newCon").text());
                    dataMap.put("source", info.select("em.paperName").select("span").text());
                    dataMap.put("time", info.select("em.postDate").select("span").text());
                    dataMap.put("_id", realUrl);
                    TreatData.treatDataAccount(dataMap);
                    stored++;
                } catch (Exception e) {
                    // Log the exception itself; fillInStackTrace() would overwrite the original trace.
                    logger.debug("解析数据结构出现问题::", e);
                }
            }
        } catch (Exception e) {
            logger.info("页面正文提取出错", e);
        }
        return stored;
    }

    /**
     * Resolves a soubao.net hit link to the real article URL by fetching the
     * intermediate page and reading the target of its {@code window.location}
     * assignment.
     *
     * <p>Literal {@code /./} segments in the target are collapsed to {@code /}.
     * The URL-shape regex the previous version evaluated is no longer run: its
     * result never influenced the return value, and its nested quantifiers
     * could backtrack heavily on non-matching URLs.
     *
     * @param client  HTTP client used for the request
     * @param headers not used; the request is sent without extra headers
     * @param url     absolute URL of the intermediate page
     * @return the real article URL, or {@code null} if the page carries no
     *         redirect target, the target is empty, the request fails or the
     *         thread is interrupted
     */
    public static String matchRealUrl(OkHttpClient client, Headers headers, String url) {
        try {
            // Throttle so the site is not hit once per hit without a pause.
            TimeUnit.MILLISECONDS.sleep(500);
            String html = fetchBody(client, HttpRequestBuilder.newGetRequest(url, null));

            int from = html.indexOf(REDIRECT_MARKER);
            if (from < 0) {
                return null;
            }
            from += REDIRECT_MARKER.length();
            int to = html.indexOf(REDIRECT_END, from);
            String target = to < 0 ? html.substring(from) : html.substring(from, to);

            // Literal replace: as a regex "/./" also matches any one-character segment such as "/n/".
            String realUrl = target.replace("/./", "/");
            return realUrl.isEmpty() ? null : realUrl;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            logger.warn("链接为:{},真实地址解析被中断", url);
            return null;
        } catch (Exception e) {
            logger.error("链接为:{},真实地址请求出错", url, e);
            return null;
        }
    }

    /** Manual smoke test: crawls one day of results for a fixed keyword without a proxy. */
    public static void main(String[] args) {
        try {
            start(1, "京东", null);
        } catch (Exception e) {
            logger.error("关键词 {} 采集出错", "京东", e);
        }
    }
}
/**
* @Title: Crawler.java
* @Package com.zhiwei.crawler.soubao
* @author 0xff
* @date 2018年6月28日 上午9:49:32
*/
package
com
.
zhiwei
.
crawler
.
soubao
;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.text.SimpleDateFormat
;
import
java.util.Calendar
;
import
java.util.HashMap
;
import
java.util.Map
;
import
java.util.concurrent.TimeUnit
;
import
java.util.regex.Pattern
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
com.zhiwei.crawler.download.HttpClientBuilder
;
import
com.zhiwei.crawler.download.HttpRequestBuilder
;
import
com.zhiwei.crawler.util.TreatData
;
import
okhttp3.FormBody
;
import
okhttp3.Headers
;
import
okhttp3.OkHttpClient
;
import
okhttp3.Request
;
import
okhttp3.Response
;
/**
 * Crawler for the soubao.net newspaper search site.
 *
 * <p>Runs a keyword search over a date window, walks the result pager by
 * re-posting the hidden form fields of the previously fetched page, resolves
 * every hit's intermediate page to the article's real URL and hands the
 * extracted record to {@code TreatData.treatDataAccount}.
 *
 * @author 0xff
 */
public class Crawler {

    private static final Logger logger = LogManager.getLogger(Crawler.class);

    /** Scheme and host every relative result link is appended to. */
    private static final String SITE_ROOT = "http://www.soubao.net";

    /** Search endpoint; also used as the Referer and as the paging POST target. */
    private static final String SEARCH_URL = "http://www.soubao.net/search/searchList.aspx";

    /** Text that precedes the real article URL inside the intermediate page. */
    private static final String REDIRECT_MARKER = "window.location='";

    /** Text that follows the real article URL inside the intermediate page. */
    private static final String REDIRECT_END = "'</script>";

    /**
     * Searches soubao.net for {@code keyword} over the last {@code days} days
     * (today inclusive) and processes every result page.
     *
     * <p>Failures raised while searching are logged, not rethrown, so one bad
     * keyword does not abort the caller's loop. An interrupt is logged as well,
     * but the thread's interrupt flag is restored so the caller can observe it.
     *
     * @param days    size of the search window in days; must not be negative
     * @param keyword search keyword; must not be {@code null}
     * @param proxy   proxy passed to the HTTP client factory; the self-test
     *                entry point of this class passes {@code null}
     * @throws IllegalArgumentException if {@code days} is negative or
     *                                  {@code keyword} is {@code null}
     * @throws Exception                if the keyword cannot be URL-encoded
     */
    public static void start(int days, String keyword, Proxy proxy) throws Exception {
        if (days < 0) {
            throw new IllegalArgumentException("搜索天数不能小于 0");
        }
        if (keyword == null) {
            throw new IllegalArgumentException("关键词不能为空");
        }

        // The formatter is a local, so SimpleDateFormat's lack of thread-safety is not an issue.
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        Calendar c = Calendar.getInstance();
        String endDate = sdf.format(c.getTime());
        c.add(Calendar.DAY_OF_YEAR, -days);
        String startDate = sdf.format(c.getTime());

        String url = new StringBuilder(SEARCH_URL)
                .append("?timesel=custom&checkNum=")
                .append("&startdate=").append(startDate)
                .append("&enddate=").append(endDate)
                .append("&keyword=").append(URLEncoder.encode(keyword, "UTF-8"))
                .toString();

        try {
            logger.info("关键词 {} 搜索链接 {}", keyword, url);
            search(url, keyword, startDate, endDate, proxy);
        } catch (InterruptedException e) {
            // Restore the flag: swallowing it would hide a shutdown request from the worker thread.
            Thread.currentThread().interrupt();
            logger.error("关键词 {} 采集被中断", keyword, e);
        } catch (Exception e) {
            logger.error("关键词 {} 采集出错", keyword, e);
        }
    }

    /**
     * Fetches the first result page, then posts the pager form once per page
     * and parses each returned page.
     *
     * @param url       fully built search URL for the initial GET
     * @param keyword   search keyword, echoed in the paging form and in logs
     * @param startDate window start, formatted {@code yyyy-MM-dd}
     * @param endDate   window end, formatted {@code yyyy-MM-dd}
     * @param proxy     proxy passed to the HTTP client factory
     * @throws IllegalStateException if a required hidden form field or the page
     *                               count cannot be read from the response
     * @throws Exception             on I/O failure or interruption
     */
    private static void search(String url, String keyword, String startDate, String endDate, Proxy proxy)
            throws Exception {
        OkHttpClient client = HttpClientBuilder.newInstanceWithCookieJar(proxy);

        Map<String, String> headerMap = new HashMap<>();
        headerMap.put("Referer", SEARCH_URL);
        headerMap.put("Cookie", DevKit.buildSoubaoCookie());
        headerMap.put("Host", "www.soubao.net");
        headerMap.put("Origin", SITE_ROOT);
        headerMap.put("Content-Type", "application/x-www-form-urlencoded");

        Request request = HttpRequestBuilder.newGetRequest(url, Headers.of(headerMap));
        String body = fetchBody(client, request);
        logger.info("关键词 {} 搜索成功", keyword);
        Document html = Jsoup.parse(body);

        boolean pageCountUnknown = true;
        int page = 1;
        int count = 0;

        // Force paging: the real page count is only known after the first POST.
        for (int i = 1; i <= page; i++) {
            // Each POST must echo the hidden state of the page fetched just before it.
            FormBody formBody = new FormBody.Builder()
                    .add("__VIEWSTATE", hiddenValue(html, "__VIEWSTATE"))
                    .add("__VIEWSTATEGENERATOR", hiddenValue(html, "__VIEWSTATEGENERATOR"))
                    .add("__EVENTTARGET", "AspNetPager1")
                    .add("__EVENTARGUMENT", i + "")
                    .add("__EVENTVALIDATION", hiddenValue(html, "__EVENTVALIDATION"))
                    .add("HidTimeSelect", hiddenValue(html, "HidTimeSelect"))
                    .add("HiddenMsg", hiddenValue(html, "HiddenMsg"))
                    .add("txtKeyword", keyword)
                    .add("checkNum", "")
                    .add("timesel", "on")
                    .add("txtStartDate", startDate)
                    .add("txtEndDate", endDate)
                    .build();

            request = HttpRequestBuilder.newPostRequest(SEARCH_URL, request.headers(), formBody);
            body = fetchBody(client, request);

            // The id of the first result link is used as the "page has data" marker.
            if (!body.contains("rptRetList_ctl01_HLinkBT")) {
                page = 0;
                logger.info("关键词 {} 无数据,退出搜索", keyword);
                break;
            }
            html = Jsoup.parse(body);

            if (pageCountUnknown) {
                // Correct the page count from the pager label of the first result page.
                try {
                    String label = html.getElementById("LbKeyword").select("span").get(2).text();
                    page = Integer.parseInt(label.replaceAll(".*?/|页", ""));
                    logger.info("关键词 {} 搜索结果页数: {}", keyword, page);
                    pageCountUnknown = false;
                } catch (Exception e) {
                    // Keep the cause: without it the log cannot tell a layout change from a parse error.
                    throw new IllegalStateException("关键词 " + keyword + " 获取搜索结果页数失败", e);
                }
            }

            logger.info("关键词 {} 翻页页数: {} 访问成功, 页面长度:{}", keyword, i, body.length());
            // The final log line reports records, so add up what parse() actually stored.
            count += parse(client, request.headers(), html);
            TimeUnit.SECONDS.sleep(2);
        }
        logger.info("关键词 {} 爬取完毕,总页数: {},数据条数: {}", keyword, page, count);
    }

    /**
     * Executes {@code request} and returns the response body as text, closing
     * the response even when reading the body fails.
     */
    private static String fetchBody(OkHttpClient client, Request request) throws Exception {
        try (Response response = client.newCall(request).execute()) {
            return response.body().string();
        }
    }

    /**
     * Returns the {@code value} attribute of the element with the given id.
     *
     * @throws IllegalStateException if the page has no such element
     */
    private static String hiddenValue(Document html, String id) {
        Element field = html.getElementById(id);
        if (field == null) {
            throw new IllegalStateException("页面缺少表单字段 " + id);
        }
        return field.attr("value");
    }

    /**
     * Extracts every hit from one result page and hands it to
     * {@code TreatData.treatDataAccount}.
     *
     * <p>A failure on one hit is logged and does not stop the remaining hits.
     *
     * @param client  HTTP client used to resolve each hit's real URL
     * @param headers headers of the paging request, forwarded to {@link #matchRealUrl}
     * @param html    parsed result page
     * @return number of records handed to {@code TreatData.treatDataAccount}
     */
    private static int parse(OkHttpClient client, Headers headers, Document html) {
        int stored = 0;
        try {
            Elements elements = html.select("ul.newList").select("li");
            logger.info("数据大小:::{}", elements.size());
            for (Element element : elements) {
                if (Thread.currentThread().isInterrupted()) {
                    // matchRealUrl restores the flag on interrupt; stop instead of failing every remaining hit.
                    break;
                }
                try {
                    Elements titleLink = element.select("h2").select("a");
                    String link = SITE_ROOT + titleLink.attr("href");
                    String realUrl = matchRealUrl(client, headers, link);
                    if (realUrl == null) {
                        logger.info("链接为:{},真实地址解析出现错误", link);
                        continue;
                    }

                    Elements info = element.select("p.newsInfo");
                    Map<String, Object> dataMap = new HashMap<>();
                    dataMap.put("title", titleLink.text());
                    dataMap.put("content", element.select("p.newCon").text());
                    dataMap.put("source", info.select("em.paperName").select("span").text());
                    dataMap.put("time", info.select("em.postDate").select("span").text());
                    dataMap.put("_id", realUrl);
                    TreatData.treatDataAccount(dataMap);
                    stored++;
                } catch (Exception e) {
                    // Log the exception itself; fillInStackTrace() would overwrite the original trace.
                    logger.debug("解析数据结构出现问题::", e);
                }
            }
        } catch (Exception e) {
            logger.info("页面正文提取出错", e);
        }
        return stored;
    }

    /**
     * Resolves a soubao.net hit link to the real article URL by fetching the
     * intermediate page and reading the target of its {@code window.location}
     * assignment.
     *
     * <p>Literal {@code /./} segments in the target are collapsed to {@code /}.
     * The URL-shape regex the previous version evaluated is no longer run: its
     * result never influenced the return value, and its nested quantifiers
     * could backtrack heavily on non-matching URLs.
     *
     * @param client  HTTP client used for the request
     * @param headers not used; the request is sent without extra headers
     * @param url     absolute URL of the intermediate page
     * @return the real article URL, or {@code null} if the page carries no
     *         redirect target, the target is empty, the request fails or the
     *         thread is interrupted
     */
    public static String matchRealUrl(OkHttpClient client, Headers headers, String url) {
        try {
            // Throttle so the site is not hit once per hit without a pause.
            TimeUnit.MILLISECONDS.sleep(500);
            String html = fetchBody(client, HttpRequestBuilder.newGetRequest(url, null));

            int from = html.indexOf(REDIRECT_MARKER);
            if (from < 0) {
                return null;
            }
            from += REDIRECT_MARKER.length();
            int to = html.indexOf(REDIRECT_END, from);
            String target = to < 0 ? html.substring(from) : html.substring(from, to);

            // Literal replace: as a regex "/./" also matches any one-character segment such as "/n/".
            String realUrl = target.replace("/./", "/");
            return realUrl.isEmpty() ? null : realUrl;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            logger.warn("链接为:{},真实地址解析被中断", url);
            return null;
        } catch (Exception e) {
            logger.error("链接为:{},真实地址请求出错", url, e);
            return null;
        }
    }

    /** Manual smoke test: crawls one day of results for a fixed keyword without a proxy. */
    public static void main(String[] args) {
        try {
            start(1, "京东", null);
        } catch (Exception e) {
            logger.error("关键词 {} 采集出错", "京东", e);
        }
    }
}
src/main/java/com/zhiwei/crawler/soubao/SouBaoCrawlerThread.java
View file @
95487743
package
com
.
zhiwei
.
crawler
.
soubao
;
import
java.net.Proxy
;
import
java.util.concurrent.BlockingQueue
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.crawler.proxy.Proxy
Factory
;
import
com.zhiwei.crawler.proxy.Proxy
Holder
;
import
com.zhiwei.crawler.run.SoubaoCrawlerRun
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
...
...
@@ -19,7 +18,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public
class
SouBaoCrawlerThread
extends
Thread
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
SoubaoCrawlerRun
.
class
);
private
BlockingQueue
<
String
>
wordsQueue
;
public
SouBaoCrawlerThread
(
BlockingQueue
<
String
>
wordsQueue
)
{
this
.
wordsQueue
=
wordsQueue
;
}
...
...
@@ -30,11 +28,10 @@ public class SouBaoCrawlerThread extends Thread{
while
(
wordsQueue
!=
null
&&
wordsQueue
.
size
()>
0
){
try
{
String
word
=
wordsQueue
.
take
();
Proxy
proxy
=
ProxyFactory
.
proxyCallback
().
getProxy
();
/***开始采集**/
logger
.
info
(
"开始采集:{}搜报网关键词,目前未采集的关键词为:{}"
,
word
,
wordsQueue
.
size
());
long
s
=
System
.
currentTimeMillis
();
Crawler
.
start
(
1
,
word
,
proxy
);
Crawler
.
start
(
1
,
word
,
ProxyHolder
.
NAT_PROXY
.
getProxy
()
);
long
e
=
System
.
currentTimeMillis
();
logger
.
info
(
"采集:::{}搜报网关键词结束,采集所用时间为:{}"
,
word
,
(
e
-
s
));
}
catch
(
Exception
e
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment