Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
38bcf00d
Commit
38bcf00d
authored
Mar 06, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加自助翻页功能,如使用请添加休眠时间
parent
0930c2aa
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
236 additions
and
158 deletions
+236
-158
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
+149
-104
src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
+19
-5
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
+17
-6
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
+14
-6
src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
+37
-37
No files found.
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
View file @
38bcf00d
...
...
@@ -27,35 +27,40 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaiduNewsCrawlerParse
.
class
);
private
static
final
String
pt
=
"百度新闻"
;
/**
* @Title: getBaiduNewsData
* @author hero
* @Description: 采集百度新闻数据
* @param @param word
* @param @param startTime
* @param @param endTime
* @param @param proxy
* @param @return 设定文件
* @param @param
* word
* @param @param
* startTime
* @param @param
* endTime
* @param @param
* proxy
* @param @return
* 设定文件
* @return List<NewsData> 返回类型
* @throws Exception
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getBaiduNewsData
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
){
public
static
List
<
NewsData
>
getBaiduNewsData
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
0
;
boolean
more
=
true
;
while
(
more
)
{
//
最大页数为20
if
(
page
>
20
)
{
while
(
more
)
{
//
最大页数为20
if
(
page
>
20
)
{
more
=
false
;
}
String
htmlBody
=
downloadHtml
(
word
,
startTime
,
endTime
,
proxy
,
"newsdy"
,
page
);
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
analysisData
(
htmlBody
,
proxy
,
word
);
List
<
NewsData
>
dataList
=
(
List
<
NewsData
>)
dataMap
.
get
(
"data"
);
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
analysisData
(
htmlBody
,
proxy
,
word
);
List
<
NewsData
>
dataList
=
(
List
<
NewsData
>)
dataMap
.
get
(
"data"
);
list
.
addAll
(
dataList
);
more
=
(
Boolean
)
dataMap
.
get
(
"more"
);
}
else
{
more
=
(
Boolean
)
dataMap
.
get
(
"more"
);
}
else
{
more
=
false
;
}
page
++;
...
...
@@ -65,33 +70,61 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
}
/**
* @Title: getBaiduNewsData
ByTitle
* @Title: getBaiduNewsData
* @author hero
* @Description:
采集百度新闻数据,根据标题匹配
* @Description:
根据关键词获取数据
* @param @param word
* @param @param startTime
* @param @param endTime
* @param @param proxy
* @param @return 设定文件
* @param @param page
* @param @return
* @param @throws Exception 设定文件
* @return Map<String,Object> 返回类型
*/
/**
 * Fetches and parses one page of Baidu News results for a keyword.
 *
 * <p>The page is downloaded with {@code tn = "newsdy"} (full-text matching, as opposed to
 * {@code "newstitle"} for title matching) and the HTML is handed to {@code analysisData}.
 * This overload lets the caller drive paging; callers looping over pages are expected to
 * add their own delay between requests.
 *
 * @param word      keyword to search for
 * @param startTime start of the time window, passed through to the URL builder
 * @param endTime   end of the time window, passed through to the URL builder
 * @param proxy     proxy to use for the request; forwarded as-is
 * @param page      page index; the URL builder turns it into the {@code pn} offset
 * @return the map built by {@code analysisData} (the list-returning overload reads its
 *         {@code "data"} and {@code "more"} entries), or {@code null} when the download
 *         produced no body
 * @throws Exception propagated from the download or the parsing step
 */
public static Map<String, Object> getBaiduNewsData(String word, String startTime, String endTime, Proxy proxy,
		int page) throws Exception {
	String pageHtml = downloadHtml(word, startTime, endTime, proxy, "newsdy", page);
	// Nothing downloaded: signal "no result" with null, exactly as before.
	if (pageHtml == null) {
		return null;
	}
	return analysisData(pageHtml, proxy, word);
}
/**
* @Title: getBaiduNewsDataByTitle
* @author hero
* @Description: 采集百度新闻数据,根据标题匹配
* @param @param
* word
* @param @param
* startTime
* @param @param
* endTime
* @param @param
* proxy
* @param @return
* 设定文件
* @return List<NewsData> 返回类型
* @throws Exception
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getBaiduNewsDataByTitle
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
){
public
static
List
<
NewsData
>
getBaiduNewsDataByTitle
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
0
;
boolean
more
=
true
;
while
(
more
)
{
//
最大页数为20
if
(
page
>
20
)
{
while
(
more
)
{
//
最大页数为20
if
(
page
>
20
)
{
more
=
false
;
}
String
htmlBody
=
downloadHtml
(
word
,
startTime
,
endTime
,
proxy
,
"newstitle"
,
page
);
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
analysisData
(
htmlBody
,
proxy
,
word
);
List
<
NewsData
>
dataList
=
(
List
<
NewsData
>)
dataMap
.
get
(
"data"
);
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
analysisData
(
htmlBody
,
proxy
,
word
);
List
<
NewsData
>
dataList
=
(
List
<
NewsData
>)
dataMap
.
get
(
"data"
);
list
.
addAll
(
dataList
);
more
=
(
Boolean
)
dataMap
.
get
(
"more"
);
}
else
{
more
=
(
Boolean
)
dataMap
.
get
(
"more"
);
}
else
{
more
=
false
;
}
page
++;
...
...
@@ -100,30 +133,36 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
return
list
;
}
/**
* @Title: downloadHtml
* @author hero
* @Description: 获取数据流
* @param @param word
* @param @param startTime
* @param @param endTime
* @param @param proxy
* @param @param tn (tn=newsdy 为 全文匹配, tn=newstitle 为标题匹配)
* @param @param page
* @param @return 设定文件
* @param @param
* word
* @param @param
* startTime
* @param @param
* endTime
* @param @param
* proxy
* @param @param
* tn (tn=newsdy 为 全文匹配, tn=newstitle 为标题匹配)
* @param @param
* page
* @param @return
* 设定文件
* @return String 返回类型
*/
private
static
String
downloadHtml
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
,
String
tn
,
int
page
)
{
//获取通用请求头
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
//获取链接地址
private
static
String
downloadHtml
(
String
word
,
String
startTime
,
String
endTime
,
Proxy
proxy
,
String
tn
,
int
page
)
throws
Exception
{
// 获取通用请求头
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
// 获取链接地址
String
url
=
getUrl
(
word
,
startTime
,
endTime
,
tn
,
page
);
headerMap
.
put
(
"Host"
,
"news.baidu.com"
);
headerMap
.
put
(
"Referer"
,
url
);
//下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
//
下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
return
get
(
url
,
proxy
,
headerMap
);
}
catch
(
IOException
e
)
{
...
...
@@ -134,15 +173,15 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
return
null
;
}
private
static
String
downloadHtml
(
String
url
,
Proxy
proxy
,
int
page
)
{
//获取通用请求头
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
//获取链接地址
url
=
url
+
"&pn="
+
page
*
30
;
private
static
String
downloadHtml
(
String
url
,
Proxy
proxy
,
int
page
)
throws
Exception
{
//
获取通用请求头
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
//
获取链接地址
url
=
url
+
"&pn="
+
page
*
30
;
headerMap
.
put
(
"Host"
,
"news.baidu.com"
);
headerMap
.
put
(
"Referer"
,
url
);
//下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
//
下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
return
get
(
url
,
proxy
,
headerMap
);
}
catch
(
IOException
e
)
{
...
...
@@ -153,37 +192,36 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
return
null
;
}
/**
* @Title: analysisData
* @author hero
* @Description: 解析百度新闻数据
* @param @param htmlBody
* @param @param proxy
* @param @param word
* @param @return 设定文件
* @param @param
* htmlBody
* @param @param
* proxy
* @param @param
* word
* @param @return
* 设定文件
* @return Map<String,Object> 返回类型
*/
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
{
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
throws
Exception
{
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
boolean
more
=
true
;
/** 解析页面 */
Document
document
=
Jsoup
.
parse
(
htmlBody
);
/**判断是否有下一页**/
if
(
document
.
select
(
"p#page"
)
==
null
)
{
/** 判断是否有下一页 **/
if
(
document
.
select
(
"p#page"
)
==
null
)
{
more
=
false
;
}
else
{
if
(!
document
.
select
(
"p#page"
).
text
().
contains
(
"下一页"
))
{
}
else
{
if
(!
document
.
select
(
"p#page"
).
text
().
contains
(
"下一页"
))
{
more
=
false
;
}
}
//开始解析
//
开始解析
Elements
elementes
=
document
.
select
(
"div.result"
);
String
time
=
null
;
String
source
=
null
;
...
...
@@ -195,13 +233,12 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
String
content
=
null
;
Pattern
pattern
=
null
;
Matcher
matcher
=
null
;
for
(
Element
element
:
elementes
)
{
for
(
Element
element
:
elementes
)
{
try
{
link
=
element
.
select
(
"h3.c-title"
).
select
(
"a"
).
attr
(
"href"
);
title
=
element
.
select
(
"h3.c-title"
).
select
(
"a"
).
text
();
soureAndtime
=
element
.
select
(
"div.c-row"
).
select
(
"p.c-author"
).
html
();
/**
截取时间
*/
/**
截取时间
*/
if
(
soureAndtime
.
contains
(
" "
))
{
String
soureAndtimes
[]
=
soureAndtime
.
split
(
" "
);
time
=
soureAndtimes
[
1
];
...
...
@@ -209,10 +246,10 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
}
else
{
time
=
element
.
select
(
"div.c-row"
).
select
(
"p.c-author"
).
text
();
}
/**
文章发布时间处理
**/
time
=
TimeParse
.
dateFormartString
(
TimeParse
.
stringFormartDate
(
time
),
"yyyy-MM-dd HH:mm:ss"
)
;
/**
文章发布时间处理
**/
time
=
TimeParse
.
dateFormartString
(
TimeParse
.
stringFormartDate
(
time
),
"yyyy-MM-dd HH:mm:ss"
);
// 处理文章简介
if
(
element
.
select
(
"div.c-row"
)!=
null
)
{
if
(
element
.
select
(
"div.c-row"
)
!=
null
)
{
descript
=
element
.
select
(
"div.c-row"
).
text
();
soureAndtimeText
=
element
.
select
(
"div.c-row"
).
select
(
"p.c-author"
).
text
();
content
=
descript
.
substring
(
soureAndtimeText
.
length
(),
descript
.
length
());
...
...
@@ -220,14 +257,14 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
matcher
=
pattern
.
matcher
(
content
);
content
=
matcher
.
replaceAll
(
""
).
replace
(
"-"
,
""
).
replace
(
"百度快照"
,
""
);
}
//添加到数据集合中
//
添加到数据集合中
NewsData
newsData
=
new
NewsData
(
link
,
title
,
source
,
time
,
content
,
pt
,
word
);
list
.
add
(
newsData
);
/**
采集相同新闻链接
**/
/**
采集相同新闻链接
**/
String
otherUrl
=
element
.
select
(
"div.c-row"
).
select
(
"a.c-more_link"
).
attr
(
"href"
);
if
(
otherUrl
!=
null
&&
!
otherUrl
.
equals
(
""
))
{
String
otherLink
=
"http://news.baidu.com"
+
element
.
select
(
"div.c-row"
).
select
(
"a.c-more_link"
).
attr
(
"href"
);
if
(
otherUrl
!=
null
&&
!
otherUrl
.
equals
(
""
))
{
String
otherLink
=
"http://news.baidu.com"
+
element
.
select
(
"div.c-row"
).
select
(
"a.c-more_link"
).
attr
(
"href"
);
List
<
NewsData
>
otherDataList
=
getOherBaiduNewsData
(
otherLink
,
word
,
proxy
);
list
.
addAll
(
otherDataList
);
ZhiWeiTools
.
sleep
(
100
);
...
...
@@ -244,34 +281,37 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
return
resultMap
;
}
/**
* @Title: getOherBaiduNewsData
* @author hero
* @Description: 解析相似新闻
* @param @param url
* @param @param word
* @param @param proxy
* @param @return 设定文件
* @param @param
* url
* @param @param
* word
* @param @param
* proxy
* @param @return
* 设定文件
* @return List<NewsData> 返回类型
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getOherBaiduNewsData
(
String
url
,
String
word
,
Proxy
proxy
){
public
static
List
<
NewsData
>
getOherBaiduNewsData
(
String
url
,
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
0
;
boolean
more
=
true
;
while
(
more
)
{
//
最大页数为20
if
(
page
>
20
)
{
while
(
more
)
{
//
最大页数为20
if
(
page
>
20
)
{
more
=
false
;
}
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
page
);
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
analysisData
(
htmlBody
,
null
,
word
);
List
<
NewsData
>
dataList
=
(
List
<
NewsData
>)
dataMap
.
get
(
"data"
);
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
analysisData
(
htmlBody
,
null
,
word
);
List
<
NewsData
>
dataList
=
(
List
<
NewsData
>)
dataMap
.
get
(
"data"
);
list
.
addAll
(
dataList
);
more
=
(
Boolean
)
dataMap
.
get
(
"more"
);
}
else
{
more
=
(
Boolean
)
dataMap
.
get
(
"more"
);
}
else
{
more
=
false
;
}
page
++;
...
...
@@ -279,31 +319,36 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
return
list
;
}
/**
* @Title: getUrl
* @author hero
* @Description: 获取链接
* @param @param word
* @param @param startTime
* @param @param endTime
* @param @param page
* @param @return 设定文件
* @param @param
* word
* @param @param
* startTime
* @param @param
* endTime
* @param @param
* page
* @param @return
* 设定文件
* @return String 返回类型
*/
private
static
String
getUrl
(
String
word
,
String
startTime
,
String
endTime
,
String
tn
,
int
page
){
private
static
String
getUrl
(
String
word
,
String
startTime
,
String
endTime
,
String
tn
,
int
page
)
{
long
bt
=
0
;
long
et
=
0
;
String
url
=
null
;
if
(
startTime
!=
null
)
{
bt
=
TimeParse
.
stringFormartDate
(
startTime
).
getTime
()
/
1000
;
if
(
startTime
!=
null
)
{
bt
=
TimeParse
.
stringFormartDate
(
startTime
).
getTime
()
/
1000
;
}
if
(
endTime
!=
null
)
{
et
=
TimeParse
.
stringFormartDate
(
endTime
).
getTime
()
/
1000
;
if
(
endTime
!=
null
)
{
et
=
TimeParse
.
stringFormartDate
(
endTime
).
getTime
()
/
1000
;
}
if
(
word
!=
null
){
url
=
"http://news.baidu.com/ns?from=news&cl=2&bt="
+
bt
+
"&et="
+
et
+
"&q1="
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
)
+
"&q3=&q4=&tn="
+
tn
+
"&ct=0&rn=50&clk=sortbytime&q6=&pn="
+
page
*
50
;
if
(
word
!=
null
)
{
url
=
"http://news.baidu.com/ns?from=news&cl=2&bt="
+
bt
+
"&et="
+
et
+
"&q1="
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
)
+
"&q3=&q4=&tn="
+
tn
+
"&ct=0&rn=50&clk=sortbytime&q6=&pn="
+
page
*
50
;
}
return
url
;
}
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
View file @
38bcf00d
...
...
@@ -32,9 +32,10 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy
* @param @return 设定文件
* @return List<NewsData> 返回类型
* @throws Exception
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getSoNewsData
(
String
word
,
Proxy
proxy
)
{
public
static
List
<
NewsData
>
getSoNewsData
(
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
1
;
boolean
more
=
true
;
...
...
@@ -59,6 +60,19 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
return
list
;
}
/**
 * Fetches and parses one page of 360 (so.com) news results for a keyword.
 *
 * <p>The page is downloaded via {@code downloadHtml(word, "news", proxy, page)} and the
 * HTML is handed to {@code analysisData}. This overload lets the caller drive paging;
 * callers looping over pages are expected to add their own delay between requests.
 *
 * @param word  keyword to search for
 * @param proxy proxy to use for the request; forwarded as-is
 * @param page  page number to fetch (the list-returning overload starts counting at 1)
 * @return the map built by {@code analysisData}, or {@code null} when the download
 *         produced no body
 * @throws Exception propagated from the download or the parsing step
 */
public static Map<String, Object> getSoNewsData(String word, Proxy proxy, int page) throws Exception {
	String pageHtml = downloadHtml(word, "news", proxy, page);
	// Nothing downloaded: signal "no result" with null, exactly as before.
	if (pageHtml == null) {
		return null;
	}
	return analysisData(pageHtml, proxy, word);
}
/**
* @Title: getSoNewsDataByTitle
* @author hero
...
...
@@ -69,7 +83,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @return List<NewsData> 返回类型
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getSoNewsDataByTitle
(
String
word
,
Proxy
proxy
)
{
public
static
List
<
NewsData
>
getSoNewsDataByTitle
(
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
1
;
boolean
more
=
true
;
...
...
@@ -104,7 +118,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return String 返回类型
*/
private
static
String
downloadHtml
(
String
word
,
String
tn
,
Proxy
proxy
,
int
page
)
{
private
static
String
downloadHtml
(
String
word
,
String
tn
,
Proxy
proxy
,
int
page
)
throws
Exception
{
// 获取通用请求头
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
// 获取链接地址
...
...
@@ -138,7 +152,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* 设定文件
* @return Map<String,Object> 返回类型
*/
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
{
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
throws
Exception
{
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
boolean
more
=
true
;
...
...
@@ -199,7 +213,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return Map<String,Object> 返回类型
*/
private
static
Map
<
String
,
Object
>
analysisDataByTitle
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
{
private
static
Map
<
String
,
Object
>
analysisDataByTitle
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
throws
Exception
{
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
boolean
more
=
true
;
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
View file @
38bcf00d
...
...
@@ -35,9 +35,10 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy
* @param @return 设定文件
* @return List<NewsData> 返回类型
* @throws Exception
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getSougouNewsData
(
String
word
,
Proxy
proxy
){
public
static
List
<
NewsData
>
getSougouNewsData
(
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
1
;
boolean
more
=
true
;
...
...
@@ -62,8 +63,18 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
}
/**
 * Fetches and parses one page of Sogou news results for a keyword.
 *
 * <p>The page is downloaded via {@code downloadHtml(word, 1, proxy, page)} and the HTML is
 * handed to {@code analysisData}. This overload lets the caller drive paging; callers
 * looping over pages are expected to add their own delay between requests.
 *
 * @param word  keyword to search for
 * @param proxy proxy to use for the request; forwarded as-is
 * @param page  page number to fetch (the list-returning overload starts counting at 1)
 * @return the map built by {@code analysisData}, or {@code null} when the downloaded body
 *         is {@code null} or empty
 * @throws Exception propagated from the download or the parsing step
 */
public static Map<String, Object> getSougouNewsData(String word, Proxy proxy, int page) throws Exception {
	// NOTE(review): mode 1 presumably selects the keyword (non-title) search - confirm against downloadHtml.
	String pageHtml = downloadHtml(word, 1, proxy, page);
	// A missing or empty body means there is nothing to parse.
	if (pageHtml == null || pageHtml.equals("")) {
		return null;
	}
	return analysisData(pageHtml, proxy, word);
}
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getSougouNewsDataByTitle
(
String
word
,
Proxy
proxy
){
public
static
List
<
NewsData
>
getSougouNewsDataByTitle
(
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
0
;
boolean
more
=
true
;
...
...
@@ -99,7 +110,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return String 返回类型
*/
private
static
String
downloadHtml
(
String
word
,
int
mode
,
Proxy
proxy
,
int
page
)
{
private
static
String
downloadHtml
(
String
word
,
int
mode
,
Proxy
proxy
,
int
page
)
throws
Exception
{
//获取通用请求头
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
//获取链接地址
...
...
@@ -118,7 +129,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
return
null
;
}
private
static
String
downloadHtml
(
String
url
,
Proxy
proxy
,
int
page
)
{
private
static
String
downloadHtml
(
String
url
,
Proxy
proxy
,
int
page
)
throws
Exception
{
//获取通用请求头
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
//获取链接地址
...
...
@@ -149,7 +160,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return Map<String,Object> 返回类型
*/
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
{
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
throws
Exception
{
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
boolean
more
=
true
;
...
...
@@ -241,7 +252,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @return List<NewsData> 返回类型
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getOherSougouNewsData
(
String
url
,
String
word
,
Proxy
proxy
){
public
static
List
<
NewsData
>
getOherSougouNewsData
(
String
url
,
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
1
;
boolean
more
=
true
;
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
View file @
38bcf00d
...
...
@@ -38,9 +38,10 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy
* @param @return 设定文件
* @return List<ZhiHuData> 返回类型
* @throws Exception
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
ZhiHuData
>
getSougouZhihuData
(
String
word
,
Proxy
proxy
){
public
static
List
<
ZhiHuData
>
getSougouZhihuData
(
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
ZhiHuData
>
list
=
new
ArrayList
<
ZhiHuData
>();
int
page
=
1
;
boolean
more
=
true
;
...
...
@@ -66,6 +67,13 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
}
/**
 * Fetches and parses one page of Sogou Zhihu search results for a keyword.
 *
 * <p>The page is downloaded via {@code downloadHtml(word, proxy, page)} and the HTML is
 * handed to {@code analysisData}. This overload lets the caller drive paging; callers
 * looping over pages are expected to add their own delay between requests.
 *
 * @param word  keyword to search for
 * @param proxy proxy to use for the request; forwarded as-is
 * @param page  page number to fetch (the list-returning overload starts counting at 1)
 * @return the map built by {@code analysisData}, or {@code null} when the downloaded body
 *         is {@code null} or empty
 * @throws Exception propagated from the download or the parsing step
 */
public static Map<String, Object> getSougouZhihuData(String word, Proxy proxy, int page) throws Exception {
	String pageHtml = downloadHtml(word, proxy, page);
	// A missing or empty body means there is nothing to parse.
	if (pageHtml == null || pageHtml.equals("")) {
		return null;
	}
	return analysisData(pageHtml, proxy, word);
}
/**
*
...
...
@@ -79,7 +87,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return String 返回类型
*/
private
static
String
downloadHtml
(
String
word
,
Proxy
proxy
,
int
page
)
{
private
static
String
downloadHtml
(
String
word
,
Proxy
proxy
,
int
page
)
throws
Exception
{
//获取通用请求头
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
//获取链接地址
...
...
@@ -97,7 +105,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
return
null
;
}
private
static
String
downloadHtml
(
String
url
,
Proxy
proxy
,
String
type
)
{
private
static
String
downloadHtml
(
String
url
,
Proxy
proxy
,
String
type
)
throws
Exception
{
//获取通用请求头
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
//获取链接地址
...
...
@@ -131,7 +139,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return Map<String,Object> 返回类型
*/
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
{
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
throws
Exception
{
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
List
<
ZhiHuData
>
list
=
new
ArrayList
<
ZhiHuData
>();
boolean
more
=
true
;
...
...
@@ -203,7 +211,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return ZhiHuData 返回类型
*/
private
static
ZhiHuData
analysisZhihuAnswer
(
String
url
,
Proxy
proxy
,
ZhiHuData
zhihu
){
private
static
ZhiHuData
analysisZhihuAnswer
(
String
url
,
Proxy
proxy
,
ZhiHuData
zhihu
)
throws
Exception
{
try
{
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
"问答"
);
if
(
htmlBody
!=
null
){
...
...
@@ -239,7 +247,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @return ZhiHuData 返回类型
*/
@SuppressWarnings
(
"deprecation"
)
private
static
ZhiHuData
analysisZhihuArticle
(
String
url
,
Proxy
proxy
,
ZhiHuData
zhihu
){
private
static
ZhiHuData
analysisZhihuArticle
(
String
url
,
Proxy
proxy
,
ZhiHuData
zhihu
)
throws
Exception
{
try
{
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
"文章"
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
...
...
src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
View file @
38bcf00d
package
com
.
zhiwei
.
media_data_crawler
.
test
;
import
java.net.Proxy
;
import
java.util.List
;
import
org.junit.Test
;
import
com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.media_data_crawler.entity.ZhiHuData
;
public
class
DataCrawlerTest
{
@Test
public
void
getSoNewsTest
(){
String
word
=
"马云"
;
//关键词
String
startTime
=
"2017-03-01 00:00:00"
;
//开始时间
String
endTime
=
"2017-03-01 23:59:59"
;
//结束时间
Proxy
proxy
=
null
;
//代理IP,不用可不填写
//百度新闻采集demo
// List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
// //搜狗新闻关键词采集demo
// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
// //360新闻采集demo
// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
//搜狗知乎采集
List
<
ZhiHuData
>
zhihuList
=
DataCrawler
.
getSougouZhihuData
(
word
,
proxy
);
System
.
out
.
println
(
zhihuList
.
size
());
}
}
//
package com.zhiwei.media_data_crawler.test;
//
//
import java.net.Proxy;
//
import java.util.List;
//
//
import org.junit.Test;
//
//
import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
//
import com.zhiwei.media_data_crawler.data.DataCrawler;
//
import com.zhiwei.media_data_crawler.entity.NewsData;
//
import com.zhiwei.media_data_crawler.entity.ZhiHuData;
//
//
public class DataCrawlerTest {
//
//
//
//
//
//
@Test
//
public void getSoNewsTest(){
//
String word = "马云"; //关键词
//
String startTime = "2017-03-01 00:00:00"; //开始时间
//
String endTime = "2017-03-01 23:59:59"; //结束时间
//
Proxy proxy = null; //代理IP,不用可不填写
//
//百度新闻采集demo
//
//
List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
//
//
//搜狗新闻关键词采集demo
//
//
List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
//
//
//360新闻采集demo
//
//
List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
////
//搜狗知乎采集
////
List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
////
System.out.println(zhihuList.size());
//
//
}
//
//
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment