Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
ee34a906
Commit
ee34a906
authored
Mar 22, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
更新百度新闻按照标题采集数据解析错误修复
parent
6c18504b
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
82 additions
and
1 deletions
+82
-1
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
+82
-1
No files found.
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
View file @
ee34a906
...
@@ -119,7 +119,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -119,7 +119,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
}
}
String
htmlBody
=
downloadHtml
(
word
,
startTime
,
endTime
,
proxy
,
"newstitle"
,
page
);
String
htmlBody
=
downloadHtml
(
word
,
startTime
,
endTime
,
proxy
,
"newstitle"
,
page
);
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
analysisData
(
htmlBody
,
proxy
,
word
);
Map
<
String
,
Object
>
dataMap
=
analysisData
ByTitle
(
htmlBody
,
proxy
,
word
);
List
<
NewsData
>
dataList
=
(
List
<
NewsData
>)
dataMap
.
get
(
"data"
);
List
<
NewsData
>
dataList
=
(
List
<
NewsData
>)
dataMap
.
get
(
"data"
);
list
.
addAll
(
dataList
);
list
.
addAll
(
dataList
);
more
=
(
Boolean
)
dataMap
.
get
(
"more"
);
more
=
(
Boolean
)
dataMap
.
get
(
"more"
);
...
@@ -158,6 +158,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -158,6 +158,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
// 获取链接地址
// 获取链接地址
String
url
=
getUrl
(
word
,
startTime
,
endTime
,
tn
,
page
);
String
url
=
getUrl
(
word
,
startTime
,
endTime
,
tn
,
page
);
System
.
out
.
println
(
url
);
headerMap
.
put
(
"Host"
,
"news.baidu.com"
);
headerMap
.
put
(
"Host"
,
"news.baidu.com"
);
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"Referer"
,
url
);
// 下载数据页面
// 下载数据页面
...
@@ -245,6 +246,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -245,6 +246,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
link
=
element
.
select
(
"h3.c-title"
).
select
(
"a"
).
attr
(
"href"
);
link
=
element
.
select
(
"h3.c-title"
).
select
(
"a"
).
attr
(
"href"
);
title
=
element
.
select
(
"h3.c-title"
).
select
(
"a"
).
text
();
title
=
element
.
select
(
"h3.c-title"
).
select
(
"a"
).
text
();
soureAndtime
=
element
.
select
(
"div.c-row"
).
select
(
"p.c-author"
).
html
();
soureAndtime
=
element
.
select
(
"div.c-row"
).
select
(
"p.c-author"
).
html
();
System
.
out
.
println
(
"time========"
+
soureAndtime
);
/** 截取时间 */
/** 截取时间 */
if
(
soureAndtime
.
contains
(
" "
))
{
if
(
soureAndtime
.
contains
(
" "
))
{
String
soureAndtimes
[]
=
soureAndtime
.
split
(
" "
);
String
soureAndtimes
[]
=
soureAndtime
.
split
(
" "
);
...
@@ -288,6 +290,85 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -288,6 +290,85 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
return
resultMap
;
return
resultMap
;
}
}
/**
 * Parses one page of Baidu News "search by title" results.
 *
 * <p>Every element matching {@code [class="result title"]} is turned into a
 * {@code NewsData}. When a result links to a "same news" listing, that listing is
 * fetched through {@code getOherBaiduNewsData} and its entries are appended too.
 * A result that fails to parse is logged and skipped; it never aborts the page.
 *
 * @param htmlBody raw HTML of the result page
 * @param proxy    proxy forwarded to {@code getOherBaiduNewsData} for follow-up requests
 * @param word     search keyword, stored on every {@code NewsData} and forwarded to
 *                 {@code getOherBaiduNewsData}
 * @return a map with two entries: {@code "data"} (a {@code List<NewsData>}) and
 *         {@code "more"} (a {@code Boolean}, true when the pager text contains "下一页")
 * @throws Exception declared for compatibility with existing callers; per-result
 *                   failures are caught inside the loop
 */
private static Map<String, Object> analysisDataByTitle(String htmlBody, Proxy proxy, String word) throws Exception {
    Map<String, Object> resultMap = new HashMap<String, Object>();
    List<NewsData> list = new ArrayList<NewsData>();

    /* Parse the page. */
    Document document = Jsoup.parse(htmlBody);

    /*
     * A next page exists only when the pager advertises it. jsoup's select() never
     * returns null (it yields an empty Elements whose text() is ""), so a missing
     * pager naturally evaluates to false here.
     */
    boolean more = document.select("p#page").text().contains("下一页");

    // Compiled once per page instead of once per result.
    Pattern sameNewsPattern = Pattern.compile("\\d*条相同新闻");

    // NOTE(review): this separator is reproduced as it appears in the reviewed text; if the
    // original literal was an HTML entity such as &nbsp;, restore it — TODO confirm.
    final String sourceTimeSeparator = " ";

    // Start parsing the individual results.
    Elements elementes = document.select("[class=\"result title\"]");
    for (Element element : elementes) {
        try {
            // Per-result state lives inside the loop so that a result lacking the
            // source/time separator can never inherit values from the previous result.
            String time = null;
            String source = null;

            String link = element.select("h3.c-title").select("a").attr("href");
            String title = element.select("h3.c-title").select("a").text();
            String soureAndtime = element.select("div.c-title-author").html();

            /* Split "<source><separator><time>". */
            if (soureAndtime.contains(sourceTimeSeparator)) {
                // If the separator is trailing, split() drops the empty tail and the
                // index below throws; the catch block then skips this result.
                String[] soureAndtimes = soureAndtime.split(sourceTimeSeparator);
                time = soureAndtimes[1];
                source = soureAndtimes[0];
            }

            /* Normalise the publish time. */
            // NOTE(review): time is null when no separator was found; how TimeParse
            // handles null is not visible here — confirm.
            time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");

            // Build the summary: the row text minus the leading author text.
            Elements row = element.select("div.c-row");
            String descript = row.text();
            String soureAndtimeText = row.select("p.c-author").text();
            String content = descript.substring(soureAndtimeText.length(), descript.length());
            content = sameNewsPattern.matcher(content).replaceAll("")
                    .replace("-", "")
                    .replace("百度快照", "");

            // Add to the result collection.
            list.add(new NewsData(link, title, source, time, content, pt, word));

            /* Follow the "same news" link, if present. */
            String otherUrl = element.select("div.c-title-author").select("a").attr("href");
            if (otherUrl != null && !otherUrl.isEmpty()) {
                String otherLink = "http://news.baidu.com" + otherUrl;
                List<NewsData> otherDataList = getOherBaiduNewsData(otherLink, word, proxy);
                list.addAll(otherDataList);
                ZhiWeiTools.sleep(100);
            }
        } catch (Exception e) {
            // Pass the exception itself: fillInStackTrace() would replace the original
            // stack trace with this catch site and hide where parsing actually failed.
            logger.error("百度新闻数据解析时出现问题,问题为:{}", e);
        }
    }

    resultMap.put("data", list);
    resultMap.put("more", more);
    return resultMap;
}
/**
/**
* @Title: getOherBaiduNewsData
* @Title: getOherBaiduNewsData
* @author hero
* @author hero
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment