Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
f6fa753d
Commit
f6fa753d
authored
Feb 27, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修复搜狗新闻由于乱码引起的解析问题
parent
630e8f87
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
14 additions
and
11 deletions
+14
-11
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
+14
-11
No files found.
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
View file @
f6fa753d
...
@@ -2,6 +2,7 @@ package com.zhiwei.media_data_crawler.crawler;
...
@@ -2,6 +2,7 @@ package com.zhiwei.media_data_crawler.crawler;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.net.Proxy
;
import
java.nio.charset.Charset
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
...
@@ -47,7 +48,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -47,7 +48,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
more
=
false
;
more
=
false
;
}
}
String
htmlBody
=
downloadHtml
(
word
,
1
,
proxy
,
page
);
String
htmlBody
=
downloadHtml
(
word
,
1
,
proxy
,
page
);
if
(
htmlBody
!=
null
){
if
(
htmlBody
!=
null
&&
!
htmlBody
.
equals
(
""
)
){
Map
<
String
,
Object
>
dataMap
=
analysisData
(
htmlBody
,
proxy
,
word
);
Map
<
String
,
Object
>
dataMap
=
analysisData
(
htmlBody
,
proxy
,
word
);
List
<
NewsData
>
dataList
=
(
List
<
NewsData
>)
dataMap
.
get
(
"data"
);
List
<
NewsData
>
dataList
=
(
List
<
NewsData
>)
dataMap
.
get
(
"data"
);
list
.
addAll
(
dataList
);
list
.
addAll
(
dataList
);
...
@@ -184,30 +185,32 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -184,30 +185,32 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
/**截取时间*/
/**截取时间*/
if
(
soureAndtime
.
contains
(
" "
))
{
if
(
soureAndtime
.
contains
(
" "
))
{
String
soureAndtimes
[]
=
soureAndtime
.
split
(
" "
);
String
soureAndtimes
[]
=
soureAndtime
.
split
(
" "
);
time
=
soureAndtimes
[
1
];
time
=
soureAndtimes
[
1
]
.
contains
(
"<!--resultinfodat"
)?
soureAndtimes
[
1
].
split
(
"<!--resultinfodat"
)[
0
]:
soureAndtimes
[
1
]
;
source
=
soureAndtimes
[
0
];
source
=
soureAndtimes
[
0
];
}
else
{
time
=
element
.
select
(
"div.news-detail"
).
select
(
"div.news-info"
).
select
(
"p.news-from"
).
text
();
}
}
/**文章发布时间处理**/
if
(
time
!=
null
&&
!
time
.
equals
(
""
)){
time
=
TimeParse
.
dateFormartString
(
TimeParse
.
stringFormartDate
(
time
),
"yyyy-MM-dd HH:mm:ss"
)
;
/**文章发布时间处理**/
time
=
TimeParse
.
dateFormartString
(
TimeParse
.
stringFormartDate
(
time
),
"yyyy-MM-dd HH:mm:ss"
)
;
}
// 处理文章简介
// 处理文章简介
content
=
element
.
select
(
"div.news-detail"
).
select
(
"div.news-info"
).
select
(
"p.news-txt"
).
select
(
"span#summary_1"
).
text
();
content
=
element
.
select
(
"div.news-detail"
).
select
(
"div.news-info"
).
select
(
"p.news-txt"
).
select
(
"span#summary_1"
).
text
();
//添加到数据集合中
//添加到数据集合中
if
(
title
!=
null
){
if
(
title
!=
null
&&
!
title
.
equals
(
""
)
&&
source
!=
null
&&
time
!=
null
){
NewsData
newsData
=
new
NewsData
(
link
,
title
,
source
,
time
,
content
,
pt
,
word
);
NewsData
newsData
=
new
NewsData
(
link
,
title
,
source
,
time
,
content
,
pt
,
word
);
logger
.
info
(
"搜狗新闻数据:{}"
,
newsData
);
list
.
add
(
newsData
);
list
.
add
(
newsData
);
}
}
/**采集相同新闻链接**/
/**采集相同新闻链接**/
if
(
element
.
select
(
"div.news-detail"
).
select
(
"div.news-info"
).
select
(
"p.news-txt"
).
select
(
"a#news_similar"
)!=
null
)
String
otherUrl
=
element
.
select
(
"div.news-detail"
).
select
(
"div.news-info"
).
select
(
"p.news-txt"
).
select
(
"a#news_similar"
).
attr
(
"href"
);
if
(
otherUrl
!=
null
&&
!
otherUrl
.
equals
(
""
))
{
{
String
otherLink
=
"http://news.sogou.com/news"
+
element
.
select
(
"div.news-detail"
).
select
(
"div.news-info"
).
select
(
"p.news-txt"
).
select
(
"a#news_similar"
).
attr
(
"href"
)
;
String
otherLink
=
"http://news.sogou.com/news"
+
otherUrl
;
List
<
NewsData
>
otherDataList
=
getOherSougouNewsData
(
otherLink
,
word
,
proxy
);
List
<
NewsData
>
otherDataList
=
getOherSougouNewsData
(
otherLink
,
word
,
proxy
);
list
.
addAll
(
otherDataList
);
list
.
addAll
(
otherDataList
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
printStackTrace
();
logger
.
error
(
"搜狗新闻数据解析时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
//
logger.error("搜狗新闻数据解析时出现问题,问题为:{}", e.fillInStackTrace());
continue
;
continue
;
}
}
}
}
...
@@ -267,7 +270,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -267,7 +270,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
String
url
=
null
;
String
url
=
null
;
if
(
word
!=
null
){
if
(
word
!=
null
){
url
=
"http://news.sogou.com/news?mode="
+
mode
+
"&media=&query="
url
=
"http://news.sogou.com/news?mode="
+
mode
+
"&media=&query="
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
)
+
"&time=0&clusterId=&sort=1&
page=2&dp=1&page="
+
page
;
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
)
+
"&time=0&clusterId=&sort=1&
dp=1"
;
}
}
return
url
;
return
url
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment