Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
45483734
Commit
45483734
authored
Jul 04, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改百度资讯接口 和 360搜索关键词采集
parent
8c543a2e
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
23 additions
and
14 deletions
+23
-14
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduInforCrawlerParse.java
+11
-3
src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
+12
-11
No files found.
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduInforCrawlerParse.java
View file @
45483734
...
...
@@ -49,7 +49,7 @@ public class BaiduInforCrawlerParse {
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getBaiduInforData
(
String
word
,
String
endTime
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<>();
GroupSync
groupSync
=
new
GroupSync
();
GroupSync
groupSync
=
new
GroupSync
();
for
(
int
i
=
0
;
i
<
10
;
i
++)
{
groupSync
.
add
();
String
url
=
getUrl
(
word
,
i
,
endTime
);
...
...
@@ -155,6 +155,8 @@ public class BaiduInforCrawlerParse {
more
=
false
;
}
}
// 开始解析
Elements
elementes
=
document
.
select
(
"div.result"
);
String
time
=
null
;
...
...
@@ -179,6 +181,7 @@ public class BaiduInforCrawlerParse {
source
=
soureAndtimes
[
0
];
}
else
{
time
=
element
.
select
(
"div.c-row"
).
select
(
"p.c-author"
).
text
().
trim
();
source
=
element
.
select
(
"a.c-showurl > span"
).
text
();
}
/** 文章发布时间处理 **/
time
=
time
.
replaceAll
(
" "
,
""
);
...
...
@@ -193,6 +196,9 @@ public class BaiduInforCrawlerParse {
matcher
=
pattern
.
matcher
(
content
);
content
=
matcher
.
replaceAll
(
""
).
replace
(
"-"
,
""
).
replace
(
"百度快照"
,
""
);
}
if
(
Objects
.
nonNull
(
source
))
{
source
=
source
.
replaceAll
(
"<.*?>"
,
""
).
trim
();
}
// 添加到数据集合中
NewsData
newsData
=
new
NewsData
(
link
,
title
,
source
,
time
,
content
,
PT
,
word
);
list
.
add
(
newsData
);
...
...
@@ -287,8 +293,10 @@ public class BaiduInforCrawlerParse {
String
url
=
null
;
if
(
word
!=
null
)
{
if
(
Objects
.
nonNull
(
time
))
{
// https://www.baidu.com/s?ie=utf-8&cl=2&medium=0&rtt=1&bsst=1&rsv_dl=news_t_sk&tn=news&word=http%3A%2F%2Fbaijiahao.baidu.com%2Fs%3Fid%3D1600799795509096909%26wfr%3Dspider%26for%3Dpc&rsv_sug3=2&rsv_sug4=221&rsv_sug1=1&rsv_n=2&rsv_sug2=0&inputT=601
// https://www.baidu.com/s?rn=50&ie=utf-8&cl=2&medium=0&rtt=4&bsst=1&rsv_dl=news_t_sk&tn=news&wd=%E6%B5%99%E6%B1%9F%E4%B8%B4%E6%B5%B7&tfflag=0&gpc=stf%3D1559318400%2C1561910400%7Cstftype%3D2
time
=
String
.
valueOf
(
TimeParse
.
stringFormartDate
(
time
).
getTime
()/
1000
);
url
=
"http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd="
+
URLCodeUtil
.
getURLEncode
(
word
,
"UTF-8"
)
+
"&medium=0&rn=50&gpc=stf%3D
154627200
0%2C"
+
time
+
"%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn="
+
page
*
50
;
url
=
"http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd="
+
URLCodeUtil
.
getURLEncode
(
word
,
"UTF-8"
)
+
"&medium=0&rn=50&gpc=stf%3D0%2C"
+
time
+
"%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn="
+
page
*
50
;
}
else
{
url
=
"http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd="
+
URLCodeUtil
.
getURLEncode
(
word
,
"UTF-8"
)
+
"&medium=0&rn=50&pn="
+
page
*
50
;
}
...
...
@@ -299,7 +307,7 @@ public class BaiduInforCrawlerParse {
//https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=%E6%B5%99%E6%B1%9F&medium=1&rn=50&gpc=stf%3D1546272000%2C1548950400%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=0
// public static void main(String[] args) throws Exception {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<NewsData> ndList = getBaiduInforData("
腾讯
");
// List<NewsData> ndList = getBaiduInforData("
马云","2019-07-04 23:59:59
");
// System.out.println(ndList.size());
// }
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
View file @
45483734
...
...
@@ -30,7 +30,7 @@ public class SoNewsCrawlerParse {
private
static
Logger
logger
=
LogManager
.
getLogger
(
SoNewsCrawlerParse
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
final
String
pt
=
"360新闻"
;
/**
* @Title: getSoNewsData
* @author hero
...
...
@@ -43,14 +43,10 @@ public class SoNewsCrawlerParse {
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getSoNewsData
(
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
List
<
NewsData
>
list
=
new
ArrayList
<>();
int
page
=
1
;
boolean
more
=
true
;
while
(
more
)
{
// 最大页数为50
if
(
page
>
50
)
{
more
=
false
;
}
String
htmlBody
=
downloadHtml
(
word
,
"news"
,
proxy
,
page
);
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
analysisData
(
htmlBody
,
proxy
,
word
);
...
...
@@ -65,6 +61,10 @@ public class SoNewsCrawlerParse {
if
(
DataCrawler
.
sleepTime
!=
null
){
ZhiWeiTools
.
sleep
(
DataCrawler
.
sleepTime
);
}
// 最大页数为50
if
(
page
>
50
)
{
more
=
false
;
}
}
return
list
;
}
...
...
@@ -93,7 +93,7 @@ public class SoNewsCrawlerParse {
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getSoNewsDataByTitle
(
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
List
<
NewsData
>
list
=
new
ArrayList
<>();
int
page
=
1
;
boolean
more
=
true
;
while
(
more
)
{
...
...
@@ -174,8 +174,8 @@ public class SoNewsCrawlerParse {
* @return Map<String,Object> 返回类型
*/
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
throws
Exception
{
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<>();
List
<
NewsData
>
list
=
new
ArrayList
<>();
boolean
more
=
true
;
/** 解析页面 */
...
...
@@ -200,8 +200,9 @@ public class SoNewsCrawlerParse {
if
(!
element
.
attr
(
"class"
).
equals
(
"res-list hasimg hasmediav"
)){
link
=
element
.
select
(
"h3"
).
select
(
"a"
).
attr
(
"href"
);
title
=
element
.
select
(
"h3"
).
select
(
"a"
).
text
();
time
=
element
.
select
(
"p.newsinfo"
).
select
(
"span.posttime"
).
attr
(
"title"
);
source
=
element
.
select
(
"p.newsinfo"
).
select
(
"span.sitename"
).
text
();
// #news > li > div.info.b-info > span:nth-child(3)
time
=
element
.
select
(
"div.info.b-info"
).
select
(
"span:nth-child(3)"
).
text
();
source
=
element
.
select
(
"div.info.b-info"
).
select
(
"span.sitename"
).
text
();
/** 文章发布时间处理 **/
time
=
TimeParse
.
dateFormartString
(
TimeParse
.
stringFormartDate
(
time
),
"yyyy-MM-dd HH:mm:ss"
);
// 处理文章简介
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment