Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
8c543a2e
Commit
8c543a2e
authored
Jul 01, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
新增百度资讯采集
parent
7f0418e6
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
336 additions
and
3 deletions
+336
-3
pom.xml
+2
-2
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduInforCrawlerParse.java
+306
-0
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
+1
-0
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
+27
-1
No files found.
pom.xml
View file @
8c543a2e
...
...
@@ -2,7 +2,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
media_data_crawler
</artifactId>
<version>
0.1.
0
-SNAPSHOT
</version>
<version>
0.1.
1
-SNAPSHOT
</version>
<name>
media_data_crawler
</name>
<description>
网媒数据抓取,包含百度新闻、搜狗新闻、360新闻、知乎回答列表等
</description>
...
...
@@ -16,7 +16,7 @@
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.
3.6
-RELEASE
</version>
<version>
0.
5.2
-RELEASE
</version>
<scope>
provided
</scope>
</dependency>
</dependencies>
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduInforCrawlerParse.java
0 → 100644
View file @
8c543a2e
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.async.TaskBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
okhttp3.Response
;
/**
 * Downloads Baidu news-tab ("百度资讯") search result pages and parses them into
 * {@link NewsData} records.
 *
 * <p>All members are static. Result pages are fetched through {@code TaskBoot.blockingAsync}
 * and joined with a {@code GroupSync}; "same news" links found on a result are followed and
 * parsed with the same parser.
 */
public class BaiduInforCrawlerParse {

    // Fixed: the logger used to be created for BaiduNewsCrawlerParse, which attributed
    // every log line of this class to the wrong category.
    private static final Logger logger = LogManager.getLogger(BaiduInforCrawlerParse.class);

    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).useCookieJar(true).throwException(false).build();

    /** Platform label stored on every record produced by this class. */
    private static final String PT = "百度资讯";

    /** Number of result pages requested per keyword (50 results each, see getUrl). */
    private static final int RESULT_PAGES = 10;

    /** Attempts made for a single page download before giving up. */
    private static final int DOWNLOAD_ATTEMPTS = 3;

    /** Highest page index followed when crawling a "same news" listing. */
    private static final int MAX_RELATED_PAGE = 30;

    /** Strips the "N条相同新闻" marker from a result summary; compiled once instead of per result. */
    private static final Pattern SAME_NEWS_PATTERN = Pattern.compile("\\d*条相同新闻");

    /**
     * Collects Baidu news-tab results for a keyword.
     *
     * <p>Each of the {@value #RESULT_PAGES} result pages is downloaded and parsed in its own
     * task. A page that fails is logged and skipped; it does not fail the whole call.
     *
     * @param word    search keyword; when {@code null} no URL can be built and an empty list is returned
     * @param endTime upper bound of the time filter as a date string understood by
     *                {@code TimeParse.stringFormartDate}, or {@code null} for no time filter
     * @return every record parsed from the pages that could be downloaded (possibly empty)
     * @throws Exception declared for backward compatibility with existing callers
     */
    @SuppressWarnings("unchecked")
    public static List<NewsData> getBaiduInforData(String word, String endTime) throws Exception {
        List<NewsData> list = new ArrayList<>();
        if (word == null) {
            // getUrl returns null without a keyword, so every download would only fail.
            return list;
        }
        GroupSync groupSync = new GroupSync();
        for (int i = 0; i < RESULT_PAGES; i++) {
            // Build the URL first: if it throws, no unmatched add() is left on the group.
            String url = getUrl(word, i, endTime);
            groupSync.add();
            TaskBoot.blockingAsync(() -> {
                try {
                    String htmlBody = downloadHtml(url);
                    if (htmlBody != null) {
                        Map<String, Object> dataMap = analysisData(htmlBody, word);
                        List<NewsData> dataList = (List<NewsData>) dataMap.get("data");
                        logger.debug("百度资讯页面采集完成,url:{},条数:{}", url, dataList.size());
                        // ArrayList is not thread-safe and the tasks may run concurrently.
                        synchronized (list) {
                            list.addAll(dataList);
                        }
                    }
                } catch (Exception e) {
                    // Was an empty catch: a failing page vanished without any trace.
                    logger.error("百度资讯页面采集失败,url:{}", url, e);
                } finally {
                    groupSync.done();
                }
            });
        }
        groupSync.await();
        return list;
    }

    /**
     * Downloads one search result page.
     *
     * <p>A body containing {@code location.href.replace} is treated as a redirect stub and
     * retried instead of being returned.
     *
     * @param url full result page URL
     * @return the page body, or {@code null} when no usable body was obtained
     */
    private static String downloadHtml(String url) {
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        headerMap.put("Host", "www.baidu.com");
        headerMap.put("referer", url);
        return fetchBody(url, headerMap, true);
    }

    /**
     * Downloads one page of a "same news" listing.
     *
     * @param url  listing URL without the paging parameter
     * @param page zero-based page index; translated to {@code pn = page * 10}
     * @return the page body, or {@code null} when every attempt failed
     * @throws Exception declared for backward compatibility with existing callers
     */
    private static String downloadHtml(String url, int page) throws Exception {
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        String pagedUrl = url + "&pn=" + page * 10;
        // NOTE(review): the only caller in this class builds these links on www.baidu.com,
        // while the Host header says news.baidu.com. Kept as is; confirm it is intended.
        headerMap.put("Host", "news.baidu.com");
        headerMap.put("Referer", pagedUrl);
        return fetchBody(pagedUrl, headerMap, false);
    }

    /**
     * Shared retry loop of both downloadHtml variants.
     *
     * @param url                  URL to GET
     * @param headerMap            request headers
     * @param rejectScriptRedirect when {@code true}, a body containing
     *                             {@code location.href.replace} counts as a failed attempt
     * @return the first accepted body, or {@code null} after {@value #DOWNLOAD_ATTEMPTS} failed attempts
     */
    private static String fetchBody(String url, Map<String, String> headerMap, boolean rejectScriptRedirect) {
        for (int attempt = 1; attempt <= DOWNLOAD_ATTEMPTS; attempt++) {
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY)) {
                String result = response.body().string();
                if (!rejectScriptRedirect || !result.contains("location.href.replace")) {
                    return result;
                }
            } catch (Exception e) {
                // Pass the exception as the trailing argument so the logger keeps the original
                // stack trace. The old code logged e.fillInStackTrace(), which overwrites it.
                logger.error("获取数据时出现问题,问题为:{}", e.getMessage(), e);
            }
        }
        return null;
    }

    /**
     * Parses one result page.
     *
     * @param htmlBody raw HTML of the page
     * @param word     keyword the page was fetched for; stored on every record
     * @return a map with {@code "data"} (a {@code List<NewsData>}) and {@code "more"}
     *         (a {@code Boolean}: whether the pager text contains "下一页")
     */
    private static Map<String, Object> analysisData(String htmlBody, String word) {
        Map<String, Object> resultMap = new HashMap<>();
        List<NewsData> list = new ArrayList<>();
        Document document = Jsoup.parse(htmlBody);
        // select() never returns null (it yields an empty Elements), so the former
        // "== null" branch was dead; the pager text alone decides.
        boolean more = document.select("p#page").text().contains("下一页");

        Elements elementes = document.select("div.result");
        for (Element element : elementes) {
            // Declared outside the try only so the catch block can report it.
            String soureAndtime = null;
            try {
                Elements titleAnchor = element.select("h3.c-title").select("a");
                String link = titleAnchor.attr("href");
                String title = titleAnchor.text();

                Elements rows = element.select("div.c-row");
                Elements author = rows.select("p.c-author");
                soureAndtime = author.html();

                // Fixed: these were shared across iterations, so a result without a
                // source separator silently inherited the previous result's source.
                String source = null;
                String time;
                // NOTE(review): the two literals below are reproduced exactly as they appear
                // in this view of the file. The input comes from Elements.html(), so confirm
                // against the real file whether they are HTML space entities.
                if (soureAndtime.contains(" ")) {
                    String[] soureAndtimes = soureAndtime.split(" ");
                    time = soureAndtimes[1];
                    source = soureAndtimes[0];
                } else {
                    time = author.text().trim();
                }
                time = time.replaceAll(" ", "");
                time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");
                // NOTE(review): this second pass looks redundant, but it is kept because the
                // behavior of TimeParse on an unparseable value is not visible here.
                time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time.trim()), "yyyy-MM-dd HH:mm:ss");

                // The summary is the row text with the leading "source time" prefix cut off.
                String descript = rows.text();
                String soureAndtimeText = author.text();
                String content = descript.substring(soureAndtimeText.length(), descript.length());
                content = SAME_NEWS_PATTERN.matcher(content).replaceAll("").replace("-", "").replace("百度快照", "");

                list.add(new NewsData(link, title, source, time, content, PT, word));

                // Follow the "same news" listing, if the result has one.
                String otherUrl = rows.select("a.c-more_link").attr("href");
                if (otherUrl != null && !otherUrl.equals("")) {
                    String otherLink = "http://www.baidu.com" + otherUrl;
                    list.addAll(getOherBaiduNewsData(otherLink, word));
                }
            } catch (Exception e) {
                logger.error("soureAndtime======{}", soureAndtime);
                logger.error("百度新闻数据解析时出现问题,问题为:{}", e.getMessage(), e);
            }
        }
        resultMap.put("data", list);
        resultMap.put("more", more);
        return resultMap;
    }

    /**
     * Crawls a "same news" listing page by page.
     *
     * <p>Stops when a page cannot be downloaded, when the parsed page reports no next page,
     * or after page index {@value #MAX_RELATED_PAGE}. Previously the page limit set
     * {@code more = false} but the flag was immediately overwritten by the parsed page, so
     * the limit never took effect.
     *
     * @param url  listing URL without the paging parameter
     * @param word keyword stored on every record
     * @return all records parsed from the listing (possibly empty)
     * @throws Exception declared for backward compatibility with existing callers
     */
    @SuppressWarnings("unchecked")
    public static List<NewsData> getOherBaiduNewsData(String url, String word) throws Exception {
        List<NewsData> list = new ArrayList<>();
        boolean more = true;
        for (int page = 0; more && page <= MAX_RELATED_PAGE; page++) {
            String htmlBody = downloadHtml(url, page);
            if (htmlBody == null) {
                break;
            }
            Map<String, Object> dataMap = analysisData(htmlBody, word);
            list.addAll((List<NewsData>) dataMap.get("data"));
            more = (Boolean) dataMap.get("more");
        }
        return list;
    }

    /**
     * Builds the search URL for one result page (50 results per page).
     *
     * <p>With a time bound the URL carries a {@code gpc=stf} range filter whose start is
     * fixed at 1546272000 (2019-01-01 00:00:00 UTC+8) and whose end is {@code time} in epoch
     * seconds, e.g.
     * {@code ...&gpc=stf%3D1546272000%2C1548950400%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=0}.
     *
     * @param word keyword to search for; URL-encoded as UTF-8
     * @param page zero-based page index
     * @param time end of the time range as a date string, or {@code null} for no time filter
     * @return the URL, or {@code null} when {@code word} is {@code null}
     */
    private static String getUrl(String word, int page, String time) {
        if (word == null) {
            return null;
        }
        String base = "http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=" + URLCodeUtil.getURLEncode(word, "UTF-8");
        int offset = page * 50;
        if (Objects.nonNull(time)) {
            String endSeconds = String.valueOf(TimeParse.stringFormartDate(time).getTime() / 1000);
            return base + "&medium=0&rn=50&gpc=stf%3D1546272000%2C" + endSeconds
                    + "%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=" + offset;
        }
        return base + "&medium=0&rn=50&pn=" + offset;
    }
}
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
View file @
8c543a2e
...
...
@@ -564,6 +564,7 @@ public class BaiduNewsCrawlerParse {
et
=
TimeParse
.
stringFormartDate
(
endTime
).
getTime
()
/
1000
;
}
if
(
word
!=
null
)
{
// url = "https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=" + URLCodeUtil.getURLEncode(word, "UTF-8") + "&medium=0&pn=" + page*10;
url
=
"http://news.baidu.com/ns?from=news&cl=2&bt="
+
bt
+
"&et="
+
et
+
"&q1="
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
)
+
"&q3=&q4=&tn="
+
tn
+
"&ct=0&rn=50&clk=sortbytime&q6=&pn="
+
page
*
50
;
...
...
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
View file @
8c543a2e
...
...
@@ -13,7 +13,33 @@ import com.zhiwei.media_data_crawler.entity.*;
public
class
DataCrawler
{
public
static
Long
sleepTime
;
/**
 * Collects Baidu news-tab ("百度资讯") data for a keyword without propagating failures.
 *
 * <p>Delegates to {@code BaiduInforCrawlerParse.getBaiduInforData(word, endTime)}. Any
 * exception thrown by the delegate is printed via {@code printStackTrace()} and replaced
 * by an empty list.
 *
 * @param word    search keyword, passed through unchanged
 * @param endTime end-time argument, passed through unchanged
 * @return the delegate's result, or an empty list when the delegate throws
 */
public static List<NewsData> getBaiduInforData(String word, String endTime) {
    List<NewsData> collected;
    try {
        collected = BaiduInforCrawlerParse.getBaiduInforData(word, endTime);
    } catch (Exception e) {
        e.printStackTrace();
        collected = Collections.emptyList();
    }
    return collected;
}
/**
*
* @Title: getBaiduNewsData
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment