Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
eaa3a775
Commit
eaa3a775
authored
Mar 06, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加搜狗知乎采集程序
parent
fc1372d5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
480 additions
and
9 deletions
+480
-9
README.md
+2
-0
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
+277
-0
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
+19
-1
src/main/java/com/zhiwei/media_data_crawler/entity/ZhiHuData.java
+155
-0
src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
+27
-8
No files found.
README.md
View file @
eaa3a775
...
@@ -26,6 +26,8 @@
...
@@ -26,6 +26,8 @@
List
<NewsData>
sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
List
<NewsData>
sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
//360新闻采集demo
//360新闻采集demo
List
<NewsData>
soNewsList = DataCrawler.getSoNewsData(word, proxy);
List
<NewsData>
soNewsList = DataCrawler.getSoNewsData(word, proxy);
//搜狗知乎采集
List
<ZhiHuData>
zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
0 → 100644
View file @
eaa3a775
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.media_data_crawler.entity.ZhiHuData
;
import
com.zhiwei.zhiweiTools.httpClient.HeaderTool
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
import
com.zhiwei.zhiweiTools.tools.URLCodeUtil
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
/**
 * Crawls Zhihu search results through Sogou's Zhihu search
 * (zhihu.sogou.com) and enriches every hit with details fetched from the
 * linked Zhihu page (question or column article).
 */
public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {

    private static final Logger logger = LoggerFactory.getLogger(SougouZhihuCrawlerParse.class);

    /** Platform label stored on every record produced by this crawler. */
    private static final String pt = "搜狗知乎";

    /** Highest result page that will be requested for one keyword. */
    private static final int MAX_PAGE = 50;

    /** Number of download attempts before a page is given up. */
    private static final int MAX_RETRY = 3;

    /** Matches every character that is not an ASCII digit. */
    private static final Pattern NON_DIGIT = Pattern.compile("[^0-9]");

    /**
     * Collects Zhihu records for a keyword from Sogou's Zhihu search.
     * Pages are requested one after another until a page cannot be
     * downloaded, the result page no longer offers a "next page" link, or
     * {@link #MAX_PAGE} pages have been read.
     *
     * @param word  search keyword; {@code null} yields an empty list
     * @param proxy proxy handed to the HTTP layer as-is
     * @return the records collected so far, never {@code null}
     */
    public static List<ZhiHuData> getSougouZhihuData(String word, Proxy proxy) {
        List<ZhiHuData> list = new ArrayList<ZhiHuData>();
        if (word == null) {
            // Without a keyword no search URL can be built.
            return list;
        }
        for (int page = 1; page <= MAX_PAGE; page++) {
            String htmlBody = downloadHtml(word, proxy, page);
            if (htmlBody == null || htmlBody.isEmpty()) {
                break;
            }
            Map<String, Object> dataMap = analysisData(htmlBody, proxy, word);
            // analysisData always stores a List<ZhiHuData> under "data".
            @SuppressWarnings("unchecked")
            List<ZhiHuData> dataList = (List<ZhiHuData>) dataMap.get("data");
            list.addAll(dataList);
            logger.info("当前采集关键词:{}, 采集到第:{}页,采集到的数据总量为:{}", word, page, list.size());
            if (!Boolean.TRUE.equals(dataMap.get("more")) || page == MAX_PAGE) {
                break;
            }
            // Pause between result pages.
            ZhiWeiTools.sleep(5000);
        }
        return list;
    }

    /**
     * Downloads one Sogou result page for the keyword.
     *
     * @param word  search keyword
     * @param proxy proxy handed to the HTTP layer as-is
     * @param page  1-based result page number
     * @return the page body, or {@code null} when no URL could be built or
     *         every attempt failed
     */
    private static String downloadHtml(String word, Proxy proxy, int page) {
        String url = getUrl(word, page);
        if (url == null) {
            return null;
        }
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        headerMap.put("Host", "zhihu.sogou.com");
        return fetchWithRetry(url, proxy, headerMap);
    }

    /**
     * Downloads a Zhihu detail page.
     *
     * @param url   address of the detail page; also sent as the Referer
     * @param proxy proxy handed to the HTTP layer as-is
     * @param type  result type; a value containing "文章" selects the
     *              column host, anything else the main Zhihu host
     * @return the page body, or {@code null} when every attempt failed
     */
    private static String downloadHtml(String url, Proxy proxy, String type) {
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        if (type.contains("文章")) {
            headerMap.put("Host", "zhuanlan.zhihu.com");
        } else {
            headerMap.put("Host", "www.zhihu.com");
        }
        headerMap.put("Referer", url);
        return fetchWithRetry(url, proxy, headerMap);
    }

    /**
     * Issues the GET request, retrying on {@link IOException}.
     *
     * @return the response body, or {@code null} after {@link #MAX_RETRY}
     *         failed attempts
     */
    private static String fetchWithRetry(String url, Proxy proxy, Map<String, String> headerMap) {
        for (int attempt = 1; attempt <= MAX_RETRY; attempt++) {
            try {
                return get(url, proxy, headerMap);
            } catch (IOException e) {
                // Pass the exception itself so its original stack trace is logged.
                logger.error("获取搜狗知乎数据时出现问题,链接:{},第{}次尝试", url, attempt, e);
            }
        }
        return null;
    }

    /**
     * Parses one Sogou result page.
     *
     * @param htmlBody HTML of the result page
     * @param proxy    proxy used for the follow-up detail requests
     * @param word     search keyword stored on every record
     * @return a map with "data" (a {@code List<ZhiHuData>}) and "more"
     *         (a {@code Boolean} telling whether a next page is offered)
     */
    private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) {
        List<ZhiHuData> list = new ArrayList<ZhiHuData>();
        Document document = Jsoup.parse(htmlBody);

        // A next page exists when the pager text offers "下一页". Jsoup's
        // select() never returns null, so no null check is needed here.
        boolean more = document.select("div.result-page").text().contains("下一页");

        Elements elementes = document.select("div.box-result").select("div.result-about-list");
        for (Element element : elementes) {
            try {
                list.add(parseResult(element, proxy, word));
                // Pause between detail-page requests.
                ZhiWeiTools.sleep(1000);
            } catch (Exception e) {
                // Best effort: one malformed entry must not abort the page.
                logger.error("搜狗知乎数据解析时出现问题", e);
            }
        }

        Map<String, Object> resultMap = new HashMap<String, Object>();
        resultMap.put("data", list);
        resultMap.put("more", more);
        return resultMap;
    }

    /**
     * Converts a single search hit into a record and enriches it from the
     * linked Zhihu page.
     */
    private static ZhiHuData parseResult(Element element, Proxy proxy, String word) {
        Elements titleLink = element.select("h4.about-list-title").select("a");
        String link = titleLink.attr("href");
        String title = titleLink.text();

        Elements answerNum = element.select("div.about-text").select("span.answer-num");
        String typeAndAnswerText = answerNum.text();
        String answerText = answerNum.select("a").text();
        // Literal replacement: answerText is page text, not a regular expression.
        String type = typeAndAnswerText.replace(answerText, "");

        if (type.contains("文章")) {
            Elements about = element.select("p.about-answer");
            String source = about.select("cite").text();
            Integer attitudes_count = parseCount(about.select("span.count").text(), "个赞");
            Integer comment_count = parseCount(answerText, "个评论");
            ZhiHuData article = new ZhiHuData(link, title, pt, type, null, source, null,
                    attitudes_count, null, comment_count, word);
            return analysisZhihuArticle(link, proxy, article);
        }

        Integer answer_count = parseCount(answerText, "个回答");
        ZhiHuData question = new ZhiHuData(link, title, pt, type, null, null, null,
                null, answer_count, null, word);
        return analysisZhihuAnswer(link, proxy, question);
    }

    /**
     * Reads a counter such as "12个回答".
     *
     * @param text raw text from the page
     * @param unit unit suffix to strip before parsing
     * @return the parsed number, or 0 when the text is blank or not a number
     */
    private static Integer parseCount(String text, String unit) {
        String digits = text.replace(unit, "").trim();
        if (digits.isEmpty()) {
            return 0;
        }
        try {
            return Integer.valueOf(digits);
        } catch (NumberFormatException e) {
            logger.warn("无法解析计数文本:{}", text);
            return 0;
        }
    }

    /**
     * Enriches a question record with the question detail text and its
     * comment count. The record is returned unchanged when the page cannot
     * be downloaded.
     *
     * @param url   address of the question page
     * @param proxy proxy handed to the HTTP layer as-is
     * @param zhihu record to enrich
     * @return the same record instance
     */
    private static ZhiHuData analysisZhihuAnswer(String url, Proxy proxy, ZhiHuData zhihu) {
        try {
            String htmlBody = downloadHtml(url, proxy, "问答");
            if (htmlBody == null) {
                return zhihu;
            }
            Document document = Jsoup.parse(htmlBody);
            String content = document.select("div.QuestionHeader-main")
                    .select("div.QuestionHeader-detail").text();
            zhihu.setContent(content);

            // Keep only the digits of the comment label.
            String commentCountText = NON_DIGIT
                    .matcher(document.select("div.QuestionHeader-Comment").text())
                    .replaceAll("");
            int comment_count = commentCountText.isEmpty() ? 0 : Integer.parseInt(commentCountText);
            zhihu.setComment_count(comment_count);
        } catch (Exception e) {
            logger.error("解析知乎问答页面时出现问题,链接:{}", url, e);
        }
        return zhihu;
    }

    /**
     * Enriches an article record with the article text and, when the page
     * exposes it, the publication time. The record is returned unchanged
     * when the page cannot be downloaded.
     *
     * @param url   address of the article page
     * @param proxy proxy handed to the HTTP layer as-is
     * @param zhihu record to enrich
     * @return the same record instance
     */
    @SuppressWarnings("deprecation")
    private static ZhiHuData analysisZhihuArticle(String url, Proxy proxy, ZhiHuData zhihu) {
        try {
            String htmlBody = downloadHtml(url, proxy, "文章");
            if (htmlBody == null) {
                return zhihu;
            }
            Document document = Jsoup.parse(htmlBody);

            // Set the content first so a missing or unparseable timestamp
            // does not discard it.
            String content = document
                    .select("[class=\"RichText PostIndex-content av-paddingSide av-card\"]").text();
            zhihu.setContent(content);

            Element hoverTitle = document.select("div.HoverTitle").first();
            if (hoverTitle != null) {
                String time = hoverTitle.select("time").attr("datetime");
                if (!time.isEmpty()) {
                    // Deprecated string constructor kept: the exact format of
                    // the datetime attribute is not known here.
                    Date date = new Date(time);
                    zhihu.setTime(TimeParse.dateFormartString(date, "yyyy-MM-dd HH:mm:ss"));
                }
            }
        } catch (Exception e) {
            logger.error("解析知乎文章页面时出现问题,链接:{}", url, e);
        }
        return zhihu;
    }

    /**
     * Builds the Sogou search URL for a keyword and page.
     *
     * @param word search keyword
     * @param page 1-based result page number
     * @return the URL, or {@code null} when {@code word} is {@code null}
     */
    private static String getUrl(String word, int page) {
        if (word == null) {
            return null;
        }
        return "http://zhihu.sogou.com/zhihu?query=" + URLCodeUtil.getURLEncode(word, "utf-8")
                + "&page=" + page;
    }
}
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
View file @
eaa3a775
...
@@ -6,7 +6,9 @@ import java.util.List;
...
@@ -6,7 +6,9 @@ import java.util.List;
import
com.zhiwei.media_data_crawler.crawler.BaiduNewsCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.BaiduNewsCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.SoNewsCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.SoNewsCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.SougouNewsCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.SougouNewsCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.media_data_crawler.entity.ZhiHuData
;
public
class
DataCrawler
{
public
class
DataCrawler
{
...
@@ -132,6 +134,22 @@ public class DataCrawler {
...
@@ -132,6 +134,22 @@ public class DataCrawler {
}
}
}
}
/**
 * Collects Zhihu records for a keyword via Sogou's Zhihu search.
 *
 * @param word  search keyword
 * @param proxy proxy to use for the requests
 * @return the collected records; an empty list (never {@code null}) when
 *         the crawl fails, so callers can use the result without a null
 *         check
 */
public static List<ZhiHuData> getSougouZhihuData(String word, Proxy proxy){
    try {
        return SougouZhihuCrawlerParse.getSougouZhihuData(word, proxy);
    } catch (Exception e) {
        // No logger is visible in this part of the class, so the original
        // stack-trace output is kept.
        e.printStackTrace();
        // Fully qualified to avoid depending on an import not shown here.
        return new java.util.ArrayList<ZhiHuData>();
    }
}
}
}
src/main/java/com/zhiwei/media_data_crawler/entity/ZhiHuData.java
0 → 100644
View file @
eaa3a775
package
com
.
zhiwei
.
media_data_crawler
.
entity
;
import
java.io.Serializable
;
/**
 * One record collected from Zhihu: either a question or a column article,
 * together with the keyword that produced it.
 */
public class ZhiHuData implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Address of the Zhihu page. */
    private String url;
    /** Title of the page. */
    private String title;
    /** Platform the record was collected from. */
    private String pt;
    /** Kind of record. */
    private String type;
    /** Time as text. */
    private String time;
    /** Publisher. */
    private String source;
    /** Content text. */
    private String content;
    /** Number of likes. */
    private Integer attitudes_count;
    /** Number of answers. */
    private Integer answer_count;
    /** Number of comments. */
    private Integer comment_count;
    /** Keyword used for collection. */
    private String word;

    /** Creates an empty record; every field starts as {@code null}. */
    public ZhiHuData() {
    }

    /** Creates a record with every field supplied by the caller. */
    public ZhiHuData(String url, String title, String pt, String type, String time,
            String source, String content, Integer attitudes_count,
            Integer answer_count, Integer comment_count, String word) {
        this.url = url;
        this.title = title;
        this.pt = pt;
        this.type = type;
        this.time = time;
        this.source = source;
        this.content = content;
        this.attitudes_count = attitudes_count;
        this.answer_count = answer_count;
        this.comment_count = comment_count;
        this.word = word;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getPt() {
        return pt;
    }

    public void setPt(String pt) {
        this.pt = pt;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public Integer getAttitudes_count() {
        return attitudes_count;
    }

    public void setAttitudes_count(Integer attitudes_count) {
        this.attitudes_count = attitudes_count;
    }

    public Integer getAnswer_count() {
        return answer_count;
    }

    public void setAnswer_count(Integer answer_count) {
        this.answer_count = answer_count;
    }

    public Integer getComment_count() {
        return comment_count;
    }

    public void setComment_count(Integer comment_count) {
        this.comment_count = comment_count;
    }

    public String getWord() {
        return word;
    }

    public void setWord(String word) {
        this.word = word;
    }

    /** Lists every field as {@code name = value}, in declaration order. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("new ZhiHuData[");
        text.append("url = ").append(url);
        text.append(", title = ").append(title);
        text.append(", pt = ").append(pt);
        text.append(", type = ").append(type);
        text.append(", time = ").append(time);
        text.append(", source = ").append(source);
        text.append(", content = ").append(content);
        text.append(", attitudes_count = ").append(attitudes_count);
        text.append(", answer_count = ").append(answer_count);
        text.append(", comment_count = ").append(comment_count);
        text.append(", word = ").append(word);
        text.append("]");
        return text.toString();
    }
}
src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
View file @
eaa3a775
package
com
.
zhiwei
.
media_data_crawler
.
test
;
package
com
.
zhiwei
.
media_data_crawler
.
test
;
import
java.net.Proxy
;
import
java.util.List
;
import
org.junit.Test
;
import
com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.media_data_crawler.entity.ZhiHuData
;
public
class
DataCrawlerTest
{
public
class
DataCrawlerTest
{
// @Test
@Test
// public void getSoNewsTest(){
public
void
getSoNewsTest
(){
// String word = "马云";
String
word
=
"马云"
;
//关键词
// List<NewsData> list = DataCrawler.getSoNewsData(word, null);
String
startTime
=
"2017-03-01 00:00:00"
;
//开始时间
// for(NewsData newsData : list){
String
endTime
=
"2017-03-01 23:59:59"
;
//结束时间
// System.out.println(newsData);
Proxy
proxy
=
null
;
//代理IP,不用可不填写
// }
//百度新闻采集demo
// }
// List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
// //搜狗新闻关键词采集demo
// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
// //360新闻采集demo
// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
//搜狗知乎采集
List
<
ZhiHuData
>
zhihuList
=
DataCrawler
.
getSougouZhihuData
(
word
,
proxy
);
System
.
out
.
println
(
zhihuList
.
size
());
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment