Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
144dcd3b
Commit
144dcd3b
authored
Sep 21, 2018
by
[zhangzhiwei]
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加知乎回答采集
parent
f518499b
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
383 additions
and
3 deletions
+383
-3
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuAnwserCrawlerParse.java
+227
-0
src/main/java/com/zhiwei/media_data_crawler/entity/ZhihuAnswer.java
+156
-0
src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
+0
-3
No files found.
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuAnwserCrawlerParse.java
0 → 100644
View file @
144dcd3b
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.media_data_crawler.entity.ZhihuAnswer
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.select.Elements
;
import
java.net.Proxy
;
import
java.util.*
;
/**
 * Crawls the answers posted under a Zhihu question.
 *
 * <p>Answers are fetched page by page from the JSON endpoint built by
 * {@link #getUrl(String, int)}; the follower and view counters are scraped once from the
 * question's HTML page and attached to every answer.
 */
public class ZhihuAnwserCrawlerParse {

    /** Reports page failures that are tolerated so that partial results can still be returned. */
    private static final java.util.logging.Logger LOG =
            java.util.logging.Logger.getLogger(ZhihuAnwserCrawlerParse.class.getName());

    /** Number of answers requested per page; used for both the offset and the limit. */
    private static final int PAGE_SIZE = 20;

    /** Pause between two page requests in milliseconds (single-threaded crawl, avoids being blocked). */
    private static final int PAGE_INTERVAL_MILLIS = 8000;

    /**
     * Collects every answer of a question that was created strictly after {@code endDate}.
     *
     * <p>Pages are requested one after another until a page yields no matching answer or the
     * last page has been read. A failure while reading a page is logged and ends the crawl;
     * the answers collected so far are still returned (best effort).
     *
     * @param url     URL of the Zhihu question page; must contain {@code question/<id>}
     * @param endDate lower time bound; only answers created after this instant are kept
     * @param proxy   proxy handed to the HTTP layer; {@code main} passes {@code null}
     * @return the collected answers, possibly empty, never {@code null}
     * @throws Exception if the URL is not a question link or the question page cannot be downloaded
     */
    public static List<ZhihuAnswer> getAnswerList(String url, Date endDate, Proxy proxy) throws Exception {
        List<ZhihuAnswer> answerList = new ArrayList<>();
        String questionId = getQuestionId(url);
        String bord = getNumberBoard(url, proxy);
        boolean more = true;
        int page = 0;
        while (more) {
            try {
                Map<String, Object> dataMap = analsis(questionId, endDate, page, bord, proxy);
                @SuppressWarnings("unchecked") // analsis always stores a List<ZhihuAnswer> under "data"
                List<ZhihuAnswer> list = dataMap == null ? null : (List<ZhihuAnswer>) dataMap.get("data");
                if (list == null || list.isEmpty()) {
                    // No answer on this page passed the date filter: stop paging.
                    // NOTE(review): this presumes the API returns newest answers first - confirm.
                    more = false;
                } else {
                    answerList.addAll(list);
                    more = Boolean.TRUE.equals(dataMap.get("more"));
                }
                if (more) {
                    // Only wait when another request will actually follow.
                    ZhiWeiTools.sleep(PAGE_INTERVAL_MILLIS);
                    page++;
                }
            } catch (Exception e) {
                // Best effort: keep what was collected, but do not hide the failure.
                LOG.log(java.util.logging.Level.WARNING,
                        "Stopped crawling question " + questionId + " at page " + page, e);
                more = false;
            }
        }
        return answerList;
    }

    /**
     * Reads the follower count and the view count from the question's HTML page.
     *
     * @param url   URL of the question page
     * @param proxy proxy handed to the HTTP layer
     * @return {@code "<followers>,<views>"}; {@code "0,0"} when the page does not contain at least
     *         two {@code strong.NumberBoard-itemValue} elements
     * @throws Exception if the page cannot be downloaded
     */
    private static String getNumberBoard(String url, Proxy proxy) throws Exception {
        String body = download(url, proxy);
        Document document = Jsoup.parse(body);
        Elements views = document.select("strong.NumberBoard-itemValue");
        String follow = "0";
        String view = "0";
        if (views.size() >= 2) {
            // The values are taken from the "title" attribute of the first two board items.
            follow = views.get(0).attr("title");
            view = views.get(1).attr("title");
        }
        return follow + "," + view;
    }

    /**
     * Fetches a single page of answers.
     *
     * <p>Every call downloads the question page again to read the follower and view counters.
     *
     * @param url     URL of the Zhihu question page; must contain {@code question/<id>}
     * @param page    zero-based page index
     * @param endDate lower time bound; only answers created after this instant are kept
     * @param proxy   proxy handed to the HTTP layer
     * @return a map with {@code "data"} (a {@code List<ZhihuAnswer>}) and {@code "more"}
     *         (a {@code Boolean} telling whether further pages exist)
     * @throws Exception if the URL is not a question link, or downloading or parsing fails
     */
    public static Map<String, Object> getAnswerList(String url, int page, Date endDate, Proxy proxy) throws Exception {
        String questionId = getQuestionId(url);
        String bord = getNumberBoard(url, proxy);
        return analsis(questionId, endDate, page, bord, proxy);
    }

    /**
     * Downloads one page of the answers API and converts the matching entries.
     *
     * @param questionId id of the question
     * @param endDate    lower time bound; only answers created after this instant are kept
     * @param page       zero-based page index
     * @param bord       {@code "<followers>,<views>"} as produced by {@link #getNumberBoard}
     * @param proxy      proxy handed to the HTTP layer
     * @return a map with {@code "data"} (answers of this page that passed the date filter) and
     *         {@code "more"} ({@code true} while answers remain beyond this page)
     * @throws Exception if downloading fails or the response lacks the expected fields
     */
    private static Map<String, Object> analsis(String questionId, Date endDate, int page, String bord, Proxy proxy) throws Exception {
        List<ZhihuAnswer> answerList = new ArrayList<>();
        String urlNext = getUrl(questionId, page);
        String body = download(urlNext, proxy);
        JSONObject dataJson = JSONObject.parseObject(body);
        Integer count = dataJson.getJSONObject("paging").getInteger("totals");
        JSONArray jsonArray = dataJson.getJSONArray("data");
        String fromUrl = "https://www.zhihu.com/question/" + questionId;

        // The counters are identical for every answer of the question: parse them once.
        String[] counters = bord.split(",");
        Integer followCount = parseCount(counters, 0);
        Integer viewCount = parseCount(counters, 1);

        for (int i = 0; i < jsonArray.size(); i++) {
            JSONObject answerJson = jsonArray.getJSONObject(i);
            // "created_time" is multiplied by 1000 to obtain the millisecond value Date expects.
            Date time = new Date(answerJson.getLong("created_time") * 1000L);
            if (!time.after(endDate)) {
                continue;
            }
            JSONObject authorJson = answerJson.getJSONObject("author");
            // NOTE(review): the path segment "/answers/" is kept as written - confirm it matches
            // the public answer permalink format.
            String link = fromUrl + "/answers/" + answerJson.getString("id");
            String author = authorJson.getString("name");
            String authorUrl = "https://www.zhihu.com/people/" + authorJson.getString("url_token");
            String content = ZhiWeiTools.delHTMLTag(answerJson.getString("content"));
            String title = answerJson.getJSONObject("question").getString("title");
            Integer voteupCount = answerJson.getInteger("voteup_count");
            Integer commentCount = answerJson.getInteger("comment_count");
            answerList.add(new ZhihuAnswer(link, fromUrl, title, time, author, authorUrl, content,
                    voteupCount, commentCount, followCount, viewCount));
        }

        // This page covers offsets [page * PAGE_SIZE, (page + 1) * PAGE_SIZE); more answers exist
        // only if the reported total reaches beyond that window. Without a total, keep paging
        // while pages still carry entries.
        boolean more = count != null
                ? count > (page + 1) * PAGE_SIZE
                : !jsonArray.isEmpty();

        Map<String, Object> resultMap = new HashMap<>();
        resultMap.put("data", answerList);
        resultMap.put("more", more);
        return resultMap;
    }

    /**
     * Parses one counter of the {@code "<followers>,<views>"} string.
     *
     * @param parts the split counter string
     * @param index position of the wanted counter
     * @return the parsed value, or {@code 0} when the part is missing or not a plain integer, so
     *         that an unreadable counter does not discard the answers of a whole page
     */
    private static Integer parseCount(String[] parts, int index) {
        if (index >= parts.length) {
            return 0;
        }
        try {
            return Integer.valueOf(parts[index].trim());
        } catch (NumberFormatException ignored) {
            // Same fallback value getNumberBoard uses when the board is absent.
            return 0;
        }
    }

    /**
     * Downloads the body of the given URL with a GET request.
     *
     * @param url   address to request
     * @param proxy proxy handed to the HTTP layer
     * @return the response body as text
     * @throws Exception if the request or reading the body fails
     */
    private static String download(String url, Proxy proxy) throws Exception {
        // try-with-resources closes the response even when reading the body fails.
        try (Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            return response.body().string();
        }
    }

    /**
     * Extracts the question id from a question link.
     *
     * <p>The id is the text between the first {@code question/} and the next {@code /},
     * {@code ?} or {@code #} (or the end of the link).
     *
     * @param url link to inspect
     * @return the question id, never empty
     * @throws Exception if the link is {@code null} or contains no id after {@code question/}
     */
    private static String getQuestionId(String url) throws Exception {
        final String marker = "question/";
        int start = url == null ? -1 : url.indexOf(marker);
        if (start >= 0) {
            start += marker.length();
            int end = start;
            // Stop at a path separator, query string or fragment so they never leak into the id.
            while (end < url.length() && "/?#".indexOf(url.charAt(end)) < 0) {
                end++;
            }
            if (end > start) {
                return url.substring(start, end);
            }
        }
        throw new Exception("链接不符合要求,不是正常的知乎问题链接");
    }

    /**
     * Builds the answers API URL for one page of a question.
     *
     * @param questionId id of the question
     * @param page       zero-based page index; the offset is {@code page * PAGE_SIZE}
     * @return the request URL, sorted by creation time
     */
    private static String getUrl(String questionId, int page) {
        return "https://www.zhihu.com/api/v4/questions/" + questionId
                + "/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2"
                + "Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit"
                + "%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2"
                + "Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp"
                + "%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset="
                + (page * PAGE_SIZE)
                + "&limit=" + PAGE_SIZE + "&sort_by=created";
    }

    /**
     * Manual smoke run against a fixed question and date.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "https://www.zhihu.com/question/288128510";
        Date endDate = TimeParse.stringFormartDate("2018-09-20 08:00:00");
        try {
            getAnswerList(url, endDate, null);
        } catch (Exception e) {
            // Report the failure instead of discarding it.
            LOG.log(java.util.logging.Level.SEVERE, "Crawling " + url + " failed", e);
        }
    }
}
src/main/java/com/zhiwei/media_data_crawler/entity/ZhihuAnswer.java
0 → 100644
View file @
144dcd3b
package
com
.
zhiwei
.
media_data_crawler
.
entity
;
import
java.io.Serializable
;
import
java.util.Date
;
public
class
ZhihuAnswer
implements
Serializable
{
private
static
final
long
serialVersionUID
=
1L
;
private
String
url
;
//地址
private
String
from_url
;
//问题地址
private
String
title
;
//标题
private
Date
time
;
//时间
private
String
author
;
//发布者
private
String
authorUrl
;
//作者地址
private
String
content
;
//内容
private
Integer
attitudes_count
;
//点赞数
private
Integer
comment_count
;
//评论数
private
Integer
follow_count
;
//点赞数
private
Integer
bord_count
;
//评论数
public
ZhihuAnswer
(){}
public
ZhihuAnswer
(
String
url
,
String
from_url
,
String
title
,
Date
time
,
String
author
,
String
authorUrl
,
String
content
,
Integer
attitudes_count
,
Integer
comment_count
,
Integer
follow_count
,
Integer
bord_count
){
this
.
url
=
url
;
this
.
from_url
=
from_url
;
this
.
title
=
title
;
this
.
time
=
time
;
this
.
author
=
author
;
this
.
authorUrl
=
authorUrl
;
this
.
content
=
content
;
this
.
attitudes_count
=
attitudes_count
;
this
.
comment_count
=
comment_count
;
this
.
follow_count
=
follow_count
;
this
.
bord_count
=
bord_count
;
}
@Override
public
String
toString
()
{
return
"ZhihuAnswer{"
+
"url='"
+
url
+
'\''
+
", from_url='"
+
from_url
+
'\''
+
", title='"
+
title
+
'\''
+
", time="
+
time
+
", author='"
+
author
+
'\''
+
", authorUrl='"
+
authorUrl
+
'\''
+
", content='"
+
content
+
'\''
+
", attitudes_count="
+
attitudes_count
+
", comment_count="
+
comment_count
+
", follow_count="
+
follow_count
+
", bord_count="
+
bord_count
+
'}'
;
}
public
String
getUrl
()
{
return
url
;
}
public
void
setUrl
(
String
url
)
{
this
.
url
=
url
;
}
public
String
getFrom_url
()
{
return
from_url
;
}
public
void
setFrom_url
(
String
from_url
)
{
this
.
from_url
=
from_url
;
}
public
String
getTitle
()
{
return
title
;
}
public
void
setTitle
(
String
title
)
{
this
.
title
=
title
;
}
public
Date
getTime
()
{
return
time
;
}
public
void
setTime
(
Date
time
)
{
this
.
time
=
time
;
}
public
String
getAuthor
()
{
return
author
;
}
public
void
setAuthor
(
String
author
)
{
this
.
author
=
author
;
}
public
String
getAuthorUrl
()
{
return
authorUrl
;
}
public
void
setAuthorUrl
(
String
authorUrl
)
{
this
.
authorUrl
=
authorUrl
;
}
public
String
getContent
()
{
return
content
;
}
public
void
setContent
(
String
content
)
{
this
.
content
=
content
;
}
public
Integer
getAttitudes_count
()
{
return
attitudes_count
;
}
public
void
setAttitudes_count
(
Integer
attitudes_count
)
{
this
.
attitudes_count
=
attitudes_count
;
}
public
Integer
getComment_count
()
{
return
comment_count
;
}
public
Integer
getFollow_count
()
{
return
follow_count
;
}
public
void
setFollow_count
(
Integer
follow_count
)
{
this
.
follow_count
=
follow_count
;
}
public
Integer
getBord_count
()
{
return
bord_count
;
}
public
void
setBord_count
(
Integer
bord_count
)
{
this
.
bord_count
=
bord_count
;
}
public
void
setComment_count
(
Integer
comment_count
)
{
this
.
comment_count
=
comment_count
;
}
}
src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
View file @
144dcd3b
...
...
@@ -3,8 +3,6 @@ package com.zhiwei.media_data_crawler.test;
import
java.net.Proxy
;
import
java.util.List
;
import
org.junit.Test
;
import
com.zhiwei.media_data_crawler.crawler.BaiduTiebaCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
...
...
@@ -20,7 +18,6 @@ public class DataCrawlerTest {
@Test
public
void
getSoNewsTest
(){
String
word
=
"马云"
;
//关键词
String
startTime
=
"2017-03-01 00:00:00"
;
//开始时间
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment