Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
c694f0ae
Commit
c694f0ae
authored
Jan 31, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
增加知乎用户回答采集
parent
3c2a6baa
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
100 additions
and
13 deletions
+100
-13
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuAnswerCommentParse.java
+7
-7
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuAnwserCrawlerParse.java
+1
-1
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuCrawlerParse.java
+1
-1
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuUserAnswerCrawlerParse.java
+72
-0
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
+15
-0
src/main/java/com/zhiwei/media_data_crawler/entity/ZhihuAnswer.java
+4
-4
No files found.
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuAnswerCommentParse.java
View file @
c694f0ae
...
@@ -26,13 +26,13 @@ public class ZhihuAnswerCommentParse {
...
@@ -26,13 +26,13 @@ public class ZhihuAnswerCommentParse {
private
static
Logger
logger
=
LogManager
.
getLogger
(
TianYaCrawlerParse
.
class
);
private
static
Logger
logger
=
LogManager
.
getLogger
(
TianYaCrawlerParse
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
// public static void main(String[] args) {
/**
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
*
// List<ZhihuAnswerComment> zacList = getAnswerData("https://www.zhihu.com/question/36267070/answer/575449468", ProxyHolder.NAT_PROXY);
* @Description 知乎回答下回复采集
// System.out.println(zacList.size());
* @param url
//
* @param proxy
// }
* @return
*/
public
static
List
<
ZhihuAnswerComment
>
getAnswerData
(
String
url
,
ProxyHolder
proxy
)
{
public
static
List
<
ZhihuAnswerComment
>
getAnswerData
(
String
url
,
ProxyHolder
proxy
)
{
String
id
=
getAnswerId
(
url
);
String
id
=
getAnswerId
(
url
);
if
(
Objects
.
isNull
(
id
))
{
if
(
Objects
.
isNull
(
id
))
{
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuAnwserCrawlerParse.java
View file @
c694f0ae
...
@@ -23,7 +23,7 @@ public class ZhihuAnwserCrawlerParse {
...
@@ -23,7 +23,7 @@ public class ZhihuAnwserCrawlerParse {
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
/**
/**
*
获取数据
*
知乎回答采集
* @param url
* @param url
* @param endDate
* @param endDate
* @param proxy
* @param proxy
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuCrawlerParse.java
View file @
c694f0ae
...
@@ -35,7 +35,7 @@ public class ZhihuCrawlerParse {
...
@@ -35,7 +35,7 @@ public class ZhihuCrawlerParse {
/**
/**
* @Title: getBaiduTiebaData
* @Title: getBaiduTiebaData
* @author hero
* @author hero
* @Description:
根據關鍵詞獲取百度貼吧數據(最多50頁)
* @Description:
知乎关键词采集
* @param @param word
* @param @param word
* @param @param proxy
* @param @param proxy
* @param @param tiebaName
* @param @param tiebaName
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuUserAnswerCrawlerParse.java
0 → 100644
View file @
c694f0ae
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.media_data_crawler.entity.ZhihuAnswer
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
/**
 * Collects the answers published by one Zhihu user by paging through the
 * {@code /api/v4/members/{userId}/answers} endpoint.
 */
public class ZhihuUserAnswerCrawlerParse {

    private static final Logger logger = LoggerFactory.getLogger(ZhihuUserAnswerCrawlerParse.class);

    private static HttpBoot httpBoot = new HttpBoot();

    /** Offset step between two requests; matches the {@code limit=20} query parameter. */
    private static final int PAGE_SIZE = 20;

    /** Number of requests tolerated past the reported total before paging stops. */
    private static final int MAX_OVERRUN_PAGES = 3;

    /** Number of consecutive failed requests after which crawling is abandoned. */
    private static final int MAX_CONSECUTIVE_ERRORS = 3;

    /**
     * Fetches the answers of a user page by page.
     *
     * <p>Paging stops once the offset (or the number of collected answers) has exceeded the
     * {@code paging.totals} value reported by the API for {@link #MAX_OVERRUN_PAGES} requests,
     * or once {@link #MAX_CONSECUTIVE_ERRORS} requests in a row have failed. In the latter case
     * the answers collected so far are returned.
     *
     * @param userId the user identifier inserted into the request URL; also stored as
     *               {@code from_url} on every returned answer
     * @param proxy  proxy holder passed to {@code HttpBoot.syncCall}
     * @return the collected answers, never {@code null}; possibly incomplete if requests failed
     */
    public static List<ZhihuAnswer> getData(String userId, ProxyHolder proxy) {
        String url = "https://www.zhihu.com/api/v4/members/" + userId
                + "/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&limit=20&sort_by=created&offset=";
        int page = 0;
        List<ZhihuAnswer> dataList = new ArrayList<>();
        Map<String, Object> headers = new HashMap<>();
        // headers.put("referer", "https://www.zhihu.com/people/"+userId+"/answers");
        // headers.put("user-agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");
        // headers.put("cookie", "tgw_l7_route=116a747939468d99065d12a386ab1c5f; _xsrf=gn2oQ7N4G6yGOny4hc3T1TRr4kPOF4ij");

        // Both counters must live outside the loop: declared inside, they would be reset on
        // every iteration and the exit conditions below could never be reached.
        int overrunPages = 0;
        int consecutiveErrors = 0;
        while (true) {
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url + page, headers), proxy)) {
                String result = response.body().string();
                JSONObject json = JSONObject.parseObject(result);
                JSONArray jsonArray = json.getJSONArray("data");
                for (int i = 0; i < jsonArray.size(); i++) {
                    dataList.add(toAnswer(userId, jsonArray.getJSONObject(i)));
                }
                int total = json.getJSONObject("paging").getInteger("totals");
                logger.info(" 知乎用户回答采集 {} 采集第 {} 条 ,一共采集到 {} 条 ,总条数 {}", userId, page, dataList.size(), total);
                consecutiveErrors = 0;
                if (dataList.size() > total || page > total) {
                    overrunPages++;
                    if (overrunPages >= MAX_OVERRUN_PAGES) {
                        break;
                    }
                }
                ZhiWeiTools.sleep(200);
                page += PAGE_SIZE;
            } catch (Exception e) {
                // The offset is not advanced, so the same page is retried; the trailing
                // Throwable is logged with its stack trace, the URL fills the placeholder.
                logger.error(" 访问出错 {} ", url + page, e);
                consecutiveErrors++;
                if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
                    break;
                }
            }
        }
        return dataList;
    }

    /** Maps one element of the response's {@code data} array to a {@link ZhihuAnswer}. */
    private static ZhihuAnswer toAnswer(String userId, JSONObject data) {
        JSONObject question = data.getJSONObject("question");
        ZhihuAnswer za = new ZhihuAnswer();
        za.setFrom_url(userId);
        za.setTitle(question.getString("title"));
        za.setAuthor(data.getJSONObject("author").getString("name"));
        // Strip HTML tags from the answer body.
        za.setContent(data.getString("content").replaceAll("<.*?>", ""));
        // created_time is multiplied by 1000 to obtain the millisecond value Date expects.
        za.setTime(new Date(data.getLong("created_time") * 1000L));
        // Turn the API question URL ("questions") into the "question/.../answer/<id>" form.
        za.setUrl(question.getString("url").replace("questions", "question") + "/answer/" + data.getString("id"));
        za.setAttitudes_count(data.getInteger("voteup_count"));
        za.setComment_count(data.getInteger("comment_count"));
        return za;
    }
}
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
View file @
c694f0ae
...
@@ -425,5 +425,20 @@ public class DataCrawler {
...
@@ -425,5 +425,20 @@ public class DataCrawler {
}
}
}
}
/**
 * Collects the answers published by a Zhihu user.
 *
 * <p>Thin delegate to {@code ZhihuUserAnswerCrawlerParse.getData}; any exception thrown
 * there propagates unchanged. The misspelled method name is kept so existing callers
 * continue to compile.
 *
 * @param userId the Zhihu user identifier, forwarded unchanged
 * @param proxy  the proxy holder, forwarded unchanged
 * @return the list returned by {@code ZhihuUserAnswerCrawlerParse.getData}
 */
public static List<ZhihuAnswer> getZhihuUserAnswewr(String userId, ProxyHolder proxy) {
    return ZhihuUserAnswerCrawlerParse.getData(userId, proxy);
}
}
}
src/main/java/com/zhiwei/media_data_crawler/entity/ZhihuAnswer.java
View file @
c694f0ae
...
@@ -21,13 +21,13 @@ public class ZhihuAnswer implements Serializable {
...
@@ -21,13 +21,13 @@ public class ZhihuAnswer implements Serializable {
private
String
content
;
//内容
private
String
content
;
//内容
private
Integer
attitudes_count
;
//点赞数
private
Integer
attitudes_count
;
//
回答
点赞数
private
Integer
comment_count
;
//评论数
private
Integer
comment_count
;
//
回答
评论数
private
Integer
follow_count
;
//点赞数
private
Integer
follow_count
;
//
问题
点赞数
private
Integer
bord_count
;
//评论数
private
Integer
bord_count
;
//
问题
评论数
public
ZhihuAnswer
(){}
public
ZhihuAnswer
(){}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment