Commit 144dcd3b by [zhangzhiwei]

添加知乎回答采集

parent f518499b
package com.zhiwei.media_data_crawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.entity.ZhihuAnswer;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.net.Proxy;
import java.util.*;
/**
 * Crawler for the answers posted under a Zhihu question.
 *
 * <p>Answers are read page by page from the {@code /api/v4/questions/{id}/answers} endpoint
 * (requested with {@code sort_by=created}). The question's follower and view counters are
 * scraped once from the question's HTML page and attached to every answer.
 */
public class ZhihuAnwserCrawlerParse {

    /** Number of answers requested per API page; also used to compute the page offset. */
    private static final int PAGE_SIZE = 20;

    /** Pause between two page requests; crawling is single-threaded and throttled to avoid being blocked. */
    private static final int PAGE_INTERVAL_MILLIS = 8000;

    /** Prefix of the public URL of a question. */
    private static final String QUESTION_URL_PREFIX = "https://www.zhihu.com/question/";

    /** Captures the path segment following {@code question/}, without any query string or fragment. */
    private static final java.util.regex.Pattern QUESTION_ID_PATTERN =
            java.util.regex.Pattern.compile("question/([^/?#]+)");

    /** Answers of one API page together with the "more pages available" flag. */
    private static final class PageResult {
        private final List<ZhihuAnswer> answers;
        private final boolean more;

        private PageResult(List<ZhihuAnswer> answers, boolean more) {
            this.answers = answers;
            this.more = more;
        }
    }

    /**
     * Collects every answer of a question that was created after {@code endDate}.
     *
     * <p>Pages are fetched one after another, starting at page 0. The crawl stops when the API
     * reports no further page, or when a page contains no answer newer than {@code endDate}.
     * If a page other than the first one fails, the answers collected so far are returned.
     *
     * @param url     URL of the Zhihu question; must contain {@code question/<id>}
     * @param endDate only answers created strictly after this date are kept; must not be null
     * @param proxy   proxy handed to the HTTP layer (the bundled {@code main} passes null)
     * @return the collected answers, possibly empty
     * @throws Exception if the URL is not a question URL, the question page cannot be loaded,
     *                   or the very first answer page cannot be fetched or parsed
     */
    public static List<ZhihuAnswer> getAnswerList(String url, Date endDate, Proxy proxy) throws Exception {
        Objects.requireNonNull(endDate, "endDate");
        String questionId = getQuestionId(url);
        int[] board = getNumberBoard(url, proxy);

        List<ZhihuAnswer> answerList = new ArrayList<>();
        int page = 0;
        while (true) {
            PageResult result;
            try {
                result = fetchPage(questionId, endDate, page, board, proxy);
            } catch (Exception e) {
                if (page == 0) {
                    // Nothing has been collected yet: report the failure instead of
                    // returning an empty list that looks like "no answers".
                    throw e;
                }
                // Best effort for later pages: keep what has been collected so far.
                break;
            }
            if (result.answers.isEmpty()) {
                // No answer on this page is newer than endDate: stop crawling.
                break;
            }
            answerList.addAll(result.answers);
            if (!result.more) {
                break;
            }
            // Throttle only when another request will actually follow.
            ZhiWeiTools.sleep(PAGE_INTERVAL_MILLIS);
            page++;
        }
        return answerList;
    }

    /**
     * Reads the follower and view counters from the question's HTML page.
     *
     * <p>The counters are taken from the {@code title} attribute of the first two
     * {@code strong.NumberBoard-itemValue} elements. Both default to 0 when fewer than two such
     * elements exist; a counter that cannot be parsed as an integer also falls back to 0.
     *
     * @param url   URL of the question page
     * @param proxy proxy handed to the HTTP layer
     * @return a two-element array: {@code [followers, views]}
     * @throws Exception if the page cannot be downloaded
     */
    private static int[] getNumberBoard(String url, Proxy proxy) throws Exception {
        Document document = Jsoup.parse(download(url, proxy));
        Elements views = document.select("strong.NumberBoard-itemValue");
        int[] board = {0, 0};
        if (views.size() >= 2) {
            board[0] = parseCount(views.get(0).attr("title"));
            board[1] = parseCount(views.get(1).attr("title"));
        }
        return board;
    }

    /** Parses a counter, ignoring thousands separators and whitespace; returns 0 when it is not a number. */
    private static int parseCount(String raw) {
        if (raw == null) {
            return 0;
        }
        String digits = raw.replace(",", "").trim();
        if (digits.isEmpty()) {
            return 0;
        }
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    /**
     * Fetches a single page of answers.
     *
     * @param url     URL of the Zhihu question; must contain {@code question/<id>}
     * @param page    zero-based page index; each page holds up to 20 answers
     * @param endDate only answers created strictly after this date are kept; must not be null
     * @param proxy   proxy handed to the HTTP layer
     * @return a map with two entries: {@code "data"} (a {@code List<ZhihuAnswer>}) and
     *         {@code "more"} (a {@code Boolean} telling whether a further page exists)
     * @throws Exception if the URL is not a question URL, or a download or the parsing fails
     */
    public static Map<String,Object> getAnswerList(String url, int page, Date endDate, Proxy proxy) throws Exception {
        Objects.requireNonNull(endDate, "endDate");
        String questionId = getQuestionId(url);
        int[] board = getNumberBoard(url, proxy);
        PageResult result = fetchPage(questionId, endDate, page, board, proxy);

        Map<String,Object> resultMap = new HashMap<>();
        resultMap.put("data", result.answers);
        resultMap.put("more", result.more);
        return resultMap;
    }

    /**
     * Downloads one page of the answers API and converts it into {@link ZhihuAnswer} objects.
     *
     * @param questionId id of the question
     * @param endDate    answers created at or before this date are skipped
     * @param page       zero-based page index
     * @param board      {@code [followers, views]} of the question, copied onto every answer
     * @param proxy      proxy handed to the HTTP layer
     * @return the kept answers of the page and whether another page exists
     * @throws Exception if the download fails or the response has no {@code data} array
     */
    private static PageResult fetchPage(String questionId, Date endDate, int page, int[] board, Proxy proxy)
            throws Exception {
        String body = download(getUrl(questionId, page), proxy);
        JSONObject dataJson = JSONObject.parseObject(body);
        JSONArray jsonArray = dataJson == null ? null : dataJson.getJSONArray("data");
        if (jsonArray == null) {
            throw new Exception("Unexpected answers response for question " + questionId + ", page " + page);
        }

        String fromUrl = QUESTION_URL_PREFIX + questionId;
        List<ZhihuAnswer> answers = new ArrayList<>();
        for (int i = 0; i < jsonArray.size(); i++) {
            JSONObject answerJson = jsonArray.getJSONObject(i);
            Long createdSeconds = answerJson.getLong("created_time");
            if (createdSeconds == null) {
                continue;
            }
            // created_time is multiplied by 1000 to obtain the millisecond value Date expects.
            Date time = new Date(createdSeconds * 1000);
            if (!time.after(endDate)) {
                continue;
            }
            JSONObject authorJson = answerJson.getJSONObject("author");
            // Answer permalinks use the singular "answer" path segment.
            String link = fromUrl + "/answer/" + answerJson.getString("id");
            String author = authorJson.getString("name");
            String authorUrl = "https://www.zhihu.com/people/" + authorJson.getString("url_token");
            String content = ZhiWeiTools.delHTMLTag(answerJson.getString("content"));
            String title = answerJson.getJSONObject("question").getString("title");
            Integer voteupCount = answerJson.getInteger("voteup_count");
            Integer commentCount = answerJson.getInteger("comment_count");
            answers.add(new ZhihuAnswer(link, fromUrl, title, time, author, authorUrl, content,
                    voteupCount, commentCount, board[0], board[1]));
        }

        JSONObject paging = dataJson.getJSONObject("paging");
        Integer totals = paging == null ? null : paging.getInteger("totals");
        boolean more;
        if (totals != null) {
            // Pages are zero-based: after this page, (page + 1) * PAGE_SIZE answers have been covered.
            more = totals > (long) (page + 1) * PAGE_SIZE;
        } else {
            // Without a total, assume another page exists only when this one came back full.
            more = jsonArray.size() >= PAGE_SIZE;
        }
        return new PageResult(answers, more);
    }

    /**
     * Downloads the body of a URL with a GET request.
     *
     * @param url   address to fetch
     * @param proxy proxy handed to the HTTP layer
     * @return the response body as a string
     * @throws Exception if the request fails or yields no response body
     */
    private static String download(String url, Proxy proxy) throws Exception {
        try (Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            if (response == null || response.body() == null) {
                throw new Exception("Empty response for " + url);
            }
            return response.body().string();
        }
    }

    /**
     * Extracts the question id from a question URL.
     *
     * @param url URL containing {@code question/<id>}; anything after the id (further path
     *            segments, query string, fragment) is ignored
     * @return the question id
     * @throws Exception if the URL is null or does not contain {@code question/<id>}
     */
    private static String getQuestionId(String url) throws Exception {
        if (url != null) {
            java.util.regex.Matcher matcher = QUESTION_ID_PATTERN.matcher(url);
            if (matcher.find()) {
                return matcher.group(1);
            }
        }
        throw new Exception("链接不符合要求,不是正常的知乎问题链接");
    }

    /**
     * Builds the answers-API URL of one page.
     *
     * @param questionId id of the question
     * @param page       zero-based page index; the offset is {@code page * 20}
     * @return the API URL requesting up to 20 answers with {@code sort_by=created}
     */
    private static String getUrl(String questionId, int page) {
        return "https://www.zhihu.com/api/v4/questions/"+questionId+"/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2" +
                "Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit" +
                "%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2" +
                "Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp" +
                "%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset=" + page * PAGE_SIZE +
                "&limit=" + PAGE_SIZE + "&sort_by=created";
    }

    /** Manual smoke run: crawls one sample question without a proxy and prints what was found. */
    public static void main(String[] args) {
        String url = "https://www.zhihu.com/question/288128510";
        Date endDate = TimeParse.stringFormartDate("2018-09-20 08:00:00");
        try {
            List<ZhihuAnswer> answers = getAnswerList(url, endDate, null);
            for (ZhihuAnswer answer : answers) {
                System.out.println(answer);
            }
        } catch (Exception e) {
            // The original called fillInStackTrace(), which prints nothing and hid every failure.
            e.printStackTrace();
        }
    }
}
package com.zhiwei.media_data_crawler.entity;
import java.io.Serializable;
import java.util.Date;
public class ZhihuAnswer implements Serializable {
private static final long serialVersionUID = 1L;
private String url; //地址
private String from_url; //问题地址
private String title; //标题
private Date time; //时间
private String author; //发布者
private String authorUrl; //作者地址
private String content; //内容
private Integer attitudes_count; //点赞数
private Integer comment_count; //评论数
private Integer follow_count; //点赞数
private Integer bord_count; //评论数
public ZhihuAnswer(){}
public ZhihuAnswer(String url, String from_url,String title, Date time, String author,
String authorUrl ,String content, Integer attitudes_count,
Integer comment_count,Integer follow_count,Integer bord_count){
this.url = url;
this.from_url = from_url;
this.title = title;
this.time = time;
this.author = author;
this.authorUrl = authorUrl;
this.content = content;
this.attitudes_count = attitudes_count;
this.comment_count = comment_count;
this.follow_count = follow_count;
this.bord_count = bord_count;
}
@Override
public String toString() {
return "ZhihuAnswer{" +
"url='" + url + '\'' +
", from_url='" + from_url + '\'' +
", title='" + title + '\'' +
", time=" + time +
", author='" + author + '\'' +
", authorUrl='" + authorUrl + '\'' +
", content='" + content + '\'' +
", attitudes_count=" + attitudes_count +
", comment_count=" + comment_count +
", follow_count=" + follow_count +
", bord_count=" + bord_count +
'}';
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getFrom_url() {
return from_url;
}
public void setFrom_url(String from_url) {
this.from_url = from_url;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public Date getTime() {
return time;
}
public void setTime(Date time) {
this.time = time;
}
public String getAuthor() {
return author;
}
public void setAuthor(String author) {
this.author = author;
}
public String getAuthorUrl() {
return authorUrl;
}
public void setAuthorUrl(String authorUrl) {
this.authorUrl = authorUrl;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public Integer getAttitudes_count() {
return attitudes_count;
}
public void setAttitudes_count(Integer attitudes_count) {
this.attitudes_count = attitudes_count;
}
public Integer getComment_count() {
return comment_count;
}
public Integer getFollow_count() {
return follow_count;
}
public void setFollow_count(Integer follow_count) {
this.follow_count = follow_count;
}
public Integer getBord_count() {
return bord_count;
}
public void setBord_count(Integer bord_count) {
this.bord_count = bord_count;
}
public void setComment_count(Integer comment_count) {
this.comment_count = comment_count;
}
}
......@@ -3,8 +3,6 @@ package com.zhiwei.media_data_crawler.test;
import java.net.Proxy;
import java.util.List;
import org.junit.Test;
import com.zhiwei.media_data_crawler.crawler.BaiduTiebaCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
import com.zhiwei.media_data_crawler.data.DataCrawler;
......@@ -19,8 +17,7 @@ public class DataCrawlerTest {
@Test
public void getSoNewsTest(){
String word = "马云"; //关键词
String startTime = "2017-03-01 00:00:00"; //开始时间
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment