Commit c694f0ae by yangchen

增加知乎用户回答采集

parent 3c2a6baa
...@@ -26,13 +26,13 @@ public class ZhihuAnswerCommentParse { ...@@ -26,13 +26,13 @@ public class ZhihuAnswerCommentParse {
private static Logger logger = LogManager.getLogger(TianYaCrawlerParse.class); private static Logger logger = LogManager.getLogger(TianYaCrawlerParse.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot();
// public static void main(String[] args) { /**
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER); *
// List<ZhihuAnswerComment> zacList = getAnswerData("https://www.zhihu.com/question/36267070/answer/575449468", ProxyHolder.NAT_PROXY); * @Description 知乎回答下回复采集
// System.out.println(zacList.size()); * @param url
// * @param proxy
// } * @return
*/
public static List<ZhihuAnswerComment> getAnswerData(String url,ProxyHolder proxy) { public static List<ZhihuAnswerComment> getAnswerData(String url,ProxyHolder proxy) {
String id = getAnswerId(url); String id = getAnswerId(url);
if(Objects.isNull(id)) { if(Objects.isNull(id)) {
......
...@@ -23,7 +23,7 @@ public class ZhihuAnwserCrawlerParse { ...@@ -23,7 +23,7 @@ public class ZhihuAnwserCrawlerParse {
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot();
/** /**
* 获取数据 * 知乎回答采集
* @param url * @param url
* @param endDate * @param endDate
* @param proxy * @param proxy
......
...@@ -35,7 +35,7 @@ public class ZhihuCrawlerParse { ...@@ -35,7 +35,7 @@ public class ZhihuCrawlerParse {
/** /**
* @Title: getBaiduTiebaData * @Title: getBaiduTiebaData
* @author hero * @author hero
* @Description: 根據關鍵詞獲取百度貼吧數據(最多50頁) * @Description: 知乎关键词采集
* @param @param word * @param @param word
* @param @param proxy * @param @param proxy
* @param @param tiebaName * @param @param tiebaName
......
package com.zhiwei.media_data_crawler.crawler;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.media_data_crawler.entity.ZhihuAnswer;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
/**
 * Collects the answers published by a single Zhihu user through the
 * {@code /api/v4/members/{userId}/answers} endpoint, walking the result set
 * page by page via the {@code offset} query parameter.
 */
public class ZhihuUserAnswerCrawlerParse {

    private static final Logger logger = LoggerFactory.getLogger(ZhihuUserAnswerCrawlerParse.class);

    /** Number of answers requested per page; also the step applied to {@code offset}. */
    private static final int PAGE_SIZE = 20;

    /** Consecutive failed requests tolerated for one page before the crawl gives up. */
    private static final int MAX_RETRIES = 3;

    /** Pause between two requests, in milliseconds. */
    private static final int REQUEST_INTERVAL_MS = 200;

    private static final HttpBoot httpBoot = new HttpBoot();

    /**
     * Fetches the answers of the given user, newest first ({@code sort_by=created}).
     *
     * <p>Paging stops when a page comes back empty, when the offset reaches the
     * {@code paging.totals} value reported by the API, or when the same page has
     * failed {@value #MAX_RETRIES} times in a row. In the last case the answers
     * collected so far are still returned.
     *
     * @param userId Zhihu member identifier; it is inserted into the request URL as-is
     *               (not URL-encoded) and stored as {@code from_url} on every answer
     * @param proxy  proxy holder handed to {@code HttpBoot.syncCall} for every request
     * @return the collected answers, never {@code null}; possibly empty or partial
     */
    public static List<ZhihuAnswer> getData(String userId, ProxyHolder proxy) {
        String url = "https://www.zhihu.com/api/v4/members/" + userId
                + "/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&limit="
                + PAGE_SIZE + "&sort_by=created&offset=";
        int offset = 0;
        // Counts consecutive failures of the current page; it lives outside the loop
        // so that it actually accumulates and can end the crawl.
        int failures = 0;
        List<ZhihuAnswer> dataList = new ArrayList<>();
        // No custom headers are sent; an empty map is passed to the request builder.
        Map<String,Object> headers = new HashMap<>();

        while (true) {
            boolean finished;
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url + offset, headers), proxy)) {
                JSONObject json = JSONObject.parseObject(response.body().string());
                JSONArray items = json == null ? null : json.getJSONArray("data");
                if (items == null) {
                    // Treated as a failed fetch (handled by the catch below) so it is retried.
                    throw new IllegalStateException("response carries no \"data\" array");
                }
                failures = 0;
                if (items.isEmpty()) {
                    // An empty page means there is nothing left to read.
                    finished = true;
                } else {
                    for (int i = 0; i < items.size(); i++) {
                        try {
                            dataList.add(toAnswer(userId, items.getJSONObject(i)));
                        } catch (RuntimeException e) {
                            // One malformed record must not discard the rest of the page.
                            logger.warn("skipping malformed answer at offset {} index {}", offset, i, e);
                        }
                    }
                    JSONObject paging = json.getJSONObject("paging");
                    Integer total = paging == null ? null : paging.getInteger("totals");
                    logger.info(" 知乎用户回答采集 {} 采集第 {} 条 ,一共采集到 {} 条 ,总条数 {}", userId, offset, dataList.size(), total);
                    offset += PAGE_SIZE;
                    // Without a reported total, the empty-page check above ends the crawl.
                    finished = total != null && offset >= total;
                }
            } catch (Exception e) {
                failures++;
                // The throwable is passed last so SLF4J logs the stack trace and the
                // placeholder is filled with the failing URL.
                logger.error(" 访问出错 {} ", url + offset, e);
                finished = failures >= MAX_RETRIES;
            }
            if (finished) {
                break;
            }
            ZhiWeiTools.sleep(REQUEST_INTERVAL_MS);
        }
        return dataList;
    }

    /**
     * Maps one element of the API's {@code data} array onto a {@link ZhihuAnswer}.
     *
     * @param userId identifier of the crawled user, stored as {@code from_url}
     * @param data   one answer object from the response
     * @return the populated entity
     * @throws RuntimeException if a mandatory field (question, author, created_time) is absent
     */
    private static ZhihuAnswer toAnswer(String userId, JSONObject data) {
        JSONObject question = data.getJSONObject("question");
        String content = data.getString("content");

        ZhihuAnswer za = new ZhihuAnswer();
        za.setFrom_url(userId);
        za.setTitle(question.getString("title"));
        za.setAuthor(data.getJSONObject("author").getString("name"));
        // Strip HTML tags from the answer body; a missing body stays null.
        za.setContent(content == null ? null : content.replaceAll("<.*?>", ""));
        // created_time is multiplied by 1000 to obtain the millisecond value Date expects.
        za.setTime(new Date(data.getLong("created_time") * 1000L));
        // Rewrites the question URL from the ".../questions/..." form into the
        // ".../question/<id>/answer/<id>" form.
        za.setUrl(question.getString("url").replace("questions", "question") + "/answer/" + data.getString("id"));
        za.setAttitudes_count(data.getInteger("voteup_count"));
        za.setComment_count(data.getInteger("comment_count"));
        return za;
    }
}
...@@ -424,6 +424,21 @@ public class DataCrawler { ...@@ -424,6 +424,21 @@ public class DataCrawler {
throw e; throw e;
} }
} }
/**
 * Collects the answers of a Zhihu user.
 *
 * @param userId user identifier forwarded to {@code ZhihuUserAnswerCrawlerParse.getData}
 * @param proxy  proxy holder forwarded to {@code ZhihuUserAnswerCrawlerParse.getData}
 * @return the list produced by {@code ZhihuUserAnswerCrawlerParse.getData}
 */
public static List<ZhihuAnswer> getZhihuUserAnswewr(String userId, ProxyHolder proxy) {
    // Plain delegation: whatever the parser throws propagates to the caller unchanged.
    List<ZhihuAnswer> answers = ZhihuUserAnswerCrawlerParse.getData(userId, proxy);
    return answers;
}
} }
...@@ -21,13 +21,13 @@ public class ZhihuAnswer implements Serializable { ...@@ -21,13 +21,13 @@ public class ZhihuAnswer implements Serializable {
private String content; //内容 private String content; //内容
private Integer attitudes_count; //点赞数 private Integer attitudes_count; //回答点赞数
private Integer comment_count; //评论数 private Integer comment_count; //回答评论数
private Integer follow_count; //点赞数 private Integer follow_count; //问题点赞数
private Integer bord_count; //评论数 private Integer bord_count; //问题评论数
public ZhihuAnswer(){} public ZhihuAnswer(){}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment