Commit 040405fc by [zhangzhiwei]

修复知乎根据用户采集用户回答死循环问题

parent c694f0ae
......@@ -26,32 +26,41 @@ public class ZhihuUserAnswerCrawlerParse {
private static HttpBoot httpBoot = new HttpBoot();
public static List<ZhihuAnswer> getData(String userId,ProxyHolder proxy) {
String url = "https://www.zhihu.com/api/v4/members/"+userId+"/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&limit=20&sort_by=created&offset=";
int page = 0;
List<ZhihuAnswer> dataList = new ArrayList<>();
Map<String,Object> headers = new HashMap<>();
// headers.put("referer", "https://www.zhihu.com/people/"+userId+"/answers");
// headers.put("user-agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");
// headers.put("cookie", "tgw_l7_route=116a747939468d99065d12a386ab1c5f; _xsrf=gn2oQ7N4G6yGOny4hc3T1TRr4kPOF4ij");
// headers.put("cookie", "_zap=37e196ce-6bf6-4680-9c40-a4b3dea72a47; _xsrf=g11of1WpkFPUYCJ88GRAlpty8bMnntT0; d_c0=\"ALDmEMw9Fw6PTsQcBCjppwDT8MbPGyQLkuo=|1534857872\"; __utma=51854390.1770583360.1534857893.1534857893.1534857893.1; __utmz=51854390.1534857893.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=51854390.000--|3=entry_date=20180821=1; z_c0=\"2|1:0|10:1545787952|4:z_c0|92:Mi4xbFFnaEFBQUFBQUFBc09ZUXpEMFhEaVlBQUFCZ0FsVk5NQ2dRWFFCUGxTLS1CczBSZWdDUzgyTFZOTmd4WHJISFR3|93b1755c91416a906602a708b0a451f7748cc1ff6fe5ee318fe2e7e15d30f101\"; tst=r; q_c1=3db855c272674e60bc301eae9948df45|1547635145000|1534857872000; tgw_l7_route=1b9b7363f02f3a5519d03bdf813bc914");
while(true) {
int count = 1;
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url + page,headers), proxy)){
String urlNewww = url + page;
// System.out.println("urlNew================"+urlNewww);
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(urlNewww,headers), proxy)){
String result = response.body().string();
JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArray = json.getJSONArray("data");
for(int i =0;i < jsonArray.size();i++) {
JSONObject data = jsonArray.getJSONObject(i);
ZhihuAnswer za = new ZhihuAnswer();
za.setFrom_url(userId);
za.setTitle(data.getJSONObject("question").getString("title"));
za.setAuthor(data.getJSONObject("author").getString("name"));
za.setContent(data.getString("content").replaceAll("<.*?>", ""));
za.setTime(new Date(data.getLong("created_time")*1000L));
za.setUrl(data.getJSONObject("question").getString("url").replace("questions", "question")+"/answer/"+data.getString("id"));
za.setAttitudes_count(data.getInteger("voteup_count"));
za.setComment_count(data.getInteger("comment_count"));
dataList.add(za);
if(jsonArray!=null && jsonArray.size()>0){
for(int i =0;i < jsonArray.size();i++) {
JSONObject data = jsonArray.getJSONObject(i);
ZhihuAnswer za = new ZhihuAnswer();
za.setFrom_url(userId);
za.setTitle(data.getJSONObject("question").getString("title"));
za.setAuthor(data.getJSONObject("author").getString("name"));
za.setContent(data.getString("content").replaceAll("<.*?>", ""));
za.setTime(new Date(data.getLong("created_time")*1000L));
za.setUrl(data.getJSONObject("question").getString("url").replace("questions", "question")+"/answer/"+data.getString("id"));
za.setAttitudes_count(data.getInteger("voteup_count"));
za.setComment_count(data.getInteger("comment_count"));
dataList.add(za);
}
}else{
break;
}
int total = json.getJSONObject("paging").getInteger("totals");
logger.info(" 知乎用户回答采集 {} 采集第 {} 条 ,一共采集到 {} 条 ,总条数 {}",userId,page,dataList.size(),total);
if(dataList.size() > total || page > total) {
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment