Commit ea9efe8f by yangchen

修改 知乎回答下评论采集

parent 45483734
......@@ -10,6 +10,7 @@
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
<attribute name="test" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
......@@ -20,7 +21,6 @@
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
<attributes>
<attribute name="maven.pomderived" value="true"/>
<attribute name="org.eclipse.jst.component.nondependency" value=""/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
......
......@@ -221,16 +221,12 @@ public class BaiduInforCrawlerParse {
return resultMap;
}
private static String downloadHtml(String url, int page) throws Exception{
// 获取通用请求头
Map<String, String> headerMap = HeaderTool.getCommonHead();
private static String downloadHtml(String url, int page) {
// 获取链接地址
url = url + "&pn=" + page * 10;
headerMap.put("Host", "news.baidu.com");
headerMap.put("Referer", url);
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_HEAVY_PROXY)){
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY)){
return response.body().string();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
......@@ -259,10 +255,6 @@ public class BaiduInforCrawlerParse {
int page = 0;
boolean more = true;
while (more) {
// 最大页数为20
if (page > 30) {
more = false;
}
String htmlBody = downloadHtml(url, page);
if (htmlBody != null) {
Map<String, Object> dataMap = analysisData(htmlBody, word);
......@@ -273,6 +265,10 @@ public class BaiduInforCrawlerParse {
more = false;
}
page++;
// 最大页数为20
if (page > 10) {
more = false;
}
}
return list;
}
......@@ -306,9 +302,12 @@ public class BaiduInforCrawlerParse {
//https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=%E6%B5%99%E6%B1%9F&medium=1&rn=50&gpc=stf%3D1546272000%2C1548950400%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=0
// public static void main(String[] args) throws Exception {
// String url = "http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=1&wd=%E5%A5%94%E9%A9%B0+%E6%BC%8F%E6%B1%BD%E6%B2%B9&medium=0&rn=50&gpc=stf%3D0%2C1496246399%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_l_more&x_bfe_rqs=03E80&x_bfe_tjscore=0.332314&scs=2546086922&sortBy=0&pn=0";
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<NewsData> ndList = getBaiduInforData("马云","2019-07-04 23:59:59");
// System.out.println(ndList.size());
// String result = downloadHtml(url,0);
// System.out.println(result);
// }
}
package com.zhiwei.media_data_crawler.crawler;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
......@@ -12,9 +11,7 @@ import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.media_data_crawler.entity.JianshuUser;
......@@ -78,7 +75,6 @@ public class JianshuCrawler {
}
break;
} catch (Exception e) {
e.printStackTrace();
logger.error("简书用户采集出错{}", e);
}
......@@ -88,6 +84,5 @@ public class JianshuCrawler {
}
/** Program entry point with an empty body: no statements are executed when it runs. */
public static void main(String[] args) {}
}
......@@ -214,7 +214,6 @@ public class SoNewsCrawlerParse {
} catch (Exception e) {
e.printStackTrace();
logger.error("360新闻数据解析时出现问题,问题为:{}", e.fillInStackTrace());
continue;
}
}
resultMap.put("data", list);
......@@ -236,8 +235,8 @@ public class SoNewsCrawlerParse {
* @return Map<String,Object> 返回类型
*/
private static Map<String, Object> analysisDataByTitle(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String, Object> resultMap = new HashMap<String, Object>();
List<NewsData> list = new ArrayList<NewsData>();
Map<String, Object> resultMap = new HashMap<>();
List<NewsData> list = new ArrayList<>();
boolean more = true;
/** 解析页面 */
......@@ -269,9 +268,7 @@ public class SoNewsCrawlerParse {
NewsData newsData = new NewsData(link, title, source, time, content, pt, word);
list.add(newsData);
} catch (Exception e) {
e.printStackTrace();
logger.error("360新闻数据解析时出现问题,问题为:{}", e.fillInStackTrace());
continue;
logger.error("360新闻数据解析时出现问题,问题为:{}", e);
}
}
resultMap.put("data", list);
......
......@@ -15,7 +15,6 @@ import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.media_data_crawler.entity.ZhihuAnswerComment;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
......@@ -45,10 +44,9 @@ public class ZhihuAnswerCommentParse {
int count = -1;
for(int i = 1;i < 3;i++) {
count = zacList.size();
String nurl = "https://www.zhihu.com/api/v4/answers/"+id+"/root_comments?" +
"include=data%5B*%5D.author%2Ccollapsed%2Creply_to_author%2Cdisliked%2" +
"Ccontent%2Cvoting%2Cvote_count%2Cis_parent_author%2Cis_author&order=norma" +
"l&limit=50&offset=" + pages + "&status=open";
//https://www.zhihu.com/api/v4/answers/708507274/root_comments?order=normal&limit=20&offset=20&status=open
String nurl = "https://www.zhihu.com/api/v4/answers/"+id+"/root_comments?order=norma" +
"l&limit=20&offset=" + pages + "&status=open";
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(nurl), proxy)){
String result = response.body().string();
zacList.addAll(getData(result));
......@@ -92,23 +90,23 @@ public class ZhihuAnswerCommentParse {
zac.setTime(new Date(createdTime * 1000L));
zac.setChildCommentCount(childCommentCount);
dataList.add(zac);
if (childCommentCount > 0) {
for (int g = 0; g < childCommentCount; g += 20) {
for(int n = 1;n < 5;n++) {
//避免太快,ip被封,导致数据无法获取
ZhiWeiTools.sleep(200);
String url2 = "https://www.zhihu.com/api/v4/comments/" + id + "/child_comments?include=%24%5B%2A%5D." +
"author%2Creply_to_author%2Ccontent%2Cvote_count&limit=" +
"50&offset=" + g + "&include=%24%5B*%5D.author%2Creply_to_author%2Ccontent%2Cvote_count";
//获取回答中的回复列表
List<ZhihuAnswerComment> replayList = getReplayList(url2,id);
if(!replayList.isEmpty()) {
dataList.addAll(replayList);
break;
}
}
}
}
// if (childCommentCount > 0) {
// for (int g = 0; g < childCommentCount; g += 20) {
// for(int n = 1;n < 5;n++) {
// //避免太快,ip被封,导致数据无法获取
// ZhiWeiTools.sleep(200);
// String url2 = "https://www.zhihu.com/api/v4/comments/" + id + "/child_comments?include=%24%5B%2A%5D." +
// "author%2Creply_to_author%2Ccontent%2Cvote_count&limit=" +
// "50&offset=" + g + "&include=%24%5B*%5D.author%2Creply_to_author%2Ccontent%2Cvote_count";
// //获取回答中的回复列表
// List<ZhihuAnswerComment> replayList = getReplayList(url2,id);
// if(!replayList.isEmpty()) {
// dataList.addAll(replayList);
// break;
// }
// }
// }
// }
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment