Commit ea9efe8f by yangchen

修改 知乎回答下评论采集

parent 45483734
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
<attributes> <attributes>
<attribute name="optional" value="true"/> <attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/> <attribute name="maven.pomderived" value="true"/>
<attribute name="test" value="true"/>
</attributes> </attributes>
</classpathentry> </classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8"> <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
...@@ -20,7 +21,6 @@ ...@@ -20,7 +21,6 @@
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER"> <classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
<attributes> <attributes>
<attribute name="maven.pomderived" value="true"/> <attribute name="maven.pomderived" value="true"/>
<attribute name="org.eclipse.jst.component.nondependency" value=""/>
</attributes> </attributes>
</classpathentry> </classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/> <classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
......
...@@ -221,16 +221,12 @@ public class BaiduInforCrawlerParse { ...@@ -221,16 +221,12 @@ public class BaiduInforCrawlerParse {
return resultMap; return resultMap;
} }
private static String downloadHtml(String url, int page) throws Exception{ private static String downloadHtml(String url, int page) {
// 获取通用请求头
Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址 // 获取链接地址
url = url + "&pn=" + page * 10; url = url + "&pn=" + page * 10;
headerMap.put("Host", "news.baidu.com");
headerMap.put("Referer", url);
// 下载数据页面 // 下载数据页面
for (int i = 1; i <= 3; i++) { for (int i = 1; i <= 3; i++) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_HEAVY_PROXY)){ try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY)){
return response.body().string(); return response.body().string();
} catch (Exception e) { } catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
...@@ -259,10 +255,6 @@ public class BaiduInforCrawlerParse { ...@@ -259,10 +255,6 @@ public class BaiduInforCrawlerParse {
int page = 0; int page = 0;
boolean more = true; boolean more = true;
while (more) { while (more) {
// 最大页数为20
if (page > 30) {
more = false;
}
String htmlBody = downloadHtml(url, page); String htmlBody = downloadHtml(url, page);
if (htmlBody != null) { if (htmlBody != null) {
Map<String, Object> dataMap = analysisData(htmlBody, word); Map<String, Object> dataMap = analysisData(htmlBody, word);
...@@ -273,6 +265,10 @@ public class BaiduInforCrawlerParse { ...@@ -273,6 +265,10 @@ public class BaiduInforCrawlerParse {
more = false; more = false;
} }
page++; page++;
// Stop paging once the page counter exceeds 10
if (page > 10) {
more = false;
}
} }
return list; return list;
} }
...@@ -306,9 +302,12 @@ public class BaiduInforCrawlerParse { ...@@ -306,9 +302,12 @@ public class BaiduInforCrawlerParse {
//https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=%E6%B5%99%E6%B1%9F&medium=1&rn=50&gpc=stf%3D1546272000%2C1548950400%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=0 //https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=%E6%B5%99%E6%B1%9F&medium=1&rn=50&gpc=stf%3D1546272000%2C1548950400%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=0
// public static void main(String[] args) throws Exception { // public static void main(String[] args) throws Exception {
// String url = "http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=1&wd=%E5%A5%94%E9%A9%B0+%E6%BC%8F%E6%B1%BD%E6%B2%B9&medium=0&rn=50&gpc=stf%3D0%2C1496246399%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_l_more&x_bfe_rqs=03E80&x_bfe_tjscore=0.332314&scs=2546086922&sortBy=0&pn=0";
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER); // ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<NewsData> ndList = getBaiduInforData("马云","2019-07-04 23:59:59"); // List<NewsData> ndList = getBaiduInforData("马云","2019-07-04 23:59:59");
// System.out.println(ndList.size()); // System.out.println(ndList.size());
// String result = downloadHtml(url,0);
// System.out.println(result);
// } // }
} }
package com.zhiwei.media_data_crawler.crawler; package com.zhiwei.media_data_crawler.crawler;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
...@@ -12,9 +11,7 @@ import org.apache.logging.log4j.Logger; ...@@ -12,9 +11,7 @@ import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.media_data_crawler.entity.JianshuUser; import com.zhiwei.media_data_crawler.entity.JianshuUser;
...@@ -78,7 +75,6 @@ public class JianshuCrawler { ...@@ -78,7 +75,6 @@ public class JianshuCrawler {
} }
break; break;
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace();
logger.error("简书用户采集出错{}", e); logger.error("简书用户采集出错{}", e);
} }
...@@ -88,6 +84,5 @@ public class JianshuCrawler { ...@@ -88,6 +84,5 @@ public class JianshuCrawler {
} }
public static void main(String[] args) {}
} }
...@@ -214,7 +214,6 @@ public class SoNewsCrawlerParse { ...@@ -214,7 +214,6 @@ public class SoNewsCrawlerParse {
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
logger.error("360新闻数据解析时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("360新闻数据解析时出现问题,问题为:{}", e.fillInStackTrace());
continue;
} }
} }
resultMap.put("data", list); resultMap.put("data", list);
...@@ -236,8 +235,8 @@ public class SoNewsCrawlerParse { ...@@ -236,8 +235,8 @@ public class SoNewsCrawlerParse {
* @return Map<String,Object> 返回类型 * @return Map<String,Object> 返回类型
*/ */
private static Map<String, Object> analysisDataByTitle(String htmlBody, Proxy proxy, String word) throws Exception{ private static Map<String, Object> analysisDataByTitle(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String, Object> resultMap = new HashMap<String, Object>(); Map<String, Object> resultMap = new HashMap<>();
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<>();
boolean more = true; boolean more = true;
/** 解析页面 */ /** 解析页面 */
...@@ -269,9 +268,7 @@ public class SoNewsCrawlerParse { ...@@ -269,9 +268,7 @@ public class SoNewsCrawlerParse {
NewsData newsData = new NewsData(link, title, source, time, content, pt, word); NewsData newsData = new NewsData(link, title, source, time, content, pt, word);
list.add(newsData); list.add(newsData);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); logger.error("360新闻数据解析时出现问题,问题为:{}", e);
logger.error("360新闻数据解析时出现问题,问题为:{}", e.fillInStackTrace());
continue;
} }
} }
resultMap.put("data", list); resultMap.put("data", list);
......
...@@ -15,7 +15,6 @@ import com.zhiwei.crawler.core.HttpBoot; ...@@ -15,7 +15,6 @@ import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.media_data_crawler.entity.ZhihuAnswerComment; import com.zhiwei.media_data_crawler.entity.ZhihuAnswerComment;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response; import okhttp3.Response;
...@@ -45,10 +44,9 @@ public class ZhihuAnswerCommentParse { ...@@ -45,10 +44,9 @@ public class ZhihuAnswerCommentParse {
int count = -1; int count = -1;
for(int i = 1;i < 3;i++) { for(int i = 1;i < 3;i++) {
count = zacList.size(); count = zacList.size();
String nurl = "https://www.zhihu.com/api/v4/answers/"+id+"/root_comments?" + //https://www.zhihu.com/api/v4/answers/708507274/root_comments?order=normal&limit=20&offset=20&status=open
"include=data%5B*%5D.author%2Ccollapsed%2Creply_to_author%2Cdisliked%2" + String nurl = "https://www.zhihu.com/api/v4/answers/"+id+"/root_comments?order=norma" +
"Ccontent%2Cvoting%2Cvote_count%2Cis_parent_author%2Cis_author&order=norma" + "l&limit=20&offset=" + pages + "&status=open";
"l&limit=50&offset=" + pages + "&status=open";
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(nurl), proxy)){ try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(nurl), proxy)){
String result = response.body().string(); String result = response.body().string();
zacList.addAll(getData(result)); zacList.addAll(getData(result));
...@@ -92,23 +90,23 @@ public class ZhihuAnswerCommentParse { ...@@ -92,23 +90,23 @@ public class ZhihuAnswerCommentParse {
zac.setTime(new Date(createdTime * 1000L)); zac.setTime(new Date(createdTime * 1000L));
zac.setChildCommentCount(childCommentCount); zac.setChildCommentCount(childCommentCount);
dataList.add(zac); dataList.add(zac);
if (childCommentCount > 0) { // if (childCommentCount > 0) {
for (int g = 0; g < childCommentCount; g += 20) { // for (int g = 0; g < childCommentCount; g += 20) {
for(int n = 1;n < 5;n++) { // for(int n = 1;n < 5;n++) {
//避免太快,ip被封,导致数据无法获取 // //避免太快,ip被封,导致数据无法获取
ZhiWeiTools.sleep(200); // ZhiWeiTools.sleep(200);
String url2 = "https://www.zhihu.com/api/v4/comments/" + id + "/child_comments?include=%24%5B%2A%5D." + // String url2 = "https://www.zhihu.com/api/v4/comments/" + id + "/child_comments?include=%24%5B%2A%5D." +
"author%2Creply_to_author%2Ccontent%2Cvote_count&limit=" + // "author%2Creply_to_author%2Ccontent%2Cvote_count&limit=" +
"50&offset=" + g + "&include=%24%5B*%5D.author%2Creply_to_author%2Ccontent%2Cvote_count"; // "50&offset=" + g + "&include=%24%5B*%5D.author%2Creply_to_author%2Ccontent%2Cvote_count";
//获取回答中的回复列表 // //获取回答中的回复列表
List<ZhihuAnswerComment> replayList = getReplayList(url2,id); // List<ZhihuAnswerComment> replayList = getReplayList(url2,id);
if(!replayList.isEmpty()) { // if(!replayList.isEmpty()) {
dataList.addAll(replayList); // dataList.addAll(replayList);
break; // break;
} // }
} // }
} // }
} // }
} }
} }
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment