Commit 3c2a6baa by yangchen

添加 知乎用户 百度知道关键词采集

parent f19fd0ee
...@@ -59,7 +59,7 @@ public class BaiduTiebaCrawlerParse { ...@@ -59,7 +59,7 @@ public class BaiduTiebaCrawlerParse {
page++; page++;
if(DataCrawler.sleepTime==null){ if(DataCrawler.sleepTime==null){
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(3000);
} }
} }
return list; return list;
} }
......
package com.zhiwei.media_data_crawler.crawler;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
/**
 * Keyword search crawler for Baidu Zhidao ({@code zhidao.baidu.com/search}).
 *
 * <p>Result pages are requested one after another and every {@code dl} entry
 * found under {@code div.list} is turned into a map of plain strings.
 */
public class BaiduZhidaoCrawlerParse {

    private static final Logger logger = LoggerFactory.getLogger(BaiduZhidaoCrawlerParse.class);

    private static final HttpBoot httpBoot = new HttpBoot();

    /** Stride of the {@code pn} offset; matches the {@code rn=10} page size in the request URL. */
    private static final int PAGE_STEP = 10;

    /** A page that contributes fewer entries than this is treated as the last page. */
    private static final int MIN_ENTRIES_TO_CONTINUE = 8;

    /**
     * Searches Baidu Zhidao for {@code word} and collects the result entries of all pages.
     *
     * <p>Each returned map holds the keys {@code url} (link with its query string removed),
     * {@code title}, {@code content}, {@code time} and {@code source}, all as text taken
     * from the result page.
     *
     * @param word  search keyword; it is GBK-encoded before being placed in the URL
     * @param proxy proxy handed to the HTTP client for every request
     * @return the collected entries, or an empty list when any request or parse step fails
     */
    public static List<Map<String,Object>> getData(String word, ProxyHolder proxy) {
        try {
            List<Map<String,Object>> dataList = new ArrayList<>();
            // The encoded keyword does not change between pages, so encode it once.
            String encodedWord = URLEncoder.encode(word, "gbk");
            int offset = 0;
            while (true) {
                int before = dataList.size();
                String url = "https://zhidao.baidu.com/search?lm=0&rn=10&fr=search&ie=gbk&word="
                        + encodedWord + "&pn=" + offset;
                logger.debug(" url = {} ", url);
                String result = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy).body().string();
                Document doc = Jsoup.parse(result);
                Elements elements = doc.select("div.list").select("dl");
                for (Element element : elements) {
                    Map<String,Object> map = new HashMap<>();
                    map.put("url", stripQuery(element.select("a.ti").attr("href")));
                    map.put("title", element.select("a.ti").text());
                    map.put("content", element.select("dd.answer").text());
                    map.put("time", element.select("dd.dd.explain.f-light > span:nth-child(1)").text());
                    map.put("source", element.select("dd.dd.explain.f-light > span:nth-child(2) > a").text());
                    dataList.add(map);
                }
                // A short page means there is nothing more to fetch.
                if (dataList.size() - before < MIN_ENTRIES_TO_CONTINUE) {
                    break;
                }
                offset += PAGE_STEP;
            }
            return dataList;
        } catch (Exception e) {
            // The keyword fills the placeholder; the trailing throwable is logged with its stack trace.
            logger.error(" 采集错误 {} ", word, e);
        }
        return Collections.emptyList();
    }

    /**
     * Returns {@code href} without its query string. Unlike {@code split("\\?")[0]} this
     * cannot fail with an index error when the link consists of a lone {@code ?}.
     */
    private static String stripQuery(String href) {
        int queryStart = href.indexOf('?');
        return queryStart >= 0 ? href.substring(0, queryStart) : href;
    }
}
...@@ -146,7 +146,7 @@ public class TianYaCrawlerParse { ...@@ -146,7 +146,7 @@ public class TianYaCrawlerParse {
if(date.before(endDate)){ if(date.before(endDate)){
more = false; more = false;
}else{ }else{
System.out.println(luntanData); // System.out.println(luntanData);
list.add(luntanData); list.add(luntanData);
} }
} }
......
package com.zhiwei.media_data_crawler.crawler;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Objects;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.media_data_crawler.entity.ZhihuAnswerComment;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
/**
 * Collects the comments of a Zhihu answer through the {@code /api/v4} endpoints:
 * root comments page by page and, for every root comment that reports children,
 * its child comments as well.
 */
public class ZhihuAnswerCommentParse {

    // Was created for TianYaCrawlerParse.class, which attributed log lines to the wrong class.
    private static final Logger logger = LogManager.getLogger(ZhihuAnswerCommentParse.class);

    private static final HttpBoot httpBoot = new HttpBoot();

    /**
     * Page size used both as the {@code limit} query parameter and as the offset stride.
     * The two must agree: requesting more items than the stride makes consecutive
     * pages overlap and yields duplicate comments.
     */
    private static final int PAGE_SIZE = 20;

    /** Attempts made for one page of root comments before the page is considered empty. */
    private static final int ROOT_PAGE_ATTEMPTS = 2;

    /** Attempts made for one page of child comments before moving on. */
    private static final int CHILD_PAGE_ATTEMPTS = 4;

    /** Path segment that precedes the answer id in an answer link. */
    private static final String ANSWER_MARKER = "answer/";

    /**
     * Fetches all comments (root and child) of the answer addressed by {@code url}.
     *
     * <p>Paging stops as soon as a page adds nothing to the result after all attempts.
     *
     * @param url   answer link containing {@code answer/<id>}
     * @param proxy proxy used for the root-comment requests
     * @return the collected comments; an empty list when no answer id can be extracted
     */
    public static List<ZhihuAnswerComment> getAnswerData(String url, ProxyHolder proxy) {
        String id = getAnswerId(url);
        if (Objects.isNull(id)) {
            return Collections.emptyList();
        }
        List<ZhihuAnswerComment> zacList = new ArrayList<>();
        int offset = 0;
        boolean more = true;
        while (more) {
            int before = zacList.size();
            String nurl = "https://www.zhihu.com/api/v4/answers/" + id + "/root_comments?"
                    + "include=data%5B*%5D.author%2Ccollapsed%2Creply_to_author%2Cdisliked%2"
                    + "Ccontent%2Cvoting%2Cvote_count%2Cis_parent_author%2Cis_author&order=normal"
                    + "&limit=" + PAGE_SIZE + "&offset=" + offset + "&status=open";
            for (int attempt = 0; attempt < ROOT_PAGE_ATTEMPTS; attempt++) {
                try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(nurl), proxy)) {
                    String result = response.body().string();
                    zacList.addAll(getData(result));
                    if (zacList.size() != before) {
                        break;
                    }
                    // Only reached when the attempt produced no data.
                    logger.info(" url = {} 数据量 = {} 第 {} 页", url, zacList.size(), offset / PAGE_SIZE);
                } catch (Exception e) {
                    // The request URL fills the placeholder; the throwable is logged with its stack trace.
                    logger.error(" exception {} ", nurl, e);
                }
            }
            offset += PAGE_SIZE;
            more = zacList.size() != before;
        }
        return zacList;
    }

    /**
     * Parses one page of the root-comment response and appends the child comments
     * of every root comment whose {@code child_comment_count} is positive.
     *
     * @param result JSON text returned by the root-comment endpoint
     * @return the comments parsed so far; parsing stops at the first malformed entry
     */
    private static List<ZhihuAnswerComment> getData(String result) {
        List<ZhihuAnswerComment> dataList = new ArrayList<>();
        try {
            JSONObject root = JSONObject.parseObject(result);
            JSONArray entries = root.getJSONArray("data");
            if (Objects.nonNull(entries)) {
                for (int i = 0; i < entries.size(); i++) {
                    JSONObject data = entries.getJSONObject(i);
                    int childCommentCount = data.getInteger("child_comment_count");
                    JSONObject member = data.getJSONObject("author").getJSONObject("member");
                    String authorUrl = member.getString("url");
                    String name = member.getString("name");
                    String content = data.getString("content").replace("<p>", "");
                    String id = data.getString("id");
                    long createdTime = data.getLong("created_time");
                    int voteCount = data.getInteger("vote_count");

                    ZhihuAnswerComment zac = new ZhihuAnswerComment();
                    zac.setAttitudeCount(voteCount);
                    zac.setAuthor(name);
                    zac.setContent(content);
                    zac.setId(id);
                    zac.setAuthorUrl(authorUrl);
                    // created_time is multiplied by 1000 to obtain the millisecond value Date expects.
                    zac.setTime(new Date(createdTime * 1000L));
                    zac.setChildCommentCount(childCommentCount);
                    dataList.add(zac);

                    for (int childOffset = 0; childOffset < childCommentCount; childOffset += PAGE_SIZE) {
                        for (int attempt = 0; attempt < CHILD_PAGE_ATTEMPTS; attempt++) {
                            // Throttle requests so the IP does not get blocked and data stays retrievable.
                            ZhiWeiTools.sleep(200);
                            String childUrl = "https://www.zhihu.com/api/v4/comments/" + id
                                    + "/child_comments?include=%24%5B%2A%5D."
                                    + "author%2Creply_to_author%2Ccontent%2Cvote_count&limit=" + PAGE_SIZE
                                    + "&offset=" + childOffset
                                    + "&include=%24%5B*%5D.author%2Creply_to_author%2Ccontent%2Cvote_count";
                            // Fetch the replies attached to this root comment.
                            List<ZhihuAnswerComment> replayList = getReplayList(childUrl, id);
                            if (!replayList.isEmpty()) {
                                dataList.addAll(replayList);
                                break;
                            }
                        }
                    }
                }
            }
        } catch (Exception e) {
            logger.error(" 解析出错 {} ", e);
        }
        return dataList;
    }

    /**
     * Requests one page of child comments and converts each entry to a comment
     * whose root id is {@code strRootID}.
     *
     * @param url       child-comment endpoint URL including paging parameters
     * @param strRootID id of the root comment the replies belong to
     * @return the parsed replies, or an empty list when the request or parsing fails
     */
    private static List<ZhihuAnswerComment> getReplayList(String url, String strRootID) {
        List<ZhihuAnswerComment> dataList = new ArrayList<>();
        try {
            // NOTE(review): child comments always go through the NAT proxy, not the proxy
            // the caller passed to getAnswerData - confirm this is intended.
            String result = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyFactory.getNatProxy()).body().string();
            if (result != null) {
                JSONObject dataJson = JSONObject.parseObject(result);
                JSONArray dataArray = dataJson.getJSONArray("data");
                if (Objects.nonNull(dataArray)) {
                    for (int j = 0; j < dataArray.size(); j++) {
                        JSONObject reply = dataArray.getJSONObject(j);
                        JSONObject member = reply.getJSONObject("author").getJSONObject("member");
                        String authorUrl = member.getString("url");
                        String name = member.getString("name");
                        String content = reply.getString("content").replace("<p>", "");
                        Long createdTime = reply.getLong("created_time");
                        int voteCount = reply.getInteger("vote_count");
                        String id = reply.getString("id");

                        ZhihuAnswerComment zac = new ZhihuAnswerComment();
                        zac.setAttitudeCount(voteCount);
                        zac.setAuthor(name);
                        zac.setContent(content);
                        zac.setId(id);
                        zac.setRootId(strRootID);
                        zac.setAuthorUrl(authorUrl);
                        zac.setTime(new Date(createdTime * 1000L));
                        dataList.add(zac);
                    }
                }
            }
            return dataList;
        } catch (Exception e) {
            // The request URL fills the placeholder; the throwable is logged with its stack trace.
            logger.error(" 知乎回复解析出错 {} ", url, e);
        }
        return Collections.emptyList();
    }

    /**
     * Extracts the numeric answer id that follows {@code answer/} in {@code url}.
     *
     * <p>Only the leading run of ASCII digits is returned, so a trailing query string
     * or path segment does not leak into the API URL built from the id.
     *
     * @param url answer link, may be {@code null}
     * @return the answer id, or {@code null} when the link carries none
     */
    private static String getAnswerId(String url) {
        if (url == null) {
            logger.error(" 知乎链接id获取出错 {} ", url);
            return null;
        }
        int markerAt = url.indexOf(ANSWER_MARKER);
        if (markerAt < 0) {
            return null;
        }
        int start = markerAt + ANSWER_MARKER.length();
        int end = start;
        while (end < url.length() && url.charAt(end) >= '0' && url.charAt(end) <= '9') {
            end++;
        }
        if (end == start) {
            logger.error(" 知乎链接id获取出错 {} ", url);
            return null;
        }
        return url.substring(start, end);
    }
}
...@@ -142,11 +142,11 @@ public class ZhihuCrawlerParse { ...@@ -142,11 +142,11 @@ public class ZhihuCrawlerParse {
return null; return null;
} }
public static void main(String[] args) { // public static void main(String[] args) {
String url = "https://zhuanlan.zhihu.com/p/31577152"; // String url = "https://zhuanlan.zhihu.com/p/31577152";
ZhiHuData zqd = ZhihuCrawlerParse.getUrlData(url, null); // ZhiHuData zqd = ZhihuCrawlerParse.getUrlData(url, null);
System.out.println(zqd.toString()); // System.out.println(zqd.toString());
} // }
/** /**
* *
......
package com.zhiwei.media_data_crawler.data; package com.zhiwei.media_data_crawler.data;
import java.net.Proxy; import java.net.Proxy;
import java.util.Collections;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.media_data_crawler.crawler.*; import com.zhiwei.media_data_crawler.crawler.*;
import com.zhiwei.media_data_crawler.entity.*; import com.zhiwei.media_data_crawler.entity.*;
...@@ -38,7 +40,30 @@ public class DataCrawler { ...@@ -38,7 +40,30 @@ public class DataCrawler {
return null; return null;
} }
} }
/**
 * Collects Baidu Zhidao search results for a keyword by delegating to
 * {@link BaiduZhidaoCrawlerParse#getData(String, ProxyHolder)}.
 *
 * @param word  keyword to search for
 * @param proxy proxy passed through to the crawler
 * @return the crawler's result, or an empty list if the crawler throws
 */
public static List<Map<String,Object>> getBaiduZhidaoData(String word, ProxyHolder proxy) {
    List<Map<String,Object>> results;
    try {
        results = BaiduZhidaoCrawlerParse.getData(word, proxy);
    } catch (Exception e) {
        // Any failure is reported to the caller as "no data".
        results = Collections.emptyList();
    }
    return results;
}
/** /**
* *
* @Title: getBaiduNewsData * @Title: getBaiduNewsData
...@@ -212,8 +237,7 @@ public class DataCrawler { ...@@ -212,8 +237,7 @@ public class DataCrawler {
try { try {
return BaiduTiebaCrawlerParse.getBaiduTiebaData(word, proxy, null); return BaiduTiebaCrawlerParse.getBaiduTiebaData(word, proxy, null);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); return Collections.emptyList();
return null;
} }
} }
...@@ -283,8 +307,7 @@ public class DataCrawler { ...@@ -283,8 +307,7 @@ public class DataCrawler {
try { try {
return TianYaCrawlerParse.getLunTanData(word, proxy, endTime); return TianYaCrawlerParse.getLunTanData(word, proxy, endTime);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); return Collections.emptyList();
return null;
} }
} }
......
...@@ -29,7 +29,6 @@ public class ZhihuAnswer implements Serializable { ...@@ -29,7 +29,6 @@ public class ZhihuAnswer implements Serializable {
private Integer bord_count; //评论数 private Integer bord_count; //评论数
public ZhihuAnswer(){} public ZhihuAnswer(){}
public ZhihuAnswer(String url, String from_url,String title, Date time, String author, public ZhihuAnswer(String url, String from_url,String title, Date time, String author,
......
package com.zhiwei.media_data_crawler.entity;
import java.util.Date;
/**
 * Plain mutable bean describing one comment on a Zhihu answer.
 * All properties start at their Java defaults ({@code null} / {@code 0}).
 */
public class ZhihuAnswerComment {

    /** Comment id. */
    private String id;

    /** Id of the root comment; left {@code null} unless set explicitly. */
    private String rootId;

    /** Display name of the comment author. */
    private String author;

    /** Link to the author's profile. */
    private String authorUrl;

    /** Comment text. */
    private String content;

    /** Time associated with the comment. */
    private Date time;

    /** Attitude (vote) counter. */
    private int attitudeCount;

    /** Number of child comments. */
    private int childCommentCount;

    /** @return the comment id */
    public String getId() {
        return this.id;
    }

    /** @param id the comment id */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the root comment id */
    public String getRootId() {
        return this.rootId;
    }

    /** @param rootId the root comment id */
    public void setRootId(String rootId) {
        this.rootId = rootId;
    }

    /** @return the author name */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the author name */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the author profile link */
    public String getAuthorUrl() {
        return this.authorUrl;
    }

    /** @param authorUrl the author profile link */
    public void setAuthorUrl(String authorUrl) {
        this.authorUrl = authorUrl;
    }

    /** @return the comment text */
    public String getContent() {
        return this.content;
    }

    /** @param content the comment text */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the stored time instance (not copied) */
    public Date getTime() {
        return this.time;
    }

    /** @param time the time instance to store (not copied) */
    public void setTime(Date time) {
        this.time = time;
    }

    /** @return the attitude counter */
    public int getAttitudeCount() {
        return this.attitudeCount;
    }

    /** @param attitudeCount the attitude counter */
    public void setAttitudeCount(int attitudeCount) {
        this.attitudeCount = attitudeCount;
    }

    /** @return the number of child comments */
    public int getChildCommentCount() {
        return this.childCommentCount;
    }

    /** @param childCommentCount the number of child comments */
    public void setChildCommentCount(int childCommentCount) {
        this.childCommentCount = childCommentCount;
    }
}
...@@ -7,6 +7,16 @@ public class ZhihuQuestionData { ...@@ -7,6 +7,16 @@ public class ZhihuQuestionData {
private String time; private String time;
private String url; private String url;
private String authorUrl;
/** @return the stored author link, or {@code null} if none was set */
public String getAuthorUrl() {
    return this.authorUrl;
}
/** @param authorUrl the author link to store */
public void setAuthorUrl(String authorUrl) {
    // Parameter shadows the field, so the assignment must be qualified.
    this.authorUrl = authorUrl;
}
public String getTitle() { public String getTitle() {
return title; return title;
...@@ -35,7 +45,7 @@ public class ZhihuQuestionData { ...@@ -35,7 +45,7 @@ public class ZhihuQuestionData {
@Override @Override
public String toString() { public String toString() {
return "ZhihuQuestionData [title=" + title + ", time=" + time + ", url=" return "ZhihuQuestionData [title=" + title + ", time=" + time + ", url="
+ url + "]"; + url + ", authorUrl=" + authorUrl + "]";
} }
public ZhihuQuestionData(String title, String time, String url) { public ZhihuQuestionData(String title, String time, String url) {
...@@ -45,6 +55,15 @@ public class ZhihuQuestionData { ...@@ -45,6 +55,15 @@ public class ZhihuQuestionData {
this.url = url; this.url = url;
} }
/**
 * Creates a question record with all four properties set.
 *
 * @param title     question title
 * @param time      time value, kept as text
 * @param url       question link
 * @param authorUrl author link
 */
public ZhihuQuestionData(String title, String time, String url, String authorUrl) {
    // The superclass no-arg constructor is invoked implicitly.
    this.authorUrl = authorUrl;
    this.url = url;
    this.time = time;
    this.title = title;
}
public ZhihuQuestionData() { public ZhihuQuestionData() {
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment