Commit 8e2e2cc2 by yangchen

贴吧 增加 采集全文,百度知道增加 返回参数

parent d12dad92
......@@ -5,6 +5,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
......@@ -29,6 +30,7 @@ import okhttp3.Response;
public class BaiduTiebaCrawlerParse {
private static HttpBoot httpBoot = new HttpBoot();
private static Logger logger = LogManager.getLogger(BaiduTiebaCrawlerParse.class);
/**
* @Title: getBaiduTiebaData
* @author hero
......@@ -173,6 +175,16 @@ public class BaiduTiebaCrawlerParse {
return resultMap;
}
/**
 * Extracts the first-floor post content from a Tieba thread page.
 *
 * @param result raw HTML of the thread page
 * @param url    address of the page; not read by this method
 * @return the text of the elements matched by the first-floor content selector
 */
private static String ganalysisData(String result,String url) {
    // CSS path to the content container of the first entry in the post list
    final String firstFloorSelector = "#j_p_postlist > div:nth-child(1) > div.d_post_content_main.d_post_content_firstfloor > div.p_content > cc > div.j_d_post_content";
    // Parse the page, then pull the text out of the matched nodes
    Document page = Jsoup.parse(result);
    return page.select(firstFloorSelector).text();
}
/**
*
* @Description 百度贴吧获取时间
......@@ -269,6 +281,7 @@ public class BaiduTiebaCrawlerParse {
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
}
return response.body().string();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
......@@ -363,4 +376,27 @@ public class BaiduTiebaCrawlerParse {
}
return url;
}
/**
 * Downloads a Tieba thread page and returns the content extracted by
 * {@code ganalysisData}.
 *
 * <p>Any query string is removed from {@code url} before the page is requested.
 *
 * @param url   thread address; everything from the first {@code ?} onwards is dropped
 * @param proxy currently not used: the page is fetched with {@code downloadHtml(url, null)}
 * @return the extracted text, or {@code null} when {@code url} is {@code null}, the
 *         download returns {@code null}, or an exception is raised while fetching/parsing
 */
public static String getTiebaData(String url,Proxy proxy) {
    if (url == null) {
        return null;
    }
    // substring rather than split("\\?")[0]: split returns an empty array for a URL made up
    // solely of '?' characters, which made the index access throw outside the try block.
    int queryStart = url.indexOf('?');
    String pageUrl = queryStart >= 0 ? url.substring(0, queryStart) : url;
    try {
        String htmlBody = downloadHtml(pageUrl, null);
        if (htmlBody != null) {
            return ganalysisData(htmlBody, pageUrl);
        }
    } catch (Exception e) {
        // Route the failure through the class logger instead of printing to stderr
        logger.error("贴吧全文采集失败, url: " + pageUrl, e);
    }
    return null;
}
}
......@@ -3,6 +3,7 @@ package com.zhiwei.media_data_crawler.crawler;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
......@@ -18,11 +19,13 @@ import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.Response;
public class BaiduZhidaoCrawlerParse {
private static Logger logger = LoggerFactory.getLogger(BaiduZhidaoCrawlerParse.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot(false,2);
public static List<Map<String,Object>> getData(String word,ProxyHolder proxy) {
try {
......@@ -43,11 +46,17 @@ public class BaiduZhidaoCrawlerParse {
String content = element.select("dd.answer").text();
String time = element.select("dd.dd.explain.f-light > span:nth-child(1)").text();
String source = element.select("dd.dd.explain.f-light > span:nth-child(2) > a").text();
String answerCount = element.select("dd.dd.explain.f-light > span:nth-child(3) > a").text();
String like = element.select("dd.dd.explain.f-light > span:nth-child(4)").text();
map.put("url", ur);
map.put("title", title);
map.put("content", content);
map.put("time", time);
map.put("source", source);
map.put("answerCount", answerCount);
map.put("like", like);
map.put("word", word);
System.out.println(map.toString());
dataList.add(map);
}
if(dataList.size() - count < 8) {
......@@ -63,6 +72,18 @@ public class BaiduZhidaoCrawlerParse {
}
/**
 * Fetches a Baidu Zhidao question page and reads its creation time.
 *
 * <p>The response body is searched for the marker {@code createTime: '}; the text up to the
 * next single quote is parsed as a whole number and multiplied by 1000 to obtain the
 * millisecond value passed to {@link Date}.
 *
 * @param url   address of the question page
 * @param proxy proxy holder handed to the HTTP call
 * @return the creation time, or {@code null} when the request fails, the marker is not
 *         present, or the value is empty / not a valid number
 */
public static Date getBaiduZhidaoTime(String url ,ProxyHolder proxy) {
    final String marker = "createTime: '";
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
        String result = response.body().string();
        int valueStart = result.indexOf(marker);
        // Check for the marker explicitly instead of relying on an index-out-of-bounds error
        int valueEnd = valueStart < 0 ? -1 : result.indexOf('\'', valueStart + marker.length());
        if (valueEnd < 0) {
            logger.error("百度知道问题时间获取失败, 页面中未找到 createTime, url: {}", url);
            return null;
        }
        String seconds = result.substring(valueStart + marker.length(), valueEnd);
        // An empty value now fails parsing (-> null) instead of silently becoming the epoch;
        // multiplyExact keeps an absurdly large value from wrapping around.
        return new Date(Math.multiplyExact(Long.parseLong(seconds), 1000L));
    } catch (Exception e) {
        // Pass the URL for the placeholder and the exception last so the stack trace is logged
        logger.error("百度知道问题时间获取失败, url: {}", url, e);
    }
    return null;
}
// public static void main(String[] argss
}
......@@ -103,6 +103,7 @@ public class DoubanCrawlerParse {
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
}
return response.body().string();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
......@@ -129,6 +130,7 @@ public class DoubanCrawlerParse {
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
}
return response.body().string();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment