Commit 8e2e2cc2 by yangchen

贴吧 增加 采集全文,百度知道增加 返回参数

parent d12dad92
...@@ -5,6 +5,7 @@ import java.util.ArrayList; ...@@ -5,6 +5,7 @@ import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
...@@ -29,6 +30,7 @@ import okhttp3.Response; ...@@ -29,6 +30,7 @@ import okhttp3.Response;
public class BaiduTiebaCrawlerParse { public class BaiduTiebaCrawlerParse {
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot();
private static Logger logger = LogManager.getLogger(BaiduTiebaCrawlerParse.class); private static Logger logger = LogManager.getLogger(BaiduTiebaCrawlerParse.class);
/** /**
* @Title: getBaiduTiebaData * @Title: getBaiduTiebaData
* @author hero * @author hero
...@@ -173,6 +175,16 @@ public class BaiduTiebaCrawlerParse { ...@@ -173,6 +175,16 @@ public class BaiduTiebaCrawlerParse {
return resultMap; return resultMap;
} }
/**
 * Parses a downloaded Tieba page and returns the text of the post content
 * located under the first child of {@code #j_p_postlist} that carries the
 * {@code d_post_content_firstfloor} class.
 *
 * @param result raw HTML of the page
 * @param url    address the HTML was downloaded from; not used by the extraction
 * @return the combined text of the elements matched by the selector
 */
private static String ganalysisData(String result,String url) {
    // CSS path from the post list down to the first-floor content container.
    final String firstFloorContentSelector = "#j_p_postlist > div:nth-child(1) > div.d_post_content_main.d_post_content_firstfloor > div.p_content > cc > div.j_d_post_content";
    Document page = Jsoup.parse(result);
    return page.select(firstFloorContentSelector).text();
}
/** /**
* *
* @Description 百度贴吧获取时间 * @Description 百度贴吧获取时间
...@@ -269,6 +281,7 @@ public class BaiduTiebaCrawlerParse { ...@@ -269,6 +281,7 @@ public class BaiduTiebaCrawlerParse {
}else { }else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY); response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
} }
return response.body().string();
} catch (Exception e) { } catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){ if(i==3){
...@@ -363,4 +376,27 @@ public class BaiduTiebaCrawlerParse { ...@@ -363,4 +376,27 @@ public class BaiduTiebaCrawlerParse {
} }
return url; return url;
} }
/**
 * Downloads a Tieba page and returns the text extracted from it by
 * {@code ganalysisData}.
 *
 * <p>Any query string is removed from {@code url} before the page is requested.
 *
 * @param url   page address; everything from the first {@code '?'} onwards is dropped
 * @param proxy currently unused: {@code downloadHtml} is called with {@code null}
 *              as its second argument
 * @return the extracted text, or {@code null} when {@code url} is {@code null},
 *         the download returns {@code null}, or an exception is raised while
 *         downloading or parsing
 */
public static String getTiebaData(String url,Proxy proxy) {
    if (url == null) {
        return null;
    }
    // Cut at the first '?' with indexOf/substring: split("\\?")[0] throws
    // ArrayIndexOutOfBoundsException for a url made only of '?' characters,
    // because split discards trailing empty strings.
    int queryStart = url.indexOf('?');
    String pageUrl = queryStart >= 0 ? url.substring(0, queryStart) : url;
    try {
        String htmlBody = downloadHtml(pageUrl, null);
        if (Objects.nonNull(htmlBody)) {
            return ganalysisData(htmlBody, pageUrl);
        }
    } catch (Exception e) {
        // Report through the class logger instead of printing to stderr.
        logger.error("获取贴吧数据时出现问题, url=" + pageUrl, e);
    }
    return null;
}
} }
...@@ -3,6 +3,7 @@ package com.zhiwei.media_data_crawler.crawler; ...@@ -3,6 +3,7 @@ package com.zhiwei.media_data_crawler.crawler;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
...@@ -18,11 +19,13 @@ import com.zhiwei.crawler.core.HttpBoot; ...@@ -18,11 +19,13 @@ import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.Response;
public class BaiduZhidaoCrawlerParse { public class BaiduZhidaoCrawlerParse {
private static Logger logger = LoggerFactory.getLogger(BaiduZhidaoCrawlerParse.class); private static Logger logger = LoggerFactory.getLogger(BaiduZhidaoCrawlerParse.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot(false,2);
public static List<Map<String,Object>> getData(String word,ProxyHolder proxy) { public static List<Map<String,Object>> getData(String word,ProxyHolder proxy) {
try { try {
...@@ -43,11 +46,17 @@ public class BaiduZhidaoCrawlerParse { ...@@ -43,11 +46,17 @@ public class BaiduZhidaoCrawlerParse {
String content = element.select("dd.answer").text(); String content = element.select("dd.answer").text();
String time = element.select("dd.dd.explain.f-light > span:nth-child(1)").text(); String time = element.select("dd.dd.explain.f-light > span:nth-child(1)").text();
String source = element.select("dd.dd.explain.f-light > span:nth-child(2) > a").text(); String source = element.select("dd.dd.explain.f-light > span:nth-child(2) > a").text();
String answerCount = element.select("dd.dd.explain.f-light > span:nth-child(3) > a").text();
String like = element.select("dd.dd.explain.f-light > span:nth-child(4)").text();
map.put("url", ur); map.put("url", ur);
map.put("title", title); map.put("title", title);
map.put("content", content); map.put("content", content);
map.put("time", time); map.put("time", time);
map.put("source", source); map.put("source", source);
map.put("answerCount", answerCount);
map.put("like", like);
map.put("word", word);
System.out.println(map.toString());
dataList.add(map); dataList.add(map);
} }
if(dataList.size() - count < 8) { if(dataList.size() - count < 8) {
...@@ -63,6 +72,18 @@ public class BaiduZhidaoCrawlerParse { ...@@ -63,6 +72,18 @@ public class BaiduZhidaoCrawlerParse {
} }
/**
 * Requests a Baidu Zhidao page and reads the question creation time from the
 * {@code createTime: '...'} fragment embedded in the response body.
 *
 * <p>The value between the quotes is parsed as a number of seconds and
 * converted to milliseconds for {@link Date}.
 *
 * @param url   page address to request
 * @param proxy proxy holder passed to {@code httpBoot.syncCall}
 * @return the creation time, or {@code null} when the marker is absent, the
 *         value is not a valid number, or the request fails
 */
public static Date getBaiduZhidaoTime(String url ,ProxyHolder proxy) {
    final String marker = "createTime: '";
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
        String result = response.body().string();
        int valueStart = result.indexOf(marker);
        if (valueStart < 0) {
            // Explicit check instead of relying on an ArrayIndexOutOfBoundsException.
            logger.error("百度知道问题时间获取失败, 页面中未找到createTime, url={}", url);
            return null;
        }
        valueStart += marker.length();
        int valueEnd = result.indexOf('\'', valueStart);
        String seconds = valueEnd < 0 ? result.substring(valueStart) : result.substring(valueStart, valueEnd);
        // Multiply instead of appending "000": an empty value now fails parsing
        // rather than silently yielding the epoch.
        return new Date(Long.parseLong(seconds) * 1000L);
    } catch (Exception e) {
        // The trailing Throwable is logged with its stack trace; the placeholder is filled by url.
        logger.error("百度知道问题时间获取失败, url={}", url, e);
    }
    return null;
}
// public static void main(String[] argss // public static void main(String[] argss
} }
...@@ -103,6 +103,7 @@ public class DoubanCrawlerParse { ...@@ -103,6 +103,7 @@ public class DoubanCrawlerParse {
}else { }else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY); response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
} }
return response.body().string();
} catch (Exception e) { } catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){ if(i==3){
...@@ -129,6 +130,7 @@ public class DoubanCrawlerParse { ...@@ -129,6 +130,7 @@ public class DoubanCrawlerParse {
}else { }else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY); response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
} }
return response.body().string();
} catch (Exception e) { } catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){ if(i==3){
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment