Commit daa0d81c by yangchen

贴吧采集修改

parent 4f5dfa32
...@@ -117,29 +117,12 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK { ...@@ -117,29 +117,12 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
String content = null; String content = null;
String author = null; String author = null;
String tid = null; String tid = null;
System.out.println("-------------------------");
// System.out.println(element.toString());
author = element.select("li.d_name").select("a").text(); author = element.select("li.d_name").select("a").text();
content = element.select("div.p_content_nameplate").select("cc").select("div.clearfix").text(); content = element.select("div.p_content_nameplate").select("cc").select("div.clearfix").text();
if(content == null ||content.length() < 1) { if(content == null ||content.length() < 1) {
content = element.select("div.j_d_post_content").text(); content = element.select("div.j_d_post_content").text();
} }
if(time == null || time.length() < 1) { time = getTime(element);
time = element.select("span.tail-info").text();
if(time.contains("楼")) {
time = time.split("楼")[1].trim();
}
}
if(time == null || time.trim().length() < 1) {
Pattern pa = Pattern.compile("date&quot;:&quot;(.*?)&quot");
Matcher ma = pa.matcher(element.toString());
while(ma.find()) {
time = ma.group(0);
time = time.split("date&quot;:&quot;")[1].split("&quot")[0];
break;
}
}
Pattern pa2 = Pattern.compile("post_id&quot(.*?),&quot"); Pattern pa2 = Pattern.compile("post_id&quot(.*?),&quot");
Matcher ma2 = pa2.matcher(element.toString()); Matcher ma2 = pa2.matcher(element.toString());
while(ma2.find()) { while(ma2.find()) {
...@@ -163,6 +146,33 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK { ...@@ -163,6 +146,33 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
} }
/**
 * Extracts the post time of a Baidu Tieba post from its element.
 *
 * <p>The text of {@code span.tail-info} is tried first; when it contains the
 * floor marker "楼", the segment following the first marker is used (trimmed).
 * If that yields nothing, the {@code date} value embedded in HTML-escaped form
 * in the element's markup is used instead.
 *
 * @param element the post element to inspect
 * @return the extracted time text; may be empty when neither source has a value
 */
private static String getTime(Element element) {
    String time = element.select("span.tail-info").text();
    if (time.contains("楼")) {
        String[] parts = time.split("楼");
        // split() drops trailing empty strings, so text that ends with the
        // floor marker (e.g. "2楼") has no index 1; fall through to the regex.
        time = parts.length > 1 ? parts[1].trim() : "";
    }
    if (time.trim().length() < 1) {
        Matcher ma = Pattern.compile("date&quot;:&quot;(.*?)&quot").matcher(element.toString());
        if (ma.find()) {
            // Read the captured value directly; re-splitting group(0) threw
            // ArrayIndexOutOfBoundsException when the captured date was empty.
            time = ma.group(1);
        }
    }
    return time;
}
/**
* @Title: downloadHtml * @Title: downloadHtml
* @author hero * @author hero
* @Description: 下載百度貼吧具体页面数据 * @Description: 下載百度貼吧具体页面数据
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment