Commit 4f5dfa32 by yangchen

百度贴吧 全部回答采集添加

parent 88d622c3
...@@ -71,6 +71,36 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -71,6 +71,36 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
/**
 * Downloads the first Baidu News full-text result page ({@code tn=newsdy}, page 1)
 * for a keyword and extracts the reported number of matching articles.
 *
 * <p>The count is the text between the marker "找到相关新闻" and the next "篇";
 * thousands separators (",") and the "约" prefix are removed before parsing.
 *
 * @param word      search keyword
 * @param startTime start of the time window, forwarded unchanged to the download helper
 * @param endTime   end of the time window, forwarded unchanged to the download helper
 * @param proxy     proxy forwarded unchanged to the download helper
 * @param cookie    cookie string forwarded unchanged to the download helper
 * @return the parsed article count, or {@code -1} when the page cannot be downloaded
 *         or the count cannot be located or parsed
 * @throws Exception never thrown by this implementation; kept in the signature so
 *                   existing callers keep compiling
 */
public static int getBaiduNewsCount(String word, String startTime, String endTime, Proxy proxy,String cookie) throws Exception {
    final String countPrefix = "找到相关新闻";
    final String countSuffix = "篇";

    String html;
    try {
        html = downloadHtml(word, startTime, endTime, proxy, "newsdy", 1,cookie);
    } catch (Exception e) {
        // Deliberate best-effort contract: any download failure is reported as -1.
        return -1;
    }
    if (html == null) {
        return -1;
    }

    // Locate the marker explicitly instead of relying on an
    // ArrayIndexOutOfBoundsException from split() when it is missing.
    int prefixAt = html.indexOf(countPrefix);
    if (prefixAt < 0) {
        return -1;
    }
    String tail = html.substring(prefixAt + countPrefix.length());
    int suffixAt = tail.indexOf(countSuffix);
    String rawCount = suffixAt < 0 ? tail : tail.substring(0, suffixAt);

    // trim() tolerates whitespace around the number, which previously made parsing fail.
    String digits = rawCount.replace(",", "").replace("约", "").trim();
    try {
        return Integer.parseInt(digits);
    } catch (NumberFormatException e) {
        return -1;
    }
}
/**
* @Title: getBaiduNewsData
* @author hero
* @Description: 根据关键词获取数据 * @Description: 根据关键词获取数据
* @param @param word * @param @param word
* @param @param startTime * @param @param startTime
...@@ -153,6 +183,52 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -153,6 +183,52 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
* @return String 返回类型 * @return String 返回类型
*/ */
private static String downloadHtml(String word, String startTime, String endTime, Proxy proxy, String tn, private static String downloadHtml(String word, String startTime, String endTime, Proxy proxy, String tn,
int page,String cookie) throws Exception{
// 获取通用请求头
Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址
String url = getUrl(word, startTime, endTime, tn, page);
System.out.println(url);
headerMap.put("Host", "news.baidu.com");
headerMap.put("cookie",cookie);
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
return get(url, proxy, headerMap);
} catch (IOException e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
}else{
continue;
}
}
}
return null;
}
/**
* @Title: downloadHtml
* @author hero
* @Description: 获取数据流
* @param @param
* word
* @param @param
* startTime
* @param @param
* endTime
* @param @param
* proxy
* @param @param
* tn (tn=newsdy 为 全文匹配, tn=newstitle 为标题匹配)
* @param @param
* page
* @param @return
* 设定文件
* @return String 返回类型
*/
private static String downloadHtml(String word, String startTime, String endTime, Proxy proxy, String tn,
int page) throws Exception{ int page) throws Exception{
// 获取通用请求头 // 获取通用请求头
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
......
...@@ -6,6 +6,9 @@ import java.util.ArrayList; ...@@ -6,6 +6,9 @@ import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
...@@ -57,6 +60,141 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK { ...@@ -57,6 +60,141 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
return list; return list;
} }
/**
 * Collects the replies of one Tieba thread by downloading its pages in order
 * ({@code ?pn=1}, {@code ?pn=2}, ...) and parsing each page.
 *
 * <p>Paging stops when a page cannot be downloaded, when the parsed page reports
 * that no further page exists, or after 50 pages.
 *
 * @param url   thread URL such as {@code http://tieba.baidu.com/p/5331709274};
 *              any query string is ignored
 * @param proxy proxy forwarded unchanged to the download helper
 * @return all replies parsed from the visited pages, in page order; never {@code null}
 * @throws IllegalArgumentException if {@code url} is null or has no thread id as
 *                                  its fifth '/'-separated segment
 * @throws Exception                if downloading a page fails
 */
public static List<TiebaData> getBaiduTiebaAnswerDataByUrl(String url, Proxy proxy) throws Exception {
    // Hard upper bound on pages per thread. The previous loop set more=false when
    // the bound was exceeded but then overwrote it with the parser's flag in the
    // same iteration, so the bound never took effect.
    final int maxPages = 50;

    if (url == null) {
        throw new IllegalArgumentException("url must not be null");
    }
    int queryAt = url.indexOf('?');
    String threadUrl = queryAt >= 0 ? url.substring(0, queryAt) : url;

    // "http://host/p/<id>" splits into ["http:", "", host, "p", <id>].
    String[] segments = threadUrl.split("/");
    if (segments.length < 5) {
        throw new IllegalArgumentException("Not a Tieba thread URL, expected http://tieba.baidu.com/p/<id>: " + url);
    }
    String aid = segments[4];

    List<TiebaData> list = new ArrayList<TiebaData>();
    for (int page = 1; page <= maxPages; page++) {
        String htmlBody = downloadHtml(threadUrl + "?pn=" + page, proxy);
        if (htmlBody == null) {
            break;
        }
        Map<String, Object> dataMap = analysisDataAnswer(htmlBody, aid);
        // The parser stores a List<TiebaData> under "data"; the map is untyped.
        @SuppressWarnings("unchecked")
        List<TiebaData> dataList = (List<TiebaData>) dataMap.get("data");
        list.addAll(dataList);

        boolean more = Boolean.TRUE.equals(dataMap.get("more"));
        if (!more || page == maxPages) {
            break;
        }
        // Pause between requests only when another page will actually be fetched.
        ZhiWeiTools.sleep(3000);
    }
    return list;
}
/**
 * Parses one page of a Tieba thread into reply records.
 *
 * <p>Each direct child {@code div} of {@code div.p_postlist} is treated as one post.
 * A post is kept only when a time string longer than one character was found for it.
 *
 * @param htmlBody raw HTML of the thread page
 * @param aid      thread id; used to build each record's URL and passed to {@code TiebaData}
 * @return a map with two entries: {@code "data"} (a {@code List<TiebaData>} of the
 *         parsed replies) and {@code "more"} (a {@code Boolean} that is true only
 *         when the pager text contains "下一页" and at least one post element was found)
 */
private static Map<String, Object> analysisDataAnswer(String htmlBody,
        String aid) {
    Document document = Jsoup.parse(htmlBody);
    Elements posts = document.select("div.p_postlist > div");

    boolean hasNextLink = document.select("li.l_pager.pager_theme_4.pb_list_pager").text().contains("下一页");
    boolean more = hasNextLink && posts.size() > 0;

    // Thread title, with a fallback selector for the alternative page layout.
    String title = document.select("div.core_title.core_title_theme_bright > h1").text();
    if (title == null || title.length() < 1) {
        title = document.select("#j_core_title_wrap > h3").text();
    }

    // Compiled once per page instead of once per post.
    Pattern datePattern = Pattern.compile("date&quot;:&quot;(.*?)&quot");
    Pattern postIdPattern = Pattern.compile("post_id&quot(.*?),&quot");

    List<TiebaData> list = new ArrayList<TiebaData>();
    for (Element element : posts) {
        String author = element.select("li.d_name").select("a").text();

        String content = element.select("div.p_content_nameplate").select("cc").select("div.clearfix").text();
        if (content == null || content.length() < 1) {
            content = element.select("div.j_d_post_content").text();
        }

        // Serialized once; both regexes below scan the element's escaped attribute text.
        String elementHtml = element.toString();

        // Primary source for the time: the text after the floor marker "楼".
        String time = element.select("span.tail-info").text();
        if (time.contains("楼")) {
            // limit -1 keeps a trailing empty piece, so text ending in "楼" yields ""
            // instead of throwing ArrayIndexOutOfBoundsException and aborting the page.
            time = time.split("楼", -1)[1].trim();
        }
        // Fallback: the date embedded in the post's escaped JSON attribute.
        if (time.trim().length() < 1) {
            Matcher dateMatcher = datePattern.matcher(elementHtml);
            if (dateMatcher.find()) {
                time = dateMatcher.group(1);
            }
        }

        // Post id: the pattern captures ";:<id>" (tail of the escaped quote plus the colon).
        String tid = null;
        Matcher idMatcher = postIdPattern.matcher(elementHtml);
        if (idMatcher.find()) {
            String captured = idMatcher.group(1);
            if (captured.startsWith(";:")) {
                tid = captured.substring(2);
            }
        }

        if (time != null && time.length() > 1) {
            list.add(new TiebaData("http://tieba.baidu.com/p/"+aid, title, time, tid, null, author, content, aid));
        }
    }

    Map<String, Object> resultMap = new HashMap<String, Object>();
    resultMap.put("data", list);
    resultMap.put("more", more);
    return resultMap;
}
/**
 * Downloads a single Tieba page, making up to three attempts when the request
 * fails with an {@link IOException}.
 *
 * @param url   URL of the page to fetch
 * @param proxy proxy forwarded unchanged to {@code get(...)}
 * @return whatever {@code get(url, proxy, headers)} returns for the page
 * @throws Exception the {@link IOException} of the third failed attempt, or any
 *                   other exception raised by {@code get(...)}
 */
private static String downloadHtml(String url, Proxy proxy) throws Exception{
    final int maxAttempts = 3;

    // Common request headers plus the Tieba host.
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    headerMap.put("Host", "tieba.baidu.com");

    IOException lastFailure = null;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            return get(url, proxy, headerMap);
        } catch (IOException e) {
            // Pass the exception itself as the trailing argument. The previous
            // e.fillInStackTrace() call overwrote the original stack trace with
            // this catch site, hiding where the failure actually happened.
            logger.error("获取数据时出现问题,问题为:{}", e.getMessage(), e);
            lastFailure = e;
        }
    }
    // Reached only after every attempt failed; rethrow the last failure unchanged.
    throw lastFailure;
}
/** /**
* @Title: downloadHtml * @Title: downloadHtml
...@@ -168,11 +306,11 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK { ...@@ -168,11 +306,11 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
String url = null; String url = null;
if (word != null) { if (word != null) {
if(tiebaName!=null){ if(tiebaName!=null){
url = "http://tieba.baidu.com/f/search/res?isnew=1&kw="+URLCodeUtil.getURLEncode(word, "utf-8")+"&qw="+ url = "http://tieba.baidu.com/f/search/res?isnew=1&kw="+URLCodeUtil.getURLEncode(word, "GBK")+"&qw="+
URLCodeUtil.getURLEncode(word, "utf-8")+"&rn=30&un=&only_thread=0&sm=1&sd=&ed=&pn="+page; URLCodeUtil.getURLEncode(word, "GBK")+"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
}else{ }else{
url = "http://tieba.baidu.com/f/search/res?isnew=1&kw=&qw="+ url = "http://tieba.baidu.com/f/search/res?isnew=1&kw=&qw="+
URLCodeUtil.getURLEncode(word, "utf-8")+"&rn=30&un=&only_thread=0&sm=1&sd=&ed=&pn="+page; URLCodeUtil.getURLEncode(word, "GBK")+"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
} }
} }
System.out.println(url); System.out.println(url);
......
...@@ -225,7 +225,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -225,7 +225,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
//添加到数据集合中 //添加到数据集合中
if(title != null && !title.equals("") && source!=null && time!=null){ if(title != null && !title.equals("") && source!=null && time!=null){
NewsData newsData = new NewsData(link, title, source, time, content, pt, word); NewsData newsData = new NewsData(link, title, source, time, content, pt, word);
logger.info("搜狗新闻数据:{}", newsData); // logger.info("搜狗新闻数据:{}", newsData);
list.add(newsData); list.add(newsData);
} }
if(!type.equals("other")){ if(!type.equals("other")){
......
...@@ -43,6 +43,27 @@ public class DataCrawler { ...@@ -43,6 +43,27 @@ public class DataCrawler {
/**
 * Returns the number of Baidu News articles reported for a keyword and time window.
 *
 * <p>Delegates to {@code BaiduNewsCrawlerParse.getBaiduNewsCount}. If that call
 * throws, the stack trace is printed and {@code -1} is returned.
 *
 * @param word      search keyword
 * @param startTime start of the time window, forwarded unchanged
 * @param endTime   end of the time window, forwarded unchanged
 * @param proxy     proxy forwarded unchanged
 * @param cookie    cookie string forwarded unchanged
 * @return the count returned by the parser, or {@code -1} on any exception
 */
public static int getBaiduNewsCount(String word, String startTime, String endTime, Proxy proxy,String cookie){
    int count;
    try {
        count = BaiduNewsCrawlerParse.getBaiduNewsCount(word, startTime, endTime, proxy,cookie);
    } catch (Exception e) {
        e.printStackTrace();
        count = -1;
    }
    return count;
}
/**
*
* @Title: getBaiduNewsDataByTitle * @Title: getBaiduNewsDataByTitle
* @author hero * @author hero
* @Description: 根据关键词和时间,标题匹配百度新闻数据 * @Description: 根据关键词和时间,标题匹配百度新闻数据
...@@ -179,6 +200,24 @@ public class DataCrawler { ...@@ -179,6 +200,24 @@ public class DataCrawler {
/**
 * Collects the replies of the Tieba thread at the given URL.
 *
 * <p>Delegates to {@code BaiduTiebaCrawlerParse.getBaiduTiebaAnswerDataByUrl}. If that
 * call throws, the stack trace is printed and {@code null} is returned.
 *
 * @param url   URL of the Tieba thread, forwarded unchanged
 * @param proxy proxy forwarded unchanged
 * @return the replies returned by the parser, or {@code null} on any exception
 */
public static List<TiebaData> getBaiduTiebaAnswserDataByUrl(String url, Proxy proxy){
    List<TiebaData> answers;
    try {
        answers = BaiduTiebaCrawlerParse.getBaiduTiebaAnswerDataByUrl(url, proxy);
    } catch (Exception e) {
        e.printStackTrace();
        answers = null;
    }
    return answers;
}
/**
* @Title: getBaiduTiebaData
* @author hero
* @Description: 根据关键词采集指定贴吧内数据 * @Description: 根据关键词采集指定贴吧内数据
* @param @param word * @param @param word
* @param @param proxy * @param @param proxy
......
package com.zhiwei.media_data_crawler.test; //package com.zhiwei.media_data_crawler.test;
//
import java.net.Proxy; //import java.net.Proxy;
import java.util.List; //import java.util.List;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.media_data_crawler.crawler.BaiduTiebaCrawlerParse; //import com.zhiwei.media_data_crawler.crawler.BaiduTiebaCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse; //import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
import com.zhiwei.media_data_crawler.data.DataCrawler; //import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.DouBanData; //import com.zhiwei.media_data_crawler.entity.DouBanData;
import com.zhiwei.media_data_crawler.entity.LunTanData; //import com.zhiwei.media_data_crawler.entity.LunTanData;
import com.zhiwei.media_data_crawler.entity.NewsData; //import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.media_data_crawler.entity.TiebaData; //import com.zhiwei.media_data_crawler.entity.TiebaData;
import com.zhiwei.media_data_crawler.entity.ZhiHuData; //import com.zhiwei.media_data_crawler.entity.ZhiHuData;
//
public class DataCrawlerTest { //public class DataCrawlerTest {
//
//
//
//
//
@Test // @Test
public void getSoNewsTest(){ // public void getSoNewsTest(){
String word = "马云"; //关键词 // String word = "马云"; //关键词
String startTime = "2017-03-01 00:00:00"; //开始时间 // String startTime = "2017-03-01 00:00:00"; //开始时间
String endTime = "2017-03-01 23:59:59"; //结束时间 // String endTime = "2017-03-01 23:59:59"; //结束时间
Proxy proxy = null; //代理IP,不用可不填写 // Proxy proxy = null; //代理IP,不用可不填写
try { // try {
// //百度新闻采集demo //// //百度新闻采集demo
// List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy); //// List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
// //搜狗新闻关键词采集demo //// //搜狗新闻关键词采集demo
// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy); //// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
// //360新闻采集demo //// //360新闻采集demo
// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy); //// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
// //搜狗知乎采集 //// //搜狗知乎采集
// List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy); //// List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
// System.out.println(zhihuList.size()); //// System.out.println(zhihuList.size());
// //Baidu貼吧採集 //// //Baidu貼吧採集
// String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null //// String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null
// List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName); //// System.out.println("------------------------");
// //天涯论坛采集 //// String url = "http://tieba.baidu.com/p/5331709274";
// List<LunTanData> list = DataCrawler.getLunTanData(word, proxy, endTime); //// List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaAnswserDataByUrl(url, proxy);
//豆瓣采集 //// for(TiebaData tb : tiebaList) {
// String type = "topic"; //topic 为指定话题采集,note为指定日记采集 //// System.out.println(tb.toString());
// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy); //// }
//// String cookie = "BAIDUID=4DB3FA13736131DBC2094C010E6EBCB0:FG=1; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; PSTM=1522304033; Hm_lvt_e9e114d958ea263de46e080563e254c4=1528266119,1528440375,1528878557,1529378030; Hm_lvt_0c8070895132126fa3ba3bb7df1ac58e=1531536328,1531536834; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; delPer=0; BD_CK_SAM=1; BDSFRCVID=N0PsJeC626Wob6j7VuCWhrB7leysxf7TH6ao-y68-1MGjomR6XA2EG0PDx8g0Kubh1opogKKKgOTHIjP; H_BDCLCKID_SF=fRkq_C-aJIvbfP0k-4rM-JL3hgT22-us2mTAQhcH0KLKMU8GhMnjjpKl3G73bfDfaIItLhQPKfb1MRjV553GXxvBbnQm3RLJtR6m_h5TtUJkeCnTDMRhqt0_hnOyKMniWKv9-pnY2ft0bD0Cjj8hj5PW5ptX5tcJK6TJW5rJabC3q45qKU6qLT5Xjn-t2UR2JaczQlvS-q_bDbKG5UjI-l0njxQA-TOIBe-q0Mjn04KMhDjJjUonDh8pbG7MJUntHCnKWMbO5hvvOn3O3M7zMlOhDG8HqTLfJJ-sL-35HJOjfn74MITjhPrH-UIsWjOT-2Q-5KL-Jf3oHCQFjpjhLp-JK4Kf2PQ8-5bh_MbdJJjoKtICeKFaXf-qhUcNtqvCteTxoUJgMInJhhvG-4Aahn8ebPRiB-b9QgbABftLK-oj-D84j58-3H; FP_UID=de1696570823638bb79d2aa9183ff687; locale=zh; BD_BOXFO=_avOi_uo26GwC; userId=1533024028678; Hm_lvt_348091a80fe10e213d94a7de762bbd44=1533024045; Hm_lpvt_348091a80fe10e213d94a7de762bbd44=1533089688; BDRCVFR[iL4hrzJ0zlT]=mk3SLVN4HKm; PSINO=3; H_PS_PSSID=; BDSVRTM=125";
//// System.out.println(DataCrawler.getBaiduNewsCount(word, startTime, endTime, proxy,cookie));
} catch (Exception e) { //// //天涯论坛采集
// TODO Auto-generated catch block //// List<LunTanData> list = DataCrawler.getLunTanData(word, proxy, endTime);
e.printStackTrace(); // //豆瓣采集
} //// String type = "topic"; //topic 为指定话题采集,note为指定日记采集
} //// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
//
} //
// } catch (Exception e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
// }
//
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment