Commit 4f5dfa32 by yangchen

百度贴吧 全部回答采集添加

parent 88d622c3
......@@ -69,6 +69,36 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
}
/**
 * Fetches the first page of Baidu News search results for a keyword and extracts the
 * number of matching articles reported on that page.
 *
 * <p>The page is requested with {@code tn=newsdy}. The count is the text between the
 * marker {@code 找到相关新闻} and the next {@code 篇}, with {@code ,} and {@code 约} removed.
 *
 * @param word      search keyword
 * @param startTime start of the time range, forwarded to the page download
 * @param endTime   end of the time range, forwarded to the page download
 * @param proxy     proxy forwarded to the page download
 * @param cookie    cookie string forwarded to the page download
 * @return the parsed article count, or {@code -1} if the download fails or no count can
 *         be parsed from the page
 * @throws Exception kept in the signature for existing callers; failures are reported
 *         as {@code -1} rather than thrown
 */
public static int getBaiduNewsCount(String word, String startTime, String endTime, Proxy proxy,String cookie) throws Exception {
    String html;
    try {
        html = downloadHtml(word, startTime, endTime, proxy, "newsdy", 1, cookie);
    } catch (Exception e) {
        // This method's contract is to signal "count unavailable" with -1, not with an exception.
        return -1;
    }
    return parseBaiduNewsCount(html);
}

/**
 * Extracts the article count from a Baidu News result page.
 *
 * @param html page body, may be {@code null}
 * @return the count, or {@code -1} when the count sentence is absent or not a valid integer
 */
private static int parseBaiduNewsCount(String html) {
    final String marker = "找到相关新闻";
    final String unit = "篇";
    if (html == null) {
        return -1;
    }
    int markerAt = html.indexOf(marker);
    if (markerAt < 0) {
        return -1;
    }
    int from = markerAt + marker.length();
    int to = html.indexOf(unit, from);
    if (to < 0) {
        return -1;
    }
    // Drop the thousands separators and the "approximately" prefix before parsing.
    String digits = html.substring(from, to).replace(",", "").replace("约", "").trim();
    try {
        return Integer.parseInt(digits);
    } catch (NumberFormatException e) {
        return -1;
    }
}
/**
* @Title: getBaiduNewsData
* @author hero
* @Description: 根据关键词获取数据
......@@ -131,7 +161,53 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
}
return list;
}
/**
 * Downloads one Baidu News search result page, retrying on I/O failure.
 *
 * <p>The request URL is built by {@code getUrl(word, startTime, endTime, tn, page)} and the
 * request carries the common headers plus {@code Host: news.baidu.com} and the given cookie.
 * The download is attempted up to three times; every failed attempt is logged.
 *
 * @param word      search keyword
 * @param startTime start of the time range
 * @param endTime   end of the time range
 * @param proxy     proxy passed to the HTTP call
 * @param tn        match mode ({@code newsdy} = full-text match, {@code newstitle} = title match)
 * @param page      result page number
 * @param cookie    value sent as the {@code cookie} request header
 * @return the value returned by {@code get(url, proxy, headerMap)} for the first successful attempt
 * @throws IOException if all three attempts fail (the last failure is rethrown)
 * @throws Exception   any other exception raised while building or sending the request
 */
private static String downloadHtml(String word, String startTime, String endTime, Proxy proxy, String tn,
        int page,String cookie) throws Exception{
    final int maxAttempts = 3;

    // Common request headers plus the ones specific to this site.
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    String url = getUrl(word, startTime, endTime, tn, page);
    System.out.println(url);
    headerMap.put("Host", "news.baidu.com");
    headerMap.put("cookie",cookie);

    IOException lastFailure = null;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            return get(url, proxy, headerMap);
        } catch (IOException e) {
            // Log the exception itself: calling fillInStackTrace() would replace the stack
            // trace of the real failure with the location of this catch block.
            logger.error("获取数据时出现问题,问题为:{}", e);
            lastFailure = e;
        }
    }
    // Only reached after maxAttempts failures, so lastFailure is set.
    throw lastFailure;
}
/**
* @Title: downloadHtml
* @author hero
......
......@@ -6,6 +6,9 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
......@@ -57,8 +60,143 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
return list;
}
/**
 * Collects the posts of one Baidu Tieba thread by requesting its pages one after another.
 *
 * <p>Any query string on {@code url} is dropped and the thread id is taken from the fifth
 * {@code /}-separated segment of the URL (e.g. {@code http://tieba.baidu.com/p/<id>}).
 * Pages are requested as {@code <url>?pn=<page>} starting at page 1, with a 3 second pause
 * between pages. Paging stops when a page download returns {@code null}, when the parsed
 * page reports that there is no further page, or after 50 pages.
 *
 * @param url   thread URL, with or without a query string
 * @param proxy proxy passed to the page download
 * @return the posts parsed from all visited pages, in page order; never {@code null}
 * @throws IllegalArgumentException if {@code url} has fewer than five {@code /}-separated segments
 * @throws Exception                if a page download fails
 */
public static List<TiebaData> getBaiduTiebaAnswerDataByUrl(String url, Proxy proxy) throws Exception {
    // Hard upper bound on the number of pages fetched for one thread.
    final int maxPages = 50;

    List<TiebaData> list = new ArrayList<TiebaData>();

    int queryAt = url.indexOf('?');
    if (queryAt >= 0) {
        url = url.substring(0, queryAt);
    }
    String[] segments = url.split("\\/");
    if (segments.length < 5) {
        throw new IllegalArgumentException("Cannot extract thread id from url: " + url);
    }
    String aid = segments[4];

    for (int page = 1; page <= maxPages; page++) {
        String pageUrl = url + "?pn=" + page;
        String htmlBody = downloadHtml(pageUrl, proxy);
        System.out.println(url + "------------" + aid);
        if (htmlBody == null) {
            break;
        }

        Map<String, Object> dataMap = analysisDataAnswer(htmlBody, aid);
        // analysisDataAnswer stores a List<TiebaData> under "data" and a Boolean under "more".
        @SuppressWarnings("unchecked")
        List<TiebaData> dataList = (List<TiebaData>) dataMap.get("data");
        list.addAll(dataList);
        System.out.println(list.size());

        boolean more = (Boolean) dataMap.get("more");
        if (!more) {
            break;
        }
        // Pause only when another page is actually going to be requested.
        if (page < maxPages) {
            ZhiWeiTools.sleep(3000);
        }
    }
    return list;
}
/** Finds {@code date&quot;:&quot;...&quot} in a post element's serialized HTML; group 1 is the value. */
private static final Pattern ANSWER_DATE_PATTERN = Pattern.compile("date&quot;:&quot;(.*?)&quot");

/** Finds {@code post_id&quot;:...,&quot} in a post element's serialized HTML; group 1 is the value. */
private static final Pattern ANSWER_POST_ID_PATTERN = Pattern.compile("post_id&quot;:(.*?),&quot");

/**
 * Parses one page of a Baidu Tieba thread into post records.
 *
 * <p>Each direct {@code div} child of {@code div.p_postlist} is treated as one post. A post
 * is kept only if a time string longer than one character could be extracted for it.
 *
 * @param htmlBody HTML of the thread page
 * @param aid      thread id; used to build the post URL and passed to every record
 * @return a map with two entries: {@code "data"} holds the {@code List<TiebaData>} parsed
 *         from this page, and {@code "more"} holds a {@code Boolean} that is {@code true}
 *         only if the page contains posts and its pager text contains {@code 下一页}
 */
private static Map<String, Object> analysisDataAnswer(String htmlBody,
        String aid) {
    Document document = Jsoup.parse(htmlBody);

    // Thread title: primary selector first, alternative layout as fallback.
    String title = document.select("div.core_title.core_title_theme_bright > h1").text();
    if (title.length() < 1) {
        title = document.select("#j_core_title_wrap > h3").text();
    }
    System.out.println(title);

    List<TiebaData> list = new ArrayList<TiebaData>();
    Elements posts = document.select("div.p_postlist > div");
    for (Element post : posts) {
        // Serialize once; both regex lookups below scan the same markup.
        String postHtml = post.toString();

        String author = post.select("li.d_name").select("a").text();
        String content = post.select("div.p_content_nameplate").select("cc").select("div.clearfix").text();
        if (content.length() < 1) {
            content = post.select("div.j_d_post_content").text();
        }

        String time = extractAnswerTime(post, postHtml);

        String tid = null;
        Matcher idMatcher = ANSWER_POST_ID_PATTERN.matcher(postHtml);
        if (idMatcher.find()) {
            tid = idMatcher.group(1);
        }

        if (time.length() > 1) {
            list.add(new TiebaData("http://tieba.baidu.com/p/" + aid, title, time, tid, null, author, content, aid));
        }
    }

    boolean more = !posts.isEmpty()
            && document.select("li.l_pager.pager_theme_4.pb_list_pager").text().contains("下一页");

    Map<String, Object> resultMap = new HashMap<String, Object>();
    resultMap.put("data", list);
    resultMap.put("more", more);
    return resultMap;
}

/**
 * Extracts the time string of one post.
 *
 * <p>The visible {@code span.tail-info} text is tried first; if it contains {@code 楼}, the
 * part after the first {@code 楼} is used. When that yields nothing, the value captured by
 * {@link #ANSWER_DATE_PATTERN} in the post's markup is used instead.
 *
 * @param post     the post element
 * @param postHtml the serialized HTML of {@code post}
 * @return the extracted time text, possibly empty; never {@code null}
 */
private static String extractAnswerTime(Element post, String postHtml) {
    String time = post.select("span.tail-info").text();
    if (time.contains("楼")) {
        // split() drops trailing empty strings, so text ending in "楼" has no second part.
        String[] parts = time.split("楼");
        time = parts.length > 1 ? parts[1].trim() : "";
    }
    if (time.trim().length() < 1) {
        Matcher dateMatcher = ANSWER_DATE_PATTERN.matcher(postHtml);
        if (dateMatcher.find()) {
            time = dateMatcher.group(1);
        }
    }
    return time;
}
/**
 * Downloads one Baidu Tieba page, retrying on I/O failure.
 *
 * <p>The request carries the common headers plus {@code Host: tieba.baidu.com}. The download
 * is attempted up to three times; every failed attempt is logged.
 *
 * @param url   address of the page to download
 * @param proxy proxy passed to the HTTP call
 * @return the value returned by {@code get(url, proxy, headerMap)} for the first successful attempt
 * @throws IOException if all three attempts fail (the last failure is rethrown)
 * @throws Exception   any other exception raised while sending the request
 */
private static String downloadHtml(String url, Proxy proxy) throws Exception{
    final int maxAttempts = 3;

    // Common request headers plus the site-specific Host header.
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    headerMap.put("Host", "tieba.baidu.com");

    IOException lastFailure = null;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            return get(url, proxy, headerMap);
        } catch (IOException e) {
            // Log the exception itself: calling fillInStackTrace() would replace the stack
            // trace of the real failure with the location of this catch block.
            logger.error("获取数据时出现问题,问题为:{}", e);
            lastFailure = e;
        }
    }
    // Only reached after maxAttempts failures, so lastFailure is set.
    throw lastFailure;
}
/**
* @Title: downloadHtml
* @author hero
* @Description: 下載百度貼吧數據
......@@ -168,11 +306,11 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
String url = null;
if (word != null) {
if(tiebaName!=null){
url = "http://tieba.baidu.com/f/search/res?isnew=1&kw="+URLCodeUtil.getURLEncode(word, "utf-8")+"&qw="+
URLCodeUtil.getURLEncode(word, "utf-8")+"&rn=30&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
url = "http://tieba.baidu.com/f/search/res?isnew=1&kw="+URLCodeUtil.getURLEncode(word, "GBK")+"&qw="+
URLCodeUtil.getURLEncode(word, "GBK")+"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
}else{
url = "http://tieba.baidu.com/f/search/res?isnew=1&kw=&qw="+
URLCodeUtil.getURLEncode(word, "utf-8")+"&rn=30&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
URLCodeUtil.getURLEncode(word, "GBK")+"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
}
}
System.out.println(url);
......
......@@ -225,7 +225,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
//添加到数据集合中
if(title != null && !title.equals("") && source!=null && time!=null){
NewsData newsData = new NewsData(link, title, source, time, content, pt, word);
logger.info("搜狗新闻数据:{}", newsData);
// logger.info("搜狗新闻数据:{}", newsData);
list.add(newsData);
}
if(!type.equals("other")){
......
......@@ -42,6 +42,27 @@ public class DataCrawler {
}
/**
 * Returns the Baidu News article count for a keyword and time range.
 *
 * <p>Delegates to {@code BaiduNewsCrawlerParse.getBaiduNewsCount}. If that call throws, the
 * stack trace is printed and {@code -1} is returned.
 *
 * @param word      search keyword
 * @param startTime start of the time range
 * @param endTime   end of the time range
 * @param proxy     proxy forwarded to the crawler
 * @param cookie    cookie string forwarded to the crawler
 * @return the count reported by the crawler, or {@code -1} if the crawler threw an exception
 */
public static int getBaiduNewsCount(String word, String startTime, String endTime, Proxy proxy,String cookie){
    int count = -1;
    try {
        count = BaiduNewsCrawlerParse.getBaiduNewsCount(word, startTime, endTime, proxy, cookie);
    } catch (Exception failure) {
        failure.printStackTrace();
    }
    return count;
}
/**
*
* @Title: getBaiduNewsDataByTitle
* @author hero
......@@ -177,6 +198,24 @@ public class DataCrawler {
}
/**
 * Collects the posts of a single Baidu Tieba thread identified by its URL.
 *
 * <p>Delegates to {@code BaiduTiebaCrawlerParse.getBaiduTiebaAnswerDataByUrl}. If that call
 * throws, the stack trace is printed and {@code null} is returned.
 *
 * @param url   thread URL
 * @param proxy proxy forwarded to the crawler
 * @return the list returned by the crawler, or {@code null} if the crawler threw an exception
 */
public static List<TiebaData> getBaiduTiebaAnswserDataByUrl(String url, Proxy proxy){
    List<TiebaData> answers = null;
    try {
        answers = BaiduTiebaCrawlerParse.getBaiduTiebaAnswerDataByUrl(url, proxy);
    } catch (Exception failure) {
        failure.printStackTrace();
    }
    return answers;
}
/**
* @Title: getBaiduTiebaData
* @author hero
* @Description: 根据关键词采集指定贴吧内数据
......
package com.zhiwei.media_data_crawler.test;
import java.net.Proxy;
import java.util.List;
import org.junit.Test;
import com.zhiwei.media_data_crawler.crawler.BaiduTiebaCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.DouBanData;
import com.zhiwei.media_data_crawler.entity.LunTanData;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.media_data_crawler.entity.TiebaData;
import com.zhiwei.media_data_crawler.entity.ZhiHuData;
/**
 * Manual demo harness for the {@code DataCrawler} entry points.
 *
 * <p>Every crawler call in the test body is currently commented out, so running the test
 * only declares the sample inputs and performs no request.
 */
public class DataCrawlerTest {
/**
 * Declares sample inputs for the crawler demos; the demo calls themselves are commented out.
 * Uncomment one section at a time to try the corresponding crawler by hand.
 */
@Test
public void getSoNewsTest(){
String word = "马云"; // search keyword
String startTime = "2017-03-01 00:00:00"; // start time
String endTime = "2017-03-01 23:59:59"; // end time
Proxy proxy = null; // proxy IP; leave null when no proxy is used
try {
// // Baidu News crawl demo
// List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
// // Sogou News keyword crawl demo
// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
// // 360 News crawl demo
// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
// // Sogou Zhihu crawl
// List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
// System.out.println(zhihuList.size());
// // Baidu Tieba crawl
// String tiebaName = "京东"; // tieba name to restrict the crawl to; null for no restriction
// List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
// // Tianya forum crawl
// List<LunTanData> list = DataCrawler.getLunTanData(word, proxy, endTime);
// Douban crawl
// String type = "topic"; // "topic" crawls a given topic, "note" crawls a given diary note
// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
//package com.zhiwei.media_data_crawler.test;
//
//import java.net.Proxy;
//import java.util.List;
//
//import org.junit.Test;
//
//import com.zhiwei.media_data_crawler.crawler.BaiduTiebaCrawlerParse;
//import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
//import com.zhiwei.media_data_crawler.data.DataCrawler;
//import com.zhiwei.media_data_crawler.entity.DouBanData;
//import com.zhiwei.media_data_crawler.entity.LunTanData;
//import com.zhiwei.media_data_crawler.entity.NewsData;
//import com.zhiwei.media_data_crawler.entity.TiebaData;
//import com.zhiwei.media_data_crawler.entity.ZhiHuData;
//
//public class DataCrawlerTest {
//
//
//
//
//
// @Test
// public void getSoNewsTest(){
// String word = "马云"; //关键词
// String startTime = "2017-03-01 00:00:00"; //开始时间
// String endTime = "2017-03-01 23:59:59"; //结束时间
// Proxy proxy = null; //代理IP,不用可不填写
// try {
//// //百度新闻采集demo
//// List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
//// //搜狗新闻关键词采集demo
//// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
//// //360新闻采集demo
//// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
//// //搜狗知乎采集
//// List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
//// System.out.println(zhihuList.size());
//// //Baidu貼吧採集
//// String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null
//// System.out.println("------------------------");
//// String url = "http://tieba.baidu.com/p/5331709274";
//// List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaAnswserDataByUrl(url, proxy);
//// for(TiebaData tb : tiebaList) {
//// System.out.println(tb.toString());
//// }
//// String cookie = "BAIDUID=4DB3FA13736131DBC2094C010E6EBCB0:FG=1; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; PSTM=1522304033; Hm_lvt_e9e114d958ea263de46e080563e254c4=1528266119,1528440375,1528878557,1529378030; Hm_lvt_0c8070895132126fa3ba3bb7df1ac58e=1531536328,1531536834; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; delPer=0; BD_CK_SAM=1; BDSFRCVID=N0PsJeC626Wob6j7VuCWhrB7leysxf7TH6ao-y68-1MGjomR6XA2EG0PDx8g0Kubh1opogKKKgOTHIjP; H_BDCLCKID_SF=fRkq_C-aJIvbfP0k-4rM-JL3hgT22-us2mTAQhcH0KLKMU8GhMnjjpKl3G73bfDfaIItLhQPKfb1MRjV553GXxvBbnQm3RLJtR6m_h5TtUJkeCnTDMRhqt0_hnOyKMniWKv9-pnY2ft0bD0Cjj8hj5PW5ptX5tcJK6TJW5rJabC3q45qKU6qLT5Xjn-t2UR2JaczQlvS-q_bDbKG5UjI-l0njxQA-TOIBe-q0Mjn04KMhDjJjUonDh8pbG7MJUntHCnKWMbO5hvvOn3O3M7zMlOhDG8HqTLfJJ-sL-35HJOjfn74MITjhPrH-UIsWjOT-2Q-5KL-Jf3oHCQFjpjhLp-JK4Kf2PQ8-5bh_MbdJJjoKtICeKFaXf-qhUcNtqvCteTxoUJgMInJhhvG-4Aahn8ebPRiB-b9QgbABftLK-oj-D84j58-3H; FP_UID=de1696570823638bb79d2aa9183ff687; locale=zh; BD_BOXFO=_avOi_uo26GwC; userId=1533024028678; Hm_lvt_348091a80fe10e213d94a7de762bbd44=1533024045; Hm_lpvt_348091a80fe10e213d94a7de762bbd44=1533089688; BDRCVFR[iL4hrzJ0zlT]=mk3SLVN4HKm; PSINO=3; H_PS_PSSID=; BDSVRTM=125";
//// System.out.println(DataCrawler.getBaiduNewsCount(word, startTime, endTime, proxy,cookie));
//// //天涯论坛采集
//// List<LunTanData> list = DataCrawler.getLunTanData(word, proxy, endTime);
// //豆瓣采集
//// String type = "topic"; //topic 为指定话题采集,note为指定日记采集
//// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
//
//
// } catch (Exception e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
// }
//
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment