Commit 6c18504b by zhiwei

1.添加百度贴吧、天涯论坛、豆瓣(话题+日记)按照关键词采集程序

parent 06f917df
...@@ -30,10 +30,26 @@ ...@@ -30,10 +30,26 @@
其它类的可看相应的源码,里面有休眠时间等设置 其它类的可看相应的源码,里面有休眠时间等设置
#####更新提示2018-03-08 更新
本次更新添加了百度贴吧、天涯论坛、豆瓣(话题+日记)按关键词采集的功能,使用 demo 如下:
//百度贴吧采集
String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null
List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
//天涯论坛采集
List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
//豆瓣采集
List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
//天涯论坛采集
List<LunTanData> list = DataCrawler.getLunTanData(word, proxy, endTime);
//豆瓣采集
String type = "topic"; //topic 为指定话题采集,note为指定日记采集
List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
##### 摘要 ##### 摘要
> 这是一个基于OKHttp+Jsoup实现的网页抓取及解析功能的搜索引擎采集爬虫,目前包含:百度新闻、搜狗新闻、360新闻、搜狗知乎采集四种根据关键词采集功能 > 这是一个基于OKHttp+Jsoup实现的网页抓取及解析功能的搜索引擎采集爬虫,
的爬虫项目 目前包含:百度新闻、搜狗新闻、360新闻、搜狗知乎采集、贴吧、天涯论坛、豆瓣
按照关键词采集数据的爬虫项目
##### maven ##### maven
...@@ -58,8 +74,18 @@ ...@@ -58,8 +74,18 @@
List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy); List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
//搜狗知乎采集 //搜狗知乎采集
List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy); List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
//百度贴吧采集
String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null
List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
//天涯论坛采集
List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
//豆瓣采集
List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
//天涯论坛采集
List<LunTanData> list = DataCrawler.getLunTanData(word, proxy, endTime);
//豆瓣采集
String type = "topic"; //topic 为指定话题采集,note为指定日记采集
List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
......
package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.entity.TiebaData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
private static Logger logger = LoggerFactory.getLogger(BaiduTiebaCrawlerParse.class);
/**
 * Collects Baidu Tieba search results for a keyword, one result page at a time.
 *
 * <p>Paging stops when a page cannot be downloaded, when the parser reports that
 * there is no further page, or when the page cap is reached. Previously the cap
 * was overwritten by the parser's "more" flag on every iteration and therefore
 * never took effect.
 *
 * @param word      keyword to search for
 * @param proxy     proxy handed to the HTTP layer (the demo passes {@code null} for none)
 * @param tiebaName forum name to restrict the search to, or {@code null} for all forums
 * @return every post parsed from the visited pages, in page order
 * @throws Exception if downloading or parsing a page fails
 */
@SuppressWarnings("unchecked")
public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy, String tiebaName) throws Exception {
    List<TiebaData> list = new ArrayList<TiebaData>();
    // Highest page index that is still requested (pages 0..50).
    final int maxPage = 50;
    int page = 0;
    boolean more = true;
    while (more && page <= maxPage) {
        String htmlBody = downloadHtml(word, proxy, tiebaName, page);
        if (htmlBody == null) {
            break;
        }
        Map<String, Object> dataMap = analysisData(htmlBody, proxy, word);
        // analysisData stores a List<TiebaData> under "data" and a Boolean under "more".
        list.addAll((List<TiebaData>) dataMap.get("data"));
        more = (Boolean) dataMap.get("more");
        page++;
        // Pause between pages to throttle requests.
        ZhiWeiTools.sleep(3000);
    }
    return list;
}
/**
 * Downloads one Baidu Tieba search result page, trying up to three times.
 *
 * <p>The exception is logged as-is. The previous {@code e.fillInStackTrace()}
 * call replaced the original stack trace with the location of the catch block,
 * for both the log entry and the rethrown exception.
 *
 * @param word      keyword to search for
 * @param proxy     proxy handed to {@code get}
 * @param tiebaName forum name to restrict the search to, or {@code null}
 * @param page      page index used to build the request URL
 * @return the response body returned by {@code get}
 * @throws Exception the last {@link IOException} when all attempts fail
 */
private static String downloadHtml(String word, Proxy proxy, String tiebaName,
        int page) throws Exception {
    // Start from the shared request headers and add the site-specific ones.
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    String url = getUrl(word, tiebaName, page);
    headerMap.put("Host", "tieba.baidu.com");
    headerMap.put("Referer", url);

    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            return get(url, proxy, headerMap);
        } catch (IOException e) {
            // Trailing throwable argument makes SLF4J print the original stack trace.
            logger.error("获取数据时出现问题,问题为:{}", e.getMessage(), e);
            if (attempt == maxAttempts) {
                throw e;
            }
        }
    }
    // Not reached in practice: the loop either returns or rethrows.
    return null;
}
/**
 * Parses one Baidu Tieba search result page.
 *
 * <p>The returned map holds the parsed posts under {@code "data"}
 * ({@code List<TiebaData>}) and a {@code Boolean} under {@code "more"} telling
 * the caller whether another page should be requested.
 *
 * @param htmlBody HTML of the result page
 * @param proxy    unused here; kept so the signature stays unchanged
 * @param word     keyword stored on every parsed post
 * @return map with the keys {@code "data"} and {@code "more"}
 * @throws Exception declared for signature compatibility
 */
private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{
    Map<String, Object> resultMap = new HashMap<String, Object>();
    List<TiebaData> list = new ArrayList<TiebaData>();

    Document document = Jsoup.parse(htmlBody);

    // Another page exists only if the pager offers a "next page" entry.
    // (jsoup's select() never returns null, so the former null check was dead code.)
    boolean more = document.select("[class=\"pager pager-search\"]").text().contains("下一页");

    Elements posts = document.select("div.s_post");
    for (Element post : posts) {
        Elements titleLink = post.select("span.p_title").select("a");
        String title = titleLink.text().replace("回复:", "");
        String link = "http://tieba.baidu.com" + titleLink.attr("href");
        String tid = titleLink.attr("data-tid");
        String source = post.select("a.p_forum").select("font.p_violet").text();
        String content = post.select("div.p_content").text();

        // The author is the second space-separated token of the highlighted link texts;
        // entries without it are skipped, as before.
        String[] highlighted = post.select("a").select("font.p_violet").text().split(" ");
        if (highlighted.length < 2) {
            logger.debug("无作者 或者 无来源");
            continue;
        }
        String author = highlighted[1];
        String time = post.select("font.p_date").text();

        TiebaData tiebaData = new TiebaData(link, title, time, tid, source, author, content, word);
        System.out.println(tiebaData);
        list.add(tiebaData);
    }

    // An empty result page always ends the paging.
    if (posts.isEmpty()) {
        more = false;
    }
    resultMap.put("data", list);
    resultMap.put("more", more);
    return resultMap;
}
/**
 * Builds the Baidu Tieba search URL for a keyword and page.
 *
 * <p>The {@code kw} parameter carries the forum name and {@code qw} the keyword.
 * Previously {@code kw} was filled with the keyword as well, so
 * {@code tiebaName} never reached the request.
 *
 * @param word      keyword to search for; when {@code null} no URL is built
 * @param tiebaName forum name to restrict the search to, or {@code null} to leave {@code kw} empty
 * @param page      value of the {@code pn} parameter
 * @return the request URL, or {@code null} when {@code word} is {@code null}
 */
private static String getUrl(String word, String tiebaName, int page) {
    String url = null;
    if (word != null) {
        String encodedWord = URLCodeUtil.getURLEncode(word, "utf-8");
        String encodedForum = "";
        if (tiebaName != null) {
            encodedForum = URLCodeUtil.getURLEncode(tiebaName, "utf-8");
        }
        url = "http://tieba.baidu.com/f/search/res?isnew=1&kw=" + encodedForum + "&qw="
                + encodedWord + "&rn=30&un=&only_thread=0&sm=1&sd=&ed=&pn=" + page;
    }
    System.out.println(url);
    return url;
}
}
package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.media_data_crawler.entity.DouBanData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class DoubanCrawlerParse extends HttpClientTemplateOK {
private static Logger logger = LoggerFactory.getLogger(DoubanCrawlerParse.class);
/**
 * Collects Douban search results for a keyword, either group topics or notes.
 *
 * <p>{@code "topic"} requests the HTML group search and is parsed by
 * {@code analysisTopicData}; {@code "note"} requests the JSON search endpoint and
 * is parsed by {@code analysisNoteData}. The two parsers used to be swapped, so
 * each response was handed to the parser for the other format.
 *
 * <p>Paging stops when a page cannot be downloaded, when the parser reports that
 * there is no further page, or when the page cap is reached (previously the cap
 * was overwritten by the parser's "more" flag and never took effect).
 *
 * @param word  keyword to search for
 * @param type  {@code "topic"} or {@code "note"}
 * @param proxy proxy handed to the HTTP layer
 * @return every record parsed from the visited pages, in page order
 * @throws IllegalArgumentException if {@code type} is neither {@code "topic"} nor {@code "note"}
 * @throws Exception if downloading or parsing a page fails
 */
@SuppressWarnings("unchecked")
public static List<DouBanData> getDoubanData(String word, String type,Proxy proxy) throws Exception {
    // Constant-first equals keeps a null type from raising a NullPointerException.
    boolean topicSearch = "topic".equals(type);
    boolean noteSearch = "note".equals(type);
    if (!topicSearch && !noteSearch) {
        throw new IllegalArgumentException("Unsupported douban search type: " + type);
    }

    List<DouBanData> list = new ArrayList<DouBanData>();
    // Highest page index that is still requested (pages 0..50).
    final int maxPage = 50;
    int page = 0;
    boolean more = true;
    while (more && page <= maxPage) {
        String htmlBody = downloadHtml(word, type, proxy, page);
        if (htmlBody == null) {
            break;
        }
        Map<String, Object> dataMap = topicSearch
                ? analysisTopicData(htmlBody, proxy, word)
                : analysisNoteData(htmlBody, proxy, word);
        // Both parsers store a List<DouBanData> under "data" and a Boolean under "more".
        list.addAll((List<DouBanData>) dataMap.get("data"));
        more = (Boolean) dataMap.get("more");
        page++;
    }
    return list;
}
/**
 * Downloads one Douban search result page, trying up to three times.
 *
 * <p>The {@code Host} header now names {@code www.douban.com}, the host of every
 * URL built by {@code getUrl}; it previously carried {@code search.tianya.cn},
 * copied from the Tianya crawler. The exception is logged as-is instead of
 * through {@code fillInStackTrace()}, which discarded the original stack trace.
 *
 * @param word  keyword to search for
 * @param type  {@code "topic"} or {@code "note"}, forwarded to {@code getUrl}
 * @param proxy proxy handed to {@code get}
 * @param page  page index used to build the request URL
 * @return the response body returned by {@code get}
 * @throws Exception the last {@link IOException} when all attempts fail
 */
private static String downloadHtml(String word, String type,Proxy proxy,
        int page) throws Exception{
    // Start from the shared request headers and add the site-specific ones.
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    String url = getUrl(word, type, page);
    headerMap.put("Host", "www.douban.com");
    headerMap.put("Referer", url);

    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            return get(url, proxy, headerMap);
        } catch (IOException e) {
            // Trailing throwable argument makes SLF4J print the original stack trace.
            logger.error("获取数据时出现问题,问题为:{}", e.getMessage(), e);
            if (attempt == maxAttempts) {
                throw e;
            }
        }
    }
    // Not reached in practice: the loop either returns or rethrows.
    return null;
}
/**
 * Downloads a single page by URL (used for topic and note detail pages),
 * trying up to three times.
 *
 * <p>The exception is logged as-is instead of through {@code fillInStackTrace()},
 * which discarded the original stack trace.
 *
 * @param url   address of the page; also sent as the {@code Referer} header
 * @param type  unused here; kept so the signature stays unchanged
 * @param proxy proxy handed to {@code get}
 * @return the response body returned by {@code get}
 * @throws Exception the last {@link IOException} when all attempts fail
 */
private static String downloadHtml(String url, String type,Proxy proxy) throws Exception{
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    headerMap.put("Referer", url);

    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            return get(url, proxy, headerMap);
        } catch (IOException e) {
            // Trailing throwable argument makes SLF4J print the original stack trace.
            logger.error("获取数据时出现问题,问题为:{}", e.getMessage(), e);
            if (attempt == maxAttempts) {
                throw e;
            }
        }
    }
    // Not reached in practice: the loop either returns or rethrows.
    return null;
}
/**
 * Parses one Douban group-search result page (HTML) into topic records.
 *
 * <p>Each row is enriched through {@code getTopicSourceAndContent}; rows for
 * which that call returns {@code null} are dropped. A row with a missing group
 * cell or a non-numeric reply count no longer aborts the whole crawl: the
 * affected field is left {@code null} and the row is kept.
 *
 * @param htmlBody HTML of the result page
 * @param proxy    proxy used for the per-topic detail request
 * @param word     unused here; kept so the signature stays unchanged
 * @return map with {@code "data"} ({@code List<DouBanData>}) and {@code "more"} ({@code Boolean})
 * @throws Exception declared for signature compatibility
 */
private static Map<String, Object> analysisTopicData(String htmlBody, Proxy proxy, String word) throws Exception{
    Map<String, Object> resultMap = new HashMap<String, Object>();
    List<DouBanData> list = new ArrayList<DouBanData>();

    Document document = Jsoup.parse(htmlBody);

    // Another page exists only if the paginator still shows the "next page" entry.
    boolean more = document.select("div.paginator").select("span.next").text().contains("后页");

    Elements rows = document.select("div.topics").select("tr.pl");
    if (rows.isEmpty()) {
        more = false;
    }

    for (Element row : rows) {
        Elements subjectLink = row.select("td.td-subject").select("a");
        String link = subjectLink.attr("href");
        String title = subjectLink.text();
        String time = row.select("td.td-time").attr("title");

        // Reply cell reads "<n>回应"; keep only the part in front of the marker.
        String replyText = row.select("td.td-reply").select("span").text();
        int markerAt = replyText.indexOf("回应");
        String replyDigits = (markerAt >= 0 ? replyText.substring(0, markerAt) : replyText).trim();
        Integer replyCount = null;
        try {
            replyCount = Integer.valueOf(replyDigits);
        } catch (NumberFormatException e) {
            logger.debug("Unparseable reply count '{}' for topic {}", replyText, link);
        }

        // The group name is read from the fourth cell of the row.
        Elements cells = row.select("td");
        String group = cells.size() > 3 ? cells.get(3).text() : null;

        DouBanData douban = new DouBanData(link, title, group, null, time, null, "话题", replyCount, null);
        douban = getTopicSourceAndContent(link, "话题", proxy, douban);
        if (douban != null) {
            list.add(douban);
        }
    }

    resultMap.put("data", list);
    resultMap.put("more", more);
    return resultMap;
}
/**
 * Loads a topic's detail page and copies its time, source and content onto the
 * given record.
 *
 * <p>When the page body is {@code null} the record is returned unchanged. The
 * method sleeps for one second after each lookup. Failures are reported through
 * the class logger instead of {@code printStackTrace()}.
 *
 * @param url    address of the topic detail page
 * @param type   forwarded to {@code downloadHtml}
 * @param proxy  proxy used for the request
 * @param douban record to fill in
 * @return {@code douban}, or {@code null} if the lookup threw an exception
 */
private static DouBanData getTopicSourceAndContent(String url,String type, Proxy proxy, DouBanData douban){
    try {
        String htmlBody = downloadHtml(url, type, proxy);
        if (htmlBody != null) {
            Elements topicDoc = Jsoup.parse(htmlBody).select("div.topic-doc");
            Elements heading = topicDoc.select("h3");
            douban.setTime(heading.select("span.color-green").text());
            douban.setSource(heading.select("span.from").select("a").text());
            douban.setContent(topicDoc.select("div#link-report").select("div.topic-content").text());
        }
        // Pause between detail requests to throttle the crawl.
        ZhiWeiTools.sleep(1000);
        return douban;
    } catch (Exception e) {
        logger.error("Failed to load douban topic detail page: {}", url, e);
        return null;
    }
}
/**
 * Parses one Douban note-search response (JSON) into note records.
 *
 * <p>The response is read as a JSON object: {@code "more"} supplies the paging
 * flag and {@code "items"} the HTML fragments of the hits. Each hit is enriched
 * through {@code getNoteSourceTime}; hits for which that call returns
 * {@code null} are dropped. Malformed hits no longer abort the whole crawl: a
 * hit without an extractable note id is skipped and an unreadable like count is
 * treated as 0.
 *
 * @param htmlBody JSON text of the search response
 * @param proxy    proxy used for the per-note detail request
 * @param word     unused here; kept so the signature stays unchanged
 * @return map with {@code "data"} ({@code List<DouBanData>}) and {@code "more"} ({@code Boolean})
 * @throws Exception if the response cannot be parsed as JSON
 */
private static Map<String, Object> analysisNoteData(String htmlBody, Proxy proxy, String word) throws Exception{
    Map<String, Object> resultMap = new HashMap<String, Object>();
    List<DouBanData> list = new ArrayList<DouBanData>();

    JSONObject json = JSONObject.parseObject(htmlBody);
    boolean more = json.getBooleanValue("more");
    String rawItems = json.getString("items");
    if (rawItems == null) {
        // No items field: nothing to parse and nothing further to page through.
        resultMap.put("data", list);
        resultMap.put("more", false);
        return resultMap;
    }

    // Strip the JSON escaping so the fragments can be parsed as plain HTML.
    String items = rawItems.replace("\\n", "").replace("\\", "");
    Elements results = Jsoup.parse(items).select("div.result");

    for (Element result : results) {
        Elements titleLink = result.select("div.title").select("h3").select("a");

        // The note id sits in the onclick handler between "sid: " and ", qcat".
        String[] afterSid = titleLink.attr("onclick").split("sid: ");
        String[] sidParts = afterSid.length > 1 ? afterSid[1].split(", qcat") : new String[0];
        if (sidParts.length == 0) {
            logger.debug("Skipping douban note without a note id: {}", titleLink.text());
            continue;
        }
        String link = "https://www.douban.com/note/" + sidParts[0];
        String title = titleLink.text();

        // Like count is shown as "<n> 人喜欢"; absent or unreadable counts as 0.
        int likeNum = 0;
        String info = result.select("div.title").select("div.info").text();
        int markerAt = info.indexOf("人喜欢");
        if (markerAt >= 0) {
            try {
                likeNum = Integer.parseInt(info.substring(0, markerAt).trim());
            } catch (NumberFormatException e) {
                logger.debug("Unparseable like count '{}' for note {}", info, link);
            }
        }

        String content = result.select("div.content").select("p").text();
        DouBanData douban = new DouBanData(link, title, null, null, null, content, "日记", null, likeNum);
        douban = getNoteSourceTime(link, "日记", proxy, douban);
        if (douban != null) {
            list.add(douban);
        }
    }

    resultMap.put("data", list);
    resultMap.put("more", more);
    return resultMap;
}
/**
 * Loads a note's detail page and copies its author, publication time and full
 * text onto the given record.
 *
 * <p>The full text replaces the search summary only when the detail page
 * actually yields text. The former {@code content != null} check was always
 * true, because jsoup returns an empty string rather than {@code null}, so an
 * empty detail body erased the summary. Failures are reported through the class
 * logger instead of {@code printStackTrace()}.
 *
 * @param url    address of the note detail page
 * @param type   forwarded to {@code downloadHtml}
 * @param proxy  proxy used for the request
 * @param douban record to fill in
 * @return {@code douban}, or {@code null} if the lookup threw an exception
 */
private static DouBanData getNoteSourceTime(String url,String type, Proxy proxy, DouBanData douban){
    try {
        String htmlBody = downloadHtml(url, type, proxy);
        if (htmlBody != null) {
            Elements article = Jsoup.parse(htmlBody).select("div.article");
            Elements noteContainer = article.select("div.note-container");
            douban.setSource(noteContainer.attr("data-author"));
            douban.setTime(noteContainer.select("span.pub-date").text());

            String content = article.select("div#link-report").text();
            if (!content.isEmpty()) {
                douban.setContent(content);
            }
        }
        // Pause between detail requests to throttle the crawl.
        ZhiWeiTools.sleep(1000);
        return douban;
    } catch (Exception e) {
        logger.error("Failed to load douban note detail page: {}", url, e);
        return null;
    }
}
/**
 * Builds the Douban search URL for a keyword, search type and page.
 *
 * <p>{@code "topic"} targets the group search with 50 results per page,
 * {@code "note"} targets the JSON search endpoint with 20 results per page.
 * The resulting URL (possibly {@code null}) is printed to standard output.
 *
 * @param word keyword to search for; when {@code null} no URL is built
 * @param type {@code "topic"} or {@code "note"}
 * @param page zero-based page index, converted into the {@code start} offset
 * @return the request URL, or {@code null} when {@code word} is {@code null}
 *         or {@code type} matches neither value
 */
private static String getUrl(String word, String type,int page) {
    String target = null;
    if (word != null) {
        if (type.equals("topic")) {
            StringBuilder topicUrl = new StringBuilder("https://www.douban.com/group/search?q=");
            topicUrl.append(URLCodeUtil.getURLEncode(word, "utf-8"));
            topicUrl.append("&start=").append(page * 50);
            topicUrl.append("&cat=1013&sort=time");
            target = topicUrl.toString();
        } else if (type.equals("note")) {
            StringBuilder noteUrl = new StringBuilder("https://www.douban.com/j/search?q=");
            noteUrl.append(URLCodeUtil.getURLEncode(word, "utf-8"));
            noteUrl.append("&start=").append(page * 20);
            noteUrl.append("&cat=1015");
            target = noteUrl.toString();
        }
    }
    System.out.println(target);
    return target;
}
}
package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.entity.LunTanData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class TianYaCrawlerParse extends HttpClientTemplateOK {
private static Logger logger = LoggerFactory.getLogger(TianYaCrawlerParse.class);
private static final String pt = "天涯论坛";
/**
 * Collects Tianya forum search results for a keyword, one result page at a time.
 *
 * <p>Paging stops when a page cannot be downloaded, when the parser reports that
 * there is no further page (including when it meets a post dated before
 * {@code endTime}), or when the page cap is reached. Previously the cap was
 * overwritten by the parser's "more" flag on every iteration and never took
 * effect.
 *
 * @param word    keyword to search for
 * @param proxy   proxy handed to the HTTP layer
 * @param endTime time string forwarded to the parser; posts dated before it are dropped
 * @return every post parsed from the visited pages, in page order
 * @throws Exception if downloading or parsing a page fails
 */
@SuppressWarnings("unchecked")
public static List<LunTanData> getLunTanData(String word, Proxy proxy, String endTime) throws Exception {
    List<LunTanData> list = new ArrayList<LunTanData>();
    // Highest page index that is still requested (pages 0..50).
    final int maxPage = 50;
    int page = 0;
    boolean more = true;
    while (more && page <= maxPage) {
        String htmlBody = downloadHtml(word, proxy, page);
        if (htmlBody == null) {
            break;
        }
        Map<String, Object> dataMap = analysisData(htmlBody, proxy, word, endTime);
        // analysisData stores a List<LunTanData> under "data" and a Boolean under "more".
        list.addAll((List<LunTanData>) dataMap.get("data"));
        more = (Boolean) dataMap.get("more");
        page++;
        // Pause between pages to throttle requests.
        ZhiWeiTools.sleep(3000);
    }
    return list;
}
/**
 * Downloads one Tianya search result page, trying up to three times.
 *
 * <p>The exception is logged as-is instead of through {@code fillInStackTrace()},
 * which discarded the original stack trace.
 *
 * @param word  keyword to search for
 * @param proxy proxy handed to {@code get}
 * @param page  page index used to build the request URL
 * @return the response body returned by {@code get}
 * @throws Exception the last {@link IOException} when all attempts fail
 */
private static String downloadHtml(String word, Proxy proxy,
        int page) throws Exception{
    // Start from the shared request headers and add the site-specific ones.
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    String url = getUrl(word, page);
    headerMap.put("Host", "search.tianya.cn");
    headerMap.put("Referer", url);

    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            return get(url, proxy, headerMap);
        } catch (IOException e) {
            // Trailing throwable argument makes SLF4J print the original stack trace.
            logger.error("获取数据时出现问题,问题为:{}", e.getMessage(), e);
            if (attempt == maxAttempts) {
                throw e;
            }
        }
    }
    // Not reached in practice: the loop either returns or rethrows.
    return null;
}
/**
 * Parses one Tianya search result page.
 *
 * <p>Posts dated before {@code endTime} are dropped and switch the "more" flag
 * off; all other posts are collected. {@code endTime} is parsed once per page
 * instead of once per row. List entries that lack the two links or two spans of
 * the source line are skipped instead of aborting the crawl, and a non-numeric
 * reply count is stored as {@code null}.
 *
 * @param htmlBody HTML of the result page
 * @param proxy    unused here; kept so the signature stays unchanged
 * @param word     keyword stored on every parsed post
 * @param endTime  time string parsed with {@code TimeParse.stringFormartDate}
 * @return map with {@code "data"} ({@code List<LunTanData>}) and {@code "more"} ({@code Boolean})
 * @throws Exception declared for signature compatibility
 */
private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word, String endTime) throws Exception{
    Map<String, Object> resultMap = new HashMap<String, Object>();
    List<LunTanData> list = new ArrayList<LunTanData>();

    Document document = Jsoup.parse(htmlBody);

    // Another page exists only if the pager offers a "next page" link.
    boolean more = document.select("div.long-pages").select("a").text().contains("下一页");

    Elements items = document.select("div.searchListOne").select("ul").select("li");
    if (items.isEmpty()) {
        more = false;
    } else {
        // Loop-invariant: parse the cut-off once, and only when there are rows to compare.
        Date endDate = TimeParse.stringFormartDate(endTime);
        for (Element item : items) {
            Elements sourceLinks = item.select("p.source").select("a");
            Elements sourceSpans = item.select("p.source").select("span");
            if (sourceLinks.size() < 2 || sourceSpans.size() < 2) {
                logger.debug("Skipping tianya list entry without a complete source line");
                continue;
            }

            Elements titleLink = item.select("div").select("h3").select("a");
            String title = titleLink.text();
            String link = titleLink.attr("href");
            String content = item.select("div").select("p").text();
            String source = sourceLinks.get(0).text();
            String author = sourceLinks.get(1).text();
            String time = sourceSpans.get(0).text();

            Integer replyCount = null;
            try {
                replyCount = Integer.valueOf(sourceSpans.get(1).text().trim());
            } catch (NumberFormatException e) {
                logger.debug("Unparseable reply count for tianya post {}", link);
            }

            LunTanData luntanData = new LunTanData(link, title, time, source, author, content, replyCount, pt, word);
            Date date = TimeParse.stringFormartDate(time);
            if (date.before(endDate)) {
                // Post is older than the cut-off: drop it and stop paging.
                more = false;
            } else {
                System.out.println(luntanData);
                list.add(luntanData);
            }
        }
    }

    resultMap.put("data", list);
    resultMap.put("more", more);
    return resultMap;
}
/**
 * Builds the Tianya forum search URL for a keyword and page.
 *
 * <p>The resulting URL (possibly {@code null}) is printed to standard output.
 *
 * @param word keyword to search for; when {@code null} no URL is built
 * @param page value of the {@code pn} parameter
 * @return the request URL, or {@code null} when {@code word} is {@code null}
 */
private static String getUrl(String word, int page) {
    String target = null;
    if (word != null) {
        StringBuilder builder = new StringBuilder("http://search.tianya.cn/bbs?q=");
        builder.append(URLCodeUtil.getURLEncode(word, "utf-8"));
        builder.append("&s=4&f=0&pn=").append(page);
        target = builder.toString();
    }
    System.out.println(target);
    return target;
}
}
...@@ -4,10 +4,16 @@ import java.net.Proxy; ...@@ -4,10 +4,16 @@ import java.net.Proxy;
import java.util.List; import java.util.List;
import com.zhiwei.media_data_crawler.crawler.BaiduNewsCrawlerParse; import com.zhiwei.media_data_crawler.crawler.BaiduNewsCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.BaiduTiebaCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.DoubanCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.SoNewsCrawlerParse; import com.zhiwei.media_data_crawler.crawler.SoNewsCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.SougouNewsCrawlerParse; import com.zhiwei.media_data_crawler.crawler.SougouNewsCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse; import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.TianYaCrawlerParse;
import com.zhiwei.media_data_crawler.entity.DouBanData;
import com.zhiwei.media_data_crawler.entity.LunTanData;
import com.zhiwei.media_data_crawler.entity.NewsData; import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.media_data_crawler.entity.TiebaData;
import com.zhiwei.media_data_crawler.entity.ZhiHuData; import com.zhiwei.media_data_crawler.entity.ZhiHuData;
public class DataCrawler { public class DataCrawler {
...@@ -152,4 +158,80 @@ public class DataCrawler { ...@@ -152,4 +158,80 @@ public class DataCrawler {
} }
} }
/**
 * Collects Baidu Tieba posts for a keyword without restricting the search to a forum.
 *
 * <p>Delegates to {@code BaiduTiebaCrawlerParse.getBaiduTiebaData} with a
 * {@code null} forum name. Any exception is printed to standard error and
 * swallowed.
 *
 * @param word  keyword to search for
 * @param proxy proxy forwarded to the crawler
 * @return the collected posts, or {@code null} if the crawler threw an exception
 */
public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy){
try {
return BaiduTiebaCrawlerParse.getBaiduTiebaData(word, proxy, null);
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
/**
 * Collects Baidu Tieba posts for a keyword, with a forum name passed through.
 *
 * <p>Delegates to {@code BaiduTiebaCrawlerParse.getBaiduTiebaData}. Any
 * exception is printed to standard error and swallowed.
 *
 * @param word      keyword to search for
 * @param proxy     proxy forwarded to the crawler
 * @param tiebaName forum name forwarded to the crawler
 * @return the collected posts, or {@code null} if the crawler threw an exception
 */
public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy, String tiebaName){
try {
return BaiduTiebaCrawlerParse.getBaiduTiebaData(word, proxy, tiebaName);
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
/**
 * Collects Tianya forum posts for a keyword.
 *
 * <p>Delegates to {@code TianYaCrawlerParse.getLunTanData}. Any exception is
 * printed to standard error and swallowed.
 *
 * @param word    keyword to search for
 * @param proxy   proxy forwarded to the crawler
 * @param endTime time string forwarded to the crawler
 * @return the collected posts, or {@code null} if the crawler threw an exception
 */
public static List<LunTanData> getLunTanData(String word, Proxy proxy, String endTime){
try {
return TianYaCrawlerParse.getLunTanData(word, proxy, endTime);
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
/**
 * Collects Douban search results for a keyword.
 *
 * <p>Delegates to {@code DoubanCrawlerParse.getDoubanData}. Any exception is
 * printed to standard error and swallowed.
 *
 * @param word  keyword to search for
 * @param type  search type forwarded to the crawler ({@code "topic"} or {@code "note"})
 * @param proxy proxy forwarded to the crawler
 * @return the collected records, or {@code null} if the crawler threw an exception
 */
public static List<DouBanData> getDouBanData(String word, String type, Proxy proxy){
try {
return DoubanCrawlerParse.getDoubanData(word, type, proxy);
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
} }
package com.zhiwei.media_data_crawler.entity;
/**
 * One Douban search hit, either a group topic or a note.
 *
 * <p>The reply and like counters previously had setters but no getters, so
 * callers could not read them; the matching getters are now provided.
 */
public class DouBanData {

    private String url;          // address of the hit
    private String title;        // title
    private String source;       // source
    private String group;        // group
    private String time;         // time
    private String content;      // content
    private Integer reply_count; // number of replies
    private String type;         // record type
    private Integer like_count;  // number of likes

    /** Creates an empty record; every field starts as {@code null}. */
    public DouBanData() {}

    /**
     * Creates a fully populated record.
     *
     * @param url         address of the hit
     * @param title       title
     * @param group       group
     * @param source      source
     * @param time        time
     * @param content     content
     * @param type        record type
     * @param reply_count number of replies, may be {@code null}
     * @param like_count  number of likes, may be {@code null}
     */
    public DouBanData(String url, String title, String group,String source, String time,
            String content, String type, Integer reply_count, Integer like_count) {
        this.url = url;
        this.title = title;
        this.group = group;
        this.source = source;
        this.time = time;
        this.content = content;
        this.type = type;
        this.reply_count = reply_count;
        this.like_count = like_count;
    }

    /** Returns a one-line summary of the record; the content field is not included. */
    @Override
    public String toString(){
        return "new DouBanData["
                + "url = " + url
                + ", title = " + title
                + ", source = " + source
                + ", group = " + group
                + ", time = " + time
                + ", type = " + type
                + ", like_count = " + like_count
                + ", reply_count = " + reply_count
                + "]";
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getGroup() {
        return group;
    }

    public void setGroup(String group) {
        this.group = group;
    }

    /** Returns the number of replies, or {@code null} when none was set. */
    public Integer getReply_count() {
        return reply_count;
    }

    public void setReply_count(Integer reply_count) {
        this.reply_count = reply_count;
    }

    /** Returns the number of likes, or {@code null} when none was set. */
    public Integer getLike_count() {
        return like_count;
    }

    public void setLike_count(Integer like_count) {
        this.like_count = like_count;
    }
}
package com.zhiwei.media_data_crawler.entity;
import java.io.Serializable;
/**
 * One forum post found by a keyword search, together with the platform name
 * and the keyword that produced it.
 */
public class LunTanData implements Serializable{

    private static final long serialVersionUID = 6057811459180925060L;

    private String url;          // address of the post
    private String title;        // title
    private String time;         // time
    private String source;       // source
    private String author;       // replier or original poster
    private String content;      // reply text
    private Integer reply_count; // number of replies
    private String pt;           // platform
    private String word;         // keyword

    /** Creates an empty record; every field starts as {@code null}. */
    public LunTanData(){}

    /** Creates a fully populated record; each argument is stored in the field of the same name. */
    public LunTanData(String url, String title,String time, String source,
            String author,String content, Integer reply_count, String pt, String word){
        this.url = url;
        this.title = title;
        this.time = time;
        this.source = source;
        this.author = author;
        this.content = content;
        this.reply_count = reply_count;
        this.pt = pt;
        this.word = word;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getTime() {
        return this.time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getSource() {
        return this.source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public String getAuthor() {
        return this.author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getContent() {
        return this.content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public Integer getReply_count() {
        return this.reply_count;
    }

    public void setReply_count(Integer reply_count) {
        this.reply_count = reply_count;
    }

    public String getPt() {
        return this.pt;
    }

    public void setPt(String pt) {
        this.pt = pt;
    }

    public String getWord() {
        return this.word;
    }

    public void setWord(String word) {
        this.word = word;
    }

    /** Returns a one-line listing of all fields in declaration order. */
    @Override
    public String toString(){
        StringBuilder text = new StringBuilder("new LunTanData[");
        text.append("url = ").append(url);
        text.append(", title = ").append(title);
        text.append(", time = ").append(time);
        text.append(", source = ").append(source);
        text.append(", author = ").append(author);
        text.append(", content = ").append(content);
        text.append(", reply_count = ").append(reply_count);
        text.append(", pt = ").append(pt);
        text.append(", word = ").append(word);
        text.append("]");
        return text.toString();
    }
}
package com.zhiwei.media_data_crawler.entity;
import java.io.Serializable;
/**
 * One Baidu Tieba post found by a keyword search, together with the keyword
 * that produced it.
 */
public class TiebaData implements Serializable{

    private static final long serialVersionUID = 1L;

    private String url;     // address of the post
    private String title;   // title
    private String time;    // time
    private String tid;     // tid
    private String source;  // source
    private String author;  // replier or original poster
    private String content; // reply text
    private String word;    // keyword

    /** Creates an empty record; every field starts as {@code null}. */
    public TiebaData(){}

    /** Creates a fully populated record; each argument is stored in the field of the same name. */
    public TiebaData(String url, String title,String time, String tid,String source,
            String author,String content, String word){
        this.url = url;
        this.title = title;
        this.time = time;
        this.tid = tid;
        this.source = source;
        this.author = author;
        this.content = content;
        this.word = word;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getTime() {
        return this.time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getTid() {
        return this.tid;
    }

    public void setTid(String tid) {
        this.tid = tid;
    }

    public String getSource() {
        return this.source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public String getAuthor() {
        return this.author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getContent() {
        return this.content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getWord() {
        return this.word;
    }

    public void setWord(String word) {
        this.word = word;
    }

    /** Returns a one-line listing of all fields in declaration order. */
    @Override
    public String toString(){
        StringBuilder text = new StringBuilder("new TiebaData[");
        text.append("url = ").append(url);
        text.append(", title = ").append(title);
        text.append(", time = ").append(time);
        text.append(", tid = ").append(tid);
        text.append(", source = ").append(source);
        text.append(", author = ").append(author);
        text.append(", content = ").append(content);
        text.append(", word = ").append(word);
        text.append("]");
        return text.toString();
    }
}
//package com.zhiwei.media_data_crawler.test; package com.zhiwei.media_data_crawler.test;
//
//import java.net.Proxy; import java.net.Proxy;
//import java.util.List; import java.util.List;
//
//import org.junit.Test; import org.junit.Test;
//
//import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse; import com.zhiwei.media_data_crawler.crawler.BaiduTiebaCrawlerParse;
//import com.zhiwei.media_data_crawler.data.DataCrawler; import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
//import com.zhiwei.media_data_crawler.entity.NewsData; import com.zhiwei.media_data_crawler.data.DataCrawler;
//import com.zhiwei.media_data_crawler.entity.ZhiHuData; import com.zhiwei.media_data_crawler.entity.DouBanData;
// import com.zhiwei.media_data_crawler.entity.LunTanData;
//public class DataCrawlerTest { import com.zhiwei.media_data_crawler.entity.NewsData;
// import com.zhiwei.media_data_crawler.entity.TiebaData;
// import com.zhiwei.media_data_crawler.entity.ZhiHuData;
//
// public class DataCrawlerTest {
//
// @Test
// public void getSoNewsTest(){
// String word = "马云"; //关键词
// String startTime = "2017-03-01 00:00:00"; //开始时间
// String endTime = "2017-03-01 23:59:59"; //结束时间 @Test
// Proxy proxy = null; //代理IP,不用可不填写 public void getSoNewsTest(){
String word = "马云"; //关键词
String startTime = "2017-03-01 00:00:00"; //开始时间
String endTime = "2017-03-01 23:59:59"; //结束时间
Proxy proxy = null; //代理IP,不用可不填写
try {
// //百度新闻采集demo // //百度新闻采集demo
//// List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy); // List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
//// //搜狗新闻关键词采集demo // //搜狗新闻关键词采集demo
//// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy); // List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
//// //360新闻采集demo // //360新闻采集demo
//// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy); // List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
//// //搜狗知乎采集 // //搜狗知乎采集
//// List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy); // List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
//// System.out.println(zhihuList.size()); // System.out.println(zhihuList.size());
// // //Baidu貼吧採集
// } // String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null
// // List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
//} // //天涯论坛采集
// List<LunTanData> list = DataCrawler.getLunTanData(word, proxy, endTime);
//豆瓣采集
// String type = "topic"; //topic 为指定话题采集,note为指定日记采集
// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment