Commit 88e4e8c0 by win 10

天涯论坛添加采集开始时间,知乎添加图片量采集

parent ed4f527e
...@@ -96,6 +96,68 @@ public class BaiduInforCrawlerParse { ...@@ -96,6 +96,68 @@ public class BaiduInforCrawlerParse {
} }
/**
 * Collects Baidu news results for a keyword by fetching ten result URLs in parallel.
 *
 * <p>For each index {@code 0..9} a URL is built with {@code getUrl(word, i, endTime)}
 * (the index is presumably the result-page number — confirm against {@code getUrl}).
 * Each URL is downloaded and parsed inside a task submitted through
 * {@code TaskBoot.blockingAsync}; this method blocks on {@code groupSync.await()}
 * until every task has called {@code groupSync.done()}.
 *
 * <p>Because the tasks may run on different threads, the order of the returned
 * items is not guaranteed to follow page order.
 *
 * @param word     keyword used to build the search URLs
 * @param endTime  end-time value forwarded to {@code getUrl}
 * @param saveWord value forwarded to {@code analysisData}
 *                 (presumably the keyword recorded on each result — confirm)
 * @return all items parsed from the pages that could be downloaded; never {@code null}
 * @throws Exception if building a URL or waiting for the tasks fails
 */
@SuppressWarnings("unchecked")
public static List<NewsData> getBaiduInforDataManyWord(String word, String endTime, String saveWord) throws Exception {
    List<NewsData> list = new ArrayList<>();
    GroupSync groupSync = new GroupSync();
    for (int i = 0; i < 10; i++) {
        // Build the URL before registering the task so that a failure here
        // cannot leave the group counter incremented without a matching done().
        String url = getUrl(word, i, endTime);
        groupSync.add();
        TaskBoot.blockingAsync(() -> {
            try {
                String htmlBody = downloadHtml(url);
                if (htmlBody != null) {
                    Map<String, Object> dataMap = analysisData(htmlBody, saveWord);
                    List<NewsData> dataList = (List<NewsData>) dataMap.get("data");
                    System.out.println(url);
                    if (dataList != null) {
                        // ArrayList is not thread-safe; the tasks may run concurrently,
                        // so every mutation of the shared result list is guarded.
                        synchronized (list) {
                            list.addAll(dataList);
                        }
                    }
                }
            } catch (Exception e) {
                // Best effort: one failed page must not abort the others, but the
                // failure is reported instead of being silently dropped.
                // NOTE(review): switch to the class logger if one is declared in this class.
                System.err.println("Baidu news page fetch failed: " + url + " -> " + e);
            } finally {
                // Always release the group, otherwise await() below would block forever.
                groupSync.done();
            }
        });
    }
    groupSync.await();
    return list;
}
/**
* @Title: downloadHtml * @Title: downloadHtml
* @author hero * @author hero
* @Description: 获取数据流 * @Description: 获取数据流
...@@ -303,7 +365,7 @@ public class BaiduInforCrawlerParse { ...@@ -303,7 +365,7 @@ public class BaiduInforCrawlerParse {
//https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=%E6%B5%99%E6%B1%9F&medium=1&rn=50&gpc=stf%3D1546272000%2C1548950400%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=0 //https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=%E6%B5%99%E6%B1%9F&medium=1&rn=50&gpc=stf%3D1546272000%2C1548950400%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=0
// public static void main(String[] args) throws Exception { // public static void main(String[] args) throws Exception {
// String url = "http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=1&wd=%E5%A5%94%E9%A9%B0+%E6%BC%8F%E6%B1%BD%E6%B2%B9&medium=0&rn=50&gpc=stf%3D0%2C1496246399%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_l_more&x_bfe_rqs=03E80&x_bfe_tjscore=0.332314&scs=2546086922&sortBy=0&pn=0"; // String url = "http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=1&wd=%E5%A5%94%E9%A9%B0+%E6%BC%8F%E6%B1%BD%E6%B2%B9&medium=0&rn=50&gpc=stf%3D0%2C1496246399%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_l_more&x_bfe_rqs=03E80&x_bfe_tjscore=0.332314&scs=2546086922&sortBy=0&pn=0";
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER); // ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER, 10000008);//初始化代理
// List<NewsData> ndList = getBaiduInforData("马云","2019-07-04 23:59:59"); // List<NewsData> ndList = getBaiduInforData("马云","2019-07-04 23:59:59");
// System.out.println(ndList.size()); // System.out.println(ndList.size());
// String result = downloadHtml(url,0); // String result = downloadHtml(url,0);
......
...@@ -17,24 +17,65 @@ import org.jsoup.nodes.Element; ...@@ -17,24 +17,65 @@ import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler; import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.TiebaData; import com.zhiwei.media_data_crawler.entity.TiebaData;
import com.zhiwei.media_data_crawler.excelentity.DataExcel;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.tools.httpclient.HeaderTool; import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response; import okhttp3.Response;
/**
* 百度贴吧采集
* @author xMx
* @date 2019年10月31日 下午5:47:28
*/
public class BaiduTiebaCrawlerParse { public class BaiduTiebaCrawlerParse {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static Logger logger = LogManager.getLogger(BaiduTiebaCrawlerParse.class); private static Logger logger = LogManager.getLogger(BaiduTiebaCrawlerParse.class);
// public static void main(String[] args) {
// ProxyFactory.init(SimpleConfig.builder().registry("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181")
// .appName("xumiaoxin").appId(10000008).group("local").build());
//
// List<DataExcel> bodyList = new ArrayList<>();
//
// try {
// List<String> wordList = WordsReadFile.getWords("D:\\crawlerdata\\关键词6.txt");
// for(String s:wordList) {
// List<TiebaData> dataList = getBaiduTiebaData(s, null, null);
// dataList.forEach(data -> {
// DataExcel dataExcel = new DataExcel();
// dataExcel.setAuthor(data.getAuthor());
// dataExcel.setContent(data.getContent());
// dataExcel.setSource(data.getSource());
// dataExcel.setTid(data.getTid());
// dataExcel.setTime(data.getTime());
// dataExcel.setTitle(data.getTitle());
// dataExcel.setUrl(data.getUrl());
// dataExcel.setWord(data.getWord());
//
// bodyList.add(dataExcel);
// });
// }
// } catch (Exception e) {
// e.toString();
// }
//
// EasyExcel.write("D:\\crawlerdata\\百度贴吧-花木兰2.xlsx", DataExcel.class).sheet("数据").doWrite(bodyList);
// System.out.println("导出成功");
// }
/** /**
* @Title: getBaiduTiebaData * @Title: getBaiduTiebaData
* @author hero * @author hero
* @Description: 根據關鍵詞獲取百度貼吧數據(最多50頁) * @Description: 根据关键词获取百度贴吧数据
* @param @param word * @param @param word
* @param @param proxy * @param @param proxy
* @param @param tiebaName * @param @param tiebaName
...@@ -43,28 +84,29 @@ public class BaiduTiebaCrawlerParse { ...@@ -43,28 +84,29 @@ public class BaiduTiebaCrawlerParse {
* @return List<TiebaData> 返回类型 * @return List<TiebaData> 返回类型
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy, String tiebaName) throws Exception { public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy, String tiebaName, String startTime) throws Exception {
List<TiebaData> list = new ArrayList<TiebaData>(); List<TiebaData> list = new ArrayList<TiebaData>();
int page = 0; int page = 1;
boolean more = true; boolean more = true;
while (more) { while (more) {
// 最大页数为20 try {
if (page > 50) {
more = false;
}
String htmlBody = downloadHtml(word, proxy, tiebaName, page); String htmlBody = downloadHtml(word, proxy, tiebaName, page);
if (htmlBody != null) { if (htmlBody != null) {
Map<String, Object> dataMap = analysisData(htmlBody, proxy, word); Map<String, Object> dataMap = analysisData(htmlBody, proxy, word, startTime);
List<TiebaData> dataList = (List<TiebaData>) dataMap.get("data"); List<TiebaData> dataList = (List<TiebaData>) dataMap.get("data");
list.addAll(dataList); list.addAll(dataList);
more = (Boolean) dataMap.get("more"); more = (Boolean) dataMap.get("more");
} else {
more = false;
} }
page++; page++;
if(DataCrawler.sleepTime!=null){ } catch (Exception e) {
ZhiWeiTools.sleep(DataCrawler.sleepTime); logger.error("百度贴吧数据获取失败", e);
} }
// //最大页数为75页
// if (page > 20) {
// more = false;
// }
} }
return list; return list;
} }
...@@ -85,7 +127,7 @@ public class BaiduTiebaCrawlerParse { ...@@ -85,7 +127,7 @@ public class BaiduTiebaCrawlerParse {
public static Map<String,Object> getBaiduTiebaData(String word, Proxy proxy, String tiebaName,int page) throws Exception { public static Map<String,Object> getBaiduTiebaData(String word, Proxy proxy, String tiebaName,int page) throws Exception {
String htmlBody = downloadHtml(word, proxy, tiebaName, page); String htmlBody = downloadHtml(word, proxy, tiebaName, page);
if (htmlBody != null) { if (htmlBody != null) {
return analysisData(htmlBody, proxy, word); return analysisData(htmlBody, proxy, word, null);
} }
return null; return null;
} }
...@@ -270,6 +312,9 @@ public class BaiduTiebaCrawlerParse { ...@@ -270,6 +312,9 @@ public class BaiduTiebaCrawlerParse {
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址 // 获取链接地址
String url = getUrl(word, tiebaName, page); String url = getUrl(word, tiebaName, page);
logger.info("采集进度 {} === {}", word , url);
headerMap.put("Host", "tieba.baidu.com"); headerMap.put("Host", "tieba.baidu.com");
headerMap.put("Referer", url); headerMap.put("Referer", url);
// 下载数据页面 // 下载数据页面
...@@ -283,11 +328,9 @@ public class BaiduTiebaCrawlerParse { ...@@ -283,11 +328,9 @@ public class BaiduTiebaCrawlerParse {
} }
return response.body().string(); return response.body().string();
} catch (Exception e) { } catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取数据时出现问题", e);
if(i==3){ if(i==3){
throw e; throw e;
}else{
continue;
} }
} }
} }
...@@ -306,7 +349,7 @@ public class BaiduTiebaCrawlerParse { ...@@ -306,7 +349,7 @@ public class BaiduTiebaCrawlerParse {
* @param @throws Exception 设定文件 * @param @throws Exception 设定文件
* @return Map<String,Object> 返回类型 * @return Map<String,Object> 返回类型
*/ */
private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{ private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word, String startTime) throws Exception{
Map<String, Object> resultMap = new HashMap<String, Object>(); Map<String, Object> resultMap = new HashMap<String, Object>();
List<TiebaData> list = new ArrayList<TiebaData>(); List<TiebaData> list = new ArrayList<TiebaData>();
boolean more = true; boolean more = true;
...@@ -338,16 +381,21 @@ public class BaiduTiebaCrawlerParse { ...@@ -338,16 +381,21 @@ public class BaiduTiebaCrawlerParse {
try { try {
author = element.select("a").select("font.p_violet").text().split(" ")[1]; author = element.select("a").select("font.p_violet").text().split(" ")[1];
time = element.select("font.p_date").text(); time = element.select("font.p_date").text();
long artTime = TimeParse.stringFormartDate(time).getTime();//文章时间
long star = TimeParse.stringFormartDate(startTime).getTime();//采集开始时间
if(artTime < star) {
more = false;
break;
}
TiebaData tiebaData = new TiebaData(link, title, time, tid, source, author, content, word); TiebaData tiebaData = new TiebaData(link, title, time, tid, source, author, content, word);
list.add(tiebaData); list.add(tiebaData);
}catch (Exception e) { }catch (Exception e) {
logger.debug("无作者 或者 无来源"); logger.debug("无作者 或者 无来源");
continue;
} }
} }
if(elementes.size()==0){
more = false;
}
resultMap.put("data", list); resultMap.put("data", list);
resultMap.put("more", more); resultMap.put("more", more);
return resultMap; return resultMap;
......
package com.zhiwei.media_data_crawler.crawler;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.media_data_crawler.entity.ZhihuAnswer;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * Ad-hoc export tool for Zhihu answers: for every keyword read from a text file it
 * collects the answers returned by {@code ZhihuAnwserCrawlerParse.getPictureCount}
 * (including image count and rank) and writes them to an Excel sheet.
 *
 * @author xMx
 * @date 2019-10-19 11:01:29
 */
public class CrawlerTest {

    /**
     * Initialises the proxy factory, crawls every keyword and exports the rows.
     *
     * @param args unused
     * @throws Exception if crawling or the Excel export fails
     */
    public static void main(String[] args) throws Exception {
        // Proxy registry settings
        String address = "zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181";
        String appName = "xumaioxin";
        long appId = 10000008L;
        ProxyFactory.init(
                SimpleConfig.builder()
                        .registry(address)
                        .appName(appName)
                        .appId(appId)
                        .group("local")
                        .build());

        String wordFileName = "D://crawlerdata/关键词5.txt";
        String dataFileName = "D://crawlerdata/知乎2.xlsx";
        // Only referenced by the alternative call kept in the comment below.
        String endTime = "1970-01-01 23:59:59";

        List<String> keywords = WordsReadFile.getWords(wordFileName);
        List<Map<String, Object>> rows = new ArrayList<>();
        for (String keyword : keywords) {
            // Alternative source, kept for reference:
            // ZhihuAnwserCrawlerParse.getAnswerList(keyword, TimeParse.stringFormartDate(endTime), ProxyHolder.NAT_HEAVY_PROXY);
            List<ZhihuAnswer> answers = ZhihuAnwserCrawlerParse.getPictureCount(keyword);
            for (ZhihuAnswer answer : answers) {
                rows.add(toRow(answer));
            }
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel(dataFileName, "数据", columnHeaders(), rows);
        System.out.println("导出成功");
    }

    /** Maps one answer onto a row keyed by the exported column headers. */
    private static Map<String, Object> toRow(ZhihuAnswer answer) {
        Map<String, Object> row = new HashMap<>();
        row.put("地址", answer.getUrl());
        row.put("问题地址", answer.getFrom_url());
        row.put("标题", answer.getTitle());
        row.put("时间", answer.getTime());
        row.put("发布者", answer.getAuthor());
        row.put("作者地址", answer.getAuthorUrl());
        row.put("内容", answer.getContent());
        row.put("回答点赞数", answer.getAttitudes_count());
        row.put("回答评论数", answer.getComment_count());
        row.put("问题点赞数", answer.getFollow_count());
        row.put("问题评论数", answer.getBord_count());
        row.put("图片数量", answer.getImgCount());
        row.put("排名", answer.getSort());
        return row;
    }

    /** Builds the column headers in the order they are passed to the export. */
    private static List<String> columnHeaders() {
        List<String> headers = new ArrayList<>();
        headers.add("地址");
        headers.add("问题地址");
        headers.add("标题");
        headers.add("时间");
        headers.add("发布者");
        headers.add("作者地址");
        headers.add("内容");
        headers.add("回答点赞数");
        headers.add("回答评论数");
        headers.add("问题点赞数");
        headers.add("问题评论数");
        headers.add("图片数量");
        headers.add("排名");
        return headers;
    }
}
...@@ -17,6 +17,7 @@ import com.zhiwei.crawler.utils.RequestUtils; ...@@ -17,6 +17,7 @@ import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.media_data_crawler.entity.JianshuUser; import com.zhiwei.media_data_crawler.entity.JianshuUser;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
import okhttp3.MediaType;
import okhttp3.Response; import okhttp3.Response;
/** /**
...@@ -43,7 +44,7 @@ public class JianshuCrawler { ...@@ -43,7 +44,7 @@ public class JianshuCrawler {
headers.put("origin", "https://www.jianshu.com"); headers.put("origin", "https://www.jianshu.com");
headers.put("accept", "application/json"); headers.put("accept", "application/json");
headers.put("user-agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"); headers.put("user-agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");
try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(url,headers,null), ProxyHolder.NAT_HEAVY_PROXY)){ try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(url,okhttp3.RequestBody.create(MediaType.parse("application/json"), headers.toString())), ProxyHolder.NAT_HEAVY_PROXY)){
String result = response.body().string(); String result = response.body().string();
System.out.println(result); System.out.println(result);
if(result.contains("搜索过于频繁")) { if(result.contains("搜索过于频繁")) {
......
package com.zhiwei.media_data_crawler.crawler;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Reads crawl keywords from a plain-text file, one keyword per line.
 */
public class WordsReadFile {

    private static final Logger logger = LoggerFactory.getLogger(WordsReadFile.class);

    /**
     * Reads every non-empty line of a GBK-encoded text file as one keyword.
     *
     * @param wordFileName full path of the keyword file
     * @return the non-empty lines in file order; an empty, unmodifiable list when
     *         the file cannot be opened or read
     */
    public static List<String> getWords(String wordFileName) {
        List<String> list = new ArrayList<String>();
        // try-with-resources closes the stream and the reader even when readLine()
        // fails half-way; the previous explicit close() was skipped on any exception.
        try (FileInputStream in = new FileInputStream(wordFileName);
                BufferedReader br = new BufferedReader(new InputStreamReader(in, "GBK"))) {
            String line;
            while ((line = br.readLine()) != null) {
                // Blank lines are not keywords.
                if (line.length() >= 1) {
                    list.add(line);
                }
            }
            return list;
        } catch (IOException e) {
            // Logged at warn with the exception attached: a missing keyword file makes
            // the whole crawl a no-op and should be visible without debug logging.
            logger.warn("读取关键词文件失败 {}", e.getMessage(), e);
            return Collections.emptyList();
        }
    }
}
...@@ -22,6 +22,11 @@ import com.zhiwei.tools.tools.ZhiWeiTools; ...@@ -22,6 +22,11 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response; import okhttp3.Response;
/**
* 获取用户的回答列表,https://www.zhihu.com/people/xie-yu-shi-29/answers
* @author xMx
* @date 2020年3月3日 上午9:17:16
*/
public class ZhihuUserAnswerCrawlerParse { public class ZhihuUserAnswerCrawlerParse {
private static final Logger logger = LoggerFactory.getLogger(ZhihuUserAnswerCrawlerParse.class); private static final Logger logger = LoggerFactory.getLogger(ZhihuUserAnswerCrawlerParse.class);
......
...@@ -29,11 +29,15 @@ public class ZhihuAnswer implements Serializable { ...@@ -29,11 +29,15 @@ public class ZhihuAnswer implements Serializable {
private Integer bord_count; //问题评论数 private Integer bord_count; //问题评论数
private Integer imgCount; //图片数量
private Integer sort; //排名
public ZhihuAnswer(){} public ZhihuAnswer(){}
public ZhihuAnswer(String url, String from_url,String title, Date time, String author, public ZhihuAnswer(String url, String from_url,String title, Date time, String author,
String authorUrl ,String content, Integer attitudes_count, String authorUrl ,String content, Integer attitudes_count,
Integer comment_count,Integer follow_count,Integer bord_count){ Integer comment_count,Integer follow_count,Integer bord_count, Integer imgCount, Integer sort){
this.url = url; this.url = url;
this.from_url = from_url; this.from_url = from_url;
this.title = title; this.title = title;
...@@ -45,7 +49,8 @@ public class ZhihuAnswer implements Serializable { ...@@ -45,7 +49,8 @@ public class ZhihuAnswer implements Serializable {
this.comment_count = comment_count; this.comment_count = comment_count;
this.follow_count = follow_count; this.follow_count = follow_count;
this.bord_count = bord_count; this.bord_count = bord_count;
this.imgCount = imgCount;
this.sort = sort;
} }
@Override @Override
...@@ -62,6 +67,8 @@ public class ZhihuAnswer implements Serializable { ...@@ -62,6 +67,8 @@ public class ZhihuAnswer implements Serializable {
", comment_count=" + comment_count + ", comment_count=" + comment_count +
", follow_count=" + follow_count + ", follow_count=" + follow_count +
", bord_count=" + bord_count + ", bord_count=" + bord_count +
", imgCount=" + imgCount +
", sort=" + sort +
'}'; '}';
} }
...@@ -149,6 +156,22 @@ public class ZhihuAnswer implements Serializable { ...@@ -149,6 +156,22 @@ public class ZhihuAnswer implements Serializable {
this.bord_count = bord_count; this.bord_count = bord_count;
} }
/** Returns the image count stored in {@code imgCount}; may be {@code null} if never set. */
public Integer getImgCount() {
return imgCount;
}
/** Sets the image count stored in {@code imgCount}. */
public void setImgCount(Integer imgCount) {
this.imgCount = imgCount;
}
/** Returns the rank stored in {@code sort}; may be {@code null} if never set. */
public Integer getSort() {
return sort;
}
/** Sets the rank stored in {@code sort}. */
public void setSort(Integer sort) {
this.sort = sort;
}
public void setComment_count(Integer comment_count) { public void setComment_count(Integer comment_count) {
this.comment_count = comment_count; this.comment_count = comment_count;
} }
......
package com.zhiwei.media_data_crawler.excelentity;
import com.alibaba.excel.annotation.ExcelProperty;
/**
 * Row model for EasyExcel export: each field is one column and its
 * {@code @ExcelProperty} value is the column header.
 *
 * @author xMx
 * @date 2019-10-29 09:15:40
 */
public class DataExcel {

    /** Address (URL) column; pinned to column index 0. */
    @ExcelProperty(value = "地址",index = 0)
    private String url;

    /** Title column. */
    @ExcelProperty("标题")
    private String title;

    /** Time column, kept as text. */
    @ExcelProperty("时间")
    private String time;

    /** tid column. */
    @ExcelProperty("tid")
    private String tid;

    /** Source column. */
    @ExcelProperty("来源")
    private String source;

    /** Replier or original-poster column. */
    @ExcelProperty("回复者或楼主")
    private String author;

    /** Reply-content column. */
    @ExcelProperty("回复内容")
    private String content;

    /** Keyword column. */
    @ExcelProperty("关键词")
    private String word;

    /** @return the address (URL) */
    public String getUrl() {
        return this.url;
    }

    /** @param url the address (URL) */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the time text */
    public String getTime() {
        return this.time;
    }

    /** @param time the time text */
    public void setTime(String time) {
        this.time = time;
    }

    /** @return the tid */
    public String getTid() {
        return this.tid;
    }

    /** @param tid the tid */
    public void setTid(String tid) {
        this.tid = tid;
    }

    /** @return the source */
    public String getSource() {
        return this.source;
    }

    /** @param source the source */
    public void setSource(String source) {
        this.source = source;
    }

    /** @return the replier or original poster */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the replier or original poster */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the reply content */
    public String getContent() {
        return this.content;
    }

    /** @param content the reply content */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the keyword */
    public String getWord() {
        return this.word;
    }

    /** @param word the keyword */
    public void setWord(String word) {
        this.word = word;
    }
}
...@@ -7,8 +7,10 @@ ...@@ -7,8 +7,10 @@
//import java.util.List; //import java.util.List;
//import java.util.Map; //import java.util.Map;
// //
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.media_data_crawler.data.DataCrawler; //import com.zhiwei.media_data_crawler.data.DataCrawler;
//import com.zhiwei.media_data_crawler.entity.LunTanData;
//import com.zhiwei.media_data_crawler.entity.ZhiHuData; //import com.zhiwei.media_data_crawler.entity.ZhiHuData;
//import com.zhiwei.tools.timeparse.TimeParse; //import com.zhiwei.tools.timeparse.TimeParse;
// //
...@@ -24,7 +26,7 @@ ...@@ -24,7 +26,7 @@
// String word = "58同城"; //关键词 // String word = "58同城"; //关键词
// String startTime = "2018-10-23 23:00:00"; //开始时间 // String startTime = "2018-10-23 23:00:00"; //开始时间
// String endTime = "2018-10-23 23:59:59"; //结束时间 // String endTime = "2018-10-23 23:59:59"; //结束时间
// Proxy proxy = null; //代理IP,不用可不填写 // ProxyHolder proxy = null; //代理IP,不用可不填写
// try { // try {
//// //百度新闻采集demo //// //百度新闻采集demo
//// List<NewsData> list = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy); //// List<NewsData> list = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
...@@ -35,8 +37,8 @@ ...@@ -35,8 +37,8 @@
//// //Baidu貼吧採集 //// //Baidu貼吧採集
//// String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null //// String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null
//// List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName); //// List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
//// //天涯论坛采集 // //天涯论坛采集
//// List<LunTanData> list = DataCrawler.getLunTanData(word, proxy, endTime); // List<LunTanData> list = DataCrawler.getLunTanData(word, proxy, startTime, endTime);
// //豆瓣采集 // //豆瓣采集
//// String type = "topic"; //topic 为指定话题采集,note为指定日记采集 //// String type = "topic"; //topic 为指定话题采集,note为指定日记采集
//// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy); //// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
...@@ -62,7 +64,7 @@ ...@@ -62,7 +64,7 @@
// //
// for(int i=0;i<words.length;i++){ // for(int i=0;i<words.length;i++){
// System.out.println(words[i]+" 开始采集"); // System.out.println(words[i]+" 开始采集");
// List<ZhiHuData> zhihuList = DataCrawler.getZhihuByWord(words[i], "a_week",endDate, proxy); // List<ZhiHuData> zhihuList = DataCrawler.getZhihuByWord(words[i], "a_week",endDate, null);
// System.out.println(words[i]+"=============="+zhihuList.size()); // System.out.println(words[i]+"=============="+zhihuList.size());
// for(ZhiHuData zhiHuData : zhihuList) { // for(ZhiHuData zhiHuData : zhihuList) {
// Map<String,Object> map = new HashMap<String,Object>(); // Map<String,Object> map = new HashMap<String,Object>();
...@@ -90,14 +92,4 @@ ...@@ -90,14 +92,4 @@
// } // }
// } // }
// //
//
//
//
//
//
//
//
//
//
//
//} //}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment