Commit 87ffee30 by [zhangzhiwei]

修改核心包版本

parent 7dd71cb4
......@@ -2,7 +2,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>media_data_crawler</artifactId>
<version>0.0.5-SNAPSHOT</version>
<version>0.0.6-SNAPSHOT</version>
<name>media_data_crawler</name>
<description>网媒数据抓取,包含百度新闻、搜狗新闻、360新闻、知乎回答列表等</description>
......@@ -65,12 +65,12 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.0.9-SNAPSHOT</version>
<version>0.1.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>excelpoi</artifactId>
<version>0.0.1-SNAPSHOT</version>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.1.1-RELEASE</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package com.zhiwei.media_data_crawler.test;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.ZhiHuData;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * Manual demo driver for the crawlers exposed by {@code DataCrawler}.
 *
 * <p>The live code path fetches Zhihu results for each keyword and exports them to an Excel
 * workbook; calls to the other crawler entry points are kept as commented-out usage samples.
 */
public class DataCrawlerTest {

    /** Runs the demo; the command-line arguments are not used. */
    public static void main(String[] args) {
        getSoNewsTest();
    }

    /**
     * Fetches Zhihu results for every keyword in a {@code |}-separated keyword string and
     * exports all collected rows to a single Excel file.
     *
     * <p>Any exception raised while parsing the end time, crawling, or exporting is caught and
     * its stack trace printed; nothing is rethrown to the caller.
     */
    public static void getSoNewsTest() {
        String word = "58同城"; // keyword(s); several keywords may be separated with '|'
        String startTime = "2018-10-23 23:00:00"; // start time (only referenced by the Baidu News demo below)
        String endTime = "2018-10-23 23:59:59"; // end time
        Proxy proxy = null; // proxy to crawl through; leave null when none is needed
        try {
//          // Baidu News crawl demo
//          List<NewsData> list = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
//          // Sogou News keyword crawl demo
//          List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
//          // 360 News crawl demo
//          List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
//          // Baidu Tieba crawl demo
//          String tiebaName = "京东"; // Tieba name; restricts the crawl to that forum, null for none
//          List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
//          // Tianya forum crawl demo
//          List<LunTanData> list = DataCrawler.getLunTanData(word, proxy, endTime);
//          // Douban crawl demo
//          String type = "topic"; // "topic" crawls a given topic, "note" crawls a given diary
//          List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
//          List<NewsData> list = DataCrawler.getSoData("京东", "www.toutiao.com", "d", proxy);

            Date endDate = TimeParse.stringFormartDate(endTime);
            PoiExcelUtil excel = PoiExcelUtil.getInstance();
            List<Map<String, Object>> rows = new ArrayList<>();
            List<String> header = headerColumns();

            // Zhihu crawl (via Sogou, per the original note), one request per keyword.
            // NOTE(review): "a_week" is presumably a time-range selector - confirm against DataCrawler.
            for (String keyword : word.split("\\|")) {
                System.out.println(keyword + " 开始采集");
                List<ZhiHuData> results = DataCrawler.getZhihuByWord(keyword, "a_week", endDate, proxy);
                System.out.println(keyword + "==============" + results.size());
                for (ZhiHuData result : results) {
                    rows.add(toRow(result));
                }
            }

            excel.exportExcel("F://知乎数据采集.xlsx", "0", header, rows);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Builds the header list handed to the exporter; the names match the keys set by {@code toRow}. */
    private static List<String> headerColumns() {
        String[] names = {
            "url", "title", "pt", "type", "time", "source", "content",
            "attitudes_count", "answer_count", "comment_count", "word"
        };
        List<String> header = new ArrayList<>();
        for (String name : names) {
            header.add(name);
        }
        return header;
    }

    /** Copies the getter values of one crawled Zhihu record into a map keyed by column name. */
    private static Map<String, Object> toRow(ZhiHuData data) {
        Map<String, Object> row = new HashMap<String, Object>();
        row.put("url", data.getUrl());
        row.put("title", data.getTitle());
        row.put("pt", data.getPt());
        row.put("type", data.getType());
        row.put("time", data.getTime());
        row.put("source", data.getSource());
        row.put("content", data.getContent());
        row.put("attitudes_count", data.getAttitudes_count());
        row.put("answer_count", data.getAnswer_count());
        row.put("comment_count", data.getComment_count());
        row.put("word", data.getWord());
        return row;
    }
}
//package com.zhiwei.media_data_crawler.test;
//
//import java.net.Proxy;
//import java.util.ArrayList;
//import java.util.Date;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.media_data_crawler.data.DataCrawler;
//import com.zhiwei.media_data_crawler.entity.ZhiHuData;
//import com.zhiwei.tools.timeparse.TimeParse;
//
//public class DataCrawlerTest {
//
// public static void main(String[] args) {
// DataCrawlerTest.getSoNewsTest();
// }
//
//
//
// public static void getSoNewsTest(){
// String word = "58同城"; //关键词
// String startTime = "2018-10-23 23:00:00"; //开始时间
// String endTime = "2018-10-23 23:59:59"; //结束时间
// Proxy proxy = null; //代理IP,不用可不填写
// try {
//// //百度新闻采集demo
//// List<NewsData> list = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
//// //搜狗新闻关键词采集demo
//// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
//// //360新闻采集demo
//// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
//// //Baidu貼吧採集
//// String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null
//// List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
//// //天涯论坛采集
//// List<LunTanData> list = DataCrawler.getLunTanData(word, proxy, endTime);
// //豆瓣采集
//// String type = "topic"; //topic 为指定话题采集,note为指定日记采集
//// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
//// List<NewsData> list = DataCrawler.getSoData("京东", "www.toutiao.com", "d", proxy);
//
// Date endDate = TimeParse.stringFormartDate(endTime);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<Map<String,Object>> dataList = new ArrayList<>();
// List<String> headList = new ArrayList<>();
// headList.add("url");
// headList.add("title");
// headList.add("pt");
// headList.add("type");
// headList.add("time");
// headList.add("source");
// headList.add("content");
// headList.add("attitudes_count");
// headList.add("answer_count");
// headList.add("comment_count");
// headList.add("word");
// //搜狗知乎采集
// String[] words = word.split("\\|");
//
// for(int i=0;i<words.length;i++){
// System.out.println(words[i]+" 开始采集");
// List<ZhiHuData> zhihuList = DataCrawler.getZhihuByWord(words[i], "a_week",endDate, proxy);
// System.out.println(words[i]+"=============="+zhihuList.size());
// for(ZhiHuData zhiHuData : zhihuList) {
// Map<String,Object> map = new HashMap<String,Object>();
// map.put("url", zhiHuData.getUrl());
// map.put("title", zhiHuData.getTitle());
// map.put("pt", zhiHuData.getPt());
// map.put("type", zhiHuData.getType());
// map.put("time", zhiHuData.getTime());
// map.put("source", zhiHuData.getSource());
// map.put("content", zhiHuData.getContent());
// map.put("attitudes_count", zhiHuData.getAttitudes_count());
// map.put("answer_count", zhiHuData.getAnswer_count());
// map.put("comment_count", zhiHuData.getComment_count());
// map.put("word", zhiHuData.getWord());
// dataList.add(map);
// }
//
// }
//
// poi.exportExcel("F://知乎数据采集.xlsx", "0", headList, dataList);;
//
// } catch (Exception e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
// }
//
//
//
//
//
//
//
//
//
//
//
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment