Commit 87ffee30 by [zhangzhiwei]

修改核心包版本

parent 7dd71cb4
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>media_data_crawler</artifactId> <artifactId>media_data_crawler</artifactId>
<version>0.0.5-SNAPSHOT</version> <version>0.0.6-SNAPSHOT</version>
<name>media_data_crawler</name> <name>media_data_crawler</name>
<description>网媒数据抓取,包含百度新闻、搜狗新闻、360新闻、知乎回答列表等</description> <description>网媒数据抓取,包含百度新闻、搜狗新闻、360新闻、知乎回答列表等</description>
...@@ -65,12 +65,12 @@ ...@@ -65,12 +65,12 @@
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.0.9-SNAPSHOT</version> <version>0.1.1-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>excelpoi</artifactId> <artifactId>crawler-core</artifactId>
<version>0.0.1-SNAPSHOT</version> <version>0.1.1-RELEASE</version>
</dependency> </dependency>
</dependencies> </dependencies>
</project> </project>
\ No newline at end of file
package com.zhiwei.media_data_crawler.test; //package com.zhiwei.media_data_crawler.test;
//
import java.net.Proxy; //import java.net.Proxy;
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.Date; //import java.util.Date;
import java.util.HashMap; //import java.util.HashMap;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.media_data_crawler.data.DataCrawler; //import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.ZhiHuData; //import com.zhiwei.media_data_crawler.entity.ZhiHuData;
import com.zhiwei.tools.timeparse.TimeParse; //import com.zhiwei.tools.timeparse.TimeParse;
//
public class DataCrawlerTest { //public class DataCrawlerTest {
//
public static void main(String[] args) { // public static void main(String[] args) {
DataCrawlerTest.getSoNewsTest(); // DataCrawlerTest.getSoNewsTest();
} // }
//
//
//
public static void getSoNewsTest(){ // public static void getSoNewsTest(){
String word = "58同城"; //关键词 // String word = "58同城"; //关键词
String startTime = "2018-10-23 23:00:00"; //开始时间 // String startTime = "2018-10-23 23:00:00"; //开始时间
String endTime = "2018-10-23 23:59:59"; //结束时间 // String endTime = "2018-10-23 23:59:59"; //结束时间
Proxy proxy = null; //代理IP,不用可不填写 // Proxy proxy = null; //代理IP,不用可不填写
try { // try {
// //百度新闻采集demo //// //百度新闻采集demo
// List<NewsData> list = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy); //// List<NewsData> list = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
// //搜狗新闻关键词采集demo //// //搜狗新闻关键词采集demo
// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy); //// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
// //360新闻采集demo //// //360新闻采集demo
// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy); //// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
// //Baidu貼吧採集 //// //Baidu貼吧採集
// String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null //// String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null
// List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName); //// List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
// //天涯论坛采集 //// //天涯论坛采集
// List<LunTanData> list = DataCrawler.getLunTanData(word, proxy, endTime); //// List<LunTanData> list = DataCrawler.getLunTanData(word, proxy, endTime);
//豆瓣采集 // //豆瓣采集
// String type = "topic"; //topic 为指定话题采集,note为指定日记采集 //// String type = "topic"; //topic 为指定话题采集,note为指定日记采集
// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy); //// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
// List<NewsData> list = DataCrawler.getSoData("京东", "www.toutiao.com", "d", proxy); //// List<NewsData> list = DataCrawler.getSoData("京东", "www.toutiao.com", "d", proxy);
//
Date endDate = TimeParse.stringFormartDate(endTime); // Date endDate = TimeParse.stringFormartDate(endTime);
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
List<Map<String,Object>> dataList = new ArrayList<>(); // List<Map<String,Object>> dataList = new ArrayList<>();
List<String> headList = new ArrayList<>(); // List<String> headList = new ArrayList<>();
headList.add("url"); // headList.add("url");
headList.add("title"); // headList.add("title");
headList.add("pt"); // headList.add("pt");
headList.add("type"); // headList.add("type");
headList.add("time"); // headList.add("time");
headList.add("source"); // headList.add("source");
headList.add("content"); // headList.add("content");
headList.add("attitudes_count"); // headList.add("attitudes_count");
headList.add("answer_count"); // headList.add("answer_count");
headList.add("comment_count"); // headList.add("comment_count");
headList.add("word"); // headList.add("word");
//搜狗知乎采集 // //搜狗知乎采集
String[] words = word.split("\\|"); // String[] words = word.split("\\|");
//
for(int i=0;i<words.length;i++){ // for(int i=0;i<words.length;i++){
System.out.println(words[i]+" 开始采集"); // System.out.println(words[i]+" 开始采集");
List<ZhiHuData> zhihuList = DataCrawler.getZhihuByWord(words[i], "a_week",endDate, proxy); // List<ZhiHuData> zhihuList = DataCrawler.getZhihuByWord(words[i], "a_week",endDate, proxy);
System.out.println(words[i]+"=============="+zhihuList.size()); // System.out.println(words[i]+"=============="+zhihuList.size());
for(ZhiHuData zhiHuData : zhihuList) { // for(ZhiHuData zhiHuData : zhihuList) {
Map<String,Object> map = new HashMap<String,Object>(); // Map<String,Object> map = new HashMap<String,Object>();
map.put("url", zhiHuData.getUrl()); // map.put("url", zhiHuData.getUrl());
map.put("title", zhiHuData.getTitle()); // map.put("title", zhiHuData.getTitle());
map.put("pt", zhiHuData.getPt()); // map.put("pt", zhiHuData.getPt());
map.put("type", zhiHuData.getType()); // map.put("type", zhiHuData.getType());
map.put("time", zhiHuData.getTime()); // map.put("time", zhiHuData.getTime());
map.put("source", zhiHuData.getSource()); // map.put("source", zhiHuData.getSource());
map.put("content", zhiHuData.getContent()); // map.put("content", zhiHuData.getContent());
map.put("attitudes_count", zhiHuData.getAttitudes_count()); // map.put("attitudes_count", zhiHuData.getAttitudes_count());
map.put("answer_count", zhiHuData.getAnswer_count()); // map.put("answer_count", zhiHuData.getAnswer_count());
map.put("comment_count", zhiHuData.getComment_count()); // map.put("comment_count", zhiHuData.getComment_count());
map.put("word", zhiHuData.getWord()); // map.put("word", zhiHuData.getWord());
dataList.add(map); // dataList.add(map);
} // }
//
} // }
//
poi.exportExcel("F://知乎数据采集.xlsx", "0", headList, dataList);; // poi.exportExcel("F://知乎数据采集.xlsx", "0", headList, dataList);;
//
} catch (Exception e) { // } catch (Exception e) {
// TODO Auto-generated catch block // // TODO Auto-generated catch block
e.printStackTrace(); // e.printStackTrace();
} // }
} // }
//
//
//
//
//
//
//
//
//
//
//
} //}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment