Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
87ffee30
Commit
87ffee30
authored
Dec 14, 2018
by
[zhangzhiwei]
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改核心包版本
parent
7dd71cb4
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
109 additions
and
108 deletions
+109
-108
pom.xml
+6
-5
src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
+103
-103
No files found.
pom.xml
View file @
87ffee30
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
media_data_crawler
</artifactId>
<artifactId>
media_data_crawler
</artifactId>
<version>
0.0.
5
-SNAPSHOT
</version>
<version>
0.0.
6
-SNAPSHOT
</version>
<name>
media_data_crawler
</name>
<name>
media_data_crawler
</name>
<description>
网媒数据抓取,包含百度新闻、搜狗新闻、360新闻、知乎回答列表等
</description>
<description>
网媒数据抓取,包含百度新闻、搜狗新闻、360新闻、知乎回答列表等
</description>
...
@@ -65,12 +65,12 @@
...
@@ -65,12 +65,12 @@
<dependency>
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.
0.9
-SNAPSHOT
</version>
<version>
0.
1.1
-SNAPSHOT
</version>
</dependency>
</dependency>
<dependency>
<dependency>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
.crawler
</groupId>
<artifactId>
excelpoi
</artifactId>
<artifactId>
crawler-core
</artifactId>
<version>
0.
0.1-SNAPSHOT
</version>
<version>
0.
1.1-RELEASE
</version>
</dependency>
</dependency>
</dependencies>
</dependencies>
</project>
</project>
\ No newline at end of file
src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
View file @
87ffee30
package
com
.
zhiwei
.
media_data_crawler
.
test
;
//
package com.zhiwei.media_data_crawler.test;
//
import
java.net.Proxy
;
//
import java.net.Proxy;
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.Date
;
//
import java.util.Date;
import
java.util.HashMap
;
//
import java.util.HashMap;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
//
import com.zhiwei.media_data_crawler.data.DataCrawler;
import
com.zhiwei.media_data_crawler.entity.ZhiHuData
;
//
import com.zhiwei.media_data_crawler.entity.ZhiHuData;
import
com.zhiwei.tools.timeparse.TimeParse
;
//
import com.zhiwei.tools.timeparse.TimeParse;
//
public
class
DataCrawlerTest
{
//
public class DataCrawlerTest {
//
public
static
void
main
(
String
[]
args
)
{
//
public static void main(String[] args) {
DataCrawlerTest
.
getSoNewsTest
();
//
DataCrawlerTest.getSoNewsTest();
}
//
}
//
//
//
public
static
void
getSoNewsTest
(){
//
public static void getSoNewsTest(){
String
word
=
"58同城"
;
//关键词
//
String word = "58同城"; //关键词
String
startTime
=
"2018-10-23 23:00:00"
;
//开始时间
//
String startTime = "2018-10-23 23:00:00"; //开始时间
String
endTime
=
"2018-10-23 23:59:59"
;
//结束时间
//
String endTime = "2018-10-23 23:59:59"; //结束时间
Proxy
proxy
=
null
;
//代理IP,不用可不填写
//
Proxy proxy = null; //代理IP,不用可不填写
try
{
//
try {
// //百度新闻采集demo
//
//
//百度新闻采集demo
// List<NewsData> list = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
//
//
List<NewsData> list = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
// //搜狗新闻关键词采集demo
//
//
//搜狗新闻关键词采集demo
// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
//
//
List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
// //360新闻采集demo
//
//
//360新闻采集demo
// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
//
//
List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
// //Baidu貼吧採集
//
//
//Baidu貼吧採集
// String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null
//
//
String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null
// List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
//
//
List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
// //天涯论坛采集
//
//
//天涯论坛采集
// List<LunTanData> list = DataCrawler.getLunTanData(word, proxy, endTime);
//
//
List<LunTanData> list = DataCrawler.getLunTanData(word, proxy, endTime);
//豆瓣采集
//
//豆瓣采集
// String type = "topic"; //topic 为指定话题采集,note为指定日记采集
//
//
String type = "topic"; //topic 为指定话题采集,note为指定日记采集
// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
//
//
List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
// List<NewsData> list = DataCrawler.getSoData("京东", "www.toutiao.com", "d", proxy);
//
//
List<NewsData> list = DataCrawler.getSoData("京东", "www.toutiao.com", "d", proxy);
//
Date
endDate
=
TimeParse
.
stringFormartDate
(
endTime
);
//
Date endDate = TimeParse.stringFormartDate(endTime);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
//
List<Map<String,Object>> dataList = new ArrayList<>();
List
<
String
>
headList
=
new
ArrayList
<>();
//
List<String> headList = new ArrayList<>();
headList
.
add
(
"url"
);
//
headList.add("url");
headList
.
add
(
"title"
);
//
headList.add("title");
headList
.
add
(
"pt"
);
//
headList.add("pt");
headList
.
add
(
"type"
);
//
headList.add("type");
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"source"
);
//
headList.add("source");
headList
.
add
(
"content"
);
//
headList.add("content");
headList
.
add
(
"attitudes_count"
);
//
headList.add("attitudes_count");
headList
.
add
(
"answer_count"
);
//
headList.add("answer_count");
headList
.
add
(
"comment_count"
);
//
headList.add("comment_count");
headList
.
add
(
"word"
);
//
headList.add("word");
//搜狗知乎采集
//
//搜狗知乎采集
String
[]
words
=
word
.
split
(
"\\|"
);
//
String[] words = word.split("\\|");
//
for
(
int
i
=
0
;
i
<
words
.
length
;
i
++){
//
for(int i=0;i<words.length;i++){
System
.
out
.
println
(
words
[
i
]+
" 开始采集"
);
//
System.out.println(words[i]+" 开始采集");
List
<
ZhiHuData
>
zhihuList
=
DataCrawler
.
getZhihuByWord
(
words
[
i
],
"a_week"
,
endDate
,
proxy
);
//
List<ZhiHuData> zhihuList = DataCrawler.getZhihuByWord(words[i], "a_week",endDate, proxy);
System
.
out
.
println
(
words
[
i
]+
"=============="
+
zhihuList
.
size
());
//
System.out.println(words[i]+"=============="+zhihuList.size());
for
(
ZhiHuData
zhiHuData
:
zhihuList
)
{
//
for(ZhiHuData zhiHuData : zhihuList) {
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
//
Map<String,Object> map = new HashMap<String,Object>();
map
.
put
(
"url"
,
zhiHuData
.
getUrl
());
//
map.put("url", zhiHuData.getUrl());
map
.
put
(
"title"
,
zhiHuData
.
getTitle
());
//
map.put("title", zhiHuData.getTitle());
map
.
put
(
"pt"
,
zhiHuData
.
getPt
());
//
map.put("pt", zhiHuData.getPt());
map
.
put
(
"type"
,
zhiHuData
.
getType
());
//
map.put("type", zhiHuData.getType());
map
.
put
(
"time"
,
zhiHuData
.
getTime
());
//
map.put("time", zhiHuData.getTime());
map
.
put
(
"source"
,
zhiHuData
.
getSource
());
//
map.put("source", zhiHuData.getSource());
map
.
put
(
"content"
,
zhiHuData
.
getContent
());
//
map.put("content", zhiHuData.getContent());
map
.
put
(
"attitudes_count"
,
zhiHuData
.
getAttitudes_count
());
//
map.put("attitudes_count", zhiHuData.getAttitudes_count());
map
.
put
(
"answer_count"
,
zhiHuData
.
getAnswer_count
());
//
map.put("answer_count", zhiHuData.getAnswer_count());
map
.
put
(
"comment_count"
,
zhiHuData
.
getComment_count
());
//
map.put("comment_count", zhiHuData.getComment_count());
map
.
put
(
"word"
,
zhiHuData
.
getWord
());
//
map.put("word", zhiHuData.getWord());
dataList
.
add
(
map
);
//
dataList.add(map);
}
//
}
//
}
//
}
//
poi
.
exportExcel
(
"F://知乎数据采集.xlsx"
,
"0"
,
headList
,
dataList
);;
//
poi.exportExcel("F://知乎数据采集.xlsx", "0", headList, dataList);;
//
}
catch
(
Exception
e
)
{
//
} catch (Exception e) {
// TODO Auto-generated catch block
//
// TODO Auto-generated catch block
e
.
printStackTrace
();
//
e.printStackTrace();
}
//
}
}
//
}
//
//
//
//
//
//
//
//
//
//
//
}
//
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment