Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
f2fc1084
Commit
f2fc1084
authored
Apr 13, 2020
by
win 10
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
删除mongo和excel的pom依赖
parent
88e4e8c0
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
0 additions
and
301 deletions
+0
-301
pom.xml
+0
-14
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
+0
-33
src/main/java/com/zhiwei/media_data_crawler/crawler/CrawlerTest.java
+0
-80
src/main/java/com/zhiwei/media_data_crawler/excelentity/DataExcel.java
+0
-100
src/test/java/com/zhiwei/media_data_crawler/test/GetTiayaDataTest.java
+0
-74
No files found.
pom.xml
View file @
f2fc1084
...
...
@@ -19,20 +19,6 @@
<version>
0.6.1.0-SNAPSHOT
</version>
<scope>
provided
</scope>
</dependency>
<!-- excel导出 -->
<dependency>
<groupId>
com.zhiwei
</groupId>
<artifactId>
excelpoi
</artifactId>
<version>
0.0.3-SNAPSHOT
</version>
<scope>
provided
</scope>
</dependency>
<dependency>
<groupId>
com.alibaba
</groupId>
<artifactId>
easyexcel
</artifactId>
<version>
2.0.0-beta3
</version>
<scope>
provided
</scope>
</dependency>
</dependencies>
<!-- 打包管理 -->
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
View file @
f2fc1084
...
...
@@ -22,7 +22,6 @@ import com.zhiwei.crawler.proxy.ProxyHolder;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.TiebaData
;
import
com.zhiwei.media_data_crawler.excelentity.DataExcel
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.timeparse.TimeParse
;
...
...
@@ -40,38 +39,6 @@ public class BaiduTiebaCrawlerParse {
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
Logger
logger
=
LogManager
.
getLogger
(
BaiduTiebaCrawlerParse
.
class
);
// public static void main(String[] args) {
// ProxyFactory.init(SimpleConfig.builder().registry("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181")
// .appName("xumiaoxin").appId(10000008).group("local").build());
//
// List<DataExcel> bodyList = new ArrayList<>();
//
// try {
// List<String> wordList = WordsReadFile.getWords("D:\\crawlerdata\\关键词6.txt");
// for(String s:wordList) {
// List<TiebaData> dataList = getBaiduTiebaData(s, null, null);
// dataList.forEach(data -> {
// DataExcel dataExcel = new DataExcel();
// dataExcel.setAuthor(data.getAuthor());
// dataExcel.setContent(data.getContent());
// dataExcel.setSource(data.getSource());
// dataExcel.setTid(data.getTid());
// dataExcel.setTime(data.getTime());
// dataExcel.setTitle(data.getTitle());
// dataExcel.setUrl(data.getUrl());
// dataExcel.setWord(data.getWord());
//
// bodyList.add(dataExcel);
// });
// }
// } catch (Exception e) {
// e.toString();
// }
//
// EasyExcel.write("D:\\crawlerdata\\百度贴吧-花木兰2.xlsx", DataExcel.class).sheet("数据").doWrite(bodyList);
// System.out.println("导出成功");
// }
/**
* @Title: getBaiduTiebaData
* @author hero
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/CrawlerTest.java
deleted
100644 → 0
View file @
88e4e8c0
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.media_data_crawler.entity.ZhihuAnswer
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.tools.timeparse.TimeParse
;
/**
 * Exports Zhihu answer data (picture counts and answer ranking) for a list of
 * keywords to an Excel file.
 *
 * @author xMx
 * @date 2019-10-19 11:01:29
 */
public class CrawlerTest {

    // Column headers, declared once so the row keys and the header list cannot drift apart.
    private static final String COL_URL = "地址";
    private static final String COL_QUESTION_URL = "问题地址";
    private static final String COL_TITLE = "标题";
    private static final String COL_TIME = "时间";
    private static final String COL_AUTHOR = "发布者";
    private static final String COL_AUTHOR_URL = "作者地址";
    private static final String COL_CONTENT = "内容";
    private static final String COL_ANSWER_LIKES = "回答点赞数";
    private static final String COL_ANSWER_COMMENTS = "回答评论数";
    private static final String COL_QUESTION_LIKES = "问题点赞数";
    private static final String COL_QUESTION_COMMENTS = "问题评论数";
    private static final String COL_IMAGE_COUNT = "图片数量";
    private static final String COL_RANK = "排名";

    /** Headers in the order they are handed to the Excel exporter. */
    private static final String[] COLUMN_ORDER = {
        COL_URL,
        COL_QUESTION_URL,
        COL_TITLE,
        COL_TIME,
        COL_AUTHOR,
        COL_AUTHOR_URL,
        COL_CONTENT,
        COL_ANSWER_LIKES,
        COL_ANSWER_COMMENTS,
        COL_QUESTION_LIKES,
        COL_QUESTION_COMMENTS,
        COL_IMAGE_COUNT,
        COL_RANK
    };

    /**
     * Initialises the proxy factory, reads the keyword file, collects the
     * answers returned by {@code ZhihuAnwserCrawlerParse.getPictureCount} for
     * every keyword and writes them to an Excel sheet named "数据".
     *
     * @param args unused
     * @throws Exception propagated unchanged from the keyword reader, crawler or exporter
     */
    public static void main(String[] args) throws Exception {
        // Proxy registry address
        String address = "zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181";
        String appName = "xumaioxin";
        long appId = 10000008L;
        ProxyFactory.init(
                SimpleConfig.builder().registry(address).appName(appName).appId(appId).group("local").build());

        String wordFileName = "D://crawlerdata/关键词5.txt";
        String dataFileName = "D://crawlerdata/知乎2.xlsx";

        List<String> wordList = WordsReadFile.getWords(wordFileName);
        List<Map<String, Object>> resultList = new ArrayList<>();
        for (String word : wordList) {
            // Note: a previous variant fetched answers with
            // ZhihuAnwserCrawlerParse.getAnswerList(word, <end time>, ProxyHolder.NAT_HEAVY_PROXY)
            // instead of getPictureCount; it was already disabled and has been dropped here.
            List<ZhihuAnswer> answers = ZhihuAnwserCrawlerParse.getPictureCount(word);
            for (ZhihuAnswer answer : answers) {
                resultList.add(toRow(answer));
            }
        }

        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        poi.exportExcel(dataFileName, "数据", buildHeaders(), resultList);
        System.out.println("导出成功");
    }

    /** Converts one answer into a header-keyed row for the exporter. */
    private static Map<String, Object> toRow(ZhihuAnswer z) {
        Map<String, Object> row = new HashMap<>();
        row.put(COL_URL, z.getUrl());
        row.put(COL_QUESTION_URL, z.getFrom_url());
        row.put(COL_TITLE, z.getTitle());
        row.put(COL_TIME, z.getTime());
        row.put(COL_AUTHOR, z.getAuthor());
        row.put(COL_AUTHOR_URL, z.getAuthorUrl());
        row.put(COL_CONTENT, z.getContent());
        row.put(COL_ANSWER_LIKES, z.getAttitudes_count());
        row.put(COL_ANSWER_COMMENTS, z.getComment_count());
        // NOTE(review): the header text means "question like count" but the value
        // comes from getFollow_count() - confirm this mapping is intended.
        row.put(COL_QUESTION_LIKES, z.getFollow_count());
        // NOTE(review): header means "question comment count"; value is getBord_count() - confirm.
        row.put(COL_QUESTION_COMMENTS, z.getBord_count());
        row.put(COL_IMAGE_COUNT, z.getImgCount());
        row.put(COL_RANK, z.getSort());
        return row;
    }

    /** Builds a fresh mutable header list in {@link #COLUMN_ORDER} order. */
    private static List<String> buildHeaders() {
        List<String> headList = new ArrayList<>();
        for (String column : COLUMN_ORDER) {
            headList.add(column);
        }
        return headList;
    }
}
src/main/java/com/zhiwei/media_data_crawler/excelentity/DataExcel.java
deleted
100644 → 0
View file @
88e4e8c0
package
com
.
zhiwei
.
media_data_crawler
.
excelentity
;
import
com.alibaba.excel.annotation.ExcelProperty
;
/**
 * Export row whose fields carry the column titles used by the Excel writer
 * (via {@code @ExcelProperty}).
 *
 * @author xMx
 * @date 2019-10-29 09:15:40
 */
public class DataExcel {

    /** Column "地址" (URL); the only field that also pins an explicit column index. */
    @ExcelProperty(value = "地址", index = 0)
    private String url;

    /** Column "标题" (title). */
    @ExcelProperty(value = "标题")
    private String title;

    /** Column "时间" (time). */
    @ExcelProperty(value = "时间")
    private String time;

    /** Column "tid". */
    @ExcelProperty(value = "tid")
    private String tid;

    /** Column "来源" (source). */
    @ExcelProperty(value = "来源")
    private String source;

    /** Column "回复者或楼主" (replier or original poster). */
    @ExcelProperty(value = "回复者或楼主")
    private String author;

    /** Column "回复内容" (reply content). */
    @ExcelProperty(value = "回复内容")
    private String content;

    /** Column "关键词" (keyword). */
    @ExcelProperty(value = "关键词")
    private String word;

    // ---- url ----

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    // ---- title ----

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    // ---- time ----

    public String getTime() {
        return this.time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    // ---- tid ----

    public String getTid() {
        return this.tid;
    }

    public void setTid(String tid) {
        this.tid = tid;
    }

    // ---- source ----

    public String getSource() {
        return this.source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    // ---- author ----

    public String getAuthor() {
        return this.author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    // ---- content ----

    public String getContent() {
        return this.content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    // ---- word ----

    public String getWord() {
        return this.word;
    }

    public void setWord(String word) {
        this.word = word;
    }
}
src/test/java/com/zhiwei/media_data_crawler/test/GetTiayaDataTest.java
deleted
100644 → 0
View file @
88e4e8c0
package
com
.
zhiwei
.
media_data_crawler
.
test
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.media_data_crawler.crawler.WordsReadFile
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.LunTanData
;
import
com.zhiwei.proxy.config.SimpleConfig
;
/**
 * Fetches Tianya forum data for a list of keywords and exports it to Excel.
 *
 * @author xMx
 * @date 2019-11-08 16:08:29
 */
public class GetTiayaDataTest {

    /**
     * Initialises the proxy factory, crawls forum data for every keyword in the
     * keyword file within the configured time window, and writes one row per
     * record to a sheet named "数据".
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Keyword file
        String wordFilePath = "D:\\crawlerdata\\关键词6.txt";
        // Output workbook
        String filePath = "D:\\crawlerdata\\天涯论坛-精装房.xlsx";
        // Start of the time window
        String startTime = "2019-01-01 00:00:00";
        // End of the time window
        String endTime = "2019-11-08 23:59:59";

        // Proxy registry address
        String address = "zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181";
        String appName = "xumaioxin";
        long appId = 10000008L;
        SimpleConfig proxyConfig =
                SimpleConfig.builder().registry(address).appName(appName).appId(appId).group("local").build();
        ProxyFactory.init(proxyConfig);

        // Crawl every keyword first; rows are built only after all crawling is done.
        List<String> keywords = WordsReadFile.getWords(wordFilePath);
        List<LunTanData> posts = new ArrayList<>();
        for (String keyword : keywords) {
            posts.addAll(DataCrawler.getLunTanData(keyword, null, startTime, endTime));
        }

        // One header-keyed map per crawled record.
        List<Map<String, Object>> rows = new ArrayList<>();
        for (LunTanData post : posts) {
            Map<String, Object> row = new HashMap<>();
            row.put("地址", post.getUrl());
            row.put("标题", post.getTitle());
            row.put("时间", post.getTime());
            row.put("来源", post.getSource());
            row.put("回复者或楼主", post.getAuthor());
            row.put("回复内容", post.getContent());
            row.put("回复数", post.getReply_count());
            row.put("平台", post.getPt());
            row.put("关键词", post.getWord());
            rows.add(row);
        }

        // Column headers, in the order passed to the exporter.
        String[] headerNames = {
            "地址", "标题", "时间", "来源", "回复者或楼主", "回复内容", "回复数", "平台", "关键词"
        };
        List<String> headers = new ArrayList<>();
        for (String headerName : headerNames) {
            headers.add(headerName);
        }

        PoiExcelUtil exporter = PoiExcelUtil.getInstance();
        exporter.exportExcel(filePath, "数据", headers, rows);
        System.out.println("导出成功");
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment