Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
88e4e8c0
Commit
88e4e8c0
authored
Apr 13, 2020
by
win 10
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
天涯论坛添加采集开始时间,知乎添加图片量采集
parent
ed4f527e
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
407 additions
and
47 deletions
+407
-47
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduInforCrawlerParse.java
+63
-1
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
+77
-29
src/main/java/com/zhiwei/media_data_crawler/crawler/CrawlerTest.java
+80
-0
src/main/java/com/zhiwei/media_data_crawler/crawler/JianshuCrawler.java
+2
-1
src/main/java/com/zhiwei/media_data_crawler/crawler/WordsReadFile.java
+49
-0
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuUserAnswerCrawlerParse.java
+5
-0
src/main/java/com/zhiwei/media_data_crawler/entity/ZhihuAnswer.java
+25
-2
src/main/java/com/zhiwei/media_data_crawler/excelentity/DataExcel.java
+100
-0
src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
+6
-14
No files found.
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduInforCrawlerParse.java
View file @
88e4e8c0
...
...
@@ -95,6 +95,68 @@ public class BaiduInforCrawlerParse {
return
list
;
}
/**
 * Collects Baidu news results for one search expression from the first ten
 * result pages and merges everything that could be parsed.
 *
 * <p>One task per page is handed to {@code TaskBoot.blockingAsync}; a
 * {@code GroupSync} is used to wait until every task has signalled completion
 * before the merged list is returned. A page that cannot be downloaded or
 * parsed is reported on {@code System.err} and skipped, so a single bad page
 * does not discard the results of the others.
 *
 * @param word     search expression used to build each page URL
 * @param endTime  time bound passed to {@code getUrl} when building each page URL
 * @param saveWord keyword passed to {@code analysisData} together with the page body
 * @return the items parsed from all pages that succeeded, in task completion
 *         order; never {@code null}
 * @throws Exception propagated from building a page URL, submitting a task or
 *                   waiting for the tasks to finish
 */
@SuppressWarnings("unchecked")
public static List<NewsData> getBaiduInforDataManyWord(String word, String endTime, String saveWord) throws Exception {
    // Number of result pages requested per search expression.
    final int pageCount = 10;
    List<NewsData> list = new ArrayList<>();
    GroupSync groupSync = new GroupSync();
    for (int i = 0; i < pageCount; i++) {
        String url = getUrl(word, i, endTime);
        groupSync.add();
        TaskBoot.blockingAsync(() -> {
            try {
                String htmlBody = downloadHtml(url);
                if (htmlBody != null) {
                    Map<String, Object> dataMap = analysisData(htmlBody, saveWord);
                    List<NewsData> dataList = (List<NewsData>) dataMap.get("data");
                    System.out.println(url);
                    if (dataList != null) {
                        // The tasks presumably run on worker threads (TODO confirm against
                        // TaskBoot); ArrayList is not thread-safe, so guard the shared list.
                        synchronized (list) {
                            list.addAll(dataList);
                        }
                    }
                }
            } catch (Exception e) {
                // Best effort per page: report the failure instead of hiding it, then move on.
                System.err.println("Baidu news page skipped: " + url + " -> " + e);
            } finally {
                // Always signal completion, otherwise await() below would never return.
                groupSync.done();
            }
        });
    }
    groupSync.await();
    return list;
}
/**
* @Title: downloadHtml
* @author hero
...
...
@@ -303,7 +365,7 @@ public class BaiduInforCrawlerParse {
//https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=%E6%B5%99%E6%B1%9F&medium=1&rn=50&gpc=stf%3D1546272000%2C1548950400%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=0
// public static void main(String[] args) throws Exception {
// String url = "http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=1&wd=%E5%A5%94%E9%A9%B0+%E6%BC%8F%E6%B1%BD%E6%B2%B9&medium=0&rn=50&gpc=stf%3D0%2C1496246399%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_l_more&x_bfe_rqs=03E80&x_bfe_tjscore=0.332314&scs=2546086922&sortBy=0&pn=0";
// ProxyFactory.init("zookeeper://192.168.0.
36:2181", "local", GroupType.PROVIDER);
// ProxyFactory.init("zookeeper://192.168.0.
11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER, 10000008);//初始化代理
// List<NewsData> ndList = getBaiduInforData("马云","2019-07-04 23:59:59");
// System.out.println(ndList.size());
// String result = downloadHtml(url,0);
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
View file @
88e4e8c0
...
...
@@ -17,24 +17,65 @@ import org.jsoup.nodes.Element;
import
org.jsoup.select.Elements
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.TiebaData
;
import
com.zhiwei.media_data_crawler.excelentity.DataExcel
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
/**
* 百度贴吧采集
* @author xMx
* @date 2019年10月31日 下午5:47:28
*/
public
class
BaiduTiebaCrawlerParse
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
Logger
logger
=
LogManager
.
getLogger
(
BaiduTiebaCrawlerParse
.
class
);
// public static void main(String[] args) {
// ProxyFactory.init(SimpleConfig.builder().registry("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181")
// .appName("xumiaoxin").appId(10000008).group("local").build());
//
// List<DataExcel> bodyList = new ArrayList<>();
//
// try {
// List<String> wordList = WordsReadFile.getWords("D:\\crawlerdata\\关键词6.txt");
// for(String s:wordList) {
// List<TiebaData> dataList = getBaiduTiebaData(s, null, null);
// dataList.forEach(data -> {
// DataExcel dataExcel = new DataExcel();
// dataExcel.setAuthor(data.getAuthor());
// dataExcel.setContent(data.getContent());
// dataExcel.setSource(data.getSource());
// dataExcel.setTid(data.getTid());
// dataExcel.setTime(data.getTime());
// dataExcel.setTitle(data.getTitle());
// dataExcel.setUrl(data.getUrl());
// dataExcel.setWord(data.getWord());
//
// bodyList.add(dataExcel);
// });
// }
// } catch (Exception e) {
// e.toString();
// }
//
// EasyExcel.write("D:\\crawlerdata\\百度贴吧-花木兰2.xlsx", DataExcel.class).sheet("数据").doWrite(bodyList);
// System.out.println("导出成功");
// }
/**
* @Title: getBaiduTiebaData
* @author hero
* @Description: 根
據關鍵詞獲取百度貼吧數據(最多50頁)
* @Description: 根
据关键词获取百度贴吧数据
* @param @param word
* @param @param proxy
* @param @param tiebaName
...
...
@@ -43,28 +84,29 @@ public class BaiduTiebaCrawlerParse {
* @return List<TiebaData> 返回类型
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
TiebaData
>
getBaiduTiebaData
(
String
word
,
Proxy
proxy
,
String
tiebaName
)
throws
Exception
{
public
static
List
<
TiebaData
>
getBaiduTiebaData
(
String
word
,
Proxy
proxy
,
String
tiebaName
,
String
startTime
)
throws
Exception
{
List
<
TiebaData
>
list
=
new
ArrayList
<
TiebaData
>();
int
page
=
0
;
int
page
=
1
;
boolean
more
=
true
;
while
(
more
)
{
// 最大页数为20
if
(
page
>
50
)
{
more
=
false
;
}
String
htmlBody
=
downloadHtml
(
word
,
proxy
,
tiebaName
,
page
);
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
analysisData
(
htmlBody
,
proxy
,
word
);
List
<
TiebaData
>
dataList
=
(
List
<
TiebaData
>)
dataMap
.
get
(
"data"
);
list
.
addAll
(
dataList
);
more
=
(
Boolean
)
dataMap
.
get
(
"more"
);
}
else
{
more
=
false
;
}
page
++;
if
(
DataCrawler
.
sleepTime
!=
null
){
ZhiWeiTools
.
sleep
(
DataCrawler
.
sleepTime
);
}
try
{
String
htmlBody
=
downloadHtml
(
word
,
proxy
,
tiebaName
,
page
);
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
analysisData
(
htmlBody
,
proxy
,
word
,
startTime
);
List
<
TiebaData
>
dataList
=
(
List
<
TiebaData
>)
dataMap
.
get
(
"data"
);
list
.
addAll
(
dataList
);
more
=
(
Boolean
)
dataMap
.
get
(
"more"
);
}
page
++;
}
catch
(
Exception
e
)
{
logger
.
error
(
"百度贴吧数据获取失败"
,
e
);
}
// //最大页数为75页
// if (page > 20) {
// more = false;
// }
}
return
list
;
}
...
...
@@ -85,7 +127,7 @@ public class BaiduTiebaCrawlerParse {
public
static
Map
<
String
,
Object
>
getBaiduTiebaData
(
String
word
,
Proxy
proxy
,
String
tiebaName
,
int
page
)
throws
Exception
{
String
htmlBody
=
downloadHtml
(
word
,
proxy
,
tiebaName
,
page
);
if
(
htmlBody
!=
null
)
{
return
analysisData
(
htmlBody
,
proxy
,
word
);
return
analysisData
(
htmlBody
,
proxy
,
word
,
null
);
}
return
null
;
}
...
...
@@ -270,6 +312,9 @@ public class BaiduTiebaCrawlerParse {
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
// 获取链接地址
String
url
=
getUrl
(
word
,
tiebaName
,
page
);
logger
.
info
(
"采集进度 {} === {}"
,
word
,
url
);
headerMap
.
put
(
"Host"
,
"tieba.baidu.com"
);
headerMap
.
put
(
"Referer"
,
url
);
// 下载数据页面
...
...
@@ -283,11 +328,9 @@ public class BaiduTiebaCrawlerParse {
}
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题
,问题为:{}"
,
e
.
fillInStackTrace
()
);
logger
.
error
(
"获取数据时出现问题
"
,
e
);
if
(
i
==
3
){
throw
e
;
}
else
{
continue
;
}
}
}
...
...
@@ -306,7 +349,7 @@ public class BaiduTiebaCrawlerParse {
* @param @throws Exception 设定文件
* @return Map<String,Object> 返回类型
*/
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
throws
Exception
{
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
,
String
startTime
)
throws
Exception
{
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
List
<
TiebaData
>
list
=
new
ArrayList
<
TiebaData
>();
boolean
more
=
true
;
...
...
@@ -338,16 +381,21 @@ public class BaiduTiebaCrawlerParse {
try
{
author
=
element
.
select
(
"a"
).
select
(
"font.p_violet"
).
text
().
split
(
" "
)[
1
];
time
=
element
.
select
(
"font.p_date"
).
text
();
long
artTime
=
TimeParse
.
stringFormartDate
(
time
).
getTime
();
//文章时间
long
star
=
TimeParse
.
stringFormartDate
(
startTime
).
getTime
();
//采集开始时间
if
(
artTime
<
star
)
{
more
=
false
;
break
;
}
TiebaData
tiebaData
=
new
TiebaData
(
link
,
title
,
time
,
tid
,
source
,
author
,
content
,
word
);
list
.
add
(
tiebaData
);
}
catch
(
Exception
e
)
{
logger
.
debug
(
"无作者 或者 无来源"
);
continue
;
}
}
if
(
elementes
.
size
()==
0
){
more
=
false
;
}
resultMap
.
put
(
"data"
,
list
);
resultMap
.
put
(
"more"
,
more
);
return
resultMap
;
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/CrawlerTest.java
0 → 100644
View file @
88e4e8c0
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.media_data_crawler.entity.ZhihuAnswer
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.tools.timeparse.TimeParse
;
/**
 * Command-line export of Zhihu answers (including picture count and ranking)
 * for a list of keywords into an Excel sheet.
 *
 * <p>Usage: {@code CrawlerTest [wordFile [dataFile]]}. Both arguments are
 * optional; when omitted the built-in default paths are used.
 *
 * @author xMx
 * @date 2019-10-19 11:01:29
 */
public class CrawlerTest {

    /** Default keyword file; read line by line through {@code WordsReadFile.getWords}. */
    private static final String DEFAULT_WORD_FILE = "D://crawlerdata/关键词5.txt";
    /** Default Excel file the collected rows are exported to. */
    private static final String DEFAULT_DATA_FILE = "D://crawlerdata/知乎2.xlsx";

    // Column headers. Each constant is used both as the key in a row map and as
    // the header text, so the two can no longer drift apart.
    private static final String COL_URL = "地址";
    private static final String COL_FROM_URL = "问题地址";
    private static final String COL_TITLE = "标题";
    private static final String COL_TIME = "时间";
    private static final String COL_AUTHOR = "发布者";
    private static final String COL_AUTHOR_URL = "作者地址";
    private static final String COL_CONTENT = "内容";
    private static final String COL_ATTITUDES = "回答点赞数";
    private static final String COL_COMMENTS = "回答评论数";
    private static final String COL_FOLLOWS = "问题点赞数";
    private static final String COL_BORD = "问题评论数";
    private static final String COL_IMG_COUNT = "图片数量";
    private static final String COL_SORT = "排名";

    /** Headers in the order they are handed to the exporter. */
    private static final String[] HEADERS = {
        COL_URL, COL_FROM_URL, COL_TITLE, COL_TIME, COL_AUTHOR, COL_AUTHOR_URL, COL_CONTENT,
        COL_ATTITUDES, COL_COMMENTS, COL_FOLLOWS, COL_BORD, COL_IMG_COUNT, COL_SORT
    };

    /**
     * Initialises the proxy factory, collects the answers for every keyword and
     * exports them to Excel.
     *
     * @param args optional: {@code args[0]} keyword file path, {@code args[1]}
     *             output Excel path; defaults are used for anything not supplied
     * @throws Exception propagated from proxy initialisation, crawling or export
     */
    public static void main(String[] args) throws Exception {
        // Proxy registry address.
        String address = "zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181";
        // NOTE(review): other code in this project registers as "xumiaoxin"; confirm
        // whether this spelling is intentional before changing it.
        String appName = "xumaioxin";
        long appId = 10000008L;
        ProxyFactory.init(SimpleConfig.builder().registry(address).appName(appName).appId(appId).group("local").build());

        String wordFileName = (args != null && args.length > 0) ? args[0] : DEFAULT_WORD_FILE;
        String dataFileName = (args != null && args.length > 1) ? args[1] : DEFAULT_DATA_FILE;

        List<String> wordList = WordsReadFile.getWords(wordFileName);
        List<Map<String, Object>> resultList = new ArrayList<>();
        for (String s : wordList) {
            List<ZhihuAnswer> zhihuAnswer = ZhihuAnwserCrawlerParse.getPictureCount(s);
            for (ZhihuAnswer z : zhihuAnswer) {
                resultList.add(toRow(z));
            }
        }

        // Hand the exporter its own mutable list, as before.
        List<String> headList = new ArrayList<>();
        for (String header : HEADERS) {
            headList.add(header);
        }

        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        poi.exportExcel(dataFileName, "数据", headList, resultList);
        System.out.println("导出成功");
    }

    /** Converts one answer into a row map keyed by column header. */
    private static Map<String, Object> toRow(ZhihuAnswer z) {
        Map<String, Object> map = new HashMap<>();
        map.put(COL_URL, z.getUrl());
        map.put(COL_FROM_URL, z.getFrom_url());
        map.put(COL_TITLE, z.getTitle());
        map.put(COL_TIME, z.getTime());
        map.put(COL_AUTHOR, z.getAuthor());
        map.put(COL_AUTHOR_URL, z.getAuthorUrl());
        map.put(COL_CONTENT, z.getContent());
        map.put(COL_ATTITUDES, z.getAttitudes_count());
        map.put(COL_COMMENTS, z.getComment_count());
        map.put(COL_FOLLOWS, z.getFollow_count());
        map.put(COL_BORD, z.getBord_count());
        map.put(COL_IMG_COUNT, z.getImgCount());
        map.put(COL_SORT, z.getSort());
        return map;
    }
}
src/main/java/com/zhiwei/media_data_crawler/crawler/JianshuCrawler.java
View file @
88e4e8c0
...
...
@@ -17,6 +17,7 @@ import com.zhiwei.crawler.utils.RequestUtils;
import
com.zhiwei.media_data_crawler.entity.JianshuUser
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
okhttp3.MediaType
;
import
okhttp3.Response
;
/**
...
...
@@ -43,7 +44,7 @@ public class JianshuCrawler {
headers
.
put
(
"origin"
,
"https://www.jianshu.com"
);
headers
.
put
(
"accept"
,
"application/json"
);
headers
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
);
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
headers
,
null
),
ProxyHolder
.
NAT_HEAVY_PROXY
)){
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
okhttp3
.
RequestBody
.
create
(
MediaType
.
parse
(
"application/json"
),
headers
.
toString
())
),
ProxyHolder
.
NAT_HEAVY_PROXY
)){
String
result
=
response
.
body
().
string
();
System
.
out
.
println
(
result
);
if
(
result
.
contains
(
"搜索过于频繁"
))
{
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/WordsReadFile.java
0 → 100644
View file @
88e4e8c0
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.io.BufferedReader
;
import
java.io.FileInputStream
;
import
java.io.IOException
;
import
java.io.InputStreamReader
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
/**
 * Loads crawl keywords from a text file.
 */
public class WordsReadFile {

    private static final Logger logger = LoggerFactory.getLogger(WordsReadFile.class);

    /**
     * Reads keywords from a text file, one keyword per line, decoding the file
     * as GBK. Empty lines are skipped; lines are otherwise kept exactly as read.
     *
     * <p>The reader is closed on every path, including when reading fails
     * part-way through. On an I/O failure the error is logged with its stack
     * trace and an empty list is returned; keywords read before the failure are
     * discarded.
     *
     * @param wordFileName full path of the keyword file
     * @return the non-empty lines of the file in file order, or an immutable
     *         empty list if the file could not be read
     */
    public static List<String> getWords(String wordFileName) {
        List<String> list = new ArrayList<String>();
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(wordFileName), "GBK"))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (!line.isEmpty()) {
                    list.add(line);
                }
            }
            return list;
        } catch (IOException e) {
            // A trailing Throwable argument is logged by SLF4J with its stack trace.
            logger.error("读取关键词文件失败 {}", wordFileName, e);
            return Collections.emptyList();
        }
    }
}
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuUserAnswerCrawlerParse.java
View file @
88e4e8c0
...
...
@@ -22,6 +22,11 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
import
okhttp3.Response
;
/**
* 获取用户的回答列表,https://www.zhihu.com/people/xie-yu-shi-29/answers
* @author xMx
* @date 2020年3月3日 上午9:17:16
*/
public
class
ZhihuUserAnswerCrawlerParse
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
ZhihuUserAnswerCrawlerParse
.
class
);
...
...
src/main/java/com/zhiwei/media_data_crawler/entity/ZhihuAnswer.java
View file @
88e4e8c0
...
...
@@ -29,11 +29,15 @@ public class ZhihuAnswer implements Serializable {
private
Integer
bord_count
;
//问题评论数
private
Integer
imgCount
;
//图片数量
private
Integer
sort
;
//排名
public
ZhihuAnswer
(){}
public
ZhihuAnswer
(
String
url
,
String
from_url
,
String
title
,
Date
time
,
String
author
,
String
authorUrl
,
String
content
,
Integer
attitudes_count
,
Integer
comment_count
,
Integer
follow_count
,
Integer
bord_count
){
Integer
comment_count
,
Integer
follow_count
,
Integer
bord_count
,
Integer
imgCount
,
Integer
sort
){
this
.
url
=
url
;
this
.
from_url
=
from_url
;
this
.
title
=
title
;
...
...
@@ -45,7 +49,8 @@ public class ZhihuAnswer implements Serializable {
this
.
comment_count
=
comment_count
;
this
.
follow_count
=
follow_count
;
this
.
bord_count
=
bord_count
;
this
.
imgCount
=
imgCount
;
this
.
sort
=
sort
;
}
@Override
...
...
@@ -62,6 +67,8 @@ public class ZhihuAnswer implements Serializable {
", comment_count="
+
comment_count
+
", follow_count="
+
follow_count
+
", bord_count="
+
bord_count
+
", imgCount="
+
imgCount
+
", sort="
+
sort
+
'}'
;
}
...
...
@@ -148,6 +155,22 @@ public class ZhihuAnswer implements Serializable {
/**
 * Sets the comment count of the question.
 *
 * @param bord_count new question comment count
 */
public void setBord_count(Integer bord_count) {
    this.bord_count = bord_count;
}
/**
 * Returns the number of pictures recorded for this answer.
 *
 * @return the picture count
 */
public Integer getImgCount() {
    return this.imgCount;
}
/**
 * Sets the number of pictures recorded for this answer.
 *
 * @param imgCount new picture count
 */
public void setImgCount(Integer imgCount) {
    this.imgCount = imgCount;
}
/**
 * Returns the ranking recorded for this answer.
 *
 * @return the ranking value
 */
public Integer getSort() {
    return this.sort;
}
/**
 * Sets the ranking recorded for this answer.
 *
 * @param sort new ranking value
 */
public void setSort(Integer sort) {
    this.sort = sort;
}
public
void
setComment_count
(
Integer
comment_count
)
{
this
.
comment_count
=
comment_count
;
...
...
src/main/java/com/zhiwei/media_data_crawler/excelentity/DataExcel.java
0 → 100644
View file @
88e4e8c0
package
com
.
zhiwei
.
media_data_crawler
.
excelentity
;
import
com.alibaba.excel.annotation.ExcelProperty
;
/**
 * Row bean for Excel export. Every field carries an {@code @ExcelProperty}
 * annotation holding the column header it is written under.
 *
 * @author xMx
 * @date 2019-10-29 09:15:40
 */
public class DataExcel {

    /** Address (URL) of the record; explicitly bound to column index 0. */
    @ExcelProperty(value = "地址", index = 0)
    private String url;

    /** Title of the record. */
    @ExcelProperty("标题")
    private String title;

    /** Time of the record, kept as text. */
    @ExcelProperty("时间")
    private String time;

    /** Value exported under the "tid" header. */
    @ExcelProperty("tid")
    private String tid;

    /** Source of the record. */
    @ExcelProperty("来源")
    private String source;

    /** Replier or thread starter. */
    @ExcelProperty("回复者或楼主")
    private String author;

    /** Reply content. */
    @ExcelProperty("回复内容")
    private String content;

    /** Keyword associated with the record. */
    @ExcelProperty("关键词")
    private String word;

    // ---- accessors ----

    /** @return the address (URL) */
    public String getUrl() {
        return this.url;
    }

    /** @param url the address (URL) to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the time text */
    public String getTime() {
        return this.time;
    }

    /** @param time the time text to store */
    public void setTime(String time) {
        this.time = time;
    }

    /** @return the tid value */
    public String getTid() {
        return this.tid;
    }

    /** @param tid the tid value to store */
    public void setTid(String tid) {
        this.tid = tid;
    }

    /** @return the source */
    public String getSource() {
        return this.source;
    }

    /** @param source the source to store */
    public void setSource(String source) {
        this.source = source;
    }

    /** @return the replier or thread starter */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the replier or thread starter to store */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the reply content */
    public String getContent() {
        return this.content;
    }

    /** @param content the reply content to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the keyword */
    public String getWord() {
        return this.word;
    }

    /** @param word the keyword to store */
    public void setWord(String word) {
        this.word = word;
    }
}
src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
View file @
88e4e8c0
...
...
@@ -7,8 +7,10 @@
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.media_data_crawler.data.DataCrawler;
//import com.zhiwei.media_data_crawler.entity.LunTanData;
//import com.zhiwei.media_data_crawler.entity.ZhiHuData;
//import com.zhiwei.tools.timeparse.TimeParse;
//
...
...
@@ -24,7 +26,7 @@
// String word = "58同城"; //关键词
// String startTime = "2018-10-23 23:00:00"; //开始时间
// String endTime = "2018-10-23 23:59:59"; //结束时间
// Proxy proxy = null; //代理IP,不用可不填写
// Proxy
Holder
proxy = null; //代理IP,不用可不填写
// try {
//// //百度新闻采集demo
//// List<NewsData> list = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
...
...
@@ -35,8 +37,8 @@
//// //Baidu貼吧採集
//// String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null
//// List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
//
//
//天涯论坛采集
//
// List<LunTanData> list = DataCrawler.getLunTanData(word, proxy
, endTime);
// //天涯论坛采集
//
List<LunTanData> list = DataCrawler.getLunTanData(word, proxy, startTime
, endTime);
// //豆瓣采集
//// String type = "topic"; //topic 为指定话题采集,note为指定日记采集
//// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
...
...
@@ -62,7 +64,7 @@
//
// for(int i=0;i<words.length;i++){
// System.out.println(words[i]+" 开始采集");
// List<ZhiHuData> zhihuList = DataCrawler.getZhihuByWord(words[i], "a_week",endDate,
proxy
);
// List<ZhiHuData> zhihuList = DataCrawler.getZhihuByWord(words[i], "a_week",endDate,
null
);
// System.out.println(words[i]+"=============="+zhihuList.size());
// for(ZhiHuData zhiHuData : zhihuList) {
// Map<String,Object> map = new HashMap<String,Object>();
...
...
@@ -90,14 +92,4 @@
// }
// }
//
//
//
//
//
//
//
//
//
//
//
//}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment