Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
5bb9510d
Commit
5bb9510d
authored
Apr 13, 2020
by
win 10
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
解决冲突
parent
a56fa9e1
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
152 additions
and
62 deletions
+152
-62
pom.xml
+16
-2
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuAnwserCrawlerParse.java
+68
-28
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
+61
-30
src/test/java/com/zhiwei/media_data_crawler/test/GetTiayaDataTest.java
+7
-2
No files found.
pom.xml
View file @
5bb9510d
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
media_data_crawler
</artifactId>
<artifactId>
media_data_crawler
</artifactId>
<version>
0.1.
2
-SNAPSHOT
</version>
<version>
0.1.
3
-SNAPSHOT
</version>
<name>
media_data_crawler
</name>
<name>
media_data_crawler
</name>
<description>
网媒数据抓取,包含百度新闻、搜狗新闻、360新闻、知乎回答列表等
</description>
<description>
网媒数据抓取,包含百度新闻、搜狗新闻、360新闻、知乎回答列表等
</description>
...
@@ -16,9 +16,23 @@
...
@@ -16,9 +16,23 @@
<dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<artifactId>
crawler-core
</artifactId>
<version>
0.
5.2-RELEASE
</version>
<version>
0.
6.1.0-SNAPSHOT
</version>
<scope>
provided
</scope>
<scope>
provided
</scope>
</dependency>
</dependency>
<!-- excel导出 -->
<dependency>
<groupId>
com.zhiwei
</groupId>
<artifactId>
excelpoi
</artifactId>
<version>
0.0.3-SNAPSHOT
</version>
<scope>
provided
</scope>
</dependency>
<dependency>
<groupId>
com.alibaba
</groupId>
<artifactId>
easyexcel
</artifactId>
<version>
2.0.0-beta3
</version>
<scope>
provided
</scope>
</dependency>
</dependencies>
</dependencies>
<!-- 打包管理 -->
<!-- 打包管理 -->
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuAnwserCrawlerParse.java
View file @
5bb9510d
...
@@ -16,8 +16,11 @@ import org.slf4j.LoggerFactory;
...
@@ -16,8 +16,11 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.ZhihuAnswer
;
import
com.zhiwei.media_data_crawler.entity.ZhihuAnswer
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
import
okhttp3.Response
;
...
@@ -49,6 +52,50 @@ public class ZhihuAnwserCrawlerParse {
...
@@ -49,6 +52,50 @@ public class ZhihuAnwserCrawlerParse {
return
-
1
;
return
-
1
;
}
}
/**
 * Collects the answers of a Zhihu question page by page and returns them all.
 *
 * <p>Pages are requested through {@code DataCrawler.getAnswerList} starting at page 0,
 * with a fixed cut-off date of 2000-01-01 and the {@code NAT_HEAVY_PROXY} proxy.
 * A page that comes back empty or fails with an exception is requested again; the
 * crawl stops when the page result reports no more data or after ten consecutive
 * unsuccessful attempts. No delay is applied between requests.
 *
 * <p>Despite the method name, the return value is the list of answers, not a count.
 * NOTE(review): callers presumably read the per-answer image count from the returned
 * entities - confirm against {@code ZhihuAnswer}.
 *
 * @param url question URL; anything from {@code "/answer"} onwards is stripped so an
 *            answer link can be passed as well
 * @return the answers collected so far; never {@code null}, possibly empty when the
 *         crawl failed
 */
public static List<ZhihuAnswer> getPictureCount(String url) {
    List<ZhihuAnswer> answerList = new ArrayList<>();
    logger.info("知乎回答采集开始:{}", url);
    try {
        if (url.contains("/answer")) {
            url = url.split("/answer")[0];
        }
        // Upper bound on consecutive empty/failed requests before giving up.
        final int maxAttempts = 10;
        int page = 0;
        int attempts = 1;
        while (attempts <= maxAttempts) {
            try {
                Map<String, Object> dataMap = DataCrawler.getAnswerList(url, page,
                        TimeParse.stringFormartDate("2000-01-01"), ProxyHolder.NAT_HEAVY_PROXY);
                // The page result is an untyped map; "data" is expected to hold the parsed answers.
                @SuppressWarnings("unchecked")
                List<ZhihuAnswer> list = (List<ZhihuAnswer>) dataMap.get("data");
                // Read the flag BEFORE touching any state: if it is missing or mistyped the
                // cast throws, and the catch below must see an unmodified page index and
                // result list (otherwise the page would be rewound twice or added twice).
                boolean more = (boolean) dataMap.get("more");
                if (list != null && !list.isEmpty()) {
                    logger.info("知乎回答采集链接:{} 页数 {} ,此页总数 {}", url, page, list.size());
                    attempts = 1;
                    answerList.addAll(list);
                    page++;
                } else {
                    // Empty page: retry the same page and count it as an unsuccessful attempt.
                    attempts++;
                }
                if (!more) {
                    break;
                }
            } catch (Exception e) {
                // Pass the URL for the placeholder and the exception last so the stack trace is logged.
                logger.error(" exception {} ", url, e);
                attempts++;
            }
        }
    } catch (Exception e) {
        // Best effort: keep whatever was collected, but do not hide the failure.
        logger.error("知乎回答采集异常:{}", url, e);
    }
    logger.info("知乎回答采集结束:{}", url);
    return answerList;
}
/**
/**
* 知乎回答采集
* 知乎回答采集
* @param url
* @param url
...
@@ -57,7 +104,7 @@ public class ZhihuAnwserCrawlerParse {
...
@@ -57,7 +104,7 @@ public class ZhihuAnwserCrawlerParse {
* @return
* @return
* @throws Exception
* @throws Exception
*/
*/
public
static
List
<
ZhihuAnswer
>
getAnswerList
(
String
url
,
Date
endDate
,
Proxy
proxy
)
throws
Exception
{
public
static
List
<
ZhihuAnswer
>
getAnswerList
(
String
url
,
Date
endDate
,
Proxy
Holder
proxy
)
throws
Exception
{
try
{
try
{
List
<
ZhihuAnswer
>
answerList
=
new
ArrayList
<>();
List
<
ZhihuAnswer
>
answerList
=
new
ArrayList
<>();
String
questionId
=
getQuestionId
(
url
);
String
questionId
=
getQuestionId
(
url
);
...
@@ -80,7 +127,7 @@ public class ZhihuAnwserCrawlerParse {
...
@@ -80,7 +127,7 @@ public class ZhihuAnwserCrawlerParse {
more
=
false
;
more
=
false
;
}
}
//单线程采集避免被封休眠8s
//单线程采集避免被封休眠8s
ZhiWeiTools
.
sleep
(
8
000
);
// ZhiWeiTools.sleep(3
000);
page
++;
page
++;
}
catch
(
Exception
e
){
}
catch
(
Exception
e
){
more
=
false
;
more
=
false
;
...
@@ -92,7 +139,6 @@ public class ZhihuAnwserCrawlerParse {
...
@@ -92,7 +139,6 @@ public class ZhihuAnwserCrawlerParse {
}
}
}
}
/**
/**
* 获取问题的关注者和浏览量
* 获取问题的关注者和浏览量
* @param url
* @param url
...
@@ -100,7 +146,7 @@ public class ZhihuAnwserCrawlerParse {
...
@@ -100,7 +146,7 @@ public class ZhihuAnwserCrawlerParse {
* @return
* @return
* @throws Exception
* @throws Exception
*/
*/
private
static
String
getNumberBoard
(
String
url
,
Proxy
proxy
)
throws
Exception
{
private
static
String
getNumberBoard
(
String
url
,
Proxy
Holder
proxy
)
throws
Exception
{
try
{
try
{
String
body
=
download
(
url
,
proxy
);
String
body
=
download
(
url
,
proxy
);
Document
document
=
Jsoup
.
parse
(
body
);
Document
document
=
Jsoup
.
parse
(
body
);
...
@@ -117,10 +163,6 @@ public class ZhihuAnwserCrawlerParse {
...
@@ -117,10 +163,6 @@ public class ZhihuAnwserCrawlerParse {
}
}
}
}
/**
/**
* 获取单页数据
* 获取单页数据
* @param url
* @param url
...
@@ -130,17 +172,16 @@ public class ZhihuAnwserCrawlerParse {
...
@@ -130,17 +172,16 @@ public class ZhihuAnwserCrawlerParse {
* @return
* @return
* @throws Exception
* @throws Exception
*/
*/
public
static
Map
<
String
,
Object
>
getAnswerList
(
String
url
,
int
page
,
Date
endDate
,
Proxy
proxy
)
throws
Exception
{
public
static
Map
<
String
,
Object
>
getAnswerList
(
String
url
,
int
page
,
Date
endDate
,
Proxy
Holder
proxy
)
throws
Exception
{
try
{
try
{
String
questionId
=
getQuestionId
(
url
);
String
questionId
=
getQuestionId
(
url
);
String
bord
=
getNumberBoard
(
url
,
proxy
);
String
bord
=
getNumberBoard
(
url
,
proxy
);
return
analsis
(
questionId
,
endDate
,
page
,
bord
,
proxy
);
return
analsis
(
questionId
,
endDate
,
page
,
bord
,
proxy
);
}
catch
(
Exception
e
){
}
catch
(
Exception
e
){
throw
e
;
throw
e
;
}
}
}
}
/**
/**
* 解析数据
* 解析数据
* @param questionId
* @param questionId
...
@@ -150,7 +191,7 @@ public class ZhihuAnwserCrawlerParse {
...
@@ -150,7 +191,7 @@ public class ZhihuAnwserCrawlerParse {
* @return
* @return
* @throws Exception
* @throws Exception
*/
*/
private
static
Map
<
String
,
Object
>
analsis
(
String
questionId
,
Date
endDate
,
int
page
,
String
bord
,
Proxy
proxy
)
throws
Exception
{
private
static
Map
<
String
,
Object
>
analsis
(
String
questionId
,
Date
endDate
,
int
page
,
String
bord
,
Proxy
Holder
proxy
)
throws
Exception
{
try
{
try
{
boolean
more
=
true
;
boolean
more
=
true
;
List
<
ZhihuAnswer
>
answerList
=
new
ArrayList
<>();
List
<
ZhihuAnswer
>
answerList
=
new
ArrayList
<>();
...
@@ -160,23 +201,29 @@ public class ZhihuAnwserCrawlerParse {
...
@@ -160,23 +201,29 @@ public class ZhihuAnwserCrawlerParse {
Integer
count
=
dataJson
.
getJSONObject
(
"paging"
).
getInteger
(
"totals"
);
Integer
count
=
dataJson
.
getJSONObject
(
"paging"
).
getInteger
(
"totals"
);
JSONArray
jsonArray
=
dataJson
.
getJSONArray
(
"data"
);
JSONArray
jsonArray
=
dataJson
.
getJSONArray
(
"data"
);
String
from_url
=
"https://www.zhihu.com/question/"
+
questionId
;
String
fromUrl
=
"https://www.zhihu.com/question/"
+
questionId
;
Integer
sort
=
page
*
20
+
1
;
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++){
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++){
JSONObject
answerJson
=
jsonArray
.
getJSONObject
(
i
);
JSONObject
answerJson
=
jsonArray
.
getJSONObject
(
i
);
Date
time
=
new
Date
(
answerJson
.
getLong
(
"created_time"
)*
1000
);
Date
time
=
new
Date
(
answerJson
.
getLong
(
"created_time"
)*
1000
);
if
(
time
.
after
(
endDate
)){
if
(
time
.
after
(
endDate
)){
String
answerId
=
answerJson
.
getString
(
"id"
);
String
answerId
=
answerJson
.
getString
(
"id"
);
String
link
=
from_url
+
"/answer/"
+
answerId
;
String
link
=
fromUrl
+
"/answer/"
+
answerId
;
System
.
out
.
println
(
"正在处理 === "
+
link
);
String
author
=
answerJson
.
getJSONObject
(
"author"
).
getString
(
"name"
);
String
author
=
answerJson
.
getJSONObject
(
"author"
).
getString
(
"name"
);
String
authorUrl
=
"https://www.zhihu.com/people/"
+
answerJson
.
getJSONObject
(
"author"
).
getString
(
"url_token"
);
String
authorUrl
=
"https://www.zhihu.com/people/"
+
answerJson
.
getJSONObject
(
"author"
).
getString
(
"url_token"
);
String
content
=
ZhiWeiTools
.
delHTMLTag
(
answerJson
.
getString
(
"content"
));
String
content
=
ZhiWeiTools
.
delHTMLTag
(
answerJson
.
getString
(
"content"
));
String
[]
imgContent
=
answerJson
.
getString
(
"content"
).
split
(
"<img"
);
Integer
imgCount
=
(
imgContent
.
length
-
1
)/
2
;
String
title
=
answerJson
.
getJSONObject
(
"question"
).
getString
(
"title"
);
String
title
=
answerJson
.
getJSONObject
(
"question"
).
getString
(
"title"
);
Integer
voteup
_c
ount
=
answerJson
.
getInteger
(
"voteup_count"
);
Integer
voteup
C
ount
=
answerJson
.
getInteger
(
"voteup_count"
);
Integer
comment
_c
ount
=
answerJson
.
getInteger
(
"comment_count"
);
Integer
comment
C
ount
=
answerJson
.
getInteger
(
"comment_count"
);
Integer
guanzhu
_c
ount
=
Integer
.
valueOf
(
bord
.
split
(
","
)[
0
]);
Integer
guanzhu
C
ount
=
Integer
.
valueOf
(
bord
.
split
(
","
)[
0
]);
Integer
bord
_c
ount
=
Integer
.
valueOf
(
bord
.
split
(
","
)[
1
]);
Integer
bord
C
ount
=
Integer
.
valueOf
(
bord
.
split
(
","
)[
1
]);
ZhihuAnswer
zhihuAnswer
=
new
ZhihuAnswer
(
link
,
from
_url
,
title
,
time
,
author
,
authorUrl
,
content
,
voteup_count
,
comment_count
,
guanzhu_count
,
bord_coun
t
);
ZhihuAnswer
zhihuAnswer
=
new
ZhihuAnswer
(
link
,
from
Url
,
title
,
time
,
author
,
authorUrl
,
content
,
voteupCount
,
commentCount
,
guanzhuCount
,
bordCount
,
imgCount
,
sor
t
);
answerList
.
add
(
zhihuAnswer
);
answerList
.
add
(
zhihuAnswer
);
System
.
out
.
println
(
imgCount
+
" ---- "
+
sort
);
sort
++;
}
}
}
}
if
(
count
<
page
*
20
){
if
(
count
<
page
*
20
){
...
@@ -191,7 +238,6 @@ public class ZhihuAnwserCrawlerParse {
...
@@ -191,7 +238,6 @@ public class ZhihuAnwserCrawlerParse {
}
}
}
}
/**
/**
* 根据链接获取数据
* 根据链接获取数据
* @param url
* @param url
...
@@ -199,7 +245,7 @@ public class ZhihuAnwserCrawlerParse {
...
@@ -199,7 +245,7 @@ public class ZhihuAnwserCrawlerParse {
* @return
* @return
* @throws Exception
* @throws Exception
*/
*/
private
static
String
download
(
String
url
,
Proxy
proxy
)
throws
Exception
{
private
static
String
download
(
String
url
,
Proxy
Holder
proxy
)
throws
Exception
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
)){
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
)){
return
response
.
body
().
string
();
return
response
.
body
().
string
();
}
catch
(
Exception
e
){
}
catch
(
Exception
e
){
...
@@ -207,7 +253,6 @@ public class ZhihuAnwserCrawlerParse {
...
@@ -207,7 +253,6 @@ public class ZhihuAnwserCrawlerParse {
}
}
}
}
/**
/**
* 根据链接获取问题id
* 根据链接获取问题id
* @param url
* @param url
...
@@ -237,11 +282,9 @@ public class ZhihuAnwserCrawlerParse {
...
@@ -237,11 +282,9 @@ public class ZhihuAnwserCrawlerParse {
"Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit"
+
"Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit"
+
"%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2"
+
"%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2"
+
"Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp"
+
"Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp"
+
"%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset="
+
page
*
20
+
"&limit=20
&sort_by=created
"
;
"%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset="
+
page
*
20
+
"&limit=20"
;
}
}
public
static
void
main
(
String
[]
args
){
public
static
void
main
(
String
[]
args
){
// String url = "https://www.zhihu.com/question/288128510";
// String url = "https://www.zhihu.com/question/288128510";
// Date endDate = TimeParse.stringFormartDate("2018-09-20 08:00:00");
// Date endDate = TimeParse.stringFormartDate("2018-09-20 08:00:00");
...
@@ -253,7 +296,4 @@ public class ZhihuAnwserCrawlerParse {
...
@@ -253,7 +296,4 @@ public class ZhihuAnwserCrawlerParse {
getAnswerCount
(
"https://www.zhihu.com/question/41539825"
,
null
);
getAnswerCount
(
"https://www.zhihu.com/question/41539825"
,
null
);
}
}
}
}
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
View file @
5bb9510d
...
@@ -35,7 +35,33 @@ public class DataCrawler {
...
@@ -35,7 +35,33 @@ public class DataCrawler {
try
{
try
{
return
BaiduInforCrawlerParse
.
getBaiduInforData
(
word
,
endTime
);
return
BaiduInforCrawlerParse
.
getBaiduInforData
(
word
,
endTime
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
toString
();
return
Collections
.
emptyList
();
}
}
/**
*
* @Title: getBaiduNewsData
* @author hero
* @Description: 根据关键词和时间,全文匹配百度新闻数据
* @param @param
* word
* @param @param
* startTime
* @param @param
* endTime
* @param @param
* proxy
* @param @return
* 设定文件
* @return List<NewsData> 返回类型
*/
public
static
List
<
NewsData
>
getBaiduInforDataManyWord
(
String
word
,
String
endTime
,
String
saveWord
)
{
try
{
return
BaiduInforCrawlerParse
.
getBaiduInforDataManyWord
(
word
,
endTime
,
saveWord
);
}
catch
(
Exception
e
)
{
e
.
toString
();
return
Collections
.
emptyList
();
return
Collections
.
emptyList
();
}
}
}
}
...
@@ -62,8 +88,8 @@ public class DataCrawler {
...
@@ -62,8 +88,8 @@ public class DataCrawler {
try
{
try
{
return
BaiduNewsCrawlerParse
.
getBaiduNewsData
(
word
,
startTime
,
endTime
,
proxy
);
return
BaiduNewsCrawlerParse
.
getBaiduNewsData
(
word
,
startTime
,
endTime
,
proxy
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
toString
();
return
null
;
return
Collections
.
emptyList
()
;
}
}
}
}
...
@@ -106,7 +132,7 @@ public class DataCrawler {
...
@@ -106,7 +132,7 @@ public class DataCrawler {
try
{
try
{
return
BaiduNewsCrawlerParse
.
getBaiduNewsCount
(
word
,
startTime
,
endTime
,
proxy
,
cookie
);
return
BaiduNewsCrawlerParse
.
getBaiduNewsCount
(
word
,
startTime
,
endTime
,
proxy
,
cookie
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
toString
();
return
-
1
;
return
-
1
;
}
}
}
}
...
@@ -132,8 +158,8 @@ public class DataCrawler {
...
@@ -132,8 +158,8 @@ public class DataCrawler {
try
{
try
{
return
BaiduNewsCrawlerParse
.
getBaiduNewsDataByTitle
(
word
,
startTime
,
endTime
,
proxy
);
return
BaiduNewsCrawlerParse
.
getBaiduNewsDataByTitle
(
word
,
startTime
,
endTime
,
proxy
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
toString
();
return
null
;
return
Collections
.
emptyList
()
;
}
}
}
}
...
@@ -154,8 +180,8 @@ public class DataCrawler {
...
@@ -154,8 +180,8 @@ public class DataCrawler {
try
{
try
{
return
SoNewsCrawlerParse
.
getSoNewsData
(
word
,
proxy
);
return
SoNewsCrawlerParse
.
getSoNewsData
(
word
,
proxy
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
toString
();
return
null
;
return
Collections
.
emptyList
()
;
}
}
}
}
...
@@ -176,8 +202,8 @@ public class DataCrawler {
...
@@ -176,8 +202,8 @@ public class DataCrawler {
try
{
try
{
return
SoNewsCrawlerParse
.
getSoNewsDataByTitle
(
word
,
proxy
);
return
SoNewsCrawlerParse
.
getSoNewsDataByTitle
(
word
,
proxy
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
toString
();
return
null
;
return
Collections
.
emptyList
()
;
}
}
}
}
...
@@ -199,8 +225,8 @@ public class DataCrawler {
...
@@ -199,8 +225,8 @@ public class DataCrawler {
System
.
out
.
println
(
"开始采集sogou"
);
System
.
out
.
println
(
"开始采集sogou"
);
return
SougouNewsCrawlerParse
.
getSougouNewsData
(
word
,
proxy
);
return
SougouNewsCrawlerParse
.
getSougouNewsData
(
word
,
proxy
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
toString
();
return
null
;
return
Collections
.
emptyList
()
;
}
}
}
}
...
@@ -221,8 +247,8 @@ public class DataCrawler {
...
@@ -221,8 +247,8 @@ public class DataCrawler {
try
{
try
{
return
SougouNewsCrawlerParse
.
getSougouNewsDataByTitle
(
word
,
proxy
);
return
SougouNewsCrawlerParse
.
getSougouNewsDataByTitle
(
word
,
proxy
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
toString
();
return
null
;
return
Collections
.
emptyList
()
;
}
}
}
}
...
@@ -242,8 +268,8 @@ public class DataCrawler {
...
@@ -242,8 +268,8 @@ public class DataCrawler {
try
{
try
{
return
SougouZhihuCrawlerParse
.
getSougouZhihuData
(
word
,
proxy
);
return
SougouZhihuCrawlerParse
.
getSougouZhihuData
(
word
,
proxy
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
toString
();
return
null
;
return
Collections
.
emptyList
()
;
}
}
}
}
...
@@ -259,14 +285,14 @@ public class DataCrawler {
...
@@ -259,14 +285,14 @@ public class DataCrawler {
* 设定文件
* 设定文件
* @return List<TiebaData> 返回类型
* @return List<TiebaData> 返回类型
*/
*/
public
static
List
<
TiebaData
>
getBaiduTiebaData
(
String
word
,
Proxy
proxy
)
{
public
static
List
<
TiebaData
>
getBaiduTiebaData
SortByTime
(
String
word
,
Proxy
proxy
,
String
startTime
)
{
try
{
try
{
return
BaiduTiebaCrawlerParse
.
getBaiduTiebaData
(
word
,
proxy
,
null
);
return
BaiduTiebaCrawlerParse
.
getBaiduTiebaData
(
word
,
proxy
,
null
,
startTime
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
return
Collections
.
emptyList
();
return
Collections
.
emptyList
();
}
}
}
}
/**
/**
* @Title: getBaiduTiebaData
* @Title: getBaiduTiebaData
* @author hero
* @author hero
...
@@ -291,8 +317,8 @@ public class DataCrawler {
...
@@ -291,8 +317,8 @@ public class DataCrawler {
try
{
try
{
return
BaiduTiebaCrawlerParse
.
getBaiduTiebaAnswerDataByUrl
(
url
,
proxy
);
return
BaiduTiebaCrawlerParse
.
getBaiduTiebaAnswerDataByUrl
(
url
,
proxy
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
toString
();
return
null
;
return
Collections
.
emptyList
()
;
}
}
}
}
...
@@ -308,10 +334,10 @@ public class DataCrawler {
...
@@ -308,10 +334,10 @@ public class DataCrawler {
*/
*/
public
static
List
<
TiebaData
>
getBaiduTiebaData
(
String
word
,
Proxy
proxy
,
String
tiebaName
)
{
public
static
List
<
TiebaData
>
getBaiduTiebaData
(
String
word
,
Proxy
proxy
,
String
tiebaName
)
{
try
{
try
{
return
BaiduTiebaCrawlerParse
.
getBaiduTiebaData
(
word
,
proxy
,
tiebaName
);
return
BaiduTiebaCrawlerParse
.
getBaiduTiebaData
(
word
,
proxy
,
tiebaName
,
null
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
toString
();
return
null
;
return
Collections
.
emptyList
()
;
}
}
}
}
...
@@ -353,8 +379,8 @@ public class DataCrawler {
...
@@ -353,8 +379,8 @@ public class DataCrawler {
try
{
try
{
return
DoubanCrawlerParse
.
getDoubanData
(
word
,
type
,
proxy
);
return
DoubanCrawlerParse
.
getDoubanData
(
word
,
type
,
proxy
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
toString
();
return
null
;
return
Collections
.
emptyList
()
;
}
}
}
}
...
@@ -374,8 +400,8 @@ public class DataCrawler {
...
@@ -374,8 +400,8 @@ public class DataCrawler {
try
{
try
{
return
SoCrawlerParse
.
getSoData
(
word
,
site
,
time
,
proxy
);
return
SoCrawlerParse
.
getSoData
(
word
,
site
,
time
,
proxy
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
toString
();
return
null
;
return
Collections
.
emptyList
()
;
}
}
}
}
...
@@ -387,10 +413,11 @@ public class DataCrawler {
...
@@ -387,10 +413,11 @@ public class DataCrawler {
* @return
* @return
* @throws Exception
* @throws Exception
*/
*/
public
static
List
<
ZhihuAnswer
>
getAnswerList
(
String
url
,
Date
endDate
,
Proxy
proxy
)
throws
Exception
{
public
static
List
<
ZhihuAnswer
>
getAnswerList
(
String
url
,
Date
endDate
,
Proxy
Holder
proxy
)
throws
Exception
{
try
{
try
{
return
ZhihuAnwserCrawlerParse
.
getAnswerList
(
url
,
endDate
,
proxy
);
return
ZhihuAnwserCrawlerParse
.
getAnswerList
(
url
,
endDate
,
proxy
);
}
catch
(
Exception
e
){
}
catch
(
Exception
e
){
e
.
toString
();
throw
e
;
throw
e
;
}
}
}
}
...
@@ -404,10 +431,11 @@ public class DataCrawler {
...
@@ -404,10 +431,11 @@ public class DataCrawler {
* @return
* @return
* @throws Exception
* @throws Exception
*/
*/
public
static
Map
<
String
,
Object
>
getAnswerList
(
String
url
,
int
page
,
Date
endDate
,
Proxy
proxy
)
throws
Exception
{
public
static
Map
<
String
,
Object
>
getAnswerList
(
String
url
,
int
page
,
Date
endDate
,
Proxy
Holder
proxy
)
throws
Exception
{
try
{
try
{
return
ZhihuAnwserCrawlerParse
.
getAnswerList
(
url
,
page
,
endDate
,
proxy
);
return
ZhihuAnwserCrawlerParse
.
getAnswerList
(
url
,
page
,
endDate
,
proxy
);
}
catch
(
Exception
e
){
}
catch
(
Exception
e
){
e
.
toString
();
throw
e
;
throw
e
;
}
}
}
}
...
@@ -428,6 +456,7 @@ public class DataCrawler {
...
@@ -428,6 +456,7 @@ public class DataCrawler {
try
{
try
{
return
ZhihuCrawlerParse
.
getZhihuData
(
word
,
timeLimit
,
proxy
,
endDate
);
return
ZhihuCrawlerParse
.
getZhihuData
(
word
,
timeLimit
,
proxy
,
endDate
);
}
catch
(
Exception
e
){
}
catch
(
Exception
e
){
e
.
toString
();
throw
e
;
throw
e
;
}
}
}
}
...
@@ -443,6 +472,7 @@ public class DataCrawler {
...
@@ -443,6 +472,7 @@ public class DataCrawler {
try
{
try
{
return
ZhihuCrawlerParse
.
getZhihuUser
(
url
,
proxy
);
return
ZhihuCrawlerParse
.
getZhihuUser
(
url
,
proxy
);
}
catch
(
Exception
e
){
}
catch
(
Exception
e
){
e
.
toString
();
throw
e
;
throw
e
;
}
}
}
}
...
@@ -458,6 +488,7 @@ public class DataCrawler {
...
@@ -458,6 +488,7 @@ public class DataCrawler {
try
{
try
{
return
ZhihuUserAnswerCrawlerParse
.
getData
(
userId
,
proxy
);
return
ZhihuUserAnswerCrawlerParse
.
getData
(
userId
,
proxy
);
}
catch
(
Exception
e
){
}
catch
(
Exception
e
){
e
.
toString
();
throw
e
;
throw
e
;
}
}
}
}
...
...
src/test/java/com/zhiwei/media_data_crawler/test/GetTiayaDataTest.java
View file @
5bb9510d
...
@@ -5,12 +5,12 @@ import java.util.HashMap;
...
@@ -5,12 +5,12 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.media_data_crawler.crawler.WordsReadFile
;
import
com.zhiwei.media_data_crawler.crawler.WordsReadFile
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.LunTanData
;
import
com.zhiwei.media_data_crawler.entity.LunTanData
;
import
com.zhiwei.proxy.config.SimpleConfig
;
/**
/**
* 天涯论坛数据获取
* 天涯论坛数据获取
...
@@ -25,7 +25,12 @@ public class GetTiayaDataTest {
...
@@ -25,7 +25,12 @@ public class GetTiayaDataTest {
String
startTime
=
"2019-01-01 00:00:00"
;
//开始时间
String
startTime
=
"2019-01-01 00:00:00"
;
//开始时间
String
endTime
=
"2019-11-08 23:59:59"
;
//结束时间
String
endTime
=
"2019-11-08 23:59:59"
;
//结束时间
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000008
);
//代理地址
String
address
=
"zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181"
;
String
appName
=
"xumaioxin"
;
long
appId
=
10000008L
;
ProxyFactory
.
init
(
SimpleConfig
.
builder
().
registry
(
address
).
appName
(
appName
).
appId
(
appId
).
group
(
"local"
).
build
());
List
<
String
>
wordList
=
WordsReadFile
.
getWords
(
wordFilePath
);
List
<
String
>
wordList
=
WordsReadFile
.
getWords
(
wordFilePath
);
List
<
LunTanData
>
list
=
new
ArrayList
<>();
List
<
LunTanData
>
list
=
new
ArrayList
<>();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment