Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
20ce0e8c
Commit
20ce0e8c
authored
Nov 17, 2018
by
[zhangzhiwei]
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改代理ip及爬虫核心包
parent
9ef31c31
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
321 additions
and
23 deletions
+321
-23
pom.xml
+7
-1
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
+13
-3
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuCrawlerParse.java
+206
-0
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
+20
-0
src/main/java/com/zhiwei/media_data_crawler/entity/ZhiHuData.java
+11
-1
src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
+64
-18
No files found.
pom.xml
View file @
20ce0e8c
...
@@ -65,7 +65,12 @@
...
@@ -65,7 +65,12 @@
<dependency>
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.0.8-SNAPSHOT
</version>
<version>
0.0.9-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei
</groupId>
<artifactId>
excelpoi
</artifactId>
<version>
0.0.1-SNAPSHOT
</version>
</dependency>
</dependency>
</dependencies>
</dependencies>
</project>
</project>
\ No newline at end of file
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
View file @
20ce0e8c
...
@@ -195,7 +195,7 @@ public class SougouZhihuCrawlerParse{
...
@@ -195,7 +195,7 @@ public class SougouZhihuCrawlerParse{
}
}
comment_count
=
Integer
.
valueOf
(
commentCount
);
comment_count
=
Integer
.
valueOf
(
commentCount
);
}
}
zhihu
=
new
ZhiHuData
(
link
,
title
,
pt
,
type
,
null
,
source
,
null
,
attitudes_count
,
null
,
comment_count
,
word
);
zhihu
=
new
ZhiHuData
(
link
,
title
,
pt
,
type
,
null
,
source
,
null
,
attitudes_count
,
null
,
comment_count
,
null
,
word
);
zhihu
=
analysisZhihuArticle
(
link
,
proxy
,
zhihu
);
zhihu
=
analysisZhihuArticle
(
link
,
proxy
,
zhihu
);
}
else
{
}
else
{
Integer
answer_count
=
0
;
Integer
answer_count
=
0
;
...
@@ -206,7 +206,7 @@ public class SougouZhihuCrawlerParse{
...
@@ -206,7 +206,7 @@ public class SougouZhihuCrawlerParse{
}
}
answer_count
=
Integer
.
valueOf
(
answerText
);
answer_count
=
Integer
.
valueOf
(
answerText
);
}
}
zhihu
=
new
ZhiHuData
(
link
,
title
,
pt
,
type
,
null
,
null
,
null
,
null
,
answer_count
,
null
,
word
);
zhihu
=
new
ZhiHuData
(
link
,
title
,
pt
,
type
,
null
,
null
,
null
,
null
,
answer_count
,
null
,
null
,
word
);
zhihu
=
analysisZhihuAnswer
(
link
,
proxy
,
zhihu
);
zhihu
=
analysisZhihuAnswer
(
link
,
proxy
,
zhihu
);
}
}
list
.
add
(
zhihu
);
list
.
add
(
zhihu
);
...
@@ -241,6 +241,15 @@ public class SougouZhihuCrawlerParse{
...
@@ -241,6 +241,15 @@ public class SougouZhihuCrawlerParse{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
String
content
=
document
.
select
(
"div.QuestionHeader-main"
).
select
(
"div.QuestionHeader-detail"
).
text
();
String
content
=
document
.
select
(
"div.QuestionHeader-main"
).
select
(
"div.QuestionHeader-detail"
).
text
();
String
commentCountText
=
document
.
select
(
"div.QuestionHeader-Comment"
).
text
();
String
commentCountText
=
document
.
select
(
"div.QuestionHeader-Comment"
).
text
();
String
time
=
""
;
if
(
htmlBody
.
contains
(
"pubDate"
)){
time
=
htmlBody
.
split
(
""pubDate": ""
)[
1
].
split
(
"""
)[
0
];
if
(
time
!=
null
){
time
=
time
.
replaceAll
(
"T"
,
" "
);
}
}
else
{
System
.
out
.
println
(
"+++++++++++++++++++++++"
);
}
String
regEx
=
"[^0-9]"
;
String
regEx
=
"[^0-9]"
;
Pattern
p
=
Pattern
.
compile
(
regEx
);
Pattern
p
=
Pattern
.
compile
(
regEx
);
Matcher
m
=
p
.
matcher
(
commentCountText
);
Matcher
m
=
p
.
matcher
(
commentCountText
);
...
@@ -251,6 +260,7 @@ public class SougouZhihuCrawlerParse{
...
@@ -251,6 +260,7 @@ public class SougouZhihuCrawlerParse{
}
}
zhihu
.
setContent
(
content
);
zhihu
.
setContent
(
content
);
zhihu
.
setComment_count
(
comment_count
);
zhihu
.
setComment_count
(
comment_count
);
zhihu
.
setTime
(
time
);
}
}
return
zhihu
;
return
zhihu
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -274,7 +284,7 @@ public class SougouZhihuCrawlerParse{
...
@@ -274,7 +284,7 @@ public class SougouZhihuCrawlerParse{
try
{
try
{
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
"文章"
);
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
"文章"
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
String
time
=
document
.
select
(
"div.HoverTitle"
).
first
().
select
(
"time"
).
attr
(
"datetime"
)
;
String
time
=
htmlBody
.
split
(
""updated":"
)[
1
].
split
(
","reviewers"
)[
0
]
;
Date
date
=
new
Date
(
time
);
Date
date
=
new
Date
(
time
);
time
=
TimeParse
.
dateFormartString
(
date
,
"yyyy-MM-dd HH:mm:ss"
);
time
=
TimeParse
.
dateFormartString
(
date
,
"yyyy-MM-dd HH:mm:ss"
);
String
content
=
document
.
select
(
"[class=\"RichText PostIndex-content av-paddingSide av-card\"]"
).
text
();
String
content
=
document
.
select
(
"[class=\"RichText PostIndex-content av-paddingSide av-card\"]"
).
text
();
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuCrawlerParse.java
0 → 100644
View file @
20ce0e8c
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.ZhiHuData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
public
class
ZhihuCrawlerParse
{
/** Logger for this crawler, bound to this class so records are attributed to the Zhihu crawler. */
private static final Logger logger = LogManager.getLogger(ZhihuCrawlerParse.class);

/** HTTP client used for every request issued by this class. */
private static final HttpBoot httpBoot = new HttpBoot();
/**
 * Collects Zhihu search results for a keyword, one result page at a time.
 *
 * <p>Paging starts at page 0 and stops as soon as one of the following holds: the download
 * returned no body, the parsed page contained no records, the parser reported that no more
 * data is available, or 20 pages have been requested.
 *
 * @param word      search keyword
 * @param timeLimit time-range filter forwarded to the search request
 * @param proxy     proxy used for the HTTP requests
 * @param endTime   forwarded to the page parser
 * @return all records collected across the requested pages; never {@code null}
 * @throws Exception if a page download fails after its retries
 */
@SuppressWarnings("unchecked")
public static List<ZhiHuData> getZhihuData(String word, String timeLimit, Proxy proxy, Date endTime)
        throws Exception {
    // Hard cap on requested pages. It is enforced in the loop condition because a flag set
    // inside the loop body would be overwritten by the parser's "more" value.
    final int maxPages = 20;

    List<ZhiHuData> list = new ArrayList<ZhiHuData>();
    int page = 0;
    boolean more = true;
    while (more && page < maxPages) {
        String htmlBody = downloadHtml(word, timeLimit, proxy, page);

        // Assume this is the last page unless the parser says otherwise.
        more = false;
        if (htmlBody != null) {
            Map<String, Object> dataMap = analysisData(htmlBody, proxy, word, endTime);
            // The "data" entry is always a List<ZhiHuData> built by analysisData.
            List<ZhiHuData> dataList = (List<ZhiHuData>) dataMap.get("data");
            if (dataList != null && !dataList.isEmpty()) {
                list.addAll(dataList);
                more = Boolean.TRUE.equals(dataMap.get("more"));
            }
        }
        page++;

        // Default throttle between requests.
        // NOTE(review): nothing is slept here when DataCrawler.sleepTime is set; presumably the
        // configured delay is applied elsewhere - confirm.
        if (DataCrawler.sleepTime == null) {
            ZhiWeiTools.sleep(3000);
        }
    }
    return list;
}
/**
 * Downloads one page of Zhihu search results and returns the raw response body.
 *
 * <p>The request is attempted up to three times. Every failed attempt is logged with its
 * stack trace; the exception of the third failed attempt is rethrown to the caller.
 *
 * @param word      search keyword
 * @param timeLimit time-range filter forwarded to {@code getUrl}
 * @param proxy     proxy handed to the HTTP client
 * @param page      zero-based page index
 * @return the response body as text
 * @throws Exception the exception raised by the final attempt when all attempts fail
 */
private static String downloadHtml(String word, String timeLimit, Proxy proxy, int page) throws Exception {
    final int maxAttempts = 3;

    // Common request headers.
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    // Request URL for this page.
    String url = getUrl(word, timeLimit, page);

    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        // try-with-resources releases the response even when reading the body fails.
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy, false)) {
            return response.body().string();
        } catch (Exception e) {
            // Pass the exception itself as the trailing argument so the original stack trace
            // is logged; fillInStackTrace() would overwrite it with this catch site.
            logger.error("获取数据时出现问题,问题为:{}", e.toString(), e);
            if (attempt == maxAttempts) {
                throw e;
            }
        }
    }
    // Not reached: the loop either returns a body or rethrows on the last attempt.
    return null;
}
/**
 * Parses one page of the Zhihu search API response into {@link ZhiHuData} records.
 *
 * <p>Entries of the {@code data} array that carry a {@code data_list} key are skipped. An entry
 * that cannot be converted is logged and skipped; the remaining entries are still processed.
 *
 * @param htmlBody raw JSON response body
 * @param proxy    not used by the parser; kept so the signature stays unchanged
 * @param word     search keyword stored on every record
 * @param endTime  not used by the parser; kept so the signature stays unchanged
 * @return a map with two entries: {@code "data"}, a non-null {@code List<ZhiHuData>}, and
 *         {@code "more"}, a {@code Boolean} that is {@code false} when the response has no
 *         {@code data} array, the array is empty, or the body cannot be parsed
 * @throws Exception declared for callers; parse failures are handled inside this method
 */
private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word, Date endTime)
        throws Exception {
    Map<String, Object> resultMap = new HashMap<String, Object>();
    List<ZhiHuData> list = new ArrayList<ZhiHuData>();
    boolean more = true;
    try {
        JSONArray dataJson = JSONObject.parseObject(htmlBody).getJSONArray("data");
        if (dataJson == null || dataJson.isEmpty()) {
            more = false;
        } else {
            for (int i = 0; i < dataJson.size(); i++) {
                JSONObject itemJson = null;
                try {
                    itemJson = dataJson.getJSONObject(i);
                    if (itemJson.containsKey("data_list")) {
                        continue;
                    }
                    list.add(toZhiHuData(itemJson.getJSONObject("object"), word));
                } catch (Exception e) {
                    // One malformed entry must not discard the rest of the page.
                    logger.warn("Skipping Zhihu search item that could not be parsed: {}", itemJson, e);
                }
            }
        }
    } catch (Exception e) {
        logger.error("Failed to parse Zhihu search response", e);
        more = false;
    }
    resultMap.put("data", list);
    resultMap.put("more", more);
    return resultMap;
}

/** Converts the {@code object} node of one search entry into a {@link ZhiHuData} record. */
private static ZhiHuData toZhiHuData(JSONObject objectJson, String word) throws Exception {
    // created_time is multiplied by 1000 before being handed to Date (seconds -> milliseconds).
    Date date = new Date(objectJson.getLong("created_time") * 1000);
    String time = TimeParse.dateFormartString(date, "yyyy-MM-dd HH:mm:ss");
    String source = objectJson.getJSONObject("author").getString("name");
    String type = objectJson.getString("type");
    if (type == null) {
        throw new IllegalStateException("search item has no type");
    }

    Integer attitudes_count = intOrZero(objectJson, "voteup_count");     // up-votes
    Integer follower_count = intOrZero(objectJson, "follower_count");    // followers
    Integer comment_count = intOrZero(objectJson, "comment_count");      // comments
    Integer answer_count = intOrZero(objectJson, "answer_count");        // answers

    // Entries that belong to a question take the question's name as their title.
    String title = objectJson.containsKey("question")
            ? objectJson.getJSONObject("question").getString("name")
            : objectJson.getString("title");
    // Null-safe join: a missing field must not leak the text "null" into the content.
    String content = nullToEmpty(objectJson.getString("content"))
            + nullToEmpty(objectJson.getString("excerpt"));

    String url = buildItemUrl(type, objectJson);
    content = ZhiWeiTools.delHTMLTag(content);
    title = ZhiWeiTools.delHTMLTag(title);
    return new ZhiHuData(url, title, "知乎", type, time, source, content,
            attitudes_count, answer_count, comment_count, follower_count, word);
}

/** Builds the public page URL for an entry, or returns {@code null} for an unrecognised type. */
private static String buildItemUrl(String type, JSONObject objectJson) {
    if (type.equals("answer")) {
        // Answer pages are addressed as /question/<question id>/answer/<answer id>; the
        // entry's own id is the answer id and the nested question node holds the question id.
        return "https://www.zhihu.com/question/" + objectJson.getJSONObject("question").getLong("id")
                + "/answer/" + objectJson.getLong("id");
    }
    if (type.equals("article")) {
        return "https://zhuanlan.zhihu.com/p/" + objectJson.getLong("id");
    }
    if (type.equals("question")) {
        return "https://www.zhihu.com/question/" + objectJson.getLong("id");
    }
    return null;
}

/** Reads an integer field, substituting 0 when the field is absent. */
private static Integer intOrZero(JSONObject json, String key) {
    Integer value = json.getInteger(key);
    return value != null ? value : 0;
}

/** Returns the given text, or the empty string when it is {@code null}. */
private static String nullToEmpty(String text) {
    return text != null ? text : "";
}
/**
 * Builds the Zhihu search API request URL for one result page.
 *
 * @param word      search keyword; URL-encoded as UTF-8
 * @param timeLimit value of the {@code time_zone} query parameter; the parameter is omitted
 *                  when this is {@code null}
 * @param page      zero-based page index, translated into an offset of 50 results per page
 * @return the request URL, or {@code null} when {@code word} is {@code null}
 */
private static String getUrl(String word, String timeLimit, int page) {
    if (word == null) {
        return null;
    }
    // Page size is used for both the limit parameter and the offset computation.
    final int pageSize = 50;

    StringBuilder url = new StringBuilder("https://www.zhihu.com/api/v4/search_v3?t=general&correction=1");
    url.append("&limit=").append(pageSize);
    url.append("&show_all_topics=0");
    url.append("&q=").append(URLCodeUtil.getURLEncode(word, "utf-8"));
    if (timeLimit != null) {
        url.append("&time_zone=").append(timeLimit);
    }
    url.append("&offset=").append(page * pageSize);

    logger.debug("Zhihu search URL: {}", url);
    return url.toString();
}
}
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
View file @
20ce0e8c
...
@@ -369,4 +369,24 @@ public class DataCrawler {
...
@@ -369,4 +369,24 @@ public class DataCrawler {
/**
 * Collects Zhihu data for a keyword.
 *
 * @param word      search keyword
 * @param timeLimit time range: {@code a_day} (last day), {@code a_week} (last week) or
 *                  {@code three_months} (last three months)
 * @param endDate   end date forwarded to the crawler
 * @param proxy     proxy for the HTTP requests
 * @return the records returned by {@code ZhihuCrawlerParse.getZhihuData}
 * @throws Exception any failure raised by the crawler, propagated unchanged
 */
public static List<ZhiHuData> getZhihuByWord(String word, String timeLimit, Date endDate, Proxy proxy)
        throws Exception {
    // Note the argument order: the crawler takes the proxy before the end date.
    List<ZhiHuData> collected = ZhihuCrawlerParse.getZhihuData(word, timeLimit, proxy, endDate);
    return collected;
}
}
}
src/main/java/com/zhiwei/media_data_crawler/entity/ZhiHuData.java
View file @
20ce0e8c
...
@@ -26,6 +26,8 @@ public class ZhiHuData implements Serializable{
...
@@ -26,6 +26,8 @@ public class ZhiHuData implements Serializable{
private
Integer
comment_count
;
//评论数
private
Integer
comment_count
;
//评论数
private
Integer
follower_count
;
@Override
@Override
public
String
toString
(){
public
String
toString
(){
return
"new ZhiHuData["
return
"new ZhiHuData["
...
@@ -39,6 +41,7 @@ public class ZhiHuData implements Serializable{
...
@@ -39,6 +41,7 @@ public class ZhiHuData implements Serializable{
+
", attitudes_count = "
+
attitudes_count
+
", attitudes_count = "
+
attitudes_count
+
", answer_count = "
+
answer_count
+
", answer_count = "
+
answer_count
+
", comment_count = "
+
comment_count
+
", comment_count = "
+
comment_count
+
", follower_count = "
+
follower_count
+
", word = "
+
word
+
", word = "
+
word
+
"]"
;
+
"]"
;
}
}
...
@@ -47,7 +50,7 @@ public class ZhiHuData implements Serializable{
...
@@ -47,7 +50,7 @@ public class ZhiHuData implements Serializable{
public
ZhiHuData
(
String
url
,
String
title
,
String
pt
,
String
type
,
String
time
,
String
source
,
public
ZhiHuData
(
String
url
,
String
title
,
String
pt
,
String
type
,
String
time
,
String
source
,
String
content
,
Integer
attitudes_count
,
Integer
answer_count
,
Integer
comment_count
String
content
,
Integer
attitudes_count
,
Integer
answer_count
,
Integer
comment_count
,
String
word
){
,
Integer
follower_count
,
String
word
){
this
.
url
=
url
;
this
.
url
=
url
;
this
.
title
=
title
;
this
.
title
=
title
;
this
.
pt
=
pt
;
this
.
pt
=
pt
;
...
@@ -58,6 +61,7 @@ public class ZhiHuData implements Serializable{
...
@@ -58,6 +61,7 @@ public class ZhiHuData implements Serializable{
this
.
attitudes_count
=
attitudes_count
;
this
.
attitudes_count
=
attitudes_count
;
this
.
answer_count
=
answer_count
;
this
.
answer_count
=
answer_count
;
this
.
comment_count
=
comment_count
;
this
.
comment_count
=
comment_count
;
this
.
follower_count
=
follower_count
;
this
.
word
=
word
;
this
.
word
=
word
;
}
}
...
@@ -151,5 +155,11 @@ private String word; //采集关键词
...
@@ -151,5 +155,11 @@ private String word; //采集关键词
this
.
comment_count
=
comment_count
;
this
.
comment_count
=
comment_count
;
}
}
/**
 * Returns the follower count stored on this record.
 *
 * @return the follower count; may be {@code null} when it was never set
 */
public Integer getFollower_count() {
    return this.follower_count;
}
/**
 * Stores the follower count on this record.
 *
 * @param follower_count the new follower count
 */
public void setFollower_count(Integer follower_count) {
    this.follower_count = follower_count;
}
}
}
src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
View file @
20ce0e8c
package
com
.
zhiwei
.
media_data_crawler
.
test
;
package
com
.
zhiwei
.
media_data_crawler
.
test
;
import
java.net.Proxy
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.media_data_crawler.crawler.BaiduTiebaCrawlerParse
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.DouBanData
;
import
com.zhiwei.media_data_crawler.entity.LunTanData
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.media_data_crawler.entity.TiebaData
;
import
com.zhiwei.media_data_crawler.entity.ZhiHuData
;
import
com.zhiwei.media_data_crawler.entity.ZhiHuData
;
import
com.zhiwei.tools.timeparse.TimeParse
;
public
class
DataCrawlerTest
{
public
class
DataCrawlerTest
{
public
static
void
main
(
String
[]
args
)
{
DataCrawlerTest
.
getSoNewsTest
();
}
public
static
void
getSoNewsTest
(){
public
void
getSoNewsTest
(){
String
word
=
"58同城"
;
//关键词
String
word
=
"马云"
;
//关键词
String
startTime
=
"2018-10-23 23:00:00"
;
//开始时间
String
startTime
=
"2017-03-01 00:00:00"
;
//开始时间
String
endTime
=
"2018-10-23 23:59:59"
;
//结束时间
String
endTime
=
"2017-03-01 23:59:59"
;
//结束时间
Proxy
proxy
=
null
;
//代理IP,不用可不填写
Proxy
proxy
=
null
;
//代理IP,不用可不填写
try
{
try
{
// //百度新闻采集demo
// //百度新闻采集demo
List
<
NewsData
>
list
=
DataCrawler
.
getBaiduNewsData
(
word
,
startTime
,
endTime
,
proxy
);
//
List<NewsData> list = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
// //搜狗新闻关键词采集demo
// //搜狗新闻关键词采集demo
// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
// //360新闻采集demo
// //360新闻采集demo
// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
// //搜狗知乎采集
// List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
// System.out.println(zhihuList.size());
// //Baidu貼吧採集
// //Baidu貼吧採集
// String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null
// String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null
// List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
// List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
...
@@ -41,17 +40,64 @@ public class DataCrawlerTest {
...
@@ -41,17 +40,64 @@ public class DataCrawlerTest {
//豆瓣采集
//豆瓣采集
// String type = "topic"; //topic 为指定话题采集,note为指定日记采集
// String type = "topic"; //topic 为指定话题采集,note为指定日记采集
// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
// List<NewsData> list = DataCrawler.getSoData("京东", "www.toutiao.com", "d", proxy);
Date
endDate
=
TimeParse
.
stringFormartDate
(
endTime
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
List
<
String
>
headList
=
new
ArrayList
<>();
headList
.
add
(
"url"
);
headList
.
add
(
"title"
);
headList
.
add
(
"pt"
);
headList
.
add
(
"type"
);
headList
.
add
(
"time"
);
headList
.
add
(
"source"
);
headList
.
add
(
"content"
);
headList
.
add
(
"attitudes_count"
);
headList
.
add
(
"answer_count"
);
headList
.
add
(
"comment_count"
);
headList
.
add
(
"word"
);
//搜狗知乎采集
String
[]
words
=
word
.
split
(
"\\|"
);
// List<NewsData> list = DataCrawler.getSoData("京东", "www.toutiao.com", "d", proxy);
for
(
int
i
=
0
;
i
<
words
.
length
;
i
++){
for
(
NewsData
newsData
:
list
)
{
System
.
out
.
println
(
words
[
i
]+
" 开始采集"
);
System
.
out
.
println
(
newsData
);
List
<
ZhiHuData
>
zhihuList
=
DataCrawler
.
getZhihuByWord
(
words
[
i
],
"a_week"
,
endDate
,
proxy
);
System
.
out
.
println
(
words
[
i
]+
"=============="
+
zhihuList
.
size
());
for
(
ZhiHuData
zhiHuData
:
zhihuList
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
map
.
put
(
"url"
,
zhiHuData
.
getUrl
());
map
.
put
(
"title"
,
zhiHuData
.
getTitle
());
map
.
put
(
"pt"
,
zhiHuData
.
getPt
());
map
.
put
(
"type"
,
zhiHuData
.
getType
());
map
.
put
(
"time"
,
zhiHuData
.
getTime
());
map
.
put
(
"source"
,
zhiHuData
.
getSource
());
map
.
put
(
"content"
,
zhiHuData
.
getContent
());
map
.
put
(
"attitudes_count"
,
zhiHuData
.
getAttitudes_count
());
map
.
put
(
"answer_count"
,
zhiHuData
.
getAnswer_count
());
map
.
put
(
"comment_count"
,
zhiHuData
.
getComment_count
());
map
.
put
(
"word"
,
zhiHuData
.
getWord
());
dataList
.
add
(
map
);
}
}
}
poi
.
exportExcel
(
"F://知乎数据采集.xlsx"
,
"0"
,
headList
,
dataList
);;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
// TODO Auto-generated catch block
// TODO Auto-generated catch block
e
.
printStackTrace
();
e
.
printStackTrace
();
}
}
}
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment