Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
3c2a6baa
Commit
3c2a6baa
authored
Jan 21, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加 知乎用户 百度知道关键词采集
parent
f19fd0ee
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
378 additions
and
13 deletions
+378
-13
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
+1
-1
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduZhidaoCrawlerParse.java
+68
-0
src/main/java/com/zhiwei/media_data_crawler/crawler/TianYaCrawlerParse.java
+1
-1
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuAnswerCommentParse.java
+169
-0
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuCrawlerParse.java
+5
-5
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
+27
-4
src/main/java/com/zhiwei/media_data_crawler/entity/ZhihuAnswer.java
+0
-1
src/main/java/com/zhiwei/media_data_crawler/entity/ZhihuAnswerComment.java
+87
-0
src/main/java/com/zhiwei/media_data_crawler/entity/ZhihuQuestionData.java
+20
-1
No files found.
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
View file @
3c2a6baa
...
@@ -59,7 +59,7 @@ public class BaiduTiebaCrawlerParse {
...
@@ -59,7 +59,7 @@ public class BaiduTiebaCrawlerParse {
page
++;
page
++;
if
(
DataCrawler
.
sleepTime
==
null
){
if
(
DataCrawler
.
sleepTime
==
null
){
ZhiWeiTools
.
sleep
(
3000
);
ZhiWeiTools
.
sleep
(
3000
);
}
}
}
}
return
list
;
return
list
;
}
}
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduZhidaoCrawlerParse.java
0 → 100644
View file @
3c2a6baa
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
/**
 * Keyword search crawler for Baidu Zhidao (zhidao.baidu.com).
 *
 * <p>Pages through the search result listing for a keyword and converts every result entry
 * into a plain map with the keys {@code url}, {@code title}, {@code content}, {@code time}
 * and {@code source}.
 */
public class BaiduZhidaoCrawlerParse {

    private static final Logger logger = LoggerFactory.getLogger(BaiduZhidaoCrawlerParse.class);

    private static final HttpBoot httpBoot = new HttpBoot();

    /** Search endpoint; the GBK-encoded keyword and the {@code pn} offset are appended to it. */
    private static final String SEARCH_URL =
            "https://zhidao.baidu.com/search?lm=0&rn=10&fr=search&ie=gbk&word=";

    /** Increment of the {@code pn} offset between two consecutive requests. */
    private static final int OFFSET_STEP = 10;

    /** A request that contributes fewer entries than this is treated as the last page. */
    private static final int MIN_ENTRIES_TO_CONTINUE = 8;

    /**
     * Collects the search results for a keyword.
     *
     * <p>Requests are issued with an increasing {@code pn} offset until one of them yields fewer
     * than {@value #MIN_ENTRIES_TO_CONTINUE} entries.
     * NOTE(review): there is no upper bound on the number of requests; termination relies
     * entirely on the site eventually returning a short page.
     *
     * @param word  keyword to search for
     * @param proxy proxy handed to the HTTP client for every request
     * @return one map per result entry; an empty list when any step of the crawl throws
     *         (entries gathered before the failure are discarded, as before)
     */
    public static List<Map<String, Object>> getData(String word, ProxyHolder proxy) {
        try {
            // The keyword does not change between pages, so encode it once.
            String encodedWord = URLEncoder.encode(word, "gbk");
            List<Map<String, Object>> dataList = new ArrayList<>();
            int offset = 0;
            while (true) {
                int sizeBefore = dataList.size();
                String url = SEARCH_URL + encodedWord + "&pn=" + offset;
                logger.debug(" request url = {} ", url);
                String result = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy).body().string();
                Document doc = Jsoup.parse(result);
                Elements elements = doc.select("div.list").select("dl");
                for (Element element : elements) {
                    dataList.add(toRow(element));
                }
                if (dataList.size() - sizeBefore < MIN_ENTRIES_TO_CONTINUE) {
                    break;
                }
                offset += OFFSET_STEP;
            }
            return dataList;
        } catch (Exception e) {
            // Pass the keyword for the placeholder; the exception is still logged with its trace.
            logger.error(" 采集错误 {} ", word, e);
        }
        return Collections.emptyList();
    }

    /** Converts one {@code dl} result entry into its map representation. */
    private static Map<String, Object> toRow(Element element) {
        Map<String, Object> map = new HashMap<>();
        map.put("url", stripQuery(element.select("a.ti").attr("href")));
        map.put("title", element.select("a.ti").text());
        map.put("content", element.select("dd.answer").text());
        map.put("time", element.select("dd.dd.explain.f-light > span:nth-child(1)").text());
        map.put("source", element.select("dd.dd.explain.f-light > span:nth-child(2) > a").text());
        return map;
    }

    /** Returns the part of {@code href} before the first {@code '?'}, or all of it if there is none. */
    private static String stripQuery(String href) {
        int queryStart = href.indexOf('?');
        return queryStart >= 0 ? href.substring(0, queryStart) : href;
    }
}
src/main/java/com/zhiwei/media_data_crawler/crawler/TianYaCrawlerParse.java
View file @
3c2a6baa
...
@@ -146,7 +146,7 @@ public class TianYaCrawlerParse {
...
@@ -146,7 +146,7 @@ public class TianYaCrawlerParse {
if
(
date
.
before
(
endDate
)){
if
(
date
.
before
(
endDate
)){
more
=
false
;
more
=
false
;
}
else
{
}
else
{
System
.
out
.
println
(
luntanData
);
//
System.out.println(luntanData);
list
.
add
(
luntanData
);
list
.
add
(
luntanData
);
}
}
}
}
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuAnswerCommentParse.java
0 → 100644
View file @
3c2a6baa
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Objects
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.media_data_crawler.entity.ZhihuAnswerComment
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
/**
 * Crawler for the comments posted under a single Zhihu answer.
 *
 * <p>Given an answer URL such as
 * {@code https://www.zhihu.com/question/36267070/answer/575449468}, the root comments are read
 * from the {@code /api/v4/answers/{id}/root_comments} endpoint. For every root comment that
 * reports child comments, the replies are read from
 * {@code /api/v4/comments/{id}/child_comments}.
 */
public class ZhihuAnswerCommentParse {

    private static final Logger logger = LogManager.getLogger(ZhihuAnswerCommentParse.class);

    private static final HttpBoot httpBoot = new HttpBoot();

    /** Path marker that precedes the answer id inside an answer URL. */
    private static final String ANSWER_MARKER = "answer/";

    /**
     * Offset increment between consecutive page requests.
     * NOTE(review): the requests ask for limit=50 while the offset advances by 20; if the API
     * honours the limit, consecutive pages overlap. Confirm against the API behaviour.
     */
    private static final int OFFSET_STEP = 20;

    /** Number of attempts for one page of root comments. */
    private static final int ROOT_PAGE_ATTEMPTS = 2;

    /** Number of attempts for one page of child comments. */
    private static final int CHILD_PAGE_ATTEMPTS = 4;

    /** Pause before every child-comment request, in milliseconds. */
    private static final int CHILD_REQUEST_DELAY_MS = 200;

    /**
     * Collects the root comments of an answer together with their replies.
     *
     * <p>Each page is attempted up to {@value #ROOT_PAGE_ATTEMPTS} times. Paging stops as soon
     * as every attempt for a page leaves the result list unchanged.
     *
     * @param url   answer URL containing {@code answer/<id>}
     * @param proxy proxy used for the root-comment requests
     * @return the collected comments; an empty list when no answer id can be derived from
     *         {@code url}
     */
    public static List<ZhihuAnswerComment> getAnswerData(String url, ProxyHolder proxy) {
        String id = getAnswerId(url);
        if (Objects.isNull(id)) {
            return Collections.emptyList();
        }
        List<ZhihuAnswerComment> zacList = new ArrayList<>();
        int offset = 0;
        boolean more = true;
        while (more) {
            int sizeBeforeAttempt = -1;
            for (int attempt = 0; attempt < ROOT_PAGE_ATTEMPTS; attempt++) {
                sizeBeforeAttempt = zacList.size();
                String nurl = "https://www.zhihu.com/api/v4/answers/" + id
                        + "/root_comments?include=data%5B*%5D.author%2Ccollapsed%2Creply_to_author"
                        + "%2Cdisliked%2Ccontent%2Cvoting%2Cvote_count%2Cis_parent_author%2Cis_author"
                        + "&order=normal&limit=50&offset=" + offset + "&status=open";
                try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(nurl), proxy)) {
                    String result = response.body().string();
                    zacList.addAll(getData(result));
                    if (sizeBeforeAttempt != zacList.size()) {
                        break;
                    }
                    // Only reached when this attempt produced nothing new.
                    logger.info(" url = {} 数据量 = {} 第 {} 页", url, zacList.size(), offset / OFFSET_STEP);
                } catch (Exception e) {
                    // The request URL fills the placeholder; the exception keeps its stack trace.
                    logger.error(" exception {} ", nurl, e);
                }
            }
            offset += OFFSET_STEP;
            // Stop once the last attempt for this page added nothing.
            more = sizeBeforeAttempt != zacList.size();
        }
        return zacList;
    }

    /**
     * Parses one page of root comments and fetches the replies of each of them.
     *
     * @param result JSON body of a {@code root_comments} response
     * @return the comments parsed so far; on a parse failure the entries collected before the
     *         failure are still returned
     */
    private static List<ZhihuAnswerComment> getData(String result) {
        List<ZhihuAnswerComment> dataList = new ArrayList<>();
        try {
            JSONArray jarr = JSONObject.parseObject(result).getJSONArray("data");
            if (Objects.isNull(jarr)) {
                return dataList;
            }
            for (int i = 0; i < jarr.size(); i++) {
                JSONObject data = jarr.getJSONObject(i);
                int childCommentCount = data.getInteger("child_comment_count");
                ZhihuAnswerComment zac = toComment(data);
                zac.setChildCommentCount(childCommentCount);
                dataList.add(zac);
                collectReplies(zac.getId(), childCommentCount, dataList);
            }
        } catch (Exception e) {
            logger.error(" 解析出错 {} ", e.toString(), e);
        }
        return dataList;
    }

    /**
     * Fetches the replies of one root comment page by page and appends them to {@code sink}.
     * Each page is tried up to {@value #CHILD_PAGE_ATTEMPTS} times until it yields data.
     */
    private static void collectReplies(String rootId, int childCommentCount,
            List<ZhihuAnswerComment> sink) {
        for (int offset = 0; offset < childCommentCount; offset += OFFSET_STEP) {
            for (int attempt = 0; attempt < CHILD_PAGE_ATTEMPTS; attempt++) {
                // Throttle the requests so the IP does not get blocked.
                ZhiWeiTools.sleep(CHILD_REQUEST_DELAY_MS);
                String url2 = "https://www.zhihu.com/api/v4/comments/" + rootId
                        + "/child_comments?include=%24%5B%2A%5D.author%2Creply_to_author%2Ccontent"
                        + "%2Cvote_count&limit=50&offset=" + offset
                        + "&include=%24%5B*%5D.author%2Creply_to_author%2Ccontent%2Cvote_count";
                List<ZhihuAnswerComment> replayList = getReplayList(url2, rootId);
                if (!replayList.isEmpty()) {
                    sink.addAll(replayList);
                    break;
                }
            }
        }
    }

    /**
     * Downloads and parses one page of replies.
     *
     * <p>NOTE(review): this always goes through {@code ProxyFactory.getNatProxy()} rather than
     * the proxy given to {@link #getAnswerData(String, ProxyHolder)}; confirm that is intended.
     *
     * @param url       fully built {@code child_comments} request URL
     * @param strRootID id of the root comment the replies belong to
     * @return the parsed replies; an empty list when the request or the parsing fails
     */
    private static List<ZhihuAnswerComment> getReplayList(String url, String strRootID) {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyFactory.getNatProxy())) {
            List<ZhihuAnswerComment> dataList = new ArrayList<>();
            String result = response.body().string();
            if (result != null) {
                JSONArray dataArray = JSONObject.parseObject(result).getJSONArray("data");
                if (Objects.nonNull(dataArray)) {
                    for (int j = 0; j < dataArray.size(); j++) {
                        ZhihuAnswerComment zac = toComment(dataArray.getJSONObject(j));
                        zac.setRootId(strRootID);
                        dataList.add(zac);
                    }
                }
            }
            return dataList;
        } catch (Exception e) {
            logger.error(" 知乎回复解析出错 {} ", url, e);
        }
        return Collections.emptyList();
    }

    /**
     * Maps the fields shared by root comments and replies from a JSON comment object.
     * A missing {@code author.member}, {@code content}, {@code vote_count} or
     * {@code created_time} raises a runtime exception, which the callers catch and log.
     */
    private static ZhihuAnswerComment toComment(JSONObject json) {
        JSONObject member = json.getJSONObject("author").getJSONObject("member");
        ZhihuAnswerComment zac = new ZhihuAnswerComment();
        zac.setAttitudeCount(json.getInteger("vote_count"));
        zac.setAuthor(member.getString("name"));
        // Only the opening paragraph tag is removed, as before.
        zac.setContent(json.getString("content").replace("<p>", ""));
        zac.setId(json.getString("id"));
        zac.setAuthorUrl(member.getString("url"));
        // created_time is multiplied by 1000 to obtain the millisecond value Date expects.
        zac.setTime(new Date(json.getLong("created_time") * 1000L));
        return zac;
    }

    /**
     * Extracts the answer id that follows {@code answer/} in an answer URL.
     * Anything from the first {@code '?'}, {@code '#'} or {@code '/'} after the id is dropped.
     *
     * @param url answer URL, may be {@code null}
     * @return the id, or {@code null} when {@code url} is {@code null}, has no
     *         {@code answer/} segment, or nothing follows that segment
     */
    private static String getAnswerId(String url) {
        if (url == null) {
            return null;
        }
        int markerAt = url.indexOf(ANSWER_MARKER);
        if (markerAt < 0) {
            return null;
        }
        String tail = url.substring(markerAt + ANSWER_MARKER.length());
        int end = tail.length();
        for (int i = 0; i < tail.length(); i++) {
            char c = tail.charAt(i);
            if (c == '?' || c == '#' || c == '/') {
                end = i;
                break;
            }
        }
        return end == 0 ? null : tail.substring(0, end);
    }
}
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuCrawlerParse.java
View file @
3c2a6baa
...
@@ -142,11 +142,11 @@ public class ZhihuCrawlerParse {
...
@@ -142,11 +142,11 @@ public class ZhihuCrawlerParse {
return
null
;
return
null
;
}
}
public
static
void
main
(
String
[]
args
)
{
//
public static void main(String[] args) {
String
url
=
"https://zhuanlan.zhihu.com/p/31577152"
;
//
String url = "https://zhuanlan.zhihu.com/p/31577152";
ZhiHuData
zqd
=
ZhihuCrawlerParse
.
getUrlData
(
url
,
null
);
//
ZhiHuData zqd = ZhihuCrawlerParse.getUrlData(url, null);
System
.
out
.
println
(
zqd
.
toString
());
//
System.out.println(zqd.toString());
}
//
}
/**
/**
*
*
...
...
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
View file @
3c2a6baa
package
com
.
zhiwei
.
media_data_crawler
.
data
;
package
com
.
zhiwei
.
media_data_crawler
.
data
;
import
java.net.Proxy
;
import
java.net.Proxy
;
import
java.util.Collections
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.media_data_crawler.crawler.*
;
import
com.zhiwei.media_data_crawler.crawler.*
;
import
com.zhiwei.media_data_crawler.entity.*
;
import
com.zhiwei.media_data_crawler.entity.*
;
...
@@ -38,7 +40,30 @@ public class DataCrawler {
...
@@ -38,7 +40,30 @@ public class DataCrawler {
return
null
;
return
null
;
}
}
}
}
/**
 * Searches Baidu Zhidao for a keyword by delegating to
 * {@code BaiduZhidaoCrawlerParse.getData(word, proxy)}.
 *
 * @param word  keyword to search for
 * @param proxy proxy passed through to the crawler
 * @return the crawler's result, or an empty list when the call throws
 */
public static List<Map<String, Object>> getBaiduZhidaoData(String word, ProxyHolder proxy) {
    List<Map<String, Object>> rows;
    try {
        rows = BaiduZhidaoCrawlerParse.getData(word, proxy);
    } catch (Exception ignored) {
        // Any failure is reported to the caller as "no data".
        rows = Collections.emptyList();
    }
    return rows;
}
/**
/**
*
*
* @Title: getBaiduNewsData
* @Title: getBaiduNewsData
...
@@ -212,8 +237,7 @@ public class DataCrawler {
...
@@ -212,8 +237,7 @@ public class DataCrawler {
try
{
try
{
return
BaiduTiebaCrawlerParse
.
getBaiduTiebaData
(
word
,
proxy
,
null
);
return
BaiduTiebaCrawlerParse
.
getBaiduTiebaData
(
word
,
proxy
,
null
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
Collections
.
emptyList
();
return
null
;
}
}
}
}
...
@@ -283,8 +307,7 @@ public class DataCrawler {
...
@@ -283,8 +307,7 @@ public class DataCrawler {
try
{
try
{
return
TianYaCrawlerParse
.
getLunTanData
(
word
,
proxy
,
endTime
);
return
TianYaCrawlerParse
.
getLunTanData
(
word
,
proxy
,
endTime
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
Collections
.
emptyList
();
return
null
;
}
}
}
}
...
...
src/main/java/com/zhiwei/media_data_crawler/entity/ZhihuAnswer.java
View file @
3c2a6baa
...
@@ -29,7 +29,6 @@ public class ZhihuAnswer implements Serializable {
...
@@ -29,7 +29,6 @@ public class ZhihuAnswer implements Serializable {
private
Integer
bord_count
;
//评论数
private
Integer
bord_count
;
//评论数
public
ZhihuAnswer
(){}
public
ZhihuAnswer
(){}
public
ZhihuAnswer
(
String
url
,
String
from_url
,
String
title
,
Date
time
,
String
author
,
public
ZhihuAnswer
(
String
url
,
String
from_url
,
String
title
,
Date
time
,
String
author
,
...
...
src/main/java/com/zhiwei/media_data_crawler/entity/ZhihuAnswerComment.java
0 → 100644
View file @
3c2a6baa
package
com
.
zhiwei
.
media_data_crawler
.
entity
;
import
java.util.Date
;
/**
 * Mutable bean describing one comment under a Zhihu answer.
 * Accessors are listed in the same order as the fields.
 */
public class ZhihuAnswerComment {

    /** Identifier of this comment. */
    private String id;

    /** Name of the comment author. */
    private String author;

    /** Text of the comment. */
    private String content;

    /** Identifier of the root comment this comment belongs to, if any. */
    private String rootId;

    /** Time associated with the comment. */
    private Date time;

    /** Attitude (vote) count of the comment. */
    private int attitudeCount;

    /** URL of the comment author. */
    private String authorUrl;

    /** Number of child comments of this comment. */
    private int childCommentCount;

    /** @return the comment id */
    public String getId() {
        return this.id;
    }

    /** @param id the comment id */
    public void setId(final String id) {
        this.id = id;
    }

    /** @return the author name */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the author name */
    public void setAuthor(final String author) {
        this.author = author;
    }

    /** @return the comment text */
    public String getContent() {
        return this.content;
    }

    /** @param content the comment text */
    public void setContent(final String content) {
        this.content = content;
    }

    /** @return the root comment id */
    public String getRootId() {
        return this.rootId;
    }

    /** @param rootId the root comment id */
    public void setRootId(final String rootId) {
        this.rootId = rootId;
    }

    /** @return the comment time */
    public Date getTime() {
        return this.time;
    }

    /** @param time the comment time */
    public void setTime(final Date time) {
        this.time = time;
    }

    /** @return the attitude count */
    public int getAttitudeCount() {
        return this.attitudeCount;
    }

    /** @param attitudeCount the attitude count */
    public void setAttitudeCount(final int attitudeCount) {
        this.attitudeCount = attitudeCount;
    }

    /** @return the author URL */
    public String getAuthorUrl() {
        return this.authorUrl;
    }

    /** @param authorUrl the author URL */
    public void setAuthorUrl(final String authorUrl) {
        this.authorUrl = authorUrl;
    }

    /** @return the number of child comments */
    public int getChildCommentCount() {
        return this.childCommentCount;
    }

    /** @param childCommentCount the number of child comments */
    public void setChildCommentCount(final int childCommentCount) {
        this.childCommentCount = childCommentCount;
    }
}
src/main/java/com/zhiwei/media_data_crawler/entity/ZhihuQuestionData.java
View file @
3c2a6baa
...
@@ -7,6 +7,16 @@ public class ZhihuQuestionData {
...
@@ -7,6 +7,16 @@ public class ZhihuQuestionData {
private
String
time
;
private
String
time
;
private
String
url
;
private
String
url
;
private
String
authorUrl
;
public
String
getAuthorUrl
()
{
return
authorUrl
;
}
public
void
setAuthorUrl
(
String
authorUrl
)
{
this
.
authorUrl
=
authorUrl
;
}
public
String
getTitle
()
{
public
String
getTitle
()
{
return
title
;
return
title
;
...
@@ -35,7 +45,7 @@ public class ZhihuQuestionData {
...
@@ -35,7 +45,7 @@ public class ZhihuQuestionData {
@Override
@Override
public
String
toString
()
{
public
String
toString
()
{
return
"ZhihuQuestionData [title="
+
title
+
", time="
+
time
+
", url="
return
"ZhihuQuestionData [title="
+
title
+
", time="
+
time
+
", url="
+
url
+
"]"
;
+
url
+
"
, authorUrl="
+
authorUrl
+
"
]"
;
}
}
public
ZhihuQuestionData
(
String
title
,
String
time
,
String
url
)
{
public
ZhihuQuestionData
(
String
title
,
String
time
,
String
url
)
{
...
@@ -45,6 +55,15 @@ public class ZhihuQuestionData {
...
@@ -45,6 +55,15 @@ public class ZhihuQuestionData {
this
.
url
=
url
;
this
.
url
=
url
;
}
}
public
ZhihuQuestionData
(
String
title
,
String
time
,
String
url
,
String
authorUrl
)
{
super
();
this
.
title
=
title
;
this
.
time
=
time
;
this
.
url
=
url
;
this
.
authorUrl
=
authorUrl
;
}
public
ZhihuQuestionData
()
{
public
ZhihuQuestionData
()
{
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment