Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
c4e67a9e
Commit
c4e67a9e
authored
Jan 12, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
知乎 增加用户采集和 链接更新问题时间
parent
bb73a9c6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
391 additions
and
1 deletions
+391
-1
pom.xml
+1
-0
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuCrawlerParse.java
+157
-0
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
+14
-0
src/main/java/com/zhiwei/media_data_crawler/entity/ZhiHuData.java
+2
-1
src/main/java/com/zhiwei/media_data_crawler/entity/ZhihuAuthor.java
+166
-0
src/main/java/com/zhiwei/media_data_crawler/entity/ZhihuQuestionData.java
+51
-0
No files found.
pom.xml
View file @
c4e67a9e
...
...
@@ -7,6 +7,7 @@
<description>
网媒数据抓取,包含百度新闻、搜狗新闻、360新闻、知乎回答列表等
</description>
<!-- 打包管理 -->
<build>
<plugins>
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuCrawlerParse.java
View file @
c4e67a9e
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
...
...
@@ -16,6 +18,8 @@ import com.zhiwei.crawler.core.HttpBoot;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.ZhiHuData
;
import
com.zhiwei.media_data_crawler.entity.ZhihuAuthor
;
import
com.zhiwei.media_data_crawler.entity.ZhihuQuestionData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
...
...
@@ -70,6 +74,159 @@ public class ZhihuCrawlerParse {
return
list
;
}
/**
 * Fetches a question from the Zhihu v4 questions API and returns its title,
 * creation time and URL.
 *
 * <p>The {@code created} field of the response is multiplied by 1000 to build a
 * {@link Date}, which is then formatted as {@code yyyy-MM-dd HH:mm:ss}.
 *
 * @param id    question id appended to {@code https://www.zhihu.com/api/v4/questions/}
 * @param proxy proxy handed to the HTTP client for the request
 * @return the question data, or {@code null} when the request or the parsing fails
 *         (the failure is logged)
 */
public static ZhihuQuestionData getQuestionData(String id, Proxy proxy) {
    String api = "https://www.zhihu.com/api/v4/questions/" + id;
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(api), proxy)) {
        JSONObject json = JSONObject.parseObject(response.body().string());
        // Read the timestamp as a long, like getUrlByKey does for "created_time",
        // so the value is not limited to the int range.
        long createdMillis = json.getLong("created") * 1000L;
        String time = TimeParse.dateFormartString(new Date(createdMillis), "yyyy-MM-dd HH:mm:ss");
        return new ZhihuQuestionData(json.getString("title"), time, json.getString("url"));
    } catch (Exception e) {
        // The id fills the "{}" placeholder; the trailing exception is logged with its stack trace.
        logger.error(" 知乎 问题获取出错 {} ", id, e);
    }
    return null;
}
/**
 * Downloads a Zhihu user page and extracts the user's profile from the JSON
 * embedded in its {@code js-initialData} script element.
 *
 * <p>The user entry is looked up under
 * {@code initialState.entities.users.<token>}, where {@code <token>} is the fifth
 * {@code /}-separated segment of {@code url} (index 4).
 * Counters missing from the entry default to {@code 0}; a missing description
 * stays {@code null}.
 *
 * @param url   user page URL; it is also stored on the returned object
 * @param proxy proxy handed to the HTTP client for the request
 * @return the populated author, or {@code null} when the page cannot be fetched or
 *         parsed, or when it holds no entry for the user (failures are logged)
 */
public static ZhihuAuthor getZhihuUser(String url, Proxy proxy) {
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
        String result = response.body().string();
        // Cut the embedded JSON out of the HTML: everything between the opening
        // js-initialData tag and the following closing script tag.
        String jsondata = result.split("js-initialData\" type=\"text/json\"\\>")[1]
                .split("\\</script")[0];
        JSONObject json = JSONObject.parseObject(jsondata);
        String jsonget = url.split("/")[4];
        JSONObject jsonentities = json.getJSONObject("initialState").getJSONObject("entities")
                .getJSONObject("users").getJSONObject(jsonget);
        if (jsonentities == null) {
            return null;
        }

        ZhihuAuthor za = new ZhihuAuthor();
        za.setName(jsonentities.getString("name"));
        // getIntValue yields 0 for an absent counter instead of failing on unboxing null.
        za.setThank(jsonentities.getIntValue("thankedCount"));
        za.setCollection(jsonentities.getIntValue("favoritedCount"));
        za.setFensi(jsonentities.getIntValue("followerCount"));
        String description = jsonentities.getString("description");
        // Strip markup tags from the description when one is present.
        za.setDescript(description == null ? null : description.replaceAll("<.*?>", ""));
        za.setGuanzhu(jsonentities.getIntValue("followingCount"));
        za.setTags(jsonentities.getString("headline"));
        za.setIsauthentication(jsonentities.getBooleanValue("isOrg"));
        za.setAuthentication(jsonentities.getString("orgName"));
        za.setLike(jsonentities.getIntValue("voteupCount"));
        za.setEdit(jsonentities.getIntValue("logsCount"));

        // Only the first badge and the first location are kept.
        JSONArray badge = jsonentities.getJSONArray("badge");
        if (badge != null && !badge.isEmpty()) {
            za.setAuthdescription(badge.getJSONObject(0).getString("description"));
        }
        JSONArray locations = jsonentities.getJSONArray("locations");
        if (locations != null && !locations.isEmpty()) {
            za.setLocations(locations.getJSONObject(0).getString("name"));
        }
        JSONObject business = jsonentities.getJSONObject("business");
        if (business != null) {
            za.setBusiness(business.getString("name"));
        }
        za.setUrl(url);
        return za;
    } catch (Exception e) {
        // Use the class logger, like the sibling methods, instead of printStackTrace().
        logger.error(" 知乎 用户采集出错 url = {} ", url, e);
    }
    return null;
}
// public static void main(String[] args) {
// String url = "68781862";
// ZhihuQuestionData zqd = ZhihuCrawlerParse.getQuestionData(url, null);
// System.out.println(zqd.toString());
// }
/**
 * Searches Zhihu for a keyword and collects the matching entries (articles and
 * question/answer results).
 *
 * <p>The search page is requested first to read a {@code search_hash_id} out of
 * its body; the {@code search_v3} API is then requested repeatedly, starting at
 * offset 5 and advancing by 10, until a page adds no new entry. Entries without a
 * {@code created_time} are skipped, and an entry that fails to parse is logged and
 * skipped without aborting the page.
 *
 * @param word   search keyword; it is URL-encoded as UTF-8 before use
 * @param proxy  proxy handed to the HTTP client for every request
 * @param num    page count; not used by the current implementation
 * @param cookie value sent as the {@code Cookie} request header
 * @return the entries collected so far; empty (never {@code null}) when the search
 *         fails before any entry is parsed
 */
public List<ZhiHuData> getUrlByKey(String word, Proxy proxy, int num, String cookie) {
    List<ZhiHuData> da = new ArrayList<>();
    Map<String, Object> header = new HashMap<>();
    header.put("Cookie", cookie);
    // NOTE(review): hard-coded authorization value; consider moving it to configuration.
    header.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
    try {
        String encodedWord = URLEncoder.encode(word, "utf-8");

        // Step 1: load the search page and pull the search_hash_id out of its body.
        String searchHashId;
        String searchPage = "https://www.zhihu.com/search?type=content&q=" + encodedWord;
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(searchPage, header), proxy)) {
            searchHashId = response.body().string().split("search_hash_id=")[1].split("&")[0];
        }

        // Step 2: page through the search API until a page contributes nothing new.
        // NOTE(review): the offset advances by 10 while limit is 20 — confirm that
        // overlapping pages are intended.
        int n = 5;
        while (true) {
            String u = "https://www.zhihu.com/api/v4/search_v3?t=general&q=" + encodedWord
                    + "&correction=1&offset=" + n + "&limit=20&search_hash_id=" + searchHashId;
            logger.info(" 采集链接 u = {} ", u);
            JSONArray jsonArry;
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(u, header), proxy)) {
                jsonArry = JSONObject.parseObject(response.body().string()).getJSONArray("data");
            }
            int c = da.size();
            for (int i = 0; i < jsonArry.size(); i++) {
                JSONObject data = jsonArry.getJSONObject(i);
                try {
                    ZhiHuData zhihuData = parseSearchV3Entry(data);
                    if (zhihuData != null) {
                        logger.debug(" 采集数据 {} ", zhihuData);
                        da.add(zhihuData);
                    }
                } catch (Exception e) {
                    // One malformed entry must not abort the rest of the page.
                    logger.error("解析数据出错 {}", e);
                }
            }
            if (c == da.size()) {
                break;
            }
            n += 10;
            // Pause between pages (presumably to limit the request rate).
            Thread.sleep(3000);
        }
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers can observe the interruption.
        Thread.currentThread().interrupt();
        logger.error(" 获取数据出错 {} ", e);
    } catch (Exception e) {
        logger.error(" 获取数据出错 {} ", e);
    }
    return da;
}

/**
 * Maps one element of the {@code search_v3} {@code data} array to a
 * {@link ZhiHuData}.
 *
 * @param data one search result element
 * @return the mapped entry, or {@code null} when it carries no {@code created_time}
 */
private static ZhiHuData parseSearchV3Entry(JSONObject data) {
    JSONObject object = data.getJSONObject("object");
    ZhiHuData zhihuData = new ZhiHuData();
    String type = object.getString("type");
    zhihuData.setType(type);
    zhihuData.setComment_count(object.getInteger("comment_count"));
    Long createdTime = object.getLong("created_time");
    if (createdTime == null) {
        return null;
    }
    // created_time is multiplied by 1000 to obtain the millisecond value Date expects.
    zhihuData.setTime(TimeParse.dateFormartString(new Date(createdTime * 1000L), "yyyy-MM-dd HH:mm:ss"));
    zhihuData.setSource(object.getJSONObject("author").getString("name"));
    if ("article".equals(type)) {
        zhihuData.setTitle(object.getString("title").replaceAll("<.*?>", ""));
        zhihuData.setAttitudes_count(object.getInteger("voteup_count"));
        zhihuData.setContent(object.getString("content").replaceAll("<.*?>", ""));
        // Rewrite the API address into the public article address.
        zhihuData.setUrl(object.getString("url")
                .replace("https://api.zhihu.com/articles/", "https://zhuanlan.zhihu.com/p/"));
    } else {
        JSONObject highlight = data.getJSONObject("highlight");
        zhihuData.setTitle(highlight.getString("title").replaceAll("<.*?>", ""));
        zhihuData.setContent(highlight.getString("description").replaceAll("<.*?>", ""));
        // Prefer the enclosing question's id; fall back to the object's own id.
        JSONObject question = object.getJSONObject("question");
        String questionId = question != null ? question.getString("id") : object.getString("id");
        zhihuData.setUrl("https://www.zhihu.com/question/" + questionId);
    }
    return zhihuData;
}
/**
* @param word
...
...
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
View file @
c4e67a9e
...
...
@@ -387,6 +387,20 @@ public class DataCrawler {
}
}
/**
 * Collects a Zhihu user's profile from the user's page link by delegating to
 * {@link ZhihuCrawlerParse#getZhihuUser(String, Proxy)}.
 *
 * @param url   link of the Zhihu user page
 * @param proxy proxy to use for the request
 * @return whatever {@code ZhihuCrawlerParse.getZhihuUser} returns for the link
 */
public static ZhihuAuthor getZhihuUser(String url, Proxy proxy) {
    // The previous try/catch only rethrew the caught exception, so it is dropped.
    return ZhihuCrawlerParse.getZhihuUser(url, proxy);
}
}
src/main/java/com/zhiwei/media_data_crawler/entity/ZhiHuData.java
View file @
c4e67a9e
...
...
@@ -65,7 +65,7 @@ public class ZhiHuData implements Serializable{
this
.
word
=
word
;
}
private
String
word
;
//采集关键词
private
String
word
;
//采集关键词
public
String
getWord
()
{
return
word
;
...
...
@@ -162,4 +162,5 @@ private String word; //采集关键词
public
void
setFollower_count
(
Integer
follower_count
)
{
this
.
follower_count
=
follower_count
;
}
}
src/main/java/com/zhiwei/media_data_crawler/entity/ZhihuAuthor.java
0 → 100644
View file @
c4e67a9e
package
com
.
zhiwei
.
media_data_crawler
.
entity
;
public
class
ZhihuAuthor
{
private
String
url
;
private
String
name
;
private
String
descript
;
private
String
tags
;
private
int
like
;
//赞同数
private
int
edit
;
//编辑数
private
int
guanzhu
;
//关注数
private
int
fensi
;
//粉丝数
private
int
thank
;
//感谢数
private
int
collection
;
//收藏数
private
boolean
isauthentication
;
//是否认证
private
String
authentication
;
//认证公司
private
String
authdescription
;
//认证描述
private
String
locations
;
//居住信息
private
String
business
;
//所在行业
public
String
getBusiness
()
{
return
business
;
}
public
void
setBusiness
(
String
business
)
{
this
.
business
=
business
;
}
public
String
getLocations
()
{
return
locations
;
}
public
void
setLocations
(
String
locations
)
{
this
.
locations
=
locations
;
}
public
String
getAuthdescription
()
{
return
authdescription
;
}
public
void
setAuthdescription
(
String
authdescription
)
{
this
.
authdescription
=
authdescription
;
}
public
String
getUrl
()
{
return
url
;
}
public
void
setUrl
(
String
url
)
{
this
.
url
=
url
;
}
public
String
getName
()
{
return
name
;
}
public
void
setName
(
String
name
)
{
this
.
name
=
name
;
}
public
String
getDescript
()
{
return
descript
;
}
public
void
setDescript
(
String
descript
)
{
this
.
descript
=
descript
;
}
public
String
getTags
()
{
return
tags
;
}
public
void
setTags
(
String
tags
)
{
this
.
tags
=
tags
;
}
public
int
getLike
()
{
return
like
;
}
public
void
setLike
(
int
like
)
{
this
.
like
=
like
;
}
public
int
getEdit
()
{
return
edit
;
}
public
void
setEdit
(
int
edit
)
{
this
.
edit
=
edit
;
}
public
int
getGuanzhu
()
{
return
guanzhu
;
}
public
void
setGuanzhu
(
int
guanzhu
)
{
this
.
guanzhu
=
guanzhu
;
}
public
int
getFensi
()
{
return
fensi
;
}
public
void
setFensi
(
int
fensi
)
{
this
.
fensi
=
fensi
;
}
public
int
getThank
()
{
return
thank
;
}
public
void
setThank
(
int
thank
)
{
this
.
thank
=
thank
;
}
public
int
getCollection
()
{
return
collection
;
}
public
void
setCollection
(
int
collection
)
{
this
.
collection
=
collection
;
}
public
boolean
isIsauthentication
()
{
return
isauthentication
;
}
public
void
setIsauthentication
(
boolean
isauthentication
)
{
this
.
isauthentication
=
isauthentication
;
}
public
String
getAuthentication
()
{
return
authentication
;
}
public
void
setAuthentication
(
String
authentication
)
{
this
.
authentication
=
authentication
;
}
@Override
public
String
toString
()
{
return
"ZhihuAuthor [url="
+
url
+
", name="
+
name
+
", descript="
+
descript
+
", tags="
+
tags
+
", like="
+
like
+
", edit="
+
edit
+
", guanzhu="
+
guanzhu
+
", fensi="
+
fensi
+
", thank="
+
thank
+
", collection="
+
collection
+
", isauthentication="
+
isauthentication
+
", authentication="
+
authentication
+
", authdescription="
+
authdescription
+
", locations="
+
locations
+
", business="
+
business
+
"]"
;
}
}
src/main/java/com/zhiwei/media_data_crawler/entity/ZhihuQuestionData.java
0 → 100644
View file @
c4e67a9e
package
com
.
zhiwei
.
media_data_crawler
.
entity
;
/**
 * Mutable bean holding the title, creation time and URL of one Zhihu question.
 */
public class ZhihuQuestionData {

    private String title;
    private String time;
    private String url;

    /** Creates an instance with all fields unset ({@code null}). */
    public ZhihuQuestionData() {
    }

    /**
     * Creates an instance with every field set.
     *
     * @param title question title
     * @param time  creation time, stored as given
     * @param url   question URL
     */
    public ZhihuQuestionData(String title, String time, String url) {
        this.title = title;
        this.time = time;
        this.url = url;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Renders the fields as {@code ZhihuQuestionData [title=..., time=..., url=...]}.
     */
    @Override
    public String toString() {
        return String.format("ZhihuQuestionData [title=%s, time=%s, url=%s]", title, time, url);
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment