Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
toutiao
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
toutiao
Commits
9a849364
Commit
9a849364
authored
Mar 19, 2019
by
zhiwei
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'master' of
http://git.zhiweidata.top/zhangzhiwei/toutiao.git
parents
06c9a6ab
53f01f3e
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
92 additions
and
16 deletions
+92
-16
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
+7
-2
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
+71
-0
src/main/java/com/zhiwei/toutiao/parse/TouTiaoCommentParse.java
+14
-14
No files found.
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
View file @
9a849364
...
@@ -103,8 +103,9 @@ public class TouTiaoAccountParse {
...
@@ -103,8 +103,9 @@ public class TouTiaoAccountParse {
boolean
f
=
true
;
boolean
f
=
true
;
int
page
=
0
;
int
page
=
0
;
while
(
f
){
while
(
f
){
String
url
=
"https://www.toutiao.com/
search_content/?offset="
+
page
*
20
+
"&format=json&keyword="
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
)+
"&autoload=true&count=20&cur_tab=4&from=media
"
;
String
url
=
"https://www.toutiao.com/
api/search/content/?aid=24&app_name=web_search&offset="
+
page
*
20
+
"&format=json&keyword="
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
)+
"&autoload=true&count=20&en_qc=1&cur_tab=4&from=media&pd=user
"
;
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
=
Tools
.
getTouTiaoHeader
();
System
.
out
.
println
(
url
);
try
{
try
{
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
){
if
(
htmlBody
!=
null
){
...
@@ -358,19 +359,23 @@ public class TouTiaoAccountParse {
...
@@ -358,19 +359,23 @@ public class TouTiaoAccountParse {
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
user_id
=
data
.
getLong
(
"id"
);
user_id
=
data
.
getLong
(
"id"
);
name
=
data
.
getString
(
"name"
);
name
=
data
.
getString
(
"name"
);
if
(
data
.
containsKey
(
"media_id"
))
{
media_id
=
data
.
getLong
(
"media_id"
);
media_id
=
data
.
getLong
(
"media_id"
);
}
description
=
data
.
getString
(
"description"
);
description
=
data
.
getString
(
"description"
);
user_verified
=
data
.
getInteger
(
"user_verified"
);
user_verified
=
data
.
getInteger
(
"user_verified"
);
verify_content
=
data
.
getString
(
"verify_content"
);
verify_content
=
data
.
getString
(
"verify_content"
);
follow_count
=
data
.
getInteger
(
"follow_count"
);
follow_count
=
data
.
getInteger
(
"follow_count"
);
img_url
=
"https:"
+
data
.
getString
(
"avatar_url"
);
img_url
=
"https:"
+
data
.
getString
(
"avatar_url"
);
create_time
=
new
Date
(
Integer
.
valueOf
(
data
.
getString
(
"create_time"
)+
"000"
));
create_time
=
new
Date
(
Long
.
parseLong
((
data
.
getString
(
"create_time"
)+
"000"
)
));
gender
=
data
.
getString
(
"gender"
);
gender
=
data
.
getString
(
"gender"
);
user_type
=
data
.
getString
(
"user_type"
);
user_type
=
data
.
getString
(
"user_type"
);
tta
=
new
TouTiaoAccount
(
user_id
,
name
,
media_id
,
description
,
user_verified
,
tta
=
new
TouTiaoAccount
(
user_id
,
name
,
media_id
,
description
,
user_verified
,
verify_content
,
follow_count
,
img_url
,
create_time
,
gender
,
user_type
);
verify_content
,
follow_count
,
img_url
,
create_time
,
gender
,
user_type
);
if
(
Objects
.
nonNull
(
proxy
))
{
ZhiWeiTools
.
sleep
(
1000
);
ZhiWeiTools
.
sleep
(
1000
);
}
TouTiaoAccount
ttaUpdate
=
getTouTiaoAccountInfoByUserId
(
user_id
+
""
,
proxy
);
TouTiaoAccount
ttaUpdate
=
getTouTiaoAccountInfoByUserId
(
user_id
+
""
,
proxy
);
if
(
ttaUpdate
!=
null
){
if
(
ttaUpdate
!=
null
){
tta
.
setFriend_count
(
ttaUpdate
.
getFriend_count
());
tta
.
setFriend_count
(
ttaUpdate
.
getFriend_count
());
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
View file @
9a849364
...
@@ -401,6 +401,77 @@ public class TouTiaoArticleParse {
...
@@ -401,6 +401,77 @@ public class TouTiaoArticleParse {
}
}
/**
/**
 * Crawls micro-toutiao (微头条) entries of one user through the client profile-feed API.
 *
 * <p>Pages are requested from {@code https://i.snssdk.com/api/feed/profile/v1/} starting at
 * {@code max_behot_time}. The {@code offset} field of every response becomes the offset of the
 * next request, and paging stops as soon as a response echoes the offset that was just requested.
 *
 * <p>Every returned map carries the keys {@code time}, {@code href}, {@code source},
 * {@code content}, {@code readNum}, {@code commentNum} and {@code user_id}; {@code title} and
 * {@code replayUrl} are added when the entry has an {@code origin_group} object.
 *
 * @param userId         value sent as the {@code visited_uid} query parameter
 * @param proxy          proxy handed unchanged to the HTTP call
 * @param max_behot_time offset used for the first page request
 * @return the entries collected so far; never {@code null}, possibly empty or partial when the
 *         crawl was aborted after repeated failures
 */
public static List<Map<String, Object>> getClientMicroToutiaoCrawler(String userId, ProxyHolder proxy, Long max_behot_time) {
    // Previously a failing request was retried forever with the same URL; give up after a few
    // consecutive failures and return what has been collected.
    final int maxConsecutiveFailures = 3;
    List<Map<String, Object>> dataList = new ArrayList<>();
    // Loop state lives in a local so the caller-visible parameter is not reassigned.
    Long offset = max_behot_time;
    int consecutiveFailures = 0;
    while (consecutiveFailures < maxConsecutiveFailures) {
        String url = "https://i.snssdk.com/api/feed/profile/v1/?visited_uid=" + userId + "&offset=" + offset;
        logger.debug("micro-toutiao client request {}", url);
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            String result = response.body().string();
            JSONObject json = JSONObject.parseObject(result);
            if (json == null) {
                // Empty or "null" body: count it as a failed attempt instead of hitting an NPE.
                consecutiveFailures++;
                logger.info("客户端微头条采集错误 {}", url);
                continue;
            }
            logger.debug("{}", json);
            long nextOffset = json.getLongValue("offset");
            JSONArray jsonArray = json.getJSONArray("data");
            // A response without a "data" array is treated as an empty page so the
            // termination check below still runs.
            int itemCount = jsonArray == null ? 0 : jsonArray.size();
            for (int i = 0; i < itemCount; i++) {
                try {
                    Map<String, Object> item = toClientMicroToutiaoRecord(jsonArray.getJSONObject(i));
                    if (item != null) {
                        dataList.add(item);
                    }
                } catch (Exception e) {
                    // One malformed entry must not abort the rest of the page.
                    logger.info("failed to parse micro-toutiao entry at index {} of {}", i, url, e);
                }
            }
            consecutiveFailures = 0;
            logger.info(" 采集到 条 == {} -- {} -- {}", dataList.size(), offset, nextOffset);
            if (offset != null && offset.longValue() == nextOffset) {
                // The server echoed the requested offset: there are no further pages.
                break;
            }
            offset = nextOffset;
        } catch (Exception e) {
            consecutiveFailures++;
            // The trailing throwable is logged with its stack trace; the placeholder gets the URL.
            logger.info("客户端微头条采集错误 {}", url, e);
        }
    }
    return dataList;
}

/**
 * Converts one element of the feed's {@code data} array into a result map.
 *
 * @param data one feed element; its {@code content.raw_data} object is read
 * @return the populated map, or {@code null} when the element has no {@code comment_base} object
 */
private static Map<String, Object> toClientMicroToutiaoRecord(JSONObject data) {
    JSONObject dataJSON = data.getJSONObject("content").getJSONObject("raw_data");
    logger.debug("{}", dataJSON);
    JSONObject commentBase = dataJSON.getJSONObject("comment_base");
    if (commentBase == null) {
        return null;
    }
    JSONObject userInfo = commentBase.getJSONObject("user").getJSONObject("info");
    JSONObject action = commentBase.getJSONObject("action");

    Map<String, Object> map = new HashMap<>();
    JSONObject originGroup = dataJSON.getJSONObject("origin_group");
    if (originGroup != null) {
        map.put("title", originGroup.getString("title"));
        map.put("replayUrl", originGroup.getString("article_url"));
    }
    // create_time is multiplied by 1000 before being handed to Date, which takes milliseconds.
    map.put("time", new Date(commentBase.getLongValue("create_time") * 1000));
    map.put("href", "http://weitoutiao.zjurl.cn/ugc/share/wap/comment/" + dataJSON.getLongValue("id"));
    map.put("source", userInfo.getString("name"));
    map.put("content", commentBase.getString("content"));
    map.put("readNum", action.getInteger("read_count") + "");
    map.put("commentNum", action.getInteger("comment_count") + "");
    map.put("user_id", userInfo.getString("user_id"));
    return map;
}
/**
* @Title: parseHtmlByMicroAccount
* @Title: parseHtmlByMicroAccount
* @author hero
* @author hero
* @Description: 解析微头条数据
* @Description: 解析微头条数据
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoCommentParse.java
View file @
9a849364
...
@@ -6,6 +6,7 @@ import java.util.ArrayList;
...
@@ -6,6 +6,7 @@ import java.util.ArrayList;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
...
@@ -72,11 +73,16 @@ public class TouTiaoCommentParse {
...
@@ -72,11 +73,16 @@ public class TouTiaoCommentParse {
{
{
List
<
TouTiaoComment
>
commentes
=
analySisComment
(
htmlBody
,
url
);
List
<
TouTiaoComment
>
commentes
=
analySisComment
(
htmlBody
,
url
);
ttList
.
addAll
(
commentes
);
ttList
.
addAll
(
commentes
);
logger
.
info
(
" url {} 采集到第 {} 页 采集到 {} 条数据 "
,
url
,
page
,
ttList
.
size
());
}
else
}
else
{
{
logger
.
info
(
"采集出现问题,地址为:{}"
,
url
);
logger
.
info
(
"采集出现问题,地址为:{}"
,
url
);
}
}
if
(
Objects
.
nonNull
(
proxy
))
{
ZhiWeiTools
.
sleep
(
100
);
}
else
{
ZhiWeiTools
.
sleep
(
4000
);
ZhiWeiTools
.
sleep
(
4000
);
}
break
;
break
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
continue
;
continue
;
...
@@ -97,7 +103,7 @@ public class TouTiaoCommentParse {
...
@@ -97,7 +103,7 @@ public class TouTiaoCommentParse {
*/
*/
private
static
List
<
TouTiaoComment
>
analySisComment
(
String
htmlBody
,
String
url
)
private
static
List
<
TouTiaoComment
>
analySisComment
(
String
htmlBody
,
String
url
)
{
{
List
<
TouTiaoComment
>
list
=
new
ArrayList
<
TouTiaoComment
>();
List
<
TouTiaoComment
>
list
=
new
ArrayList
<>();
try
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
commentes
=
json
.
getJSONArray
(
"data"
);
JSONArray
commentes
=
json
.
getJSONArray
(
"data"
);
...
@@ -118,9 +124,7 @@ public class TouTiaoCommentParse {
...
@@ -118,9 +124,7 @@ public class TouTiaoCommentParse {
list
.
add
(
ttComment
);
list
.
add
(
ttComment
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
logger
.
debug
(
"解析今日头条评论列表出现为题,{}"
,
e
);
logger
.
debug
(
"解析今日头条评论列表出现为题,{}"
,
e
.
getMessage
());
return
null
;
}
}
return
list
;
return
list
;
}
}
...
@@ -148,12 +152,10 @@ public class TouTiaoCommentParse {
...
@@ -148,12 +152,10 @@ public class TouTiaoCommentParse {
return
(
int
)
Math
.
ceil
((
double
)
count
/
20.0
);
return
(
int
)
Math
.
ceil
((
double
)
count
/
20.0
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
logger
.
info
(
"获取评论总页数时出现问题:{}"
,
e
);
logger
.
info
(
"获取评论总页数时出现问题:{}"
,
e
.
getMessage
());
return
0
;
}
}
}
}
return
0
;
return
-
1
;
}
}
...
@@ -218,7 +220,7 @@ public class TouTiaoCommentParse {
...
@@ -218,7 +220,7 @@ public class TouTiaoCommentParse {
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析头条评论数错误:::{}"
,
e
.
fillInStackTrace
());
logger
.
error
(
"解析头条评论数错误:::{}"
,
e
.
fillInStackTrace
());
}
}
return
0
;
return
-
1
;
}
}
/**
/**
...
@@ -243,19 +245,17 @@ public class TouTiaoCommentParse {
...
@@ -243,19 +245,17 @@ public class TouTiaoCommentParse {
try
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
JSONObject
data
=
json
.
getJSONObject
(
"data"
);
JSONObject
data
=
json
.
getJSONObject
(
"data"
);
int
count
=
data
.
getIntValue
(
"total"
);
return
data
.
getIntValue
(
"total"
);
return
count
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
logger
.
info
(
"获取评论总页数时出现问题:{}"
,
e
);
logger
.
info
(
"获取评论总页数时出现问题:{}"
,
e
.
getMessage
());
}
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
continue
;
continue
;
}
}
}
}
return
0
;
return
-
1
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment