Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
toutiao
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
toutiao
Commits
9d384b56
Commit
9d384b56
authored
Oct 28, 2019
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加更新今日头条阅读数功能
parent
34d3c078
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
1085 additions
and
997 deletions
+1085
-997
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
+736
-652
src/main/java/com/zhiwei/toutiao/parse/TouTiaoCommentParse.java
+349
-345
No files found.
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
View file @
9d384b56
/**
* @Title: TouTiaoParse.java
* @Package com.zhiwei.toutiao.parse
* @Description:
* @author hero
* @date 2016年9月2日 上午11:17:44
* @version V1.0
*/
/**
*
*/
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
javax.script.ScriptEngine
;
import
javax.script.ScriptEngineManager
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.toutiao.bean.Signature
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.util.Tools
;
import
okhttp3.Response
;
/**
* @Description:头条帐号采集
* @author hero
* @date 2016年9月2日 上午11:17:44
*/
public
class
TouTiaoArticleParse
{
/** JavaScript engine looked up by the name "javascript"; the lookup yields null when no such engine is installed. */
private static final ScriptEngine scriptEngine = new ScriptEngineManager().getEngineByName("javascript");

/** Logger for this class. */
private static final Logger logger = LogManager.getLogger(TouTiaoArticleParse.class);

/** Shared HTTP client, built with {@code retryTimes(3)}. */
private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
 * Fetches one page of a media account's articles from the {@code /pgc/ma/} endpoint.
 *
 * @param mediaId      value sent as the {@code media_id} query parameter
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}; left out of the URL when null
 * @param endData      cut-off date handed to the parser; may be null
 * @param proxy        proxy handed to {@code downloadHtml}
 * @return the map built by {@code parseHtmlByAccount} (keys {@code max_behot_time} and {@code data}),
 *         or an empty map when the body is missing, lacks "behot_time", or could not be parsed
 * @throws Exception any failure while downloading or parsing, rethrown after being logged
 */
public static Map<String, Object> getTouTiaoList(String mediaId, String maxBehotTime, Date endData, Proxy proxy)
        throws Exception {
    Signature signature = new Signature();
    String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + mediaId + "&count=20&as="
            + signature.getAs() + "&cp=" + signature.getCp();
    if (maxBehotTime != null) {
        url = url + "&max_behot_time=" + maxBehotTime;
    }
    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", url);
    try {
        String htmlBody = downloadHtml(url, proxy, headerMap);
        if (htmlBody != null && htmlBody.contains("behot_time")) {
            Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
            if (ttList != null && !ttList.isEmpty()) {
                return ttList;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        // Log the exception as-is: fillInStackTrace() would replace the original
        // trace with this frame, for the log and for the caller alike.
        logger.error("获取今日头条帐号数据连接超时", e);
        throw e;
    }
    return Collections.emptyMap();
}
/**
 * Fetches one page of a media account's articles from the {@code /pgc/ma/} endpoint,
 * sending the request through the given {@link ProxyHolder}.
 *
 * @param mediaId      value sent as the {@code media_id} query parameter
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}; left out of the URL when null
 * @param endData      cut-off date handed to the parser; may be null
 * @param proxy        proxy holder passed to the HTTP client
 * @return the map built by {@code parseHtmlByAccount} (keys {@code max_behot_time} and {@code data}),
 *         or an empty map when the body lacks "behot_time" or could not be parsed
 * @throws Exception any failure while requesting or parsing, rethrown after being logged
 */
public static Map<String, Object> getTouTiaoList(String mediaId, String maxBehotTime, Date endData,
        ProxyHolder proxy) throws Exception {
    Signature signature = new Signature();
    String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + mediaId + "&count=20&as="
            + signature.getAs() + "&cp=" + signature.getCp();
    if (maxBehotTime != null) {
        url = url + "&max_behot_time=" + maxBehotTime;
    }
    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", url);
    // try-with-resources guarantees the response is released on every path.
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)) {
        String htmlBody = response.body().string();
        if (htmlBody != null && htmlBody.contains("behot_time")) {
            Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
            if (ttList != null && !ttList.isEmpty()) {
                return ttList;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        // Log the exception as-is: fillInStackTrace() would discard the original trace.
        logger.error("获取今日头条帐号数据连接超时", e);
        throw e;
    }
    return Collections.emptyMap();
}
/**
 * Fetches one page of a user's article history from the {@code /c/user/article/} endpoint.
 * The request is attempted up to three times, with a new {@code Signature} per attempt.
 *
 * @param userId       value sent as the {@code user_id} query parameter
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}
 * @param endData      cut-off date handed to the parser; may be null
 * @param proxy        proxy handed to {@code downloadHtml}
 * @return the map built by {@code parseHtmlByAccount}, or an empty map when no attempt
 *         produced a parsable body containing "behot_time"
 * @throws Exception any failure while downloading or parsing, rethrown after being logged
 */
public static Map<String, Object> getTouTiaoHistory(String userId, String maxBehotTime, Date endData, Proxy proxy)
        throws Exception {
    for (int attempt = 0; attempt < 3; attempt++) {
        Signature signature = new Signature(userId, maxBehotTime);
        String as = signature.getAs();
        String cp = signature.getCp();
        String signatureStr = signature.getSignature();
        String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + userId
                + "&max_behot_time=" + maxBehotTime + "&count=20&as=" + as + "&cp=" + cp
                + "&_signature=" + signatureStr;
        Map<String, String> headerMap = new HashMap<>();
        headerMap.put("user-agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
        headerMap.put("referer", "https://www.toutiao.com/c/user/" + userId + "/");
        try {
            String htmlBody = downloadHtml(url, proxy, headerMap);
            if (htmlBody != null && htmlBody.contains("behot_time")) {
                Map<String, Object> ttList = parseHtmlByAccount(userId, htmlBody, endData);
                if (ttList != null && !ttList.isEmpty()) {
                    return ttList;
                }
                // Unparsable body: fall through and try again.
            } else {
                logger.info("数据为null");
            }
        } catch (Exception e) {
            // Log the exception as-is: fillInStackTrace() would discard the original trace.
            logger.error("获取今日头条帐号数据连接超时", e);
            throw e;
        }
    }
    return Collections.emptyMap();
}
/**
 * Fetches one page of a user's article history from the {@code /c/user/article/} endpoint,
 * sending the request through the given {@link ProxyHolder}.
 *
 * <p>Up to three attempts are made. A body without "behot_time" triggers the next attempt;
 * a body that contains it but yields no parsed result ends the loop immediately.
 *
 * @param userId       value sent as the {@code user_id} query parameter
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}
 * @param endData      cut-off date handed to the parser; may be null
 * @param proxy        proxy holder passed to the HTTP client
 * @return the map built by {@code parseHtmlByAccount}, or an empty map otherwise
 * @throws Exception any failure while requesting or parsing, rethrown after being logged
 */
public static Map<String, Object> getTouTiaoHistory(String userId, String maxBehotTime, Date endData,
        ProxyHolder proxy) throws Exception {
    int attempt = 0;
    while (attempt < 3) {
        attempt++;
        Signature sig = new Signature(userId, maxBehotTime);
        String asValue = sig.getAs();
        String cpValue = sig.getCp();
        String signed = sig.getSignature();
        String requestUrl = new StringBuilder("https://www.toutiao.com/c/user/article/?page_type=1&user_id=")
                .append(userId)
                .append("&max_behot_time=").append(maxBehotTime)
                .append("&count=20&as=").append(asValue)
                .append("&cp=").append(cpValue)
                .append("&_signature=").append(signed)
                .toString();
        logger.info("当前采集的历史文章链接:::{}", requestUrl);

        Map<String, String> headers = new HashMap<>();
        headers.put("user-agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
        headers.put("referer", "https://www.toutiao.com/c/user/" + userId + "/");

        try {
            String body = httpBoot.syncCall(RequestUtils.wrapGet(requestUrl, headers), proxy).body().string();
            if (body == null || !body.contains("behot_time")) {
                logger.info("数据为null,获取到的文本为:::{}", body);
                continue;
            }
            Map<String, Object> parsed = parseHtmlByAccount(userId, body, endData);
            if (parsed == null || parsed.size() <= 0) {
                // The body looked valid but produced nothing; do not retry.
                break;
            }
            return parsed;
        } catch (Exception e) {
            logger.error("获取今日头条帐号数据连接超时", e);
            throw e;
        }
    }
    return Collections.emptyMap();
}
/**
 * Parses the JSON body of an account article listing.
 *
 * <p>Every element of the {@code data} array that has a {@code group_id} becomes a
 * {@code TouTiaoArticle}; an element that fails to parse is logged and skipped. The author id
 * is read from each element's {@code creator_uid}.
 *
 * @param htmlBody JSON text of the response
 * @param endDate  cut-off date; when non-null and later than the next-page cursor
 *                 (seconds converted to milliseconds), the cursor is cleared to signal "stop paging"
 * @return a map with {@code max_behot_time} (Long, possibly null) and {@code data}
 *         (List of TouTiaoArticle), or null when the body as a whole cannot be parsed
 */
private static Map<String, Object> parseHtmlByAccount(String htmlBody, Date endDate) {
    Map<String, Object> map = new HashMap<>();
    Long maxBehotTime = null;
    List<TouTiaoArticle> dataList = new ArrayList<>();
    try {
        JSONObject json = JSONObject.parseObject(htmlBody);
        JSONArray jsonArray = json.getJSONArray("data");
        maxBehotTime = Long.valueOf(json.getJSONObject("next").getString("max_behot_time"));
        for (int i = 0; i < jsonArray.size(); i++) {
            try {
                JSONObject data = jsonArray.getJSONObject(i);
                if (!data.containsKey("group_id")) {
                    continue;
                }
                String href = "https://www.toutiao.com/" + "a" + data.getLongValue("group_id");
                String title = data.getString("title");
                String content = data.getString("abstract");
                // behot_time is in seconds; TimeParse is given the millisecond value as text.
                String time = data.getLongValue("behot_time") * 1000 + "";
                Date date = TimeParse.stringFormartDate(time);
                String readNum = data.getString("go_detail_count");
                String commentNum = data.getString("comments_count");
                String playNum = data.getString("detail_play_effective_count");
                String shareNum = data.getString("share_count");
                String source = data.getString("source");
                String userId = data.getLong("creator_uid") + "";
                String articleType = data.getString("chinese_tag");
                String likeNum = null; // this listing is not read for a like count
                TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
                        playNum, readNum, shareNum, "今日头条", articleType, likeNum);
                if (data.containsKey("label")) {
                    List<String> labelList = data.getJSONArray("label").toJavaList(String.class);
                    tt.setLabelList(labelList);
                }
                dataList.add(tt);
            } catch (Exception e) {
                // Pass the exception as the trailing argument so the stack trace is logged too.
                logger.error("数据解析出现问题,{}", e.getMessage(), e);
            }
        }
    } catch (Exception e) {
        logger.error("数据解析出现问题,{}", e.getMessage(), e);
        return null;
    }
    // NOTE(review): the original also tested !"0".equals(maxBehotTime), which compares a String
    // with a Long and is therefore always true. The dead test is removed and the effective
    // behaviour kept: a cursor of 0 is still cleared whenever endDate is set. Confirm whether
    // a 0 cursor was meant to be passed through instead.
    if (endDate != null && maxBehotTime != null) {
        Date nextDate = new Date(maxBehotTime * 1000L);
        if (endDate.after(nextDate)) {
            maxBehotTime = null;
        }
    }
    map.put("max_behot_time", maxBehotTime);
    map.put("data", dataList);
    return map;
}
/**
 * Parses the JSON body of a user's article-history listing.
 *
 * <p>Every element of the {@code data} array that has a {@code group_id} becomes a
 * {@code TouTiaoArticle} attributed to {@code userId}; an element that fails to parse is
 * logged and skipped.
 *
 * @param userId   author id stored on every article
 * @param htmlBody JSON text of the response
 * @param endDate  cut-off date; when non-null and later than the next-page cursor
 *                 (seconds converted to milliseconds), the cursor is cleared to signal "stop paging"
 * @return a map with {@code max_behot_time} (Long, possibly null) and {@code data}
 *         (List of TouTiaoArticle), or null when the body as a whole cannot be parsed
 */
private static Map<String, Object> parseHtmlByAccount(String userId, String htmlBody, Date endDate) {
    Map<String, Object> map = new HashMap<>();
    Long maxBehotTime = null;
    List<TouTiaoArticle> dataList = new ArrayList<>();
    try {
        JSONObject json = JSONObject.parseObject(htmlBody);
        JSONArray jsonArray = json.getJSONArray("data");
        maxBehotTime = Long.valueOf(json.getJSONObject("next").getString("max_behot_time"));
        for (int i = 0; i < jsonArray.size(); i++) {
            try {
                JSONObject data = jsonArray.getJSONObject(i);
                if (!data.containsKey("group_id")) {
                    continue;
                }
                String href = "https://www.toutiao.com/" + "a" + data.getLongValue("group_id");
                String title = data.getString("title");
                String content = data.getString("abstract");
                // behot_time is in seconds; TimeParse is given the millisecond value as text.
                String time = data.getLongValue("behot_time") * 1000 + "";
                Date date = TimeParse.stringFormartDate(time);
                String readNum = data.getString("go_detail_count");
                String commentNum = data.getString("comments_count");
                String playNum = data.getString("detail_play_effective_count");
                String shareNum = data.getString("share_count");
                String source = data.getString("source");
                String articleType = data.getString("chinese_tag");
                String likeNum = null; // this listing is not read for a like count
                TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
                        playNum, readNum, shareNum, "今日头条", articleType, likeNum);
                if (data.containsKey("label")) {
                    List<String> labelList = data.getJSONArray("label").toJavaList(String.class);
                    tt.setLabelList(labelList);
                }
                dataList.add(tt);
            } catch (Exception e) {
                // Pass the exception as the trailing argument so the stack trace is logged too.
                logger.error("数据解析出现问题,{}", e.getMessage(), e);
            }
        }
    } catch (Exception e) {
        logger.error("数据解析出现问题,{}", e.getMessage(), e);
        return null;
    }
    // NOTE(review): the original also tested !"0".equals(maxBehotTime), which compares a String
    // with a Long and is therefore always true. The dead test is removed and the effective
    // behaviour kept: a cursor of 0 is still cleared whenever endDate is set. Confirm whether
    // a 0 cursor was meant to be passed through instead.
    if (endDate != null && maxBehotTime != null) {
        Date nextDate = new Date(maxBehotTime * 1000L);
        if (endDate.after(nextDate)) {
            maxBehotTime = null;
        }
    }
    map.put("max_behot_time", maxBehotTime);
    map.put("data", dataList);
    return map;
}
/**
 * Fetches one page of a user's micro-headline feed from the {@code /api/pc/feed/} endpoint.
 *
 * @param userId       value sent as {@code visit_user_id} and used in the Referer header
 * @param endDate      cut-off date handed to the parser
 * @param proxy        proxy handed to {@code downloadHtml}
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}; left out of the URL when null
 * @return the map built by {@code parseHtmlByMicroAccount}, or null when nothing was
 *         downloaded, the parser returned nothing, or an exception occurred
 * @throws IOException declared for callers; exceptions raised while downloading or parsing
 *         are caught and logged here
 */
public static Map<String, Object> getMicroTouTiaoCrawler(String userId, Date endDate, Proxy proxy,
        String maxBehotTime) throws IOException {
    String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id="
            + userId;
    if (maxBehotTime != null) {
        url = url + "&max_behot_time=" + maxBehotTime;
    }
    // Goes through the logger rather than System.out, like the ProxyHolder overload.
    logger.info("微头条采集链接:::{}", url);
    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/");
    try {
        String htmlBody = downloadHtml(url, proxy, headerMap);
        if (htmlBody != null) {
            Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate);
            if (dataMap != null && !dataMap.isEmpty()) {
                return dataMap;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        // The placeholder takes the text; the trailing exception makes the stack trace appear.
        logger.info("获取数据出错::{},数据为null", e.toString(), e);
        return null;
    }
    return null;
}
/**
 * Fetches one page of a user's micro-headline feed from the {@code /api/pc/feed/} endpoint,
 * sending the request through the given {@link ProxyHolder}.
 *
 * @param userId       value sent as {@code visit_user_id} and used in the Referer header
 * @param endDate      cut-off date handed to the parser
 * @param proxy        proxy holder passed to the HTTP client
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}; left out of the URL when null
 * @return the map built by {@code parseHtmlByMicroAccount}, or null when the body lacks
 *         "create_time", the parser returned nothing, or an exception occurred
 * @throws IOException declared for callers; exceptions raised while requesting or parsing
 *         are caught and logged here
 */
public static Map<String, Object> getMicroTouTiaoCrawler(String userId, Date endDate, ProxyHolder proxy,
        Long maxBehotTime) throws IOException {
    String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id="
            + userId;
    if (maxBehotTime != null) {
        url = url + "&max_behot_time=" + maxBehotTime;
    }
    logger.info("微头条采集链接:::{}", url);
    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/");
    // try-with-resources guarantees the response is released on every path.
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)) {
        String htmlBody = response.body().string();
        if (htmlBody != null && htmlBody.contains("create_time")) {
            Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate);
            if (dataMap != null && !dataMap.isEmpty()) {
                return dataMap;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        // The placeholder takes the text; the trailing exception makes the stack trace appear.
        logger.info("获取数据出错::{},数据为null", e.toString(), e);
        return null;
    }
    return null;
}
/**
 * Pages through the client-side profile feed ({@code /api/feed/profile/v1/}) and collects
 * every entry whose {@code raw_data} carries a non-null {@code comment_base}.
 *
 * <p>Paging stops when the {@code offset} returned by the server equals the offset that was
 * just requested, or after three consecutive failed rounds (the original loop retried forever).
 *
 * @param userId       value sent as {@code visited_uid} on every request
 * @param proxy        proxy holder passed to the HTTP client
 * @param maxBehotTime starting offset sent as {@code offset}
 * @return one map per collected entry with keys {@code time}, {@code href}, {@code source},
 *         {@code content}, {@code readNum}, {@code commentNum}, {@code user_id}, plus
 *         {@code title} and {@code replayUrl} when the entry has an {@code origin_group}
 */
public static List<Map<String, Object>> getClientMicroToutiaoCrawler(String userId, ProxyHolder proxy,
        Long maxBehotTime) {
    final int maxConsecutiveFailures = 3;
    List<Map<String, Object>> dataList = new ArrayList<>();
    int consecutiveFailures = 0;
    while (true) {
        String url = "https://i.snssdk.com/api/feed/profile/v1/?visited_uid=" + userId + "&offset="
                + maxBehotTime;
        String requestedOffset = String.valueOf(maxBehotTime);
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            String result = response.body().string();
            JSONObject json = JSONObject.parseObject(result);
            maxBehotTime = json.getLongValue("offset");
            JSONArray jsonArray = json.getJSONArray("data");
            if (jsonArray == null) {
                throw new IllegalStateException("response has no data array");
            }
            for (int i = 0; i < jsonArray.size(); i++) {
                JSONObject data = jsonArray.getJSONObject(i);
                try {
                    JSONObject dataJSON = data.getJSONObject("content").getJSONObject("raw_data");
                    if (!dataJSON.containsKey("comment_base") || dataJSON.getJSONObject("comment_base") == null) {
                        continue;
                    }
                    JSONObject commentBase = dataJSON.getJSONObject("comment_base");
                    JSONObject userInfo = commentBase.getJSONObject("user").getJSONObject("info");
                    JSONObject action = commentBase.getJSONObject("action");
                    Map<String, Object> map = new HashMap<>();
                    Date date = new Date(commentBase.getLongValue("create_time") * 1000);
                    String href = "http://weitoutiao.zjurl.cn/ugc/share/wap/comment/" + dataJSON.getLongValue("id");
                    String source = userInfo.getString("name");
                    String content = commentBase.getString("content");
                    String readNum = action.getInteger("read_count") + "";
                    String commentNum = action.getInteger("comment_count") + "";
                    // Kept in a local: overwriting the userId parameter would change the
                    // visited_uid of the next page request.
                    String entryUserId = userInfo.getString("user_id");
                    if (dataJSON.containsKey("origin_group")) {
                        JSONObject originGroup = dataJSON.getJSONObject("origin_group");
                        map.put("title", originGroup.getString("title"));
                        map.put("replayUrl", originGroup.getString("article_url"));
                    }
                    map.put("time", date);
                    map.put("href", href);
                    map.put("source", source);
                    map.put("content", content);
                    map.put("readNum", readNum);
                    map.put("commentNum", commentNum);
                    map.put("user_id", entryUserId);
                    dataList.add(map);
                } catch (Exception e) {
                    // One malformed entry must not abort the page.
                    logger.error("客户端微头条数据解析错误", e);
                }
            }
            logger.info(" 采集到 条 == {} -- {} -- {}", dataList.size(), requestedOffset, maxBehotTime);
            consecutiveFailures = 0;
            if (requestedOffset.equals(String.valueOf(maxBehotTime))) {
                break;
            }
        } catch (Exception e) {
            logger.info("客户端微头条采集错误 {}", e.toString(), e);
            consecutiveFailures++;
            if (consecutiveFailures >= maxConsecutiveFailures) {
                // Give up and return what was collected instead of looping forever.
                break;
            }
        }
    }
    return dataList;
}
/**
 * Parses the JSON body of a micro-headline feed page.
 *
 * <p>Each element of {@code data} carries its payload as JSON text, either in
 * {@code stream_cell.raw_data} or in {@code concern_talk_cell.packed_json_str}. Payloads with a
 * non-null {@code comment_base} are read partly from that object; all others are read from the
 * payload's top level. The title is the first 16 characters of the content, or the whole
 * content when it is shorter.
 *
 * @param htmlBody JSON text of the response
 * @param endDate  cut-off date; when {@code has_more} is true, the cursor is non-zero and this
 *                 date is later than the date of the last entry read, the cursor is cleared
 * @return a map with {@code max_behot_time} (Long, null when paging should stop) and
 *         {@code data} (List of TouTiaoArticle); never null
 */
private static Map<String, Object> parseHtmlByMicroAccount(String htmlBody, Date endDate) {
    final int maxTitleLength = 16;
    Map<String, Object> map = new HashMap<>();
    Long maxBehotTime = null;
    List<TouTiaoArticle> dataList = new ArrayList<>();
    try {
        JSONObject json = JSONObject.parseObject(htmlBody);
        boolean more = json.containsKey("has_more") && json.getBooleanValue("has_more");
        if (json.containsKey("next")) {
            maxBehotTime = json.getJSONObject("next").getLongValue("max_behot_time");
        }
        Date date = null; // date of the last entry read, compared with endDate below
        if (json.containsKey("data")) {
            JSONArray jsonArray = json.getJSONArray("data");
            for (int i = 0; i < jsonArray.size(); i++) {
                try {
                    JSONObject data = jsonArray.getJSONObject(i);
                    String text = null;
                    if (data.containsKey("stream_cell") && data.getJSONObject("stream_cell") != null) {
                        text = data.getJSONObject("stream_cell").getString("raw_data");
                    } else if (data.containsKey("concern_talk_cell")) {
                        text = data.getJSONObject("concern_talk_cell").getString("packed_json_str");
                    }
                    JSONObject dataJSON = JSONObject.parseObject(text);
                    if (dataJSON == null) {
                        // Neither cell type present, or an empty payload: nothing to read.
                        continue;
                    }
                    // All fields are per-entry locals so that nothing leaks from the previous entry.
                    String href;
                    String source;
                    String content;
                    String readNum;
                    String commentNum;
                    String likeNum;
                    String userId;
                    if (dataJSON.containsKey("comment_base") && dataJSON.getJSONObject("comment_base") != null) {
                        JSONObject commentBase = dataJSON.getJSONObject("comment_base");
                        JSONObject userInfo = commentBase.getJSONObject("user").getJSONObject("info");
                        JSONObject action = dataJSON.getJSONObject("action");
                        date = new Date(commentBase.getLongValue("create_time") * 1000);
                        href = "https://www.toutiao.com/a" + dataJSON.getLongValue("id");
                        source = userInfo.getString("name");
                        content = dataJSON.getString("content");
                        readNum = action.getInteger("read_count") + "";
                        likeNum = action.getInteger("digg_count") + "";
                        commentNum = action.getInteger("comment_count") + "";
                        userId = userInfo.getString("user_id");
                    } else {
                        JSONObject user = dataJSON.getJSONObject("user");
                        date = new Date(dataJSON.getLongValue("create_time") * 1000);
                        href = "https://www.toutiao.com/a" + dataJSON.getString("thread_id");
                        source = user.getString("name");
                        content = dataJSON.getString("content");
                        readNum = dataJSON.getInteger("read_count") + "";
                        commentNum = dataJSON.getInteger("comment_count") + "";
                        likeNum = dataJSON.getInteger("digg_count") + "";
                        userId = user.getString("user_id");
                    }
                    String title = null;
                    if (content != null && !"".equals(content)) {
                        // Computed per entry: a shared counter would stay shortened after
                        // the first short entry.
                        title = content.substring(0, Math.min(content.length(), maxTitleLength));
                    }
                    String playNum = null;     // not provided by this feed
                    String articleType = null; // not provided by this feed
                    TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
                            playNum, readNum, "0", "微头条", articleType, likeNum);
                    dataList.add(tt);
                } catch (Exception e) {
                    // A malformed entry is skipped, but no longer silently.
                    logger.error("微头条单条数据解析出现问题", e);
                }
            }
        } else {
            logger.info("微头条返回数据中没有data字段:{}", json);
        }
        // Decide whether a next page should be requested.
        if (more) {
            if (maxBehotTime != null && maxBehotTime != 0L
                    && endDate != null && date != null && endDate.after(date)) {
                maxBehotTime = null;
            }
        } else {
            maxBehotTime = null;
        }
    } catch (Exception e) {
        logger.error("微头条数据解析出现问题", e);
    }
    map.put("max_behot_time", maxBehotTime);
    map.put("data", dataList);
    return map;
}
/**
 * Downloads an article page and returns its full text.
 *
 * <p>The page's {@code <script>var BASE_DATA ...</script>} block is extracted with a regular
 * expression, evaluated in the script engine to obtain {@code articleInfo.content}, and the
 * resulting HTML is reduced to plain text with Jsoup.
 *
 * <p>NOTE(review): the evaluated script comes from the downloaded page, i.e. remote content
 * is executed in-process. Confirm this is acceptable or replace it with a pure parser.
 *
 * @param url   article URL
 * @param proxy proxy handed to {@code downloadHtml}
 * @return the article text, or null when the page is blank, has no "articleInfo", the
 *         script block is not found, or any error occurs
 */
public static String getContent(String url, Proxy proxy) {
    try {
        String page = downloadHtml(url, proxy, null);
        if (!StringUtils.isNotBlank(page) || !page.contains("articleInfo")) {
            return null;
        }
        // Locate the script block that defines BASE_DATA.
        Pattern scriptPattern = Pattern.compile("<script>var BASE_DATA[\\s\\S]+?</script>");
        Matcher scriptMatcher = scriptPattern.matcher(page);
        if (!scriptMatcher.find()) {
            return null;
        }
        // Strip the wrapper so only the object literal remains.
        String objectLiteral = scriptMatcher.group().replaceAll("<script>var BASE_DATA = |;</script>", "");
        // Let the script engine evaluate the literal and pick out the article HTML.
        Object evaluated = scriptEngine.eval("eval((" + objectLiteral + ")).articleInfo.content.toString();");
        String articleHtml = evaluated.toString();
        // Reduce the HTML to its text.
        return Jsoup.parse(articleHtml).text();
    } catch (Exception e) {
        logger.error("跟据链接采集全文出现错误", e);
        return null;
    }
}
/**
 * Downloads a URL and returns the response body as text, making up to three attempts.
 *
 * @param url     address to request
 * @param proxy   proxy to use; when null the request goes through {@code ProxyHolder.NAT_HEAVY_PROXY}
 * @param headMap request headers passed to {@code RequestUtils.wrapGet}
 * @return the body text of the first successful attempt, or null when all three attempts fail
 */
private static String downloadHtml(String url, Proxy proxy, Map<String, String> headMap) {
    for (int attempt = 1; attempt <= 3; attempt++) {
        // try-with-resources releases the response even if reading the body fails.
        try (Response response = proxy != null
                ? httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), proxy)
                : httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_HEAVY_PROXY)) {
            return response.body().string();
        } catch (Exception e) {
            // The placeholder takes the text; the trailing exception keeps the original stack
            // trace (fillInStackTrace() would have replaced it with this frame).
            logger.error("获取数据时出现问题,问题为:{}", e.toString(), e);
        }
    }
    return null;
}
}
/**
* @Title: TouTiaoParse.java
* @Package com.zhiwei.toutiao.parse
* @Description:
* @author hero
* @date 2016年9月2日 上午11:17:44
* @version V1.0
*/
/**
*
*/
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.*
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
javax.script.ScriptEngine
;
import
javax.script.ScriptEngineManager
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.toutiao.bean.Signature
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.util.Tools
;
import
okhttp3.Response
;
/**
* @Description:头条帐号采集
* @author hero
* @date 2016年9月2日 上午11:17:44
*/
public
class
TouTiaoArticleParse
{
/** JavaScript engine looked up by the name "javascript"; the lookup yields null when no such engine is installed. */
private static final ScriptEngine scriptEngine = new ScriptEngineManager().getEngineByName("javascript");

/** Logger for this class. */
private static final Logger logger = LogManager.getLogger(TouTiaoArticleParse.class);

/** Shared HTTP client, built with {@code retryTimes(3)}. */
private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
 * Fetches one page of a media account's articles from the {@code /pgc/ma/} endpoint.
 *
 * @param mediaId      value sent as the {@code media_id} query parameter
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}; left out of the URL when null
 * @param endData      cut-off date handed to the parser; may be null
 * @param proxy        proxy handed to {@code downloadHtml}
 * @return the map built by {@code parseHtmlByAccount} (keys {@code max_behot_time} and {@code data}),
 *         or an empty map when the body is missing, lacks "behot_time", or could not be parsed
 * @throws Exception any failure while downloading or parsing, rethrown after being logged
 */
public static Map<String, Object> getTouTiaoList(String mediaId, String maxBehotTime, Date endData, Proxy proxy)
        throws Exception {
    Signature signature = new Signature();
    String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + mediaId + "&count=20&as="
            + signature.getAs() + "&cp=" + signature.getCp();
    if (maxBehotTime != null) {
        url = url + "&max_behot_time=" + maxBehotTime;
    }
    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", url);
    try {
        String htmlBody = downloadHtml(url, proxy, headerMap);
        if (htmlBody != null && htmlBody.contains("behot_time")) {
            Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
            if (ttList != null && !ttList.isEmpty()) {
                return ttList;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        // Log the exception as-is: fillInStackTrace() would replace the original
        // trace with this frame, for the log and for the caller alike.
        logger.error("获取今日头条帐号数据连接超时", e);
        throw e;
    }
    return Collections.emptyMap();
}
/**
 * Fetches one page of a media account's articles from the {@code /pgc/ma/} endpoint,
 * sending the request through the given {@link ProxyHolder}.
 *
 * @param mediaId      value sent as the {@code media_id} query parameter
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}; left out of the URL when null
 * @param endData      cut-off date handed to the parser; may be null
 * @param proxy        proxy holder passed to the HTTP client
 * @return the map built by {@code parseHtmlByAccount} (keys {@code max_behot_time} and {@code data}),
 *         or an empty map when the body lacks "behot_time" or could not be parsed
 * @throws Exception any failure while requesting or parsing, rethrown after being logged
 */
public static Map<String, Object> getTouTiaoList(String mediaId, String maxBehotTime, Date endData,
        ProxyHolder proxy) throws Exception {
    Signature signature = new Signature();
    String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + mediaId + "&count=20&as="
            + signature.getAs() + "&cp=" + signature.getCp();
    if (maxBehotTime != null) {
        url = url + "&max_behot_time=" + maxBehotTime;
    }
    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", url);
    // try-with-resources guarantees the response is released on every path.
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)) {
        String htmlBody = response.body().string();
        if (htmlBody != null && htmlBody.contains("behot_time")) {
            Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
            if (ttList != null && !ttList.isEmpty()) {
                return ttList;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        // Log the exception as-is: fillInStackTrace() would discard the original trace.
        logger.error("获取今日头条帐号数据连接超时", e);
        throw e;
    }
    return Collections.emptyMap();
}
/**
 * Fetches one page of a user's article history from the {@code /c/user/article/} endpoint.
 * The request is attempted up to three times, with a new {@code Signature} per attempt.
 *
 * @param userId       value sent as the {@code user_id} query parameter
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}
 * @param endData      cut-off date handed to the parser; may be null
 * @param proxy        proxy handed to {@code downloadHtml}
 * @return the map built by {@code parseHtmlByAccount}, or an empty map when no attempt
 *         produced a parsable body containing "behot_time"
 * @throws Exception any failure while downloading or parsing, rethrown after being logged
 */
public static Map<String, Object> getTouTiaoHistory(String userId, String maxBehotTime, Date endData, Proxy proxy)
        throws Exception {
    for (int attempt = 0; attempt < 3; attempt++) {
        Signature signature = new Signature(userId, maxBehotTime);
        String as = signature.getAs();
        String cp = signature.getCp();
        String signatureStr = signature.getSignature();
        String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + userId
                + "&max_behot_time=" + maxBehotTime + "&count=20&as=" + as + "&cp=" + cp
                + "&_signature=" + signatureStr;
        Map<String, String> headerMap = new HashMap<>();
        headerMap.put("user-agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
        headerMap.put("referer", "https://www.toutiao.com/c/user/" + userId + "/");
        try {
            String htmlBody = downloadHtml(url, proxy, headerMap);
            if (htmlBody != null && htmlBody.contains("behot_time")) {
                Map<String, Object> ttList = parseHtmlByAccount(userId, htmlBody, endData);
                if (ttList != null && !ttList.isEmpty()) {
                    return ttList;
                }
                // Unparsable body: fall through and try again.
            } else {
                logger.info("数据为null");
            }
        } catch (Exception e) {
            // Log the exception as-is: fillInStackTrace() would discard the original trace.
            logger.error("获取今日头条帐号数据连接超时", e);
            throw e;
        }
    }
    return Collections.emptyMap();
}
/**
 * Fetches one page of a user's article history from the {@code /c/user/article/} endpoint,
 * sending the request through the given {@link ProxyHolder}.
 *
 * <p>Up to three attempts are made. A body without "behot_time" triggers the next attempt;
 * a body that contains it but yields no parsed result ends the loop immediately.
 *
 * @param userId       value sent as the {@code user_id} query parameter
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}
 * @param endData      cut-off date handed to the parser; may be null
 * @param proxy        proxy holder passed to the HTTP client
 * @return the map built by {@code parseHtmlByAccount}, or an empty map otherwise
 * @throws Exception any failure while requesting or parsing, rethrown after being logged
 */
public static Map<String, Object> getTouTiaoHistory(String userId, String maxBehotTime, Date endData,
        ProxyHolder proxy) throws Exception {
    int attempt = 0;
    while (attempt < 3) {
        attempt++;
        Signature sig = new Signature(userId, maxBehotTime);
        String asValue = sig.getAs();
        String cpValue = sig.getCp();
        String signed = sig.getSignature();
        String requestUrl = new StringBuilder("https://www.toutiao.com/c/user/article/?page_type=1&user_id=")
                .append(userId)
                .append("&max_behot_time=").append(maxBehotTime)
                .append("&count=20&as=").append(asValue)
                .append("&cp=").append(cpValue)
                .append("&_signature=").append(signed)
                .toString();
        logger.info("当前采集的历史文章链接:::{}", requestUrl);

        Map<String, String> headers = new HashMap<>();
        headers.put("user-agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
        headers.put("referer", "https://www.toutiao.com/c/user/" + userId + "/");

        try {
            String body = httpBoot.syncCall(RequestUtils.wrapGet(requestUrl, headers), proxy).body().string();
            if (body == null || !body.contains("behot_time")) {
                logger.info("数据为null,获取到的文本为:::{}", body);
                continue;
            }
            Map<String, Object> parsed = parseHtmlByAccount(userId, body, endData);
            if (parsed == null || parsed.size() <= 0) {
                // The body looked valid but produced nothing; do not retry.
                break;
            }
            return parsed;
        } catch (Exception e) {
            logger.error("获取今日头条帐号数据连接超时", e);
            throw e;
        }
    }
    return Collections.emptyMap();
}
/**
 * Parses the JSON body of an account article listing.
 *
 * <p>Every element of the {@code data} array that has a {@code group_id} becomes a
 * {@code TouTiaoArticle}; an element that fails to parse is logged and skipped. The author id
 * is read from each element's {@code creator_uid}.
 *
 * @param htmlBody JSON text of the response
 * @param endDate  cut-off date; when non-null and later than the next-page cursor
 *                 (seconds converted to milliseconds), the cursor is cleared to signal "stop paging"
 * @return a map with {@code max_behot_time} (Long, possibly null) and {@code data}
 *         (List of TouTiaoArticle), or null when the body as a whole cannot be parsed
 */
private static Map<String, Object> parseHtmlByAccount(String htmlBody, Date endDate) {
    Map<String, Object> map = new HashMap<>();
    Long maxBehotTime = null;
    List<TouTiaoArticle> dataList = new ArrayList<>();
    try {
        JSONObject json = JSONObject.parseObject(htmlBody);
        JSONArray jsonArray = json.getJSONArray("data");
        maxBehotTime = Long.valueOf(json.getJSONObject("next").getString("max_behot_time"));
        for (int i = 0; i < jsonArray.size(); i++) {
            try {
                JSONObject data = jsonArray.getJSONObject(i);
                if (!data.containsKey("group_id")) {
                    continue;
                }
                String href = "https://www.toutiao.com/" + "a" + data.getLongValue("group_id");
                String title = data.getString("title");
                String content = data.getString("abstract");
                // behot_time is in seconds; TimeParse is given the millisecond value as text.
                String time = data.getLongValue("behot_time") * 1000 + "";
                Date date = TimeParse.stringFormartDate(time);
                String readNum = data.getString("go_detail_count");
                String commentNum = data.getString("comments_count");
                String playNum = data.getString("detail_play_effective_count");
                String shareNum = data.getString("share_count");
                String source = data.getString("source");
                String userId = data.getLong("creator_uid") + "";
                String articleType = data.getString("chinese_tag");
                String likeNum = null; // this listing is not read for a like count
                TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
                        playNum, readNum, shareNum, "今日头条", articleType, likeNum);
                if (data.containsKey("label")) {
                    List<String> labelList = data.getJSONArray("label").toJavaList(String.class);
                    tt.setLabelList(labelList);
                }
                dataList.add(tt);
            } catch (Exception e) {
                // Pass the exception as the trailing argument so the stack trace is logged too.
                logger.error("数据解析出现问题,{}", e.getMessage(), e);
            }
        }
    } catch (Exception e) {
        logger.error("数据解析出现问题,{}", e.getMessage(), e);
        return null;
    }
    // NOTE(review): the original also tested !"0".equals(maxBehotTime), which compares a String
    // with a Long and is therefore always true. The dead test is removed and the effective
    // behaviour kept: a cursor of 0 is still cleared whenever endDate is set. Confirm whether
    // a 0 cursor was meant to be passed through instead.
    if (endDate != null && maxBehotTime != null) {
        Date nextDate = new Date(maxBehotTime * 1000L);
        if (endDate.after(nextDate)) {
            maxBehotTime = null;
        }
    }
    map.put("max_behot_time", maxBehotTime);
    map.put("data", dataList);
    return map;
}
/**
 * Parses one JSON page of an account's article feed into {@link TouTiaoArticle} objects.
 *
 * <p>Only entries that carry a {@code group_id} are converted. An entry that fails to parse is
 * logged and skipped. The paging cursor is read from {@code next.max_behot_time}.
 *
 * @param userId   id stored on every article built from this page
 * @param htmlBody raw JSON response body
 * @param endDate  optional cut-off; when the next cursor (treated as seconds since the epoch)
 *                 lies before it, the cursor is cleared. May be {@code null}.
 * @return a map holding {@code "max_behot_time"} (a {@code Long}, or {@code null} once the
 *         cut-off is passed) and {@code "data"} (the parsed articles); {@code null} when the
 *         page as a whole cannot be parsed
 */
private static Map<String, Object> parseHtmlByAccount(String userId, String htmlBody, Date endDate) {
    Map<String, Object> map = new HashMap<>();
    Long maxBehotTime = null;
    List<TouTiaoArticle> dataList = new ArrayList<>();
    try {
        JSONObject json = JSONObject.parseObject(htmlBody);
        JSONArray jsonArray = json.getJSONArray("data");
        maxBehotTime = Long.valueOf(json.getJSONObject("next").getString("max_behot_time"));
        // This parser never reads a like count; the constructor still expects the argument.
        String likeNum = null;
        for (int i = 0; i < jsonArray.size(); i++) {
            try {
                JSONObject data = jsonArray.getJSONObject(i);
                if (!data.containsKey("group_id")) {
                    continue;
                }
                String href = "https://www.toutiao.com/" + "a" + data.getLongValue("group_id");
                String title = data.getString("title");
                String content = data.getString("abstract");
                // behot_time is multiplied by 1000 before being handed to the date helper.
                String time = data.getLongValue("behot_time") * 1000 + "";
                Date date = TimeParse.stringFormartDate(time);
                String readNum = data.getString("go_detail_count");
                String commentNum = data.getString("comments_count");
                String playNum = data.getString("detail_play_effective_count");
                String shareNum = data.getString("share_count");
                String source = data.getString("source");
                String articleType = data.getString("chinese_tag");
                TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
                        playNum, readNum, shareNum, "今日头条", articleType, likeNum);
                if (data.containsKey("label")) {
                    List<String> labelList = data.getJSONArray("label").toJavaList(String.class);
                    tt.setLabelList(labelList);
                }
                dataList.add(tt);
            } catch (Exception e) {
                logger.error("数据解析出现问题,{}", e.getMessage(), e);
            }
        }
    } catch (Exception e) {
        logger.error("数据解析出现问题,{}", e.getMessage(), e);
        return null;
    }
    // The cursor is a Long, so compare it numerically. The previous "0".equals(Long) check was
    // always false, which meant a zero cursor was never exempted from the cut-off test.
    if (endDate != null && maxBehotTime != null && maxBehotTime != 0L) {
        Date nextDate = new Date(maxBehotTime * 1000L);
        if (endDate.after(nextDate)) {
            maxBehotTime = null;
        }
    }
    map.put("max_behot_time", maxBehotTime);
    map.put("data", dataList);
    return map;
}
/**
 * Fetches one page of an account's micro-headline feed and parses it.
 *
 * @param userId       id of the visited account
 * @param endDate      cut-off date passed on to the parser
 * @param proxy        proxy passed to the page download
 * @param maxBehotTime paging cursor appended as {@code max_behot_time}; omitted when {@code null}
 * @return the parser's result map when it is non-empty, otherwise {@code null}
 *         (also {@code null} when the download fails or an exception occurs)
 * @throws IOException declared for callers; failures inside this method are caught and logged
 */
public static Map<String, Object> getMicroTouTiaoCrawler(String userId, Date endDate, Proxy proxy,
        String maxBehotTime) throws IOException {
    String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id="
            + userId;
    if (maxBehotTime != null) {
        url = url + "&max_behot_time=" + maxBehotTime;
    }
    // Use the logger (as the ProxyHolder overload does) instead of printing to stdout.
    logger.info("微头条采集链接:::{}", url);
    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/");
    try {
        String htmlBody = downloadHtml(url, proxy, headerMap);
        if (htmlBody == null) {
            logger.info("数据为null");
            return null;
        }
        Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate);
        if (dataMap != null && dataMap.size() > 0) {
            return dataMap;
        }
        return null;
    } catch (Exception e) {
        // Pass the throwable as a trailing argument so the stack trace is kept.
        logger.info("获取数据出错::{},数据为null", e.getMessage(), e);
        return null;
    }
}
/**
 * Fetches one page of an account's micro-headline feed through the shared HTTP client and
 * parses it.
 *
 * @param userId       id of the visited account
 * @param endDate      cut-off date passed on to the parser
 * @param proxy        proxy holder handed to the HTTP client
 * @param maxBehotTime paging cursor appended as {@code max_behot_time}; omitted when {@code null}
 * @return the parser's result map when it is non-empty, otherwise {@code null}
 * @throws IOException declared for callers; failures inside this method are caught and logged
 */
public static Map<String, Object> getMicroTouTiaoCrawler(String userId, Date endDate, ProxyHolder proxy,
        Long maxBehotTime) throws IOException {
    StringBuilder feedUrl = new StringBuilder(
            "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=")
            .append(userId);
    if (maxBehotTime != null) {
        feedUrl.append("&max_behot_time=").append(maxBehotTime);
    }
    String url = feedUrl.toString();
    logger.info("微头条采集链接:::{}", url);

    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/");

    try {
        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
        // A usable page is recognised by the presence of a "create_time" field.
        boolean usable = htmlBody != null && htmlBody.contains("create_time");
        if (!usable) {
            logger.info("数据为null");
            return null;
        }
        Map<String, Object> parsed = parseHtmlByMicroAccount(htmlBody, endDate);
        boolean hasResult = parsed != null && parsed.size() > 0;
        return hasResult ? parsed : null;
    } catch (Exception e) {
        logger.info("获取数据出错::{},数据为null", e);
        return null;
    }
}
/**
 * Collects an account's micro-headline posts through the mobile-client profile feed.
 *
 * <p>Pages are requested starting at {@code maxBehotTime} (sent as the {@code offset} query
 * parameter). Each response's {@code offset} becomes the next request's offset, and paging
 * stops once the server answers with the offset that was just requested. Paging also stops
 * after three consecutive failed requests, so a permanently failing endpoint cannot keep this
 * method looping forever.
 *
 * @param userId       id of the visited account
 * @param proxy        proxy holder handed to the HTTP client
 * @param maxBehotTime offset of the first page to request
 * @return one map per post with the keys {@code time}, {@code href}, {@code source},
 *         {@code content}, {@code readNum}, {@code commentNum}, {@code user_id} and, when the
 *         post has an {@code origin_group}, {@code title} and {@code replayUrl}; never
 *         {@code null}
 */
public static List<Map<String, Object>> getClientMicroToutiaoCrawler(String userId, ProxyHolder proxy,
        Long maxBehotTime) {
    final int maxConsecutiveFailures = 3;
    List<Map<String, Object>> dataList = new ArrayList<>();
    int consecutiveFailures = 0;
    while (true) {
        String url = "https://i.snssdk.com/api/feed/profile/v1/?visited_uid=" + userId + "&offset="
                + maxBehotTime;
        String requestedOffset = String.valueOf(maxBehotTime);
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            JSONObject json = JSONObject.parseObject(response.body().string());
            maxBehotTime = json.getLongValue("offset");
            JSONArray jsonArray = json.getJSONArray("data");
            for (int i = 0; i < jsonArray.size(); i++) {
                try {
                    JSONObject dataJSON = jsonArray.getJSONObject(i).getJSONObject("content")
                            .getJSONObject("raw_data");
                    if (!dataJSON.containsKey("comment_base") || dataJSON.getJSONObject("comment_base") == null) {
                        continue;
                    }
                    JSONObject commentBase = dataJSON.getJSONObject("comment_base");
                    JSONObject userInfo = commentBase.getJSONObject("user").getJSONObject("info");
                    JSONObject action = commentBase.getJSONObject("action");

                    Map<String, Object> map = new HashMap<>();
                    if (dataJSON.containsKey("origin_group")) {
                        JSONObject originGroup = dataJSON.getJSONObject("origin_group");
                        map.put("title", originGroup.getString("title"));
                        map.put("replayUrl", originGroup.getString("article_url"));
                    }
                    map.put("time", new Date(commentBase.getLongValue("create_time") * 1000));
                    map.put("href", "http://weitoutiao.zjurl.cn/ugc/share/wap/comment/" + dataJSON.getLongValue("id"));
                    map.put("source", userInfo.getString("name"));
                    map.put("content", commentBase.getString("content"));
                    map.put("readNum", action.getInteger("read_count") + "");
                    map.put("commentNum", action.getInteger("comment_count") + "");
                    // Keep the post author in the map only. Writing it back to the userId
                    // parameter would change which account the next page is requested for.
                    map.put("user_id", userInfo.getString("user_id"));
                    dataList.add(map);
                } catch (Exception e) {
                    logger.warn("客户端微头条单条数据解析错误", e);
                }
            }
            consecutiveFailures = 0;
            logger.info("采集到 {} 条 -- {} -- {}", dataList.size(), requestedOffset, maxBehotTime);
            if (requestedOffset.equals(String.valueOf(maxBehotTime))) {
                break;
            }
        } catch (Exception e) {
            consecutiveFailures++;
            logger.info("客户端微头条采集错误 {}", e.getMessage(), e);
            if (consecutiveFailures >= maxConsecutiveFailures) {
                break;
            }
        }
    }
    return dataList;
}
/**
 * Parses one JSON page of a micro-headline feed into {@link TouTiaoArticle} objects.
 *
 * <p>Each entry's payload is a JSON string read from {@code stream_cell.raw_data} or
 * {@code concern_talk_cell.packed_json_str}. Entries without a payload, or whose payload
 * cannot be parsed, are skipped. The title is the first 16 characters of the content (or the
 * whole content when shorter).
 *
 * @param htmlBody raw JSON response body
 * @param endDate  cut-off date; when the last parsed post is older, the paging cursor is
 *                 cleared. May be {@code null}.
 * @return a map holding {@code "max_behot_time"} (a {@code Long}; {@code null} when
 *         {@code has_more} is false or the cut-off was passed) and {@code "data"} (the parsed
 *         posts); never {@code null}
 */
private static Map<String, Object> parseHtmlByMicroAccount(String htmlBody, Date endDate) {
    final int titleLength = 16;
    Map<String, Object> map = new HashMap<>();
    Long maxBehotTime = null;
    List<TouTiaoArticle> dataList = new ArrayList<>();
    try {
        JSONObject json = JSONObject.parseObject(htmlBody);
        boolean more = json.getBooleanValue("has_more");
        if (json.containsKey("next")) {
            maxBehotTime = json.getJSONObject("next").getLongValue("max_behot_time");
        }
        // Creation time of the most recently handled post; used for the cut-off test below.
        Date date = null;
        if (json.containsKey("data")) {
            JSONArray jsonArray = json.getJSONArray("data");
            // These two are never populated for micro-headline posts.
            String playNum = null;
            String articleType = null;
            for (int i = 0; i < jsonArray.size(); i++) {
                try {
                    JSONObject data = jsonArray.getJSONObject(i);
                    String text = null;
                    if (data.containsKey("stream_cell") && data.getJSONObject("stream_cell") != null) {
                        text = data.getJSONObject("stream_cell").getString("raw_data");
                    } else if (data.containsKey("concern_talk_cell")) {
                        text = data.getJSONObject("concern_talk_cell").getString("packed_json_str");
                    }
                    JSONObject dataJSON = JSONObject.parseObject(text);
                    if (dataJSON == null) {
                        // No payload in either known cell type.
                        continue;
                    }

                    String href;
                    String source;
                    String content;
                    String readNum;
                    String commentNum;
                    String likeNum;
                    String userId;
                    if (dataJSON.containsKey("comment_base") && dataJSON.getJSONObject("comment_base") != null) {
                        JSONObject commentBase = dataJSON.getJSONObject("comment_base");
                        JSONObject userInfo = commentBase.getJSONObject("user").getJSONObject("info");
                        JSONObject action = dataJSON.getJSONObject("action");
                        date = new Date(commentBase.getLongValue("create_time") * 1000);
                        href = "https://www.toutiao.com/a" + dataJSON.getLongValue("id");
                        source = userInfo.getString("name");
                        content = dataJSON.getString("content");
                        readNum = action.getInteger("read_count") + "";
                        likeNum = action.getInteger("digg_count") + "";
                        commentNum = action.getInteger("comment_count") + "";
                        userId = userInfo.getString("user_id");
                    } else {
                        JSONObject user = dataJSON.getJSONObject("user");
                        date = new Date(dataJSON.getLongValue("create_time") * 1000);
                        href = "https://www.toutiao.com/a" + dataJSON.getString("thread_id");
                        source = user.getString("name");
                        content = dataJSON.getString("content");
                        readNum = dataJSON.getInteger("read_count") + "";
                        commentNum = dataJSON.getInteger("comment_count") + "";
                        likeNum = dataJSON.getInteger("digg_count") + "";
                        userId = user.getString("user_id");
                    }

                    // Compute the title per post. A shared, shrinking "count" variable used to
                    // truncate every later title to the length of the shortest earlier post,
                    // and an empty post used to inherit the previous post's title.
                    String title = null;
                    if (content != null && !"".equals(content)) {
                        title = content.substring(0, Math.min(content.length(), titleLength));
                    }

                    TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
                            playNum, readNum, "0", "微头条", articleType, likeNum);
                    dataList.add(tt);
                } catch (Exception e) {
                    logger.debug("微头条单条数据解析失败,{}", e.getMessage());
                }
            }
        } else {
            logger.info("微头条响应中没有data字段:{}", json);
        }
        // Decide whether there is a next page worth requesting.
        if (!more) {
            maxBehotTime = null;
        } else if (maxBehotTime != null && maxBehotTime != 0L
                && endDate != null && date != null && endDate.after(date)) {
            maxBehotTime = null;
        }
    } catch (Exception e) {
        logger.error("解析微头条数据出现问题", e);
    }
    map.put("max_behot_time", maxBehotTime);
    map.put("data", dataList);
    return map;
}
/**
 * Downloads an article page and returns the plain text of its body.
 *
 * <p>The body is taken from the {@code BASE_DATA} script block of the page: the object literal
 * is evaluated with the class's script engine, {@code articleInfo.content} is read from it and
 * the resulting HTML is reduced to text.
 *
 * @param url   article page address
 * @param proxy proxy passed to the page download
 * @return the article text, or {@code null} when the page is blank, has no {@code articleInfo},
 *         has no {@code BASE_DATA} script block, or any step fails
 */
public static String getContent(String url, Proxy proxy) {
    try {
        String page = downloadHtml(url, proxy, null);
        if (StringUtils.isBlank(page) || !page.contains("articleInfo")) {
            return null;
        }
        // Cut the script block that declares BASE_DATA out of the page.
        Matcher baseData = Pattern.compile("<script>var BASE_DATA[\\s\\S]+?</script>").matcher(page);
        if (!baseData.find()) {
            return null;
        }
        String objectLiteral = baseData.group().replaceAll("<script>var BASE_DATA = |;</script>", "");
        // NOTE(review): this evaluates script text taken from a remote page inside the JVM's
        // script engine; confirm the engine is sandboxed or replace with a JSON parser.
        String script = "eval((" + objectLiteral + ")).articleInfo.content.toString();";
        String articleHtml = scriptEngine.eval(script).toString();
        // Strip the markup and keep the text only.
        return Jsoup.parse(articleHtml).text();
    } catch (Exception e) {
        logger.error("跟据链接采集全文出现错误", e);
        return null;
    }
}
/**
 * Downloads an article page and extracts the value of its {@code itemId: '...'} field.
 *
 * @param url   article page address
 * @param proxy proxy passed to the page download
 * @return the trimmed item id, or {@code null} when the page could not be downloaded or does
 *         not contain the {@code itemId: '} marker
 * @throws Exception declared for callers; propagates anything thrown while building headers
 */
private static String getItemIdByUrl(String url, Proxy proxy) throws Exception {
    final String marker = "itemId: '";
    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    String htmlBody = downloadHtml(url, proxy, headerMap);
    if (htmlBody == null) {
        logger.info("获取itemId失败,链接地址为:{}", url);
        return null;
    }
    // Look for the exact marker. Checking only for "itemId" and then indexing the split result
    // threw ArrayIndexOutOfBoundsException on pages that mention itemId in another form.
    int start = htmlBody.indexOf(marker);
    if (start < 0) {
        return null;
    }
    start += marker.length();
    int end = htmlBody.indexOf("',", start);
    String raw = end < 0 ? htmlBody.substring(start) : htmlBody.substring(start, end);
    return raw.trim();
}
/**
 * Loads the metadata and text of an article identified by its page address.
 *
 * <p>The item id is resolved from the page first; the mobile {@code /info/} endpoint is then
 * requested up to three times until a response can be parsed.
 *
 * @param url   article page address; stored unchanged on the returned article
 * @param proxy proxy passed to the downloads
 * @return the populated article, or {@code null} when the item id cannot be resolved or no
 *         attempt yields a parsable response
 * @throws Exception when resolving the item id fails with an exception
 */
public static TouTiaoArticle getToutiaoArticleInfoByUrl(String url, Proxy proxy) throws Exception {
    String itemId = getItemIdByUrl(url, proxy);
    if (!Objects.nonNull(itemId)) {
        return null;
    }
    for (int i = 0; i < 3; i++) {
        try {
            String urlNew = "https://m.toutiao.com/i" + itemId + "/info/?_signature=&i=" + itemId;
            // Request headers: mobile referer and a mobile user agent.
            Map<String, String> headerMap = Tools.getTouTiaoHeader();
            headerMap.put("Referer", "https://m.toutiao.com/i" + itemId + "/");
            headerMap.put("User-Agent",
                    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Mobile Safari/537.36");
            String htmlBody = downloadHtml(urlNew, proxy, headerMap);
            if (htmlBody == null) {
                continue;
            }
            JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
            // Read counters as strings so a missing counter yields null instead of an NPE
            // that discards the whole article.
            String commentNum = data.getString("comment_count");
            String readNum = data.getString("impression_count");
            String playCount = data.getString("video_play_count");
            JSONObject mediaUser = data.getJSONObject("media_user");
            String userId = mediaUser == null ? null : mediaUser.getString("id");
            String content = data.getString("content");
            if (StringUtils.isNotBlank(content)) {
                content = Jsoup.parse(content).text();
            }
            Date time = new Date(data.getLong("publish_time") * 1000);

            TouTiaoArticle touTiaoArticle = new TouTiaoArticle();
            touTiaoArticle.setUrl(url);
            touTiaoArticle.setTitle(data.getString("title"));
            touTiaoArticle.setUser_id(userId);
            touTiaoArticle.setSource(data.getString("source"));
            touTiaoArticle.setTime(time);
            touTiaoArticle.setContent(content);
            touTiaoArticle.setCommentCount(commentNum);
            touTiaoArticle.setReadNum(readNum);
            touTiaoArticle.setPlayCount(playCount);
            return touTiaoArticle;
        } catch (Exception e) {
            // The previous message was copied from the comment-page code and named the wrong
            // operation.
            logger.info("解析文章信息时出现问题,链接地址为:{}", url, e);
        }
    }
    return null;
}
/**
 * Downloads a page and returns its body as text, trying up to three times.
 *
 * @param url     address to request
 * @param proxy   proxy to use; when {@code null}, {@code ProxyHolder.NAT_HEAVY_PROXY} is used
 * @param headMap request headers passed to the request builder
 * @return the response body, or {@code null} when all attempts fail
 */
private static String downloadHtml(String url, Proxy proxy, Map<String, String> headMap) {
    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        // try-with-resources releases the response even when reading the body fails.
        try (Response response = proxy != null
                ? httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), proxy)
                : httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_HEAVY_PROXY)) {
            return response.body().string();
        } catch (Exception e) {
            // Log the exception itself; fillInStackTrace() would replace the original trace
            // with the location of this handler.
            logger.error("获取数据时出现问题,问题为:{}", e.getMessage(), e);
        }
    }
    return null;
}
}
src/main/java/com/zhiwei/toutiao/parse/TouTiaoCommentParse.java
View file @
9d384b56
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.toutiao.bean.TouTiaoComment
;
import
com.zhiwei.toutiao.util.Tools
;
import
okhttp3.Response
;
/**
 * Crawls and parses the comment data of Toutiao articles.
 *
 * @author hero
 * @date 2016-12-09 19:50:28
 */
public class TouTiaoCommentParse {

    /** Number of comments requested per page. */
    private static final int PAGE_SIZE = 20;

    /** Number of attempts made by the retry loops in this class. */
    private static final int MAX_ATTEMPTS = 3;

    private static Logger logger = LogManager.getLogger(TouTiaoCommentParse.class);

    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /**
     * Collects the comments of an article, optionally limiting how many pages are fetched.
     *
     * @param url         article address
     * @param returnCount when positive, caps the number of pages at
     *                    {@code ceil(returnCount / 20)}; the result may therefore hold up to
     *                    one page more than {@code returnCount} comments
     * @param proxy       proxy passed to the downloads; may be {@code null}
     * @return the collected comments; empty when no group id can be resolved
     * @throws Exception when the total page count cannot be requested
     */
    public static List<TouTiaoComment> getTouTiaoComment(String url, int returnCount, Proxy proxy) throws Exception {
        List<TouTiaoComment> ttList = new ArrayList<TouTiaoComment>();
        String groupId = getGroupId(url, proxy);
        if (groupId == null) {
            return ttList;
        }
        // Total number of comment pages, capped by the caller's limit.
        int page = getPage(groupId, proxy);
        if (returnCount > 0) {
            int pageMax = (int) Math.ceil((double) returnCount / 20.0);
            if (page >= pageMax) {
                page = pageMax;
            }
        }
        for (int i = 0; i < page; i++) {
            String urlNew = "http://is.snssdk.com/article/v2/tab_comments/?app_name=news_article&offset="
                    + i * PAGE_SIZE + "&group_id=" + groupId + "&aggr_type=1&count=20&fold=1&item_id=" + groupId
                    + "&ts=" + System.currentTimeMillis();
            // Request headers: the comment endpoint is called as the iOS client.
            Map<String, String> headerMap = Tools.getTouTiaoHeader();
            headerMap.put("User-Agent", "News 6.6.5 rv:6.6.5.03 (iPhone; iOS 11.3; zh_CN) Cronet");
            headerMap.put("Host", "is.snssdk.com");
            for (int j = 1; j <= MAX_ATTEMPTS; j++) {
                try {
                    String htmlBody = downloadHtml(urlNew, proxy, headerMap);
                    if (htmlBody != null) {
                        List<TouTiaoComment> commentes = analySisComment(htmlBody, url);
                        ttList.addAll(commentes);
                        // Log the page just fetched (1-based), not the total page count.
                        logger.info(" url {} 采集到第 {} 页 采集到 {} 条数据 ", url, i + 1, ttList.size());
                    } else {
                        logger.info("采集出现问题,地址为:{}", url);
                    }
                    // Pause between pages; the pause is shorter when a proxy is supplied.
                    if (Objects.nonNull(proxy)) {
                        ZhiWeiTools.sleep(100);
                    } else {
                        ZhiWeiTools.sleep(4000);
                    }
                    break;
                } catch (Exception e) {
                    logger.warn("采集出现问题,地址为:{}", url, e);
                }
            }
        }
        return ttList;
    }

    /**
     * Parses one page of the comment list.
     *
     * @param htmlBody raw JSON response body
     * @param url      article address stored on every comment
     * @return the comments found on the page; entries without a {@code comment} object are
     *         skipped
     */
    private static List<TouTiaoComment> analySisComment(String htmlBody, String url) {
        List<TouTiaoComment> list = new ArrayList<>();
        try {
            JSONObject json = JSONObject.parseObject(htmlBody);
            JSONArray commentes = json.getJSONArray("data");
            for (int a = 0; a < commentes.size(); a++) {
                JSONObject entry = commentes.getJSONObject(a);
                JSONObject comment = entry == null ? null : entry.getJSONObject("comment");
                if (comment == null) {
                    // Skip this entry only; previously the NPE discarded the rest of the page.
                    continue;
                }
                String id = comment.getString("id");
                String text = comment.getString("text");
                String name = comment.getString("user_name");
                int replyCount = comment.getIntValue("reply_count");
                int diggCount = comment.getIntValue("digg_count");
                Date date = new Date(comment.getLongValue("create_time") * 1000);
                list.add(new TouTiaoComment(id, text, name, replyCount, diggCount, date, url));
            }
        } catch (Exception e) {
            logger.debug("解析今日头条评论列表出现为题,{}", e.getMessage(), e);
        }
        return list;
    }

    /**
     * Requests the first comment page and derives the total number of pages from
     * {@code data.total}.
     *
     * @param groupId group id of the article
     * @param proxy   proxy passed to the download
     * @return the number of pages, or {@code -1} when it cannot be determined
     * @throws Exception declared for callers
     */
    private static int getPage(String groupId, Proxy proxy) throws Exception {
        String urlNew = "http://www.toutiao.com/api/comment/list/?group_id=" + groupId
                + "&item_id=0&count=20&offset=0";
        Map<String, String> headerMap = Tools.getTouTiaoHeader();
        String htmlBody = downloadHtml(urlNew, proxy, headerMap);
        if (htmlBody != null) {
            try {
                JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
                int count = data.getIntValue("total");
                return (int) Math.ceil((double) count / 20.0);
            } catch (Exception e) {
                logger.info("获取评论总页数时出现问题:{}", e.getMessage(), e);
            }
        }
        return -1;
    }

    /**
     * Reads the comment count embedded in an article page, trying up to three times.
     *
     * @param url   article page address
     * @param proxy proxy passed to the download
     * @return the comment count, or {@code 0} when it cannot be read
     */
    public static int findCommentCount(String url, Proxy proxy) {
        for (int i = 0; i < MAX_ATTEMPTS; i++) {
            try {
                Map<String, String> headerMap = Tools.getTouTiaoHeader();
                String htmlBody = downloadHtml(url, proxy, headerMap);
                if (htmlBody != null && htmlBody.contains("commentInfo")) {
                    try {
                        return extractCommentsCount(htmlBody);
                    } catch (Exception e) {
                        logger.error("解析头条评论数错误:::{}", e.getMessage(), e);
                        return 0;
                    }
                }
            } catch (Exception e) {
                logger.warn("解析头条评论数错误:::{}", e.getMessage(), e);
            }
        }
        return 0;
    }

    /**
     * Reads the comment count embedded in an article page with a single attempt.
     *
     * @param url   article page address
     * @param proxy proxy passed to the download
     * @return the comment count, or {@code -1} when it cannot be read
     */
    public static int findNewCommentCountByProxy(String url, Proxy proxy) {
        try {
            Map<String, String> headerMap = Tools.getTouTiaoHeader();
            String htmlBody = downloadHtml(url, proxy, headerMap);
            if (htmlBody != null && htmlBody.contains("commentInfo")) {
                return extractCommentsCount(htmlBody);
            }
        } catch (Exception e) {
            logger.error("解析头条评论数错误:::{}", e.getMessage(), e);
        }
        return -1;
    }

    /**
     * Extracts the number that follows {@code comments_count: } in a page.
     *
     * @param htmlBody page text
     * @return the parsed count
     * @throws RuntimeException when the marker is missing or the value is not a number
     */
    private static int extractCommentsCount(String htmlBody) {
        // Trim so that whitespace around the number does not fail the parse.
        return Integer.valueOf(htmlBody.split("comments_count: ")[1].split(",")[0].trim());
    }

    /**
     * Requests the comment list of an article and returns {@code data.total}.
     *
     * @param url   article address
     * @param proxy proxy passed to the downloads
     * @return the total number of comments, or {@code -1} when it cannot be determined
     */
    public static int getCommentCount(String url, Proxy proxy) {
        String groupId = getGroupId(url, proxy);
        if (groupId == null) {
            // Without an id the request would be sent with "group_id=null".
            return -1;
        }
        for (int i = 0; i < MAX_ATTEMPTS; i++) {
            try {
                String urlNew = "http://www.toutiao.com/api/comment/list/?group_id=" + groupId
                        + "&item_id=0&count=20&offset=0";
                Map<String, String> headerMap = Tools.getTouTiaoHeader();
                String htmlBody = downloadHtml(urlNew, proxy, headerMap);
                if (htmlBody != null) {
                    JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
                    return data.getIntValue("total");
                }
            } catch (Exception e) {
                logger.info("获取评论总页数时出现问题:{}", e.getMessage(), e);
            }
        }
        return -1;
    }

    /**
     * Resolves the group id used by the comment endpoints.
     *
     * <p>For addresses containing {@code /a} or {@code /group/} the id is cut out of the
     * address; for addresses containing {@code /i} or {@code /item/} the page is downloaded
     * and the id is read from it.
     *
     * @param url   article address
     * @param proxy proxy passed to the download, when one is needed
     * @return the group id, or {@code null} when the address matches none of the known forms
     */
    private static String getGroupId(String url, Proxy proxy) {
        // NOTE(review): "/a" matches anywhere in the address, not only in front of an id.
        if (url.contains("/a")) {
            return url.split("/a")[1].replace("/", "");
        }
        if (url.contains("/group/")) {
            return url.split("/group/")[1].replace("/", "");
        }
        if (url.contains("/i") || url.contains("/item/")) {
            return gettGroupIdByUrl(url, proxy);
        }
        return null;
    }

    /**
     * Downloads an article page and extracts the value of its {@code groupId: '...'} field.
     *
     * @param url   article page address
     * @param proxy proxy passed to the download
     * @return the trimmed group id, or {@code null} when the page could not be downloaded,
     *         lacks the marker, or an exception occurs
     */
    private static String gettGroupIdByUrl(String url, Proxy proxy) {
        final String marker = "groupId: '";
        try {
            Map<String, String> headerMap = Tools.getTouTiaoHeader();
            String htmlBody = downloadHtml(url, proxy, headerMap);
            if (htmlBody == null) {
                logger.info("获取groupId失败,链接地址为:{}", url);
                return null;
            }
            // Look for the exact marker instead of indexing into a split result, which threw
            // when the page mentioned "groupId" in another form.
            int start = htmlBody.indexOf(marker);
            if (start < 0) {
                return null;
            }
            start += marker.length();
            int end = htmlBody.indexOf("',", start);
            String raw = end < 0 ? htmlBody.substring(start) : htmlBody.substring(start, end);
            return raw.trim();
        } catch (Exception e) {
            logger.error("获取groupId失败,链接地址为:{}", url, e);
            return null;
        }
    }

    /**
     * Downloads a page and returns its body as text, trying up to three times.
     *
     * @param url       address to request
     * @param proxy     proxy to use; when {@code null}, {@code ProxyHolder.NAT_HEAVY_PROXY}
     *                  is used
     * @param headerMap request headers passed to the request builder
     * @return the response body, or {@code null} when all attempts fail
     */
    private static String downloadHtml(String url, Proxy proxy, Map<String, String> headerMap) {
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            // try-with-resources releases the response even when reading the body fails.
            try (Response response = proxy != null
                    ? httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)
                    : httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY)) {
                return response.body().string();
            } catch (Exception e) {
                // Log the exception itself; fillInStackTrace() would replace its trace.
                logger.error("获取数据时出现问题,问题为:{}", e.getMessage(), e);
            }
        }
        return null;
    }
}
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.toutiao.bean.TouTiaoComment
;
import
com.zhiwei.toutiao.util.Tools
;
import
okhttp3.Response
;
import
org.jsoup.Jsoup
;
/**
 * Crawls and parses the comment data of Toutiao articles.
 *
 * @author hero
 * @date 2016-12-09 19:50:28
 */
public class TouTiaoCommentParse {

    /** Number of comments requested per page. */
    private static final int PAGE_SIZE = 20;

    /** Number of attempts made by the retry loops in this class. */
    private static final int MAX_ATTEMPTS = 3;

    private static Logger logger = LogManager.getLogger(TouTiaoCommentParse.class);

    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /**
     * Collects the comments of an article, optionally limiting how many pages are fetched.
     *
     * @param url         article address
     * @param returnCount when positive, caps the number of pages at
     *                    {@code ceil(returnCount / 20)}; the result may therefore hold up to
     *                    one page more than {@code returnCount} comments
     * @param proxy       proxy passed to the downloads; may be {@code null}
     * @return the collected comments; empty when no group id can be resolved
     * @throws Exception when the group id or the total page count cannot be requested
     */
    public static List<TouTiaoComment> getTouTiaoComment(String url, int returnCount, Proxy proxy) throws Exception {
        List<TouTiaoComment> ttList = new ArrayList<TouTiaoComment>();
        String groupId = getGroupId(url, proxy);
        if (groupId == null) {
            return ttList;
        }
        // Total number of comment pages, capped by the caller's limit.
        int page = getPage(groupId, proxy);
        if (returnCount > 0) {
            int pageMax = (int) Math.ceil((double) returnCount / 20.0);
            if (page >= pageMax) {
                page = pageMax;
            }
        }
        for (int i = 0; i < page; i++) {
            String urlNew = "http://is.snssdk.com/article/v2/tab_comments/?app_name=news_article&offset="
                    + i * PAGE_SIZE + "&group_id=" + groupId + "&aggr_type=1&count=20&fold=1&item_id=" + groupId
                    + "&ts=" + System.currentTimeMillis();
            // Request headers: the comment endpoint is called as the iOS client.
            Map<String, String> headerMap = Tools.getTouTiaoHeader();
            headerMap.put("User-Agent", "News 6.6.5 rv:6.6.5.03 (iPhone; iOS 11.3; zh_CN) Cronet");
            headerMap.put("Host", "is.snssdk.com");
            for (int j = 1; j <= MAX_ATTEMPTS; j++) {
                try {
                    String htmlBody = downloadHtml(urlNew, proxy, headerMap);
                    if (htmlBody != null) {
                        List<TouTiaoComment> commentes = analySisComment(htmlBody, url);
                        ttList.addAll(commentes);
                        // Log the page just fetched (1-based), not the total page count.
                        logger.info(" url {} 采集到第 {} 页 采集到 {} 条数据 ", url, i + 1, ttList.size());
                    } else {
                        logger.info("采集出现问题,地址为:{}", url);
                    }
                    // Pause between pages; the pause is shorter when a proxy is supplied.
                    if (Objects.nonNull(proxy)) {
                        ZhiWeiTools.sleep(100);
                    } else {
                        ZhiWeiTools.sleep(4000);
                    }
                    break;
                } catch (Exception e) {
                    logger.warn("采集出现问题,地址为:{}", url, e);
                }
            }
        }
        return ttList;
    }

    /**
     * Parses one page of the comment list.
     *
     * @param htmlBody raw JSON response body
     * @param url      article address stored on every comment
     * @return the comments found on the page; entries without a {@code comment} object are
     *         skipped
     */
    private static List<TouTiaoComment> analySisComment(String htmlBody, String url) {
        List<TouTiaoComment> list = new ArrayList<>();
        try {
            JSONObject json = JSONObject.parseObject(htmlBody);
            JSONArray commentes = json.getJSONArray("data");
            for (int a = 0; a < commentes.size(); a++) {
                JSONObject entry = commentes.getJSONObject(a);
                JSONObject comment = entry == null ? null : entry.getJSONObject("comment");
                if (comment == null) {
                    // Skip this entry only; previously the NPE discarded the rest of the page.
                    continue;
                }
                String id = comment.getString("id");
                String text = comment.getString("text");
                String name = comment.getString("user_name");
                int replyCount = comment.getIntValue("reply_count");
                int diggCount = comment.getIntValue("digg_count");
                Date date = new Date(comment.getLongValue("create_time") * 1000);
                list.add(new TouTiaoComment(id, text, name, replyCount, diggCount, date, url));
            }
        } catch (Exception e) {
            logger.debug("解析今日头条评论列表出现为题,{}", e.getMessage(), e);
        }
        return list;
    }

    /**
     * Requests the first comment page and derives the total number of pages from
     * {@code data.total}.
     *
     * @param groupId group id of the article
     * @param proxy   proxy passed to the download
     * @return the number of pages, or {@code -1} when it cannot be determined
     * @throws Exception declared for callers
     */
    private static int getPage(String groupId, Proxy proxy) throws Exception {
        String urlNew = "http://www.toutiao.com/api/comment/list/?group_id=" + groupId
                + "&item_id=0&count=20&offset=0";
        Map<String, String> headerMap = Tools.getTouTiaoHeader();
        String htmlBody = downloadHtml(urlNew, proxy, headerMap);
        if (htmlBody != null) {
            try {
                JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
                int count = data.getIntValue("total");
                return (int) Math.ceil((double) count / 20.0);
            } catch (Exception e) {
                logger.info("获取评论总页数时出现问题:{}", e.getMessage(), e);
            }
        }
        return -1;
    }

    /**
     * Reads the comment count embedded in an article page, trying up to three times.
     *
     * @param url   article page address
     * @param proxy proxy passed to the download
     * @return the comment count, or {@code 0} when it cannot be read
     */
    public static int findCommentCount(String url, Proxy proxy) {
        for (int i = 0; i < MAX_ATTEMPTS; i++) {
            try {
                Map<String, String> headerMap = Tools.getTouTiaoHeader();
                String htmlBody = downloadHtml(url, proxy, headerMap);
                if (htmlBody != null && htmlBody.contains("commentInfo")) {
                    try {
                        return extractCommentsCount(htmlBody);
                    } catch (Exception e) {
                        logger.error("解析头条评论数错误:::{}", e.getMessage(), e);
                        return 0;
                    }
                }
            } catch (Exception e) {
                logger.warn("解析头条评论数错误:::{}", e.getMessage(), e);
            }
        }
        return 0;
    }

    /**
     * Reads the comment count embedded in an article page with a single attempt.
     *
     * @param url   article page address
     * @param proxy proxy passed to the download
     * @return the comment count, or {@code -1} when it cannot be read
     */
    public static int findNewCommentCountByProxy(String url, Proxy proxy) {
        try {
            Map<String, String> headerMap = Tools.getTouTiaoHeader();
            String htmlBody = downloadHtml(url, proxy, headerMap);
            if (htmlBody != null && htmlBody.contains("commentInfo")) {
                return extractCommentsCount(htmlBody);
            }
        } catch (Exception e) {
            logger.error("解析头条评论数错误:::{}", e.getMessage(), e);
        }
        return -1;
    }

    /**
     * Extracts the number that follows {@code comments_count: } in a page.
     *
     * @param htmlBody page text
     * @return the parsed count
     * @throws RuntimeException when the marker is missing or the value is not a number
     */
    private static int extractCommentsCount(String htmlBody) {
        // Trim so that whitespace around the number does not fail the parse.
        return Integer.valueOf(htmlBody.split("comments_count: ")[1].split(",")[0].trim());
    }

    /**
     * Requests the comment list of an article and returns {@code data.total}.
     *
     * @param url   article address
     * @param proxy proxy passed to the downloads
     * @return the total number of comments, or {@code -1} when it cannot be determined
     * @throws Exception when resolving the group id fails with an exception
     */
    public static int getCommentCount(String url, Proxy proxy) throws Exception {
        String groupId = getGroupId(url, proxy);
        if (groupId == null) {
            // Without an id the request would be sent with "group_id=null".
            return -1;
        }
        for (int i = 0; i < MAX_ATTEMPTS; i++) {
            try {
                String urlNew = "http://www.toutiao.com/api/comment/list/?group_id=" + groupId
                        + "&item_id=0&count=20&offset=0";
                Map<String, String> headerMap = Tools.getTouTiaoHeader();
                String htmlBody = downloadHtml(urlNew, proxy, headerMap);
                if (htmlBody != null) {
                    JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
                    return data.getIntValue("total");
                }
            } catch (Exception e) {
                logger.info("获取评论总页数时出现问题:{}", e.getMessage(), e);
            }
        }
        return -1;
    }

    /**
     * Resolves the group id used by the comment endpoints.
     *
     * <p>For addresses containing {@code /a} or {@code /group/} the id is cut out of the
     * address; for addresses containing {@code /i} or {@code /item/} the page is downloaded
     * and the id is read from it.
     *
     * @param url   article address
     * @param proxy proxy passed to the download, when one is needed
     * @return the group id, or {@code null} when the address matches none of the known forms
     * @throws Exception when reading the id from the page fails with an exception
     */
    private static String getGroupId(String url, Proxy proxy) throws Exception {
        // NOTE(review): "/a" matches anywhere in the address, not only in front of an id.
        if (url.contains("/a")) {
            return url.split("/a")[1].replace("/", "");
        }
        if (url.contains("/group/")) {
            return url.split("/group/")[1].replace("/", "");
        }
        if (url.contains("/i") || url.contains("/item/")) {
            return getGroupIdByUrl(url, proxy);
        }
        return null;
    }

    /**
     * Downloads an article page and extracts the value of its {@code groupId: '...'} field.
     *
     * @param url   article page address
     * @param proxy proxy passed to the download
     * @return the trimmed group id, or {@code null} when the page could not be downloaded or
     *         lacks the {@code groupId: '} marker
     * @throws Exception declared for callers; propagates anything thrown while building headers
     */
    private static String getGroupIdByUrl(String url, Proxy proxy) throws Exception {
        final String marker = "groupId: '";
        Map<String, String> headerMap = Tools.getTouTiaoHeader();
        String htmlBody = downloadHtml(url, proxy, headerMap);
        if (htmlBody == null) {
            logger.info("获取groupId失败,链接地址为:{}", url);
            return null;
        }
        // Look for the exact marker instead of indexing into a split result, which threw
        // when the page mentioned "groupId" in another form.
        int start = htmlBody.indexOf(marker);
        if (start < 0) {
            return null;
        }
        start += marker.length();
        int end = htmlBody.indexOf("',", start);
        String raw = end < 0 ? htmlBody.substring(start) : htmlBody.substring(start, end);
        return raw.trim();
    }

    /**
     * Downloads a page and returns its body as text, trying up to three times.
     *
     * @param url       address to request
     * @param proxy     proxy to use; when {@code null}, {@code ProxyHolder.NAT_HEAVY_PROXY}
     *                  is used
     * @param headerMap request headers passed to the request builder
     * @return the response body, or {@code null} when all attempts fail
     */
    private static String downloadHtml(String url, Proxy proxy, Map<String, String> headerMap) {
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            // try-with-resources releases the response even when reading the body fails.
            try (Response response = proxy != null
                    ? httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)
                    : httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY)) {
                return response.body().string();
            } catch (Exception e) {
                // Log the exception itself; fillInStackTrace() would replace its trace.
                logger.error("获取数据时出现问题,问题为:{}", e.getMessage(), e);
            }
        }
        return null;
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment