Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
toutiao
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
toutiao
Commits
9d384b56
Commit
9d384b56
authored
Oct 28, 2019
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加更新今日头条阅读数功能
parent
34d3c078
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
1085 additions
and
997 deletions
+1085
-997
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
+736
-652
src/main/java/com/zhiwei/toutiao/parse/TouTiaoCommentParse.java
+349
-345
No files found.
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
View file @
9d384b56
/**
/**
* @Title: TouTiaoParse.java
* @Title: TouTiaoParse.java
* @Package com.zhiwei.toutiao.parse
* @Package com.zhiwei.toutiao.parse
* @Description:
* @Description:
* @author hero
* @author hero
* @date 2016年9月2日 上午11:17:44
* @date 2016年9月2日 上午11:17:44
* @version V1.0
* @version V1.0
*/
*/
/**
/**
*
*
*/
*/
package
com
.
zhiwei
.
toutiao
.
parse
;
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.*
;
import
java.util.Collections
;
import
java.util.regex.Matcher
;
import
java.util.Date
;
import
java.util.regex.Pattern
;
import
java.util.HashMap
;
import
java.util.List
;
import
javax.script.ScriptEngine
;
import
java.util.Map
;
import
javax.script.ScriptEngineManager
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
javax.script.ScriptEngine
;
import
org.apache.logging.log4j.Logger
;
import
javax.script.ScriptEngineManager
;
import
org.jsoup.Jsoup
;
import
org.apache.commons.lang3.StringUtils
;
import
com.alibaba.fastjson.JSONArray
;
import
org.apache.logging.log4j.LogManager
;
import
com.alibaba.fastjson.JSONObject
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
org.jsoup.Jsoup
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.alibaba.fastjson.JSONArray
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.toutiao.bean.Signature
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.toutiao.util.Tools
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
okhttp3.Response
;
import
com.zhiwei.toutiao.bean.Signature
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
/**
import
com.zhiwei.toutiao.util.Tools
;
* @Description:头条帐号采集
* @author hero
import
okhttp3.Response
;
* @date 2016年9月2日 上午11:17:44
*/
/**
public
class
TouTiaoArticleParse
{
* @Description:头条帐号采集
* @author hero
private
static
ScriptEngine
scriptEngine
=
new
ScriptEngineManager
().
getEngineByName
(
"javascript"
);
* @date 2016年9月2日 上午11:17:44
private
static
Logger
logger
=
LogManager
.
getLogger
(
TouTiaoArticleParse
.
class
);
*/
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
public
class
TouTiaoArticleParse
{
private
static
ScriptEngine
scriptEngine
=
new
ScriptEngineManager
().
getEngineByName
(
"javascript"
);
/***
private
static
Logger
logger
=
LogManager
.
getLogger
(
TouTiaoArticleParse
.
class
);
* 获取头条数据
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
*
* @Description:
* @param @param
/***
* url
* 获取头条数据
* @param @return
*
* @return List<TouTiao> 返回类型
* @Description:
* @throws Exception
* @param @param
*/
* url
public
static
Map
<
String
,
Object
>
getTouTiaoList
(
String
mediaId
,
String
maxBehotTime
,
Date
endData
,
Proxy
proxy
)
* @param @return
throws
Exception
{
* @return List<TouTiao> 返回类型
Signature
signature
=
new
Signature
();
* @throws Exception
String
url
=
"https://www.toutiao.com/pgc/ma/?page_type=1&media_id="
+
mediaId
+
"&count=20&as="
*/
+
signature
.
getAs
()
+
"&cp="
+
signature
.
getCp
();
public
static
Map
<
String
,
Object
>
getTouTiaoList
(
String
mediaId
,
String
maxBehotTime
,
Date
endData
,
Proxy
proxy
)
if
(
maxBehotTime
!=
null
)
{
throws
Exception
{
url
=
url
+
"&max_behot_time="
+
maxBehotTime
;
Signature
signature
=
new
Signature
();
}
String
url
=
"https://www.toutiao.com/pgc/ma/?page_type=1&media_id="
+
mediaId
+
"&count=20&as="
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
+
signature
.
getAs
()
+
"&cp="
+
signature
.
getCp
();
headerMap
.
put
(
"Referer"
,
url
);
if
(
maxBehotTime
!=
null
)
{
try
{
url
=
url
+
"&max_behot_time="
+
maxBehotTime
;
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
htmlBody
,
endData
);
headerMap
.
put
(
"Referer"
,
url
);
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
try
{
return
ttList
;
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
}
else
{
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
htmlBody
,
endData
);
logger
.
info
(
"数据为null"
);
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
}
return
ttList
;
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
.
fillInStackTrace
());
}
else
{
throw
e
;
logger
.
info
(
"数据为null"
);
}
}
return
Collections
.
emptyMap
();
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
.
fillInStackTrace
());
throw
e
;
}
public
static
Map
<
String
,
Object
>
getTouTiaoList
(
String
mediaId
,
String
maxBehotTime
,
Date
endData
,
ProxyHolder
proxy
)
return
Collections
.
emptyMap
();
throws
Exception
{
}
Signature
signature
=
new
Signature
();
String
url
=
"https://www.toutiao.com/pgc/ma/?page_type=1&media_id="
+
mediaId
+
"&count=20&as="
+
signature
.
getAs
()
+
"&cp="
+
signature
.
getCp
();
public
static
Map
<
String
,
Object
>
getTouTiaoList
(
String
mediaId
,
String
maxBehotTime
,
Date
endData
,
ProxyHolder
proxy
)
if
(
maxBehotTime
!=
null
)
{
throws
Exception
{
url
=
url
+
"&max_behot_time="
+
maxBehotTime
;
Signature
signature
=
new
Signature
();
}
String
url
=
"https://www.toutiao.com/pgc/ma/?page_type=1&media_id="
+
mediaId
+
"&count=20&as="
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
+
signature
.
getAs
()
+
"&cp="
+
signature
.
getCp
();
headerMap
.
put
(
"Referer"
,
url
);
if
(
maxBehotTime
!=
null
)
{
try
{
url
=
url
+
"&max_behot_time="
+
maxBehotTime
;
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
htmlBody
,
endData
);
headerMap
.
put
(
"Referer"
,
url
);
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
try
{
return
ttList
;
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
}
else
{
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
htmlBody
,
endData
);
logger
.
info
(
"数据为null"
);
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
}
return
ttList
;
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
.
fillInStackTrace
());
}
else
{
throw
e
;
logger
.
info
(
"数据为null"
);
}
}
return
Collections
.
emptyMap
();
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
.
fillInStackTrace
());
throw
e
;
/**
}
* 获取今日头条历史文章接口新
return
Collections
.
emptyMap
();
*
}
* @param user_id
* @param max_behot_time
/**
* @param endData
* 获取今日头条历史文章接口新
* @param proxy
*
* @return
* @param user_id
* @throws Exception
* @param max_behot_time
*/
* @param endData
public
static
Map
<
String
,
Object
>
getTouTiaoHistory
(
String
userId
,
String
maxBehotTime
,
Date
endData
,
* @param proxy
Proxy
proxy
)
throws
Exception
{
* @return
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
* @throws Exception
Signature
signature
=
new
Signature
(
userId
,
maxBehotTime
);
*/
String
as
=
signature
.
getAs
();
public
static
Map
<
String
,
Object
>
getTouTiaoHistory
(
String
userId
,
String
maxBehotTime
,
Date
endData
,
String
cp
=
signature
.
getCp
();
Proxy
proxy
)
throws
Exception
{
String
signatureStr
=
signature
.
getSignature
();
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
userId
+
"&max_behot_time="
Signature
signature
=
new
Signature
(
userId
,
maxBehotTime
);
+
maxBehotTime
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
signatureStr
;
String
as
=
signature
.
getAs
();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
String
cp
=
signature
.
getCp
();
headerMap
.
put
(
"user-agent"
,
String
signatureStr
=
signature
.
getSignature
();
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
userId
+
"&max_behot_time="
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
userId
+
"/"
);
+
maxBehotTime
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
signatureStr
;
try
{
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
headerMap
.
put
(
"user-agent"
,
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
userId
,
htmlBody
,
endData
);
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
userId
+
"/"
);
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
try
{
return
ttList
;
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
}
else
{
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
userId
,
htmlBody
,
endData
);
logger
.
info
(
"数据为null"
);
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
continue
;
return
ttList
;
}
}
}
catch
(
Exception
e
)
{
}
else
{
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
.
fillInStackTrace
());
logger
.
info
(
"数据为null"
);
throw
e
;
continue
;
}
}
}
}
catch
(
Exception
e
)
{
return
Collections
.
emptyMap
();
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
.
fillInStackTrace
());
}
throw
e
;
}
public
static
Map
<
String
,
Object
>
getTouTiaoHistory
(
String
userId
,
String
maxBehotTime
,
Date
endData
,
}
ProxyHolder
proxy
)
throws
Exception
{
return
Collections
.
emptyMap
();
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
}
Signature
signature
=
new
Signature
(
userId
,
maxBehotTime
);
String
as
=
signature
.
getAs
();
public
static
Map
<
String
,
Object
>
getTouTiaoHistory
(
String
userId
,
String
maxBehotTime
,
Date
endData
,
String
cp
=
signature
.
getCp
();
ProxyHolder
proxy
)
throws
Exception
{
String
signatureStr
=
signature
.
getSignature
();
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
userId
+
"&max_behot_time="
Signature
signature
=
new
Signature
(
userId
,
maxBehotTime
);
+
maxBehotTime
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
signatureStr
;
String
as
=
signature
.
getAs
();
logger
.
info
(
"当前采集的历史文章链接:::{}"
,
url
);
String
cp
=
signature
.
getCp
();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
String
signatureStr
=
signature
.
getSignature
();
headerMap
.
put
(
"user-agent"
,
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
userId
+
"&max_behot_time="
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
+
maxBehotTime
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
signatureStr
;
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
userId
+
"/"
);
logger
.
info
(
"当前采集的历史文章链接:::{}"
,
url
);
String
htmlBody
=
null
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
try
{
headerMap
.
put
(
"user-agent"
,
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
userId
+
"/"
);
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
userId
,
htmlBody
,
endData
);
String
htmlBody
=
null
;
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
try
{
return
ttList
;
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
}
else
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
break
;
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
userId
,
htmlBody
,
endData
);
}
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
}
else
{
return
ttList
;
logger
.
info
(
"数据为null,获取到的文本为:::{}"
,
htmlBody
);
}
else
{
continue
;
break
;
}
}
}
catch
(
Exception
e
)
{
}
else
{
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
);
logger
.
info
(
"数据为null,获取到的文本为:::{}"
,
htmlBody
);
throw
e
;
continue
;
}
}
}
}
catch
(
Exception
e
)
{
return
Collections
.
emptyMap
();
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
);
}
throw
e
;
}
/***
}
* 根据帐号解析历史文章地址
return
Collections
.
emptyMap
();
*
}
* @Description:根据帐号解析历史文章地址
* @param @param
/***
* htmlBody
* 根据帐号解析历史文章地址
* @param @return
*
* @return List<String> 返回类型
* @Description:根据帐号解析历史文章地址
*/
* @param @param
private
static
Map
<
String
,
Object
>
parseHtmlByAccount
(
String
htmlBody
,
Date
endDate
)
{
* htmlBody
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
* @param @return
Long
maxBehotTime
=
null
;
* @return List<String> 返回类型
List
<
TouTiaoArticle
>
dataList
=
new
ArrayList
<>();
*/
try
{
private
static
Map
<
String
,
Object
>
parseHtmlByAccount
(
String
htmlBody
,
Date
endDate
)
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
JSONArray
jsonArray
=
json
.
getJSONArray
(
"data"
);
Long
maxBehotTime
=
null
;
maxBehotTime
=
Long
.
valueOf
(
json
.
getJSONObject
(
"next"
).
getString
(
"max_behot_time"
));
List
<
TouTiaoArticle
>
dataList
=
new
ArrayList
<>();
String
title
=
null
;
try
{
String
content
=
null
;
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
String
time
=
null
;
JSONArray
jsonArray
=
json
.
getJSONArray
(
"data"
);
Date
date
=
null
;
maxBehotTime
=
Long
.
valueOf
(
json
.
getJSONObject
(
"next"
).
getString
(
"max_behot_time"
));
String
readNum
=
null
;
String
title
=
null
;
String
commentNum
=
null
;
String
content
=
null
;
String
playNum
=
null
;
String
time
=
null
;
String
shareNum
=
null
;
Date
date
=
null
;
String
source
=
null
;
String
readNum
=
null
;
String
userId
=
null
;
String
commentNum
=
null
;
String
articleType
=
null
;
String
playNum
=
null
;
List
<
String
>
labelList
=
null
;
String
shareNum
=
null
;
String
likeNum
=
null
;
String
source
=
null
;
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
String
userId
=
null
;
try
{
String
articleType
=
null
;
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
List
<
String
>
labelList
=
null
;
String
href
=
"https://www.toutiao.com/"
;
String
likeNum
=
null
;
if
(
data
.
containsKey
(
"group_id"
))
{
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
href
=
href
+
"a"
+
data
.
getLongValue
(
"group_id"
);
try
{
title
=
data
.
getString
(
"title"
);
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
content
=
data
.
getString
(
"abstract"
);
String
href
=
"https://www.toutiao.com/"
;
time
=
data
.
getLongValue
(
"behot_time"
)
*
1000
+
""
;
if
(
data
.
containsKey
(
"group_id"
))
{
date
=
TimeParse
.
stringFormartDate
(
time
);
href
=
href
+
"a"
+
data
.
getLongValue
(
"group_id"
);
readNum
=
data
.
getString
(
"go_detail_count"
);
title
=
data
.
getString
(
"title"
);
commentNum
=
data
.
getString
(
"comments_count"
);
content
=
data
.
getString
(
"abstract"
);
playNum
=
data
.
getString
(
"detail_play_effective_count"
);
time
=
data
.
getLongValue
(
"behot_time"
)
*
1000
+
""
;
shareNum
=
data
.
getString
(
"share_count"
);
date
=
TimeParse
.
stringFormartDate
(
time
);
source
=
data
.
getString
(
"source"
);
readNum
=
data
.
getString
(
"go_detail_count"
);
userId
=
data
.
getLong
(
"creator_uid"
)
+
""
;
commentNum
=
data
.
getString
(
"comments_count"
);
articleType
=
data
.
getString
(
"chinese_tag"
);
playNum
=
data
.
getString
(
"detail_play_effective_count"
);
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
userId
,
source
,
date
,
content
,
commentNum
,
shareNum
=
data
.
getString
(
"share_count"
);
playNum
,
readNum
,
shareNum
,
"今日头条"
,
articleType
,
likeNum
);
source
=
data
.
getString
(
"source"
);
if
(
data
.
containsKey
(
"label"
))
{
userId
=
data
.
getLong
(
"creator_uid"
)
+
""
;
labelList
=
data
.
getJSONArray
(
"label"
).
toJavaList
(
String
.
class
);
articleType
=
data
.
getString
(
"chinese_tag"
);
tt
.
setLabelList
(
labelList
);
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
userId
,
source
,
date
,
content
,
commentNum
,
}
playNum
,
readNum
,
shareNum
,
"今日头条"
,
articleType
,
likeNum
);
dataList
.
add
(
tt
);
if
(
data
.
containsKey
(
"label"
))
{
}
labelList
=
data
.
getJSONArray
(
"label"
).
toJavaList
(
String
.
class
);
}
catch
(
Exception
e
)
{
tt
.
setLabelList
(
labelList
);
logger
.
error
(
"数据解析出现问题,{}"
,
e
.
getMessage
());
}
continue
;
dataList
.
add
(
tt
);
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"数据解析出现问题,{}"
,
e
.
getMessage
());
logger
.
error
(
"数据解析出现问题,{}"
,
e
.
getMessage
());
continue
;
return
null
;
}
}
}
}
catch
(
Exception
e
)
{
if
(
endDate
!=
null
)
{
logger
.
error
(
"数据解析出现问题,{}"
,
e
.
getMessage
());
if
(
maxBehotTime
!=
null
&&
!
"0"
.
equals
(
maxBehotTime
))
{
return
null
;
Date
nextDate
=
new
Date
(
Long
.
valueOf
(
maxBehotTime
+
"000"
));
}
if
(
endDate
.
after
(
nextDate
))
{
maxBehotTime
=
null
;
if
(
endDate
!=
null
)
{
}
if
(
maxBehotTime
!=
null
&&
!
"0"
.
equals
(
maxBehotTime
))
{
}
Date
nextDate
=
new
Date
(
Long
.
valueOf
(
maxBehotTime
+
"000"
));
}
if
(
endDate
.
after
(
nextDate
))
{
map
.
put
(
"max_behot_time"
,
maxBehotTime
);
maxBehotTime
=
null
;
map
.
put
(
"data"
,
dataList
);
}
return
map
;
}
}
}
map
.
put
(
"max_behot_time"
,
maxBehotTime
);
private
static
Map
<
String
,
Object
>
parseHtmlByAccount
(
String
userId
,
String
htmlBody
,
Date
endDate
)
{
map
.
put
(
"data"
,
dataList
);
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
return
map
;
Long
maxBehotTime
=
null
;
}
List
<
TouTiaoArticle
>
dataList
=
new
ArrayList
<>();
try
{
private
static
Map
<
String
,
Object
>
parseHtmlByAccount
(
String
userId
,
String
htmlBody
,
Date
endDate
)
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
JSONArray
jsonArray
=
json
.
getJSONArray
(
"data"
);
Long
maxBehotTime
=
null
;
maxBehotTime
=
Long
.
valueOf
(
json
.
getJSONObject
(
"next"
).
getString
(
"max_behot_time"
));
List
<
TouTiaoArticle
>
dataList
=
new
ArrayList
<>();
String
title
=
null
;
try
{
String
content
=
null
;
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
String
time
=
null
;
JSONArray
jsonArray
=
json
.
getJSONArray
(
"data"
);
Date
date
=
null
;
maxBehotTime
=
Long
.
valueOf
(
json
.
getJSONObject
(
"next"
).
getString
(
"max_behot_time"
));
String
readNum
=
null
;
String
title
=
null
;
String
commentNum
=
null
;
String
content
=
null
;
String
playNum
=
null
;
String
time
=
null
;
String
shareNum
=
null
;
Date
date
=
null
;
String
source
=
null
;
String
readNum
=
null
;
String
articleType
=
null
;
String
commentNum
=
null
;
List
<
String
>
labelList
=
null
;
String
playNum
=
null
;
String
likeNum
=
null
;
String
shareNum
=
null
;
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
String
source
=
null
;
try
{
String
articleType
=
null
;
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
List
<
String
>
labelList
=
null
;
String
href
=
"https://www.toutiao.com/"
;
String
likeNum
=
null
;
if
(
data
.
containsKey
(
"group_id"
))
{
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
href
=
href
+
"a"
+
data
.
getLongValue
(
"group_id"
);
try
{
title
=
data
.
getString
(
"title"
);
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
content
=
data
.
getString
(
"abstract"
);
String
href
=
"https://www.toutiao.com/"
;
time
=
data
.
getLongValue
(
"behot_time"
)
*
1000
+
""
;
if
(
data
.
containsKey
(
"group_id"
))
{
date
=
TimeParse
.
stringFormartDate
(
time
);
href
=
href
+
"a"
+
data
.
getLongValue
(
"group_id"
);
readNum
=
data
.
getString
(
"go_detail_count"
);
title
=
data
.
getString
(
"title"
);
commentNum
=
data
.
getString
(
"comments_count"
);
content
=
data
.
getString
(
"abstract"
);
playNum
=
data
.
getString
(
"detail_play_effective_count"
);
time
=
data
.
getLongValue
(
"behot_time"
)
*
1000
+
""
;
shareNum
=
data
.
getString
(
"share_count"
);
date
=
TimeParse
.
stringFormartDate
(
time
);
source
=
data
.
getString
(
"source"
);
readNum
=
data
.
getString
(
"go_detail_count"
);
articleType
=
data
.
getString
(
"chinese_tag"
);
commentNum
=
data
.
getString
(
"comments_count"
);
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
userId
,
source
,
date
,
content
,
commentNum
,
playNum
=
data
.
getString
(
"detail_play_effective_count"
);
playNum
,
readNum
,
shareNum
,
"今日头条"
,
articleType
,
likeNum
);
shareNum
=
data
.
getString
(
"share_count"
);
if
(
data
.
containsKey
(
"label"
))
{
source
=
data
.
getString
(
"source"
);
labelList
=
data
.
getJSONArray
(
"label"
).
toJavaList
(
String
.
class
);
articleType
=
data
.
getString
(
"chinese_tag"
);
tt
.
setLabelList
(
labelList
);
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
userId
,
source
,
date
,
content
,
commentNum
,
}
playNum
,
readNum
,
shareNum
,
"今日头条"
,
articleType
,
likeNum
);
dataList
.
add
(
tt
);
if
(
data
.
containsKey
(
"label"
))
{
}
labelList
=
data
.
getJSONArray
(
"label"
).
toJavaList
(
String
.
class
);
}
catch
(
Exception
e
)
{
tt
.
setLabelList
(
labelList
);
logger
.
error
(
"数据解析出现问题,{}"
,
e
.
getMessage
());
}
continue
;
dataList
.
add
(
tt
);
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"数据解析出现问题,{}"
,
e
.
getMessage
());
logger
.
error
(
"数据解析出现问题,{}"
,
e
.
getMessage
());
continue
;
return
null
;
}
}
}
if
(
endDate
!=
null
)
{
}
catch
(
Exception
e
)
{
if
(
maxBehotTime
!=
null
&&
!
"0"
.
equals
(
maxBehotTime
))
{
logger
.
error
(
"数据解析出现问题,{}"
,
e
.
getMessage
());
Date
nextDate
=
new
Date
(
Long
.
valueOf
(
maxBehotTime
+
"000"
));
return
null
;
if
(
endDate
.
after
(
nextDate
))
{
}
maxBehotTime
=
null
;
if
(
endDate
!=
null
)
{
}
if
(
maxBehotTime
!=
null
&&
!
"0"
.
equals
(
maxBehotTime
))
{
}
Date
nextDate
=
new
Date
(
Long
.
valueOf
(
maxBehotTime
+
"000"
));
}
if
(
endDate
.
after
(
nextDate
))
{
map
.
put
(
"max_behot_time"
,
maxBehotTime
);
maxBehotTime
=
null
;
map
.
put
(
"data"
,
dataList
);
}
return
map
;
}
}
}
map
.
put
(
"max_behot_time"
,
maxBehotTime
);
/**
map
.
put
(
"data"
,
dataList
);
* @Title: getMicroTouTiaoCrawler
return
map
;
* @author hero
}
* @Description: 根据用户user_id查询用户微头条数据
* @param @param
/**
* user_id
* @Title: getMicroTouTiaoCrawler
* @param @param
* @author hero
* endDate
* @Description: 根据用户user_id查询用户微头条数据
* @param @param
* @param @param
* proxy
* user_id
* @param @return
* @param @param
* @param @throws
* endDate
* IOException 设定文件
* @param @param
* @return List<Map<String,Object>> 返回类型
* proxy
*/
* @param @return
public
static
Map
<
String
,
Object
>
getMicroTouTiaoCrawler
(
String
userId
,
Date
endDate
,
Proxy
proxy
,
* @param @throws
String
maxBehotTime
)
throws
IOException
{
* IOException 设定文件
String
url
=
"https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id="
+
userId
;
* @return List<Map<String,Object>> 返回类型
if
(
maxBehotTime
!=
null
)
{
*/
url
=
url
+
"&max_behot_time="
+
maxBehotTime
;
public
static
Map
<
String
,
Object
>
getMicroTouTiaoCrawler
(
String
userId
,
Date
endDate
,
Proxy
proxy
,
}
String
maxBehotTime
)
throws
IOException
{
System
.
out
.
println
(
url
);
String
url
=
"https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id="
+
userId
;
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
if
(
maxBehotTime
!=
null
)
{
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
userId
+
"/"
);
url
=
url
+
"&max_behot_time="
+
maxBehotTime
;
try
{
}
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
System
.
out
.
println
(
url
);
if
(
htmlBody
!=
null
)
{
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
Object
>
dataMap
=
parseHtmlByMicroAccount
(
htmlBody
,
endDate
);
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
userId
+
"/"
);
if
(
dataMap
!=
null
&&
dataMap
.
size
()
>
0
)
{
try
{
return
dataMap
;
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
}
if
(
htmlBody
!=
null
)
{
}
else
{
Map
<
String
,
Object
>
dataMap
=
parseHtmlByMicroAccount
(
htmlBody
,
endDate
);
logger
.
info
(
"数据为null"
);
if
(
dataMap
!=
null
&&
dataMap
.
size
()
>
0
)
{
}
return
dataMap
;
}
catch
(
Exception
e
)
{
}
logger
.
info
(
"获取数据出错::{},数据为null"
,
e
);
}
else
{
return
null
;
logger
.
info
(
"数据为null"
);
}
}
return
null
;
}
catch
(
Exception
e
)
{
}
logger
.
info
(
"获取数据出错::{},数据为null"
,
e
);
return
null
;
public
static
Map
<
String
,
Object
>
getMicroTouTiaoCrawler
(
String
userId
,
Date
endDate
,
ProxyHolder
proxy
,
}
Long
maxBehotTime
)
throws
IOException
{
return
null
;
String
url
=
"https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id="
+
userId
;
}
if
(
maxBehotTime
!=
null
)
{
url
=
url
+
"&max_behot_time="
+
maxBehotTime
;
public
static
Map
<
String
,
Object
>
getMicroTouTiaoCrawler
(
String
userId
,
Date
endDate
,
ProxyHolder
proxy
,
}
Long
maxBehotTime
)
throws
IOException
{
logger
.
info
(
"微头条采集链接:::{}"
,
url
);
String
url
=
"https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id="
+
userId
;
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
if
(
maxBehotTime
!=
null
)
{
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
userId
+
"/"
);
url
=
url
+
"&max_behot_time="
+
maxBehotTime
;
try
{
}
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
logger
.
info
(
"微头条采集链接:::{}"
,
url
);
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"create_time"
))
{
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
Object
>
dataMap
=
parseHtmlByMicroAccount
(
htmlBody
,
endDate
);
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
userId
+
"/"
);
if
(
dataMap
!=
null
&&
dataMap
.
size
()
>
0
)
{
try
{
return
dataMap
;
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"create_time"
))
{
}
else
{
Map
<
String
,
Object
>
dataMap
=
parseHtmlByMicroAccount
(
htmlBody
,
endDate
);
logger
.
info
(
"数据为null"
);
if
(
dataMap
!=
null
&&
dataMap
.
size
()
>
0
)
{
}
return
dataMap
;
}
catch
(
Exception
e
)
{
}
logger
.
info
(
"获取数据出错::{},数据为null"
,
e
);
}
else
{
return
null
;
logger
.
info
(
"数据为null"
);
}
}
return
null
;
}
catch
(
Exception
e
)
{
}
logger
.
info
(
"获取数据出错::{},数据为null"
,
e
);
return
null
;
/**
}
*
return
null
;
* @Description 微头条客户端解析
}
* @param userId
* @param endDate
/**
* @param proxy
*
* @param max_behot_time
* @Description 微头条客户端解析
* @return
* @param userId
*/
* @param endDate
public
static
List
<
Map
<
String
,
Object
>>
getClientMicroToutiaoCrawler
(
String
userId
,
ProxyHolder
proxy
,
* @param proxy
Long
maxBehotTime
)
{
* @param max_behot_time
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
* @return
String
ma
=
""
;
*/
while
(
true
)
{
public
static
List
<
Map
<
String
,
Object
>>
getClientMicroToutiaoCrawler
(
String
userId
,
ProxyHolder
proxy
,
String
url
=
"https://i.snssdk.com/api/feed/profile/v1/?visited_uid="
+
userId
+
"&offset="
+
maxBehotTime
;
Long
maxBehotTime
)
{
ma
=
String
.
valueOf
(
maxBehotTime
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
)){
String
ma
=
""
;
String
result
=
response
.
body
().
string
();
while
(
true
)
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
String
url
=
"https://i.snssdk.com/api/feed/profile/v1/?visited_uid="
+
userId
+
"&offset="
+
maxBehotTime
;
maxBehotTime
=
json
.
getLongValue
(
"offset"
);
ma
=
String
.
valueOf
(
maxBehotTime
);
JSONArray
jsonArray
=
json
.
getJSONArray
(
"data"
);
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
)){
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
String
result
=
response
.
body
().
string
();
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
try
{
maxBehotTime
=
json
.
getLongValue
(
"offset"
);
JSONObject
dataJSON
=
data
.
getJSONObject
(
"content"
).
getJSONObject
(
"raw_data"
);
JSONArray
jsonArray
=
json
.
getJSONArray
(
"data"
);
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
if
(
dataJSON
.
containsKey
(
"comment_base"
)
&&
dataJSON
.
getJSONObject
(
"comment_base"
)!=
null
)
{
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
JSONObject
commentBase
=
dataJSON
.
getJSONObject
(
"comment_base"
);
try
{
Date
date
=
new
Date
(
commentBase
.
getLongValue
(
"create_time"
)
*
1000
);
JSONObject
dataJSON
=
data
.
getJSONObject
(
"content"
).
getJSONObject
(
"raw_data"
);
String
href
=
"http://weitoutiao.zjurl.cn/ugc/share/wap/comment/"
+
dataJSON
.
getLongValue
(
"id"
);
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
String
source
=
commentBase
.
getJSONObject
(
"user"
).
getJSONObject
(
"info"
).
getString
(
"name"
);
if
(
dataJSON
.
containsKey
(
"comment_base"
)
&&
dataJSON
.
getJSONObject
(
"comment_base"
)!=
null
)
{
String
content
=
commentBase
.
getString
(
"content"
);
JSONObject
commentBase
=
dataJSON
.
getJSONObject
(
"comment_base"
);
String
readNum
=
commentBase
.
getJSONObject
(
"action"
).
getInteger
(
"read_count"
)
+
""
;
Date
date
=
new
Date
(
commentBase
.
getLongValue
(
"create_time"
)
*
1000
);
String
commentNum
=
commentBase
.
getJSONObject
(
"action"
).
getInteger
(
"comment_count"
)
+
""
;
String
href
=
"http://weitoutiao.zjurl.cn/ugc/share/wap/comment/"
+
dataJSON
.
getLongValue
(
"id"
);
userId
=
commentBase
.
getJSONObject
(
"user"
).
getJSONObject
(
"info"
).
getString
(
"user_id"
);
String
source
=
commentBase
.
getJSONObject
(
"user"
).
getJSONObject
(
"info"
).
getString
(
"name"
);
if
(
dataJSON
.
containsKey
(
"origin_group"
))
{
String
content
=
commentBase
.
getString
(
"content"
);
String
replayUrl
=
dataJSON
.
getJSONObject
(
"origin_group"
).
getString
(
"article_url"
);
String
readNum
=
commentBase
.
getJSONObject
(
"action"
).
getInteger
(
"read_count"
)
+
""
;
String
title
=
dataJSON
.
getJSONObject
(
"origin_group"
).
getString
(
"title"
);
String
commentNum
=
commentBase
.
getJSONObject
(
"action"
).
getInteger
(
"comment_count"
)
+
""
;
map
.
put
(
"title"
,
title
);
userId
=
commentBase
.
getJSONObject
(
"user"
).
getJSONObject
(
"info"
).
getString
(
"user_id"
);
map
.
put
(
"replayUrl"
,
replayUrl
);
if
(
dataJSON
.
containsKey
(
"origin_group"
))
{
}
String
replayUrl
=
dataJSON
.
getJSONObject
(
"origin_group"
).
getString
(
"article_url"
);
map
.
put
(
"time"
,
date
);
String
title
=
dataJSON
.
getJSONObject
(
"origin_group"
).
getString
(
"title"
);
map
.
put
(
"href"
,
href
);
map
.
put
(
"title"
,
title
);
map
.
put
(
"source"
,
source
);
map
.
put
(
"replayUrl"
,
replayUrl
);
map
.
put
(
"content"
,
content
);
}
map
.
put
(
"readNum"
,
readNum
);
map
.
put
(
"time"
,
date
);
map
.
put
(
"commentNum"
,
commentNum
);
map
.
put
(
"href"
,
href
);
map
.
put
(
"user_id"
,
userId
);
map
.
put
(
"source"
,
source
);
dataList
.
add
(
map
);
map
.
put
(
"content"
,
content
);
}
map
.
put
(
"readNum"
,
readNum
);
}
catch
(
Exception
e
)
{
map
.
put
(
"commentNum"
,
commentNum
);
// System.out.println(data.toString());
map
.
put
(
"user_id"
,
userId
);
e
.
printStackTrace
();
dataList
.
add
(
map
);
}
}
}
}
catch
(
Exception
e
)
{
// System.out.println(data.toString());
System
.
out
.
println
(
" 采集到 条 == "
+
dataList
.
size
()
+
" -- "
+
ma
+
" -- "
+
maxBehotTime
);
e
.
printStackTrace
();
if
(
ma
.
equals
(
String
.
valueOf
(
maxBehotTime
)))
{
}
break
;
}
}
}
catch
(
Exception
e
)
{
System
.
out
.
println
(
" 采集到 条 == "
+
dataList
.
size
()
+
" -- "
+
ma
+
" -- "
+
maxBehotTime
);
logger
.
info
(
"客户端微头条采集错误 {}"
,
e
);
if
(
ma
.
equals
(
String
.
valueOf
(
maxBehotTime
)))
{
}
break
;
}
}
return
dataList
;
}
catch
(
Exception
e
)
{
}
logger
.
info
(
"客户端微头条采集错误 {}"
,
e
);
}
/**
}
* @Title: parseHtmlByMicroAccount
return
dataList
;
* @author hero
}
* @Description: 解析微头条数据
* @param @param
/**
* htmlBody
* @Title: parseHtmlByMicroAccount
* @param @param
* @author hero
* endDate
* @Description: 解析微头条数据
* @param @return
* @param @param
* 设定文件
* htmlBody
* @return Map<String,Object> 返回类型
* @param @param
*/
* endDate
private
static
Map
<
String
,
Object
>
parseHtmlByMicroAccount
(
String
htmlBody
,
Date
endDate
)
{
* @param @return
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
* 设定文件
Long
maxBehotTime
=
null
;
* @return Map<String,Object> 返回类型
List
<
TouTiaoArticle
>
dataList
=
new
ArrayList
<>();
*/
try
{
private
static
Map
<
String
,
Object
>
parseHtmlByMicroAccount
(
String
htmlBody
,
Date
endDate
)
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
boolean
more
=
false
;
Long
maxBehotTime
=
null
;
if
(
json
.
containsKey
(
"has_more"
))
{
List
<
TouTiaoArticle
>
dataList
=
new
ArrayList
<>();
more
=
json
.
getBoolean
(
"has_more"
);
try
{
}
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
if
(
json
.
containsKey
(
"next"
))
{
boolean
more
=
false
;
maxBehotTime
=
json
.
getJSONObject
(
"next"
).
getLongValue
(
"max_behot_time"
);
if
(
json
.
containsKey
(
"has_more"
))
{
}
more
=
json
.
getBoolean
(
"has_more"
);
}
Date
date
=
null
;
if
(
json
.
containsKey
(
"next"
))
{
if
(
json
.
containsKey
(
"data"
))
{
maxBehotTime
=
json
.
getJSONObject
(
"next"
).
getLongValue
(
"max_behot_time"
);
JSONArray
jsonArray
=
json
.
getJSONArray
(
"data"
);
}
String
href
=
null
;
String
source
=
null
;
Date
date
=
null
;
String
title
=
null
;
if
(
json
.
containsKey
(
"data"
))
{
String
content
=
null
;
JSONArray
jsonArray
=
json
.
getJSONArray
(
"data"
);
String
readNum
=
null
;
String
href
=
null
;
String
commentNum
=
null
;
String
source
=
null
;
String
playNum
=
null
;
String
title
=
null
;
String
userId
=
null
;
String
content
=
null
;
String
likeNum
=
null
;
String
readNum
=
null
;
String
articleType
=
null
;
String
commentNum
=
null
;
int
count
=
16
;
String
playNum
=
null
;
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
String
userId
=
null
;
try
{
String
likeNum
=
null
;
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
String
articleType
=
null
;
String
text
=
null
;
int
count
=
16
;
if
(
data
.
containsKey
(
"stream_cell"
)
&&
data
.
getJSONObject
(
"stream_cell"
)!=
null
)
{
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
text
=
data
.
getJSONObject
(
"stream_cell"
).
getString
(
"raw_data"
);
try
{
}
else
if
(
data
.
containsKey
(
"concern_talk_cell"
))
{
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
text
=
data
.
getJSONObject
(
"concern_talk_cell"
).
getString
(
"packed_json_str"
);
String
text
=
null
;
}
if
(
data
.
containsKey
(
"stream_cell"
)
&&
data
.
getJSONObject
(
"stream_cell"
)!=
null
)
{
text
=
data
.
getJSONObject
(
"stream_cell"
).
getString
(
"raw_data"
);
JSONObject
dataJSON
=
JSONObject
.
parseObject
(
text
);
}
else
if
(
data
.
containsKey
(
"concern_talk_cell"
))
{
if
(
dataJSON
.
containsKey
(
"comment_base"
)
&&
dataJSON
.
getJSONObject
(
"comment_base"
)!=
null
)
{
text
=
data
.
getJSONObject
(
"concern_talk_cell"
).
getString
(
"packed_json_str"
);
JSONObject
commentBase
=
dataJSON
.
getJSONObject
(
"comment_base"
);
}
date
=
new
Date
(
commentBase
.
getLongValue
(
"create_time"
)
*
1000
);
href
=
"https://www.toutiao.com/a"
+
dataJSON
.
getLongValue
(
"id"
);
JSONObject
dataJSON
=
JSONObject
.
parseObject
(
text
);
source
=
commentBase
.
getJSONObject
(
"user"
).
getJSONObject
(
"info"
).
getString
(
"name"
);
if
(
dataJSON
.
containsKey
(
"comment_base"
)
&&
dataJSON
.
getJSONObject
(
"comment_base"
)!=
null
)
{
content
=
dataJSON
.
getString
(
"content"
);
JSONObject
commentBase
=
dataJSON
.
getJSONObject
(
"comment_base"
);
readNum
=
dataJSON
.
getJSONObject
(
"action"
).
getInteger
(
"read_count"
)
+
""
;
date
=
new
Date
(
commentBase
.
getLongValue
(
"create_time"
)
*
1000
);
likeNum
=
dataJSON
.
getJSONObject
(
"action"
).
getInteger
(
"digg_count"
)+
""
;
href
=
"https://www.toutiao.com/a"
+
dataJSON
.
getLongValue
(
"id"
);
commentNum
=
dataJSON
.
getJSONObject
(
"action"
).
getInteger
(
"comment_count"
)
+
""
;
source
=
commentBase
.
getJSONObject
(
"user"
).
getJSONObject
(
"info"
).
getString
(
"name"
);
userId
=
commentBase
.
getJSONObject
(
"user"
).
getJSONObject
(
"info"
).
getString
(
"user_id"
);
content
=
dataJSON
.
getString
(
"content"
);
if
(
content
!=
null
&&
!
""
.
equals
(
content
))
{
readNum
=
dataJSON
.
getJSONObject
(
"action"
).
getInteger
(
"read_count"
)
+
""
;
if
(
content
.
length
()
<
16
)
{
likeNum
=
dataJSON
.
getJSONObject
(
"action"
).
getInteger
(
"digg_count"
)+
""
;
count
=
content
.
length
();
commentNum
=
dataJSON
.
getJSONObject
(
"action"
).
getInteger
(
"comment_count"
)
+
""
;
}
userId
=
commentBase
.
getJSONObject
(
"user"
).
getJSONObject
(
"info"
).
getString
(
"user_id"
);
title
=
content
.
substring
(
0
,
count
);
if
(
content
!=
null
&&
!
""
.
equals
(
content
))
{
}
if
(
content
.
length
()
<
16
)
{
count
=
content
.
length
();
}
else
{
}
date
=
new
Date
(
dataJSON
.
getLongValue
(
"create_time"
)
*
1000
);
title
=
content
.
substring
(
0
,
count
);
href
=
"https://www.toutiao.com/a"
+
dataJSON
.
getString
(
"thread_id"
);
}
source
=
dataJSON
.
getJSONObject
(
"user"
).
getString
(
"name"
);
content
=
dataJSON
.
getString
(
"content"
);
}
else
{
readNum
=
dataJSON
.
getInteger
(
"read_count"
)
+
""
;
date
=
new
Date
(
dataJSON
.
getLongValue
(
"create_time"
)
*
1000
);
commentNum
=
dataJSON
.
getInteger
(
"comment_count"
)
+
""
;
href
=
"https://www.toutiao.com/a"
+
dataJSON
.
getString
(
"thread_id"
);
likeNum
=
dataJSON
.
getInteger
(
"digg_count"
)+
""
;
source
=
dataJSON
.
getJSONObject
(
"user"
).
getString
(
"name"
);
userId
=
dataJSON
.
getJSONObject
(
"user"
).
getString
(
"user_id"
);
content
=
dataJSON
.
getString
(
"content"
);
if
(
content
!=
null
&&
!
""
.
equals
(
content
))
{
readNum
=
dataJSON
.
getInteger
(
"read_count"
)
+
""
;
if
(
content
.
length
()
<
16
)
{
commentNum
=
dataJSON
.
getInteger
(
"comment_count"
)
+
""
;
count
=
content
.
length
();
likeNum
=
dataJSON
.
getInteger
(
"digg_count"
)+
""
;
}
userId
=
dataJSON
.
getJSONObject
(
"user"
).
getString
(
"user_id"
);
title
=
content
.
substring
(
0
,
count
);
if
(
content
!=
null
&&
!
""
.
equals
(
content
))
{
}
if
(
content
.
length
()
<
16
)
{
}
count
=
content
.
length
();
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
userId
,
source
,
date
,
content
,
commentNum
,
}
playNum
,
readNum
,
"0"
,
"微头条"
,
articleType
,
likeNum
);
title
=
content
.
substring
(
0
,
count
);
dataList
.
add
(
tt
);
}
}
catch
(
Exception
e
)
{
}
continue
;
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
userId
,
source
,
date
,
content
,
commentNum
,
}
playNum
,
readNum
,
"0"
,
"微头条"
,
articleType
,
likeNum
);
}
dataList
.
add
(
tt
);
}
else
{
}
catch
(
Exception
e
)
{
System
.
out
.
println
(
json
);
continue
;
}
}
}
}
else
{
/** 验证是否有下一页数据 **/
System
.
out
.
println
(
json
);
if
(
more
)
{
}
if
(
maxBehotTime
!=
null
&&
maxBehotTime
!=
0
)
{
if
(
endDate
.
after
(
date
))
{
maxBehotTime
=
null
;
/** 验证是否有下一页数据 **/
}
if
(
more
)
{
}
if
(
maxBehotTime
!=
null
&&
maxBehotTime
!=
0
)
{
}
else
{
if
(
endDate
.
after
(
date
))
{
maxBehotTime
=
null
;
maxBehotTime
=
null
;
}
}
}
catch
(
Exception
e
)
{
}
e
.
printStackTrace
();
}
else
{
}
maxBehotTime
=
null
;
}
map
.
put
(
"max_behot_time"
,
maxBehotTime
);
}
catch
(
Exception
e
)
{
map
.
put
(
"data"
,
dataList
);
e
.
printStackTrace
();
}
return
map
;
}
map
.
put
(
"max_behot_time"
,
maxBehotTime
);
map
.
put
(
"data"
,
dataList
);
/**
* 根据链接获取全文
return
map
;
* @param url
}
* @param proxy
* @return
/**
*/
* 根据链接获取全文
public
static
String
getContent
(
String
url
,
Proxy
proxy
)
{
* @param url
try
{
* @param proxy
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
null
);
* @return
String
regex
=
"<script>var BASE_DATA[\\s\\S]+?</script>"
;
*/
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"articleInfo"
))
{
public
static
String
getContent
(
String
url
,
Proxy
proxy
)
{
//通过正则截取需要的js代码
try
{
Matcher
matcher
=
Pattern
.
compile
(
regex
).
matcher
(
htmlBody
);
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
null
);
if
(
matcher
.
find
())
{
String
regex
=
"<script>var BASE_DATA[\\s\\S]+?</script>"
;
String
content
=
matcher
.
group
().
replaceAll
(
"<script>var BASE_DATA = |;</script>"
,
""
);
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"articleInfo"
))
{
//通过js引擎执行js代码
//通过正则截取需要的js代码
String
jsContent
=
"eval(("
+
content
+
")).articleInfo.content.toString();"
;
Matcher
matcher
=
Pattern
.
compile
(
regex
).
matcher
(
htmlBody
);
String
contentHtml
=
scriptEngine
.
eval
(
jsContent
).
toString
();
if
(
matcher
.
find
())
{
//解析最后的数据
String
content
=
matcher
.
group
().
replaceAll
(
"<script>var BASE_DATA = |;</script>"
,
""
);
return
Jsoup
.
parse
(
contentHtml
).
text
();
//通过js引擎执行js代码
}
String
jsContent
=
"eval(("
+
content
+
")).articleInfo.content.toString();"
;
}
String
contentHtml
=
scriptEngine
.
eval
(
jsContent
).
toString
();
return
null
;
//解析最后的数据
}
catch
(
Exception
e
)
{
return
Jsoup
.
parse
(
contentHtml
).
text
();
logger
.
error
(
"跟据链接采集全文出现错误"
,
e
);
}
return
null
;
}
}
return
null
;
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"跟据链接采集全文出现错误"
,
e
);
return
null
;
}
/**
* 根据文章url获取itemId
}
* @param url
* @param proxy
/**
* @return
* 下载数据
* @throws Exception
* @param url
*/
* @param proxy
private
static
String
getItemIdByUrl
(
String
url
,
Proxy
proxy
)
throws
Exception
* @param headMap
{
* @return
String
itemId
=
null
;
*/
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
private
static
String
downloadHtml
(
String
url
,
Proxy
proxy
,
Map
<
String
,
String
>
headMap
)
{
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
if
(
htmlBody
!=
null
)
try
{
{
Response
response
=
null
;
if
(
htmlBody
.
contains
(
"itemId"
))
if
(
proxy
!=
null
)
{
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headMap
),
proxy
);
itemId
=
htmlBody
.
split
(
"itemId: '"
)[
1
]
}
else
{
.
split
(
"',"
)[
0
].
trim
();
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
);
}
}
}
else
return
response
.
body
().
string
();
{
}
catch
(
Exception
e
)
{
logger
.
info
(
"获取itemId失败,链接地址为:{}"
,
url
);
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
}
if
(
i
==
3
){
return
itemId
;
break
;
}
}
else
{
continue
;
/**
}
* 根据文章url获取文章信息
}
* @param url
}
* @param proxy
return
null
;
* @return
}
* @throws Exception
}
*/
/**
 * Fetches the metadata of a single Toutiao article through the mobile "info" endpoint.
 *
 * <p>The item id is resolved from {@code url} via {@code getItemIdByUrl}; the endpoint
 * {@code https://m.toutiao.com/i{itemId}/info/} is then requested up to three times until a
 * response containing a {@code data} object is parsed successfully.
 *
 * @param url   the article URL; it is also stored unchanged as the URL of the returned article
 * @param proxy the proxy passed through to the download helper (may be {@code null})
 * @return the populated article, or {@code null} when the item id cannot be resolved or all
 *         three attempts fail
 * @throws Exception if resolving the item id fails
 */
public static TouTiaoArticle getToutiaoArticleInfoByUrl(String url, Proxy proxy) throws Exception {
    String itemId = getItemIdByUrl(url, proxy);
    if (Objects.isNull(itemId)) {
        return null;
    }

    String urlNew = "https://m.toutiao.com/i" + itemId + "/info/?_signature=&i=" + itemId;
    for (int i = 0; i < 3; i++) {
        try {
            // Header setup: the endpoint is requested as a mobile browser coming from the article page.
            Map<String, String> headerMap = Tools.getTouTiaoHeader();
            headerMap.put("Referer", "https://m.toutiao.com/i" + itemId + "/");
            headerMap.put("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Mobile Safari/537.36");

            String htmlBody = downloadHtml(urlNew, proxy, headerMap);
            if (htmlBody == null) {
                continue;
            }

            JSONObject root = JSONObject.parseObject(htmlBody);
            JSONObject data = root == null ? null : root.getJSONObject("data");
            if (data == null) {
                // No payload at all: treat as a failed attempt and retry.
                logger.info("获取文章信息失败,链接地址为:{}", url);
                continue;
            }

            // Individual fields are read null-safely so that one absent field does not discard
            // the whole article. NOTE(review): presumably e.g. "video_play_count" is absent for
            // non-video articles - confirm against real responses.
            String commentNum = Objects.toString(data.getInteger("comment_count"), null);
            String readNum = Objects.toString(data.getInteger("impression_count"), null);
            String playCount = Objects.toString(data.getInteger("video_play_count"), null);

            JSONObject mediaUser = data.getJSONObject("media_user");
            String userId = mediaUser == null ? null : Objects.toString(mediaUser.getLong("id"), null);

            String source = data.getString("source");
            String title = data.getString("title");

            // The body arrives as HTML; keep only its text.
            String content = data.getString("content");
            if (StringUtils.isNotBlank(content)) {
                content = Jsoup.parse(content).text();
            }

            // "publish_time" is multiplied by 1000 to obtain the millisecond value Date expects.
            Long publishTime = data.getLong("publish_time");
            Date time = publishTime == null ? null : new Date(publishTime * 1000);

            TouTiaoArticle touTiaoArticle = new TouTiaoArticle();
            touTiaoArticle.setUrl(url);
            touTiaoArticle.setTitle(title);
            touTiaoArticle.setUser_id(userId);
            touTiaoArticle.setSource(source);
            touTiaoArticle.setTime(time);
            touTiaoArticle.setContent(content);
            touTiaoArticle.setCommentCount(commentNum);
            touTiaoArticle.setReadNum(readNum);
            touTiaoArticle.setPlayCount(playCount);
            return touTiaoArticle;
        } catch (Exception e) {
            // Log with the exception as the trailing argument so the stack trace is kept,
            // then fall through to the next attempt.
            logger.error("解析文章信息时出现问题,链接地址为:{}", url, e);
        }
    }
    return null;
}
/**
 * Downloads the body of {@code url} as a string, making up to three attempts.
 *
 * @param url     the address to request
 * @param proxy   the proxy to route the request through; when {@code null},
 *                {@code ProxyHolder.NAT_HEAVY_PROXY} is used instead
 * @param headMap the request headers handed to {@code RequestUtils.wrapGet}
 * @return the response body, or {@code null} when all three attempts fail
 */
private static String downloadHtml(String url, Proxy proxy, Map<String, String> headMap) {
    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            Response response;
            if (proxy != null) {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), proxy);
            } else {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_HEAVY_PROXY);
            }
            return response.body().string();
        } catch (Exception e) {
            // Do not call fillInStackTrace() here: it would replace the exception's original
            // stack trace with the location of this catch block. The exception is passed as
            // the trailing argument so the logger can print the real trace.
            logger.error("获取数据时出现问题,问题为:{}", e.toString(), e);
        }
    }
    return null;
}
}
src/main/java/com/zhiwei/toutiao/parse/TouTiaoCommentParse.java
View file @
9d384b56
package
com
.
zhiwei
.
toutiao
.
parse
;
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.Objects
;
import
org.apache.logging.log4j.LogManager
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
com.alibaba.fastjson.JSONArray
;
import
org.apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.alibaba.fastjson.JSONArray
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.toutiao.bean.TouTiaoComment
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.toutiao.util.Tools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.toutiao.bean.TouTiaoComment
;
import
okhttp3.Response
;
import
com.zhiwei.toutiao.util.Tools
;
/**
import
okhttp3.Response
;
* @ClassName: TouTiaoComment
import
org.jsoup.Jsoup
;
* @Description: 今日头条评论数据
* @author hero
/**
* @date 2016年12月9日 下午7:50:28
* @ClassName: TouTiaoComment
*/
* @Description: 今日头条评论数据
public
class
TouTiaoCommentParse
{
* @author hero
* @date 2016年12月9日 下午7:50:28
private
static
Logger
logger
=
LogManager
.
getLogger
(
TouTiaoCommentParse
.
class
);
*/
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
public
class
TouTiaoCommentParse
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
TouTiaoCommentParse
.
class
);
/**
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
*
* @Title: getTouTiaoComment
* @author hero
/**
* @Description: 获取评论列表,可指定限制返回条数
*
* @param @param url
* @Title: getTouTiaoComment
* @param @param count
* @author hero
* @param @param proxy
* @Description: 获取评论列表,可指定限制返回条数
* @param @return
* @param @param url
* @param @throws Exception 设定文件
* @param @param count
* @return List<TouTiaoComment> 返回类型
* @param @param proxy
*/
* @param @return
public
static
List
<
TouTiaoComment
>
getTouTiaoComment
(
String
url
,
int
returnCount
,
Proxy
proxy
)
throws
Exception
* @param @throws Exception 设定文件
{
* @return List<TouTiaoComment> 返回类型
List
<
TouTiaoComment
>
ttList
=
new
ArrayList
<
TouTiaoComment
>();
*/
String
group_id
=
getGroupId
(
url
,
proxy
);
public
static
List
<
TouTiaoComment
>
getTouTiaoComment
(
String
url
,
int
returnCount
,
Proxy
proxy
)
throws
Exception
//查询评论总页数
{
if
(
group_id
!=
null
){
List
<
TouTiaoComment
>
ttList
=
new
ArrayList
<
TouTiaoComment
>();
int
page
=
getPage
(
group_id
,
proxy
);
String
group_id
=
getGroupId
(
url
,
proxy
);
if
(
returnCount
>
0
){
//查询评论总页数
int
pageMax
=
(
int
)
Math
.
ceil
((
double
)
returnCount
/
20.0
);
if
(
group_id
!=
null
){
if
(
page
>=
pageMax
){
int
page
=
getPage
(
group_id
,
proxy
);
page
=
pageMax
;
if
(
returnCount
>
0
){
}
int
pageMax
=
(
int
)
Math
.
ceil
((
double
)
returnCount
/
20.0
);
}
if
(
page
>=
pageMax
){
for
(
int
i
=
0
;
i
<
page
;
i
++)
page
=
pageMax
;
{
}
String
urlNew
=
"http://is.snssdk.com/article/v2/tab_comments/?app_name=news_article&offset="
}
+
i
*
20
+
"&group_id="
+
group_id
+
"&aggr_type=1&count=20&fold=1&item_id="
+
group_id
+
"&ts="
+
System
.
currentTimeMillis
();
for
(
int
i
=
0
;
i
<
page
;
i
++)
//设置头信息
{
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
String
urlNew
=
"http://is.snssdk.com/article/v2/tab_comments/?app_name=news_article&offset="
headerMap
.
put
(
"User-Agent"
,
"News 6.6.5 rv:6.6.5.03 (iPhone; iOS 11.3; zh_CN) Cronet"
);
+
i
*
20
+
"&group_id="
+
group_id
+
"&aggr_type=1&count=20&fold=1&item_id="
+
group_id
+
"&ts="
+
System
.
currentTimeMillis
();
headerMap
.
put
(
"Host"
,
"is.snssdk.com"
);
//设置头信息
for
(
int
j
=
1
;
j
<=
3
;
j
++){
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
try
{
headerMap
.
put
(
"User-Agent"
,
"News 6.6.5 rv:6.6.5.03 (iPhone; iOS 11.3; zh_CN) Cronet"
);
String
htmlBody
=
downloadHtml
(
urlNew
,
proxy
,
headerMap
);
headerMap
.
put
(
"Host"
,
"is.snssdk.com"
);
if
(
htmlBody
!=
null
)
for
(
int
j
=
1
;
j
<=
3
;
j
++){
{
try
{
List
<
TouTiaoComment
>
commentes
=
analySisComment
(
htmlBody
,
url
);
String
htmlBody
=
downloadHtml
(
urlNew
,
proxy
,
headerMap
);
ttList
.
addAll
(
commentes
);
if
(
htmlBody
!=
null
)
logger
.
info
(
" url {} 采集到第 {} 页 采集到 {} 条数据 "
,
url
,
page
,
ttList
.
size
());
{
}
else
List
<
TouTiaoComment
>
commentes
=
analySisComment
(
htmlBody
,
url
);
{
ttList
.
addAll
(
commentes
);
logger
.
info
(
"采集出现问题,地址为:{}"
,
url
);
logger
.
info
(
" url {} 采集到第 {} 页 采集到 {} 条数据 "
,
url
,
page
,
ttList
.
size
());
}
}
else
if
(
Objects
.
nonNull
(
proxy
))
{
{
ZhiWeiTools
.
sleep
(
100
);
logger
.
info
(
"采集出现问题,地址为:{}"
,
url
);
}
else
{
}
ZhiWeiTools
.
sleep
(
4000
);
if
(
Objects
.
nonNull
(
proxy
))
{
}
ZhiWeiTools
.
sleep
(
100
);
break
;
}
else
{
}
catch
(
Exception
e
)
{
ZhiWeiTools
.
sleep
(
4000
);
continue
;
}
}
break
;
}
}
catch
(
Exception
e
)
{
}
continue
;
}
}
return
ttList
;
}
}
}
}
return
ttList
;
/**
}
* @Title: analySisComment
* @Description: TODO(解析评论列表)
* @param @param htmlBody
/**
* @param @return 设定文件
* @Title: analySisComment
* @return List<DBObject> 返回类型
* @Description: TODO(解析评论列表)
*/
* @param @param htmlBody
private
static
List
<
TouTiaoComment
>
analySisComment
(
String
htmlBody
,
String
url
)
* @param @return 设定文件
{
* @return List<DBObject> 返回类型
List
<
TouTiaoComment
>
list
=
new
ArrayList
<>();
*/
try
{
private
static
List
<
TouTiaoComment
>
analySisComment
(
String
htmlBody
,
String
url
)
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
{
JSONArray
commentes
=
json
.
getJSONArray
(
"data"
);
List
<
TouTiaoComment
>
list
=
new
ArrayList
<>();
for
(
int
a
=
0
;
a
<
commentes
.
size
();
a
++)
try
{
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
JSONObject
comment
=
commentes
.
getJSONObject
(
a
).
getJSONObject
(
"comment"
);
JSONArray
commentes
=
json
.
getJSONArray
(
"data"
);
String
id
=
comment
.
getString
(
"id"
);
for
(
int
a
=
0
;
a
<
commentes
.
size
();
a
++)
String
text
=
comment
.
getString
(
"text"
);
{
String
name
=
comment
.
getString
(
"user_name"
);
JSONObject
comment
=
commentes
.
getJSONObject
(
a
).
getJSONObject
(
"comment"
);
int
reply_count
=
comment
.
getIntValue
(
"reply_count"
);
String
id
=
comment
.
getString
(
"id"
);
int
digg_count
=
comment
.
getIntValue
(
"digg_count"
);
String
text
=
comment
.
getString
(
"text"
);
long
timeLong
=
comment
.
getLongValue
(
"create_time"
)*
1000
;
String
name
=
comment
.
getString
(
"user_name"
);
Date
date
=
new
Date
(
timeLong
);
int
reply_count
=
comment
.
getIntValue
(
"reply_count"
);
int
digg_count
=
comment
.
getIntValue
(
"digg_count"
);
TouTiaoComment
ttComment
=
new
TouTiaoComment
(
id
,
long
timeLong
=
comment
.
getLongValue
(
"create_time"
)*
1000
;
text
,
name
,
reply_count
,
digg_count
,
Date
date
=
new
Date
(
timeLong
);
date
,
url
);
list
.
add
(
ttComment
);
TouTiaoComment
ttComment
=
new
TouTiaoComment
(
id
,
}
text
,
name
,
reply_count
,
digg_count
,
}
catch
(
Exception
e
)
{
date
,
url
);
logger
.
debug
(
"解析今日头条评论列表出现为题,{}"
,
e
);
list
.
add
(
ttComment
);
}
}
return
list
;
}
catch
(
Exception
e
)
{
}
logger
.
debug
(
"解析今日头条评论列表出现为题,{}"
,
e
);
}
/**
return
list
;
* @Title: getPage
}
* @Description: TODO(获取总页数)
* @param @param url
/**
* @param @return 设定文件
* @Title: getPage
* @return int 返回类型
* @Description: TODO(获取总页数)
* @throws Exception
* @param @param url
*/
* @param @return 设定文件
private
static
int
getPage
(
String
groupId
,
Proxy
proxy
)
throws
Exception
* @return int 返回类型
{
* @throws Exception
String
urlNew
=
"http://www.toutiao.com/api/comment/list/?group_id="
+
groupId
+
"&item_id=0&count=20&offset=0"
;
*/
//设置头信息
private
static
int
getPage
(
String
groupId
,
Proxy
proxy
)
throws
Exception
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
{
String
htmlBody
=
downloadHtml
(
urlNew
,
proxy
,
headerMap
);
String
urlNew
=
"http://www.toutiao.com/api/comment/list/?group_id="
+
groupId
+
"&item_id=0&count=20&offset=0"
;
if
(
htmlBody
!=
null
)
//设置头信息
{
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
try
{
String
htmlBody
=
downloadHtml
(
urlNew
,
proxy
,
headerMap
);
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
if
(
htmlBody
!=
null
)
JSONObject
data
=
json
.
getJSONObject
(
"data"
);
{
int
count
=
data
.
getIntValue
(
"total"
);
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
return
(
int
)
Math
.
ceil
((
double
)
count
/
20.0
);
JSONObject
data
=
json
.
getJSONObject
(
"data"
);
}
catch
(
Exception
e
)
{
int
count
=
data
.
getIntValue
(
"total"
);
logger
.
info
(
"获取评论总页数时出现问题:{}"
,
e
);
}
return
(
int
)
Math
.
ceil
((
double
)
count
/
20.0
);
}
}
catch
(
Exception
e
)
{
return
-
1
;
logger
.
info
(
"获取评论总页数时出现问题:{}"
,
e
);
}
}
}
return
-
1
;
}
/**
* @Title: findCommentCount
* @author hero
* @Description: 根据id获取头条评论数
/**
* @param @param url
* @Title: findCommentCount
* @param @param proxy
* @author hero
* @param @return 设定文件
* @Description: 根据id获取头条评论数
* @return int 返回类型
* @param @param url
*/
* @param @param proxy
public
static
int
findCommentCount
(
String
url
,
Proxy
proxy
)
* @param @return 设定文件
{
* @return int 返回类型
for
(
int
i
=
0
;
i
<
3
;
i
++){
*/
try
{
public
static
int
findCommentCount
(
String
url
,
Proxy
proxy
)
//设置头信息
{
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
for
(
int
i
=
0
;
i
<
3
;
i
++){
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
try
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"commentInfo"
))
//设置头信息
{
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
try
{
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
return
Integer
.
valueOf
(
htmlBody
.
split
(
"comments_count: "
)[
1
].
split
(
","
)[
0
]);
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"commentInfo"
))
}
catch
(
Exception
e
)
{
{
logger
.
error
(
"解析头条评论数错误:::{}"
,
e
.
fillInStackTrace
());
try
{
return
0
;
return
Integer
.
valueOf
(
htmlBody
.
split
(
"comments_count: "
)[
1
].
split
(
","
)[
0
]);
}
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"解析头条评论数错误:::{}"
,
e
.
fillInStackTrace
());
}
catch
(
Exception
e
)
{
return
0
;
continue
;
}
}
}
}
}
catch
(
Exception
e
)
{
return
0
;
continue
;
}
}
}
/**
return
0
;
* @Title: findCommentCount
}
* @author hero
* @Description: 根据id获取头条评论数
/**
* @param @param url
* @Title: findCommentCount
* @param @param proxy
* @author hero
* @param @return 设定文件
* @Description: 根据id获取头条评论数
* @return int 返回类型
* @param @param url
*/
* @param @param proxy
public
static
int
findNewCommentCountByProxy
(
String
url
,
Proxy
proxy
)
* @param @return 设定文件
{
* @return int 返回类型
try
{
*/
//设置头信息
public
static
int
findNewCommentCountByProxy
(
String
url
,
Proxy
proxy
)
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
{
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
try
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"commentInfo"
))
//设置头信息
{
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
try
{
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
return
Integer
.
valueOf
(
htmlBody
.
split
(
"comments_count: "
)[
1
].
split
(
","
)[
0
]);
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"commentInfo"
))
}
catch
(
Exception
e
)
{
{
logger
.
error
(
"解析头条评论数错误:::{}"
,
e
.
fillInStackTrace
());
try
{
return
-
1
;
return
Integer
.
valueOf
(
htmlBody
.
split
(
"comments_count: "
)[
1
].
split
(
","
)[
0
]);
}
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"解析头条评论数错误:::{}"
,
e
.
fillInStackTrace
());
}
catch
(
Exception
e
)
{
return
-
1
;
logger
.
error
(
"解析头条评论数错误:::{}"
,
e
.
fillInStackTrace
());
}
}
}
return
-
1
;
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"解析头条评论数错误:::{}"
,
e
.
fillInStackTrace
());
}
/**
return
-
1
;
* @Title: getCommentCount
}
* @Description: TODO(根据id查看评论数)
* @param @param url
/**
* @param @return 设定文件
* @Title: getCommentCount
* @return int 返回类型
* @Description: TODO(根据id查看评论数)
* @throws IOException
* @param @param url
*/
* @param @return 设定文件
public
static
int
getCommentCount
(
String
url
,
Proxy
proxy
)
* @return int 返回类型
{
* @throws IOException
String
group_id
=
getGroupId
(
url
,
proxy
);
*/
for
(
int
i
=
0
;
i
<
3
;
i
++){
public
static
int
getCommentCount
(
String
url
,
Proxy
proxy
)
throws
Exception
try
{
{
String
urlNew
=
"http://www.toutiao.com/api/comment/list/?group_id="
+
group_id
+
"&item_id=0&count=20&offset=0"
;
String
group_id
=
getGroupId
(
url
,
proxy
);
//设置头信息
for
(
int
i
=
0
;
i
<
3
;
i
++){
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
try
{
String
htmlBody
=
downloadHtml
(
urlNew
,
proxy
,
headerMap
);
String
urlNew
=
"http://www.toutiao.com/api/comment/list/?group_id="
+
group_id
+
"&item_id=0&count=20&offset=0"
;
if
(
htmlBody
!=
null
)
//设置头信息
{
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
try
{
String
htmlBody
=
downloadHtml
(
urlNew
,
proxy
,
headerMap
);
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
if
(
htmlBody
!=
null
)
JSONObject
data
=
json
.
getJSONObject
(
"data"
);
{
return
data
.
getIntValue
(
"total"
);
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
}
catch
(
Exception
e
)
{
JSONObject
data
=
json
.
getJSONObject
(
"data"
);
logger
.
info
(
"获取评论总页数时出现问题:{}"
,
e
);
return
data
.
getIntValue
(
"total"
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
info
(
"获取评论总页数时出现问题:{}"
,
e
);
continue
;
}
}
}
}
}
catch
(
Exception
e
)
{
return
-
1
;
continue
;
}
}
}
return
-
1
;
}
/**
* @Title: getGroupId
* @Description: TODO(获取groupId用于更新评论列表)
/**
* @param @param url
* @Title: getGroupId
* @param @return 设定文件
* @Description: TODO(获取groupId用于更新评论列表)
* @return String 返回类型
* @param @param url
*/
* @param @return 设定文件
private
static
String
getGroupId
(
String
url
,
Proxy
proxy
)
* @return String 返回类型
{
*/
String
groupId
=
null
;
private
static
String
getGroupId
(
String
url
,
Proxy
proxy
)
throws
Exception
if
(
url
.
contains
(
"/a"
)||
url
.
contains
(
"/group/"
))
{
{
String
groupId
=
null
;
if
(
url
.
contains
(
"/a"
))
if
(
url
.
contains
(
"/a"
)||
url
.
contains
(
"/group/"
))
{
{
groupId
=
url
.
split
(
"/a"
)[
1
].
replace
(
"/"
,
""
);
if
(
url
.
contains
(
"/a"
))
}
else
{
{
groupId
=
url
.
split
(
"/a"
)[
1
].
replace
(
"/"
,
""
);
groupId
=
url
.
split
(
"/group/"
)[
1
].
replace
(
"/"
,
""
);
}
else
}
{
}
else
if
(
url
.
contains
(
"/i"
)||
url
.
contains
(
"/item/"
))
groupId
=
url
.
split
(
"/group/"
)[
1
].
replace
(
"/"
,
""
);
{
}
groupId
=
gettGroupIdByUrl
(
url
,
proxy
);
}
else
if
(
url
.
contains
(
"/i"
)||
url
.
contains
(
"/item/"
))
}
{
return
groupId
;
groupId
=
getGroupIdByUrl
(
url
,
proxy
);
}
}
return
groupId
;
/**
}
* @Title: gettGroupIdByUrl
* @Description: TODO(解析并获取groupId)
/**
* @param @param url
* @Title: gettGroupIdByUrl
* @param @return 设定文件
* @Description: TODO(解析并获取groupId)
* @return String 返回类型
* @param @param url
*/
* @param @return 设定文件
private
static
String
gettGroupIdByUrl
(
String
url
,
Proxy
proxy
)
* @return String 返回类型
{
*/
String
groupId
=
null
;
private
static
String
getGroupIdByUrl
(
String
url
,
Proxy
proxy
)
throws
Exception
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
{
try
{
String
groupId
=
null
;
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
if
(
htmlBody
!=
null
)
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
{
if
(
htmlBody
!=
null
)
if
(
htmlBody
.
contains
(
"groupId"
))
{
{
if
(
htmlBody
.
contains
(
"groupId"
))
groupId
=
htmlBody
.
split
(
"groupId: '"
)[
1
]
{
.
split
(
"',"
)[
0
].
trim
();
groupId
=
htmlBody
.
split
(
"groupId: '"
)[
1
]
}
.
split
(
"',"
)[
0
].
trim
();
}
else
}
{
}
else
logger
.
info
(
"获取groupId失败,链接地址为:{}"
,
url
);
{
}
logger
.
info
(
"获取groupId失败,链接地址为:{}"
,
url
);
}
catch
(
Exception
e
)
{
}
e
.
printStackTrace
();
return
groupId
;
logger
.
error
(
"获取groupId失败,链接地址为:{}"
,
url
,
e
);
}
}
return
groupId
;
/**
}
* 下载数据
* @param url
private
static
String
downloadHtml
(
String
url
,
Proxy
proxy
,
Map
<
String
,
String
>
headerMap
)
{
* @param proxy
// 下载数据页面
* @param headerMap
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
* @return
try
{
*/
Response
response
=
null
;
private
static
String
downloadHtml
(
String
url
,
Proxy
proxy
,
Map
<
String
,
String
>
headerMap
)
{
if
(
proxy
!=
null
)
{
// 下载数据页面
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
);
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
}
else
{
try
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
);
Response
response
=
null
;
}
if
(
proxy
!=
null
)
{
return
response
.
body
().
string
();
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
);
}
catch
(
Exception
e
)
{
}
else
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
);
if
(
i
==
3
){
}
break
;
return
response
.
body
().
string
();
}
else
{
}
catch
(
Exception
e
)
{
continue
;
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
}
if
(
i
==
3
){
}
break
;
}
}
else
{
return
null
;
continue
;
}
}
}
}
}
return
null
;
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment