Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
toutiao
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
toutiao
Commits
ae21017e
Commit
ae21017e
authored
Jul 24, 2019
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加解析今日头条单篇文章内容方法及代码规范部分修改
parent
a5f5a270
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
96 additions
and
91 deletions
+96
-91
pom.xml
+1
-1
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
+95
-90
No files found.
pom.xml
View file @
ae21017e
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
toutiao
</artifactId>
<version>
0.3.
7
-SNAPSHOT
</version>
<version>
0.3.
8
-SNAPSHOT
</version>
<dependencies>
<dependency>
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
View file @
ae21017e
...
...
@@ -19,10 +19,16 @@ import java.util.Date;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
javax.script.ScriptEngine
;
import
javax.script.ScriptEngineManager
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
...
...
@@ -30,7 +36,6 @@ import com.zhiwei.crawler.core.HttpBoot;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.toutiao.bean.Signature
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.util.Tools
;
...
...
@@ -43,9 +48,8 @@ import okhttp3.Response;
* @date 2016年9月2日 上午11:17:44
*/
public
class
TouTiaoArticleParse
{
private
TouTiaoArticleParse
()
{
}
private
static
ScriptEngine
scriptEngine
=
new
ScriptEngineManager
().
getEngineByName
(
"javascript"
);
private
static
Logger
logger
=
LogManager
.
getLogger
(
TouTiaoArticleParse
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
...
...
@@ -60,13 +64,13 @@ public class TouTiaoArticleParse {
* @return List<TouTiao> 返回类型
* @throws Exception
*/
public
static
Map
<
String
,
Object
>
getTouTiaoList
(
String
media
_id
,
String
max_behot_t
ime
,
Date
endData
,
Proxy
proxy
)
public
static
Map
<
String
,
Object
>
getTouTiaoList
(
String
media
Id
,
String
maxBehotT
ime
,
Date
endData
,
Proxy
proxy
)
throws
Exception
{
Signature
signature
=
new
Signature
();
String
url
=
"https://www.toutiao.com/pgc/ma/?page_type=1&media_id="
+
media
_i
d
+
"&count=20&as="
String
url
=
"https://www.toutiao.com/pgc/ma/?page_type=1&media_id="
+
media
I
d
+
"&count=20&as="
+
signature
.
getAs
()
+
"&cp="
+
signature
.
getCp
();
if
(
max
_behot_t
ime
!=
null
)
{
url
=
url
+
"&max_behot_time="
+
max
_behot_t
ime
;
if
(
max
BehotT
ime
!=
null
)
{
url
=
url
+
"&max_behot_time="
+
max
BehotT
ime
;
}
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
url
);
...
...
@@ -88,13 +92,13 @@ public class TouTiaoArticleParse {
}
public
static
Map
<
String
,
Object
>
getTouTiaoList
(
String
media
_id
,
String
max_behot_t
ime
,
Date
endData
,
ProxyHolder
proxy
)
public
static
Map
<
String
,
Object
>
getTouTiaoList
(
String
media
Id
,
String
maxBehotT
ime
,
Date
endData
,
ProxyHolder
proxy
)
throws
Exception
{
Signature
signature
=
new
Signature
();
String
url
=
"https://www.toutiao.com/pgc/ma/?page_type=1&media_id="
+
media
_i
d
+
"&count=20&as="
String
url
=
"https://www.toutiao.com/pgc/ma/?page_type=1&media_id="
+
media
I
d
+
"&count=20&as="
+
signature
.
getAs
()
+
"&cp="
+
signature
.
getCp
();
if
(
max
_behot_t
ime
!=
null
)
{
url
=
url
+
"&max_behot_time="
+
max
_behot_t
ime
;
if
(
max
BehotT
ime
!=
null
)
{
url
=
url
+
"&max_behot_time="
+
max
BehotT
ime
;
}
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
url
);
...
...
@@ -125,23 +129,23 @@ public class TouTiaoArticleParse {
* @return
* @throws Exception
*/
public
static
Map
<
String
,
Object
>
getTouTiaoHistory
(
String
user
_id
,
String
max_behot_t
ime
,
Date
endData
,
public
static
Map
<
String
,
Object
>
getTouTiaoHistory
(
String
user
Id
,
String
maxBehotT
ime
,
Date
endData
,
Proxy
proxy
)
throws
Exception
{
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
Signature
signature
=
new
Signature
(
user
_id
,
max_behot_t
ime
);
Signature
signature
=
new
Signature
(
user
Id
,
maxBehotT
ime
);
String
as
=
signature
.
getAs
();
String
cp
=
signature
.
getCp
();
String
_signature
=
signature
.
getSignature
();
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
user
_i
d
+
"&max_behot_time="
+
max
_behot_time
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
_signature
;
String
signatureStr
=
signature
.
getSignature
();
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
user
I
d
+
"&max_behot_time="
+
max
BehotTime
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
signatureStr
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
headerMap
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
user
_i
d
+
"/"
);
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
user
I
d
+
"/"
);
try
{
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
user
_i
d
,
htmlBody
,
endData
);
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
user
I
d
,
htmlBody
,
endData
);
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
return
ttList
;
}
...
...
@@ -157,25 +161,25 @@ public class TouTiaoArticleParse {
return
Collections
.
emptyMap
();
}
public
static
Map
<
String
,
Object
>
getTouTiaoHistory
(
String
user
_id
,
String
max_behot_t
ime
,
Date
endData
,
public
static
Map
<
String
,
Object
>
getTouTiaoHistory
(
String
user
Id
,
String
maxBehotT
ime
,
Date
endData
,
ProxyHolder
proxy
)
throws
Exception
{
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
Signature
signature
=
new
Signature
(
user
_id
,
max_behot_t
ime
);
Signature
signature
=
new
Signature
(
user
Id
,
maxBehotT
ime
);
String
as
=
signature
.
getAs
();
String
cp
=
signature
.
getCp
();
String
_signature
=
signature
.
getSignature
();
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
user
_i
d
+
"&max_behot_time="
+
max
_behot_time
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
_signature
;
String
signatureStr
=
signature
.
getSignature
();
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
user
I
d
+
"&max_behot_time="
+
max
BehotTime
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
signatureStr
;
logger
.
info
(
"当前采集的历史文章链接:::{}"
,
url
);
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
user
_i
d
+
"/"
);
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
user
I
d
+
"/"
);
String
htmlBody
=
null
;
try
{
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
user
_i
d
,
htmlBody
,
endData
);
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
user
I
d
,
htmlBody
,
endData
);
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
return
ttList
;
}
else
{
...
...
@@ -203,13 +207,13 @@ public class TouTiaoArticleParse {
* @return List<String> 返回类型
*/
private
static
Map
<
String
,
Object
>
parseHtmlByAccount
(
String
htmlBody
,
Date
endDate
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Long
max
_behot_t
ime
=
null
;
List
<
TouTiaoArticle
>
dataList
=
new
ArrayList
<
TouTiaoArticle
>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Long
max
BehotT
ime
=
null
;
List
<
TouTiaoArticle
>
dataList
=
new
ArrayList
<>();
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
jsonArray
=
json
.
getJSONArray
(
"data"
);
max
_behot_t
ime
=
Long
.
valueOf
(
json
.
getJSONObject
(
"next"
).
getString
(
"max_behot_time"
));
max
BehotT
ime
=
Long
.
valueOf
(
json
.
getJSONObject
(
"next"
).
getString
(
"max_behot_time"
));
String
title
=
null
;
String
content
=
null
;
String
time
=
null
;
...
...
@@ -219,7 +223,7 @@ public class TouTiaoArticleParse {
String
playNum
=
null
;
String
shareNum
=
null
;
String
source
=
null
;
String
user
_i
d
=
null
;
String
user
I
d
=
null
;
String
articleType
=
null
;
List
<
String
>
labelList
=
null
;
String
likeNum
=
null
;
...
...
@@ -238,9 +242,9 @@ public class TouTiaoArticleParse {
playNum
=
data
.
getString
(
"detail_play_effective_count"
);
shareNum
=
data
.
getString
(
"share_count"
);
source
=
data
.
getString
(
"source"
);
user
_i
d
=
data
.
getLong
(
"creator_uid"
)
+
""
;
user
I
d
=
data
.
getLong
(
"creator_uid"
)
+
""
;
articleType
=
data
.
getString
(
"chinese_tag"
);
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
user
_i
d
,
source
,
date
,
content
,
commentNum
,
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
user
I
d
,
source
,
date
,
content
,
commentNum
,
playNum
,
readNum
,
shareNum
,
"今日头条"
,
articleType
,
likeNum
);
if
(
data
.
containsKey
(
"label"
))
{
labelList
=
data
.
getJSONArray
(
"label"
).
toJavaList
(
String
.
class
);
...
...
@@ -259,26 +263,26 @@ public class TouTiaoArticleParse {
}
if
(
endDate
!=
null
)
{
if
(
max
_behot_time
!=
null
&&
!
"0"
.
equals
(
max_behot_t
ime
))
{
Date
nextDate
=
new
Date
(
Long
.
valueOf
(
max
_behot_t
ime
+
"000"
));
if
(
max
BehotTime
!=
null
&&
!
"0"
.
equals
(
maxBehotT
ime
))
{
Date
nextDate
=
new
Date
(
Long
.
valueOf
(
max
BehotT
ime
+
"000"
));
if
(
endDate
.
after
(
nextDate
))
{
max
_behot_t
ime
=
null
;
max
BehotT
ime
=
null
;
}
}
}
map
.
put
(
"max_behot_time"
,
max
_behot_t
ime
);
map
.
put
(
"max_behot_time"
,
max
BehotT
ime
);
map
.
put
(
"data"
,
dataList
);
return
map
;
}
private
static
Map
<
String
,
Object
>
parseHtmlByAccount
(
String
user
_i
d
,
String
htmlBody
,
Date
endDate
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Long
max
_behot_t
ime
=
null
;
List
<
TouTiaoArticle
>
dataList
=
new
ArrayList
<
TouTiaoArticle
>();
private
static
Map
<
String
,
Object
>
parseHtmlByAccount
(
String
user
I
d
,
String
htmlBody
,
Date
endDate
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Long
max
BehotT
ime
=
null
;
List
<
TouTiaoArticle
>
dataList
=
new
ArrayList
<>();
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
jsonArray
=
json
.
getJSONArray
(
"data"
);
max
_behot_t
ime
=
Long
.
valueOf
(
json
.
getJSONObject
(
"next"
).
getString
(
"max_behot_time"
));
max
BehotT
ime
=
Long
.
valueOf
(
json
.
getJSONObject
(
"next"
).
getString
(
"max_behot_time"
));
String
title
=
null
;
String
content
=
null
;
String
time
=
null
;
...
...
@@ -307,7 +311,7 @@ public class TouTiaoArticleParse {
shareNum
=
data
.
getString
(
"share_count"
);
source
=
data
.
getString
(
"source"
);
articleType
=
data
.
getString
(
"chinese_tag"
);
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
user
_i
d
,
source
,
date
,
content
,
commentNum
,
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
user
I
d
,
source
,
date
,
content
,
commentNum
,
playNum
,
readNum
,
shareNum
,
"今日头条"
,
articleType
,
likeNum
);
if
(
data
.
containsKey
(
"label"
))
{
labelList
=
data
.
getJSONArray
(
"label"
).
toJavaList
(
String
.
class
);
...
...
@@ -325,14 +329,14 @@ public class TouTiaoArticleParse {
return
null
;
}
if
(
endDate
!=
null
)
{
if
(
max
_behot_time
!=
null
&&
!
"0"
.
equals
(
max_behot_t
ime
))
{
Date
nextDate
=
new
Date
(
Long
.
valueOf
(
max
_behot_t
ime
+
"000"
));
if
(
max
BehotTime
!=
null
&&
!
"0"
.
equals
(
maxBehotT
ime
))
{
Date
nextDate
=
new
Date
(
Long
.
valueOf
(
max
BehotT
ime
+
"000"
));
if
(
endDate
.
after
(
nextDate
))
{
max
_behot_t
ime
=
null
;
max
BehotT
ime
=
null
;
}
}
}
map
.
put
(
"max_behot_time"
,
max
_behot_t
ime
);
map
.
put
(
"max_behot_time"
,
max
BehotT
ime
);
map
.
put
(
"data"
,
dataList
);
return
map
;
}
...
...
@@ -352,14 +356,14 @@ public class TouTiaoArticleParse {
* IOException 设定文件
* @return List<Map<String,Object>> 返回类型
*/
public
static
Map
<
String
,
Object
>
getMicroTouTiaoCrawler
(
String
user
_i
d
,
Date
endDate
,
Proxy
proxy
,
String
max
_behot_t
ime
)
throws
IOException
{
String
url
=
"https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id="
+
user
_i
d
;
if
(
max
_behot_t
ime
!=
null
)
{
url
=
url
+
"?max_behot_time="
+
max
_behot_t
ime
;
public
static
Map
<
String
,
Object
>
getMicroTouTiaoCrawler
(
String
user
I
d
,
Date
endDate
,
Proxy
proxy
,
String
max
BehotT
ime
)
throws
IOException
{
String
url
=
"https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id="
+
user
I
d
;
if
(
max
BehotT
ime
!=
null
)
{
url
=
url
+
"?max_behot_time="
+
max
BehotT
ime
;
}
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
user
_i
d
+
"/?tab=weitoutiao"
);
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
user
I
d
+
"/?tab=weitoutiao"
);
try
{
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
)
{
...
...
@@ -377,15 +381,15 @@ public class TouTiaoArticleParse {
return
null
;
}
public
static
Map
<
String
,
Object
>
getMicroTouTiaoCrawler
(
String
user
_i
d
,
Date
endDate
,
ProxyHolder
proxy
,
Long
max
_behot_t
ime
)
throws
IOException
{
String
url
=
"https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id="
+
user
_i
d
;
if
(
max
_behot_t
ime
!=
null
)
{
url
=
url
+
"&max_behot_time="
+
max
_behot_t
ime
;
public
static
Map
<
String
,
Object
>
getMicroTouTiaoCrawler
(
String
user
I
d
,
Date
endDate
,
ProxyHolder
proxy
,
Long
max
BehotT
ime
)
throws
IOException
{
String
url
=
"https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id="
+
user
I
d
;
if
(
max
BehotT
ime
!=
null
)
{
url
=
url
+
"&max_behot_time="
+
max
BehotT
ime
;
}
logger
.
info
(
"微头条采集链接:::{}"
,
url
);
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
user
_i
d
+
"/?tab=weitoutiao"
);
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
user
I
d
+
"/?tab=weitoutiao"
);
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"create_time"
))
{
...
...
@@ -413,24 +417,21 @@ public class TouTiaoArticleParse {
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getClientMicroToutiaoCrawler
(
String
userId
,
ProxyHolder
proxy
,
Long
max
_behot_t
ime
)
{
Long
max
BehotT
ime
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
String
ma
=
""
;
while
(
true
)
{
String
url
=
"https://i.snssdk.com/api/feed/profile/v1/?visited_uid="
+
userId
+
"&offset="
+
max_behot_time
;
System
.
out
.
println
(
url
);
ma
=
String
.
valueOf
(
max_behot_time
);
String
url
=
"https://i.snssdk.com/api/feed/profile/v1/?visited_uid="
+
userId
+
"&offset="
+
maxBehotTime
;
ma
=
String
.
valueOf
(
maxBehotTime
);
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
)){
String
result
=
response
.
body
().
string
();
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
max
_behot_t
ime
=
json
.
getLongValue
(
"offset"
);
max
BehotT
ime
=
json
.
getLongValue
(
"offset"
);
JSONArray
jsonArray
=
json
.
getJSONArray
(
"data"
);
System
.
out
.
println
(
json
.
toString
());
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
try
{
JSONObject
dataJSON
=
data
.
getJSONObject
(
"content"
).
getJSONObject
(
"raw_data"
);
System
.
out
.
println
(
dataJSON
.
toString
());
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
if
(
dataJSON
.
containsKey
(
"comment_base"
)
&&
dataJSON
.
getJSONObject
(
"comment_base"
)!=
null
)
{
JSONObject
commentBase
=
dataJSON
.
getJSONObject
(
"comment_base"
);
...
...
@@ -454,7 +455,6 @@ public class TouTiaoArticleParse {
map
.
put
(
"readNum"
,
readNum
);
map
.
put
(
"commentNum"
,
commentNum
);
map
.
put
(
"user_id"
,
user_id
);
// System.out.println(map.toString());
dataList
.
add
(
map
);
}
}
catch
(
Exception
e
)
{
...
...
@@ -463,8 +463,8 @@ public class TouTiaoArticleParse {
}
}
System
.
out
.
println
(
" 采集到 条 == "
+
dataList
.
size
()
+
" -- "
+
ma
+
" -- "
+
max
_behot_t
ime
);
if
(
ma
.
equals
(
String
.
valueOf
(
max
_behot_t
ime
)))
{
System
.
out
.
println
(
" 采集到 条 == "
+
dataList
.
size
()
+
" -- "
+
ma
+
" -- "
+
max
BehotT
ime
);
if
(
ma
.
equals
(
String
.
valueOf
(
max
BehotT
ime
)))
{
break
;
}
}
catch
(
Exception
e
)
{
...
...
@@ -487,16 +487,16 @@ public class TouTiaoArticleParse {
* @return Map<String,Object> 返回类型
*/
private
static
Map
<
String
,
Object
>
parseHtmlByMicroAccount
(
String
htmlBody
,
Date
endDate
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Long
max
_behot_t
ime
=
null
;
List
<
TouTiaoArticle
>
dataList
=
new
ArrayList
<
TouTiaoArticle
>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Long
max
BehotT
ime
=
null
;
List
<
TouTiaoArticle
>
dataList
=
new
ArrayList
<>();
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
boolean
more
=
false
;
if
(
json
.
containsKey
(
"has_more"
))
{
more
=
json
.
getBoolean
(
"has_more"
);
}
max
_behot_t
ime
=
json
.
getJSONObject
(
"next"
).
getLongValue
(
"max_behot_time"
);
max
BehotT
ime
=
json
.
getJSONObject
(
"next"
).
getLongValue
(
"max_behot_time"
);
JSONArray
jsonArray
=
json
.
getJSONArray
(
"data"
);
Date
date
=
null
;
String
href
=
null
;
...
...
@@ -564,19 +564,19 @@ public class TouTiaoArticleParse {
/** 验证是否有下一页数据 **/
if
(
more
)
{
if
(
max
_behot_time
!=
null
&&
max_behot_t
ime
!=
0
)
{
if
(
max
BehotTime
!=
null
&&
maxBehotT
ime
!=
0
)
{
if
(
endDate
.
after
(
date
))
{
max
_behot_t
ime
=
null
;
max
BehotT
ime
=
null
;
}
}
}
else
{
max
_behot_t
ime
=
null
;
max
BehotT
ime
=
null
;
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
map
.
put
(
"max_behot_time"
,
max
_behot_t
ime
);
map
.
put
(
"max_behot_time"
,
max
BehotT
ime
);
map
.
put
(
"data"
,
dataList
);
return
map
;
...
...
@@ -591,10 +591,17 @@ public class TouTiaoArticleParse {
public
static
String
getContent
(
String
url
,
Proxy
proxy
)
{
try
{
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
null
);
if
(!
StringUtils
.
isBlank
(
htmlBody
))
{
if
(
htmlBody
.
contains
(
"content:"
))
{
String
content
=
htmlBody
.
split
(
" content: '"
)[
1
].
split
(
"',"
)[
0
];
return
ZhiWeiTools
.
delHTMLTag
(
content
);
String
regex
=
"<script>var BASE_DATA[\\s\\S]+?</script>"
;
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"articleInfo"
))
{
//通过正则截取需要的js代码
Matcher
matcher
=
Pattern
.
compile
(
regex
).
matcher
(
htmlBody
);
if
(
matcher
.
find
())
{
String
content
=
matcher
.
group
().
replace
(
"<script>var BASE_DATA = |;</script>"
,
""
);
//通过js引擎执行js代码
String
jsContent
=
"eval(("
+
content
+
")).articleInfo.content.toString();"
;
String
contentHtml
=
scriptEngine
.
eval
(
jsContent
).
toString
();
//解析最后的数据
return
Jsoup
.
parse
(
contentHtml
).
text
();
}
}
return
null
;
...
...
@@ -605,16 +612,14 @@ public class TouTiaoArticleParse {
}
/**
* 下载数据
* @param url
* @param proxy
* @param headMap
* @return
*/
private
static
String
downloadHtml
(
String
url
,
Proxy
proxy
,
Map
<
String
,
String
>
headMap
)
{
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
Response
response
=
null
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment