Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
toutiao
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
toutiao
Commits
cb6a0b84
Commit
cb6a0b84
authored
Jan 07, 2019
by
[zhangzhiwei]
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加文章类型
parent
efe57d38
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
135 additions
and
106 deletions
+135
-106
src/main/java/com/zhiwei/toutiao/bean/TouTiaoArticle.java
+11
-1
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
+121
-102
src/main/java/com/zhiwei/toutiao/parse/TouTiaoChannelParse.java
+1
-1
src/main/java/com/zhiwei/toutiao/parse/TouTiaoParse.java
+1
-1
src/main/java/com/zhiwei/toutiao/parse/TouTiaoSearchParse.java
+1
-1
No files found.
src/main/java/com/zhiwei/toutiao/bean/TouTiaoArticle.java
View file @
cb6a0b84
...
@@ -35,7 +35,14 @@ public class TouTiaoArticle implements Serializable{
...
@@ -35,7 +35,14 @@ public class TouTiaoArticle implements Serializable{
private
String
readNum
;
private
String
readNum
;
private
String
shareNum
;
private
String
shareNum
;
private
List
<
String
>
labelList
;
private
List
<
String
>
labelList
;
private
String
articleType
;
public
String
getArticleType
()
{
return
articleType
;
}
public
void
setArticleType
(
String
articleType
)
{
this
.
articleType
=
articleType
;
}
public
String
getCommentCount
()
{
public
String
getCommentCount
()
{
return
commentCount
;
return
commentCount
;
}
}
...
@@ -113,7 +120,8 @@ public class TouTiaoArticle implements Serializable{
...
@@ -113,7 +120,8 @@ public class TouTiaoArticle implements Serializable{
public
TouTiaoArticle
(){}
public
TouTiaoArticle
(){}
public
TouTiaoArticle
(
String
url
,
String
title
,
String
user_id
,
public
TouTiaoArticle
(
String
url
,
String
title
,
String
user_id
,
String
source
,
Date
time
,
String
content
,
String
commentCount
,
String
source
,
Date
time
,
String
content
,
String
commentCount
,
String
playCount
,
String
readNum
,
String
shareNum
,
String
type
)
String
playCount
,
String
readNum
,
String
shareNum
,
String
type
,
String
articleType
)
{
{
this
.
url
=
url
;
this
.
url
=
url
;
this
.
title
=
title
;
this
.
title
=
title
;
...
@@ -126,6 +134,7 @@ public class TouTiaoArticle implements Serializable{
...
@@ -126,6 +134,7 @@ public class TouTiaoArticle implements Serializable{
this
.
playCount
=
playCount
;
this
.
playCount
=
playCount
;
this
.
shareNum
=
shareNum
;
this
.
shareNum
=
shareNum
;
this
.
commentCount
=
commentCount
;
this
.
commentCount
=
commentCount
;
this
.
articleType
=
articleType
;
}
}
public
String
toString
()
public
String
toString
()
...
@@ -143,6 +152,7 @@ public class TouTiaoArticle implements Serializable{
...
@@ -143,6 +152,7 @@ public class TouTiaoArticle implements Serializable{
+
", readNum = "
+
readNum
+
", readNum = "
+
readNum
+
", shareNum = "
+
shareNum
+
", shareNum = "
+
shareNum
+
", labelList = "
+
labelList
+
", labelList = "
+
labelList
+
", articleType = "
+
articleType
+
"]"
;
+
"]"
;
}
}
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
View file @
cb6a0b84
...
@@ -39,7 +39,9 @@ import com.zhiwei.toutiao.util.Tools;
...
@@ -39,7 +39,9 @@ import com.zhiwei.toutiao.util.Tools;
* @date 2016年9月2日 上午11:17:44
* @date 2016年9月2日 上午11:17:44
*/
*/
public
class
TouTiaoArticleParse
{
public
class
TouTiaoArticleParse
{
private
TouTiaoArticleParse
()
{}
private
TouTiaoArticleParse
()
{
}
private
static
Logger
logger
=
LogManager
.
getLogger
(
TouTiaoArticleParse
.
class
);
private
static
Logger
logger
=
LogManager
.
getLogger
(
TouTiaoArticleParse
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
...
@@ -54,23 +56,25 @@ public class TouTiaoArticleParse {
...
@@ -54,23 +56,25 @@ public class TouTiaoArticleParse {
* @throws Exception
* @throws Exception
*/
*/
@Deprecated
@Deprecated
public
static
Map
<
String
,
Object
>
getTouTiaoList
(
String
media_id
,
String
max_behot_time
,
Date
endData
,
Proxy
proxy
)
throws
Exception
{
public
static
Map
<
String
,
Object
>
getTouTiaoList
(
String
media_id
,
String
max_behot_time
,
Date
endData
,
Proxy
proxy
)
throws
Exception
{
Signature
signature
=
new
Signature
();
Signature
signature
=
new
Signature
();
String
url
=
"https://www.toutiao.com/pgc/ma/?page_type=1&media_id="
+
media_id
+
"&count=20&as="
+
signature
.
getAs
()+
"&cp="
+
signature
.
getCp
();
String
url
=
"https://www.toutiao.com/pgc/ma/?page_type=1&media_id="
+
media_id
+
"&count=20&as="
if
(
max_behot_time
!=
null
){
+
signature
.
getAs
()
+
"&cp="
+
signature
.
getCp
();
url
=
url
+
"&max_behot_time="
+
max_behot_time
;
if
(
max_behot_time
!=
null
)
{
url
=
url
+
"&max_behot_time="
+
max_behot_time
;
}
}
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"Referer"
,
url
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
try
{
try
{
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
htmlBody
,
endData
);
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
htmlBody
,
endData
);
if
(
ttList
!=
null
&&
ttList
.
size
()>
0
)
{
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
return
ttList
;
return
ttList
;
}
}
}
else
{
}
else
{
logger
.
info
(
"数据为null"
);
logger
.
info
(
"数据为null"
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -81,25 +85,27 @@ public class TouTiaoArticleParse {
...
@@ -81,25 +85,27 @@ public class TouTiaoArticleParse {
}
}
@Deprecated
@Deprecated
public
static
Map
<
String
,
Object
>
getTouTiaoList
(
String
media_id
,
Long
max_behot_time
,
Date
endData
,
ProxyHolder
proxy
)
throws
Exception
{
public
static
Map
<
String
,
Object
>
getTouTiaoList
(
String
media_id
,
Long
max_behot_time
,
Date
endData
,
ProxyHolder
proxy
)
throws
Exception
{
Signature
signature
=
new
Signature
();
Signature
signature
=
new
Signature
();
String
as
=
signature
.
getAs
();
String
as
=
signature
.
getAs
();
String
cp
=
signature
.
getCp
();
String
cp
=
signature
.
getCp
();
String
url
=
"https://www.toutiao.com/pgc/ma/?page_type=1&media_id="
+
media_id
+
"&count=20&as="
+
as
+
"&cp="
+
cp
;
String
url
=
"https://www.toutiao.com/pgc/ma/?page_type=1&media_id="
+
media_id
+
"&count=20&as="
+
as
+
"&cp="
if
(
max_behot_time
!=
null
){
+
cp
;
url
=
url
+
"&max_behot_time="
+
max_behot_time
;
if
(
max_behot_time
!=
null
)
{
url
=
url
+
"&max_behot_time="
+
max_behot_time
;
}
}
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"Referer"
,
url
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
try
{
try
{
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
htmlBody
,
endData
);
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
htmlBody
,
endData
);
if
(
ttList
!=
null
&&
ttList
.
size
()>
0
)
{
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
return
ttList
;
return
ttList
;
}
}
}
else
{
}
else
{
logger
.
info
(
"数据为null"
);
logger
.
info
(
"数据为null"
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -111,6 +117,7 @@ public class TouTiaoArticleParse {
...
@@ -111,6 +117,7 @@ public class TouTiaoArticleParse {
/**
/**
* 获取今日头条历史文章接口新
* 获取今日头条历史文章接口新
*
* @param user_id
* @param user_id
* @param max_behot_time
* @param max_behot_time
* @param endData
* @param endData
...
@@ -118,26 +125,29 @@ public class TouTiaoArticleParse {
...
@@ -118,26 +125,29 @@ public class TouTiaoArticleParse {
* @return
* @return
* @throws Exception
* @throws Exception
*/
*/
public
static
Map
<
String
,
Object
>
getTouTiaoHistory
(
String
user_id
,
String
max_behot_time
,
Date
endData
,
Proxy
proxy
)
throws
Exception
{
public
static
Map
<
String
,
Object
>
getTouTiaoHistory
(
String
user_id
,
String
max_behot_time
,
Date
endData
,
for
(
int
i
=
0
;
i
<
3
;
i
++){
Proxy
proxy
)
throws
Exception
{
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
Signature
signature
=
new
Signature
(
user_id
,
max_behot_time
);
Signature
signature
=
new
Signature
(
user_id
,
max_behot_time
);
String
as
=
signature
.
getAs
();
String
as
=
signature
.
getAs
();
String
cp
=
signature
.
getCp
();
String
cp
=
signature
.
getCp
();
String
_signature
=
signature
.
getSignature
();
String
_signature
=
signature
.
getSignature
();
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
user_id
+
"&max_behot_time="
+
max_behot_time
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
_signature
;
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
user_id
+
"&max_behot_time="
+
max_behot_time
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
_signature
;
System
.
out
.
println
(
url
);
System
.
out
.
println
(
url
);
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
headerMap
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
headerMap
.
put
(
"user-agent"
,
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
try
{
try
{
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
user_id
,
htmlBody
,
endData
);
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
user_id
,
htmlBody
,
endData
);
if
(
ttList
!=
null
&&
ttList
.
size
()>
0
)
{
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
return
ttList
;
return
ttList
;
}
}
}
else
{
}
else
{
logger
.
info
(
"数据为null"
);
logger
.
info
(
"数据为null"
);
continue
;
continue
;
}
}
...
@@ -149,28 +159,31 @@ public class TouTiaoArticleParse {
...
@@ -149,28 +159,31 @@ public class TouTiaoArticleParse {
return
Collections
.
emptyMap
();
return
Collections
.
emptyMap
();
}
}
public
static
Map
<
String
,
Object
>
getTouTiaoHistory
(
String
user_id
,
String
max_behot_time
,
Date
endData
,
ProxyHolder
proxy
)
throws
Exception
{
public
static
Map
<
String
,
Object
>
getTouTiaoHistory
(
String
user_id
,
String
max_behot_time
,
Date
endData
,
for
(
int
i
=
0
;
i
<
3
;
i
++){
ProxyHolder
proxy
)
throws
Exception
{
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
Signature
signature
=
new
Signature
(
user_id
,
max_behot_time
);
Signature
signature
=
new
Signature
(
user_id
,
max_behot_time
);
String
as
=
signature
.
getAs
();
String
as
=
signature
.
getAs
();
String
cp
=
signature
.
getCp
();
String
cp
=
signature
.
getCp
();
String
_signature
=
signature
.
getSignature
();
String
_signature
=
signature
.
getSignature
();
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
user_id
+
"&max_behot_time="
+
max_behot_time
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
_signature
;
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
user_id
+
"&max_behot_time="
+
max_behot_time
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
_signature
;
logger
.
info
(
"当前采集的历史文章链接:::{}"
,
url
);
logger
.
info
(
"当前采集的历史文章链接:::{}"
,
url
);
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
headerMap
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
headerMap
.
put
(
"user-agent"
,
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
try
{
try
{
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
user_id
,
htmlBody
,
endData
);
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
user_id
,
htmlBody
,
endData
);
if
(
ttList
!=
null
&&
ttList
.
size
()>
0
)
{
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
return
ttList
;
return
ttList
;
}
else
{
}
else
{
break
;
break
;
}
}
}
else
{
}
else
{
logger
.
info
(
"数据为null,获取到的文本为:::{}"
,
htmlBody
);
logger
.
info
(
"数据为null,获取到的文本为:::{}"
,
htmlBody
);
continue
;
continue
;
}
}
...
@@ -182,7 +195,6 @@ public class TouTiaoArticleParse {
...
@@ -182,7 +195,6 @@ public class TouTiaoArticleParse {
return
Collections
.
emptyMap
();
return
Collections
.
emptyMap
();
}
}
/***
/***
* 根据帐号解析历史文章地址
* 根据帐号解析历史文章地址
*
*
...
@@ -210,25 +222,28 @@ public class TouTiaoArticleParse {
...
@@ -210,25 +222,28 @@ public class TouTiaoArticleParse {
String
shareNum
=
null
;
String
shareNum
=
null
;
String
source
=
null
;
String
source
=
null
;
String
user_id
=
null
;
String
user_id
=
null
;
String
articleType
=
null
;
List
<
String
>
labelList
=
null
;
List
<
String
>
labelList
=
null
;
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
try
{
try
{
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
String
href
=
"https://www.toutiao.com/"
;
String
href
=
"https://www.toutiao.com/"
;
if
(
data
.
containsKey
(
"group_id"
))
{
if
(
data
.
containsKey
(
"group_id"
))
{
href
=
href
+
"a"
+
data
.
getLongValue
(
"group_id"
);
href
=
href
+
"a"
+
data
.
getLongValue
(
"group_id"
);
title
=
data
.
getString
(
"title"
);
title
=
data
.
getString
(
"title"
);
content
=
data
.
getString
(
"abstract"
);
content
=
data
.
getString
(
"abstract"
);
time
=
data
.
getLongValue
(
"behot_time"
)
*
1000
+
""
;
time
=
data
.
getLongValue
(
"behot_time"
)
*
1000
+
""
;
date
=
TimeParse
.
stringFormartDate
(
time
);
date
=
TimeParse
.
stringFormartDate
(
time
);
readNum
=
data
.
getString
(
"go_detail_count"
);
readNum
=
data
.
getString
(
"go_detail_count"
);
commentNum
=
data
.
getString
(
"comments_count"
);
commentNum
=
data
.
getString
(
"comments_count"
);
playNum
=
data
.
getString
(
"detail_play_effective_count"
);
playNum
=
data
.
getString
(
"detail_play_effective_count"
);
shareNum
=
data
.
getString
(
"share_count"
);
shareNum
=
data
.
getString
(
"share_count"
);
source
=
data
.
getString
(
"source"
);
source
=
data
.
getString
(
"source"
);
user_id
=
data
.
getLong
(
"creator_uid"
)+
""
;
user_id
=
data
.
getLong
(
"creator_uid"
)
+
""
;
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
user_id
,
source
,
date
,
content
,
commentNum
,
playNum
,
readNum
,
shareNum
,
"今日头条"
);
articleType
=
data
.
getString
(
"chinese_tag"
);
if
(
data
.
containsKey
(
"label"
)){
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
user_id
,
source
,
date
,
content
,
commentNum
,
playNum
,
readNum
,
shareNum
,
"今日头条"
,
articleType
);
if
(
data
.
containsKey
(
"label"
))
{
labelList
=
data
.
getJSONArray
(
"label"
).
toJavaList
(
String
.
class
);
labelList
=
data
.
getJSONArray
(
"label"
).
toJavaList
(
String
.
class
);
tt
.
setLabelList
(
labelList
);
tt
.
setLabelList
(
labelList
);
}
}
...
@@ -244,10 +259,10 @@ public class TouTiaoArticleParse {
...
@@ -244,10 +259,10 @@ public class TouTiaoArticleParse {
return
null
;
return
null
;
}
}
if
(
endDate
!=
null
)
{
if
(
endDate
!=
null
)
{
if
(
max_behot_time
!=
null
&&
!
"0"
.
equals
(
max_behot_time
))
{
if
(
max_behot_time
!=
null
&&
!
"0"
.
equals
(
max_behot_time
))
{
Date
nextDate
=
new
Date
(
Long
.
valueOf
(
max_behot_time
+
"000"
));
Date
nextDate
=
new
Date
(
Long
.
valueOf
(
max_behot_time
+
"000"
));
if
(
endDate
.
after
(
nextDate
))
{
if
(
endDate
.
after
(
nextDate
))
{
max_behot_time
=
null
;
max_behot_time
=
null
;
}
}
}
}
...
@@ -257,8 +272,6 @@ public class TouTiaoArticleParse {
...
@@ -257,8 +272,6 @@ public class TouTiaoArticleParse {
return
map
;
return
map
;
}
}
private
static
Map
<
String
,
Object
>
parseHtmlByAccount
(
String
user_id
,
String
htmlBody
,
Date
endDate
)
{
private
static
Map
<
String
,
Object
>
parseHtmlByAccount
(
String
user_id
,
String
htmlBody
,
Date
endDate
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Long
max_behot_time
=
null
;
Long
max_behot_time
=
null
;
...
@@ -276,28 +289,30 @@ public class TouTiaoArticleParse {
...
@@ -276,28 +289,30 @@ public class TouTiaoArticleParse {
String
playNum
=
null
;
String
playNum
=
null
;
String
shareNum
=
null
;
String
shareNum
=
null
;
String
source
=
null
;
String
source
=
null
;
String
articleType
=
null
;
List
<
String
>
labelList
=
null
;
List
<
String
>
labelList
=
null
;
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
try
{
try
{
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
String
href
=
"https://www.toutiao.com/"
;
String
href
=
"https://www.toutiao.com/"
;
if
(
data
.
containsKey
(
"group_id"
))
{
if
(
data
.
containsKey
(
"group_id"
))
{
href
=
href
+
"a"
+
data
.
getLongValue
(
"group_id"
);
href
=
href
+
"a"
+
data
.
getLongValue
(
"group_id"
);
title
=
data
.
getString
(
"title"
);
title
=
data
.
getString
(
"title"
);
content
=
data
.
getString
(
"abstract"
);
content
=
data
.
getString
(
"abstract"
);
time
=
data
.
getLongValue
(
"behot_time"
)
*
1000
+
""
;
time
=
data
.
getLongValue
(
"behot_time"
)
*
1000
+
""
;
date
=
TimeParse
.
stringFormartDate
(
time
);
date
=
TimeParse
.
stringFormartDate
(
time
);
readNum
=
data
.
getString
(
"go_detail_count"
);
readNum
=
data
.
getString
(
"go_detail_count"
);
commentNum
=
data
.
getString
(
"comments_count"
);
commentNum
=
data
.
getString
(
"comments_count"
);
playNum
=
data
.
getString
(
"detail_play_effective_count"
);
playNum
=
data
.
getString
(
"detail_play_effective_count"
);
shareNum
=
data
.
getString
(
"share_count"
);
shareNum
=
data
.
getString
(
"share_count"
);
source
=
data
.
getString
(
"source"
);
source
=
data
.
getString
(
"source"
);
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
user_id
,
source
,
date
,
content
,
commentNum
,
playNum
,
readNum
,
shareNum
,
"今日头条"
);
articleType
=
data
.
getString
(
"chinese_tag"
);
if
(
data
.
containsKey
(
"label"
)){
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
user_id
,
source
,
date
,
content
,
commentNum
,
playNum
,
readNum
,
shareNum
,
"今日头条"
,
articleType
);
if
(
data
.
containsKey
(
"label"
))
{
labelList
=
data
.
getJSONArray
(
"label"
).
toJavaList
(
String
.
class
);
labelList
=
data
.
getJSONArray
(
"label"
).
toJavaList
(
String
.
class
);
tt
.
setLabelList
(
labelList
);
tt
.
setLabelList
(
labelList
);
}
}
System
.
out
.
println
(
tt
.
toString
());
dataList
.
add
(
tt
);
dataList
.
add
(
tt
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -309,10 +324,10 @@ public class TouTiaoArticleParse {
...
@@ -309,10 +324,10 @@ public class TouTiaoArticleParse {
logger
.
error
(
"数据解析出现问题,{}"
,
e
.
getMessage
());
logger
.
error
(
"数据解析出现问题,{}"
,
e
.
getMessage
());
return
null
;
return
null
;
}
}
if
(
endDate
!=
null
)
{
if
(
endDate
!=
null
)
{
if
(
max_behot_time
!=
null
&&
!
"0"
.
equals
(
max_behot_time
))
{
if
(
max_behot_time
!=
null
&&
!
"0"
.
equals
(
max_behot_time
))
{
Date
nextDate
=
new
Date
(
Long
.
valueOf
(
max_behot_time
+
"000"
));
Date
nextDate
=
new
Date
(
Long
.
valueOf
(
max_behot_time
+
"000"
));
if
(
endDate
.
after
(
nextDate
))
{
if
(
endDate
.
after
(
nextDate
))
{
max_behot_time
=
null
;
max_behot_time
=
null
;
}
}
}
}
...
@@ -322,35 +337,38 @@ public class TouTiaoArticleParse {
...
@@ -322,35 +337,38 @@ public class TouTiaoArticleParse {
return
map
;
return
map
;
}
}
/**
/**
* @Title: getMicroTouTiaoCrawler
* @Title: getMicroTouTiaoCrawler
* @author hero
* @author hero
* @Description: 根据用户user_id查询用户微头条数据
* @Description: 根据用户user_id查询用户微头条数据
* @param @param user_id
* @param @param
* @param @param endDate
* user_id
* @param @param proxy
* @param @param
* endDate
* @param @param
* proxy
* @param @return
* @param @return
* @param @throws IOException 设定文件
* @param @throws
* IOException 设定文件
* @return List<Map<String,Object>> 返回类型
* @return List<Map<String,Object>> 返回类型
*/
*/
public
static
Map
<
String
,
Object
>
getMicroTouTiaoCrawler
(
String
user_id
,
Date
endDate
,
Proxy
proxy
,
String
max_behot_time
)
throws
IOException
{
public
static
Map
<
String
,
Object
>
getMicroTouTiaoCrawler
(
String
user_id
,
Date
endDate
,
Proxy
proxy
,
String
url
=
"https://www.toutiao.com/c/ugc/content/list/"
+
user_id
+
"/"
;
String
max_behot_time
)
throws
IOException
{
if
(
max_behot_time
!=
null
){
String
url
=
"https://www.toutiao.com/c/ugc/content/list/"
+
user_id
+
"/"
;
if
(
max_behot_time
!=
null
)
{
url
=
url
+
"?max_time="
+
max_behot_time
;
url
=
url
+
"?max_time="
+
max_behot_time
;
}
}
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
System
.
out
.
println
(
url
);
System
.
out
.
println
(
url
);
try
{
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
parseHtmlByMicroAccount
(
htmlBody
,
endDate
);
Map
<
String
,
Object
>
dataMap
=
parseHtmlByMicroAccount
(
htmlBody
,
endDate
);
if
(
dataMap
!=
null
&&
dataMap
.
size
()>
0
)
{
if
(
dataMap
!=
null
&&
dataMap
.
size
()
>
0
)
{
return
dataMap
;
return
dataMap
;
}
}
}
else
{
}
else
{
logger
.
info
(
"数据为null"
);
logger
.
info
(
"数据为null"
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -360,24 +378,23 @@ public class TouTiaoArticleParse {
...
@@ -360,24 +378,23 @@ public class TouTiaoArticleParse {
return
null
;
return
null
;
}
}
public
static
Map
<
String
,
Object
>
getMicroTouTiaoCrawler
(
String
user_id
,
Date
endDate
,
ProxyHolder
proxy
,
Long
max_behot_time
)
throws
IOException
{
public
static
Map
<
String
,
Object
>
getMicroTouTiaoCrawler
(
String
user_id
,
Date
endDate
,
ProxyHolder
proxy
,
Long
max_behot_time
)
throws
IOException
{
String
url
=
"https://www.toutiao.com/c/ugc/content/list/"
+
user_id
+
"/"
;
String
url
=
"https://www.toutiao.com/c/ugc/content/list/"
+
user_id
+
"/"
;
if
(
max_behot_time
!=
null
)
{
if
(
max_behot_time
!=
null
){
url
=
url
+
"?max_time="
+
max_behot_time
;
url
=
url
+
"?max_time="
+
max_behot_time
;
}
}
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
System
.
out
.
println
(
url
);
System
.
out
.
println
(
url
);
try
{
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
parseHtmlByMicroAccount
(
htmlBody
,
endDate
);
Map
<
String
,
Object
>
dataMap
=
parseHtmlByMicroAccount
(
htmlBody
,
endDate
);
if
(
dataMap
!=
null
&&
dataMap
.
size
()>
0
)
{
if
(
dataMap
!=
null
&&
dataMap
.
size
()
>
0
)
{
return
dataMap
;
return
dataMap
;
}
}
}
else
{
}
else
{
logger
.
info
(
"数据为null"
);
logger
.
info
(
"数据为null"
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -391,9 +408,12 @@ public class TouTiaoArticleParse {
...
@@ -391,9 +408,12 @@ public class TouTiaoArticleParse {
* @Title: parseHtmlByMicroAccount
* @Title: parseHtmlByMicroAccount
* @author hero
* @author hero
* @Description: 解析微头条数据
* @Description: 解析微头条数据
* @param @param htmlBody
* @param @param
* @param @param endDate
* htmlBody
* @param @return 设定文件
* @param @param
* endDate
* @param @return
* 设定文件
* @return Map<String,Object> 返回类型
* @return Map<String,Object> 返回类型
*/
*/
private
static
Map
<
String
,
Object
>
parseHtmlByMicroAccount
(
String
htmlBody
,
Date
endDate
)
{
private
static
Map
<
String
,
Object
>
parseHtmlByMicroAccount
(
String
htmlBody
,
Date
endDate
)
{
...
@@ -413,38 +433,40 @@ public class TouTiaoArticleParse {
...
@@ -413,38 +433,40 @@ public class TouTiaoArticleParse {
String
commentNum
=
null
;
String
commentNum
=
null
;
String
playNum
=
null
;
String
playNum
=
null
;
String
user_id
=
null
;
String
user_id
=
null
;
String
articleType
=
null
;
int
count
=
16
;
int
count
=
16
;
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
try
{
try
{
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
max_behot_time
=
data
.
getLongValue
(
"create_time"
);
max_behot_time
=
data
.
getLongValue
(
"create_time"
);
date
=
new
Date
(
max_behot_time
*
1000
);
date
=
new
Date
(
max_behot_time
*
1000
);
href
=
"https://www.toutiao.com/a"
+
data
.
getString
(
"thread_id"
);
href
=
"https://www.toutiao.com/a"
+
data
.
getString
(
"thread_id"
);
source
=
data
.
getJSONObject
(
"ugc_user"
).
getString
(
"name"
);
source
=
data
.
getJSONObject
(
"ugc_user"
).
getString
(
"name"
);
content
=
data
.
getString
(
"content"
);
content
=
data
.
getString
(
"content"
);
readNum
=
data
.
getInteger
(
"read_count"
)
+
""
;
readNum
=
data
.
getInteger
(
"read_count"
)
+
""
;
commentNum
=
data
.
getInteger
(
"comment_count"
)
+
""
;
commentNum
=
data
.
getInteger
(
"comment_count"
)
+
""
;
user_id
=
data
.
getJSONObject
(
"ugc_user"
).
getString
(
"user_id"
);
user_id
=
data
.
getJSONObject
(
"ugc_user"
).
getString
(
"user_id"
);
if
(
content
!=
null
&&
!
""
.
equals
(
content
))
{
if
(
content
!=
null
&&
!
""
.
equals
(
content
))
{
if
(
content
.
length
()<
16
)
{
if
(
content
.
length
()
<
16
)
{
count
=
content
.
length
();
count
=
content
.
length
();
}
}
title
=
content
.
substring
(
0
,
count
);
title
=
content
.
substring
(
0
,
count
);
}
}
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
user_id
,
source
,
date
,
content
,
commentNum
,
playNum
,
readNum
,
"0"
,
"微头条"
);
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
user_id
,
source
,
date
,
content
,
commentNum
,
playNum
,
readNum
,
"0"
,
"微头条"
,
articleType
);
dataList
.
add
(
tt
);
dataList
.
add
(
tt
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
continue
;
continue
;
}
}
}
}
/**
验证是否有下一页数据
**/
/**
验证是否有下一页数据
**/
if
(
more
)
{
if
(
more
)
{
if
(
max_behot_time
!=
null
&&
max_behot_time
!=
0
)
{
if
(
max_behot_time
!=
null
&&
max_behot_time
!=
0
)
{
if
(
endDate
.
after
(
date
))
{
if
(
endDate
.
after
(
date
))
{
max_behot_time
=
null
;
max_behot_time
=
null
;
}
}
}
}
}
else
{
}
else
{
max_behot_time
=
null
;
max_behot_time
=
null
;
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -457,7 +479,4 @@ public class TouTiaoArticleParse {
...
@@ -457,7 +479,4 @@ public class TouTiaoArticleParse {
return
map
;
return
map
;
}
}
}
}
src/main/java/com/zhiwei/toutiao/parse/TouTiaoChannelParse.java
View file @
cb6a0b84
...
@@ -98,7 +98,7 @@ public class TouTiaoChannelParse {
...
@@ -98,7 +98,7 @@ public class TouTiaoChannelParse {
}
}
url
=
getUrl
(
url
);
url
=
getUrl
(
url
);
date
=
TimeParse
.
stringFormartDate
(
time
);
date
=
TimeParse
.
stringFormartDate
(
time
);
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
url
,
title
,
null
,
source
,
date
,
content
,
comment_count
,
"-1"
,
"-1"
,
"-1"
,
"今日头条"
);
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
url
,
title
,
null
,
source
,
date
,
content
,
comment_count
,
"-1"
,
"-1"
,
"-1"
,
"今日头条"
,
null
);
ttList
.
add
(
tt
);
ttList
.
add
(
tt
);
}
catch
(
JSONException
e
)
{
}
catch
(
JSONException
e
)
{
continue
;
continue
;
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoParse.java
View file @
cb6a0b84
...
@@ -152,7 +152,7 @@ public class TouTiaoParse {
...
@@ -152,7 +152,7 @@ public class TouTiaoParse {
String
shareNum
=
data
.
getString
(
"share_count"
);
String
shareNum
=
data
.
getString
(
"share_count"
);
if
(
endData
.
before
(
date
))
{
if
(
endData
.
before
(
date
))
{
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
null
,
source
,
date
,
content
,
commentNum
,
playNum
,
readNum
,
shareNum
,
"今日头条"
);
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
null
,
source
,
date
,
content
,
commentNum
,
playNum
,
readNum
,
shareNum
,
"今日头条"
,
null
);
dataList
.
add
(
tt
);
dataList
.
add
(
tt
);
}
else
}
else
{
{
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoSearchParse.java
View file @
cb6a0b84
...
@@ -90,7 +90,7 @@ public class TouTiaoSearchParse {
...
@@ -90,7 +90,7 @@ public class TouTiaoSearchParse {
String
user_id
=
jso
.
getString
(
"user_id"
);
String
user_id
=
jso
.
getString
(
"user_id"
);
Date
date
=
TimeParse
.
stringFormartDate
(
time
);
Date
date
=
TimeParse
.
stringFormartDate
(
time
);
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
url
,
title
,
user_id
,
source
,
date
,
content
,
comment_count
,
"-1"
,
"-1"
,
"-1"
,
"今日头条"
);
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
url
,
title
,
user_id
,
source
,
date
,
content
,
comment_count
,
"-1"
,
"-1"
,
"-1"
,
"今日头条"
,
null
);
ttList
.
add
(
tt
);
ttList
.
add
(
tt
);
}
catch
(
JSONException
e
)
{
}
catch
(
JSONException
e
)
{
logger
.
debug
(
"解析数据出现问题"
,
e
.
fillInStackTrace
());
logger
.
debug
(
"解析数据出现问题"
,
e
.
fillInStackTrace
());
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment