Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
toutiao
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
toutiao
Commits
408ac5cd
Commit
408ac5cd
authored
Dec 10, 2018
by
[zhangzhiwei]
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加用户历史文章及关注列表新接口采集
parent
a0aee201
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
76 additions
and
60 deletions
+76
-60
pom.xml
+1
-1
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
+17
-14
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
+37
-32
src/test/java/com/zhiwei/toutiao/test/TouTiaoExample.java
+21
-13
No files found.
pom.xml
View file @
408ac5cd
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
<dependency>
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.
0.9
-SNAPSHOT
</version>
<version>
0.
1.0
-SNAPSHOT
</version>
</dependency>
</dependency>
<dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<groupId>
com.zhiwei.crawler
</groupId>
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
View file @
408ac5cd
...
@@ -193,25 +193,28 @@ public class TouTiaoAccountParse {
...
@@ -193,25 +193,28 @@ public class TouTiaoAccountParse {
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"referer"
,
"ihttps://www.toutiao.com/c/user/relation/"
+
userid
+
"/?tab=following"
);
headerMap
.
put
(
"referer"
,
"ihttps://www.toutiao.com/c/user/relation/"
+
userid
+
"/?tab=following"
);
headerMap
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
headerMap
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
try
{
for
(
int
i
=
0
;
i
<
3
;
i
++){
String
htmlBody
=
null
;
try
{
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
String
htmlBody
=
null
;
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"name"
)){
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"name"
)){
more
=
json
.
getBooleanValue
(
"has_more"
);
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
List
<
TouTiaoAccount
>
dataList
=
parseFans
(
json
);
more
=
json
.
getBooleanValue
(
"has_more"
);
if
(
dataList
!=
null
&&
!
dataList
.
isEmpty
()){
List
<
TouTiaoAccount
>
dataList
=
parseFans
(
json
);
ttaList
.
addAll
(
dataList
);
if
(
dataList
!=
null
&&
!
dataList
.
isEmpty
()){
ttaList
.
addAll
(
dataList
);
}
else
{
more
=
false
;
}
}
else
{
}
else
{
more
=
false
;
more
=
false
;
continue
;
}
}
}
else
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
.
fillInStackTrace
());
more
=
false
;
more
=
false
;
continue
;
}
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
.
fillInStackTrace
());
more
=
false
;
return
null
;
}
}
}
}
return
ttaList
;
return
ttaList
;
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
View file @
408ac5cd
...
@@ -14,6 +14,7 @@ package com.zhiwei.toutiao.parse;
...
@@ -14,6 +14,7 @@ package com.zhiwei.toutiao.parse;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
...
@@ -59,7 +60,6 @@ public class TouTiaoArticleParse {
...
@@ -59,7 +60,6 @@ public class TouTiaoArticleParse {
if
(
max_behot_time
!=
null
){
if
(
max_behot_time
!=
null
){
url
=
url
+
"&max_behot_time="
+
max_behot_time
;
url
=
url
+
"&max_behot_time="
+
max_behot_time
;
}
}
System
.
out
.
println
(
"url=========="
+
url
);
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"Referer"
,
url
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
...
@@ -77,7 +77,7 @@ public class TouTiaoArticleParse {
...
@@ -77,7 +77,7 @@ public class TouTiaoArticleParse {
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
.
fillInStackTrace
());
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
.
fillInStackTrace
());
throw
e
;
throw
e
;
}
}
return
null
;
return
Collections
.
emptyMap
()
;
}
}
@Deprecated
@Deprecated
...
@@ -89,7 +89,6 @@ public class TouTiaoArticleParse {
...
@@ -89,7 +89,6 @@ public class TouTiaoArticleParse {
if
(
max_behot_time
!=
null
){
if
(
max_behot_time
!=
null
){
url
=
url
+
"&max_behot_time="
+
max_behot_time
;
url
=
url
+
"&max_behot_time="
+
max_behot_time
;
}
}
System
.
out
.
println
(
"url=========="
+
url
);
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"Referer"
,
url
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
...
@@ -107,7 +106,7 @@ public class TouTiaoArticleParse {
...
@@ -107,7 +106,7 @@ public class TouTiaoArticleParse {
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
);
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
);
throw
e
;
throw
e
;
}
}
return
null
;
return
Collections
.
emptyMap
()
;
}
}
/**
/**
...
@@ -123,60 +122,66 @@ public class TouTiaoArticleParse {
...
@@ -123,60 +122,66 @@ public class TouTiaoArticleParse {
Signature
signature
=
new
Signature
(
user_id
,
max_behot_time
);
Signature
signature
=
new
Signature
(
user_id
,
max_behot_time
);
String
as
=
signature
.
getAs
();
String
as
=
signature
.
getAs
();
String
cp
=
signature
.
getCp
();
String
cp
=
signature
.
getCp
();
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
user_id
+
"&max_behot_time="
+
max_behot_time
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
signature
;
String
_signature
=
signature
.
getSignature
();
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
user_id
+
"&max_behot_time="
+
max_behot_time
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
_signature
;
if
(
max_behot_time
!=
null
){
if
(
max_behot_time
!=
null
){
url
=
url
+
"&max_behot_time="
+
max_behot_time
;
url
=
url
+
"&max_behot_time="
+
max_behot_time
;
}
}
System
.
out
.
println
(
"url=========="
+
url
);
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
headerMap
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
headerMap
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
try
{
for
(
int
i
=
0
;
i
<
3
;
i
++){
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
try
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
)){
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
user_id
,
htmlBody
,
endData
);
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
)){
if
(
ttList
!=
null
&&
ttList
.
size
()>
0
){
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
user_id
,
htmlBody
,
endData
);
return
ttList
;
if
(
ttList
!=
null
&&
ttList
.
size
()>
0
){
return
ttList
;
}
}
else
{
logger
.
info
(
"数据为null"
);
continue
;
}
}
}
else
{
}
catch
(
Exception
e
)
{
logger
.
info
(
"数据为null"
);
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
.
fillInStackTrace
());
throw
e
;
}
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
.
fillInStackTrace
());
throw
e
;
}
}
return
null
;
return
Collections
.
emptyMap
()
;
}
}
public
static
Map
<
String
,
Object
>
getTouTiaoHistory
(
String
user_id
,
String
max_behot_time
,
Date
endData
,
ProxyHolder
proxy
)
throws
Exception
{
public
static
Map
<
String
,
Object
>
getTouTiaoHistory
(
String
user_id
,
String
max_behot_time
,
Date
endData
,
ProxyHolder
proxy
)
throws
Exception
{
Signature
signature
=
new
Signature
(
user_id
,
max_behot_time
);
Signature
signature
=
new
Signature
(
user_id
,
max_behot_time
);
String
as
=
signature
.
getAs
();
String
as
=
signature
.
getAs
();
String
cp
=
signature
.
getCp
();
String
cp
=
signature
.
getCp
();
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
user_id
+
"&max_behot_time="
+
max_behot_time
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
signature
;
String
_signature
=
signature
.
getSignature
();
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
user_id
+
"&max_behot_time="
+
max_behot_time
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
_signature
;
if
(
max_behot_time
!=
null
){
if
(
max_behot_time
!=
null
){
url
=
url
+
"&max_behot_time="
+
max_behot_time
;
url
=
url
+
"&max_behot_time="
+
max_behot_time
;
}
}
System
.
out
.
println
(
"url=========="
+
url
);
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
headerMap
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
headerMap
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
try
{
for
(
int
i
=
0
;
i
<
3
;
i
++){
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
try
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
)){
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
user_id
,
htmlBody
,
endData
);
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
)){
if
(
ttList
!=
null
&&
ttList
.
size
()>
0
){
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
user_id
,
htmlBody
,
endData
);
return
ttList
;
if
(
ttList
!=
null
&&
ttList
.
size
()>
0
){
return
ttList
;
}
}
else
{
logger
.
info
(
"数据为null"
);
continue
;
}
}
}
else
{
}
catch
(
Exception
e
)
{
logger
.
info
(
"数据为null"
);
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
);
throw
e
;
}
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
);
throw
e
;
}
}
return
null
;
return
Collections
.
emptyMap
()
;
}
}
...
...
src/test/java/com/zhiwei/toutiao/test/TouTiaoExample.java
View file @
408ac5cd
...
@@ -16,6 +16,9 @@ import java.util.Date;
...
@@ -16,6 +16,9 @@ import java.util.Date;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.parse.TouTiaoArticleParse
;
import
com.zhiwei.toutiao.parse.TouTiaoArticleParse
;
...
@@ -26,25 +29,29 @@ import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
...
@@ -26,25 +29,29 @@ import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
* @date 2016年9月2日 上午11:48:51
* @date 2016年9月2日 上午11:48:51
*/
*/
public
class
TouTiaoExample
{
public
class
TouTiaoExample
{
private
static
final
String
registry
=
"zookeeper://192.168.0.36:2181"
;
private
static
final
String
group
=
"local"
;
@SuppressWarnings
(
"unchecked"
)
@SuppressWarnings
(
"unchecked"
)
public
static
void
main
(
String
[]
args
)
throws
Exception
{
public
static
void
main
(
String
[]
args
)
throws
Exception
{
long
a
=
System
.
currentTimeMillis
();
ProxyFactory
.
init
(
registry
,
group
,
GroupType
.
PROVIDER
);
List
<
String
>
urlList
=
new
ArrayList
<
String
>();
List
<
String
>
urlList
=
new
ArrayList
<
String
>();
urlList
.
add
(
"6075371636"
);
urlList
.
add
(
"6075371636"
);
Date
endTime
=
TimeParse
.
stringFormartDate
(
"2018-10-01"
);
System
.
out
.
println
(
urlList
.
size
());
Date
endTime
=
TimeParse
.
stringFormartDate
(
"2018-04-01"
);
for
(
String
url
:
urlList
)
{
for
(
String
url
:
urlList
)
{
long
a
=
System
.
currentTimeMillis
();
String
mid
=
url
;
String
mid
=
url
;
Long
max_behot_time
=
0L
;
Long
max_behot_time
=
0L
;
List
<
TouTiaoArticle
>
list
=
new
ArrayList
<>();
boolean
f
=
true
;
boolean
f
=
true
;
while
(
f
)
{
while
(
f
)
{
Map
<
String
,
Object
>
dataMap
=
null
;
Map
<
String
,
Object
>
dataMap
=
null
;
dataMap
=
TouTiaoArticleParse
.
get
MicroTouTiaoCrawler
(
mid
,
endTime
,
null
,
max_behot_time
+
""
);
dataMap
=
TouTiaoArticleParse
.
get
TouTiaoHistory
(
mid
,
max_behot_time
+
""
,
endTime
,
ProxyHolder
.
NAT_PROXY
);
if
(
dataMap
!=
null
)
{
if
(
dataMap
!=
null
&&
!
dataMap
.
isEmpty
()
)
{
List
<
TouTiaoArticle
>
ttlist
=
(
List
<
TouTiaoArticle
>)
dataMap
.
get
(
"data"
);
List
<
TouTiaoArticle
>
ttlist
=
(
List
<
TouTiaoArticle
>)
dataMap
.
get
(
"data"
);
max_behot_time
=
(
Long
)
dataMap
.
get
(
"max_behot_time"
);
max_behot_time
=
(
Long
)
dataMap
.
get
(
"max_behot_time"
);
System
.
out
.
println
(
max_behot_time
+
"======="
+
ttlist
.
size
());
System
.
out
.
println
(
max_behot_time
+
"======="
+
ttlist
.
size
());
...
@@ -52,16 +59,17 @@ public class TouTiaoExample {
...
@@ -52,16 +59,17 @@ public class TouTiaoExample {
f
=
false
;
f
=
false
;
}
else
{
}
else
{
if
(
ttlist
.
size
()
>
0
)
{
if
(
ttlist
.
size
()
>
0
)
{
for
(
TouTiaoArticle
tt
:
ttlist
)
{
list
.
addAll
(
ttlist
);
System
.
out
.
println
(
tt
);
}
}
}
}
}
}
else
{
f
=
false
;
}
}
}
}
long
b
=
System
.
currentTimeMillis
();
System
.
out
.
println
(
"一轮的采集时间为:"
+
(
b
-
a
)
/
1000
+
" 数据量为"
+
list
.
size
());
}
}
long
b
=
System
.
currentTimeMillis
();
System
.
out
.
println
(
"一轮的采集时间为:"
+
(
b
-
a
)
/
1000
);
}
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment