Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
toutiao
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
toutiao
Commits
c2e5c825
Commit
c2e5c825
authored
Nov 19, 2018
by
[zhangzhiwei]
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
微头条及头条文章采集
parent
26dc222c
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
57 additions
and
2 deletions
+57
-2
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
+56
-1
src/test/java/com/zhiwei/toutiao/test/TouTiaoExample.java
+1
-1
No files found.
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
View file @
c2e5c825
...
@@ -26,6 +26,7 @@ import com.alibaba.fastjson.JSONArray;
...
@@ -26,6 +26,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.util.Tools
;
import
com.zhiwei.toutiao.util.Tools
;
...
@@ -78,6 +79,36 @@ public class TouTiaoArticleParse {
...
@@ -78,6 +79,36 @@ public class TouTiaoArticleParse {
return
null
;
return
null
;
}
}
/**
 * Fetches one page of an account's article list from the Toutiao
 * {@code /pgc/ma/} endpoint and passes the response body to
 * {@code parseHtmlByAccount}.
 *
 * @param media_id       account id placed in the {@code media_id} query parameter
 * @param max_behot_time paging cursor appended as {@code max_behot_time};
 *                       omitted from the URL when {@code null}
 * @param endData        forwarded unchanged to {@code parseHtmlByAccount}
 * @param proxy          proxy handed to the HTTP call
 * @return the map produced by {@code parseHtmlByAccount} when it is non-empty;
 *         {@code null} when the body is {@code null}, does not contain
 *         {@code "behot_time"}, or the parsed map is {@code null}/empty
 * @throws Exception any failure raised while requesting or parsing; it is
 *                   logged and rethrown unchanged
 */
public static Map<String, Object> getTouTiaoList(String media_id, Long max_behot_time, Date endData,
        ProxyHolder proxy) throws Exception {
    // Split a single getAS() result so that "as" and "cp" always belong to the
    // same generated value; calling getAS() twice could pair halves of two
    // different results (NOTE(review): assumes getAS() may vary between calls).
    String[] signature = Tools.getAS().split("_");
    String as = signature[0];
    String cp = signature[1];

    String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + media_id
            + "&count=20&as=" + as + "&cp=" + cp;
    if (max_behot_time != null) {
        url = url + "&max_behot_time=" + max_behot_time;
    }
    // Route the diagnostic through the logger instead of stdout.
    logger.debug("url==========" + url);

    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    // The request URL itself is sent as the Referer header.
    headerMap.put("Referer", url);

    try {
        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
        if (htmlBody != null && htmlBody.contains("behot_time")) {
            Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
            if (ttList != null && ttList.size() > 0) {
                return ttList;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        // Log the exception as-is: fillInStackTrace() would overwrite the
        // original trace with this catch site before logging and rethrowing.
        logger.error("获取今日头条帐号数据连接超时", e);
        throw e;
    }
    return null;
}
/***
/***
* 根据帐号解析历史文章地址
* 根据帐号解析历史文章地址
*
*
...
@@ -178,7 +209,6 @@ public class TouTiaoArticleParse {
...
@@ -178,7 +209,6 @@ public class TouTiaoArticleParse {
System
.
out
.
println
(
url
);
System
.
out
.
println
(
url
);
try
{
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
System
.
out
.
println
(
htmlBody
);
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
parseHtmlByMicroAccount
(
htmlBody
,
endDate
);
Map
<
String
,
Object
>
dataMap
=
parseHtmlByMicroAccount
(
htmlBody
,
endDate
);
if
(
dataMap
!=
null
&&
dataMap
.
size
()>
0
){
if
(
dataMap
!=
null
&&
dataMap
.
size
()>
0
){
...
@@ -196,6 +226,31 @@ public class TouTiaoArticleParse {
...
@@ -196,6 +226,31 @@ public class TouTiaoArticleParse {
/**
 * Fetches one page of a user's micro-toutiao (UGC) content list and passes
 * the response body to {@code parseHtmlByMicroAccount}.
 *
 * <p>Every exception raised by the request or the parsing is caught, logged
 * and converted into a {@code null} return, so callers cannot distinguish
 * "no data" from "request failed" through the return value.
 *
 * @param user_id        user id embedded in the request path and in the Referer header
 * @param endDate        forwarded unchanged to {@code parseHtmlByMicroAccount}
 * @param proxy          proxy handed to the HTTP call
 * @param max_behot_time paging cursor sent as the {@code max_time} query
 *                       parameter; omitted from the URL when {@code null}
 * @return the map produced by {@code parseHtmlByMicroAccount} when it is
 *         non-empty; {@code null} when the body is {@code null}, the parsed
 *         map is {@code null}/empty, or any exception occurred
 * @throws IOException declared for interface compatibility; failures inside
 *                     the request/parse section are swallowed and reported as {@code null}
 */
public static Map<String, Object> getMicroTouTiaoCrawler(String user_id, Date endDate, ProxyHolder proxy,
        Long max_behot_time) throws IOException {
    String url = "https://www.toutiao.com/c/ugc/content/list/" + user_id + "/";
    if (max_behot_time != null) {
        url = url + "?max_time=" + max_behot_time;
    }

    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/");
    // Route the diagnostic through the logger instead of stdout.
    logger.debug(url);

    try {
        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
        if (htmlBody != null) {
            Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate);
            if (dataMap != null && dataMap.size() > 0) {
                return dataMap;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        // The original message carried a "{}" placeholder, but a lone Throwable
        // argument selects the (String, Throwable) overload, so the placeholder
        // was printed literally. Log at error level with the exception attached,
        // matching how getTouTiaoList reports failures.
        logger.error("获取数据出错,数据为null", e);
        return null;
    }
    return null;
}
/**
/**
* @Title: parseHtmlByMicroAccount
* @Title: parseHtmlByMicroAccount
* @author hero
* @author hero
...
...
src/test/java/com/zhiwei/toutiao/test/TouTiaoExample.java
View file @
c2e5c825
...
@@ -31,7 +31,7 @@ public class TouTiaoExample {
...
@@ -31,7 +31,7 @@ public class TouTiaoExample {
public
static
void
main
(
String
[]
args
)
throws
Exception
{
public
static
void
main
(
String
[]
args
)
throws
Exception
{
long
a
=
System
.
currentTimeMillis
();
long
a
=
System
.
currentTimeMillis
();
List
<
String
>
urlList
=
new
ArrayList
<
String
>();
List
<
String
>
urlList
=
new
ArrayList
<
String
>();
urlList
.
add
(
"
23782107381
"
);
urlList
.
add
(
"
6075371636
"
);
System
.
out
.
println
(
urlList
.
size
());
System
.
out
.
println
(
urlList
.
size
());
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment