Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
toutiao
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
toutiao
Commits
f6290b0f
Commit
f6290b0f
authored
Jul 24, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改微头条采集内容解析
parent
f97d6fe2
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
128 additions
and
126 deletions
+128
-126
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
+5
-7
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
+2
-2
src/test/java/com/zhiwei/toutiao/test/TouTiaoChannelExample.java
+55
-50
src/test/java/com/zhiwei/toutiao/test/TouTiaoExample.java
+66
-67
No files found.
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
View file @
f6290b0f
...
@@ -80,8 +80,7 @@ public class TouTiaoAccountParse {
...
@@ -80,8 +80,7 @@ public class TouTiaoAccountParse {
String
htmlBody
=
null
;
String
htmlBody
=
null
;
htmlBody
=
HttpClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
htmlBody
=
HttpClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
){
if
(
htmlBody
!=
null
){
tta
=
parseAccountByUserId
(
htmlBody
,
user_id
);
tta
=
parseAccountByUserId
(
htmlBody
,
user_id
,
proxy
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
e
.
fillInStackTrace
();
...
@@ -193,7 +192,6 @@ public class TouTiaoAccountParse {
...
@@ -193,7 +192,6 @@ public class TouTiaoAccountParse {
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
try
{
try
{
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
System
.
out
.
println
(
data
.
toString
());
if
(
data
.
getLong
(
"id"
)
==
null
)
{
if
(
data
.
getLong
(
"id"
)
==
null
)
{
continue
;
continue
;
}
}
...
@@ -214,7 +212,6 @@ public class TouTiaoAccountParse {
...
@@ -214,7 +212,6 @@ public class TouTiaoAccountParse {
follow_count
=
data
.
getInteger
(
"follow_count"
);
follow_count
=
data
.
getInteger
(
"follow_count"
);
}
}
String
img_url
=
"https:"
+
data
.
getString
(
"avatar_url"
);
String
img_url
=
"https:"
+
data
.
getString
(
"avatar_url"
);
System
.
out
.
println
(
data
.
getString
(
"create_time"
));
Date
create_time
=
null
;
Date
create_time
=
null
;
if
(
data
.
getString
(
"create_time"
)
!=
null
)
{
if
(
data
.
getString
(
"create_time"
)
!=
null
)
{
create_time
=
new
Date
(
Long
.
valueOf
(
data
.
getString
(
"create_time"
))*
1000
);
create_time
=
new
Date
(
Long
.
valueOf
(
data
.
getString
(
"create_time"
))*
1000
);
...
@@ -254,9 +251,9 @@ public class TouTiaoAccountParse {
...
@@ -254,9 +251,9 @@ public class TouTiaoAccountParse {
* @param @return 设定文件
* @param @return 设定文件
* @return TouTiaoAccount 返回类型
* @return TouTiaoAccount 返回类型
*/
*/
private
static
TouTiaoAccount
parseAccountByUserId
(
String
htmlBody
,
String
user_id
)
{
private
static
TouTiaoAccount
parseAccountByUserId
(
String
htmlBody
,
String
user_id
,
Proxy
proxy
)
{
try
{
try
{
TouTiaoAccount
touTiaoAccount
=
new
TouTiaoAccount
();
TouTiaoAccount
touTiaoAccount
=
new
TouTiaoAccount
();
;
if
(
htmlBody
.
contains
(
"var header={"
)){
if
(
htmlBody
.
contains
(
"var header={"
)){
String
name
=
htmlBody
.
split
(
"var header"
)[
1
].
split
(
"name:'"
)[
1
].
split
(
"',"
)[
0
];
String
name
=
htmlBody
.
split
(
"var header"
)[
1
].
split
(
"name:'"
)[
1
].
split
(
"',"
)[
0
];
String
img_url
=
"https:"
+
htmlBody
.
split
(
"avtar_img:'"
)[
1
].
split
(
"',"
)[
0
];
String
img_url
=
"https:"
+
htmlBody
.
split
(
"avtar_img:'"
)[
1
].
split
(
"',"
)[
0
];
...
@@ -268,8 +265,9 @@ public class TouTiaoAccountParse {
...
@@ -268,8 +265,9 @@ public class TouTiaoAccountParse {
touTiaoAccount
.
setFollow_count
(
fensi
);
touTiaoAccount
.
setFollow_count
(
fensi
);
}
}
touTiaoAccount
.
setId
(
user_id
);
touTiaoAccount
.
setId
(
user_id
);
touTiaoAccount
.
setUser_id
(
Long
.
valueOf
(
user_id
));
touTiaoAccount
.
setImg_url
(
img_url
);
touTiaoAccount
.
setImg_url
(
img_url
);
touTiaoAccount
.
setName
(
name
);
touTiaoAccount
.
setName
(
name
);
touTiaoAccount
.
setUser_type
(
type
);
touTiaoAccount
.
setUser_type
(
type
);
return
touTiaoAccount
;
return
touTiaoAccount
;
}
}
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
View file @
f6290b0f
...
@@ -151,7 +151,6 @@ public class TouTiaoArticleParse {
...
@@ -151,7 +151,6 @@ public class TouTiaoArticleParse {
}
}
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
String
htmlBody
=
HttpClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
String
htmlBody
=
HttpClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
parseHtmlByMicroAccount
(
htmlBody
,
endDate
);
Map
<
String
,
Object
>
dataMap
=
parseHtmlByMicroAccount
(
htmlBody
,
endDate
);
...
@@ -175,6 +174,7 @@ public class TouTiaoArticleParse {
...
@@ -175,6 +174,7 @@ public class TouTiaoArticleParse {
* @param @return 设定文件
* @param @return 设定文件
* @return Map<String,Object> 返回类型
* @return Map<String,Object> 返回类型
*/
*/
@SuppressWarnings
(
"unlikely-arg-type"
)
private
static
Map
<
String
,
Object
>
parseHtmlByMicroAccount
(
String
htmlBody
,
Date
endDate
)
{
private
static
Map
<
String
,
Object
>
parseHtmlByMicroAccount
(
String
htmlBody
,
Date
endDate
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
...
@@ -201,7 +201,7 @@ public class TouTiaoArticleParse {
...
@@ -201,7 +201,7 @@ public class TouTiaoArticleParse {
date
=
new
Date
(
max_behot_time
*
1000
);
date
=
new
Date
(
max_behot_time
*
1000
);
href
=
"https://www.toutiao.com/a"
+
data
.
getString
(
"thread_id"
);
href
=
"https://www.toutiao.com/a"
+
data
.
getString
(
"thread_id"
);
source
=
data
.
getJSONObject
(
"ugc_user"
).
getString
(
"name"
);
source
=
data
.
getJSONObject
(
"ugc_user"
).
getString
(
"name"
);
content
=
data
.
getString
(
"
rich_
content"
);
content
=
data
.
getString
(
"content"
);
readNum
=
data
.
getInteger
(
"read_count"
)+
""
;
readNum
=
data
.
getInteger
(
"read_count"
)+
""
;
commentNum
=
data
.
getInteger
(
"comment_count"
)+
""
;
commentNum
=
data
.
getInteger
(
"comment_count"
)+
""
;
user_id
=
data
.
getJSONObject
(
"ugc_user"
).
getString
(
"user_id"
);
user_id
=
data
.
getJSONObject
(
"ugc_user"
).
getString
(
"user_id"
);
...
...
src/test/java/com/zhiwei/toutiao/test/TouTiaoChannelExample.java
View file @
f6290b0f
//package com.zhiwei.toutiao.test;
package
com
.
zhiwei
.
toutiao
.
test
;
//
//import java.util.List;
import
java.util.List
;
//import java.util.Map;
import
java.util.Map
;
//
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
//import com.zhiwei.toutiao.parse.TouTiaoChannelParse;
import
com.zhiwei.toutiao.parse.TouTiaoChannelParse
;
//import com.zhiwei.toutiao.util.Tools;
import
com.zhiwei.toutiao.util.Tools
;
//
///**
/**
// * @ClassName: TouTiaoChannelExample
* @ClassName: TouTiaoChannelExample
// * @Description: TODO(头条频道解析测试)
* @Description: TODO(头条频道解析测试)
// * @author hero
* @author hero
// * @date 2017年7月24日 下午5:10:52
* @date 2017年7月24日 下午5:10:52
// */
*/
//public class TouTiaoChannelExample {
/**
 * Manual example that pages through the Toutiao "news_tech" channel feed and
 * prints every article returned by {@code TouTiaoChannelParse.touTiaoChannel}.
 *
 * <p>At most {@link #MAX_PAGES} pages are requested. Paging stops early when a
 * page carries no "next" cursor.
 */
public class TouTiaoChannelExample {

    /** Upper bound on the number of feed pages requested in one run. */
    private static final int MAX_PAGES = 3;

    /**
     * Requests up to {@link #MAX_PAGES} feed pages and prints their articles.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Paging cursor; 0 requests the first page.
        long max_behot_time = 0;
        for (int i = 0; i < MAX_PAGES; i++) {
            System.out.println("i==============" + i);

            // Read "as" and "cp" from a single getAS() result. The previous code
            // called getAS() twice, so the two halves could come from different
            // results. NOTE(review): presumably as/cp must be a matched pair --
            // confirm against Tools.getAS().
            String[] signature = Tools.getAS().split("_");
            String as = signature[0];
            String cp = signature[1];

            String url = "http://www.toutiao.com/api/pc/feed/?category=news_tech&utm_source=toutiao"
                    + "&widen=1&max_behot_time=" + max_behot_time + "&max_behot_time_tmp=" + max_behot_time
                    + "&tadrequire=true&as=" + as + "&cp=" + cp;
            System.out.println("url:" + url);

            try {
                Map<String, Object> result = TouTiaoChannelParse.touTiaoChannel(url, null);
                if (result == null) {
                    // Nothing parsed for this page; retry with the same cursor on the next iteration.
                    continue;
                }

                Long next = (Long) result.get("next");
                // The map is declared as Map<String, Object>, so this cast cannot be checked.
                @SuppressWarnings("unchecked")
                List<TouTiaoArticle> ttList = (List<TouTiaoArticle>) result.get("data");

                if (ttList != null) {
                    System.out.println("ttlist size is " + ttList.size());
                    for (TouTiaoArticle tt : ttList) {
                        System.out.println(tt);
                    }
                }

                if (next == null) {
                    // No cursor for a following page: stop paging.
                    break;
                }
                max_behot_time = next;
            } catch (Exception e) {
                // Example code: report the failure and move on to the next iteration.
                e.printStackTrace();
            }
        }
    }
}
src/test/java/com/zhiwei/toutiao/test/TouTiaoExample.java
View file @
f6290b0f
///**
/**
// * @Title: TouTiaoExample.java
* @Title: TouTiaoExample.java
// * @Package com.zhiwei.toutiao.test
* @Package com.zhiwei.toutiao.test
// * @Description:
* @Description:
// * @author hero
* @author hero
// * @date 2016年9月2日 上午11:48:51
* @date 2016年9月2日 上午11:48:51
// * @version V1.0
* @version V1.0
// */
*/
///**
/**
//*
*
//*/
*/
//package com.zhiwei.toutiao.test;
package
com
.
zhiwei
.
toutiao
.
test
;
//
//import java.util.ArrayList;
import
java.util.ArrayList
;
//import java.util.Date;
import
java.util.Date
;
//import java.util.List;
import
java.util.List
;
//import java.util.Map;
import
java.util.Map
;
//
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
//import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
import
com.zhiwei.toutiao.parse.TouTiaoArticleParse
;
//import com.zhiwei.toutiao.util.Tools;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse;
//
/**
///**
* @Description:
// * @Description:
* @author hero
// * @author hero
* @date 2016年9月2日 上午11:48:51
// * @date 2016年9月2日 上午11:48:51
*/
// */
/**
 * Manual example that collects the articles of one or more Toutiao accounts via
 * {@code TouTiaoArticleParse.getTouTiaoList} and prints them, paging with the
 * "max_behot_time" cursor returned by each call.
 */
public class TouTiaoExample {

    /**
     * Pages through every configured account id and prints the articles found,
     * then prints the elapsed time in seconds.
     *
     * @param args unused
     * @throws Exception if parsing the end date or fetching a page fails
     */
    public static void main(String[] args) throws Exception {
        long startMillis = System.currentTimeMillis();

        List<String> urlList = new ArrayList<String>();
        urlList.add("6859134443");
        System.out.println(urlList.size());

        // Passed through to getTouTiaoList as its end-date argument.
        Date endTime = TimeParse.stringFormartDate("2018-04-01");

        for (String mid : urlList) {
            // "0" requests the first page for this account.
            String max_behot_time = "0";
            while (true) {
                Map<String, Object> dataMap =
                        TouTiaoArticleParse.getTouTiaoList(mid, max_behot_time, endTime, null);
                if (dataMap == null) {
                    // Treat a null result as the end of paging for this account.
                    // The previous code had no exit here and re-issued the same
                    // request forever.
                    break;
                }

                // The map is declared as Map<String, Object>, so this cast cannot be checked.
                @SuppressWarnings("unchecked")
                List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
                if (ttlist == null) {
                    // A missing "data" entry is handled like an empty page instead of an NPE.
                    ttlist = new ArrayList<TouTiaoArticle>();
                }
                max_behot_time = (String) dataMap.get("max_behot_time");
                System.out.println(max_behot_time + "=======" + ttlist.size());

                if (max_behot_time == null || ttlist.isEmpty()) {
                    // No further cursor or no articles: this account is done.
                    break;
                }
                for (TouTiaoArticle tt : ttlist) {
                    System.out.println(tt);
                }
            }
        }

        long endMillis = System.currentTimeMillis();
        System.out.println("一轮的采集时间为:" + (endMillis - startMillis) / 1000);
    }
}
//}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment