Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
toutiao
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
toutiao
Commits
f6290b0f
Commit
f6290b0f
authored
Jul 24, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改微头条采集内容解析
parent
f97d6fe2
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
128 additions
and
126 deletions
+128
-126
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
+5
-7
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
+2
-2
src/test/java/com/zhiwei/toutiao/test/TouTiaoChannelExample.java
+55
-50
src/test/java/com/zhiwei/toutiao/test/TouTiaoExample.java
+66
-67
No files found.
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
View file @
f6290b0f
...
...
@@ -80,8 +80,7 @@ public class TouTiaoAccountParse {
String
htmlBody
=
null
;
htmlBody
=
HttpClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
){
tta
=
parseAccountByUserId
(
htmlBody
,
user_id
);
tta
=
parseAccountByUserId
(
htmlBody
,
user_id
,
proxy
);
}
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
...
...
@@ -193,7 +192,6 @@ public class TouTiaoAccountParse {
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
try
{
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
System
.
out
.
println
(
data
.
toString
());
if
(
data
.
getLong
(
"id"
)
==
null
)
{
continue
;
}
...
...
@@ -214,7 +212,6 @@ public class TouTiaoAccountParse {
follow_count
=
data
.
getInteger
(
"follow_count"
);
}
String
img_url
=
"https:"
+
data
.
getString
(
"avatar_url"
);
System
.
out
.
println
(
data
.
getString
(
"create_time"
));
Date
create_time
=
null
;
if
(
data
.
getString
(
"create_time"
)
!=
null
)
{
create_time
=
new
Date
(
Long
.
valueOf
(
data
.
getString
(
"create_time"
))*
1000
);
...
...
@@ -254,9 +251,9 @@ public class TouTiaoAccountParse {
* @param @return 设定文件
* @return TouTiaoAccount 返回类型
*/
private
static
TouTiaoAccount
parseAccountByUserId
(
String
htmlBody
,
String
user_id
)
{
private
static
TouTiaoAccount
parseAccountByUserId
(
String
htmlBody
,
String
user_id
,
Proxy
proxy
)
{
try
{
TouTiaoAccount
touTiaoAccount
=
new
TouTiaoAccount
();
TouTiaoAccount
touTiaoAccount
=
new
TouTiaoAccount
();
;
if
(
htmlBody
.
contains
(
"var header={"
)){
String
name
=
htmlBody
.
split
(
"var header"
)[
1
].
split
(
"name:'"
)[
1
].
split
(
"',"
)[
0
];
String
img_url
=
"https:"
+
htmlBody
.
split
(
"avtar_img:'"
)[
1
].
split
(
"',"
)[
0
];
...
...
@@ -268,8 +265,9 @@ public class TouTiaoAccountParse {
touTiaoAccount
.
setFollow_count
(
fensi
);
}
touTiaoAccount
.
setId
(
user_id
);
touTiaoAccount
.
setUser_id
(
Long
.
valueOf
(
user_id
));
touTiaoAccount
.
setImg_url
(
img_url
);
touTiaoAccount
.
setName
(
name
);
touTiaoAccount
.
setName
(
name
);
touTiaoAccount
.
setUser_type
(
type
);
return
touTiaoAccount
;
}
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
View file @
f6290b0f
...
...
@@ -151,7 +151,6 @@ public class TouTiaoArticleParse {
}
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
String
htmlBody
=
HttpClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
parseHtmlByMicroAccount
(
htmlBody
,
endDate
);
...
...
@@ -175,6 +174,7 @@ public class TouTiaoArticleParse {
* @param @return 设定文件
* @return Map<String,Object> 返回类型
*/
@SuppressWarnings
(
"unlikely-arg-type"
)
private
static
Map
<
String
,
Object
>
parseHtmlByMicroAccount
(
String
htmlBody
,
Date
endDate
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
...
...
@@ -201,7 +201,7 @@ public class TouTiaoArticleParse {
date
=
new
Date
(
max_behot_time
*
1000
);
href
=
"https://www.toutiao.com/a"
+
data
.
getString
(
"thread_id"
);
source
=
data
.
getJSONObject
(
"ugc_user"
).
getString
(
"name"
);
content
=
data
.
getString
(
"
rich_
content"
);
content
=
data
.
getString
(
"content"
);
readNum
=
data
.
getInteger
(
"read_count"
)+
""
;
commentNum
=
data
.
getInteger
(
"comment_count"
)+
""
;
user_id
=
data
.
getJSONObject
(
"ugc_user"
).
getString
(
"user_id"
);
...
...
src/test/java/com/zhiwei/toutiao/test/TouTiaoChannelExample.java
View file @
f6290b0f
//package com.zhiwei.toutiao.test;
//
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoChannelParse;
//import com.zhiwei.toutiao.util.Tools;
//
///**
// * @ClassName: TouTiaoChannelExample
// * @Description: TODO(头条频道解析测试)
// * @author hero
// * @date 2017年7月24日 下午5:10:52
// */
//public class TouTiaoChannelExample {
//
// public static void main(String[] args) {
//
// long max_behot_time = 0;
// for(int i= 0;i<3; i++){
// System.out.println("i=============="+i);
// if( i==0 ){
// max_behot_time = 0;
// }
// String as = Tools.getAS().split("_")[0];
// String cp = Tools.getAS().split("_")[1];
// String url = "http://www.toutiao.com/api/pc/feed/?category=news_tech&utm_source=toutiao"
// + "&widen=1&max_behot_time="+max_behot_time+"&max_behot_time_tmp="+max_behot_time
// +"&tadrequire=true&as=" +as +"&cp=" + cp;
// System.out.println("url:" + url);
//
// Map<String,Object> result = TouTiaoChannelParse.touTiaoChannel(url, null);
// if(result!=null){
// Long next = (Long)result.get("next");
// List<TouTiaoArticle> ttList = (List<TouTiaoArticle>)result.get("data");
// System.out.println("ttlist size is " + ttList.size());
// for(TouTiaoArticle tt : ttList){
// System.out.println(tt);
// }
// if(next != null){
// max_behot_time = next;
// }else{
// break;
// }
// }
// }
// }
//
//}
package
com
.
zhiwei
.
toutiao
.
test
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.parse.TouTiaoChannelParse
;
import
com.zhiwei.toutiao.util.Tools
;
/**
 * @ClassName: TouTiaoChannelExample
 * @Description: Manual smoke test for Toutiao channel parsing. Requests up to
 *               {@code MAX_PAGES} pages of the {@code news_tech} feed through
 *               {@code TouTiaoChannelParse.touTiaoChannel} and prints every
 *               article that comes back.
 * @author hero
 * @date 2017-07-24 17:10:52
 */
public class TouTiaoChannelExample {

    /** Maximum number of feed pages requested in a single run. */
    private static final int MAX_PAGES = 3;

    /**
     * Pages through the channel feed, printing the request URL and each parsed
     * article. Paging stops early when the parser reports no "next" cursor.
     *
     * @param args unused
     */
    @SuppressWarnings("unchecked") // the parser hands back an untyped Map<String, Object>
    public static void main(String[] args) {
        long max_behot_time = 0;
        for (int i = 0; i < MAX_PAGES; i++) {
            System.out.println("i==============" + i);

            // Take both halves from ONE getAS() result. Calling getAS() twice could pair
            // an "as" from one signature with a "cp" from another.
            // NOTE(review): assumes getAS() returns "<as>_<cp>" - confirm against Tools.
            String[] signature = Tools.getAS().split("_");
            String as = signature[0];
            String cp = signature[1];

            String url = "http://www.toutiao.com/api/pc/feed/?category=news_tech&utm_source=toutiao"
                    + "&widen=1&max_behot_time=" + max_behot_time
                    + "&max_behot_time_tmp=" + max_behot_time
                    + "&tadrequire=true&as=" + as
                    + "&cp=" + cp;
            System.out.println("url:" + url);

            try {
                Map<String, Object> result = TouTiaoChannelParse.touTiaoChannel(url, null);
                if (result == null) {
                    // Nothing parsed for this page; the next iteration retries the same cursor.
                    continue;
                }

                Long next = (Long) result.get("next");
                List<TouTiaoArticle> ttList = (List<TouTiaoArticle>) result.get("data");

                // A result without a "data" entry is reported as an empty page instead of
                // failing with a NullPointerException.
                int articleCount = (ttList == null) ? 0 : ttList.size();
                System.out.println("ttlist size is " + articleCount);
                if (ttList != null) {
                    for (TouTiaoArticle tt : ttList) {
                        System.out.println(tt);
                    }
                }

                if (next == null) {
                    break; // no further cursor: the feed is exhausted
                }
                max_behot_time = next;
            } catch (Exception e) {
                // Best-effort example: report the failure and move on to the next iteration.
                e.printStackTrace();
            }
        }
    }
}
src/test/java/com/zhiwei/toutiao/test/TouTiaoExample.java
View file @
f6290b0f
///**
// * @Title: TouTiaoExample.java
// * @Package com.zhiwei.toutiao.test
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// * @version V1.0
// */
///**
//*
//*/
//package com.zhiwei.toutiao.test;
//
//import java.util.ArrayList;
//import java.util.Date;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
//import com.zhiwei.toutiao.util.Tools;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse;
//
///**
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// */
//public class TouTiaoExample {
//
// @SuppressWarnings("unchecked")
// public static void main(String[] args) throws Exception {
// long a = System.currentTimeMillis();
// List<String> urlList = new ArrayList<String>();
// urlList.add("6859134443");
//
// System.out.println(urlList.size());
//
// Date endTime = TimeParse.stringFormartDate("2018-04-01");
//
// for (String url : urlList) {
// String mid = url;
// String max_behot_time = "0";
// while (true) {
// Map<String, Object> dataMap = null;
// dataMap = TouTiaoArticleParse.getTouTiaoList(mid, max_behot_time, endTime,null);
// if (dataMap != null) {
// List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
// max_behot_time = (String) dataMap.get("max_behot_time");
// System.out.println(max_behot_time + "=======" + ttlist.size());
// if (max_behot_time == null || ttlist.isEmpty()) {
// break;
// } else {
// if (ttlist.size() > 0) {
// for (TouTiaoArticle tt : ttlist) {
// System.out.println(tt);
// }
// }
// }
// }
// }
// }
// long b = System.currentTimeMillis();
// System.out.println("一轮的采集时间为:" + (b - a) / 1000);
// }
//
//}
/**
* @Title: TouTiaoExample.java
* @Package com.zhiwei.toutiao.test
* @Description:
* @author hero
* @date 2016年9月2日 上午11:48:51
* @version V1.0
*/
/**
*
*/
package
com
.
zhiwei
.
toutiao
.
test
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.parse.TouTiaoArticleParse
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
/**
 * @Description: Manual smoke test for Toutiao article-list parsing. For each id in
 *               a hard-coded list it pages through
 *               {@code TouTiaoArticleParse.getTouTiaoList}, prints every article
 *               returned, and finally prints the elapsed time in seconds.
 * @author hero
 * @date 2016-09-02 11:48:51
 */
public class TouTiaoExample {

    /**
     * Number of consecutive {@code null} results from {@code getTouTiaoList} that are
     * tolerated before paging for the current id is abandoned.
     */
    private static final int MAX_CONSECUTIVE_FAILURES = 3;

    /**
     * Runs one collection round over the configured ids.
     *
     * @param args unused
     * @throws Exception propagated from {@code TimeParse.stringFormartDate} or
     *                   {@code TouTiaoArticleParse.getTouTiaoList}
     */
    @SuppressWarnings("unchecked") // the parser hands back an untyped Map<String, Object>
    public static void main(String[] args) throws Exception {
        long a = System.currentTimeMillis();

        // NOTE(review): presumably Toutiao account/media ids - confirm against getTouTiaoList.
        List<String> urlList = new ArrayList<String>();
        urlList.add("6859134443");
        System.out.println(urlList.size());

        // Passed to getTouTiaoList as its date argument.
        // NOTE(review): looks like a cut-off date for collection - confirm.
        Date endTime = TimeParse.stringFormartDate("2018-04-01");

        for (String mid : urlList) {
            String max_behot_time = "0";
            int consecutiveFailures = 0;

            while (true) {
                Map<String, Object> dataMap =
                        TouTiaoArticleParse.getTouTiaoList(mid, max_behot_time, endTime, null);

                if (dataMap == null) {
                    // Previously a null result re-issued the same request forever.
                    // Retry a bounded number of times, then give up on this id.
                    consecutiveFailures++;
                    if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
                        System.err.println("getTouTiaoList returned null " + consecutiveFailures
                                + " times in a row for " + mid + ", giving up");
                        break;
                    }
                    continue;
                }
                consecutiveFailures = 0;

                List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
                max_behot_time = (String) dataMap.get("max_behot_time");

                // Treat a missing "data" entry as an empty page rather than dereferencing null.
                int pageSize = (ttlist == null) ? 0 : ttlist.size();
                System.out.println(max_behot_time + "=======" + pageSize);

                if (max_behot_time == null || pageSize == 0) {
                    break; // no next cursor or nothing on this page: done with this id
                }
                for (TouTiaoArticle tt : ttlist) {
                    System.out.println(tt);
                }
            }
        }

        long b = System.currentTimeMillis();
        System.out.println("一轮的采集时间为:" + (b - a) / 1000);
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment