Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
toutiao
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
toutiao
Commits
314e5609
Commit
314e5609
authored
Jun 20, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
今日头条采集程序初次提交
parents
Hide whitespace changes
Inline
Side-by-side
Showing
26 changed files
with
3351 additions
and
0 deletions
+3351
-0
pom.xml
+69
-0
src/main/java/com/zhiwei/toutiao/bean/TouTiaoAccount.java
+202
-0
src/main/java/com/zhiwei/toutiao/bean/TouTiaoArticle.java
+130
-0
src/main/java/com/zhiwei/toutiao/bean/TouTiaoComment.java
+108
-0
src/main/java/com/zhiwei/toutiao/bean/TouTiaoQuestion.java
+149
-0
src/main/java/com/zhiwei/toutiao/bean/TouTiaoQuestionAnswer.java
+132
-0
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
+386
-0
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
+243
-0
src/main/java/com/zhiwei/toutiao/parse/TouTiaoChannelParse.java
+146
-0
src/main/java/com/zhiwei/toutiao/parse/TouTiaoCommentParse.java
+293
-0
src/main/java/com/zhiwei/toutiao/parse/TouTiaoParse.java
+177
-0
src/main/java/com/zhiwei/toutiao/parse/TouTiaoQuestionAnswerParse.java
+115
-0
src/main/java/com/zhiwei/toutiao/parse/TouTiaoQuestionParse.java
+107
-0
src/main/java/com/zhiwei/toutiao/parse/TouTiaoSearchParse.java
+145
-0
src/main/java/com/zhiwei/toutiao/util/Config.java
+30
-0
src/main/java/com/zhiwei/toutiao/util/Tools.java
+268
-0
src/main/java/com/zhiwei/wangyi/bean/WangYiNews.java
+103
-0
src/main/java/com/zhiwei/wangyi/parse/WangyiNewParse.java
+111
-0
src/test/java/com/zhiwei/toutiao/test/TouTiaoAccountExample.java
+33
-0
src/test/java/com/zhiwei/toutiao/test/TouTiaoChannelExample.java
+50
-0
src/test/java/com/zhiwei/toutiao/test/TouTiaoCommentExample.java
+52
-0
src/test/java/com/zhiwei/toutiao/test/TouTiaoExample.java
+67
-0
src/test/java/com/zhiwei/toutiao/test/TouTiaoMicroExample.java
+76
-0
src/test/java/com/zhiwei/toutiao/test/TouTiaoQuestionAnswerExample.java
+90
-0
src/test/java/com/zhiwei/toutiao/test/TouTiaoQuestionExample.java
+35
-0
src/test/java/com/zhiwei/toutiao/test/TouTiaoSearchExample.java
+34
-0
No files found.
pom.xml
0 → 100644
View file @
314e5609
<project
xmlns=
"http://maven.apache.org/POM/4.0.0"
xmlns:xsi=
"http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation=
"http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
toutiao
</artifactId>
<version>
0.2.2-SNAPSHOT
</version>
<dependencies>
<dependency>
<groupId>
com.zhiwei
</groupId>
<artifactId>
zhiweiTools
</artifactId>
<version>
0.0.6-SNAPSHOT
</version>
</dependency>
</dependencies>
<!-- Build and packaging configuration -->
<build>
<plugins>
<!-- Attach and publish the sources JAR -->
<plugin>
<artifactId>
maven-source-plugin
</artifactId>
<version>
2.4
</version>
<configuration>
<attach>
true
</attach>
</configuration>
<executions>
<execution>
<phase>
compile
</phase>
<goals>
<goal>
jar
</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>
org.apache.maven.plugins
</groupId>
<artifactId>
maven-javadoc-plugin
</artifactId>
<version>
2.10.4
</version>
</plugin>
<!-- Force UTF-8 so Chinese text is not garbled on the console when running "mvn test" -->
<plugin>
<groupId>
org.apache.maven.plugins
</groupId>
<artifactId>
maven-surefire-plugin
</artifactId>
<version>
2.7.2
</version>
<configuration>
<forkMode>
once
</forkMode>
<argLine>
-Dfile.encoding=UTF-8
</argLine>
</configuration>
</plugin>
</plugins>
</build>
<!-- Distribution management: repositories that snapshot and release artifacts are deployed to -->
<distributionManagement>
<snapshotRepository>
<id>
nexus-releases
</id>
<name>
User Porject Snapshot
</name>
<url>
http://192.168.0.30:8081/nexus/content/repositories/snapshots/
</url >
<uniqueVersion>
true
</uniqueVersion>
</snapshotRepository>
<repository>
<id>
nexus-releases
</id>
<name>
User Porject Release
</name>
<url>
http://192.168.0.30:8081/nexus/content/repositories/releases/
</url
>
</repository>
</distributionManagement>
</project>
\ No newline at end of file
src/main/java/com/zhiwei/toutiao/bean/TouTiaoAccount.java
0 → 100644
View file @
314e5609
package
com
.
zhiwei
.
toutiao
.
bean
;
import
java.io.Serializable
;
import
java.util.Date
;
/**
 * Profile information for a Toutiao (今日头条) account.
 *
 * <p>The fields are public and use snake_case names; both are kept as-is so that code
 * reading the fields directly, or relying on the accessor names, keeps working.
 *
 * @author hero
 */
public class TouTiaoAccount implements Serializable {

    private static final long serialVersionUID = -7447778477165461146L;

    /** Primary key: the account id rendered as a string. */
    public String id;
    /** Account id. */
    public Long user_id;
    /** Account nickname. */
    public String name;
    /** Media id (meaning not documented by the original author). */
    public Long media_id;
    /** Account description. */
    public String description;
    /** Verification flag: 0 = not verified, 1 = verified. */
    public Integer user_verified;
    /** Reason/text of the verification. */
    public String verify_content;
    /** Number of followers (fans). */
    public Integer follow_count;
    /** Number of accounts this account follows; {@code null} until explicitly set. */
    public Integer friend_count;
    /** Avatar image URL. */
    public String img_url;
    /** Account registration time. */
    public Date create_time;
    /** Gender. */
    public String gender;
    /** User type. */
    public String user_type;

    /** Creates an account with every field unset ({@code null}). */
    public TouTiaoAccount() {
    }

    /**
     * Creates an account from scraped values. {@code id} is derived from {@code user_id};
     * {@code friend_count} is not part of this constructor and stays {@code null}.
     *
     * @param user_id        account id (also used, as a string, for {@code id})
     * @param name           nickname
     * @param media_id       media id
     * @param description    description
     * @param user_verified  0 = not verified, 1 = verified
     * @param verify_content verification text
     * @param follow_count   number of followers
     * @param img_url        avatar URL
     * @param create_time    registration time
     * @param gender         gender
     * @param user_type      user type
     */
    public TouTiaoAccount(Long user_id, String name, Long media_id, String description,
            Integer user_verified, String verify_content, Integer follow_count, String img_url,
            Date create_time, String gender, String user_type) {
        // Same result as the historical `user_id + ""` (a null id becomes the text "null").
        this.id = String.valueOf(user_id);
        this.user_id = user_id;
        this.name = name;
        this.media_id = media_id;
        this.description = description;
        this.user_verified = user_verified;
        this.verify_content = verify_content;
        this.follow_count = follow_count;
        this.img_url = img_url;
        this.create_time = create_time;
        this.gender = gender;
        this.user_type = user_type;
    }

    @Override
    public String toString() {
        return "new TouTiaoAccount["
                + "id = " + id
                + ", user_id = " + user_id
                + ", name = " + name
                + ", media_id = " + media_id
                + ", description = " + description
                + ", user_verified = " + user_verified
                + ", verify_content = " + verify_content
                + ", follow_count = " + follow_count
                + ", friend_count = " + friend_count
                + ", img_url = " + img_url
                + ", create_time = " + create_time
                + ", gender = " + gender
                + ", user_type = " + user_type
                + "]";
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public Long getUser_id() {
        return user_id;
    }

    public void setUser_id(Long user_id) {
        this.user_id = user_id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public Long getMedia_id() {
        return media_id;
    }

    public void setMedia_id(Long media_id) {
        this.media_id = media_id;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public Integer getUser_verified() {
        return user_verified;
    }

    public void setUser_verified(Integer user_verified) {
        this.user_verified = user_verified;
    }

    public Integer getFollow_count() {
        return follow_count;
    }

    public void setFollow_count(Integer follow_count) {
        this.follow_count = follow_count;
    }

    public String getVerify_content() {
        return verify_content;
    }

    public void setVerify_content(String verify_content) {
        this.verify_content = verify_content;
    }

    public String getImg_url() {
        return img_url;
    }

    public void setImg_url(String img_url) {
        this.img_url = img_url;
    }

    public Date getCreate_time() {
        return create_time;
    }

    public void setCreate_time(Date create_time) {
        this.create_time = create_time;
    }

    public String getGender() {
        return gender;
    }

    public void setGender(String gender) {
        this.gender = gender;
    }

    /**
     * Returns the number of accounts this account follows.
     *
     * @return the following count, or 0 when it was never set. The field is a nullable
     *         {@code Integer} that no constructor initialises, so unboxing it directly
     *         would throw {@code NullPointerException}.
     */
    public int getFriend_count() {
        return friend_count == null ? 0 : friend_count;
    }

    public void setFriend_count(int friend_count) {
        this.friend_count = friend_count;
    }

    public String getUser_type() {
        return user_type;
    }

    public void setUser_type(String user_type) {
        this.user_type = user_type;
    }
}
src/main/java/com/zhiwei/toutiao/bean/TouTiaoArticle.java
0 → 100644
View file @
314e5609
/**
* @Title: TouTiao.java
* @Package com.zhiwei.toutiao.entity
* @Description:
* @author hero
* @date 2016年9月2日 上午8:47:13
* @version V1.0
*/
/**
*
*/
package
com
.
zhiwei
.
toutiao
.
bean
;
import
java.io.Serializable
;
import
java.util.Date
;
/**
* @Description:
* @author hero
* @date 2016年9月2日 上午8:47:13
*/
public
class
TouTiaoArticle
implements
Serializable
{
private
static
final
long
serialVersionUID
=
7745861002592578553L
;
private
String
url
;
private
String
title
;
private
String
type
;
private
String
source
;
private
String
user_id
;
private
Date
time
;
private
String
content
;
private
String
commentCount
;
private
String
playCount
;
private
String
readNum
;
public
String
getCommentCount
()
{
return
commentCount
;
}
public
void
setCommentCount
(
String
commentCount
)
{
this
.
commentCount
=
commentCount
;
}
public
String
getPlayCount
()
{
return
playCount
;
}
public
void
setPlayCount
(
String
playCount
)
{
this
.
playCount
=
playCount
;
}
public
String
getReadNum
()
{
return
readNum
;
}
public
void
setReadNum
(
String
readNum
)
{
this
.
readNum
=
readNum
;
}
public
String
getUrl
()
{
return
url
;
}
public
void
setUrl
(
String
url
)
{
this
.
url
=
url
;
}
public
String
getTitle
()
{
return
title
;
}
public
void
setTitle
(
String
title
)
{
this
.
title
=
title
;
}
public
String
getSource
()
{
return
source
;
}
public
void
setSource
(
String
source
)
{
this
.
source
=
source
;
}
public
Date
getTime
()
{
return
time
;
}
public
void
setTime
(
Date
time
)
{
this
.
time
=
time
;
}
public
String
getContent
()
{
return
content
;
}
public
void
setContent
(
String
content
)
{
this
.
content
=
content
;
}
public
TouTiaoArticle
(){}
public
String
getUser_id
()
{
return
user_id
;
}
public
void
setUser_id
(
String
user_id
)
{
this
.
user_id
=
user_id
;
}
public
String
getType
()
{
return
type
;
}
public
void
setType
(
String
type
)
{
this
.
type
=
type
;
}
public
TouTiaoArticle
(
String
url
,
String
title
,
String
user_id
,
String
source
,
Date
time
,
String
content
,
String
commentCount
,
String
playCount
,
String
readNum
,
String
type
)
{
this
.
url
=
url
;
this
.
title
=
title
;
this
.
type
=
type
;
this
.
source
=
source
;
this
.
user_id
=
user_id
;
this
.
time
=
time
;
this
.
content
=
content
;
this
.
readNum
=
readNum
;
this
.
playCount
=
playCount
;
this
.
commentCount
=
commentCount
;
}
public
String
toString
()
{
return
"new TouTiaoArticle["
+
"url = "
+
url
+
", title = "
+
title
+
", type = "
+
type
+
", source = "
+
source
+
", user_id = "
+
user_id
+
", time = "
+
time
+
", content = "
+
content
+
", commentCount = "
+
commentCount
+
", playCount = "
+
playCount
+
", readNum = "
+
readNum
+
"]"
;
}
}
src/main/java/com/zhiwei/toutiao/bean/TouTiaoComment.java
0 → 100644
View file @
314e5609
package
com
.
zhiwei
.
toutiao
.
bean
;
import
java.io.Serializable
;
import
java.util.Date
;
/**
 * A single comment collected from Toutiao, together with the URL it was posted under.
 */
public class TouTiaoComment implements Serializable {

    private static final long serialVersionUID = -1536817427402243392L;

    private String id;
    private String text;
    private String userName;
    private Integer reply_count;
    private Integer digg_count;
    private Date time;
    private String source_url;

    /**
     * Creates a fully populated comment.
     *
     * @param id          comment id
     * @param text        comment text
     * @param userName    name of the commenting user
     * @param reply_count number of replies
     * @param digg_count  number of likes ("diggs")
     * @param time        time of the comment
     * @param source_url  URL the comment belongs to
     */
    public TouTiaoComment(String id, String text, String userName, Integer reply_count,
            Integer digg_count, Date time, String source_url) {
        this.id = id;
        this.text = text;
        this.userName = userName;
        this.reply_count = reply_count;
        this.digg_count = digg_count;
        this.time = time;
        this.source_url = source_url;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getText() {
        return text;
    }

    public void setText(String text) {
        this.text = text;
    }

    public String getUserName() {
        return userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public Integer getReply_count() {
        return reply_count;
    }

    public void setReply_count(Integer reply_count) {
        this.reply_count = reply_count;
    }

    public Integer getDigg_count() {
        return digg_count;
    }

    public void setDigg_count(Integer digg_count) {
        this.digg_count = digg_count;
    }

    public Date getTime() {
        return time;
    }

    public void setTime(Date time) {
        this.time = time;
    }

    public String getSource_url() {
        return source_url;
    }

    public void setSource_url(String source_url) {
        this.source_url = source_url;
    }

    /** Renders every field in declaration order; unset fields print as {@code null}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("new TouTiaoComment[");
        text.append("id = ").append(id);
        text.append(", text = ").append(this.text);
        text.append(", userName = ").append(userName);
        text.append(", reply_count = ").append(reply_count);
        text.append(", digg_count = ").append(digg_count);
        text.append(", time = ").append(time);
        text.append(", source_url = ").append(source_url);
        return text.append("]").toString();
    }
}
src/main/java/com/zhiwei/toutiao/bean/TouTiaoQuestion.java
0 → 100644
View file @
314e5609
package
com
.
zhiwei
.
toutiao
.
bean
;
import
java.io.Serializable
;
import
java.util.Date
;
/**
 * A question collected from Toutiao Q&amp;A (also known as Wukong Q&amp;A).
 *
 * @author hero
 */
public class TouTiaoQuestion implements Serializable {

    private static final long serialVersionUID = 7743044965507540483L;

    /** Link to the question. */
    private String url;
    /** Question title. */
    private String title;
    /** Question body. */
    private String content;
    /** Publisher of the question. */
    private String source;
    /** Publication time. */
    private Date time;
    /** Number of followers of the question. */
    private Integer follow_count;
    /** Number of featured ("nice") answers. */
    private Integer nice_ans_count;
    /** Number of regular answers. */
    private Integer normal_ans_count;
    /** Total number of answers. */
    private Integer ans_count;

    /**
     * Creates a fully populated question. Note that {@code source} precedes
     * {@code content} in the parameter list.
     *
     * @param url              link to the question
     * @param title            title
     * @param source           publisher
     * @param content          question body
     * @param time             publication time
     * @param follow_count     number of followers
     * @param nice_ans_count   number of featured answers
     * @param normal_ans_count number of regular answers
     * @param ans_count        total number of answers
     */
    public TouTiaoQuestion(String url, String title, String source, String content, Date time,
            int follow_count, int nice_ans_count, int normal_ans_count, int ans_count) {
        this.url = url;
        this.title = title;
        this.source = source;
        this.content = content;
        this.time = time;
        this.follow_count = follow_count;
        this.nice_ans_count = nice_ans_count;
        this.normal_ans_count = normal_ans_count;
        this.ans_count = ans_count;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public Date getTime() {
        return time;
    }

    public void setTime(Date time) {
        this.time = time;
    }

    public Integer getFollow_count() {
        return follow_count;
    }

    public void setFollow_count(Integer follow_count) {
        this.follow_count = follow_count;
    }

    public Integer getNice_ans_count() {
        return nice_ans_count;
    }

    public void setNice_ans_count(Integer nice_ans_count) {
        this.nice_ans_count = nice_ans_count;
    }

    public Integer getNormal_ans_count() {
        return normal_ans_count;
    }

    public void setNormal_ans_count(Integer normal_ans_count) {
        this.normal_ans_count = normal_ans_count;
    }

    public Integer getAns_count() {
        return ans_count;
    }

    public void setAns_count(Integer ans_count) {
        this.ans_count = ans_count;
    }

    /** Renders every field in declaration order; unset fields print as {@code null}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("new TouTiaoQuestion[");
        text.append("url = ").append(url);
        text.append(", title = ").append(title);
        text.append(", content = ").append(content);
        text.append(", source = ").append(source);
        text.append(", time = ").append(time);
        text.append(", follow_count = ").append(follow_count);
        text.append(", nice_ans_count = ").append(nice_ans_count);
        text.append(", normal_ans_count = ").append(normal_ans_count);
        text.append(", ans_count = ").append(ans_count);
        return text.append("]").toString();
    }
}
src/main/java/com/zhiwei/toutiao/bean/TouTiaoQuestionAnswer.java
0 → 100644
View file @
314e5609
package
com
.
zhiwei
.
toutiao
.
bean
;
import
java.io.Serializable
;
import
java.util.Date
;
/**
* @ClassName: TouTiaoQuestionAnswer
* @Description: TODO(头条问答的回答)
* @author hero
* @date 2017年7月28日 下午6:13:10
*/
public
class
TouTiaoQuestionAnswer
implements
Serializable
{
private
static
final
long
serialVersionUID
=
-
252511004299401886L
;
private
String
id
;
//回答id,主键
private
String
questionId
;
//问题id
private
String
content
;
//回答内容
private
String
user_id
;
//用户昵称
private
String
username
;
//用户昵称
private
Date
time
;
//回答时间
private
int
comment_count
;
//评论数
private
int
digg_count
;
//点赞数
public
TouTiaoQuestionAnswer
(){}
public
TouTiaoQuestionAnswer
(
String
id
,
String
questionId
,
String
content
,
String
user_id
,
String
username
,
Date
time
,
int
comment_count
,
int
digg_count
){
this
.
id
=
id
;
this
.
questionId
=
questionId
;
this
.
content
=
content
;
this
.
user_id
=
user_id
;
this
.
username
=
username
;
this
.
time
=
time
;
this
.
comment_count
=
comment_count
;
this
.
digg_count
=
digg_count
;
}
@Override
public
String
toString
(){
return
"new TouTiaoQuestionAnswer["
+
"id = "
+
id
+
", questionId = "
+
questionId
+
", content = "
+
content
+
", user_id = "
+
user_id
+
", username = "
+
username
+
", time = "
+
time
+
", comment_count = "
+
comment_count
+
", digg_count = "
+
digg_count
+
"]"
;
}
public
String
getId
()
{
return
id
;
}
public
void
setId
(
String
id
)
{
this
.
id
=
id
;
}
public
String
getQuestionId
()
{
return
questionId
;
}
public
void
setQuestionId
(
String
questionId
)
{
this
.
questionId
=
questionId
;
}
public
String
getContent
()
{
return
content
;
}
public
void
setContent
(
String
content
)
{
this
.
content
=
content
;
}
public
String
getUsername
()
{
return
username
;
}
public
void
setUsername
(
String
username
)
{
this
.
username
=
username
;
}
public
Date
getTime
()
{
return
time
;
}
public
void
setTime
(
Date
time
)
{
this
.
time
=
time
;
}
public
int
getComment_count
()
{
return
comment_count
;
}
public
void
setComment_count
(
int
comment_count
)
{
this
.
comment_count
=
comment_count
;
}
public
int
getDigg_count
()
{
return
digg_count
;
}
public
void
setDigg_count
(
int
digg_count
)
{
this
.
digg_count
=
digg_count
;
}
public
String
getUser_id
()
{
return
user_id
;
}
public
void
setUser_id
(
String
user_id
)
{
this
.
user_id
=
user_id
;
}
}
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
0 → 100644
View file @
314e5609
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.toutiao.bean.TouTiaoAccount
;
import
com.zhiwei.toutiao.util.Tools
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.tools.URLCodeUtil
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
/**
 * Collects Toutiao (今日头条) account information: lookup by exact account name, by user
 * id, by search keyword, and the list of accounts a user follows.
 *
 * <p>All entry points are static. Requests are issued through
 * {@code HttpClientTemplateOK.get} using the headers returned by
 * {@code Tools.getTouTiaoHeader()}; each call works on its own header map instead of a
 * shared static one, so one request can no longer overwrite another request's headers.
 *
 * @author hero
 */
public class TouTiaoAccountParse {

    private static final Logger logger = LoggerFactory.getLogger(TouTiaoAccountParse.class);

    /** Results requested per search page; also the offset step between pages. */
    private static final int SEARCH_PAGE_SIZE = 20;

    /** Users requested per "following" page; also the offset step between pages. */
    private static final int FOLLOWING_PAGE_SIZE = 50;

    /** Query suffix selecting the general search tab. */
    private static final String GENERAL_TAB = "&cur_tab=1";

    /** Query suffix selecting the media-account search tab. */
    private static final String MEDIA_TAB = "&cur_tab=4&from=media";

    /**
     * Looks up the account whose name equals {@code name} exactly.
     *
     * <p>The general search tab is queried first; when it yields no exact match the
     * media-account tab is queried as a fallback.
     *
     * @param name  account name to look for
     * @param proxy proxy handed to the HTTP client
     * @return the matching account, or {@code null} when none is found or a request fails
     */
    public static TouTiaoAccount getTouTiaoAccountInfoByName(String name, Proxy proxy) {
        String encodedName = URLCodeUtil.getURLEncode(name, "utf-8");
        String url = searchUrl(0, encodedName, GENERAL_TAB);
        Map<String, String> headers = Tools.getTouTiaoHeader();
        try {
            TouTiaoAccount account = null;
            String htmlBody = HttpClientTemplateOK.get(url, proxy, headers);
            if (containsAccounts(htmlBody)) {
                account = parseHtmlByAccount(htmlBody, name, proxy);
            }
            if (account == null) {
                // Fallback: same keyword on the media tab, with the search page as Referer.
                url = searchUrl(0, encodedName, MEDIA_TAB);
                headers.put("Referer", "https://www.toutiao.com/search/?keyword=" + encodedName);
                htmlBody = HttpClientTemplateOK.get(url, proxy, headers);
                if (containsAccounts(htmlBody)) {
                    account = parseHtmlByAccount(htmlBody, name, proxy);
                }
            }
            return account;
        } catch (Exception e) {
            // Log the exception itself: fillInStackTrace() would replace the original
            // stack trace with this catch site.
            logger.error("获取今日头条帐号数据连接超时", e);
            return null;
        }
    }

    /**
     * Loads the profile page of a user and extracts the account details embedded in it.
     *
     * @param user_id id of the user
     * @param proxy   proxy handed to the HTTP client
     * @return the account, or {@code null} when the page lacks the expected data or the
     *         request fails
     */
    public static TouTiaoAccount getTouTiaoAccountInfoByUserId(String user_id, Proxy proxy) {
        String url = "https://www.toutiao.com/c/user/" + user_id + "/";
        Map<String, String> headers = Tools.getTouTiaoHeader();
        try {
            String htmlBody = HttpClientTemplateOK.get(url, proxy, headers);
            if (htmlBody != null && htmlBody.contains("mediaId")) {
                return parseAccountByUserId(htmlBody, user_id);
            }
            return null;
        } catch (Exception e) {
            logger.error("获取今日头条帐号数据连接超时", e);
            return null;
        }
    }

    /**
     * Searches media accounts by keyword, following the result pages until the service
     * reports no more results.
     *
     * <p>Every account found triggers an additional profile request preceded by a
     * {@code ZhiWeiTools.sleep(1000)} pause, so large result sets take a while.
     *
     * @param word  search keyword
     * @param proxy proxy handed to the HTTP client
     * @return the accounts collected so far; never {@code null}. Paging stops at the first
     *         page that fails, cannot be parsed, or contains no account data.
     */
    public static List<TouTiaoAccount> getTouTiaoAccountInfoByWord(String word, Proxy proxy) {
        List<TouTiaoAccount> list = new ArrayList<TouTiaoAccount>();
        String encodedWord = URLCodeUtil.getURLEncode(word, "utf-8");
        boolean hasMore = true;
        int page = 0;
        while (hasMore) {
            String url = searchUrl(page * SEARCH_PAGE_SIZE, encodedWord, MEDIA_TAB);
            Map<String, String> headers = Tools.getTouTiaoHeader();
            try {
                String htmlBody = HttpClientTemplateOK.get(url, proxy, headers);
                if (!containsAccounts(htmlBody)) {
                    // Without this stop an empty/blocked response left the loop flag set
                    // and the method requested page after page forever.
                    break;
                }
                JSONObject json = JSONObject.parseObject(htmlBody);
                List<TouTiaoAccount> parsed = parseHtmlByWord(json, proxy);
                if (parsed == null) {
                    // The page could not be parsed at all; stop paging.
                    break;
                }
                list.addAll(parsed);
                hasMore = json.getIntValue("has_more") != 0;
                page++;
            } catch (Exception e) {
                logger.error("获取今日头条帐号数据连接超时", e);
                hasMore = false;
            }
        }
        return list;
    }

    /**
     * Fetches the accounts that a user follows, page by page.
     *
     * @param userid id of the user whose following list is requested
     * @param proxy  proxy handed to the HTTP client
     * @param sleep  pause passed to {@code ZhiWeiTools.sleep} after each page
     *               (presumably milliseconds - TODO confirm against ZhiWeiTools)
     * @return the followed accounts, or {@code null} when any request throws - in that
     *         case pages already collected are discarded (historical contract, kept so
     *         callers that test for {@code null} keep working)
     */
    public static List<TouTiaoAccount> getFriendsList(String userid, Proxy proxy, long sleep) {
        List<TouTiaoAccount> ttaList = new ArrayList<TouTiaoAccount>();
        boolean more = true;
        int page = 0;
        while (more) {
            String url = "http://is.snssdk.com/user/following/?offset=" + page * FOLLOWING_PAGE_SIZE
                    + "&device_id=35330393347&count=" + FOLLOWING_PAGE_SIZE
                    + "&user_id=" + userid
                    + "&ts=" + System.currentTimeMillis() / 1000;
            page++;
            Map<String, String> headers = Tools.getTouTiaoHeader();
            headers.put("Host", "is.snssdk.com");
            try {
                String htmlBody = HttpClientTemplateOK.get(url, proxy, headers);
                if (htmlBody != null && htmlBody.contains("name")) {
                    JSONObject json = JSONObject.parseObject(htmlBody);
                    more = json.getJSONObject("data").getBooleanValue("has_more");
                    List<TouTiaoAccount> dataList = parseHtmlByFans(json);
                    if (dataList != null && dataList.size() > 0) {
                        ttaList.addAll(dataList);
                    } else {
                        // An empty page ends paging even if has_more claims otherwise.
                        more = false;
                    }
                } else {
                    more = false;
                }
                ZhiWeiTools.sleep(sleep);
            } catch (Exception e) {
                logger.error("获取今日头条帐号数据连接超时", e);
                return null;
            }
        }
        return ttaList;
    }

    /** Builds a search_content URL for the given offset, encoded keyword and tab suffix. */
    private static String searchUrl(int offset, String encodedKeyword, String tabQuery) {
        return "https://www.toutiao.com/search_content/?offset=" + offset
                + "&format=json&keyword=" + encodedKeyword
                + "&autoload=true&count=" + SEARCH_PAGE_SIZE + tabQuery;
    }

    /** Tells whether a search response carries account entries (marked by "media_id"). */
    private static boolean containsAccounts(String htmlBody) {
        return htmlBody != null && htmlBody.contains("media_id");
    }

    /**
     * Converts one search-result entry into an account.
     *
     * <p>A missing numeric field or an unparsable {@code create_time} raises a runtime
     * exception; callers catch it per record and skip the entry.
     */
    private static TouTiaoAccount toAccount(JSONObject data) {
        long user_id = data.getLong("id");
        String name = data.getString("name");
        long media_id = data.getLong("media_id");
        String description = data.getString("description");
        int user_verified = data.getInteger("user_verified");
        String verify_content = data.getString("verify_content");
        int follow_count = data.getInteger("follow_count");
        String img_url = "https:" + data.getString("avatar_url");
        // create_time is in epoch seconds. Parse as long and scale to milliseconds:
        // appending "000" and parsing as int always overflowed.
        Date create_time = new Date(Long.parseLong(data.getString("create_time")) * 1000L);
        String gender = data.getString("gender");
        String user_type = data.getString("user_type");
        return new TouTiaoAccount(user_id, name, media_id, description, user_verified,
                verify_content, follow_count, img_url, create_time, gender, user_type);
    }

    /**
     * Pauses, loads the account's profile page and copies the following count and user
     * type from it onto {@code account}. Does nothing when the profile is unavailable.
     */
    private static void enrichFromProfile(TouTiaoAccount account, long user_id, Proxy proxy)
            throws Exception {
        ZhiWeiTools.sleep(1000);
        TouTiaoAccount profile = getTouTiaoAccountInfoByUserId(String.valueOf(user_id), proxy);
        if (profile == null) {
            return;
        }
        // The profile page does not always carry a following count; read the field
        // directly so an unset value is skipped rather than unboxed.
        if (profile.friend_count != null) {
            account.setFriend_count(profile.friend_count);
        }
        account.setUser_type(profile.getUser_type());
    }

    /**
     * Finds, in a search response, the account whose name equals {@code word}.
     *
     * @param htmlBody JSON search response
     * @param word     exact account name to match
     * @param proxy    proxy used for the follow-up profile request
     * @return the first exact match, enriched from its profile page, or {@code null}
     */
    private static TouTiaoAccount parseHtmlByAccount(String htmlBody, String word, Proxy proxy) {
        try {
            JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("data");
            for (int i = 0; i < jsonArray.size(); i++) {
                try {
                    JSONObject data = jsonArray.getJSONObject(i);
                    if (!data.containsKey("media_id")) {
                        continue;
                    }
                    String name = data.getString("name");
                    if (name == null || !name.equals(word)) {
                        continue;
                    }
                    TouTiaoAccount tta = toAccount(data);
                    enrichFromProfile(tta, data.getLong("id"), proxy);
                    return tta;
                } catch (Exception e) {
                    // A malformed entry must not hide a later exact match.
                    logger.error("数据解析出现问题,{}", e);
                }
            }
        } catch (Exception e) {
            logger.error("数据解析出现问题,{}", e);
        }
        return null;
    }

    /**
     * Extracts account details from the script variables embedded in a profile page.
     *
     * <p>Only {@code id}, name, avatar URL and user type are always filled; following and
     * follower counts are set only when the page contains "guanzhu".
     *
     * @param htmlBody HTML of the profile page
     * @param user_id  id of the user, stored as the account's {@code id}
     * @return the account, or {@code null} when the page has no header block or a value
     *         cannot be extracted
     */
    private static TouTiaoAccount parseAccountByUserId(String htmlBody, String user_id) {
        try {
            if (!htmlBody.contains("var header={")) {
                return null;
            }
            TouTiaoAccount touTiaoAccount = new TouTiaoAccount();
            String name = htmlBody.split("var header")[1].split("name:'")[1].split("',")[0];
            String img_url = "https:" + htmlBody.split("avtar_img:'")[1].split("',")[0];
            String type = htmlBody.split("type: '")[1].split("'")[0];
            if (htmlBody.contains("guanzhu")) {
                // "guanzhu" = accounts followed, "fensi" = followers.
                int guanzhu = Integer.valueOf(htmlBody.split("guanzhu:'")[1].split("',")[0]);
                int fensi = Integer.valueOf(htmlBody.split("fensi:'")[1].split("',")[0]);
                touTiaoAccount.setFriend_count(guanzhu);
                touTiaoAccount.setFollow_count(fensi);
            }
            touTiaoAccount.setId(user_id);
            touTiaoAccount.setImg_url(img_url);
            touTiaoAccount.setName(name);
            touTiaoAccount.setUser_type(type);
            return touTiaoAccount;
        } catch (Exception e) {
            logger.error("数据解析出现问题,{}", e);
            return null;
        }
    }

    /**
     * Converts every account entry of a keyword-search response.
     *
     * @param json  parsed search response
     * @param proxy proxy used for the follow-up profile requests
     * @return the accounts that could be parsed (entries that fail are logged and
     *         skipped), or {@code null} when the response as a whole cannot be read
     */
    private static List<TouTiaoAccount> parseHtmlByWord(JSONObject json, Proxy proxy) {
        List<TouTiaoAccount> ttaList = new ArrayList<TouTiaoAccount>();
        try {
            JSONArray jsonArray = json.getJSONArray("data");
            for (int i = 0; i < jsonArray.size(); i++) {
                try {
                    JSONObject data = jsonArray.getJSONObject(i);
                    if (!data.containsKey("media_id")) {
                        continue;
                    }
                    TouTiaoAccount tta = toAccount(data);
                    enrichFromProfile(tta, data.getLong("id"), proxy);
                    ttaList.add(tta);
                } catch (Exception e) {
                    logger.error("数据解析出现问题,{}", e);
                }
            }
        } catch (Exception e) {
            logger.error("数据解析出现问题,{}", e);
            return null;
        }
        return ttaList;
    }

    /**
     * Converts the {@code data.users} array of a user-list response into accounts.
     * (Named after the fans list, but called here for the "following" response.)
     *
     * <p>Only id, name, description, verification data and avatar URL are available in
     * this response; the follower count is reported as 0 and the remaining fields stay
     * {@code null}.
     *
     * @param json parsed response
     * @return the accounts that could be parsed, or {@code null} when the response as a
     *         whole cannot be read
     */
    private static List<TouTiaoAccount> parseHtmlByFans(JSONObject json) {
        List<TouTiaoAccount> ttaList = new ArrayList<TouTiaoAccount>();
        try {
            JSONArray jsonArray = json.getJSONObject("data").getJSONArray("users");
            for (int i = 0; i < jsonArray.size(); i++) {
                try {
                    JSONObject data = jsonArray.getJSONObject(i);
                    Long user_id = data.getLong("user_id");
                    String name = data.getString("name");
                    String description = data.getString("description");
                    String verify_content = data.getString("verified_content");
                    String img_url = data.getString("avatar_url");
                    // TouTiaoAccount encodes verification as 1 = verified, 0 = not
                    // verified; the mapping used to be inverted. A missing flag counts
                    // as not verified.
                    Integer user_verified = data.getBooleanValue("user_verified") ? 1 : 0;
                    ttaList.add(new TouTiaoAccount(user_id, name, null, description,
                            user_verified, verify_content, 0, img_url, null, null, null));
                } catch (Exception e) {
                    logger.error("数据解析出现问题,{}", e);
                }
            }
        } catch (Exception e) {
            logger.error("数据解析出现问题,{}", e);
            return null;
        }
        return ttaList;
    }
}
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
0 → 100644
View file @
314e5609
/**
* @Title: TouTiaoParse.java
* @Package com.zhiwei.toutiao.parse
* @Description:
* @author hero
* @date 2016年9月2日 上午11:17:44
* @version V1.0
*/
/**
*
*/
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.util.Tools
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
/**
* @Description:头条帐号采集
* @author hero
* @date 2016年9月2日 上午11:17:44
*/
public
class
TouTiaoArticleParse
{
private
static
Map
<
String
,
String
>
headerMap
;
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
TouTiaoArticleParse
.
class
);
/**
 * Fetches one page of the article list of a Toutiao media account.
 *
 * @param media_id       media id of the account
 * @param max_behot_time paging cursor appended to the request when not {@code null};
 *                       pass {@code null} for the first page
 * @param endData        date handed through to the response parser
 *                       (presumably a cut-off for older articles - TODO confirm in
 *                       parseHtmlByAccount)
 * @param proxy          proxy handed to the HTTP client
 * @return the non-empty map produced by the response parser, or {@code null} when the
 *         response carries no article data or the parsed map is empty
 * @throws Exception any failure of the request or of parsing, logged and rethrown
 *                   with its original stack trace
 */
public static Map<String, Object> getTouTiaoList(String media_id, String max_behot_time, Date endData, Proxy proxy) throws Exception {
    // Take "as" and "cp" from ONE getAS() result. They used to come from two separate
    // calls, which yields a mismatched pair whenever getAS() does not return the same
    // value twice (presumably it is time-based - TODO confirm in Tools.getAS).
    String[] signature = Tools.getAS().split("_");
    String as = signature[0];
    String cp = signature[1];

    String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + media_id
            + "&count=20&as=" + as + "&cp=" + cp;
    if (max_behot_time != null) {
        url = url + "&max_behot_time=" + max_behot_time;
    }

    headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", url);

    try {
        String htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
        if (htmlBody != null && htmlBody.contains("behot_time")) {
            Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
            if (ttList != null && ttList.size() > 0) {
                return ttList;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        // Log and rethrow the exception untouched: fillInStackTrace() would overwrite
        // the original trace, including on the exception the caller receives.
        logger.error("获取今日头条帐号数据连接超时", e);
        throw e;
    }
    return null;
}
/***
* 根据帐号解析历史文章地址
*
* @Description:根据帐号解析历史文章地址
* @param @param
* htmlBody
* @param @return
* @return List<String> 返回类型
*/
private
static
Map
<
String
,
Object
>
parseHtmlByAccount
(
String
htmlBody
,
Date
endDate
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
String
max_behot_time
=
null
;
List
<
TouTiaoArticle
>
dataList
=
new
ArrayList
<
TouTiaoArticle
>();
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
jsonArray
=
json
.
getJSONArray
(
"data"
);
max_behot_time
=
json
.
getJSONObject
(
"next"
).
getString
(
"max_behot_time"
);
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
try
{
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
String
href
=
"https://www.toutiao.com/"
;
if
(
data
.
containsKey
(
"group_id"
)){
href
=
href
+
"a"
+
data
.
getLongValue
(
"group_id"
);
String
title
=
data
.
getString
(
"title"
);
String
content
=
data
.
getString
(
"abstract"
);
String
time
=
data
.
getLongValue
(
"behot_time"
)*
1000
+
""
;
Date
date
=
TimeParse
.
stringFormartDate
(
time
);
String
readNum
=
data
.
getString
(
"go_detail_count"
);
String
commentNum
=
data
.
getString
(
"comments_count"
);
String
playNum
=
data
.
getString
(
"play_effective_count"
);
String
source
=
data
.
getString
(
"source"
);
String
user_id
=
data
.
getLong
(
"creator_uid"
).
toString
();
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
user_id
,
source
,
date
,
content
,
commentNum
,
playNum
,
readNum
,
"今日头条"
);
dataList
.
add
(
tt
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"数据解析出现问题,{}"
,
e
.
getMessage
());
continue
;
}
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"数据解析出现问题,{}"
,
e
.
getMessage
());
return
null
;
}
if
(
max_behot_time
!=
null
&&
!
"0"
.
equals
(
max_behot_time
)){
Date
nextDate
=
new
Date
(
Long
.
valueOf
(
max_behot_time
+
"000"
));
if
(
endDate
.
after
(
nextDate
)){
max_behot_time
=
null
;
}
}
map
.
put
(
"max_behot_time"
,
max_behot_time
);
map
.
put
(
"data"
,
dataList
);
return
map
;
}
/**
* @Title: getMicroTouTiaoCrawler
* @author hero
* @Description: 根据用户user_id查询用户微头条数据
* @param @param user_id
* @param @param endDate
* @param @param proxy
* @param @return
* @param @throws IOException 设定文件
* @return List<Map<String,Object>> 返回类型
*/
public
static
Map
<
String
,
Object
>
getMicroTouTiaoCrawler
(
String
user_id
,
Date
endDate
,
Proxy
proxy
,
String
max_behot_time
)
throws
IOException
{
String
url
=
"https://www.toutiao.com/c/ugc/content/list/"
+
user_id
+
"/"
;
if
(
max_behot_time
!=
null
){
url
=
url
+
"?max_time="
+
max_behot_time
;
}
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
String
htmlBody
=
HttpClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
parseHtmlByMicroAccount
(
htmlBody
,
endDate
);
if
(
dataMap
!=
null
&&
dataMap
.
size
()>
0
){
return
dataMap
;
}
}
else
{
logger
.
info
(
"数据为null"
);
}
return
null
;
}
/**
* @Title: parseHtmlByMicroAccount
* @author hero
* @Description: 解析微头条数据
* @param @param htmlBody
* @param @param endDate
* @param @return 设定文件
* @return Map<String,Object> 返回类型
*/
private
static
Map
<
String
,
Object
>
parseHtmlByMicroAccount
(
String
htmlBody
,
Date
endDate
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Long
max_behot_time
=
null
;
List
<
TouTiaoArticle
>
dataList
=
new
ArrayList
<
TouTiaoArticle
>();
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
boolean
more
=
json
.
getBoolean
(
"has_more"
);
JSONArray
jsonArray
=
json
.
getJSONArray
(
"list"
);
Date
date
=
null
;
String
href
=
null
;
String
source
=
null
;
String
title
=
null
;
String
content
=
null
;
String
readNum
=
null
;
String
commentNum
=
null
;
String
playNum
=
null
;
String
user_id
=
null
;
int
count
=
16
;
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
try
{
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
max_behot_time
=
data
.
getLongValue
(
"create_time"
);
date
=
new
Date
(
max_behot_time
*
1000
);
href
=
"https://www.toutiao.com/a"
+
data
.
getString
(
"thread_id"
);
source
=
data
.
getJSONObject
(
"ugc_user"
).
getString
(
"name"
);
content
=
data
.
getString
(
"rich_content"
);
readNum
=
data
.
getInteger
(
"read_count"
)+
""
;
commentNum
=
data
.
getInteger
(
"comment_count"
)+
""
;
user_id
=
data
.
getJSONObject
(
"ugc_user"
).
getString
(
"user_id"
);
if
(
content
!=
null
&&
!
""
.
equals
(
content
)){
if
(
content
.
length
()<
16
){
count
=
content
.
length
();
}
title
=
content
.
substring
(
0
,
count
);
}
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
href
,
title
,
user_id
,
source
,
date
,
content
,
commentNum
,
playNum
,
readNum
,
"微头条"
);
dataList
.
add
(
tt
);
}
catch
(
Exception
e
)
{
continue
;
}
}
/**验证是否有下一页数据**/
if
(
more
){
if
(
max_behot_time
!=
null
&&
!
"0"
.
equals
(
max_behot_time
)){
if
(
endDate
.
after
(
date
)){
max_behot_time
=
null
;
}
}
}
else
{
max_behot_time
=
null
;
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
map
.
put
(
"max_behot_time"
,
max_behot_time
);
map
.
put
(
"data"
,
dataList
);
return
map
;
}
}
src/main/java/com/zhiwei/toutiao/parse/TouTiaoChannelParse.java
0 → 100644
View file @
314e5609
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONException
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.util.Tools
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
/**
 * Collects Toutiao articles from a channel feed.
 *
 * @author hero
 * @date 2017-07-24 16:57:22
 */
public class TouTiaoChannelParse {

    private static final Logger logger = LoggerFactory.getLogger(TouTiaoChannelParse.class);

    /**
     * Requests a channel feed URL and parses the returned article list.
     *
     * @param url   channel feed URL; also sent as the {@code referer} header
     * @param proxy proxy handed to the HTTP client
     * @return a map with {@code "data"} ({@code List<TouTiaoArticle>}) and {@code "next"}
     *         (the next {@code max_behot_time} as a {@code Long}, or {@code null});
     *         {@code null} when the HTTP client returns no body
     * @throws Exception if the request fails (logged and rethrown) or the body is not valid JSON
     */
    public static Map<String, Object> touTiaoChannel(String url, Proxy proxy) throws Exception {
        // Local header map: a shared static field could be overwritten by a concurrent call.
        Map<String, String> headers = Tools.getTouTiaoChannelHeader();
        headers.put("referer", url);

        String htmlBody;
        try {
            htmlBody = HttpClientTemplateOK.get(url, proxy, headers);
        } catch (Exception e) {
            // Log the exception itself; fillInStackTrace() would overwrite the original trace.
            logger.error("获取数据连接出现问题:", e);
            throw e;
        }
        if (htmlBody != null) {
            return parseHtmlByChannel(htmlBody);
        }
        return null;
    }

    /**
     * Parses the JSON body of a channel feed.
     *
     * <p>Entries without a {@code group_id} (no article URL can be built) and entries that fail to
     * parse are skipped, so one bad entry no longer aborts the whole page.
     *
     * @param htmlBody JSON response body
     * @return a map with {@code "data"} and {@code "next"}; never {@code null}
     */
    private static Map<String, Object> parseHtmlByChannel(String htmlBody) {
        Map<String, Object> dataMap = new HashMap<String, Object>();
        List<TouTiaoArticle> ttList = new ArrayList<TouTiaoArticle>();
        Long next = null;

        JSONObject jsonObject = JSONObject.parseObject(htmlBody);
        JSONArray dataList = jsonObject == null ? null : jsonObject.getJSONArray("data");
        if (jsonObject != null) {
            try {
                JSONObject nextNode = jsonObject.getJSONObject("next");
                if (nextNode != null) {
                    next = nextNode.getLong("max_behot_time");
                }
            } catch (RuntimeException e) {
                // The cursor is optional; an unreadable value is treated as "no next page".
                next = null;
            }
        }

        int size = dataList == null ? 0 : dataList.size();
        for (int i = 0; i < size; i++) {
            try {
                JSONObject jso = dataList.getJSONObject(i);
                String groupId = jso.getString("group_id");
                if (groupId == null) {
                    continue;
                }
                // behot_time is multiplied by 1000 before being handed to the date parser.
                String time = String.valueOf(jso.getLongValue("behot_time") * 1000);
                String title = jso.getString("title");
                String content = jso.getString("abstract");
                String comment_count = jso.getIntValue("comments_count") + "";
                String source = jso.getString("source");
                String url = getUrl("http://www.toutiao.com/a" + groupId + "/");
                Date date = TimeParse.stringFormartDate(time);
                ttList.add(new TouTiaoArticle(url, title, null, source, date, content, comment_count, "-1",
                        "-1", "今日头条"));
            } catch (RuntimeException e) {
                logger.warn("数据解析出现问题,{}", e.getMessage(), e);
            }
        }

        dataMap.put("data", ttList);
        dataMap.put("next", next);
        return dataMap;
    }

    /**
     * Rewrites an article URL into the {@code www.toutiao.com/a<id>/} or {@code /i<id>/} form.
     *
     * <p>Steps, in order: {@code group/} becomes {@code a}, {@code /item/} becomes {@code /i},
     * every {@code m.} is removed, {@code www.} is added in front of {@code toutiao.com} when the
     * URL contains no {@code www}, and a trailing slash is appended when missing.
     *
     * @param url URL to rewrite; must not be {@code null}
     * @return the rewritten URL, always ending with {@code /}
     */
    private static String getUrl(String url) {
        // String.replace is a no-op when the target is absent, so no contains() guards are needed.
        // NOTE(review): "m." is removed wherever it occurs, not only as a host prefix.
        String result = url.replace("group/", "a").replace("/item/", "/i").replace("m.", "");
        if (!result.contains("www")) {
            result = result.replace("toutiao.com", "www.toutiao.com");
        }
        // endsWith also copes with an empty string, where substring(length - 1) would throw.
        if (!result.endsWith("/")) {
            result = result + "/";
        }
        return result;
    }
}
src/main/java/com/zhiwei/toutiao/parse/TouTiaoCommentParse.java
0 → 100644
View file @
314e5609
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.net.SocketTimeoutException
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.toutiao.bean.TouTiaoComment
;
import
com.zhiwei.toutiao.util.Tools
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
/**
 * Collects the comments of a Toutiao article and looks up comment counts.
 *
 * @author hero
 * @date 2016-12-09 19:50:28
 */
public class TouTiaoCommentParse {

    private static final Logger logger = LoggerFactory.getLogger(TouTiaoCommentParse.class);

    /** Number of comments requested per page. */
    private static final int PAGE_SIZE = 20;

    /** Number of attempts made for a request before giving up. */
    private static final int MAX_ATTEMPTS = 3;

    /**
     * Collects the comments of an article, page by page.
     *
     * <p>When {@code returnCount} is positive, only as many pages as are needed to cover that many
     * comments are requested; the result is not trimmed, so it may hold up to one page more than
     * {@code returnCount} rounded up to a full page.
     *
     * @param url         article URL; used to resolve the group id and stored on every comment
     * @param returnCount upper bound used to limit the number of pages; values {@code <= 0} mean all pages
     * @param proxy       proxy used while resolving the group id
     * @return the collected comments; empty when the group id cannot be resolved
     * @throws Exception if a request fails with anything other than a socket timeout
     */
    public static List<TouTiaoComment> getTouTiaoComment(String url, int returnCount, Proxy proxy)
            throws Exception {
        List<TouTiaoComment> ttList = new ArrayList<TouTiaoComment>();
        String group_id = getGroupId(url, proxy);
        if (group_id == null) {
            return ttList;
        }

        // Total number of comment pages, optionally capped by returnCount.
        int page = getPage(group_id);
        if (returnCount > 0) {
            int pageMax = (int) Math.ceil((double) returnCount / 20.0);
            page = Math.min(page, pageMax);
        }

        for (int i = 0; i < page; i++) {
            String urlNew = "http://is.snssdk.com/article/v2/tab_comments/?app_name=news_article&offset="
                    + (i * PAGE_SIZE) + "&group_id=" + group_id + "&aggr_type=1&count=20&fold=1&item_id="
                    + group_id + "&ts=" + System.currentTimeMillis();
            Map<String, String> headerMap = Tools.getTouTiaoHeader();
            headerMap.put("User-Agent", "News 6.6.5 rv:6.6.5.03 (iPhone; iOS 11.3; zh_CN) Cronet");
            headerMap.put("Host", "is.snssdk.com");

            for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
                try {
                    // NOTE(review): the comment pages are requested without the proxy (null is passed)
                    // although a proxy parameter exists — confirm this is intentional.
                    String htmlBody = HttpClientTemplateOK.get(urlNew, null, headerMap);
                    if (htmlBody != null) {
                        ttList.addAll(analySisComment(htmlBody, url));
                    } else {
                        logger.info("采集出现问题,地址为:{}", url);
                    }
                    ZhiWeiTools.sleep(4000);
                    break;
                } catch (SocketTimeoutException e) {
                    // Only timeouts are retried; other failures propagate to the caller.
                    logger.warn("采集评论超时,第{}次,地址为:{}", attempt, url);
                }
            }
        }
        return ttList;
    }

    /**
     * Parses one page of the comment list.
     *
     * <p>Entries without a {@code comment} object and entries that fail to parse are skipped.
     *
     * @param htmlBody JSON response body
     * @param url      article URL stored on every comment
     * @return the parsed comments; empty (never {@code null}) when the body cannot be parsed
     */
    private static List<TouTiaoComment> analySisComment(String htmlBody, String url) {
        List<TouTiaoComment> list = new ArrayList<TouTiaoComment>();
        try {
            JSONArray commentes = JSONObject.parseObject(htmlBody).getJSONArray("data");
            for (int a = 0; a < commentes.size(); a++) {
                try {
                    JSONObject comment = commentes.getJSONObject(a).getJSONObject("comment");
                    if (comment == null) {
                        continue;
                    }
                    String id = comment.getString("id");
                    String text = comment.getString("text");
                    String name = comment.getString("user_name");
                    int reply_count = comment.getIntValue("reply_count");
                    int digg_count = comment.getIntValue("digg_count");
                    // create_time is multiplied by 1000 to obtain the millisecond value for Date.
                    Date date = new Date(comment.getLongValue("create_time") * 1000);
                    list.add(new TouTiaoComment(id, text, name, reply_count, digg_count, date, url));
                } catch (Exception e) {
                    logger.debug("解析今日头条评论列表出现为题,{}", e.getMessage(), e);
                }
            }
        } catch (Exception e) {
            // An empty list is returned instead of null so callers can addAll() the result safely.
            logger.debug("解析今日头条评论列表出现为题,{}", e.getMessage(), e);
        }
        return list;
    }

    /**
     * Determines the number of comment pages of an article.
     *
     * @param group_id group id of the article
     * @return the page count ({@code total / 20}, rounded up); 0 when there is no body or it cannot be parsed
     * @throws Exception if the request fails
     */
    private static int getPage(String group_id) throws Exception {
        String htmlBody = HttpClientTemplateOK.get(commentListUrl(group_id), null, Tools.getTouTiaoHeader());
        if (htmlBody == null) {
            return 0;
        }
        try {
            return (int) Math.ceil((double) parseTotal(htmlBody) / 20.0);
        } catch (Exception e) {
            logger.info("获取评论总页数时出现问题:{}", e.getMessage(), e);
            return 0;
        }
    }

    /**
     * Reads the comment count embedded in an article page.
     *
     * <p>The page is requested up to three times; the value after {@code comments_count: } is
     * returned from the first response that contains {@code commentInfo}.
     *
     * @param url   article page URL
     * @param proxy currently not used for the request
     * @return the comment count, or 0 when it cannot be determined
     */
    public static int findCommentCount(String url, Proxy proxy) {
        for (int i = 0; i < MAX_ATTEMPTS; i++) {
            try {
                // NOTE(review): null is passed instead of the proxy argument — confirm this is intentional.
                String htmlBody = HttpClientTemplateOK.get(url, null, Tools.getTouTiaoHeader());
                if (htmlBody != null && htmlBody.contains("commentInfo")) {
                    try {
                        String raw = htmlBody.split("comments_count: ")[1].split(",")[0];
                        return Integer.valueOf(raw.trim());
                    } catch (Exception e) {
                        logger.error("解析头条评论数错误:::{}", e.getMessage(), e);
                        return 0;
                    }
                }
            } catch (Exception e) {
                logger.warn("获取头条评论数失败,地址为:{}", url, e);
                ZhiWeiTools.sleep(5000);
            }
        }
        return 0;
    }

    /**
     * Looks up the total number of comments of an article through the comment list API.
     *
     * @param url   article URL used to resolve the group id
     * @param proxy proxy used while resolving the group id
     * @return the comment total, or 0 when the group id cannot be resolved or every attempt fails
     */
    public static int getCommentCount(String url, Proxy proxy) {
        String group_id = getGroupId(url, proxy);
        if (group_id == null) {
            // Without an id the request would be sent with "group_id=null".
            return 0;
        }
        String urlNew = commentListUrl(group_id);
        for (int i = 0; i < MAX_ATTEMPTS; i++) {
            try {
                String htmlBody = HttpClientTemplateOK.get(urlNew, null, Tools.getTouTiaoHeader());
                if (htmlBody != null) {
                    try {
                        return parseTotal(htmlBody);
                    } catch (Exception e) {
                        logger.info("获取评论总页数时出现问题:{}", e.getMessage(), e);
                    }
                }
            } catch (Exception e) {
                logger.warn("获取评论数失败,地址为:{}", urlNew, e);
                ZhiWeiTools.sleep(5000);
            }
        }
        return 0;
    }

    /**
     * Resolves the group id of an article URL.
     *
     * <p>For URLs containing {@code /a} or {@code /group/} the id is the text after that marker with
     * all slashes removed; for URLs containing {@code /i} (which includes {@code /item/}) the article
     * page is downloaded and searched for the id.
     *
     * @param url   article URL
     * @param proxy proxy used when the page has to be downloaded
     * @return the group id, or {@code null} when it cannot be determined
     */
    private static String getGroupId(String url, Proxy proxy) {
        if (url == null) {
            return null;
        }
        if (url.contains("/a")) {
            return idAfter(url, "/a");
        }
        if (url.contains("/group/")) {
            return idAfter(url, "/group/");
        }
        if (url.contains("/i")) {
            return gettGroupIdByUrl(url, proxy);
        }
        return null;
    }

    /** Returns the segment following {@code marker} without slashes, or {@code null} when nothing follows it. */
    private static String idAfter(String url, String marker) {
        String[] parts = url.split(marker);
        return parts.length > 1 ? parts[1].replace("/", "") : null;
    }

    /** Builds the comment list API URL for the first page of the given group id. */
    private static String commentListUrl(String groupId) {
        return "http://www.toutiao.com/api/comment/list/?group_id=" + groupId + "&item_id=0&count=20&offset=0";
    }

    /** Reads {@code data.total} from a comment list API response; throws when the body is malformed. */
    private static int parseTotal(String htmlBody) {
        return JSONObject.parseObject(htmlBody).getJSONObject("data").getIntValue("total");
    }

    /**
     * Downloads an article page and extracts the value of {@code groupId: '...'} from it.
     *
     * @param url   article page URL
     * @param proxy proxy handed to the HTTP client
     * @return the group id, or {@code null} when the page cannot be loaded or holds no group id
     */
    private static String gettGroupIdByUrl(String url, Proxy proxy) {
        String groupId = null;
        try {
            String htmlBody = HttpClientTemplateOK.get(url, proxy, Tools.getTouTiaoHeader());
            if (htmlBody == null) {
                logger.info("获取groupId失败,链接地址为:{}", url);
            } else if (htmlBody.contains("groupId")) {
                groupId = htmlBody.split("groupId: '")[1].split("',")[0].trim();
            }
        } catch (Exception e) {
            logger.error("获取groupId失败,链接地址为:{}", url, e);
        }
        return groupId;
    }
}
src/main/java/com/zhiwei/toutiao/parse/TouTiaoParse.java
0 → 100644
View file @
314e5609
/**
* @Title: TouTiaoParse.java
* @Package com.zhiwei.toutiao.parse
* @Description:
* @author hero
* @date 2016年9月2日 上午11:17:44
* @version V1.0
*/
/**
*
*/
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.net.InetSocketAddress
;
import
java.net.Proxy
;
import
java.net.Proxy.Type
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.util.Tools
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
/**
 * Legacy collector for the article list of a Toutiao account.
 *
 * @author hero
 * @date 2016-09-02 11:17:44
 */
public class TouTiaoParse {

    // Bound to this class; the original was bound to TouTiaoCommentParse by mistake.
    private static final Logger logger = LoggerFactory.getLogger(TouTiaoParse.class);

    /**
     * Requests an account article feed and parses it.
     *
     * @param url      feed URL
     * @param endData  cut-off date; only articles published after it are returned
     * @param source   source name stored on every returned article
     * @param hostname proxy host name, or {@code null} to request without a proxy
     * @param host     proxy port (despite its name); only used when {@code hostname} is not {@code null}
     * @return a map with {@code "data"} ({@code List<TouTiaoArticle>}) and {@code "max_behot_time"}
     *         (String cursor, or {@code null} once an article at or before the cut-off was seen);
     *         {@code null} when the body is missing, does not contain {@code abstract},
     *         or cannot be parsed
     * @throws Exception if the request fails
     */
    @Deprecated
    public Map<String, Object> getTouTiaoList(String url, Date endData, String source, String hostname, int host)
            throws Exception {
        Map<String, String> headers = Tools.getTouTiaoHeader();
        String htmlBody;
        if (hostname != null) {
            Proxy proxy = new Proxy(Type.HTTP, new InetSocketAddress(hostname, host));
            htmlBody = HttpClientTemplateOK.get(url, proxy, headers);
        } else {
            htmlBody = HttpClientTemplateOK.get(url, null, headers);
        }
        if (htmlBody != null && htmlBody.contains("abstract")) {
            return parseHtmlByAccount(htmlBody, endData, source);
        }
        return null;
    }

    /**
     * Extracts the article text from an article page.
     *
     * @param url      article URL (not used by the extraction)
     * @param htmlBody HTML of the article page
     * @return the text of the {@code article-content} element, falling back to the {@code content}
     *         element when the former is absent; {@code null} when the HTML cannot be processed
     */
    @SuppressWarnings("unused")
    private String parseHtmlByArticle(String url, String htmlBody) {
        try {
            Document doc = Jsoup.parse(htmlBody);
            // select() never returns null, so emptiness is what decides whether to fall back.
            if (!doc.select("[class=article-content]").isEmpty()) {
                return doc.select("[class=article-content]").text();
            }
            return doc.select("[class=content]").text();
        } catch (Exception e) {
            return null;
        }
    }

    /**
     * Parses the JSON body of an account article feed.
     *
     * <p>Articles published after {@code endData} are collected. The first article at or before the
     * cut-off clears the paging cursor. Entries that fail to parse are logged and skipped.
     *
     * @param htmlBody JSON response body
     * @param endData  cut-off date
     * @param source   source name stored on every article
     * @return a map with {@code "data"} and {@code "max_behot_time"}, or {@code null} when the body
     *         cannot be parsed as a whole
     */
    private Map<String, Object> parseHtmlByAccount(String htmlBody, Date endData, String source) {
        String max_behot_time;
        List<TouTiaoArticle> dataList = new ArrayList<TouTiaoArticle>();
        try {
            JSONObject json = JSONObject.parseObject(htmlBody);
            JSONArray jsonArray = json.getJSONArray("data");
            max_behot_time = json.getJSONObject("next").getString("max_behot_time");
            for (int i = 0; i < jsonArray.size(); i++) {
                try {
                    JSONObject data = jsonArray.getJSONObject(i);
                    String href = normalizeUrl(data.getString("source_url"));
                    String title = data.getString("title");
                    String content = data.getString("abstract");
                    String time = data.getString("datetime");
                    Date date = TimeParse.stringFormartDate(time);
                    String readNum = data.getString("go_detail_count");
                    String commentNum = data.getString("comments_count");
                    String playNum = data.getString("play_effective_count");
                    if (endData.before(date)) {
                        dataList.add(new TouTiaoArticle(href, title, null, source, date, content, commentNum,
                                playNum, readNum, "今日头条"));
                    } else {
                        // An article outside the requested period ends the paging.
                        max_behot_time = null;
                        logger.info("数据不再时间段内,{}", time);
                    }
                } catch (Exception e) {
                    logger.error("数据解析出现问题,{}", e.getMessage(), e);
                }
            }
        } catch (Exception e) {
            logger.error("数据解析出现问题,{}", e.getMessage(), e);
            return null;
        }

        Map<String, Object> map = new HashMap<String, Object>();
        map.put("max_behot_time", max_behot_time);
        map.put("data", dataList);
        return map;
    }

    /**
     * Rewrites an article URL into the {@code www.toutiao.com/a<id>/} or {@code /i<id>/} form.
     *
     * @param href URL to rewrite
     * @return the rewritten URL, always ending with {@code /}
     * @throws IllegalArgumentException if {@code href} is {@code null} or empty
     */
    private static String normalizeUrl(String href) {
        if (href == null || href.isEmpty()) {
            throw new IllegalArgumentException("source_url is missing");
        }
        // String.replace is a no-op when the target is absent, so no contains() guards are needed.
        // NOTE(review): "m." is removed wherever it occurs, not only as a host prefix.
        String result = href.replace("/item/", "/i").replace("/group/", "/a").replace("m.", "");
        if (!result.contains("www")) {
            result = result.replace("toutiao.com", "www.toutiao.com");
        }
        if (!result.endsWith("/")) {
            result = result + "/";
        }
        return result;
    }
}
src/main/java/com/zhiwei/toutiao/parse/TouTiaoQuestionAnswerParse.java
0 → 100644
View file @
314e5609
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.toutiao.bean.TouTiaoQuestionAnswer
;
import
com.zhiwei.toutiao.util.Tools
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
/**
 * Collects the answers of a Toutiao Q&amp;A (wukong.com) question.
 *
 * @author hero
 * @date 2017-07-28 18:12:31
 */
public class TouTiaoQuestionAnswerParse {

    private static final Logger logger = LoggerFactory.getLogger(TouTiaoQuestionAnswerParse.class);

    /**
     * Fetches one page of answers for a question.
     *
     * @param questionId id of the question
     * @param page       zero-based page index; the request offset is {@code page * 20}
     * @param req_type   value of the {@code req_type} request parameter
     * @param proxy      proxy handed to the HTTP client
     * @return a map with {@code "page"} (the index of the following page, {@code page + 1}) and
     *         {@code "ansList"} ({@code List<TouTiaoQuestionAnswer>}); {@code null} when there is no
     *         body, the response has no {@code data} object, or the request or parsing fails
     */
    public static Map<String, Object> getAnserList(String questionId, int page, int req_type, Proxy proxy) {
        String url = "https://www.wukong.com/wenda/web/question/loadmorev1/?qid=" + questionId
                + "&count=20&req_type=" + req_type + "&offset=" + page * 20;
        // Local header map: a shared static field could be overwritten by a concurrent call.
        Map<String, String> headers = Tools.getTouTiaoQuestionAnswerHeader();
        headers.put("referer", "https://www.wukong.com/question/" + questionId + "/");

        List<TouTiaoQuestionAnswer> anserList = new ArrayList<TouTiaoQuestionAnswer>();
        try {
            String htmlBody = HttpClientTemplateOK.get(url, proxy, headers);
            if (htmlBody == null) {
                return null;
            }
            JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
            if (data == null) {
                return null;
            }
            // Diagnostic output goes to the logger rather than to standard output.
            logger.debug("has_more={}", data.getIntValue("has_more"));

            JSONArray ans_list = data.getJSONArray("ans_list");
            for (int i = 0; i < ans_list.size(); i++) {
                JSONObject ans = ans_list.getJSONObject(i);
                String ansid = ans.getString("ansid");
                String content = ans.getString("content");
                JSONObject user = ans.getJSONObject("user");
                String username = user.getString("uname");
                String user_id = user.getString("user_id");
                // create_time is multiplied by 1000 before being handed to the date parser.
                Date time = TimeParse.stringFormartDate(ans.getLongValue("create_time") * 1000 + "");
                int comment_count = ans.getIntValue("comment_count");
                int digg_count = ans.getIntValue("digg_count");
                anserList.add(new TouTiaoQuestionAnswer(ansid, questionId, content, user_id, username, time,
                        comment_count, digg_count));
            }
        } catch (Exception e) {
            // Log the exception itself; fillInStackTrace() would overwrite the original trace.
            logger.error("头条问答问题获取出现问题", e);
            return null;
        }

        Map<String, Object> result = new HashMap<String, Object>();
        result.put("page", page + 1);
        result.put("ansList", anserList);
        return result;
    }

    /**
     * Reads the number of answers shown on a question page.
     *
     * @param questionId id of the question
     * @param proxy      proxy handed to the HTTP client
     * @return the text in front of {@code 个回答} in the answer-count heading; {@code "已删除"} when the
     *         page reports that the question does not exist; {@code "-1"} when the request or parsing
     *         fails; {@code null} when the HTTP client returns no body
     */
    public String getAnswerCount(String questionId, Proxy proxy) {
        String url = "https://www.wukong.com/question/" + questionId + "/";
        logger.debug("{}", url);
        Map<String, String> headers = Tools.getTouTiaoQuestionAnswerHeader();
        headers.put("referer", url);
        try {
            String htmlBody = HttpClientTemplateOK.get(url, proxy, headers);
            if (htmlBody == null) {
                return null;
            }
            Document document = Jsoup.parse(htmlBody);
            String text = document.select("[class=question question-single]").text();
            if (text.contains("该问题不存在")) {
                return "已删除";
            }
            String heading = document.select("div.question-item").select("h3.answer-count-h").text();
            return heading.split("个回答")[0];
        } catch (Exception e) {
            logger.warn("头条问答回答数获取出现问题,{}", url, e);
            return "-1";
        }
    }
}
src/main/java/com/zhiwei/toutiao/parse/TouTiaoQuestionParse.java
0 → 100644
View file @
314e5609
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.toutiao.bean.TouTiaoQuestion
;
import
com.zhiwei.toutiao.util.Tools
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
/**
 * Parses Toutiao Q&amp;A (wukong) question search results.
 *
 * @author hero
 * @date 2017-07-20 14:14:48
 */
public class TouTiaoQuestionParse {

    private static final Logger logger = LoggerFactory.getLogger(TouTiaoQuestionParse.class);

    /**
     * Requests a question search URL and parses the questions it returns.
     *
     * @param url   search URL; also sent as the {@code referer} header
     * @param proxy proxy handed to the HTTP client
     * @return the parsed questions; empty (never {@code null}) when there is no body, the response
     *         does not report {@code success}, or it holds no questions
     * @throws Exception if the request fails or the body is not valid JSON
     */
    public static List<TouTiaoQuestion> getSearchTouTiaoQuestion(String url, Proxy proxy) throws Exception {
        // Local header map: a shared static field could be overwritten by a concurrent call.
        Map<String, String> headers = Tools.getTouTiaoQuestionHeader();
        headers.put("referer", url);

        String htmlBody = HttpClientTemplateOK.get(url, proxy, headers);
        if (htmlBody == null) {
            return new ArrayList<TouTiaoQuestion>();
        }
        return parseHtmlByQuestion(htmlBody);
    }

    /**
     * Parses the JSON body of a question search response.
     *
     * <p>Questions are read from {@code data.feed_question[*].question} and only when
     * {@code err_tips} equals {@code success}. Entries that fail to parse are logged and skipped.
     *
     * @param htmlBody JSON response body
     * @return the parsed questions; never {@code null}
     */
    private static List<TouTiaoQuestion> parseHtmlByQuestion(String htmlBody) {
        List<TouTiaoQuestion> questtionList = new ArrayList<TouTiaoQuestion>();
        JSONObject jsonObject = JSONObject.parseObject(htmlBody);
        // Constant-first equals: a response without err_tips is "not successful", not an NPE.
        if (jsonObject == null || !"success".equals(jsonObject.getString("err_tips"))) {
            return questtionList;
        }
        JSONObject json = jsonObject.getJSONObject("data");
        JSONArray jsonArray = json == null ? null : json.getJSONArray("feed_question");
        if (jsonArray == null) {
            return questtionList;
        }

        for (int i = 0; i < jsonArray.size(); i++) {
            try {
                JSONObject question = jsonArray.getJSONObject(i).getJSONObject("question");
                String content = question.getJSONObject("content").getString("text");
                String title = question.getString("title");
                String url = "http://www.toutiao.com/a" + question.getString("qid") + "/";
                // create_time is multiplied by 1000 before being handed to the date parser.
                Date time = TimeParse.stringFormartDate(question.getLong("create_time") * 1000L + "");
                String source = question.getJSONObject("user").getString("uname");
                int follow_count = question.getIntValue("follow_count");
                int nice_ans_count = question.getIntValue("nice_ans_count");
                int normal_ans_count = question.getIntValue("normal_ans_count");
                int ans_count = nice_ans_count + normal_ans_count;
                questtionList.add(new TouTiaoQuestion(url, title, source, content, time, follow_count,
                        nice_ans_count, normal_ans_count, ans_count));
            } catch (Exception e) {
                // Log the exception itself; fillInStackTrace() would overwrite the original trace.
                logger.info("头条问答解析数据出现问题", e);
            }
        }
        return questtionList;
    }
}
src/main/java/com/zhiwei/toutiao/parse/TouTiaoSearchParse.java
0 → 100644
View file @
314e5609
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONException
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.util.Tools
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
/**
 * Parses Toutiao keyword search results.
 *
 * @author hero
 * @date 2017-07-24 15:58:27
 */
public class TouTiaoSearchParse {

    private static final Logger logger = LoggerFactory.getLogger(TouTiaoSearchParse.class);

    /**
     * Requests a search URL and parses the articles it returns.
     *
     * @param url   search URL; also sent as the {@code referer} header
     * @param proxy proxy handed to the HTTP client
     * @return a map with {@code "data"} ({@code List<TouTiaoArticle>}) and {@code "has_more"}
     *         (Integer); {@code null} when there is no body or the response holds no data entries
     * @throws Exception if the request fails or the body is not valid JSON
     */
    public static Map<String, Object> touTiaoSearchByWord(String url, Proxy proxy) throws Exception {
        // Local header map: a shared static field could be overwritten by a concurrent call.
        Map<String, String> headers = Tools.getTouTiaoSearchHeader();
        headers.put("referer", url);

        String htmlBody = HttpClientTemplateOK.get(url, proxy, headers);
        if (htmlBody == null) {
            return null;
        }
        return parseHtmlBySearch(htmlBody);
    }

    /**
     * Parses the JSON body of a search response.
     *
     * <p>Entries that fail to parse are logged and skipped. An entry without {@code group_id} is
     * kept with a {@code null} URL.
     *
     * @param htmlBody JSON response body
     * @return a map with {@code "data"} and {@code "has_more"}, or {@code null} when the response has
     *         no (or an empty) {@code data} array
     */
    private static Map<String, Object> parseHtmlBySearch(String htmlBody) {
        JSONObject jsonObject = JSONObject.parseObject(htmlBody);
        if (jsonObject == null) {
            return null;
        }
        JSONArray dataList = jsonObject.getJSONArray("data");
        int has_more = jsonObject.getIntValue("has_more");
        if (dataList == null || dataList.size() == 0) {
            return null;
        }

        List<TouTiaoArticle> ttList = new ArrayList<TouTiaoArticle>();
        for (int i = 0; i < dataList.size(); i++) {
            try {
                JSONObject jso = dataList.getJSONObject(i);
                // create_time is multiplied by 1000 before being handed to the date parser.
                String time = String.valueOf(jso.getLongValue("create_time") * 1000);
                String title = jso.getString("title");
                String content = jso.getString("abstract");
                String comment_count = jso.getIntValue("comment_count") + "";
                String groupId = jso.getString("group_id");
                String url = groupId == null ? null : "http://www.toutiao.com/a" + groupId + "/";
                String source = jso.getString("source");
                String user_id = jso.getString("user_id");
                Date date = TimeParse.stringFormartDate(time);
                ttList.add(new TouTiaoArticle(url, title, user_id, source, date, content, comment_count, "-1",
                        "-1", "今日头条"));
            } catch (RuntimeException e) {
                // Log the exception itself; fillInStackTrace() would overwrite the original trace.
                logger.debug("解析数据出现问题", e);
            }
        }

        Map<String, Object> result = new HashMap<String, Object>();
        result.put("data", ttList);
        result.put("has_more", has_more);
        return result;
    }

    /**
     * Rewrites an article URL into the {@code www.toutiao.com/a<id>/} or {@code /i<id>/} form.
     *
     * <p>Steps, in order: {@code group/} becomes {@code a}, {@code /item/} becomes {@code /i},
     * every {@code m.} is removed, {@code www.} is added in front of {@code toutiao.com} when the
     * URL contains no {@code www}, and a trailing slash is appended when missing.
     *
     * @param url URL to rewrite; must not be {@code null}
     * @return the rewritten URL, always ending with {@code /}
     */
    @SuppressWarnings("unused")
    private static String getUrl(String url) {
        // String.replace is a no-op when the target is absent, so no contains() guards are needed.
        // NOTE(review): "m." is removed wherever it occurs, not only as a host prefix.
        String result = url.replace("group/", "a").replace("/item/", "/i").replace("m.", "");
        if (!result.contains("www")) {
            result = result.replace("toutiao.com", "www.toutiao.com");
        }
        // endsWith also copes with an empty string, where substring(length - 1) would throw.
        if (!result.endsWith("/")) {
            result = result + "/";
        }
        return result;
    }
}
src/main/java/com/zhiwei/toutiao/util/Config.java
0 → 100644
View file @
314e5609
package
com
.
zhiwei
.
toutiao
.
util
;
import
java.io.InputStream
;
import
java.util.Properties
;
/**
 * Settings read once, during class initialisation, from the {@code proxyip.properties}
 * classpath resource.
 *
 * <p>The keys {@code registry}, {@code group}, {@code minCount} and {@code maxCount} are read in
 * that order. Loading is best-effort: if the resource is missing or a value cannot be read, the
 * problem is reported on standard error and the fields not yet assigned keep their defaults
 * ({@code null} / {@code 0}).
 */
public class Config {

    /** Value of the {@code registry} property, or {@code null} if it was not loaded. */
    public static String registry;

    /** Value of the {@code group} property, or {@code null} if it was not loaded. */
    public static String group;

    /** Value of the {@code minCount} property, or {@code 0} if it was not loaded. */
    public static int minCount;

    /** Value of the {@code maxCount} property, or {@code 0} if it was not loaded. */
    public static int maxCount;

    static {
        InputStream is = null;
        try {
            is = Thread.currentThread().getContextClassLoader()
                    .getResourceAsStream("proxyip.properties");
            if (is == null) {
                // Report the real cause instead of letting Properties.load(null) throw.
                System.err.println(
                        "Config: classpath resource 'proxyip.properties' not found; using defaults");
            } else {
                Properties conf = new Properties();
                conf.load(is);
                registry = conf.getProperty("registry");
                group = conf.getProperty("group");
                minCount = Integer.valueOf(conf.getProperty("minCount"));
                maxCount = Integer.valueOf(conf.getProperty("maxCount"));
            }
        } catch (Exception e) {
            // Best-effort load: a failure here must not prevent the class from initialising.
            e.printStackTrace();
        } finally {
            // Always release the stream, including when load() or number parsing failed.
            if (is != null) {
                try {
                    is.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
    }
}
src/main/java/com/zhiwei/toutiao/util/Tools.java
0 → 100644
View file @
314e5609
package
com
.
zhiwei
.
toutiao
.
util
;
import
java.io.BufferedReader
;
import
java.io.IOException
;
import
java.io.InputStream
;
import
java.io.InputStreamReader
;
import
java.security.MessageDigest
;
import
java.security.NoSuchAlgorithmException
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.Map
;
/**
 * Static helpers shared by the crawlers: classpath text loading, MD5 hashing, the Toutiao
 * {@code as}/{@code cp} request signature and the HTTP header sets sent with each request type.
 */
public class Tools {

    /** Classpath resource read when the requested name is blank or cannot be found. */
    private static final String DEFAULT_TEXT_RESOURCE = "tac_sign.txt";

    private static final String BROWSER_ACCEPT =
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
    private static final String AJAX_ACCEPT = "application/json, text/javascript, */*; q=0.01";
    private static final String ACCEPT_LANGUAGE = "zh-CN,zh;q=0.8";
    private static final String BROWSER_USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36";
    private static final String AJAX_USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36";

    /**
     * Reads a text resource from the classpath and returns its lines joined without separators.
     *
     * <p>The resource named by {@code textFileName} is used when it exists; otherwise the method
     * falls back to {@code tac_sign.txt}, which is the only resource earlier versions ever read.
     *
     * @param textFileName classpath name of the resource; may be {@code null} or empty
     * @return the resource content with line terminators removed, or {@code null} when no
     *         resource could be found or reading failed
     */
    public static String getText(String textFileName) {
        ClassLoader loader = Thread.currentThread().getContextClassLoader();
        if (loader == null) {
            loader = Tools.class.getClassLoader();
        }

        InputStream is = null;
        if (textFileName != null && textFileName.length() > 0) {
            is = loader.getResourceAsStream(textFileName);
        }
        if (is == null) {
            is = loader.getResourceAsStream(DEFAULT_TEXT_RESOURCE);
        }
        if (is == null) {
            // Same "not available" result as an I/O failure, instead of an NPE.
            return null;
        }

        try {
            // Decode explicitly as UTF-8 so the result does not depend on the platform charset.
            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line);
            }
            return sb.toString();
        } catch (IOException e) {
            return null;
        } finally {
            try {
                is.close();
            } catch (IOException ignored) {
                // The content has already been read (or reading failed); nothing to recover.
            }
        }
    }

    /**
     * Builds the Toutiao request signature from the current time.
     *
     * <p>The current epoch second is rendered as upper-case hex ({@code t}) and hashed with MD5
     * ({@code e}, upper-case hex of the decimal second). The first and last five digest
     * characters are interleaved with characters of {@code t} to form the two halves.
     *
     * @return {@code as + "_" + cp}, where {@code as} starts with {@code "A1"} and {@code cp}
     *         ends with {@code "E1"}
     */
    public static String getAS() {
        long seconds = System.currentTimeMillis() / 1000L;
        String hexTime = Long.toHexString(seconds).toUpperCase();
        String digest = parseStrToMd5L32(String.valueOf(seconds)).toUpperCase();

        // Indexes up to 7 are read below, so hexTime must have at least 8 characters.
        char[] timeChars = hexTime.toCharArray();
        char[] head = digest.substring(0, 5).toCharArray();
        char[] tail = digest.substring(digest.length() - 5).toCharArray();

        StringBuilder asMiddle = new StringBuilder();
        StringBuilder cpMiddle = new StringBuilder();
        for (int n = 0; n < 5; n++) {
            asMiddle.append(head[n]).append(timeChars[n]);
            cpMiddle.append(timeChars[n + 3]).append(tail[n]);
        }

        String as = "A1" + asMiddle + hexTime.substring(hexTime.length() - 3);
        String cp = hexTime.substring(0, 3) + cpMiddle + "E1";
        return as + "_" + cp;
    }

    /**
     * Computes the MD5 digest of a string's UTF-8 bytes.
     *
     * @param str the text to hash; may be {@code null}
     * @return the digest as 32 lower-case hex characters, or {@code null} when {@code str} is
     *         {@code null}
     */
    public static String md5(String str) {
        return md5Hex(str);
    }

    /**
     * Converts bytes to a lower-case hex string, two characters per byte.
     *
     * @param bytes the bytes to convert; may be {@code null}
     * @return the hex string, or {@code null} when {@code bytes} is {@code null} or empty
     */
    private static String bytesToHexString(byte bytes[]) {
        if (bytes == null || bytes.length == 0) {
            return null;
        }
        StringBuilder sb = new StringBuilder(bytes.length * 2);
        for (int i = 0; i < bytes.length; i++) {
            int value = bytes[i] & 0xFF;
            if (value < 16) {
                sb.append('0');
            }
            sb.append(Integer.toHexString(value));
        }
        return sb.toString();
    }

    /**
     * Computes the 32-character lower-case MD5 hex digest of a string's UTF-8 bytes.
     *
     * <p>Produces the same value as {@link #md5(String)}.
     *
     * @param str the text to hash; may be {@code null}
     * @return the digest as 32 lower-case hex characters, or {@code null} when {@code str} is
     *         {@code null}
     */
    public static String parseStrToMd5L32(String str) {
        return md5Hex(str);
    }

    /** Shared MD5 implementation behind {@link #md5(String)} and {@link #parseStrToMd5L32(String)}. */
    private static String md5Hex(String str) {
        if (str == null) {
            return null;
        }
        try {
            MessageDigest digest = MessageDigest.getInstance("MD5");
            return bytesToHexString(digest.digest(str.getBytes("UTF-8")));
        } catch (NoSuchAlgorithmException ignored) {
            // Every Java platform is required to provide MD5, so this is not expected.
            return null;
        } catch (IOException ignored) {
            // UnsupportedEncodingException: UTF-8 is likewise required on every platform.
            return null;
        }
    }

    /**
     * Headers for Toutiao account page requests.
     *
     * @return a new mutable map of header names to values
     */
    public static Map<String, String> getTouTiaoHeader() {
        Map<String, String> headerMap = browserHeader("www.toutiao.com");
        headerMap.put("Accept-Encoding", "deflate, br");
        return headerMap;
    }

    /**
     * Headers for NetEase (WangYi) account requests.
     *
     * @return a new mutable map of header names to values
     */
    public static Map<String, String> getWangYiHeader() {
        return browserHeader("c.m.163.com");
    }

    /**
     * Headers for Toutiao Q&amp;A question requests.
     *
     * @return a new mutable map of header names to values
     */
    public static Map<String, String> getTouTiaoQuestionHeader() {
        Map<String, String> headerMap = ajaxHeader(null);
        headerMap.put("wendacsrftoken", "undefined");
        return headerMap;
    }

    /**
     * Headers for Toutiao search requests.
     *
     * @return a new mutable map of header names to values
     */
    public static Map<String, String> getTouTiaoSearchHeader() {
        return ajaxHeader("www.toutiao.com");
    }

    /**
     * Headers for Toutiao channel feed requests.
     *
     * @return a new mutable map of header names to values
     */
    public static Map<String, String> getTouTiaoChannelHeader() {
        return ajaxHeader("www.toutiao.com");
    }

    /**
     * Headers for Toutiao Q&amp;A answer-list requests.
     *
     * @return a new mutable map of header names to values
     */
    public static Map<String, String> getTouTiaoQuestionAnswerHeader() {
        return ajaxHeader(null);
    }

    /** Builds the browser-style (HTML page) header set for the given host. */
    private static Map<String, String> browserHeader(String host) {
        Map<String, String> headerMap = new HashMap<String, String>();
        headerMap.put("Accept", BROWSER_ACCEPT);
        headerMap.put("Accept-Language", ACCEPT_LANGUAGE);
        headerMap.put("AlexaToolbar-ALX_NS_PH", "AlexaToolbar/alx-4.0");
        headerMap.put("Cache-Control", "no-cache");
        headerMap.put("Host", host);
        headerMap.put("Pragma", "no-cache");
        headerMap.put("Proxy-Connection", "keep-alive");
        headerMap.put("Upgrade-Insecure-Requests", "1");
        headerMap.put("User-Agent", BROWSER_USER_AGENT);
        return headerMap;
    }

    /** Builds the XMLHttpRequest-style header set; the Host header is omitted when host is null. */
    private static Map<String, String> ajaxHeader(String host) {
        Map<String, String> headerMap = new HashMap<String, String>();
        headerMap.put("Accept", AJAX_ACCEPT);
        headerMap.put("Accept-Language", ACCEPT_LANGUAGE);
        if (host != null) {
            headerMap.put("Host", host);
        }
        headerMap.put("x-requested-with", "XMLHttpRequest");
        headerMap.put("user-agent", AJAX_USER_AGENT);
        return headerMap;
    }
}
src/main/java/com/zhiwei/wangyi/bean/WangYiNews.java
0 → 100644
View file @
314e5609
package
com
.
zhiwei
.
wangyi
.
bean
;
import
java.io.Serializable
;
import
java.util.Date
;
/**
 * A single NetEase (WangYi) news article.
 *
 * @author hero
 * @date 2017-01-03 09:22:42
 */
public class WangYiNews implements Serializable {

    private static final long serialVersionUID = 2222466947676512589L;

    /** Primary key: the article URL. */
    private String id;

    /** Article title. */
    private String title;

    /** Name of the publishing source. */
    private String source;

    /** Publication time. */
    private Date time;

    /** Article summary. */
    private String content;

    /** Number of replies. */
    private int reply_count;

    /** Creates an article with every field left at its default value. */
    public WangYiNews() {
    }

    /**
     * Creates a fully populated article.
     *
     * @param id          primary key (article URL)
     * @param title       article title
     * @param source      name of the publishing source
     * @param time        publication time
     * @param content     article summary
     * @param reply_count number of replies
     */
    public WangYiNews(String id, String title, String source, Date time, String content,
            int reply_count) {
        this.id = id;
        this.title = title;
        this.source = source;
        this.time = time;
        this.content = content;
        this.reply_count = reply_count;
    }

    public String getId() {
        return this.id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getSource() {
        return this.source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public Date getTime() {
        return this.time;
    }

    public void setTime(Date time) {
        this.time = time;
    }

    public String getContent() {
        return this.content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public int getReply_count() {
        return this.reply_count;
    }

    public void setReply_count(int reply_count) {
        this.reply_count = reply_count;
    }

    /** Renders every field as {@code name = value}, in declaration order. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("new WangYiNews[");
        text.append("id = ").append(id);
        text.append(", title = ").append(title);
        text.append(", source = ").append(source);
        text.append(", time = ").append(time);
        text.append(", content = ").append(content);
        text.append(", reply_count = ").append(reply_count);
        text.append("]");
        return text.toString();
    }
}
src/main/java/com/zhiwei/wangyi/parse/WangyiNewParse.java
0 → 100644
View file @
314e5609
package
com
.
zhiwei
.
wangyi
.
parse
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.toutiao.util.Tools
;
import
com.zhiwei.wangyi.bean.WangYiNews
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
/**
 * Collects the article history of a NetEase (WangYi) subscription account.
 *
 * <p>All paging state is local to each call, so the class keeps no mutable shared state.
 */
public class WangyiNewParse {

    private static final Logger logger = LoggerFactory.getLogger(WangyiNewParse.class);

    /** Number of articles requested per page; also the step of the page offset. */
    private static final int PAGE_SIZE = 20;

    /** Value passed to {@code ZhiWeiTools.sleep} between pages (presumably milliseconds). */
    private static final int PAGE_INTERVAL = 10000;

    /** Give up after this many consecutive requests that return no body. */
    private static final int MAX_CONSECUTIVE_FAILURES = 3;

    /** Outcome of parsing one page: the accepted articles and whether paging should stop. */
    private static final class PageResult {
        final List<WangYiNews> news;
        final boolean reachedEnd;

        PageResult(List<WangYiNews> news, boolean reachedEnd) {
            this.news = news;
            this.reachedEnd = reachedEnd;
        }
    }

    /**
     * Fetches an account's articles page by page until one published at or before
     * {@code endTime} is seen, a page has no articles, or a page cannot be parsed.
     *
     * <p>A request that returns no body is skipped and the next page is tried; after
     * {@value #MAX_CONSECUTIVE_FAILURES} such requests in a row the method stops and returns
     * what it has collected so far.
     *
     * @param tid     account identifier used in the list URL
     * @param endTime only articles published strictly after this time are returned
     * @return the collected articles in the order received; never {@code null}
     * @throws Exception if the HTTP request or the pause between pages fails
     */
    public static List<WangYiNews> getWYHistory(String tid, Date endTime) throws Exception {
        List<WangYiNews> list = new ArrayList<WangYiNews>();
        Map<String, String> headerMap = Tools.getWangYiHeader();
        int page = 0;
        int consecutiveFailures = 0;

        while (true) {
            String url = "http://c.m.163.com/nc/subscribe/list/" + tid + "/all/"
                    + page * PAGE_SIZE + "-" + PAGE_SIZE + ".html";
            logger.debug("Requesting WangYi history page: {}", url);

            String htmlBody = HttpClientTemplateOK.get(url, null, headerMap);
            if (htmlBody == null) {
                consecutiveFailures++;
                if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
                    // Without a bound, a persistent outage would make this loop run forever.
                    logger.warn("Stopping WangYi history for {}: {} consecutive empty responses",
                            tid, consecutiveFailures);
                    break;
                }
            } else {
                consecutiveFailures = 0;
                PageResult result = analysis(htmlBody, endTime);
                if (result == null) {
                    // Unparseable page: stop here rather than fail on a null list.
                    break;
                }
                list.addAll(result.news);
                if (result.reachedEnd) {
                    break;
                }
            }

            page++;
            // Pause only when another page will actually be requested.
            ZhiWeiTools.sleep(PAGE_INTERVAL);
        }
        return list;
    }

    /**
     * Parses one page of an account's article list.
     *
     * <p>The source name is read from {@code subscribe_info.tname} and the articles from
     * {@code tab_list}. An article whose fields cannot be read is logged and skipped.
     *
     * @param htmlBody raw response body, expected to contain a JSON object
     * @param endTime  only articles published strictly after this time are accepted
     * @return the accepted articles plus a flag that is set when the page is empty or contains
     *         an article not after {@code endTime}; {@code null} when the page structure cannot
     *         be parsed
     */
    private static PageResult analysis(String htmlBody, Date endTime) {
        List<WangYiNews> dataList = new ArrayList<WangYiNews>();
        boolean reachedEnd = false;
        try {
            JSONObject dataJson = JSONObject.parseObject(htmlBody);

            // Source name shared by every article on the page.
            JSONObject subscribeInfo = dataJson.getJSONObject("subscribe_info");
            String source = subscribeInfo.getString("tname");

            // Article list.
            JSONArray tabList = dataJson.getJSONArray("tab_list");
            if (tabList == null || tabList.size() < 1) {
                return new PageResult(dataList, true);
            }

            for (int i = 0; i < tabList.size(); i++) {
                JSONObject data = tabList.getJSONObject(i);
                try {
                    String url = "https://c.m.163.com/news/a/" + data.getString("docid")
                            + ".html?spss=newsapp&spsw=1";
                    String title = data.getString("title");
                    String content = data.getString("aheadBody");
                    Date time = TimeParse.stringFormartDate(data.getString("ptime"));
                    int replyCount = data.getIntValue("replyCount");

                    if (time.after(endTime)) {
                        dataList.add(new WangYiNews(url, title, source, time, content, replyCount));
                    } else {
                        // Cut-off reached; the remaining entries of this page are still checked.
                        reachedEnd = true;
                    }
                } catch (Exception e) {
                    // Pass the exception as the last argument so its stack trace is logged.
                    logger.error("网易号历史文章解析,单个字段解析出现问题", e);
                }
            }
        } catch (Exception e) {
            logger.error("网易号历史文章解析,需要解析的文本结构有问题", e);
            return null;
        }
        return new PageResult(dataList, reachedEnd);
    }
}
src/test/java/com/zhiwei/toutiao/test/TouTiaoAccountExample.java
0 → 100644
View file @
314e5609
//package com.zhiwei.toutiao.test;
//
//import java.util.List;
//
//import org.junit.Test;
//
//import com.zhiwei.toutiao.bean.TouTiaoAccount;
//import com.zhiwei.toutiao.parse.TouTiaoAccountParse;
//
///**
// * @ClassName: TouTiaoAccountExample
// * @Description: TODO(今日头条帐号采集)
// * @author hero
// * @date 2017年10月17日 下午4:03:44
// */
//public class TouTiaoAccountExample {
//
// public void touTiaoAccountTest(){
// String word = "华尔街瞭望";
// System.out.println("===================="+TouTiaoAccountParse.getTouTiaoAccountInfoByName(word, null));
// }
//
//
//
// @Test
// public void touTiaoAccountFriendTest(){
// String userid = "3350881978";
// List<TouTiaoAccount> userList = TouTiaoAccountParse.getFriendsList(userid, null,1000);
// for(TouTiaoAccount tta : userList){
// System.out.println(tta);
// }
// }
//}
src/test/java/com/zhiwei/toutiao/test/TouTiaoChannelExample.java
0 → 100644
View file @
314e5609
//package com.zhiwei.toutiao.test;
//
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoChannelParse;
//import com.zhiwei.toutiao.util.Tools;
//
///**
// * @ClassName: TouTiaoChannelExample
// * @Description: TODO(头条频道解析测试)
// * @author hero
// * @date 2017年7月24日 下午5:10:52
// */
//public class TouTiaoChannelExample {
//
// public static void main(String[] args) {
//
// long max_behot_time = 0;
// for(int i= 0;i<3; i++){
// System.out.println("i=============="+i);
// if( i==0 ){
// max_behot_time = 0;
// }
// String as = Tools.getAS().split("_")[0];
// String cp = Tools.getAS().split("_")[1];
// String url = "http://www.toutiao.com/api/pc/feed/?category=news_tech&utm_source=toutiao"
// + "&widen=1&max_behot_time="+max_behot_time+"&max_behot_time_tmp="+max_behot_time
// +"&tadrequire=true&as=" +as +"&cp=" + cp;
// System.out.println("url:" + url);
//
// Map<String,Object> result = TouTiaoChannelParse.touTiaoChannel(url, null);
// if(result!=null){
// Long next = (Long)result.get("next");
// List<TouTiaoArticle> ttList = (List<TouTiaoArticle>)result.get("data");
// System.out.println("ttlist size is " + ttList.size());
// for(TouTiaoArticle tt : ttList){
// System.out.println(tt);
// }
// if(next != null){
// max_behot_time = next;
// }else{
// break;
// }
// }
// }
// }
//
//}
src/test/java/com/zhiwei/toutiao/test/TouTiaoCommentExample.java
0 → 100644
View file @
314e5609
//package com.zhiwei.toutiao.test;
//
//import java.net.InetSocketAddress;
//import java.net.Proxy;
//import java.net.Proxy.Type;
//import java.util.ArrayList;
//import java.util.List;
//
//import com.zhiwei.toutiao.bean.TouTiaoComment;
//import com.zhiwei.toutiao.parse.TouTiaoCommentParse;
//
///**
// * @ClassName: TouTiaoCommentExample
// * @Description: TODO(今日头条评论测试)
// * @author hero
// * @date 2016年12月9日 下午8:08:02
// */
//public class TouTiaoCommentExample {
// private static String hostname = "192.168.9.37";
// private static int host = 31128;
// private static Proxy proxy = new Proxy(Type.HTTP, new InetSocketAddress(hostname, host));
//
// public static void main(String[] args) throws Exception {
//
// TouTiaoCommentParse touTiaoComment = new TouTiaoCommentParse();
//
// List<String> mids = new ArrayList<String>();
// mids.add("https://www.toutiao.com/a6549289895376978436/");
//
// for(String mid : mids)
// {
// List<TouTiaoComment> list = touTiaoComment.getTouTiaoComment(mid, null);
// System.out.println(mid+"============="+list.size());
// for(TouTiaoComment ttc : list)
// {
// System.out.println(ttc);
//// DBObject doc = new BasicDBObject();
//// doc.put("_id", ttc.getId());
//// doc.put("text", ttc.getText());
//// doc.put("time", ttc.getTime());
//// doc.put("username", ttc.getUserName());
//// doc.put("reply_count", ttc.getReply_count());
//// doc.put("digg_count", ttc.getDigg_count());
//// doc.put("source_url", ttc.getId());
//// touTiaoCommentDAO.addTouTiaoComment(doc);
// }
// }
//
// }
//
//
//}
src/test/java/com/zhiwei/toutiao/test/TouTiaoExample.java
0 → 100644
View file @
314e5609
///**
// * @Title: TouTiaoExample.java
// * @Package com.zhiwei.toutiao.test
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// * @version V1.0
// */
///**
//*
//*/
//package com.zhiwei.toutiao.test;
//
//import java.util.ArrayList;
//import java.util.Date;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
//import com.zhiwei.toutiao.util.Tools;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse;
//
///**
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// */
//public class TouTiaoExample {
//
// @SuppressWarnings("unchecked")
// public static void main(String[] args) throws Exception {
// long a = System.currentTimeMillis();
// List<String> urlList = new ArrayList<String>();
// urlList.add("6859134443");
//
// System.out.println(urlList.size());
//
// Date endTime = TimeParse.stringFormartDate("2018-04-01");
//
// for (String url : urlList) {
// String mid = url;
// String max_behot_time = "0";
// while (true) {
// Map<String, Object> dataMap = null;
// dataMap = TouTiaoArticleParse.getTouTiaoList(mid, max_behot_time, endTime,null);
// if (dataMap != null) {
// List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
// max_behot_time = (String) dataMap.get("max_behot_time");
// System.out.println(max_behot_time + "=======" + ttlist.size());
// if (max_behot_time == null || ttlist.isEmpty()) {
// break;
// } else {
// if (ttlist.size() > 0) {
// for (TouTiaoArticle tt : ttlist) {
// System.out.println(tt);
// }
// }
// }
// }
// }
// }
// long b = System.currentTimeMillis();
// System.out.println("一轮的采集时间为:" + (b - a) / 1000);
// }
//
//}
src/test/java/com/zhiwei/toutiao/test/TouTiaoMicroExample.java
0 → 100644
View file @
314e5609
///**
// * @Title: TouTiaoExample.java
// * @Package com.zhiwei.toutiao.test
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// * @version V1.0
// */
//package com.zhiwei.toutiao.test;
//import java.util.Date;
//import java.util.List;
//import java.util.Map;
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse;
//import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
//
///**
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// */
//public class TouTiaoMicroExample {
//
// public static void main(String[] args) throws Exception {
// long a = System.currentTimeMillis();
// String user_id = "55301399445";
// Date date = new Date((new Date().getTime()-24*60*60*1000));
// parseMicroTouTiao(user_id, date);
// long b = System.currentTimeMillis();
// System.out.println("一轮的采集时间为:" + (b - a) / 1000);
//
// }
//
//
// @SuppressWarnings("unchecked")
// public static void parseMicroTouTiao(String user_id, Date endDate){
// int count = 1;
// boolean f = true;
// String max_behot_time = null;
// while(f)
// {
// if(count==3){
// f = false;
// }
// for(int i=0; i<3; i++){
// try {
// Map<String, Object> dataMap = TouTiaoArticleParse.getMicroTouTiaoCrawler(user_id, endDate, null, max_behot_time);
// List<TouTiaoArticle> ttlist = null;
// if(dataMap!=null && !dataMap.isEmpty())
// {
// ttlist = (List<TouTiaoArticle>) dataMap.get("data");
// max_behot_time = dataMap.get("max_behot_time")!=null?dataMap.get("max_behot_time").toString():null;
// if (ttlist!=null && ttlist.size() > 0)
// {
// for(TouTiaoArticle touTiaoArticle : ttlist){
// System.out.println(TimeParse.dateFormartString(touTiaoArticle.getTime(), "yyyy-MM-dd HH:mm:ss"));
// }
// }
// count++;
// break;
// }else{
// continue;
// }
// } catch (Exception e) {
// e.printStackTrace();
// continue;
// }
// }
// ZhiWeiTools.sleep(7000);
// }
// }
//
//
//
//}
src/test/java/com/zhiwei/toutiao/test/TouTiaoQuestionAnswerExample.java
0 → 100644
View file @
314e5609
//package com.zhiwei.toutiao.test;
//
//import java.util.ArrayList;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.proxyip.util.Tools;
//import com.zhiwei.toutiao.bean.TouTiaoQuestionAnswer;
//import com.zhiwei.toutiao.parse.TouTiaoQuestionAnswerParse;
//import com.zhiwei.zhiweiTools.excel.PoiExcelUtil;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse;
//
///**
// * @ClassName: TouTiaoQuestionAnswerExample
// * @Description: TODO(头条问答回答测试)
// * @author hero
// * @date 2017年7月28日 下午8:38:54
// */
//public class TouTiaoQuestionAnswerExample {
//
//
//
//
// public static void main(String[] args) {
//
// String path = "E://头条问答采集需求.xlsx";
// String write_path = "E://头条问答回答列表0801.xlsx";
//
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> dataMap = (List<Map<String, Object>>) map.get("body");
//
// List<String> headerList = new ArrayList<String>();
// headerList.add("问题链接");
// headerList.add("问题标题");
// headerList.add("回答用户uid");
// headerList.add("回答用户昵称");
// headerList.add("回答时间");
// headerList.add("回答内容");
// headerList.add("回答评论数");
// headerList.add("回答点赞数");
//
// List<Map<String,Object>> answerList = new ArrayList<Map<String,Object>>();
// for(Map<String,Object> data : dataMap){
// String title = data.get("标题").toString();
// String link = data.get("链接").toString();
// String[] questionIdes = link.split("/");
// System.out.println(questionIdes.length);
// String questionId = questionIdes[questionIdes.length-1];
// questionId = questionId.substring(1, questionId.length());
// System.out.println(link+"========"+questionId);
//
// int page = 0;
// int nextPage = 1;
// int req_type = 1;
// while(page != nextPage && req_type != 3){
// Map<String,Object> result = TouTiaoQuestionAnswerParse.getAnserList(questionId, page, req_type);
// System.out.println(result);
// page = (int) result.get("page");
// nextPage++;
// List<TouTiaoQuestionAnswer> ansList = (List<TouTiaoQuestionAnswer>) result.get("ansList");
// if(ansList.size()>0){
// for(TouTiaoQuestionAnswer answer : ansList){
// Map<String,Object> answerMap = new HashMap<String,Object>();
// answerMap.put("问题链接", link);
// answerMap.put("问题标题", title);
// answerMap.put("回答用户uid", answer.getUsername());
// answerMap.put("回答用户昵称", answer.getUser_id());
// answerMap.put("回答时间", TimeParse.dateFormartString(answer.getTime(), "yyyy-MM-dd HH:mm:ss"));
// answerMap.put("回答内容", answer.getContent());
// answerMap.put("回答评论数", answer.getComment_count());
// answerMap.put("回答点赞数", answer.getDigg_count());
// answerList.add(answerMap);
// }
// }else{
// req_type++;
// page = 0;
// nextPage = 1;
// }
// System.out.println(page+"=========="+nextPage+"============"+req_type);
// Tools.sleep(8000);
// }
// }
//
// poi.exportExcel(write_path, "0", headerList, answerList);
//
// }
//
//}
src/test/java/com/zhiwei/toutiao/test/TouTiaoQuestionExample.java
0 → 100644
View file @
314e5609
//package com.zhiwei.toutiao.test;
//
//import java.util.List;
//
//import org.junit.Test;
//
//import com.zhiwei.toutiao.bean.TouTiaoQuestion;
//import com.zhiwei.toutiao.parse.TouTiaoQuestionParse;
//import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
//
///**
// * @ClassName: TouTiaoQuestionExample
// * @Description: TODO(头条问答采集测试类)
// * @author hero
// * @date 2017年7月20日 下午3:06:51
// */
//public class TouTiaoQuestionExample {
//
//
//
// @Test
// public void touTiaoQuestionTest(){
// String word = "京东";
//
// String url = "https://www.wukong.com/wenda/web/search/question/brow/?search_text="+
// URLCodeUtil.getURLEncode(word, "UTF-8")+"&count=15";
//
// List<TouTiaoQuestion> list = TouTiaoQuestionParse.getSearchTouTiaoQuestion(url);
// System.out.println(list.size());
// for(TouTiaoQuestion question : list){
// System.out.println(question);
// }
// }
//
//}
src/test/java/com/zhiwei/toutiao/test/TouTiaoSearchExample.java
0 → 100644
View file @
314e5609
//package com.zhiwei.toutiao.test;
//
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoSearchParse;
//import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
//
///**
// * @ClassName: TouTiaoSearchExample
// * @Description: TODO(头条搜索测试)
// * @author hero
// * @date 2017年7月24日 下午5:11:15
// */
//public class TouTiaoSearchExample {
//
// public static void main(String[] args) {
// String word = "京东";
// for (int i = 0; i < 3; i++) {
// String url = "http://www.toutiao.com/search_content/?offset=" + i * 20 + "&format=json&keyword="
// + URLCodeUtil.getURLDecode(word, "utf--8") + "&autoload=true&count=20&cur_tab=1";
// System.out.println(url);
// Map<String, Object> ttList;
// try {
// ttList = TouTiaoSearchParse.touTiaoSearchByWord(url,null);
// System.out.println("ttsize is : " + ttList.size());
// } catch (Exception e) {
// e.printStackTrace();
// }
//
// }
// }
//}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment