Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
toutiao
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
toutiao
Commits
f4ed3aa0
Commit
f4ed3aa0
authored
Feb 20, 2019
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
升级核心包版本及默认代理使用晋豪得NAT
parent
e9bfd2df
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
128 additions
and
191 deletions
+128
-191
pom.xml
+1
-1
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
+31
-9
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
+35
-19
src/main/java/com/zhiwei/toutiao/parse/TouTiaoChannelParse.java
+0
-146
src/main/java/com/zhiwei/toutiao/parse/TouTiaoCommentParse.java
+33
-11
src/main/java/com/zhiwei/toutiao/parse/TouTiaoQuestionAnswerParse.java
+0
-1
src/main/java/com/zhiwei/toutiao/parse/TouTiaoSearchParse.java
+28
-3
src/main/java/com/zhiwei/wangyi/parse/WangyiNewParse.java
+0
-1
No files found.
pom.xml
View file @
f4ed3aa0
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
toutiao
</artifactId>
<artifactId>
toutiao
</artifactId>
<version>
0.
2.9
-SNAPSHOT
</version>
<version>
0.
3.0
-SNAPSHOT
</version>
<dependencies>
<dependencies>
<dependency>
<dependency>
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
View file @
f4ed3aa0
...
@@ -21,6 +21,8 @@ import com.zhiwei.toutiao.bean.Signature;
...
@@ -21,6 +21,8 @@ import com.zhiwei.toutiao.bean.Signature;
import
com.zhiwei.toutiao.bean.TouTiaoAccount
;
import
com.zhiwei.toutiao.bean.TouTiaoAccount
;
import
com.zhiwei.toutiao.util.Tools
;
import
com.zhiwei.toutiao.util.Tools
;
import
okhttp3.Response
;
/**
/**
* @ClassName: TouTiaoAccountParse
* @ClassName: TouTiaoAccountParse
* @Description: 今日头条帐号采集
* @Description: 今日头条帐号采集
...
@@ -47,13 +49,12 @@ public class TouTiaoAccountParse {
...
@@ -47,13 +49,12 @@ public class TouTiaoAccountParse {
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
=
Tools
.
getTouTiaoHeader
();
TouTiaoAccount
tta
=
null
;
TouTiaoAccount
tta
=
null
;
try
{
try
{
String
htmlBody
=
null
;
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
tta
=
parseHtmlByAccount
(
htmlBody
,
name
,
proxy
);
tta
=
parseHtmlByAccount
(
htmlBody
,
name
,
proxy
);
if
(
tta
==
null
){
if
(
tta
==
null
){
url
=
"https://www.toutiao.com/api/search/content/?aid=24&offset=0&format=json&keyword="
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)+
"&autoload=true&count=20&cur_tab=1&from=search_tab&pd=synthesis"
;
url
=
"https://www.toutiao.com/api/search/content/?aid=24&offset=0&format=json&keyword="
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)+
"&autoload=true&count=20&cur_tab=1&from=search_tab&pd=synthesis"
;
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/search/?keyword="
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
));
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/search/?keyword="
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
));
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
(
);
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
){
if
(
htmlBody
!=
null
){
tta
=
parseHtmlByAccount
(
htmlBody
,
name
,
proxy
);
tta
=
parseHtmlByAccount
(
htmlBody
,
name
,
proxy
);
}
}
...
@@ -73,8 +74,7 @@ public class TouTiaoAccountParse {
...
@@ -73,8 +74,7 @@ public class TouTiaoAccountParse {
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
=
Tools
.
getTouTiaoHeader
();
TouTiaoAccount
tta
=
null
;
TouTiaoAccount
tta
=
null
;
try
{
try
{
String
htmlBody
=
null
;
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
if
(
htmlBody
!=
null
){
if
(
htmlBody
!=
null
){
tta
=
parseAccountByUserId
(
htmlBody
,
user_id
,
proxy
);
tta
=
parseAccountByUserId
(
htmlBody
,
user_id
,
proxy
);
}
}
...
@@ -106,8 +106,7 @@ public class TouTiaoAccountParse {
...
@@ -106,8 +106,7 @@ public class TouTiaoAccountParse {
String
url
=
"https://www.toutiao.com/search_content/?offset="
+
page
*
20
+
"&format=json&keyword="
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
)+
"&autoload=true&count=20&cur_tab=4&from=media"
;
String
url
=
"https://www.toutiao.com/search_content/?offset="
+
page
*
20
+
"&format=json&keyword="
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
)+
"&autoload=true&count=20&cur_tab=4&from=media"
;
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
=
Tools
.
getTouTiaoHeader
();
try
{
try
{
String
htmlBody
=
null
;
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
if
(
htmlBody
!=
null
){
if
(
htmlBody
!=
null
){
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
list
.
addAll
(
parseHtmlByWord
(
json
,
proxy
));
list
.
addAll
(
parseHtmlByWord
(
json
,
proxy
));
...
@@ -149,8 +148,7 @@ public class TouTiaoAccountParse {
...
@@ -149,8 +148,7 @@ public class TouTiaoAccountParse {
headerMap
.
put
(
"User-Agent"
,
"Dalvik/2.1.0 (Linux; U; Android 8.1.0; MI 8 MIUI/V10.0.11.0.OEACNFH) NewsArticle/7.0.1 cronet/TTNetVersion:pre_blink_merge-277498-gd2bb364e 2018-08-24"
);
headerMap
.
put
(
"User-Agent"
,
"Dalvik/2.1.0 (Linux; U; Android 8.1.0; MI 8 MIUI/V10.0.11.0.OEACNFH) NewsArticle/7.0.1 cronet/TTNetVersion:pre_blink_merge-277498-gd2bb364e 2018-08-24"
);
headerMap
.
put
(
"Host"
,
"it-hl.snssdk.com"
);
headerMap
.
put
(
"Host"
,
"it-hl.snssdk.com"
);
try
{
try
{
String
htmlBody
=
null
;
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"name"
)){
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"name"
)){
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
more
=
json
.
getJSONObject
(
"data"
).
getBooleanValue
(
"has_more"
);
more
=
json
.
getJSONObject
(
"data"
).
getBooleanValue
(
"has_more"
);
...
@@ -494,4 +492,28 @@ public class TouTiaoAccountParse {
...
@@ -494,4 +492,28 @@ public class TouTiaoAccountParse {
return
ttaList
;
return
ttaList
;
}
}
/**
 * Downloads the body of {@code url} with a GET request, making at most three attempts.
 *
 * @param url     address to fetch
 * @param proxy   proxy to send the request through; when {@code null} the request is
 *                sent through {@code ProxyHolder.NAT_PROXY} instead
 * @param headMap request headers passed to {@code RequestUtils.wrapGet}
 * @return the response body as a string, or {@code null} when all three attempts threw
 */
private static String downloadHtml(String url, Proxy proxy, Map<String, String> headMap) {
    for (int attempt = 1; attempt <= 3; attempt++) {
        try {
            Response response;
            // Use the headers supplied by the caller (the parameter), not the shared class field.
            if (proxy != null) {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), proxy);
            } else {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_PROXY);
            }
            return response.body().string();
        } catch (Exception e) {
            // Log the exception itself: fillInStackTrace() would replace the original
            // stack trace with this catch site and hide where the failure happened.
            logger.error("获取数据时出现问题,问题为:{}", e);
            // Falling through retries; the loop bound ends the retries after the third failure.
        }
    }
    return null;
}
}
}
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
View file @
f4ed3aa0
...
@@ -33,6 +33,8 @@ import com.zhiwei.toutiao.bean.Signature;
...
@@ -33,6 +33,8 @@ import com.zhiwei.toutiao.bean.Signature;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.util.Tools
;
import
com.zhiwei.toutiao.util.Tools
;
import
okhttp3.Response
;
/**
/**
* @Description:头条帐号采集
* @Description:头条帐号采集
* @author hero
* @author hero
...
@@ -55,7 +57,6 @@ public class TouTiaoArticleParse {
...
@@ -55,7 +57,6 @@ public class TouTiaoArticleParse {
* @return List<TouTiao> 返回类型
* @return List<TouTiao> 返回类型
* @throws Exception
* @throws Exception
*/
*/
@Deprecated
public
static
Map
<
String
,
Object
>
getTouTiaoList
(
String
media_id
,
String
max_behot_time
,
Date
endData
,
Proxy
proxy
)
public
static
Map
<
String
,
Object
>
getTouTiaoList
(
String
media_id
,
String
max_behot_time
,
Date
endData
,
Proxy
proxy
)
throws
Exception
{
throws
Exception
{
Signature
signature
=
new
Signature
();
Signature
signature
=
new
Signature
();
...
@@ -66,9 +67,8 @@ public class TouTiaoArticleParse {
...
@@ -66,9 +67,8 @@ public class TouTiaoArticleParse {
}
}
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"Referer"
,
url
);
String
htmlBody
=
null
;
try
{
try
{
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
(
);
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
htmlBody
,
endData
);
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
htmlBody
,
endData
);
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
...
@@ -84,22 +84,19 @@ public class TouTiaoArticleParse {
...
@@ -84,22 +84,19 @@ public class TouTiaoArticleParse {
return
Collections
.
emptyMap
();
return
Collections
.
emptyMap
();
}
}
@Deprecated
public
static
Map
<
String
,
Object
>
getTouTiaoList
(
String
media_id
,
Long
max_behot_time
,
Date
endData
,
public
static
Map
<
String
,
Object
>
getTouTiaoList
(
String
media_id
,
String
max_behot_time
,
Date
endData
,
ProxyHolder
proxy
)
ProxyHolder
proxy
)
throws
Exception
{
throws
Exception
{
Signature
signature
=
new
Signature
();
Signature
signature
=
new
Signature
();
String
as
=
signature
.
getAs
();
String
url
=
"https://www.toutiao.com/pgc/ma/?page_type=1&media_id="
+
media_id
+
"&count=20&as="
String
cp
=
signature
.
getCp
();
+
signature
.
getAs
()
+
"&cp="
+
signature
.
getCp
();
String
url
=
"https://www.toutiao.com/pgc/ma/?page_type=1&media_id="
+
media_id
+
"&count=20&as="
+
as
+
"&cp="
+
cp
;
if
(
max_behot_time
!=
null
)
{
if
(
max_behot_time
!=
null
)
{
url
=
url
+
"&max_behot_time="
+
max_behot_time
;
url
=
url
+
"&max_behot_time="
+
max_behot_time
;
}
}
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"Referer"
,
url
);
String
htmlBody
=
null
;
try
{
try
{
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
htmlBody
,
endData
);
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
htmlBody
,
endData
);
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
...
@@ -109,7 +106,7 @@ public class TouTiaoArticleParse {
...
@@ -109,7 +106,7 @@ public class TouTiaoArticleParse {
logger
.
info
(
"数据为null"
);
logger
.
info
(
"数据为null"
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
);
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
.
fillInStackTrace
()
);
throw
e
;
throw
e
;
}
}
return
Collections
.
emptyMap
();
return
Collections
.
emptyMap
();
...
@@ -134,14 +131,12 @@ public class TouTiaoArticleParse {
...
@@ -134,14 +131,12 @@ public class TouTiaoArticleParse {
String
_signature
=
signature
.
getSignature
();
String
_signature
=
signature
.
getSignature
();
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
user_id
+
"&max_behot_time="
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
user_id
+
"&max_behot_time="
+
max_behot_time
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
_signature
;
+
max_behot_time
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
_signature
;
System
.
out
.
println
(
url
);
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
headerMap
.
put
(
"user-agent"
,
headerMap
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
String
htmlBody
=
null
;
try
{
try
{
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
(
);
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
))
{
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
user_id
,
htmlBody
,
endData
);
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
user_id
,
htmlBody
,
endData
);
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
...
@@ -360,9 +355,8 @@ public class TouTiaoArticleParse {
...
@@ -360,9 +355,8 @@ public class TouTiaoArticleParse {
}
}
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
System
.
out
.
println
(
url
);
try
{
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
(
);
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
parseHtmlByMicroAccount
(
htmlBody
,
endDate
);
Map
<
String
,
Object
>
dataMap
=
parseHtmlByMicroAccount
(
htmlBody
,
endDate
);
if
(
dataMap
!=
null
&&
dataMap
.
size
()
>
0
)
{
if
(
dataMap
!=
null
&&
dataMap
.
size
()
>
0
)
{
...
@@ -386,7 +380,6 @@ public class TouTiaoArticleParse {
...
@@ -386,7 +380,6 @@ public class TouTiaoArticleParse {
}
}
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
System
.
out
.
println
(
url
);
try
{
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
!=
null
)
{
...
@@ -479,4 +472,27 @@ public class TouTiaoArticleParse {
...
@@ -479,4 +472,27 @@ public class TouTiaoArticleParse {
return
map
;
return
map
;
}
}
/**
 * Fetches {@code url} with a GET request and returns the response body, retrying on failure.
 * Up to three attempts are made; each failed attempt is logged.
 *
 * @param url     address to fetch
 * @param proxy   proxy to send the request through; when {@code null} the request is
 *                sent through {@code ProxyHolder.NAT_PROXY} instead
 * @param headMap request headers passed to {@code RequestUtils.wrapGet}
 * @return the response body as a string, or {@code null} when every attempt threw
 */
private static String downloadHtml(String url, Proxy proxy, Map<String, String> headMap) {
    for (int attempt = 1; attempt <= 3; attempt++) {
        try {
            Response response;
            if (proxy != null) {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), proxy);
            } else {
                // No proxy supplied by the caller: use the NAT proxy.
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_PROXY);
            }
            return response.body().string();
        } catch (Exception e) {
            // Pass the exception unchanged; calling fillInStackTrace() here would overwrite
            // its stack trace with this catch site before it is logged.
            logger.error("获取数据时出现问题,问题为:{}", e);
            // The for-loop condition stops the retries after the third failure.
        }
    }
    return null;
}
}
}
src/main/java/com/zhiwei/toutiao/parse/TouTiaoChannelParse.java
deleted
100644 → 0
View file @
e9bfd2df
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONException
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.util.Tools
;
/**
 * Collects Toutiao articles by channel: downloads a channel feed URL and converts the
 * JSON response into {@link TouTiaoArticle} objects.
 *
 * @author hero
 * @date 2017-07-24 16:57:22
 */
public class TouTiaoChannelParse {

    private static final Logger logger = LogManager.getLogger(TouTiaoChannelParse.class);

    /**
     * Downloads the channel feed at {@code url} and parses it.
     *
     * @param url   channel feed address; also sent as the {@code referer} header
     * @param proxy proxy handed to {@code HttpClientTemplateOK.get}
     * @return a map with the keys {@code "data"} (a {@code List<TouTiaoArticle>}) and
     *         {@code "next"} (a {@code Long}, possibly {@code null}); {@code null} when
     *         the download returned no body
     * @throws Exception whatever the download threw, rethrown after logging
     */
    public static Map<String, Object> touTiaoChannel(String url, Proxy proxy) throws Exception {
        // Kept local: a shared static header map would be overwritten by concurrent calls.
        Map<String, String> headerMap = Tools.getTouTiaoChannelHeader();
        headerMap.put("referer", url);
        String htmlBody;
        try {
            htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
        } catch (Exception e) {
            // Log and rethrow the exception untouched; fillInStackTrace() would replace the
            // original stack trace with this catch site for both the log and the caller.
            logger.error("获取数据连接出现问题:", e);
            throw e;
        }
        if (htmlBody != null) {
            return parseHtmlByChannel(htmlBody);
        }
        return null;
    }

    /**
     * Parses a channel feed JSON document.
     *
     * @param htmlBody JSON text of the feed
     * @return a map with {@code "data"} (list of parsed articles, empty when the document
     *         has no {@code data} array) and {@code "next"} (the
     *         {@code next.max_behot_time} value, or {@code null} when it cannot be read)
     */
    private static Map<String, Object> parseHtmlByChannel(String htmlBody) {
        Map<String, Object> dataMap = new HashMap<String, Object>();
        List<TouTiaoArticle> ttList = new ArrayList<TouTiaoArticle>();
        dataMap.put("data", ttList);
        dataMap.put("next", null);

        JSONObject jsonObject = JSONObject.parseObject(htmlBody);
        if (jsonObject == null) {
            // Nothing parseable: report an empty page instead of failing with an NPE.
            return dataMap;
        }
        dataMap.put("next", readNextBehotTime(jsonObject));

        JSONArray dataList = jsonObject.getJSONArray("data");
        if (dataList == null) {
            return dataMap;
        }
        for (int i = 0; i < dataList.size(); i++) {
            JSONObject jso = dataList.getJSONObject(i);
            if (jso == null) {
                continue;
            }
            try {
                // behot_time is multiplied by 1000 before being handed to TimeParse.
                String time = String.valueOf(jso.getLongValue("behot_time") * 1000);
                String title = jso.getString("title");
                String content = jso.getString("abstract");
                String commentCount = String.valueOf(jso.getIntValue("comments_count"));
                String source = jso.getString("source");
                String groupId = jso.getString("group_id");
                // Without a group_id there is no article link; the URL stays null rather
                // than aborting the whole page with an NPE inside getUrl.
                String url = groupId == null ? null : getUrl("http://www.toutiao.com/a" + groupId + "/");
                Date date = TimeParse.stringFormartDate(time);
                TouTiaoArticle tt = new TouTiaoArticle(url, title, null, source, date, content,
                        commentCount, "-1", "-1", "-1", "今日头条", null);
                ttList.add(tt);
            } catch (JSONException e) {
                // A malformed entry is skipped; the remaining entries are still parsed.
                continue;
            }
        }
        return dataMap;
    }

    /**
     * Reads {@code next.max_behot_time} from the feed document.
     *
     * @param jsonObject parsed feed document
     * @return the value, or {@code null} when the {@code next} object is absent or unreadable
     */
    private static Long readNextBehotTime(JSONObject jsonObject) {
        try {
            JSONObject nextObject = jsonObject.getJSONObject("next");
            return nextObject == null ? null : nextObject.getLong("max_behot_time");
        } catch (RuntimeException ignored) {
            // An unreadable "next" section is treated as having no next-page marker.
            return null;
        }
    }

    /**
     * Normalizes an article URL: rewrites {@code group/} to {@code a} and {@code /item/}
     * to {@code /i}, removes {@code m.}, adds the {@code www.} prefix to
     * {@code toutiao.com} when the URL contains no {@code www}, and ensures a trailing slash.
     *
     * @param url URL to normalize; may be {@code null} or empty
     * @return the normalized URL, or {@code url} unchanged when it is {@code null} or empty
     */
    private static String getUrl(String url) {
        if (url == null || url.isEmpty()) {
            return url;
        }
        // String.replace is a no-op when the target is absent, so no contains() guards are needed.
        url = url.replace("group/", "a");
        url = url.replace("/item/", "/i");
        url = url.replace("m.", "");
        if (!url.contains("www")) {
            url = url.replace("toutiao.com", "www.toutiao.com");
        }
        if (!url.endsWith("/")) {
            url = url + "/";
        }
        return url;
    }
}
src/main/java/com/zhiwei/toutiao/parse/TouTiaoCommentParse.java
View file @
f4ed3aa0
...
@@ -15,11 +15,14 @@ import com.alibaba.fastjson.JSONArray;
...
@@ -15,11 +15,14 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.toutiao.bean.TouTiaoComment
;
import
com.zhiwei.toutiao.bean.TouTiaoComment
;
import
com.zhiwei.toutiao.util.Tools
;
import
com.zhiwei.toutiao.util.Tools
;
import
okhttp3.Response
;
/**
/**
* @ClassName: TouTiaoComment
* @ClassName: TouTiaoComment
* @Description: 今日头条评论数据
* @Description: 今日头条评论数据
...
@@ -66,7 +69,7 @@ public class TouTiaoCommentParse {
...
@@ -66,7 +69,7 @@ public class TouTiaoCommentParse {
headerMap
.
put
(
"Host"
,
"is.snssdk.com"
);
headerMap
.
put
(
"Host"
,
"is.snssdk.com"
);
for
(
int
j
=
1
;
j
<=
3
;
j
++){
for
(
int
j
=
1
;
j
<=
3
;
j
++){
try
{
try
{
String
htmlBody
=
HttpClientTemplateOK
.
get
(
urlNew
,
proxy
,
headerMap
);
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
)
if
(
htmlBody
!=
null
)
{
{
List
<
TouTiaoComment
>
commentes
=
analySisComment
(
htmlBody
,
url
);
List
<
TouTiaoComment
>
commentes
=
analySisComment
(
htmlBody
,
url
);
...
@@ -77,7 +80,7 @@ public class TouTiaoCommentParse {
...
@@ -77,7 +80,7 @@ public class TouTiaoCommentParse {
}
}
ZhiWeiTools
.
sleep
(
4000
);
ZhiWeiTools
.
sleep
(
4000
);
break
;
break
;
}
catch
(
SocketTimeout
Exception
e
)
{
}
catch
(
Exception
e
)
{
continue
;
continue
;
}
}
}
}
...
@@ -137,8 +140,7 @@ public class TouTiaoCommentParse {
...
@@ -137,8 +140,7 @@ public class TouTiaoCommentParse {
String
urlNew
=
"http://www.toutiao.com/api/comment/list/?group_id="
+
group_id
+
"&item_id=0&count=20&offset=0"
;
String
urlNew
=
"http://www.toutiao.com/api/comment/list/?group_id="
+
group_id
+
"&item_id=0&count=20&offset=0"
;
//设置头信息
//设置头信息
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
urlNew
,
headerMap
),
proxy
).
body
().
string
();
String
htmlBody
=
downloadHtml
(
urlNew
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
)
if
(
htmlBody
!=
null
)
{
{
try
{
try
{
...
@@ -174,7 +176,7 @@ public class TouTiaoCommentParse {
...
@@ -174,7 +176,7 @@ public class TouTiaoCommentParse {
try
{
try
{
//设置头信息
//设置头信息
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
(
);
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"commentInfo"
))
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"commentInfo"
))
{
{
try
{
try
{
...
@@ -185,7 +187,6 @@ public class TouTiaoCommentParse {
...
@@ -185,7 +187,6 @@ public class TouTiaoCommentParse {
}
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
ZhiWeiTools
.
sleep
(
5000
);
continue
;
continue
;
}
}
}
}
...
@@ -206,7 +207,7 @@ public class TouTiaoCommentParse {
...
@@ -206,7 +207,7 @@ public class TouTiaoCommentParse {
try
{
try
{
//设置头信息
//设置头信息
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
(
);
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"commentInfo"
))
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"commentInfo"
))
{
{
try
{
try
{
...
@@ -217,7 +218,7 @@ public class TouTiaoCommentParse {
...
@@ -217,7 +218,7 @@ public class TouTiaoCommentParse {
}
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析头条评论数错误:::{}"
,
e
.
fillInStackTrace
());
}
}
return
0
;
return
0
;
}
}
...
@@ -238,7 +239,7 @@ public class TouTiaoCommentParse {
...
@@ -238,7 +239,7 @@ public class TouTiaoCommentParse {
String
urlNew
=
"http://www.toutiao.com/api/comment/list/?group_id="
+
group_id
+
"&item_id=0&count=20&offset=0"
;
String
urlNew
=
"http://www.toutiao.com/api/comment/list/?group_id="
+
group_id
+
"&item_id=0&count=20&offset=0"
;
//设置头信息
//设置头信息
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
urlNew
,
headerMap
),
proxy
).
body
().
string
(
);
String
htmlBody
=
downloadHtml
(
urlNew
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
)
if
(
htmlBody
!=
null
)
{
{
try
{
try
{
...
@@ -253,7 +254,6 @@ public class TouTiaoCommentParse {
...
@@ -253,7 +254,6 @@ public class TouTiaoCommentParse {
}
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
ZhiWeiTools
.
sleep
(
5000
);
continue
;
continue
;
}
}
}
}
...
@@ -301,7 +301,7 @@ public class TouTiaoCommentParse {
...
@@ -301,7 +301,7 @@ public class TouTiaoCommentParse {
String
groupId
=
null
;
String
groupId
=
null
;
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
try
{
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
(
);
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
headerMap
);
if
(
htmlBody
!=
null
)
if
(
htmlBody
!=
null
)
{
{
if
(
htmlBody
.
contains
(
"groupId"
))
if
(
htmlBody
.
contains
(
"groupId"
))
...
@@ -320,5 +320,27 @@ public class TouTiaoCommentParse {
...
@@ -320,5 +320,27 @@ public class TouTiaoCommentParse {
return
groupId
;
return
groupId
;
}
}
/**
 * Downloads the page at {@code url} with a GET request, trying up to three times.
 *
 * @param url       address to fetch
 * @param proxy     proxy to send the request through; when {@code null} the request is
 *                  sent through {@code ProxyHolder.NAT_PROXY} instead
 * @param headerMap request headers passed to {@code RequestUtils.wrapGet}
 * @return the response body as a string, or {@code null} when all three attempts threw
 */
private static String downloadHtml(String url, Proxy proxy, Map<String, String> headerMap) {
    for (int attempt = 1; attempt <= 3; attempt++) {
        try {
            Response response;
            if (proxy != null) {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy);
            } else {
                // Caller gave no proxy: fall back to the NAT proxy.
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_PROXY);
            }
            return response.body().string();
        } catch (Exception e) {
            // Log the exception as thrown; fillInStackTrace() would reset its stack trace
            // to this line and lose the real failure location.
            logger.error("获取数据时出现问题,问题为:{}", e);
            // No explicit break/continue needed: the loop bound limits the retries to three.
        }
    }
    return null;
}
}
}
src/main/java/com/zhiwei/toutiao/parse/TouTiaoQuestionAnswerParse.java
View file @
f4ed3aa0
...
@@ -45,7 +45,6 @@ public class TouTiaoQuestionAnswerParse {
...
@@ -45,7 +45,6 @@ public class TouTiaoQuestionAnswerParse {
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
if
(
jsonObject
.
getJSONObject
(
"data"
)
!=
null
){
if
(
jsonObject
.
getJSONObject
(
"data"
)
!=
null
){
JSONObject
data
=
jsonObject
.
getJSONObject
(
"data"
);
JSONObject
data
=
jsonObject
.
getJSONObject
(
"data"
);
System
.
out
.
println
(
data
.
getIntValue
(
"has_more"
));
page
++;
page
++;
JSONArray
ans_list
=
data
.
getJSONArray
(
"ans_list"
);
JSONArray
ans_list
=
data
.
getJSONArray
(
"ans_list"
);
for
(
int
i
=
0
;
i
<
ans_list
.
size
();
i
++){
for
(
int
i
=
0
;
i
<
ans_list
.
size
();
i
++){
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoSearchParse.java
View file @
f4ed3aa0
package
com
.
zhiwei
.
toutiao
.
parse
;
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.HashMap
;
...
@@ -15,9 +16,12 @@ import com.alibaba.fastjson.JSONObject;
...
@@ -15,9 +16,12 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
okhttp3.Response
;
/**
/**
* @ClassName: TouTiaoSearch
* @ClassName: TouTiaoSearch
* @Description: TODO(今日头条搜索采集解析程序)
* @Description: TODO(今日头条搜索采集解析程序)
...
@@ -39,10 +43,10 @@ public class TouTiaoSearchParse {
...
@@ -39,10 +43,10 @@ public class TouTiaoSearchParse {
* @return List<TouTiaoArticle> 返回类型
* @return List<TouTiaoArticle> 返回类型
* @throws Exception
* @throws Exception
*/
*/
public
static
Map
<
String
,
Object
>
touTiaoSearchByWord
(
String
url
,
Proxy
Holder
proxy
)
throws
Exception
{
public
static
Map
<
String
,
Object
>
touTiaoSearchByWord
(
String
url
,
Proxy
proxy
)
throws
Exception
{
String
htmlBody
=
null
;
String
htmlBody
=
null
;
try
{
try
{
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
).
body
().
string
(
);
htmlBody
=
downloadHtml
(
url
,
proxy
,
HeaderTool
.
getCommonHead
()
);
if
(
htmlBody
!=
null
){
if
(
htmlBody
!=
null
){
Map
<
String
,
Object
>
dataMap
=
parseHtmlBySearch
(
htmlBody
);
Map
<
String
,
Object
>
dataMap
=
parseHtmlBySearch
(
htmlBody
);
if
(
dataMap
!=
null
&&
dataMap
.
size
()>
0
){
if
(
dataMap
!=
null
&&
dataMap
.
size
()>
0
){
...
@@ -135,9 +139,30 @@ public class TouTiaoSearchParse {
...
@@ -135,9 +139,30 @@ public class TouTiaoSearchParse {
{
{
url
=
url
+
"/"
;
url
=
url
+
"/"
;
}
}
return
url
;
return
url
;
}
}
/**
 * Issues a GET request for {@code url} and returns the response body, with up to three
 * attempts in total.
 *
 * @param url       address to fetch
 * @param proxy     proxy to send the request through; when {@code null} the request is
 *                  sent through {@code ProxyHolder.NAT_PROXY} instead
 * @param headerMap request headers passed to {@code RequestUtils.wrapGet}
 * @return the response body as a string, or {@code null} when no attempt succeeded
 */
private static String downloadHtml(String url, Proxy proxy, Map<String, String> headerMap) {
    for (int attempt = 1; attempt <= 3; attempt++) {
        try {
            Response response;
            if (proxy != null) {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy);
            } else {
                // Default route when the caller passes no proxy.
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_PROXY);
            }
            return response.body().string();
        } catch (Exception e) {
            // Keep the original stack trace in the log; fillInStackTrace() would overwrite
            // it with the location of this catch block.
            logger.error("获取数据时出现问题,问题为:{}", e);
            // Retrying is implicit; the loop ends on its own after the third failure.
        }
    }
    return null;
}
}
}
src/main/java/com/zhiwei/wangyi/parse/WangyiNewParse.java
View file @
f4ed3aa0
...
@@ -38,7 +38,6 @@ public class WangyiNewParse {
...
@@ -38,7 +38,6 @@ public class WangyiNewParse {
while
(
finish
)
while
(
finish
)
{
{
String
url
=
"http://c.m.163.com/nc/subscribe/list/"
+
tid
+
"/all/"
+
page
*
20
+
"-20.html"
;
String
url
=
"http://c.m.163.com/nc/subscribe/list/"
+
tid
+
"/all/"
+
page
*
20
+
"-20.html"
;
System
.
out
.
println
(
url
);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
if
(
htmlBody
!=
null
)
if
(
htmlBody
!=
null
)
{
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment