Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
toutiao
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
toutiao
Commits
ee3aa8bd
Commit
ee3aa8bd
authored
Sep 19, 2018
by
[zhangzhiwei]
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改并更新今日头条采集程序
parent
2191591c
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
44 additions
and
44 deletions
+44
-44
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
+11
-10
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
+7
-8
src/main/java/com/zhiwei/toutiao/parse/TouTiaoChannelParse.java
+3
-3
src/main/java/com/zhiwei/toutiao/parse/TouTiaoCommentParse.java
+3
-3
src/main/java/com/zhiwei/toutiao/parse/TouTiaoParse.java
+3
-3
src/main/java/com/zhiwei/toutiao/parse/TouTiaoQuestionAnswerParse.java
+3
-3
src/main/java/com/zhiwei/toutiao/parse/TouTiaoQuestionParse.java
+3
-3
src/main/java/com/zhiwei/toutiao/parse/TouTiaoSearchParse.java
+3
-3
src/main/java/com/zhiwei/wangyi/parse/WangyiNewParse.java
+8
-8
No files found.
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
View file @
ee3aa8bd
...
...
@@ -6,12 +6,13 @@ import java.util.Date;
import
java.util.List
;
import
java.util.Map
;
import
org.
slf4j.Log
ger
;
import
org.
slf4j.LoggerFactory
;
import
org.
apache.logging.log4j.LogMana
ger
;
import
org.
apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.toutiao.bean.TouTiaoAccount
;
...
...
@@ -27,7 +28,7 @@ public class TouTiaoAccountParse {
private
TouTiaoAccountParse
()
{}
private
static
Map
<
String
,
String
>
headerMap
;
private
static
Logger
logger
=
Log
gerFactory
.
getLogger
(
TouTiaoAccountParse
.
class
);
private
static
Logger
logger
=
Log
Manager
.
getLogger
(
TouTiaoAccountParse
.
class
);
/**
* @Title: getTouTiaoAccountInfo
...
...
@@ -44,13 +45,13 @@ public class TouTiaoAccountParse {
TouTiaoAccount
tta
=
null
;
try
{
String
htmlBody
=
null
;
htmlBody
=
Http
ClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
htmlBody
=
Http
Boot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
(
);
if
(
htmlBody
!=
null
){
tta
=
parseHtmlByAccount
(
htmlBody
,
name
,
proxy
);
if
(
tta
==
null
){
url
=
"https://www.toutiao.com/search_content/?offset=0&format=json&keyword="
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)+
"&autoload=true&count=20&cur_tab=4&from=media"
;
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/search/?keyword="
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
));
htmlBody
=
Http
ClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
htmlBody
=
Http
Boot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
(
);
if
(
htmlBody
!=
null
){
tta
=
parseHtmlByAccount
(
htmlBody
,
name
,
proxy
);
}
...
...
@@ -58,7 +59,7 @@ public class TouTiaoAccountParse {
}
else
{
url
=
"https://www.toutiao.com/search_content/?offset=0&format=json&keyword="
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)+
"&autoload=true&count=20&cur_tab=4&from=media"
;
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/search/?keyword="
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
));
htmlBody
=
Http
ClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
htmlBody
=
Http
Boot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
(
);
if
(
htmlBody
!=
null
){
tta
=
parseHtmlByAccount
(
htmlBody
,
name
,
proxy
);
}
...
...
@@ -79,7 +80,7 @@ public class TouTiaoAccountParse {
TouTiaoAccount
tta
=
null
;
try
{
String
htmlBody
=
null
;
htmlBody
=
Http
ClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
htmlBody
=
Http
Boot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
(
);
if
(
htmlBody
!=
null
){
tta
=
parseAccountByUserId
(
htmlBody
,
user_id
,
proxy
);
}
...
...
@@ -112,7 +113,7 @@ public class TouTiaoAccountParse {
headerMap
=
Tools
.
getTouTiaoHeader
();
try
{
String
htmlBody
=
null
;
htmlBody
=
Http
ClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
htmlBody
=
Http
Boot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
(
);
if
(
htmlBody
!=
null
){
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
list
.
addAll
(
parseHtmlByWord
(
json
,
proxy
));
...
...
@@ -152,7 +153,7 @@ public class TouTiaoAccountParse {
headerMap
.
put
(
"Host"
,
"is.snssdk.com"
);
try
{
String
htmlBody
=
null
;
htmlBody
=
Http
ClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
htmlBody
=
Http
Boot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
(
);
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"name"
)){
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
more
=
json
.
getJSONObject
(
"data"
).
getBooleanValue
(
"has_more"
);
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
View file @
ee3aa8bd
...
...
@@ -19,12 +19,13 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
org.
slf4j.Log
ger
;
import
org.
slf4j.LoggerFactory
;
import
org.
apache.logging.log4j.LogMana
ger
;
import
org.
apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.util.Tools
;
...
...
@@ -36,7 +37,7 @@ import com.zhiwei.toutiao.util.Tools;
*/
public
class
TouTiaoArticleParse
{
private
TouTiaoArticleParse
()
{}
private
static
Logger
logger
=
Log
gerFactory
.
getLogger
(
TouTiaoArticleParse
.
class
);
private
static
Logger
logger
=
Log
Manager
.
getLogger
(
TouTiaoArticleParse
.
class
);
/***
* 获取头条数据
...
...
@@ -59,7 +60,7 @@ public class TouTiaoArticleParse {
headerMap
.
put
(
"Referer"
,
url
);
String
htmlBody
=
null
;
try
{
htmlBody
=
Http
ClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
htmlBody
=
Http
Boot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
(
);
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"behot_time"
)){
Map
<
String
,
Object
>
ttList
=
parseHtmlByAccount
(
htmlBody
,
endData
);
if
(
ttList
!=
null
&&
ttList
.
size
()>
0
){
...
...
@@ -154,7 +155,7 @@ public class TouTiaoArticleParse {
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"Referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
try
{
String
htmlBody
=
Http
ClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
String
htmlBody
=
Http
Boot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
(
);
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
parseHtmlByMicroAccount
(
htmlBody
,
endDate
);
if
(
dataMap
!=
null
&&
dataMap
.
size
()>
0
){
...
...
@@ -182,9 +183,7 @@ public class TouTiaoArticleParse {
* @param @return 设定文件
* @return Map<String,Object> 返回类型
*/
@SuppressWarnings
(
"unlikely-arg-type"
)
private
static
Map
<
String
,
Object
>
parseHtmlByMicroAccount
(
String
htmlBody
,
Date
endDate
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Long
max_behot_time
=
null
;
List
<
TouTiaoArticle
>
dataList
=
new
ArrayList
<
TouTiaoArticle
>();
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoChannelParse.java
View file @
ee3aa8bd
...
...
@@ -7,8 +7,8 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
org.
slf4j.Log
ger
;
import
org.
slf4j.LoggerFactory
;
import
org.
apache.logging.log4j.LogMana
ger
;
import
org.
apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONException
;
...
...
@@ -26,7 +26,7 @@ import com.zhiwei.toutiao.util.Tools;
*/
public
class
TouTiaoChannelParse
{
private
static
Map
<
String
,
String
>
headerMap
;
private
static
Logger
logger
=
Log
gerFactory
.
getLogger
(
TouTiaoChannelParse
.
class
);
private
static
Logger
logger
=
Log
Manager
.
getLogger
(
TouTiaoChannelParse
.
class
);
/**
* @Title: touTiaoChannel
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoCommentParse.java
View file @
ee3aa8bd
...
...
@@ -8,8 +8,8 @@ import java.util.Date;
import
java.util.List
;
import
java.util.Map
;
import
org.
slf4j.Log
ger
;
import
org.
slf4j.LoggerFactory
;
import
org.
apache.logging.log4j.LogMana
ger
;
import
org.
apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
...
...
@@ -26,7 +26,7 @@ import com.zhiwei.toutiao.util.Tools;
*/
public
class
TouTiaoCommentParse
{
private
static
Logger
logger
=
Log
gerFactory
.
getLogger
(
TouTiaoCommentParse
.
class
);
private
static
Logger
logger
=
Log
Manager
.
getLogger
(
TouTiaoCommentParse
.
class
);
/**
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoParse.java
View file @
ee3aa8bd
...
...
@@ -20,10 +20,10 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
...
...
@@ -40,7 +40,7 @@ import com.zhiwei.toutiao.util.Tools;
public
class
TouTiaoParse
{
private
Map
<
String
,
String
>
headerMap
;
private
Logger
logger
=
LoggerFactory
.
getLogger
(
TouTiaoCommentParse
.
class
);
private
static
Logger
logger
=
LogManager
.
getLogger
(
TouTiaoCommentParse
.
class
);
/***
* 获取头条数据
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoQuestionAnswerParse.java
View file @
ee3aa8bd
...
...
@@ -7,10 +7,10 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
...
...
@@ -28,7 +28,7 @@ import com.zhiwei.toutiao.util.Tools;
public
class
TouTiaoQuestionAnswerParse
{
private
static
Map
<
String
,
String
>
headerMap
;
private
static
Logger
logger
=
Log
gerFactory
.
getLogger
(
TouTiaoQuestionAnswerParse
.
class
);
private
static
Logger
logger
=
Log
Manager
.
getLogger
(
TouTiaoQuestionAnswerParse
.
class
);
public
static
Map
<
String
,
Object
>
getAnserList
(
String
questionId
,
int
page
,
int
req_type
,
Proxy
proxy
){
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoQuestionParse.java
View file @
ee3aa8bd
...
...
@@ -6,8 +6,8 @@ import java.util.Date;
import
java.util.List
;
import
java.util.Map
;
import
org.
slf4j.Log
ger
;
import
org.
slf4j.LoggerFactory
;
import
org.
apache.logging.log4j.LogMana
ger
;
import
org.
apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
...
...
@@ -25,7 +25,7 @@ import com.zhiwei.toutiao.util.Tools;
public
class
TouTiaoQuestionParse
{
private
static
Map
<
String
,
String
>
headerMap
;
private
static
Logger
logger
=
Log
gerFactory
.
getLogger
(
TouTiaoQuestionParse
.
class
);
private
static
Logger
logger
=
Log
Manager
.
getLogger
(
TouTiaoQuestionParse
.
class
);
/**
* @Title: getSearchTouTiaoQuestion
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoSearchParse.java
View file @
ee3aa8bd
...
...
@@ -7,8 +7,8 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
org.
slf4j.Log
ger
;
import
org.
slf4j.LoggerFactory
;
import
org.
apache.logging.log4j.LogMana
ger
;
import
org.
apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONException
;
...
...
@@ -27,7 +27,7 @@ import com.zhiwei.toutiao.util.Tools;
public
class
TouTiaoSearchParse
{
private
static
Map
<
String
,
String
>
headerMap
;
private
static
Logger
logger
=
Log
gerFactory
.
getLogger
(
TouTiaoSearchParse
.
class
);
private
static
Logger
logger
=
Log
Manager
.
getLogger
(
TouTiaoSearchParse
.
class
);
/**
* @Title: touTiaoSearchByWord
...
...
src/main/java/com/zhiwei/wangyi/parse/WangyiNewParse.java
View file @
ee3aa8bd
package
com
.
zhiwei
.
wangyi
.
parse
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.toutiao.util.Tools
;
import
com.zhiwei.wangyi.bean.WangYiNews
;
public
class
WangyiNewParse
{
private
static
Logger
logger
=
Log
gerFactory
.
getLogger
(
WangyiNewParse
.
class
);
private
static
Logger
logger
=
Log
Manager
.
getLogger
(
WangyiNewParse
.
class
);
private
static
boolean
finish
=
true
;
/**
* @Title: getWYHistory
...
...
@@ -27,7 +27,7 @@ public class WangyiNewParse {
* @return List<WangYiNews> 返回类型
* @throws Exception
*/
public
static
List
<
WangYiNews
>
getWYHistory
(
String
tid
,
Date
endTime
)
throws
Exception
public
static
List
<
WangYiNews
>
getWYHistory
(
String
tid
,
Date
endTime
,
Proxy
proxy
)
throws
Exception
{
List
<
WangYiNews
>
list
=
new
ArrayList
<
WangYiNews
>();
Map
<
String
,
String
>
headerMap
=
Tools
.
getWangYiHeader
();
...
...
@@ -38,7 +38,7 @@ public class WangyiNewParse {
{
String
url
=
"http://c.m.163.com/nc/subscribe/list/"
+
tid
+
"/all/"
+
page
*
20
+
"-20.html"
;
System
.
out
.
println
(
url
);
String
htmlBody
=
Http
ClientTemplateOK
.
get
(
url
,
null
,
headerMap
);
String
htmlBody
=
Http
Boot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
(
);
if
(
htmlBody
!=
null
)
{
List
<
WangYiNews
>
wyList
=
analysis
(
htmlBody
,
endTime
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment