Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
toutiao
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
toutiao
Commits
ec916427
Commit
ec916427
authored
Feb 18, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
头条关键词 采集修改
parent
5ade0fda
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
71 additions
and
73 deletions
+71
-73
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
+2
-2
src/main/java/com/zhiwei/toutiao/parse/TouTiaoSearchParse.java
+12
-14
src/test/java/com/zhiwei/toutiao/test/TouTiaoChannelExample.java
+57
-57
No files found.
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
View file @
ec916427
...
@@ -43,7 +43,7 @@ public class TouTiaoArticleParse {
...
@@ -43,7 +43,7 @@ public class TouTiaoArticleParse {
}
}
private
static
Logger
logger
=
LogManager
.
getLogger
(
TouTiaoArticleParse
.
class
);
private
static
Logger
logger
=
LogManager
.
getLogger
(
TouTiaoArticleParse
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
(
true
);
/***
/***
* 获取头条数据
* 获取头条数据
...
@@ -169,7 +169,7 @@ public class TouTiaoArticleParse {
...
@@ -169,7 +169,7 @@ public class TouTiaoArticleParse {
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
user_id
+
"&max_behot_time="
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
user_id
+
"&max_behot_time="
+
max_behot_time
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
_signature
;
+
max_behot_time
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
_signature
;
logger
.
info
(
"当前采集的历史文章链接:::{}"
,
url
);
logger
.
info
(
"当前采集的历史文章链接:::{}"
,
url
);
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"user-agent"
,
headerMap
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoSearchParse.java
View file @
ec916427
package
com
.
zhiwei
.
toutiao
.
parse
;
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.HashMap
;
...
@@ -13,10 +12,11 @@ import org.apache.logging.log4j.Logger;
...
@@ -13,10 +12,11 @@ import org.apache.logging.log4j.Logger;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONException
;
import
com.alibaba.fastjson.JSONException
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.util.Tools
;
/**
/**
* @ClassName: TouTiaoSearch
* @ClassName: TouTiaoSearch
...
@@ -26,24 +26,23 @@ import com.zhiwei.toutiao.util.Tools;
...
@@ -26,24 +26,23 @@ import com.zhiwei.toutiao.util.Tools;
*/
*/
public
class
TouTiaoSearchParse
{
public
class
TouTiaoSearchParse
{
private
static
Map
<
String
,
String
>
headerMap
;
private
static
Logger
logger
=
LogManager
.
getLogger
(
TouTiaoSearchParse
.
class
);
private
static
Logger
logger
=
LogManager
.
getLogger
(
TouTiaoSearchParse
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
/**
/**
* @Title: touTiaoSearchByWord
* @Title: touTiaoSearchByWord
* @author hero
* @author hero
* @Description:
TODO
(根据关键词采集今日头条数据)
* @Description: (根据关键词采集今日头条数据)
* @param @param url
* @param @param url
* @param @return 设定文件
* @param @return 设定文件
* @return List<TouTiaoArticle> 返回类型
* @return List<TouTiaoArticle> 返回类型
* @throws Exception
* @throws Exception
*/
*/
public
static
Map
<
String
,
Object
>
touTiaoSearchByWord
(
String
url
,
Proxy
proxy
)
throws
Exception
{
public
static
Map
<
String
,
Object
>
touTiaoSearchByWord
(
String
url
,
ProxyHolder
proxy
)
throws
Exception
{
headerMap
=
Tools
.
getTouTiaoSearchHeader
();
headerMap
.
put
(
"referer"
,
url
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
try
{
try
{
htmlBody
=
HttpClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
).
body
().
string
(
);
if
(
htmlBody
!=
null
){
if
(
htmlBody
!=
null
){
Map
<
String
,
Object
>
dataMap
=
parseHtmlBySearch
(
htmlBody
);
Map
<
String
,
Object
>
dataMap
=
parseHtmlBySearch
(
htmlBody
);
if
(
dataMap
!=
null
&&
dataMap
.
size
()>
0
){
if
(
dataMap
!=
null
&&
dataMap
.
size
()>
0
){
...
@@ -60,7 +59,7 @@ public class TouTiaoSearchParse {
...
@@ -60,7 +59,7 @@ public class TouTiaoSearchParse {
/**
/**
* @Title: parseHtmlBySearch
* @Title: parseHtmlBySearch
* @author hero
* @author hero
* @Description:
TODO
(解析文本)
* @Description: (解析文本)
* @param @param htmlBody
* @param @param htmlBody
* @param @return 设定文件
* @param @return 设定文件
* @return List<TouTiaoArticle> 返回类型
* @return List<TouTiaoArticle> 返回类型
...
@@ -72,8 +71,8 @@ public class TouTiaoSearchParse {
...
@@ -72,8 +71,8 @@ public class TouTiaoSearchParse {
int
has_more
=
jsonObject
.
getIntValue
(
"has_more"
);
int
has_more
=
jsonObject
.
getIntValue
(
"has_more"
);
if
(
null
!=
dataList
&&
dataList
.
size
()>
0
){
if
(
null
!=
dataList
&&
dataList
.
size
()>
0
){
Map
<
String
,
Object
>
result
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
result
=
new
HashMap
<>();
List
<
TouTiaoArticle
>
ttList
=
new
ArrayList
<
TouTiaoArticle
>();
List
<
TouTiaoArticle
>
ttList
=
new
ArrayList
<>();
for
(
int
i
=
0
;
i
<
dataList
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
dataList
.
size
();
i
++)
{
JSONObject
jso
=
dataList
.
getJSONObject
(
i
);
JSONObject
jso
=
dataList
.
getJSONObject
(
i
);
try
{
try
{
...
@@ -93,8 +92,7 @@ public class TouTiaoSearchParse {
...
@@ -93,8 +92,7 @@ public class TouTiaoSearchParse {
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
url
,
title
,
user_id
,
source
,
date
,
content
,
comment_count
,
"-1"
,
"-1"
,
"-1"
,
"今日头条"
,
null
);
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
url
,
title
,
user_id
,
source
,
date
,
content
,
comment_count
,
"-1"
,
"-1"
,
"-1"
,
"今日头条"
,
null
);
ttList
.
add
(
tt
);
ttList
.
add
(
tt
);
}
catch
(
JSONException
e
)
{
}
catch
(
JSONException
e
)
{
logger
.
debug
(
"解析数据出现问题"
,
e
.
fillInStackTrace
());
logger
.
debug
(
"解析数据出现问题 {}"
,
e
);
continue
;
}
}
}
}
result
.
put
(
"data"
,
ttList
);
result
.
put
(
"data"
,
ttList
);
...
...
src/test/java/com/zhiwei/toutiao/test/TouTiaoChannelExample.java
View file @
ec916427
package
com
.
zhiwei
.
toutiao
.
test
;
//
package com.zhiwei.toutiao.test;
//
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
com.zhiwei.toutiao.bean.Signature
;
//
import com.zhiwei.toutiao.bean.Signature;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
//
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import
com.zhiwei.toutiao.parse.TouTiaoChannelParse
;
//
import com.zhiwei.toutiao.parse.TouTiaoChannelParse;
import
com.zhiwei.toutiao.util.Tools
;
//
import com.zhiwei.toutiao.util.Tools;
//
/**
/
//
**
* @ClassName: TouTiaoChannelExample
//
* @ClassName: TouTiaoChannelExample
* @Description: TODO(头条频道解析测试)
//
* @Description: TODO(头条频道解析测试)
* @author hero
//
* @author hero
* @date 2017年7月24日 下午5:10:52
//
* @date 2017年7月24日 下午5:10:52
*/
//
*/
public
class
TouTiaoChannelExample
{
//
public class TouTiaoChannelExample {
//
public
static
void
main
(
String
[]
args
)
{
//
public static void main(String[] args) {
//
long
max_behot_time
=
0
;
//
long max_behot_time = 0;
for
(
int
i
=
0
;
i
<
3
;
i
++){
//
for(int i= 0;i<3; i++){
System
.
out
.
println
(
"i=============="
+
i
);
//
System.out.println("i=============="+i);
if
(
i
==
0
){
//
if( i==0 ){
max_behot_time
=
0
;
//
max_behot_time = 0;
}
//
}
Signature
signature
=
new
Signature
();
//
Signature signature = new Signature();
String
as
=
signature
.
getAs
();
//
String as = signature.getAs();
String
cp
=
signature
.
getCp
();
//
String cp = signature.getCp();
String
url
=
"http://www.toutiao.com/api/pc/feed/?category=news_tech&utm_source=toutiao"
//
String url = "http://www.toutiao.com/api/pc/feed/?category=news_tech&utm_source=toutiao"
+
"&widen=1&max_behot_time="
+
max_behot_time
+
"&max_behot_time_tmp="
+
max_behot_time
//
+ "&widen=1&max_behot_time="+max_behot_time+"&max_behot_time_tmp="+max_behot_time
+
"&tadrequire=true&as="
+
as
+
"&cp="
+
cp
;
//
+"&tadrequire=true&as=" +as +"&cp=" + cp;
System
.
out
.
println
(
"url:"
+
url
);
//
System.out.println("url:" + url);
//
Map
<
String
,
Object
>
result
;
//
Map<String, Object> result;
try
{
//
try {
result
=
TouTiaoChannelParse
.
touTiaoChannel
(
url
,
null
);
//
result = TouTiaoChannelParse.touTiaoChannel(url, null);
if
(
result
!=
null
){
//
if(result!=null){
Long
next
=
(
Long
)
result
.
get
(
"next"
);
//
Long next = (Long)result.get("next");
List
<
TouTiaoArticle
>
ttList
=
(
List
<
TouTiaoArticle
>)
result
.
get
(
"data"
);
//
List<TouTiaoArticle> ttList = (List<TouTiaoArticle>)result.get("data");
System
.
out
.
println
(
"ttlist size is "
+
ttList
.
size
());
//
System.out.println("ttlist size is " + ttList.size());
for
(
TouTiaoArticle
tt
:
ttList
){
//
for(TouTiaoArticle tt : ttList){
System
.
out
.
println
(
tt
);
//
System.out.println(tt);
}
//
}
if
(
next
!=
null
){
//
if(next != null){
max_behot_time
=
next
;
//
max_behot_time = next;
}
else
{
//
}else{
break
;
//
break;
}
//
}
}
//
}
}
catch
(
Exception
e
)
{
//
} catch (Exception e) {
e
.
printStackTrace
();
//
e.printStackTrace();
}
//
}
}
//
}
}
//
}
//
}
//
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment