Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
toutiao
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
toutiao
Commits
ec916427
Commit
ec916427
authored
Feb 18, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
头条关键词 采集修改
parent
5ade0fda
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
71 additions
and
73 deletions
+71
-73
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
+2
-2
src/main/java/com/zhiwei/toutiao/parse/TouTiaoSearchParse.java
+12
-14
src/test/java/com/zhiwei/toutiao/test/TouTiaoChannelExample.java
+57
-57
No files found.
src/main/java/com/zhiwei/toutiao/parse/TouTiaoArticleParse.java
View file @
ec916427
...
...
@@ -43,7 +43,7 @@ public class TouTiaoArticleParse {
}
private
static
Logger
logger
=
LogManager
.
getLogger
(
TouTiaoArticleParse
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
(
true
);
/***
* 获取头条数据
...
...
@@ -169,7 +169,7 @@ public class TouTiaoArticleParse {
String
url
=
"https://www.toutiao.com/c/user/article/?page_type=1&user_id="
+
user_id
+
"&max_behot_time="
+
max_behot_time
+
"&count=20&as="
+
as
+
"&cp="
+
cp
+
"&_signature="
+
_signature
;
logger
.
info
(
"当前采集的历史文章链接:::{}"
,
url
);
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
headerMap
.
put
(
"referer"
,
"https://www.toutiao.com/c/user/"
+
user_id
+
"/"
);
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoSearchParse.java
View file @
ec916427
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
...
...
@@ -13,10 +12,11 @@ import org.apache.logging.log4j.Logger;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONException
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.util.Tools
;
/**
* @ClassName: TouTiaoSearch
...
...
@@ -26,24 +26,23 @@ import com.zhiwei.toutiao.util.Tools;
*/
public
class
TouTiaoSearchParse
{
private
static
Map
<
String
,
String
>
headerMap
;
private
static
Logger
logger
=
LogManager
.
getLogger
(
TouTiaoSearchParse
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
/**
* @Title: touTiaoSearchByWord
* @author hero
* @Description:
TODO
(根据关键词采集今日头条数据)
* @Description: (根据关键词采集今日头条数据)
* @param @param url
* @param @return 设定文件
* @return List<TouTiaoArticle> 返回类型
* @throws Exception
*/
public
static
Map
<
String
,
Object
>
touTiaoSearchByWord
(
String
url
,
Proxy
proxy
)
throws
Exception
{
headerMap
=
Tools
.
getTouTiaoSearchHeader
();
headerMap
.
put
(
"referer"
,
url
);
public
static
Map
<
String
,
Object
>
touTiaoSearchByWord
(
String
url
,
ProxyHolder
proxy
)
throws
Exception
{
String
htmlBody
=
null
;
try
{
htmlBody
=
HttpClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
).
body
().
string
(
);
if
(
htmlBody
!=
null
){
Map
<
String
,
Object
>
dataMap
=
parseHtmlBySearch
(
htmlBody
);
if
(
dataMap
!=
null
&&
dataMap
.
size
()>
0
){
...
...
@@ -60,7 +59,7 @@ public class TouTiaoSearchParse {
/**
* @Title: parseHtmlBySearch
* @author hero
* @Description:
TODO
(解析文本)
* @Description: (解析文本)
* @param @param htmlBody
* @param @return 设定文件
* @return List<TouTiaoArticle> 返回类型
...
...
@@ -72,8 +71,8 @@ public class TouTiaoSearchParse {
int
has_more
=
jsonObject
.
getIntValue
(
"has_more"
);
if
(
null
!=
dataList
&&
dataList
.
size
()>
0
){
Map
<
String
,
Object
>
result
=
new
HashMap
<
String
,
Object
>();
List
<
TouTiaoArticle
>
ttList
=
new
ArrayList
<
TouTiaoArticle
>();
Map
<
String
,
Object
>
result
=
new
HashMap
<>();
List
<
TouTiaoArticle
>
ttList
=
new
ArrayList
<>();
for
(
int
i
=
0
;
i
<
dataList
.
size
();
i
++)
{
JSONObject
jso
=
dataList
.
getJSONObject
(
i
);
try
{
...
...
@@ -93,8 +92,7 @@ public class TouTiaoSearchParse {
TouTiaoArticle
tt
=
new
TouTiaoArticle
(
url
,
title
,
user_id
,
source
,
date
,
content
,
comment_count
,
"-1"
,
"-1"
,
"-1"
,
"今日头条"
,
null
);
ttList
.
add
(
tt
);
}
catch
(
JSONException
e
)
{
logger
.
debug
(
"解析数据出现问题"
,
e
.
fillInStackTrace
());
continue
;
logger
.
debug
(
"解析数据出现问题 {}"
,
e
);
}
}
result
.
put
(
"data"
,
ttList
);
...
...
src/test/java/com/zhiwei/toutiao/test/TouTiaoChannelExample.java
View file @
ec916427
package
com
.
zhiwei
.
toutiao
.
test
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.toutiao.bean.Signature
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.parse.TouTiaoChannelParse
;
import
com.zhiwei.toutiao.util.Tools
;
/**
 * Manual smoke test for Toutiao channel parsing.
 *
 * <p>Requests up to {@link #MAX_PAGES} pages of the {@code news_tech} feed, prints every
 * parsed article to standard output, and feeds the {@code "next"} value of each result back
 * into the following request as {@code max_behot_time}.
 *
 * @author hero
 * @date 2017-07-24 17:10:52
 */
public class TouTiaoChannelExample {

    /** Maximum number of feed pages requested in one run. */
    private static final int MAX_PAGES = 3;

    /**
     * Runs the paging loop and prints the articles of each page.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        // Paging cursor sent as max_behot_time / max_behot_time_tmp; starts at 0.
        long max_behot_time = 0;
        for (int i = 0; i < MAX_PAGES; i++) {
            System.out.println("i==============" + i);

            // A fresh Signature is created per request to obtain the "as" and "cp" parameters.
            Signature signature = new Signature();
            String as = signature.getAs();
            String cp = signature.getCp();
            String url = "http://www.toutiao.com/api/pc/feed/?category=news_tech&utm_source=toutiao"
                    + "&widen=1&max_behot_time=" + max_behot_time
                    + "&max_behot_time_tmp=" + max_behot_time
                    + "&tadrequire=true&as=" + as
                    + "&cp=" + cp;
            System.out.println("url:" + url);

            try {
                // NOTE(review): the second argument is passed as null - presumably "no proxy";
                // confirm against TouTiaoChannelParse.touTiaoChannel.
                Map<String, Object> result = TouTiaoChannelParse.touTiaoChannel(url, null);
                if (result == null) {
                    // Nothing parsed for this request; try again with the same cursor.
                    continue;
                }

                printArticles(result.get("data"));

                // Accept any numeric box type instead of hard-casting to Long, which would
                // throw ClassCastException if the parser stored e.g. an Integer.
                Object next = result.get("next");
                if (!(next instanceof Number)) {
                    // No usable cursor for a following page: stop paging.
                    break;
                }
                max_behot_time = ((Number) next).longValue();
            } catch (Exception e) {
                // Keep going with the remaining iterations; this is a console example only.
                e.printStackTrace();
            }
        }
    }

    /**
     * Prints the number of articles followed by each article.
     *
     * @param data value stored under the {@code "data"} key of the parse result; anything that
     *             is not a {@link List} (including {@code null}) is reported as an empty list
     */
    private static void printArticles(Object data) {
        if (!(data instanceof List)) {
            System.out.println("ttlist size is 0");
            return;
        }
        // The map is typed Map<String, Object>, so the element type cannot be checked here.
        // NOTE(review): assumes the parser stores a List<TouTiaoArticle> under "data" - confirm.
        @SuppressWarnings("unchecked")
        List<TouTiaoArticle> ttList = (List<TouTiaoArticle>) data;
        System.out.println("ttlist size is " + ttList.size());
        for (TouTiaoArticle tt : ttList) {
            System.out.println(tt);
        }
    }
}
//
package com.zhiwei.toutiao.test;
//
//
import java.util.List;
//
import java.util.Map;
//
//
import com.zhiwei.toutiao.bean.Signature;
//
import com.zhiwei.toutiao.bean.TouTiaoArticle;
//
import com.zhiwei.toutiao.parse.TouTiaoChannelParse;
//
import com.zhiwei.toutiao.util.Tools;
//
/
//
**
//
* @ClassName: TouTiaoChannelExample
//
* @Description: TODO(头条频道解析测试)
//
* @author hero
//
* @date 2017年7月24日 下午5:10:52
//
*/
//
public class TouTiaoChannelExample {
//
//
public static void main(String[] args) {
//
//
long max_behot_time = 0;
//
for(int i= 0;i<3; i++){
//
System.out.println("i=============="+i);
//
if( i==0 ){
//
max_behot_time = 0;
//
}
//
Signature signature = new Signature();
//
String as = signature.getAs();
//
String cp = signature.getCp();
//
String url = "http://www.toutiao.com/api/pc/feed/?category=news_tech&utm_source=toutiao"
//
+ "&widen=1&max_behot_time="+max_behot_time+"&max_behot_time_tmp="+max_behot_time
//
+"&tadrequire=true&as=" +as +"&cp=" + cp;
//
System.out.println("url:" + url);
//
//
Map<String, Object> result;
//
try {
//
result = TouTiaoChannelParse.touTiaoChannel(url, null);
//
if(result!=null){
//
Long next = (Long)result.get("next");
//
List<TouTiaoArticle> ttList = (List<TouTiaoArticle>)result.get("data");
//
System.out.println("ttlist size is " + ttList.size());
//
for(TouTiaoArticle tt : ttList){
//
System.out.println(tt);
//
}
//
if(next != null){
//
max_behot_time = next;
//
}else{
//
break;
//
}
//
}
//
} catch (Exception e) {
//
e.printStackTrace();
//
}
//
}
//
}
//
//
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment