Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
toutiao
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
toutiao
Commits
643adf87
Commit
643adf87
authored
Aug 21, 2019
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修复头条获取关注列表不翻页的情况
parent
e43ea617
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
328 additions
and
249 deletions
+328
-249
pom.xml
+1
-1
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
+7
-1
src/main/java/com/zhiwei/toutiao/parse/TouTiaoQuestionParse.java
+33
-23
src/test/java/com/zhiwei/toutiao/test/TouTiaoAccountExample.java
+44
-33
src/test/java/com/zhiwei/toutiao/test/TouTiaoExample.java
+76
-76
src/test/java/com/zhiwei/toutiao/test/TouTiaoMicroExample.java
+102
-76
src/test/java/com/zhiwei/toutiao/test/TouTiaoQuestionAnswerExample.java
+4
-4
src/test/java/com/zhiwei/toutiao/test/TouTiaoQuestionExample.java
+61
-35
No files found.
pom.xml
View file @
643adf87
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
toutiao
</artifactId>
<artifactId>
toutiao
</artifactId>
<version>
0.
3.9
-SNAPSHOT
</version>
<version>
0.
4.0
-SNAPSHOT
</version>
<dependencies>
<dependencies>
<dependency>
<dependency>
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
View file @
643adf87
...
@@ -178,11 +178,13 @@ public class TouTiaoAccountParse {
...
@@ -178,11 +178,13 @@ public class TouTiaoAccountParse {
Signature
signature
=
new
Signature
(
userId
,
"0"
);
Signature
signature
=
new
Signature
(
userId
,
"0"
);
String
signatureStr
=
signature
.
getSignature
();
String
signatureStr
=
signature
.
getSignature
();
boolean
more
=
true
;
boolean
more
=
true
;
long
cursor
=
0
;
while
(
more
){
while
(
more
){
String
url
=
"https://www.toutiao.com/c/user/following/?user_id="
+
userId
+
"&cursor=
0
&count=100&_signature="
+
signatureStr
;
String
url
=
"https://www.toutiao.com/c/user/following/?user_id="
+
userId
+
"&cursor=
"
+
cursor
+
"
&count=100&_signature="
+
signatureStr
;
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"referer"
,
"ihttps://www.toutiao.com/c/user/relation/"
+
userId
+
"/?tab=following"
);
headerMap
.
put
(
"referer"
,
"ihttps://www.toutiao.com/c/user/relation/"
+
userId
+
"/?tab=following"
);
headerMap
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
headerMap
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
logger
.
info
(
"链接地址为:{}"
,
url
);
for
(
int
i
=
0
;
i
<
3
;
i
++){
for
(
int
i
=
0
;
i
<
3
;
i
++){
try
{
try
{
String
htmlBody
=
null
;
String
htmlBody
=
null
;
...
@@ -190,14 +192,18 @@ public class TouTiaoAccountParse {
...
@@ -190,14 +192,18 @@ public class TouTiaoAccountParse {
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"name"
)){
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"name"
)){
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
more
=
json
.
getBooleanValue
(
"has_more"
);
more
=
json
.
getBooleanValue
(
"has_more"
);
cursor
=
json
.
getLongValue
(
"cursor"
);
List
<
TouTiaoAccount
>
dataList
=
parseFans
(
json
);
List
<
TouTiaoAccount
>
dataList
=
parseFans
(
json
);
logger
.
info
(
cursor
+
"=========="
+
dataList
.
size
());
if
(
dataList
!=
null
&&
!
dataList
.
isEmpty
()){
if
(
dataList
!=
null
&&
!
dataList
.
isEmpty
()){
ttaList
.
addAll
(
dataList
);
ttaList
.
addAll
(
dataList
);
break
;
}
else
{
}
else
{
more
=
false
;
more
=
false
;
}
}
}
else
{
}
else
{
more
=
false
;
more
=
false
;
logger
.
info
(
"数据结构错误,请检查链接:{},页面信息为:{}"
,
url
,
htmlBody
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
.
fillInStackTrace
());
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
.
fillInStackTrace
());
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoQuestionParse.java
View file @
643adf87
package
com
.
zhiwei
.
toutiao
.
parse
;
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
...
@@ -14,6 +16,7 @@ import com.zhiwei.crawler.core.HttpBoot;
...
@@ -14,6 +16,7 @@ import com.zhiwei.crawler.core.HttpBoot;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.toutiao.bean.TouTiaoQuestion
;
import
com.zhiwei.toutiao.bean.TouTiaoQuestion
;
import
com.zhiwei.toutiao.util.Tools
;
import
com.zhiwei.toutiao.util.Tools
;
...
@@ -25,7 +28,6 @@ import com.zhiwei.toutiao.util.Tools;
...
@@ -25,7 +28,6 @@ import com.zhiwei.toutiao.util.Tools;
*/
*/
public
class
TouTiaoQuestionParse
{
public
class
TouTiaoQuestionParse
{
private
static
Map
<
String
,
String
>
headerMap
;
private
static
Logger
logger
=
LogManager
.
getLogger
(
TouTiaoQuestionParse
.
class
);
private
static
Logger
logger
=
LogManager
.
getLogger
(
TouTiaoQuestionParse
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
...
@@ -41,21 +43,32 @@ public class TouTiaoQuestionParse {
...
@@ -41,21 +43,32 @@ public class TouTiaoQuestionParse {
* @return List<TouTiaoQuestion> 返回类型
* @return List<TouTiaoQuestion> 返回类型
* @throws Exception
* @throws Exception
*/
*/
public
static
List
<
TouTiaoQuestion
>
getSearchTouTiaoQuestion
(
String
url
)
throws
Exception
{
public
static
List
<
TouTiaoQuestion
>
getSearchTouTiaoQuestion
(
String
word
){
List
<
TouTiaoQuestion
>
questtionList
=
new
ArrayList
<
TouTiaoQuestion
>();
List
<
TouTiaoQuestion
>
questtionList
=
new
ArrayList
<>();
headerMap
=
Tools
.
getTouTiaoQuestionHeader
();
boolean
more
=
true
;
int
page
=
0
;
while
(
more
)
{
String
url
=
"https://www.wukong.com/wenda/web/search/question/brow/?search_text="
+
URLCodeUtil
.
getURLEncode
(
word
,
"UTF-8"
)+
"&count=10&offset="
+
page
*
10
;
Map
<
String
,
String
>
headerMap
=
Tools
.
getTouTiaoQuestionHeader
();
headerMap
.
put
(
"referer"
,
url
);
headerMap
.
put
(
"referer"
,
url
);
String
htmlBody
=
null
;
try
{
try
{
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
htmlBody
!=
null
)
{
if
(
Objects
.
nonNull
(
htmlBody
)
&&
htmlBody
.
contains
(
"question"
)
)
{
List
<
TouTiaoQuestion
>
ttList
=
parseHtmlByQuestion
(
htmlBody
);
List
<
TouTiaoQuestion
>
ttList
=
parseHtmlByQuestion
(
htmlBody
);
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
if
(
Objects
.
nonNull
(
ttList
)
&&
!
ttList
.
isEmpty
()
)
{
return
ttList
;
questtionList
.
addAll
(
ttList
)
;
}
}
JSONObject
dataJSON
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
more
=
dataJSON
.
getBooleanValue
(
"has_more"
);
page
++;
}
else
{
more
=
false
;
}
}
catch
(
IOException
e
)
{
logger
.
info
(
"头条问答解析数据出现问题"
,
e
);
}
}
}
catch
(
Exception
e
)
{
throw
e
;
}
}
return
questtionList
;
return
questtionList
;
}
}
...
@@ -71,15 +84,13 @@ public class TouTiaoQuestionParse {
...
@@ -71,15 +84,13 @@ public class TouTiaoQuestionParse {
* @return List<TouTiaoQuestion> 返回类型
* @return List<TouTiaoQuestion> 返回类型
*/
*/
private
static
List
<
TouTiaoQuestion
>
parseHtmlByQuestion
(
String
htmlBody
)
{
private
static
List
<
TouTiaoQuestion
>
parseHtmlByQuestion
(
String
htmlBody
)
{
List
<
TouTiaoQuestion
>
questtionList
=
new
ArrayList
<
TouTiaoQuestion
>();
List
<
TouTiaoQuestion
>
questtionList
=
new
ArrayList
<>();
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
String
err
_t
ips
=
jsonObject
.
getString
(
"err_tips"
);
String
err
T
ips
=
jsonObject
.
getString
(
"err_tips"
);
if
(
err
_t
ips
.
equals
(
"success"
))
{
if
(
err
T
ips
.
equals
(
"success"
))
{
JSONObject
json
=
jsonObject
.
getJSONObject
(
"data"
);
JSONObject
json
=
jsonObject
.
getJSONObject
(
"data"
);
JSONArray
jsonArray
=
json
.
getJSONArray
(
"feed_question"
);
JSONArray
jsonArray
=
json
.
getJSONArray
(
"feed_question"
);
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
try
{
try
{
JSONObject
question
=
jsonArray
.
getJSONObject
(
i
).
getJSONObject
(
"question"
);
JSONObject
question
=
jsonArray
.
getJSONObject
(
i
).
getJSONObject
(
"question"
);
...
@@ -88,18 +99,17 @@ public class TouTiaoQuestionParse {
...
@@ -88,18 +99,17 @@ public class TouTiaoQuestionParse {
String
url
=
"http://www.toutiao.com/a"
+
question
.
getString
(
"qid"
)
+
"/"
;
String
url
=
"http://www.toutiao.com/a"
+
question
.
getString
(
"qid"
)
+
"/"
;
Date
time
=
TimeParse
.
stringFormartDate
(
question
.
getLong
(
"create_time"
)
*
1000L
+
""
);
Date
time
=
TimeParse
.
stringFormartDate
(
question
.
getLong
(
"create_time"
)
*
1000L
+
""
);
String
source
=
question
.
getJSONObject
(
"user"
).
getString
(
"uname"
);
String
source
=
question
.
getJSONObject
(
"user"
).
getString
(
"uname"
);
int
follow
_c
ount
=
question
.
getIntValue
(
"follow_count"
);
int
follow
C
ount
=
question
.
getIntValue
(
"follow_count"
);
int
nice
_ans_c
ount
=
question
.
getIntValue
(
"nice_ans_count"
);
int
nice
AnsC
ount
=
question
.
getIntValue
(
"nice_ans_count"
);
int
normal
_ans_c
ount
=
question
.
getIntValue
(
"normal_ans_count"
);
int
normal
AnsC
ount
=
question
.
getIntValue
(
"normal_ans_count"
);
int
ans
_count
=
nice_ans_count
+
normal_ans_c
ount
;
int
ans
Count
=
niceAnsCount
+
normalAnsC
ount
;
TouTiaoQuestion
touTiaoQuestion
=
new
TouTiaoQuestion
(
url
,
title
,
source
,
content
,
time
,
TouTiaoQuestion
touTiaoQuestion
=
new
TouTiaoQuestion
(
url
,
title
,
source
,
content
,
time
,
follow
_count
,
nice_ans_count
,
normal_ans_count
,
ans_c
ount
);
follow
Count
,
niceAnsCount
,
normalAnsCount
,
ansC
ount
);
questtionList
.
add
(
touTiaoQuestion
);
questtionList
.
add
(
touTiaoQuestion
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
info
(
"头条问答解析数据出现问题"
,
e
.
fillInStackTrace
()
);
logger
.
info
(
"头条问答解析数据出现问题"
,
e
);
continue
;
continue
;
}
}
}
}
...
...
src/test/java/com/zhiwei/toutiao/test/TouTiaoAccountExample.java
View file @
643adf87
//package com.zhiwei.toutiao.test;
package
com
.
zhiwei
.
toutiao
.
test
;
//
//import java.util.List;
import
java.util.List
;
//
//import org.junit.Test;
import
com.zhiwei.common.config.GroupType
;
//
import
com.zhiwei.crawler.proxy.ProxyFactory
;
//import com.zhiwei.toutiao.bean.TouTiaoAccount;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
//import com.zhiwei.toutiao.parse.TouTiaoAccountParse;
import
com.zhiwei.toutiao.bean.TouTiaoAccount
;
//
import
com.zhiwei.toutiao.parse.TouTiaoAccountParse
;
///**
// * @ClassName: TouTiaoAccountExample
/**
// * @Description: TODO(今日头条帐号采集)
* @ClassName: TouTiaoAccountExample
// * @author hero
* @Description: TODO(今日头条帐号采集)
// * @date 2017年10月17日 下午4:03:44
* @author hero
// */
* @date 2017年10月17日 下午4:03:44
//public class TouTiaoAccountExample {
*/
//
public
class
TouTiaoAccountExample
{
// public void touTiaoAccountTest(){
// String word = "华尔街瞭望";
private
static
final
String
registry
=
"zookeeper://192.168.0.36:2181"
;
// System.out.println("===================="+TouTiaoAccountParse.getTouTiaoAccountInfoByName(word, null));
private
static
final
String
group
=
"local"
;
// }
//
//
public
static
void
main
(
String
[]
args
)
{
//
ProxyFactory
.
init
(
registry
,
group
,
GroupType
.
PROVIDER
);
// @Test
touTiaoAccountFriendTest
();
// public void touTiaoAccountFriendTest(){
// String userid = "3350881978";
}
// List<TouTiaoAccount> userList = TouTiaoAccountParse.getFriendsList(userid, null,1000);
// for(TouTiaoAccount tta : userList){
public
void
touTiaoAccountTest
(){
// System.out.println(tta);
String
word
=
"华尔街瞭望"
;
// }
System
.
out
.
println
(
"===================="
+
TouTiaoAccountParse
.
getTouTiaoAccountInfoByName
(
word
,
null
));
// }
}
//}
public
static
void
touTiaoAccountFriendTest
(){
String
userid
=
"3478445819704347"
;
List
<
TouTiaoAccount
>
userList
=
TouTiaoAccountParse
.
getFriendsList
(
userid
,
ProxyHolder
.
NAT_HEAVY_PROXY
);
for
(
TouTiaoAccount
tta
:
userList
){
System
.
out
.
println
(
tta
);
}
}
}
src/test/java/com/zhiwei/toutiao/test/TouTiaoExample.java
View file @
643adf87
/**
///**
* @Title: TouTiaoExample.java
// * @Title: TouTiaoExample.java
* @Package com.zhiwei.toutiao.test
// * @Package com.zhiwei.toutiao.test
* @Description:
// * @Description:
* @author hero
// * @author hero
* @date 2016年9月2日 上午11:48:51
// * @date 2016年9月2日 上午11:48:51
* @version V1.0
// * @version V1.0
*/
// */
/**
///**
*
//*
*/
//*/
package
com
.
zhiwei
.
toutiao
.
test
;
//package com.zhiwei.toutiao.test;
//
import
java.util.ArrayList
;
//import java.util.ArrayList;
import
java.util.Date
;
//import java.util.Date;
import
java.util.List
;
//import java.util.List;
import
java.util.Map
;
//import java.util.Map;
//
import
com.zhiwei.common.config.GroupType
;
//import com.zhiwei.common.config.GroupType;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
//import com.zhiwei.crawler.proxy.ProxyFactory;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
//import com.zhiwei.crawler.proxy.ProxyHolder;
import
com.zhiwei.tools.timeparse.TimeParse
;
//import com.zhiwei.tools.timeparse.TimeParse;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
import
com.zhiwei.toutiao.parse.TouTiaoArticleParse
;
//import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
//
/**
///**
* @Description:
// * @Description:
* @author hero
// * @author hero
* @date 2016年9月2日 上午11:48:51
// * @date 2016年9月2日 上午11:48:51
*/
// */
public
class
TouTiaoExample
{
//public class TouTiaoExample {
//
private
static
final
String
registry
=
"zookeeper://192.168.0.36:2181"
;
// private static final String registry = "zookeeper://192.168.0.36:2181";
private
static
final
String
group
=
"local"
;
// private static final String group = "local";
//
@SuppressWarnings
(
"unchecked"
)
// @SuppressWarnings("unchecked")
public
static
void
main
(
String
[]
args
)
throws
Exception
{
// public static void main(String[] args) throws Exception {
//
ProxyFactory
.
init
(
registry
,
group
,
GroupType
.
PROVIDER
);
// ProxyFactory.init(registry, group, GroupType.PROVIDER);
String
url
=
"https://www.toutiao.com/a6659244827009352196/"
;
// String url = "https://www.toutiao.com/a6659244827009352196/";
String
content
=
TouTiaoArticleParse
.
getContent
(
url
,
null
);
// String content = TouTiaoArticleParse.getContent(url, null);
System
.
out
.
println
(
content
);
// System.out.println(content);
//
// List<String> urlList = new ArrayList<String>();
//// List<String> urlList = new ArrayList<String>();
// urlList.add("1920576965");
//// urlList.add("1920576965");
// Date endTime = TimeParse.stringFormartDate("2018-10-01");
//// Date endTime = TimeParse.stringFormartDate("2018-10-01");
////
//// for (String url : urlList) {
//// long a = System.currentTimeMillis();
//// String mid = url;
//// Long max_behot_time = 0L;
//// List<TouTiaoArticle> list = new ArrayList<>();
//// boolean f = true;
//// while (f) {
//// Map<String, Object> dataMap = null;
//// dataMap = TouTiaoArticleParse.getTouTiaoHistory(mid, max_behot_time+"", endTime, ProxyHolder.NAT_PROXY);
//// if (dataMap != null && !dataMap.isEmpty()) {
//// List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
//// max_behot_time = (Long)dataMap.get("max_behot_time");
//// System.out.println(max_behot_time + "=======" + ttlist.size());
//// if (null == max_behot_time || ttlist.isEmpty()) {
//// f = false;
//// } else {
//// if (ttlist.size() > 0) {
//// list.addAll(ttlist);
//// }
//// }
//// }else{
//// f = false;
//// }
//// }
//// long b = System.currentTimeMillis();
//// System.out.println("一轮的采集时间为:" + (b - a) / 1000+" 数据量为" + list.size());
//// }
//
//
// for (String url : urlList) {
// long a = System.currentTimeMillis();
// String mid = url;
// Long max_behot_time = 0L;
// List<TouTiaoArticle> list = new ArrayList<>();
// boolean f = true;
// while (f) {
// Map<String, Object> dataMap = null;
// dataMap = TouTiaoArticleParse.getTouTiaoHistory(mid, max_behot_time+"", endTime, ProxyHolder.NAT_PROXY);
// if (dataMap != null && !dataMap.isEmpty()) {
// List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
// max_behot_time = (Long)dataMap.get("max_behot_time");
// System.out.println(max_behot_time + "=======" + ttlist.size());
// if (null == max_behot_time || ttlist.isEmpty()) {
// f = false;
// } else {
// if (ttlist.size() > 0) {
// list.addAll(ttlist);
// }
// }
// }else{
// f = false;
// }
// }
// long b = System.currentTimeMillis();
// System.out.println("一轮的采集时间为:" + (b - a) / 1000+" 数据量为" + list.size());
// }
// }
//
}
//}
}
src/test/java/com/zhiwei/toutiao/test/TouTiaoMicroExample.java
View file @
643adf87
///**
/**
// * @Title: TouTiaoExample.java
* @Title: TouTiaoExample.java
// * @Package com.zhiwei.toutiao.test
* @Package com.zhiwei.toutiao.test
// * @Description:
* @Description:
// * @author hero
* @author hero
// * @date 2016年9月2日 上午11:48:51
* @date 2016年9月2日 上午11:48:51
// * @version V1.0
* @version V1.0
// */
*//*
//package com.zhiwei.toutiao.test;
package com.zhiwei.toutiao.test;
//import java.util.Date;
import java.util.Date;
//import java.util.List;
import java.util.List;
//import java.util.Map;
import java.util.Map;
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
import com.alibaba.fastjson.JSON;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
import com.mongodb.BasicDBObject;
//
import com.mongodb.DB;
///**
import com.mongodb.DBCollection;
// * @Description:
import com.mongodb.DBObject;
// * @author hero
import com.mongodb.Mongo;
// * @date 2016年9月2日 上午11:48:51
import com.zhiwei.common.config.GroupType;
// */
import com.zhiwei.crawler.proxy.ProxyFactory;
//public class TouTiaoMicroExample {
import com.zhiwei.tools.timeparse.TimeParse;
//
import com.zhiwei.toutiao.bean.TouTiaoArticle;
// public static void main(String[] args) throws Exception {
import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
// long a = System.currentTimeMillis();
// String user_id = "55301399445";
*//**
// Date date = new Date((new Date().getTime()-24*60*60*1000));
* @Description:
// parseMicroTouTiao(user_id, date);
* @author hero
// long b = System.currentTimeMillis();
* @date 2016年9月2日 上午11:48:51
// System.out.println("一轮的采集时间为:" + (b - a) / 1000);
*//*
//
public class TouTiaoMicroExample {
// }
//
private static final String registry = "zookeeper://192.168.0.36:2181";
//
private static final String group = "local";
// @SuppressWarnings("unchecked")
// public static void parseMicroTouTiao(String user_id, Date endDate){
// int count = 1;
public static void main(String[] args) throws Exception {
// boolean f = true;
// String max_behot_time = null;
// while(f)
Mongo mongo = new Mongo("192.168.0.81", 27017);
// {
DB db = mongo.getDB("toutiao");
// if(count==3){
DBCollection coll = db.getCollection("aaaa");
// f = false;
// }
ProxyFactory.init(registry, group, GroupType.PROVIDER);
// for(int i=0; i<3; i++){
// try {
long a = System.currentTimeMillis();
// Map<String, Object> dataMap = TouTiaoArticleParse.getMicroTouTiaoCrawler(user_id, endDate, null, max_behot_time);
String user_id = "3527019566";
// List<TouTiaoArticle> ttlist = null;
Date date = TimeParse.stringFormartDate("2019-01-01 00:00:00");
// if(dataMap!=null && !dataMap.isEmpty())
parseMicroTouTiao(user_id, date, coll);
// {
long b = System.currentTimeMillis();
// ttlist = (List<TouTiaoArticle>) dataMap.get("data");
System.out.println("一轮的采集时间为:" + (b - a) / 1000);
// max_behot_time = dataMap.get("max_behot_time")!=null?dataMap.get("max_behot_time").toString():null;
// if (ttlist!=null && ttlist.size() > 0)
mongo.close();
// {
}
// for(TouTiaoArticle touTiaoArticle : ttlist){
// System.out.println(TimeParse.dateFormartString(touTiaoArticle.getTime(), "yyyy-MM-dd HH:mm:ss"));
// }
@SuppressWarnings("unchecked")
// }
public static void parseMicroTouTiao(String user_id, Date endDate,DBCollection coll){
// count++;
// break;
int count = 1;
// }else{
boolean f = true;
// continue;
String max_behot_time = null;
// }
while(f)
// } catch (Exception e) {
{
// e.printStackTrace();
if(count==3){
// continue;
f = false;
// }
}
// }
for(int i=0; i<3; i++){
// ZhiWeiTools.sleep(7000);
try {
// }
Map<String, Object> dataMap = TouTiaoArticleParse.getMicroTouTiaoCrawler(user_id, endDate, null, max_behot_time);
// }
List<TouTiaoArticle> ttlist = null;
//
if(dataMap!=null && !dataMap.isEmpty())
//
{
//
ttlist = (List<TouTiaoArticle>) dataMap.get("data");
//}
max_behot_time = dataMap.get("max_behot_time")!=null?dataMap.get("max_behot_time").toString():null;
if (ttlist!=null && ttlist.size() > 0)
{
System.out.println(max_behot_time+"===="+ttlist.size());
for(TouTiaoArticle touTiaoArticle : ttlist){
Map map = JSONObject.toJavaObject((JSON)JSONObject.toJSON(touTiaoArticle), Map.class);
DBObject document = new BasicDBObject(map);
coll.save(document);
}
}
break;
}else{
continue;
}
} catch (Exception e) {
e.printStackTrace();
continue;
}
}
}
}
}
*/
\ No newline at end of file
src/test/java/com/zhiwei/toutiao/test/TouTiaoQuestionAnswerExample.java
View file @
643adf87
...
@@ -5,11 +5,12 @@
...
@@ -5,11 +5,12 @@
//import java.util.List;
//import java.util.List;
//import java.util.Map;
//import java.util.Map;
//
//
//import com.zhiwei.proxyip.util.Tools;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.tools.timeparse.TimeParse;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//import com.zhiwei.toutiao.bean.TouTiaoQuestionAnswer;
//import com.zhiwei.toutiao.bean.TouTiaoQuestionAnswer;
//import com.zhiwei.toutiao.parse.TouTiaoQuestionAnswerParse;
//import com.zhiwei.toutiao.parse.TouTiaoQuestionAnswerParse;
//import com.zhiwei.zhiweiTools.excel.PoiExcelUtil;
//import com.zhiwei.toutiao.util.Tools;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse;
//
//
///**
///**
// * @ClassName: TouTiaoQuestionAnswerExample
// * @ClassName: TouTiaoQuestionAnswerExample
...
@@ -79,7 +80,6 @@
...
@@ -79,7 +80,6 @@
// nextPage = 1;
// nextPage = 1;
// }
// }
// System.out.println(page+"=========="+nextPage+"============"+req_type);
// System.out.println(page+"=========="+nextPage+"============"+req_type);
// Tools.sleep(8000);
// }
// }
// }
// }
//
//
...
...
src/test/java/com/zhiwei/toutiao/test/TouTiaoQuestionExample.java
View file @
643adf87
//package com.zhiwei.toutiao.test;
package
com
.
zhiwei
.
toutiao
.
test
;
//
//import java.util.List;
import
java.util.List
;
//
import
java.util.Map
;
//import org.junit.Test;
//
import
org.junit.jupiter.api.Test
;
//import com.zhiwei.toutiao.bean.TouTiaoQuestion;
//import com.zhiwei.toutiao.parse.TouTiaoQuestionParse;
import
com.alibaba.fastjson.JSONObject
;
//import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import
com.mongodb.BasicDBObject
;
//
import
com.mongodb.DB
;
///**
import
com.mongodb.DBCollection
;
// * @ClassName: TouTiaoQuestionExample
import
com.mongodb.DBObject
;
// * @Description: TODO(头条问答采集测试类)
import
com.mongodb.Mongo
;
// * @author hero
import
com.zhiwei.common.config.GroupType
;
// * @date 2017年7月20日 下午3:06:51
import
com.zhiwei.crawler.proxy.ProxyFactory
;
// */
import
com.zhiwei.excelpoi.bean.ExcelResult
;
//public class TouTiaoQuestionExample {
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import
com.zhiwei.toutiao.bean.TouTiaoQuestion
;
//
import
com.zhiwei.toutiao.parse.TouTiaoQuestionParse
;
//
// @Test
/**
// public void touTiaoQuestionTest(){
* @ClassName: TouTiaoQuestionExample
// String word = "京东";
* @Description: TODO(头条问答采集测试类)
//
* @author hero
// String url = "https://www.wukong.com/wenda/web/search/question/brow/?search_text="+
* @date 2017年7月20日 下午3:06:51
// URLCodeUtil.getURLEncode(word, "UTF-8")+"&count=15";
*/
//
public
class
TouTiaoQuestionExample
{
// List<TouTiaoQuestion> list = TouTiaoQuestionParse.getSearchTouTiaoQuestion(url);
// System.out.println(list.size());
private
static
final
String
registry
=
"zookeeper://192.168.0.36:2181"
;
// for(TouTiaoQuestion question : list){
private
static
final
String
group
=
"local"
;
// System.out.println(question);
// }
public
static
void
main
(
String
[]
args
)
{
// }
ProxyFactory
.
init
(
registry
,
group
,
GroupType
.
PROVIDER
);
//
//}
Mongo
mongo
=
new
Mongo
(
"192.168.0.81"
,
27017
);
DB
db
=
mongo
.
getDB
(
"wukong"
);
DBCollection
coll
=
db
.
getCollection
(
"wukong"
);
touTiaoQuestionTest
(
coll
);
}
public
static
void
touTiaoQuestionTest
(
DBCollection
coll
)
{
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
ExcelResult
excelResult
=
poi
.
importExcelResult
(
"C:\\Users\\qq859\\Desktop\\悟空问答关键词.xlsx"
,
0
);
List
<
Map
<
String
,
Object
>>
dataList
=
excelResult
.
getBodyList
();
for
(
Map
<
String
,
Object
>
data
:
dataList
)
{
String
word
=
data
.
get
(
"关键词"
).
toString
();
System
.
out
.
println
(
"word================"
+
word
);
List
<
TouTiaoQuestion
>
list
=
TouTiaoQuestionParse
.
getSearchTouTiaoQuestion
(
word
);
System
.
out
.
println
(
list
.
size
());
for
(
TouTiaoQuestion
question
:
list
){
String
jsonStr
=
JSONObject
.
toJSONString
(
question
);
Map
dataMap
=
JSONObject
.
toJavaObject
(
JSONObject
.
parseObject
(
jsonStr
),
Map
.
class
);
dataMap
.
put
(
"word"
,
word
);
coll
.
save
(
new
BasicDBObject
(
dataMap
));
}
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment