Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
toutiao
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
toutiao
Commits
643adf87
Commit
643adf87
authored
Aug 21, 2019
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修复头条获取关注列表不翻页的情况
parent
e43ea617
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
334 additions
and
255 deletions
+334
-255
pom.xml
+1
-1
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
+7
-1
src/main/java/com/zhiwei/toutiao/parse/TouTiaoQuestionParse.java
+38
-28
src/test/java/com/zhiwei/toutiao/test/TouTiaoAccountExample.java
+44
-33
src/test/java/com/zhiwei/toutiao/test/TouTiaoExample.java
+77
-77
src/test/java/com/zhiwei/toutiao/test/TouTiaoMicroExample.java
+102
-76
src/test/java/com/zhiwei/toutiao/test/TouTiaoQuestionAnswerExample.java
+4
-4
src/test/java/com/zhiwei/toutiao/test/TouTiaoQuestionExample.java
+61
-35
No files found.
pom.xml
View file @
643adf87
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
toutiao
</artifactId>
<version>
0.
3.9
-SNAPSHOT
</version>
<version>
0.
4.0
-SNAPSHOT
</version>
<dependencies>
<dependency>
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoAccountParse.java
View file @
643adf87
...
...
@@ -178,11 +178,13 @@ public class TouTiaoAccountParse {
Signature
signature
=
new
Signature
(
userId
,
"0"
);
String
signatureStr
=
signature
.
getSignature
();
boolean
more
=
true
;
long
cursor
=
0
;
while
(
more
){
String
url
=
"https://www.toutiao.com/c/user/following/?user_id="
+
userId
+
"&cursor=
0
&count=100&_signature="
+
signatureStr
;
String
url
=
"https://www.toutiao.com/c/user/following/?user_id="
+
userId
+
"&cursor=
"
+
cursor
+
"
&count=100&_signature="
+
signatureStr
;
headerMap
=
Tools
.
getTouTiaoHeader
();
headerMap
.
put
(
"referer"
,
"ihttps://www.toutiao.com/c/user/relation/"
+
userId
+
"/?tab=following"
);
headerMap
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
);
logger
.
info
(
"链接地址为:{}"
,
url
);
for
(
int
i
=
0
;
i
<
3
;
i
++){
try
{
String
htmlBody
=
null
;
...
...
@@ -190,14 +192,18 @@ public class TouTiaoAccountParse {
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"name"
)){
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
more
=
json
.
getBooleanValue
(
"has_more"
);
cursor
=
json
.
getLongValue
(
"cursor"
);
List
<
TouTiaoAccount
>
dataList
=
parseFans
(
json
);
logger
.
info
(
cursor
+
"=========="
+
dataList
.
size
());
if
(
dataList
!=
null
&&
!
dataList
.
isEmpty
()){
ttaList
.
addAll
(
dataList
);
break
;
}
else
{
more
=
false
;
}
}
else
{
more
=
false
;
logger
.
info
(
"数据结构错误,请检查链接:{},页面信息为:{}"
,
url
,
htmlBody
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取今日头条帐号数据连接超时"
,
e
.
fillInStackTrace
());
...
...
src/main/java/com/zhiwei/toutiao/parse/TouTiaoQuestionParse.java
View file @
643adf87
package
com
.
zhiwei
.
toutiao
.
parse
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
...
...
@@ -14,6 +16,7 @@ import com.zhiwei.crawler.core.HttpBoot;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.toutiao.bean.TouTiaoQuestion
;
import
com.zhiwei.toutiao.util.Tools
;
...
...
@@ -25,7 +28,6 @@ import com.zhiwei.toutiao.util.Tools;
*/
public
class
TouTiaoQuestionParse
{
private
static
Map
<
String
,
String
>
headerMap
;
private
static
Logger
logger
=
LogManager
.
getLogger
(
TouTiaoQuestionParse
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
...
...
@@ -41,21 +43,32 @@ public class TouTiaoQuestionParse {
* @return List<TouTiaoQuestion> 返回类型
* @throws Exception
*/
public
static
List
<
TouTiaoQuestion
>
getSearchTouTiaoQuestion
(
String
url
)
throws
Exception
{
List
<
TouTiaoQuestion
>
questtionList
=
new
ArrayList
<
TouTiaoQuestion
>();
headerMap
=
Tools
.
getTouTiaoQuestionHeader
();
headerMap
.
put
(
"referer"
,
url
);
String
htmlBody
=
null
;
try
{
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
if
(
htmlBody
!=
null
)
{
List
<
TouTiaoQuestion
>
ttList
=
parseHtmlByQuestion
(
htmlBody
);
if
(
ttList
!=
null
&&
ttList
.
size
()
>
0
)
{
return
ttList
;
}
}
}
catch
(
Exception
e
)
{
throw
e
;
/**
 * Searches the Wukong Q&amp;A endpoint for a keyword and collects the questions of every
 * result page.
 *
 * <p>Pages are requested ten results at a time ({@code offset = page * 10}). Paging stops
 * when the response no longer contains {@code "question"}, when its {@code data} object is
 * missing or reports {@code has_more == false}, or when a request fails with an
 * {@link IOException}.
 *
 * @param word search keyword; it is passed through {@code URLCodeUtil.getURLEncode(word, "UTF-8")}
 *             before being placed in the query string
 * @return the questions gathered from all pages fetched before paging stopped; never
 *         {@code null}, possibly empty
 */
public static List<TouTiaoQuestion> getSearchTouTiaoQuestion(String word) {
    List<TouTiaoQuestion> questtionList = new ArrayList<>();
    // The keyword is the same for every page, so encode it once instead of per request.
    String encodedWord = URLCodeUtil.getURLEncode(word, "UTF-8");
    boolean more = true;
    int page = 0;
    while (more) {
        String url = "https://www.wukong.com/wenda/web/search/question/brow/?search_text="
                + encodedWord + "&count=10&offset=" + page * 10;
        Map<String, String> headerMap = Tools.getTouTiaoQuestionHeader();
        headerMap.put("referer", url);
        try {
            String htmlBody = httpBoot
                    .syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY)
                    .body()
                    .string();
            if (Objects.isNull(htmlBody) || !htmlBody.contains("question")) {
                // Not a result page (empty body, error page, ...): nothing more to read.
                break;
            }
            List<TouTiaoQuestion> ttList = parseHtmlByQuestion(htmlBody);
            if (Objects.nonNull(ttList) && !ttList.isEmpty()) {
                questtionList.addAll(ttList);
            }
            JSONObject dataJSON = JSONObject.parseObject(htmlBody).getJSONObject("data");
            // A response without a "data" object cannot tell us about further pages; stop
            // instead of failing with a NullPointerException.
            more = Objects.nonNull(dataJSON) && dataJSON.getBooleanValue("has_more");
            page++;
        } catch (IOException e) {
            // Previously the loop state was left untouched here, so a persistent network
            // failure re-requested the same page forever. httpBoot is already built with
            // retryTimes(3), so give up on paging and return what has been collected.
            logger.error("头条问答解析数据出现问题", e);
            more = false;
        }
    }
    return questtionList;
}
...
...
@@ -71,15 +84,13 @@ public class TouTiaoQuestionParse {
* @return List<TouTiaoQuestion> 返回类型
*/
private
static
List
<
TouTiaoQuestion
>
parseHtmlByQuestion
(
String
htmlBody
)
{
List
<
TouTiaoQuestion
>
questtionList
=
new
ArrayList
<
TouTiaoQuestion
>();
List
<
TouTiaoQuestion
>
questtionList
=
new
ArrayList
<>();
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
String
err
_t
ips
=
jsonObject
.
getString
(
"err_tips"
);
if
(
err
_t
ips
.
equals
(
"success"
))
{
String
err
T
ips
=
jsonObject
.
getString
(
"err_tips"
);
if
(
err
T
ips
.
equals
(
"success"
))
{
JSONObject
json
=
jsonObject
.
getJSONObject
(
"data"
);
JSONArray
jsonArray
=
json
.
getJSONArray
(
"feed_question"
);
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
try
{
JSONObject
question
=
jsonArray
.
getJSONObject
(
i
).
getJSONObject
(
"question"
);
...
...
@@ -88,18 +99,17 @@ public class TouTiaoQuestionParse {
String
url
=
"http://www.toutiao.com/a"
+
question
.
getString
(
"qid"
)
+
"/"
;
Date
time
=
TimeParse
.
stringFormartDate
(
question
.
getLong
(
"create_time"
)
*
1000L
+
""
);
String
source
=
question
.
getJSONObject
(
"user"
).
getString
(
"uname"
);
int
follow
_c
ount
=
question
.
getIntValue
(
"follow_count"
);
int
nice
_ans_c
ount
=
question
.
getIntValue
(
"nice_ans_count"
);
int
normal
_ans_c
ount
=
question
.
getIntValue
(
"normal_ans_count"
);
int
ans
_count
=
nice_ans_count
+
normal_ans_c
ount
;
int
follow
C
ount
=
question
.
getIntValue
(
"follow_count"
);
int
nice
AnsC
ount
=
question
.
getIntValue
(
"nice_ans_count"
);
int
normal
AnsC
ount
=
question
.
getIntValue
(
"normal_ans_count"
);
int
ans
Count
=
niceAnsCount
+
normalAnsC
ount
;
TouTiaoQuestion
touTiaoQuestion
=
new
TouTiaoQuestion
(
url
,
title
,
source
,
content
,
time
,
follow
_count
,
nice_ans_count
,
normal_ans_count
,
ans_c
ount
);
follow
Count
,
niceAnsCount
,
normalAnsCount
,
ansC
ount
);
questtionList
.
add
(
touTiaoQuestion
);
}
catch
(
Exception
e
)
{
logger
.
info
(
"头条问答解析数据出现问题"
,
e
.
fillInStackTrace
()
);
logger
.
info
(
"头条问答解析数据出现问题"
,
e
);
continue
;
}
}
...
...
src/test/java/com/zhiwei/toutiao/test/TouTiaoAccountExample.java
View file @
643adf87
//package com.zhiwei.toutiao.test;
//
//import java.util.List;
//
//import org.junit.Test;
//
//import com.zhiwei.toutiao.bean.TouTiaoAccount;
//import com.zhiwei.toutiao.parse.TouTiaoAccountParse;
//
///**
// * @ClassName: TouTiaoAccountExample
// * @Description: TODO(今日头条帐号采集)
// * @author hero
// * @date 2017年10月17日 下午4:03:44
// */
//public class TouTiaoAccountExample {
//
// public void touTiaoAccountTest(){
// String word = "华尔街瞭望";
// System.out.println("===================="+TouTiaoAccountParse.getTouTiaoAccountInfoByName(word, null));
// }
//
//
//
// @Test
// public void touTiaoAccountFriendTest(){
// String userid = "3350881978";
// List<TouTiaoAccount> userList = TouTiaoAccountParse.getFriendsList(userid, null,1000);
// for(TouTiaoAccount tta : userList){
// System.out.println(tta);
// }
// }
//}
package
com
.
zhiwei
.
toutiao
.
test
;
import
java.util.List
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.toutiao.bean.TouTiaoAccount
;
import
com.zhiwei.toutiao.parse.TouTiaoAccountParse
;
/**
 * Manual, runnable example for the Toutiao account crawler.
 *
 * <p>{@link #main(String[])} initialises the proxy factory and then prints every account
 * returned by {@code TouTiaoAccountParse.getFriendsList} for a hard-coded user id.
 *
 * @author hero
 * @date 2017-10-17 16:03:44
 */
public class TouTiaoAccountExample {

    /** Registry address handed to {@code ProxyFactory.init}. */
    private static final String REGISTRY = "zookeeper://192.168.0.36:2181";

    /** Group name handed to {@code ProxyFactory.init}. */
    private static final String GROUP = "local";

    /**
     * Entry point: sets up the proxy factory, then runs the following-list example.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ProxyFactory.init(REGISTRY, GROUP, GroupType.PROVIDER);
        touTiaoAccountFriendTest();
    }

    /**
     * Looks up a hard-coded account name and prints the result.
     * Not invoked from {@link #main(String[])}.
     */
    public void touTiaoAccountTest() {
        String accountName = "华尔街瞭望";
        System.out.println(
                "====================" + TouTiaoAccountParse.getTouTiaoAccountInfoByName(accountName, null));
    }

    /**
     * Fetches the list returned by {@code getFriendsList} for a hard-coded user id through
     * {@code ProxyHolder.NAT_HEAVY_PROXY} and prints each entry on its own line.
     */
    public static void touTiaoAccountFriendTest() {
        List<TouTiaoAccount> followed =
                TouTiaoAccountParse.getFriendsList("3478445819704347", ProxyHolder.NAT_HEAVY_PROXY);
        followed.forEach(account -> System.out.println(account));
    }
}
src/test/java/com/zhiwei/toutiao/test/TouTiaoExample.java
View file @
643adf87
/**
* @Title: TouTiaoExample.java
* @Package com.zhiwei.toutiao.test
* @Description:
* @author hero
* @date 2016年9月2日 上午11:48:51
* @version V1.0
*/
/**
*
*/
package
com
.
zhiwei
.
toutiao
.
test
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.toutiao.bean.TouTiaoArticle
;
import
com.zhiwei.toutiao.parse.TouTiaoArticleParse
;
/**
 * Manual, runnable example that prints what {@code TouTiaoArticleParse.getContent} returns
 * for one hard-coded article URL.
 *
 * @author hero
 * @date 2016-09-02 11:48:51
 */
public class TouTiaoExample {

    /** Registry address handed to {@code ProxyFactory.init}. */
    private static final String REGISTRY = "zookeeper://192.168.0.36:2181";

    /** Group name handed to {@code ProxyFactory.init}. */
    private static final String GROUP = "local";

    /**
     * Entry point: initialises the proxy factory, requests the article content and prints it.
     *
     * @param args unused
     * @throws Exception declared by the original signature; propagated from the calls below
     */
    @SuppressWarnings("unchecked")
    public static void main(String[] args) throws Exception {
        ProxyFactory.init(REGISTRY, GROUP, GroupType.PROVIDER);

        String articleUrl = "https://www.toutiao.com/a6659244827009352196/";
        String articleBody = TouTiaoArticleParse.getContent(articleUrl, null);
        System.out.println(articleBody);

        // A disabled sample used to follow here: for each account id in a list it called
        // TouTiaoArticleParse.getTouTiaoHistory(mid, max_behot_time, endTime, ProxyHolder.NAT_PROXY)
        // in a loop, feeding the returned "max_behot_time" back in as the next cursor and
        // stopping once the result map or its "data" list was empty, then printed the elapsed
        // seconds and the number of collected articles. The @SuppressWarnings("unchecked")
        // above belonged to the cast of that "data" list.
    }
}
//import java.util.ArrayList;
//import java.util.Date;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.tools.timeparse.TimeParse;
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
//
///**
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// */
//public class TouTiaoExample {
//
// private static final String registry = "zookeeper://192.168.0.36:2181";
// private static final String group = "local";
//
// @SuppressWarnings("unchecked")
// public static void main(String[] args) throws Exception {
//
// ProxyFactory.init(registry, group, GroupType.PROVIDER);
// String url = "https://www.toutiao.com/a6659244827009352196/";
// String content = TouTiaoArticleParse.getContent(url, null);
// System.out.println(content);
//
//// List<String> urlList = new ArrayList<String>();
//// urlList.add("1920576965");
//// Date endTime = TimeParse.stringFormartDate("2018-10-01");
////
//// for (String url : urlList) {
//// long a = System.currentTimeMillis();
//// String mid = url;
//// Long max_behot_time = 0L;
//// List<TouTiaoArticle> list = new ArrayList<>();
//// boolean f = true;
//// while (f) {
//// Map<String, Object> dataMap = null;
//// dataMap = TouTiaoArticleParse.getTouTiaoHistory(mid, max_behot_time+"", endTime, ProxyHolder.NAT_PROXY);
//// if (dataMap != null && !dataMap.isEmpty()) {
//// List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
//// max_behot_time = (Long)dataMap.get("max_behot_time");
//// System.out.println(max_behot_time + "=======" + ttlist.size());
//// if (null == max_behot_time || ttlist.isEmpty()) {
//// f = false;
//// } else {
//// if (ttlist.size() > 0) {
//// list.addAll(ttlist);
//// }
//// }
//// }else{
//// f = false;
//// }
//// }
//// long b = System.currentTimeMillis();
//// System.out.println("一轮的采集时间为:" + (b - a) / 1000+" 数据量为" + list.size());
//// }
//
// }
//
//}
src/test/java/com/zhiwei/toutiao/test/TouTiaoMicroExample.java
View file @
643adf87
///**
// * @Title: TouTiaoExample.java
// * @Package com.zhiwei.toutiao.test
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// * @version V1.0
// */
//package com.zhiwei.toutiao.test;
//import java.util.Date;
//import java.util.List;
//import java.util.Map;
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse;
//import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
//
///**
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// */
//public class TouTiaoMicroExample {
//
// public static void main(String[] args) throws Exception {
// long a = System.currentTimeMillis();
// String user_id = "55301399445";
// Date date = new Date((new Date().getTime()-24*60*60*1000));
// parseMicroTouTiao(user_id, date);
// long b = System.currentTimeMillis();
// System.out.println("一轮的采集时间为:" + (b - a) / 1000);
//
// }
//
//
// @SuppressWarnings("unchecked")
// public static void parseMicroTouTiao(String user_id, Date endDate){
// int count = 1;
// boolean f = true;
// String max_behot_time = null;
// while(f)
// {
// if(count==3){
// f = false;
// }
// for(int i=0; i<3; i++){
// try {
// Map<String, Object> dataMap = TouTiaoArticleParse.getMicroTouTiaoCrawler(user_id, endDate, null, max_behot_time);
// List<TouTiaoArticle> ttlist = null;
// if(dataMap!=null && !dataMap.isEmpty())
// {
// ttlist = (List<TouTiaoArticle>) dataMap.get("data");
// max_behot_time = dataMap.get("max_behot_time")!=null?dataMap.get("max_behot_time").toString():null;
// if (ttlist!=null && ttlist.size() > 0)
// {
// for(TouTiaoArticle touTiaoArticle : ttlist){
// System.out.println(TimeParse.dateFormartString(touTiaoArticle.getTime(), "yyyy-MM-dd HH:mm:ss"));
// }
// }
// count++;
// break;
// }else{
// continue;
// }
// } catch (Exception e) {
// e.printStackTrace();
// continue;
// }
// }
// ZhiWeiTools.sleep(7000);
// }
// }
//
//
//
//}
/**
* @Title: TouTiaoExample.java
* @Package com.zhiwei.toutiao.test
* @Description:
* @author hero
* @date 2016年9月2日 上午11:48:51
* @version V1.0
*//*
package com.zhiwei.toutiao.test;
import java.util.Date;
import java.util.List;
import java.util.Map;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.Mongo;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
*//**
* @Description:
* @author hero
* @date 2016年9月2日 上午11:48:51
*//*
public class TouTiaoMicroExample {
private static final String registry = "zookeeper://192.168.0.36:2181";
private static final String group = "local";
public static void main(String[] args) throws Exception {
Mongo mongo = new Mongo("192.168.0.81", 27017);
DB db = mongo.getDB("toutiao");
DBCollection coll = db.getCollection("aaaa");
ProxyFactory.init(registry, group, GroupType.PROVIDER);
long a = System.currentTimeMillis();
String user_id = "3527019566";
Date date = TimeParse.stringFormartDate("2019-01-01 00:00:00");
parseMicroTouTiao(user_id, date, coll);
long b = System.currentTimeMillis();
System.out.println("一轮的采集时间为:" + (b - a) / 1000);
mongo.close();
}
@SuppressWarnings("unchecked")
public static void parseMicroTouTiao(String user_id, Date endDate,DBCollection coll){
int count = 1;
boolean f = true;
String max_behot_time = null;
while(f)
{
if(count==3){
f = false;
}
for(int i=0; i<3; i++){
try {
Map<String, Object> dataMap = TouTiaoArticleParse.getMicroTouTiaoCrawler(user_id, endDate, null, max_behot_time);
List<TouTiaoArticle> ttlist = null;
if(dataMap!=null && !dataMap.isEmpty())
{
ttlist = (List<TouTiaoArticle>) dataMap.get("data");
max_behot_time = dataMap.get("max_behot_time")!=null?dataMap.get("max_behot_time").toString():null;
if (ttlist!=null && ttlist.size() > 0)
{
System.out.println(max_behot_time+"===="+ttlist.size());
for(TouTiaoArticle touTiaoArticle : ttlist){
Map map = JSONObject.toJavaObject((JSON)JSONObject.toJSON(touTiaoArticle), Map.class);
DBObject document = new BasicDBObject(map);
coll.save(document);
}
}
break;
}else{
continue;
}
} catch (Exception e) {
e.printStackTrace();
continue;
}
}
}
}
}
*/
\ No newline at end of file
src/test/java/com/zhiwei/toutiao/test/TouTiaoQuestionAnswerExample.java
View file @
643adf87
...
...
@@ -5,11 +5,12 @@
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.proxyip.util.Tools;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.tools.timeparse.TimeParse;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//import com.zhiwei.toutiao.bean.TouTiaoQuestionAnswer;
//import com.zhiwei.toutiao.parse.TouTiaoQuestionAnswerParse;
//import com.zhiwei.zhiweiTools.excel.PoiExcelUtil;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse;
//import com.zhiwei.toutiao.util.Tools;
//
///**
// * @ClassName: TouTiaoQuestionAnswerExample
...
...
@@ -79,7 +80,6 @@
// nextPage = 1;
// }
// System.out.println(page+"=========="+nextPage+"============"+req_type);
// Tools.sleep(8000);
// }
// }
//
...
...
src/test/java/com/zhiwei/toutiao/test/TouTiaoQuestionExample.java
View file @
643adf87
//package com.zhiwei.toutiao.test;
//
//import java.util.List;
//
//import org.junit.Test;
//
//import com.zhiwei.toutiao.bean.TouTiaoQuestion;
//import com.zhiwei.toutiao.parse.TouTiaoQuestionParse;
//import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
//
///**
// * @ClassName: TouTiaoQuestionExample
// * @Description: TODO(头条问答采集测试类)
// * @author hero
// * @date 2017年7月20日 下午3:06:51
// */
//public class TouTiaoQuestionExample {
//
//
//
// @Test
// public void touTiaoQuestionTest(){
// String word = "京东";
//
// String url = "https://www.wukong.com/wenda/web/search/question/brow/?search_text="+
// URLCodeUtil.getURLEncode(word, "UTF-8")+"&count=15";
//
// List<TouTiaoQuestion> list = TouTiaoQuestionParse.getSearchTouTiaoQuestion(url);
// System.out.println(list.size());
// for(TouTiaoQuestion question : list){
// System.out.println(question);
// }
// }
//
//}
package
com
.
zhiwei
.
toutiao
.
test
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.jupiter.api.Test
;
import
com.alibaba.fastjson.JSONObject
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DB
;
import
com.mongodb.DBCollection
;
import
com.mongodb.DBObject
;
import
com.mongodb.Mongo
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.bean.ExcelResult
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.toutiao.bean.TouTiaoQuestion
;
import
com.zhiwei.toutiao.parse.TouTiaoQuestionParse
;
/**
 * Manual, runnable example for the Toutiao (Wukong) Q&amp;A search crawler.
 *
 * <p>Reads keywords from an Excel sheet, runs
 * {@code TouTiaoQuestionParse.getSearchTouTiaoQuestion} for each keyword and saves every
 * returned question, tagged with its keyword, into a MongoDB collection.
 *
 * @author hero
 * @date 2017-07-20 15:06:51
 */
public class TouTiaoQuestionExample {

    /** Registry address handed to {@code ProxyFactory.init}. */
    private static final String registry = "zookeeper://192.168.0.36:2181";

    /** Group name handed to {@code ProxyFactory.init}. */
    private static final String group = "local";

    /**
     * Entry point: initialises the proxy factory, opens the MongoDB connection, runs the
     * example and always closes the connection afterwards.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ProxyFactory.init(registry, group, GroupType.PROVIDER);
        Mongo mongo = new Mongo("192.168.0.81", 27017);
        try {
            DB db = mongo.getDB("wukong");
            DBCollection coll = db.getCollection("wukong");
            touTiaoQuestionTest(coll);
        } finally {
            // The client was previously never closed; release it even when the run fails.
            mongo.close();
        }
    }

    /**
     * Imports the keyword sheet, searches each keyword and stores the results.
     *
     * <p>Each question is serialised to JSON, converted to a map, given an extra
     * {@code "word"} entry holding the keyword, and saved into {@code coll}.
     *
     * @param coll target collection for the question documents
     */
    public static void touTiaoQuestionTest(DBCollection coll) {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        ExcelResult excelResult = poi.importExcelResult("C:\\Users\\qq859\\Desktop\\悟空问答关键词.xlsx", 0);
        List<Map<String, Object>> dataList = excelResult.getBodyList();
        for (Map<String, Object> data : dataList) {
            Object keyword = data.get("关键词");
            if (keyword == null) {
                // A row without a keyword cell has nothing to search for; skipping it
                // avoids the NullPointerException the unconditional toString() caused.
                continue;
            }
            String word = keyword.toString();
            System.out.println("word================" + word);
            List<TouTiaoQuestion> list = TouTiaoQuestionParse.getSearchTouTiaoQuestion(word);
            System.out.println(list.size());
            for (TouTiaoQuestion question : list) {
                String jsonStr = JSONObject.toJSONString(question);
                // toJavaObject(..., Map.class) yields a raw Map; the source is a JSON object,
                // so its keys are strings and the narrowing below is safe.
                @SuppressWarnings("unchecked")
                Map<String, Object> dataMap =
                        JSONObject.toJavaObject(JSONObject.parseObject(jsonStr), Map.class);
                dataMap.put("word", word);
                coll.save(new BasicDBObject(dataMap));
            }
        }
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment