Commit 643adf87 by zhiwei

修复头条获取关注列表不翻页的情况

parent e43ea617
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>toutiao</artifactId> <artifactId>toutiao</artifactId>
<version>0.3.9-SNAPSHOT</version> <version>0.4.0-SNAPSHOT</version>
<dependencies> <dependencies>
<dependency> <dependency>
......
...@@ -178,11 +178,13 @@ public class TouTiaoAccountParse { ...@@ -178,11 +178,13 @@ public class TouTiaoAccountParse {
Signature signature = new Signature(userId, "0"); Signature signature = new Signature(userId, "0");
String signatureStr = signature.getSignature(); String signatureStr = signature.getSignature();
boolean more = true; boolean more = true;
long cursor = 0;
while(more){ while(more){
String url = "https://www.toutiao.com/c/user/following/?user_id="+ userId +"&cursor=0&count=100&_signature="+ signatureStr; String url = "https://www.toutiao.com/c/user/following/?user_id="+ userId +"&cursor="+ cursor +"&count=100&_signature="+ signatureStr;
headerMap = Tools.getTouTiaoHeader(); headerMap = Tools.getTouTiaoHeader();
headerMap.put("referer", "ihttps://www.toutiao.com/c/user/relation/"+ userId +"/?tab=following"); headerMap.put("referer", "ihttps://www.toutiao.com/c/user/relation/"+ userId +"/?tab=following");
headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"); headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
logger.info("链接地址为:{}", url);
for(int i=0;i<3;i++){ for(int i=0;i<3;i++){
try { try {
String htmlBody = null; String htmlBody = null;
...@@ -190,14 +192,18 @@ public class TouTiaoAccountParse { ...@@ -190,14 +192,18 @@ public class TouTiaoAccountParse {
if(htmlBody != null && htmlBody.contains("name")){ if(htmlBody != null && htmlBody.contains("name")){
JSONObject json = JSONObject.parseObject(htmlBody); JSONObject json = JSONObject.parseObject(htmlBody);
more = json.getBooleanValue("has_more"); more = json.getBooleanValue("has_more");
cursor = json.getLongValue("cursor");
List<TouTiaoAccount> dataList = parseFans(json); List<TouTiaoAccount> dataList = parseFans(json);
logger.info(cursor+"=========="+dataList.size());
if(dataList!=null && !dataList.isEmpty()){ if(dataList!=null && !dataList.isEmpty()){
ttaList.addAll(dataList); ttaList.addAll(dataList);
break;
}else{ }else{
more = false; more = false;
} }
}else{ }else{
more = false; more = false;
logger.info("数据结构错误,请检查链接:{},页面信息为:{}", url, htmlBody);
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace()); logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
......
package com.zhiwei.toutiao.parse; package com.zhiwei.toutiao.parse;
import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
...@@ -14,6 +16,7 @@ import com.zhiwei.crawler.core.HttpBoot; ...@@ -14,6 +16,7 @@ import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.toutiao.bean.TouTiaoQuestion; import com.zhiwei.toutiao.bean.TouTiaoQuestion;
import com.zhiwei.toutiao.util.Tools; import com.zhiwei.toutiao.util.Tools;
...@@ -25,7 +28,6 @@ import com.zhiwei.toutiao.util.Tools; ...@@ -25,7 +28,6 @@ import com.zhiwei.toutiao.util.Tools;
*/ */
public class TouTiaoQuestionParse { public class TouTiaoQuestionParse {
private static Map<String, String> headerMap;
private static Logger logger = LogManager.getLogger(TouTiaoQuestionParse.class); private static Logger logger = LogManager.getLogger(TouTiaoQuestionParse.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
...@@ -41,21 +43,32 @@ public class TouTiaoQuestionParse { ...@@ -41,21 +43,32 @@ public class TouTiaoQuestionParse {
* @return List<TouTiaoQuestion> 返回类型 * @return List<TouTiaoQuestion> 返回类型
* @throws Exception * @throws Exception
*/ */
public static List<TouTiaoQuestion> getSearchTouTiaoQuestion(String url) throws Exception { public static List<TouTiaoQuestion> getSearchTouTiaoQuestion(String word){
List<TouTiaoQuestion> questtionList = new ArrayList<TouTiaoQuestion>(); List<TouTiaoQuestion> questtionList = new ArrayList<>();
headerMap = Tools.getTouTiaoQuestionHeader();
headerMap.put("referer", url); boolean more = true;
String htmlBody = null; int page = 0;
try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),ProxyHolder.NAT_HEAVY_PROXY).body().string(); while(more) {
if (htmlBody != null) { String url = "https://www.wukong.com/wenda/web/search/question/brow/?search_text=" + URLCodeUtil.getURLEncode(word, "UTF-8")+"&count=10&offset=" +page*10;
List<TouTiaoQuestion> ttList = parseHtmlByQuestion(htmlBody); Map<String, String> headerMap = Tools.getTouTiaoQuestionHeader();
if (ttList != null && ttList.size() > 0) { headerMap.put("referer", url);
return ttList; try {
} String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),ProxyHolder.NAT_HEAVY_PROXY).body().string();
} if (Objects.nonNull(htmlBody) && htmlBody.contains("question")) {
} catch (Exception e) { List<TouTiaoQuestion> ttList = parseHtmlByQuestion(htmlBody);
throw e; if (Objects.nonNull(ttList) && !ttList.isEmpty()) {
questtionList.addAll(ttList);
}
JSONObject dataJSON = JSONObject.parseObject(htmlBody).getJSONObject("data");
more = dataJSON.getBooleanValue("has_more");
page++;
}else {
more = false;
}
} catch (IOException e) {
logger.info("头条问答解析数据出现问题", e);
}
} }
return questtionList; return questtionList;
} }
...@@ -71,15 +84,13 @@ public class TouTiaoQuestionParse { ...@@ -71,15 +84,13 @@ public class TouTiaoQuestionParse {
* @return List<TouTiaoQuestion> 返回类型 * @return List<TouTiaoQuestion> 返回类型
*/ */
private static List<TouTiaoQuestion> parseHtmlByQuestion(String htmlBody) { private static List<TouTiaoQuestion> parseHtmlByQuestion(String htmlBody) {
List<TouTiaoQuestion> questtionList = new ArrayList<TouTiaoQuestion>(); List<TouTiaoQuestion> questtionList = new ArrayList<>();
JSONObject jsonObject = JSONObject.parseObject(htmlBody); JSONObject jsonObject = JSONObject.parseObject(htmlBody);
String err_tips = jsonObject.getString("err_tips"); String errTips = jsonObject.getString("err_tips");
if (err_tips.equals("success")) { if (errTips.equals("success")) {
JSONObject json = jsonObject.getJSONObject("data"); JSONObject json = jsonObject.getJSONObject("data");
JSONArray jsonArray = json.getJSONArray("feed_question"); JSONArray jsonArray = json.getJSONArray("feed_question");
for (int i = 0; i < jsonArray.size(); i++) { for (int i = 0; i < jsonArray.size(); i++) {
try { try {
JSONObject question = jsonArray.getJSONObject(i).getJSONObject("question"); JSONObject question = jsonArray.getJSONObject(i).getJSONObject("question");
...@@ -88,18 +99,17 @@ public class TouTiaoQuestionParse { ...@@ -88,18 +99,17 @@ public class TouTiaoQuestionParse {
String url = "http://www.toutiao.com/a" + question.getString("qid") + "/"; String url = "http://www.toutiao.com/a" + question.getString("qid") + "/";
Date time = TimeParse.stringFormartDate(question.getLong("create_time") * 1000L + ""); Date time = TimeParse.stringFormartDate(question.getLong("create_time") * 1000L + "");
String source = question.getJSONObject("user").getString("uname"); String source = question.getJSONObject("user").getString("uname");
int follow_count = question.getIntValue("follow_count"); int followCount = question.getIntValue("follow_count");
int nice_ans_count = question.getIntValue("nice_ans_count"); int niceAnsCount = question.getIntValue("nice_ans_count");
int normal_ans_count = question.getIntValue("normal_ans_count"); int normalAnsCount = question.getIntValue("normal_ans_count");
int ans_count = nice_ans_count + normal_ans_count; int ansCount = niceAnsCount + normalAnsCount;
TouTiaoQuestion touTiaoQuestion = new TouTiaoQuestion(url, title, source, content, time, TouTiaoQuestion touTiaoQuestion = new TouTiaoQuestion(url, title, source, content, time,
follow_count, nice_ans_count, normal_ans_count, ans_count); followCount, niceAnsCount, normalAnsCount, ansCount);
questtionList.add(touTiaoQuestion); questtionList.add(touTiaoQuestion);
} catch (Exception e) { } catch (Exception e) {
logger.info("头条问答解析数据出现问题", e.fillInStackTrace()); logger.info("头条问答解析数据出现问题", e);
continue; continue;
} }
} }
......
//package com.zhiwei.toutiao.test; package com.zhiwei.toutiao.test;
//
//import java.util.List; import java.util.List;
//
//import org.junit.Test; import com.zhiwei.common.config.GroupType;
// import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.toutiao.bean.TouTiaoAccount; import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.toutiao.parse.TouTiaoAccountParse; import com.zhiwei.toutiao.bean.TouTiaoAccount;
// import com.zhiwei.toutiao.parse.TouTiaoAccountParse;
///**
// * @ClassName: TouTiaoAccountExample /**
// * @Description: TODO(今日头条帐号采集) * @ClassName: TouTiaoAccountExample
// * @author hero * @Description: TODO(今日头条帐号采集)
// * @date 2017年10月17日 下午4:03:44 * @author hero
// */ * @date 2017年10月17日 下午4:03:44
//public class TouTiaoAccountExample { */
// public class TouTiaoAccountExample {
// public void touTiaoAccountTest(){
// String word = "华尔街瞭望"; private static final String registry = "zookeeper://192.168.0.36:2181";
// System.out.println("===================="+TouTiaoAccountParse.getTouTiaoAccountInfoByName(word, null)); private static final String group = "local";
// }
//
// public static void main(String[] args) {
// ProxyFactory.init(registry, group, GroupType.PROVIDER);
// @Test touTiaoAccountFriendTest();
// public void touTiaoAccountFriendTest(){
// String userid = "3350881978"; }
// List<TouTiaoAccount> userList = TouTiaoAccountParse.getFriendsList(userid, null,1000);
// for(TouTiaoAccount tta : userList){ public void touTiaoAccountTest(){
// System.out.println(tta); String word = "华尔街瞭望";
// } System.out.println("===================="+TouTiaoAccountParse.getTouTiaoAccountInfoByName(word, null));
// } }
//}
public static void touTiaoAccountFriendTest(){
String userid = "3478445819704347";
List<TouTiaoAccount> userList = TouTiaoAccountParse.getFriendsList(userid, ProxyHolder.NAT_HEAVY_PROXY);
for(TouTiaoAccount tta : userList){
System.out.println(tta);
}
}
}
/** ///**
* @Title: TouTiaoExample.java // * @Title: TouTiaoExample.java
* @Package com.zhiwei.toutiao.test // * @Package com.zhiwei.toutiao.test
* @Description: // * @Description:
* @author hero // * @author hero
* @date 2016年9月2日 上午11:48:51 // * @date 2016年9月2日 上午11:48:51
* @version V1.0 // * @version V1.0
*/ // */
/** ///**
* //*
*/ //*/
package com.zhiwei.toutiao.test; //package com.zhiwei.toutiao.test;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
/**
* @Description:
* @author hero
* @date 2016年9月2日 上午11:48:51
*/
public class TouTiaoExample {
private static final String registry = "zookeeper://192.168.0.36:2181";
private static final String group = "local";
@SuppressWarnings("unchecked")
public static void main(String[] args) throws Exception {
ProxyFactory.init(registry, group, GroupType.PROVIDER);
String url = "https://www.toutiao.com/a6659244827009352196/";
String content = TouTiaoArticleParse.getContent(url, null);
System.out.println(content);
// List<String> urlList = new ArrayList<String>();
// urlList.add("1920576965");
// Date endTime = TimeParse.stringFormartDate("2018-10-01");
// //
// for (String url : urlList) { //import java.util.ArrayList;
// long a = System.currentTimeMillis(); //import java.util.Date;
// String mid = url; //import java.util.List;
// Long max_behot_time = 0L; //import java.util.Map;
// List<TouTiaoArticle> list = new ArrayList<>(); //
// boolean f = true; //import com.zhiwei.common.config.GroupType;
// while (f) { //import com.zhiwei.crawler.proxy.ProxyFactory;
// Map<String, Object> dataMap = null; //import com.zhiwei.crawler.proxy.ProxyHolder;
// dataMap = TouTiaoArticleParse.getTouTiaoHistory(mid, max_behot_time+"", endTime, ProxyHolder.NAT_PROXY); //import com.zhiwei.tools.timeparse.TimeParse;
// if (dataMap != null && !dataMap.isEmpty()) { //import com.zhiwei.toutiao.bean.TouTiaoArticle;
// List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data"); //import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
// max_behot_time = (Long)dataMap.get("max_behot_time"); //
// System.out.println(max_behot_time + "=======" + ttlist.size()); ///**
// if (null == max_behot_time || ttlist.isEmpty()) { // * @Description:
// f = false; // * @author hero
// } else { // * @date 2016年9月2日 上午11:48:51
// if (ttlist.size() > 0) { // */
// list.addAll(ttlist); //public class TouTiaoExample {
// } //
// } // private static final String registry = "zookeeper://192.168.0.36:2181";
// }else{ // private static final String group = "local";
// f = false; //
// } // @SuppressWarnings("unchecked")
// } // public static void main(String[] args) throws Exception {
// long b = System.currentTimeMillis(); //
// System.out.println("一轮的采集时间为:" + (b - a) / 1000+" 数据量为" + list.size()); // ProxyFactory.init(registry, group, GroupType.PROVIDER);
// } // String url = "https://www.toutiao.com/a6659244827009352196/";
// String content = TouTiaoArticleParse.getContent(url, null);
} // System.out.println(content);
//
} //// List<String> urlList = new ArrayList<String>();
//// urlList.add("1920576965");
//// Date endTime = TimeParse.stringFormartDate("2018-10-01");
////
//// for (String url : urlList) {
//// long a = System.currentTimeMillis();
//// String mid = url;
//// Long max_behot_time = 0L;
//// List<TouTiaoArticle> list = new ArrayList<>();
//// boolean f = true;
//// while (f) {
//// Map<String, Object> dataMap = null;
//// dataMap = TouTiaoArticleParse.getTouTiaoHistory(mid, max_behot_time+"", endTime, ProxyHolder.NAT_PROXY);
//// if (dataMap != null && !dataMap.isEmpty()) {
//// List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
//// max_behot_time = (Long)dataMap.get("max_behot_time");
//// System.out.println(max_behot_time + "=======" + ttlist.size());
//// if (null == max_behot_time || ttlist.isEmpty()) {
//// f = false;
//// } else {
//// if (ttlist.size() > 0) {
//// list.addAll(ttlist);
//// }
//// }
//// }else{
//// f = false;
//// }
//// }
//// long b = System.currentTimeMillis();
//// System.out.println("一轮的采集时间为:" + (b - a) / 1000+" 数据量为" + list.size());
//// }
//
// }
//
//}
///** /**
// * @Title: TouTiaoExample.java * @Title: TouTiaoExample.java
// * @Package com.zhiwei.toutiao.test * @Package com.zhiwei.toutiao.test
// * @Description: * @Description:
// * @author hero * @author hero
// * @date 2016年9月2日 上午11:48:51 * @date 2016年9月2日 上午11:48:51
// * @version V1.0 * @version V1.0
// */ *//*
//package com.zhiwei.toutiao.test; package com.zhiwei.toutiao.test;
//import java.util.Date; import java.util.Date;
//import java.util.List; import java.util.List;
//import java.util.Map; import java.util.Map;
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoArticleParse; import com.alibaba.fastjson.JSON;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse; import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.zhiweiTools.tools.ZhiWeiTools; import com.mongodb.BasicDBObject;
// import com.mongodb.DB;
///** import com.mongodb.DBCollection;
// * @Description: import com.mongodb.DBObject;
// * @author hero import com.mongodb.Mongo;
// * @date 2016年9月2日 上午11:48:51 import com.zhiwei.common.config.GroupType;
// */ import com.zhiwei.crawler.proxy.ProxyFactory;
//public class TouTiaoMicroExample { import com.zhiwei.tools.timeparse.TimeParse;
// import com.zhiwei.toutiao.bean.TouTiaoArticle;
// public static void main(String[] args) throws Exception { import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
// long a = System.currentTimeMillis();
// String user_id = "55301399445"; *//**
// Date date = new Date((new Date().getTime()-24*60*60*1000)); * @Description:
// parseMicroTouTiao(user_id, date); * @author hero
// long b = System.currentTimeMillis(); * @date 2016年9月2日 上午11:48:51
// System.out.println("一轮的采集时间为:" + (b - a) / 1000); *//*
// public class TouTiaoMicroExample {
// }
// private static final String registry = "zookeeper://192.168.0.36:2181";
// private static final String group = "local";
// @SuppressWarnings("unchecked")
// public static void parseMicroTouTiao(String user_id, Date endDate){
// int count = 1; public static void main(String[] args) throws Exception {
// boolean f = true;
// String max_behot_time = null;
// while(f) Mongo mongo = new Mongo("192.168.0.81", 27017);
// { DB db = mongo.getDB("toutiao");
// if(count==3){ DBCollection coll = db.getCollection("aaaa");
// f = false;
// } ProxyFactory.init(registry, group, GroupType.PROVIDER);
// for(int i=0; i<3; i++){
// try { long a = System.currentTimeMillis();
// Map<String, Object> dataMap = TouTiaoArticleParse.getMicroTouTiaoCrawler(user_id, endDate, null, max_behot_time); String user_id = "3527019566";
// List<TouTiaoArticle> ttlist = null; Date date = TimeParse.stringFormartDate("2019-01-01 00:00:00");
// if(dataMap!=null && !dataMap.isEmpty()) parseMicroTouTiao(user_id, date, coll);
// { long b = System.currentTimeMillis();
// ttlist = (List<TouTiaoArticle>) dataMap.get("data"); System.out.println("一轮的采集时间为:" + (b - a) / 1000);
// max_behot_time = dataMap.get("max_behot_time")!=null?dataMap.get("max_behot_time").toString():null;
// if (ttlist!=null && ttlist.size() > 0) mongo.close();
// { }
// for(TouTiaoArticle touTiaoArticle : ttlist){
// System.out.println(TimeParse.dateFormartString(touTiaoArticle.getTime(), "yyyy-MM-dd HH:mm:ss"));
// } @SuppressWarnings("unchecked")
// } public static void parseMicroTouTiao(String user_id, Date endDate,DBCollection coll){
// count++;
// break; int count = 1;
// }else{ boolean f = true;
// continue; String max_behot_time = null;
// } while(f)
// } catch (Exception e) { {
// e.printStackTrace(); if(count==3){
// continue; f = false;
// } }
// } for(int i=0; i<3; i++){
// ZhiWeiTools.sleep(7000); try {
// } Map<String, Object> dataMap = TouTiaoArticleParse.getMicroTouTiaoCrawler(user_id, endDate, null, max_behot_time);
// } List<TouTiaoArticle> ttlist = null;
// if(dataMap!=null && !dataMap.isEmpty())
// {
// ttlist = (List<TouTiaoArticle>) dataMap.get("data");
//} max_behot_time = dataMap.get("max_behot_time")!=null?dataMap.get("max_behot_time").toString():null;
if (ttlist!=null && ttlist.size() > 0)
{
System.out.println(max_behot_time+"===="+ttlist.size());
for(TouTiaoArticle touTiaoArticle : ttlist){
Map map = JSONObject.toJavaObject((JSON)JSONObject.toJSON(touTiaoArticle), Map.class);
DBObject document = new BasicDBObject(map);
coll.save(document);
}
}
break;
}else{
continue;
}
} catch (Exception e) {
e.printStackTrace();
continue;
}
}
}
}
}
*/
\ No newline at end of file
...@@ -5,11 +5,12 @@ ...@@ -5,11 +5,12 @@
//import java.util.List; //import java.util.List;
//import java.util.Map; //import java.util.Map;
// //
//import com.zhiwei.proxyip.util.Tools; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.tools.timeparse.TimeParse;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//import com.zhiwei.toutiao.bean.TouTiaoQuestionAnswer; //import com.zhiwei.toutiao.bean.TouTiaoQuestionAnswer;
//import com.zhiwei.toutiao.parse.TouTiaoQuestionAnswerParse; //import com.zhiwei.toutiao.parse.TouTiaoQuestionAnswerParse;
//import com.zhiwei.zhiweiTools.excel.PoiExcelUtil; //import com.zhiwei.toutiao.util.Tools;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse;
// //
///** ///**
// * @ClassName: TouTiaoQuestionAnswerExample // * @ClassName: TouTiaoQuestionAnswerExample
...@@ -79,7 +80,6 @@ ...@@ -79,7 +80,6 @@
// nextPage = 1; // nextPage = 1;
// } // }
// System.out.println(page+"=========="+nextPage+"============"+req_type); // System.out.println(page+"=========="+nextPage+"============"+req_type);
// Tools.sleep(8000);
// } // }
// } // }
// //
......
//package com.zhiwei.toutiao.test; package com.zhiwei.toutiao.test;
//
//import java.util.List; import java.util.List;
// import java.util.Map;
//import org.junit.Test;
// import org.junit.jupiter.api.Test;
//import com.zhiwei.toutiao.bean.TouTiaoQuestion;
//import com.zhiwei.toutiao.parse.TouTiaoQuestionParse; import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.zhiweiTools.tools.URLCodeUtil; import com.mongodb.BasicDBObject;
// import com.mongodb.DB;
///** import com.mongodb.DBCollection;
// * @ClassName: TouTiaoQuestionExample import com.mongodb.DBObject;
// * @Description: TODO(头条问答采集测试类) import com.mongodb.Mongo;
// * @author hero import com.zhiwei.common.config.GroupType;
// * @date 2017年7月20日 下午3:06:51 import com.zhiwei.crawler.proxy.ProxyFactory;
// */ import com.zhiwei.excelpoi.bean.ExcelResult;
//public class TouTiaoQuestionExample { import com.zhiwei.excelpoi.excel.PoiExcelUtil;
// import com.zhiwei.toutiao.bean.TouTiaoQuestion;
// import com.zhiwei.toutiao.parse.TouTiaoQuestionParse;
//
// @Test /**
// public void touTiaoQuestionTest(){ * @ClassName: TouTiaoQuestionExample
// String word = "京东"; * @Description: TODO(头条问答采集测试类)
// * @author hero
// String url = "https://www.wukong.com/wenda/web/search/question/brow/?search_text="+ * @date 2017年7月20日 下午3:06:51
// URLCodeUtil.getURLEncode(word, "UTF-8")+"&count=15"; */
// public class TouTiaoQuestionExample {
// List<TouTiaoQuestion> list = TouTiaoQuestionParse.getSearchTouTiaoQuestion(url);
// System.out.println(list.size()); private static final String registry = "zookeeper://192.168.0.36:2181";
// for(TouTiaoQuestion question : list){ private static final String group = "local";
// System.out.println(question);
// } public static void main(String[] args) {
// } ProxyFactory.init(registry, group, GroupType.PROVIDER);
//
//} Mongo mongo = new Mongo("192.168.0.81", 27017);
DB db = mongo.getDB("wukong");
DBCollection coll = db.getCollection("wukong");
touTiaoQuestionTest(coll);
}
public static void touTiaoQuestionTest(DBCollection coll) {
PoiExcelUtil poi = PoiExcelUtil.getInstance();
ExcelResult excelResult = poi.importExcelResult("C:\\Users\\qq859\\Desktop\\悟空问答关键词.xlsx", 0);
List<Map<String,Object>> dataList = excelResult.getBodyList();
for(Map<String,Object> data : dataList) {
String word = data.get("关键词").toString();
System.out.println("word================"+word);
List<TouTiaoQuestion> list = TouTiaoQuestionParse.getSearchTouTiaoQuestion(word);
System.out.println(list.size());
for(TouTiaoQuestion question : list){
String jsonStr = JSONObject.toJSONString(question);
Map dataMap = JSONObject.toJavaObject(JSONObject.parseObject(jsonStr), Map.class);
dataMap.put("word", word);
coll.save(new BasicDBObject(dataMap));
}
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment