Commit 643adf87 by zhiwei

修复头条获取关注列表不翻页的情况

parent e43ea617
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>toutiao</artifactId>
<version>0.3.9-SNAPSHOT</version>
<version>0.4.0-SNAPSHOT</version>
<dependencies>
<dependency>
......
......@@ -178,11 +178,13 @@ public class TouTiaoAccountParse {
Signature signature = new Signature(userId, "0");
String signatureStr = signature.getSignature();
boolean more = true;
long cursor = 0;
while(more){
String url = "https://www.toutiao.com/c/user/following/?user_id="+ userId +"&cursor=0&count=100&_signature="+ signatureStr;
String url = "https://www.toutiao.com/c/user/following/?user_id="+ userId +"&cursor="+ cursor +"&count=100&_signature="+ signatureStr;
headerMap = Tools.getTouTiaoHeader();
headerMap.put("referer", "ihttps://www.toutiao.com/c/user/relation/"+ userId +"/?tab=following");
headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
logger.info("链接地址为:{}", url);
for(int i=0;i<3;i++){
try {
String htmlBody = null;
......@@ -190,14 +192,18 @@ public class TouTiaoAccountParse {
if(htmlBody != null && htmlBody.contains("name")){
JSONObject json = JSONObject.parseObject(htmlBody);
more = json.getBooleanValue("has_more");
cursor = json.getLongValue("cursor");
List<TouTiaoAccount> dataList = parseFans(json);
logger.info(cursor+"=========="+dataList.size());
if(dataList!=null && !dataList.isEmpty()){
ttaList.addAll(dataList);
break;
}else{
more = false;
}
}else{
more = false;
logger.info("数据结构错误,请检查链接:{},页面信息为:{}", url, htmlBody);
}
} catch (Exception e) {
logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
......
package com.zhiwei.toutiao.parse;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
......@@ -14,6 +16,7 @@ import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.toutiao.bean.TouTiaoQuestion;
import com.zhiwei.toutiao.util.Tools;
......@@ -25,7 +28,6 @@ import com.zhiwei.toutiao.util.Tools;
*/
public class TouTiaoQuestionParse {
private static Map<String, String> headerMap;
private static Logger logger = LogManager.getLogger(TouTiaoQuestionParse.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
......@@ -41,21 +43,32 @@ public class TouTiaoQuestionParse {
* @return List<TouTiaoQuestion> 返回类型
* @throws Exception
*/
public static List<TouTiaoQuestion> getSearchTouTiaoQuestion(String url) throws Exception {
List<TouTiaoQuestion> questtionList = new ArrayList<TouTiaoQuestion>();
headerMap = Tools.getTouTiaoQuestionHeader();
headerMap.put("referer", url);
String htmlBody = null;
try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),ProxyHolder.NAT_HEAVY_PROXY).body().string();
if (htmlBody != null) {
List<TouTiaoQuestion> ttList = parseHtmlByQuestion(htmlBody);
if (ttList != null && ttList.size() > 0) {
return ttList;
}
}
} catch (Exception e) {
throw e;
/**
 * Searches Wukong Q&A for a keyword and collects the questions from every result page.
 *
 * <p>Pages are requested ten questions at a time ({@code count=10},
 * {@code offset=page*10}). Paging stops when the response reports
 * {@code has_more=false}, when the body does not contain the text {@code "question"},
 * when the response carries no {@code data} object, or when a request fails with an
 * {@link IOException}. Whatever was collected up to that point is returned.
 *
 * @param word search keyword; URL-encoded as UTF-8 before being put into the query string
 * @return the questions gathered from all pages fetched before paging stopped;
 *         possibly empty, never {@code null}
 */
public static List<TouTiaoQuestion> getSearchTouTiaoQuestion(String word){
    List<TouTiaoQuestion> questtionList = new ArrayList<>();
    // The encoded keyword is the same for every page, so encode it once.
    String encodedWord = URLCodeUtil.getURLEncode(word, "UTF-8");
    boolean more = true;
    int page = 0;
    while (more) {
        String url = "https://www.wukong.com/wenda/web/search/question/brow/?search_text=" + encodedWord + "&count=10&offset=" + page * 10;
        Map<String, String> headerMap = Tools.getTouTiaoQuestionHeader();
        headerMap.put("referer", url);
        try {
            String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
            if (Objects.isNull(htmlBody) || !htmlBody.contains("question")) {
                // Not a question payload: nothing to parse and no paging information.
                break;
            }
            List<TouTiaoQuestion> ttList = parseHtmlByQuestion(htmlBody);
            if (Objects.nonNull(ttList) && !ttList.isEmpty()) {
                questtionList.addAll(ttList);
            }
            JSONObject dataJSON = JSONObject.parseObject(htmlBody).getJSONObject("data");
            // Without a "data" object the response cannot say whether more pages exist,
            // so treat it as the last page instead of dereferencing null.
            more = Objects.nonNull(dataJSON) && dataJSON.getBooleanValue("has_more");
            page++;
        } catch (IOException e) {
            logger.info("头条问答解析数据出现问题", e);
            // The page counter has not advanced; leaving 'more' true would re-request
            // the same failing page forever. Stop and return what was collected.
            more = false;
        }
    }
    return questtionList;
}
......@@ -71,15 +84,13 @@ public class TouTiaoQuestionParse {
* @return List<TouTiaoQuestion> 返回类型
*/
private static List<TouTiaoQuestion> parseHtmlByQuestion(String htmlBody) {
List<TouTiaoQuestion> questtionList = new ArrayList<TouTiaoQuestion>();
List<TouTiaoQuestion> questtionList = new ArrayList<>();
JSONObject jsonObject = JSONObject.parseObject(htmlBody);
String err_tips = jsonObject.getString("err_tips");
if (err_tips.equals("success")) {
String errTips = jsonObject.getString("err_tips");
if (errTips.equals("success")) {
JSONObject json = jsonObject.getJSONObject("data");
JSONArray jsonArray = json.getJSONArray("feed_question");
for (int i = 0; i < jsonArray.size(); i++) {
try {
JSONObject question = jsonArray.getJSONObject(i).getJSONObject("question");
......@@ -88,18 +99,17 @@ public class TouTiaoQuestionParse {
String url = "http://www.toutiao.com/a" + question.getString("qid") + "/";
Date time = TimeParse.stringFormartDate(question.getLong("create_time") * 1000L + "");
String source = question.getJSONObject("user").getString("uname");
int follow_count = question.getIntValue("follow_count");
int nice_ans_count = question.getIntValue("nice_ans_count");
int normal_ans_count = question.getIntValue("normal_ans_count");
int ans_count = nice_ans_count + normal_ans_count;
int followCount = question.getIntValue("follow_count");
int niceAnsCount = question.getIntValue("nice_ans_count");
int normalAnsCount = question.getIntValue("normal_ans_count");
int ansCount = niceAnsCount + normalAnsCount;
TouTiaoQuestion touTiaoQuestion = new TouTiaoQuestion(url, title, source, content, time,
follow_count, nice_ans_count, normal_ans_count, ans_count);
followCount, niceAnsCount, normalAnsCount, ansCount);
questtionList.add(touTiaoQuestion);
} catch (Exception e) {
logger.info("头条问答解析数据出现问题", e.fillInStackTrace());
logger.info("头条问答解析数据出现问题", e);
continue;
}
}
......
//package com.zhiwei.toutiao.test;
//
//import java.util.List;
//
//import org.junit.Test;
//
//import com.zhiwei.toutiao.bean.TouTiaoAccount;
//import com.zhiwei.toutiao.parse.TouTiaoAccountParse;
//
///**
// * @ClassName: TouTiaoAccountExample
// * @Description: TODO(今日头条帐号采集)
// * @author hero
// * @date 2017年10月17日 下午4:03:44
// */
//public class TouTiaoAccountExample {
//
// public void touTiaoAccountTest(){
// String word = "华尔街瞭望";
// System.out.println("===================="+TouTiaoAccountParse.getTouTiaoAccountInfoByName(word, null));
// }
//
//
//
// @Test
// public void touTiaoAccountFriendTest(){
// String userid = "3350881978";
// List<TouTiaoAccount> userList = TouTiaoAccountParse.getFriendsList(userid, null,1000);
// for(TouTiaoAccount tta : userList){
// System.out.println(tta);
// }
// }
//}
package com.zhiwei.toutiao.test;
import java.util.List;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.toutiao.bean.TouTiaoAccount;
import com.zhiwei.toutiao.parse.TouTiaoAccountParse;
/**
* @ClassName: TouTiaoAccountExample
* @Description: TODO(今日头条帐号采集)
* @author hero
* @date 2017年10月17日 下午4:03:44
*/
public class TouTiaoAccountExample {
private static final String registry = "zookeeper://192.168.0.36:2181";
private static final String group = "local";
public static void main(String[] args) {
ProxyFactory.init(registry, group, GroupType.PROVIDER);
touTiaoAccountFriendTest();
}
public void touTiaoAccountTest(){
String word = "华尔街瞭望";
System.out.println("===================="+TouTiaoAccountParse.getTouTiaoAccountInfoByName(word, null));
}
public static void touTiaoAccountFriendTest(){
String userid = "3478445819704347";
List<TouTiaoAccount> userList = TouTiaoAccountParse.getFriendsList(userid, ProxyHolder.NAT_HEAVY_PROXY);
for(TouTiaoAccount tta : userList){
System.out.println(tta);
}
}
}
/**
* @Title: TouTiaoExample.java
* @Package com.zhiwei.toutiao.test
* @Description:
* @author hero
* @date 2016年9月2日 上午11:48:51
* @version V1.0
*/
/**
*
*/
package com.zhiwei.toutiao.test;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
/**
 * Manual example that fetches the content of one Toutiao article and prints it.
 *
 * <p>The commented-out lines inside {@link #main(String[])} are a disabled loop that
 * paged through an account's article history via
 * {@code TouTiaoArticleParse.getTouTiaoHistory}; they are kept for reference only.
 *
 * @author hero
 * @date 2016-09-02 11:48:51
 */
public class TouTiaoExample {
// Registry address passed to ProxyFactory.init.
private static final String registry = "zookeeper://192.168.0.36:2181";
// Group name passed to ProxyFactory.init.
private static final String group = "local";
/**
 * Initialises the proxy factory, requests the content of a fixed article URL and
 * writes it to standard output.
 *
 * @param args unused
 * @throws Exception declared by the signature; propagated from the calls below
 */
// NOTE(review): the only unchecked cast is in the commented-out history loop below,
// so this suppression looks unnecessary while that loop stays disabled - confirm.
@SuppressWarnings("unchecked")
public static void main(String[] args) throws Exception {
ProxyFactory.init(registry, group, GroupType.PROVIDER);
String url = "https://www.toutiao.com/a6659244827009352196/";
// NOTE(review): the second argument is passed as null; presumably the proxy to use - confirm.
String content = TouTiaoArticleParse.getContent(url, null);
System.out.println(content);
// Disabled example: page through the article history of the listed account ids.
// List<String> urlList = new ArrayList<String>();
// urlList.add("1920576965");
// Date endTime = TimeParse.stringFormartDate("2018-10-01");
///**
// * @Title: TouTiaoExample.java
// * @Package com.zhiwei.toutiao.test
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// * @version V1.0
// */
///**
//*
//*/
//package com.zhiwei.toutiao.test;
//
// for (String url : urlList) {
// long a = System.currentTimeMillis();
// String mid = url;
// Long max_behot_time = 0L;
// List<TouTiaoArticle> list = new ArrayList<>();
// boolean f = true;
// while (f) {
// Map<String, Object> dataMap = null;
// dataMap = TouTiaoArticleParse.getTouTiaoHistory(mid, max_behot_time+"", endTime, ProxyHolder.NAT_PROXY);
// if (dataMap != null && !dataMap.isEmpty()) {
// List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
// max_behot_time = (Long)dataMap.get("max_behot_time");
// System.out.println(max_behot_time + "=======" + ttlist.size());
// if (null == max_behot_time || ttlist.isEmpty()) {
// f = false;
// } else {
// if (ttlist.size() > 0) {
// list.addAll(ttlist);
// }
// }
// }else{
// f = false;
// }
// }
// long b = System.currentTimeMillis();
// System.out.println("一轮的采集时间为:" + (b - a) / 1000+" 数据量为" + list.size());
// }
}
}
//import java.util.ArrayList;
//import java.util.Date;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.tools.timeparse.TimeParse;
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
//
///**
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// */
//public class TouTiaoExample {
//
// private static final String registry = "zookeeper://192.168.0.36:2181";
// private static final String group = "local";
//
// @SuppressWarnings("unchecked")
// public static void main(String[] args) throws Exception {
//
// ProxyFactory.init(registry, group, GroupType.PROVIDER);
// String url = "https://www.toutiao.com/a6659244827009352196/";
// String content = TouTiaoArticleParse.getContent(url, null);
// System.out.println(content);
//
//// List<String> urlList = new ArrayList<String>();
//// urlList.add("1920576965");
//// Date endTime = TimeParse.stringFormartDate("2018-10-01");
////
//// for (String url : urlList) {
//// long a = System.currentTimeMillis();
//// String mid = url;
//// Long max_behot_time = 0L;
//// List<TouTiaoArticle> list = new ArrayList<>();
//// boolean f = true;
//// while (f) {
//// Map<String, Object> dataMap = null;
//// dataMap = TouTiaoArticleParse.getTouTiaoHistory(mid, max_behot_time+"", endTime, ProxyHolder.NAT_PROXY);
//// if (dataMap != null && !dataMap.isEmpty()) {
//// List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
//// max_behot_time = (Long)dataMap.get("max_behot_time");
//// System.out.println(max_behot_time + "=======" + ttlist.size());
//// if (null == max_behot_time || ttlist.isEmpty()) {
//// f = false;
//// } else {
//// if (ttlist.size() > 0) {
//// list.addAll(ttlist);
//// }
//// }
//// }else{
//// f = false;
//// }
//// }
//// long b = System.currentTimeMillis();
//// System.out.println("一轮的采集时间为:" + (b - a) / 1000+" 数据量为" + list.size());
//// }
//
// }
//
//}
///**
// * @Title: TouTiaoExample.java
// * @Package com.zhiwei.toutiao.test
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// * @version V1.0
// */
//package com.zhiwei.toutiao.test;
//import java.util.Date;
//import java.util.List;
//import java.util.Map;
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse;
//import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
//
///**
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// */
//public class TouTiaoMicroExample {
//
// public static void main(String[] args) throws Exception {
// long a = System.currentTimeMillis();
// String user_id = "55301399445";
// Date date = new Date((new Date().getTime()-24*60*60*1000));
// parseMicroTouTiao(user_id, date);
// long b = System.currentTimeMillis();
// System.out.println("一轮的采集时间为:" + (b - a) / 1000);
//
// }
//
//
// @SuppressWarnings("unchecked")
// public static void parseMicroTouTiao(String user_id, Date endDate){
// int count = 1;
// boolean f = true;
// String max_behot_time = null;
// while(f)
// {
// if(count==3){
// f = false;
// }
// for(int i=0; i<3; i++){
// try {
// Map<String, Object> dataMap = TouTiaoArticleParse.getMicroTouTiaoCrawler(user_id, endDate, null, max_behot_time);
// List<TouTiaoArticle> ttlist = null;
// if(dataMap!=null && !dataMap.isEmpty())
// {
// ttlist = (List<TouTiaoArticle>) dataMap.get("data");
// max_behot_time = dataMap.get("max_behot_time")!=null?dataMap.get("max_behot_time").toString():null;
// if (ttlist!=null && ttlist.size() > 0)
// {
// for(TouTiaoArticle touTiaoArticle : ttlist){
// System.out.println(TimeParse.dateFormartString(touTiaoArticle.getTime(), "yyyy-MM-dd HH:mm:ss"));
// }
// }
// count++;
// break;
// }else{
// continue;
// }
// } catch (Exception e) {
// e.printStackTrace();
// continue;
// }
// }
// ZhiWeiTools.sleep(7000);
// }
// }
//
//
//
//}
/**
* @Title: TouTiaoExample.java
* @Package com.zhiwei.toutiao.test
* @Description:
* @author hero
* @date 2016年9月2日 上午11:48:51
* @version V1.0
*//*
package com.zhiwei.toutiao.test;
import java.util.Date;
import java.util.List;
import java.util.Map;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.Mongo;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
*//**
* @Description:
* @author hero
* @date 2016年9月2日 上午11:48:51
*//*
public class TouTiaoMicroExample {
private static final String registry = "zookeeper://192.168.0.36:2181";
private static final String group = "local";
public static void main(String[] args) throws Exception {
Mongo mongo = new Mongo("192.168.0.81", 27017);
DB db = mongo.getDB("toutiao");
DBCollection coll = db.getCollection("aaaa");
ProxyFactory.init(registry, group, GroupType.PROVIDER);
long a = System.currentTimeMillis();
String user_id = "3527019566";
Date date = TimeParse.stringFormartDate("2019-01-01 00:00:00");
parseMicroTouTiao(user_id, date, coll);
long b = System.currentTimeMillis();
System.out.println("一轮的采集时间为:" + (b - a) / 1000);
mongo.close();
}
@SuppressWarnings("unchecked")
public static void parseMicroTouTiao(String user_id, Date endDate,DBCollection coll){
int count = 1;
boolean f = true;
String max_behot_time = null;
while(f)
{
if(count==3){
f = false;
}
for(int i=0; i<3; i++){
try {
Map<String, Object> dataMap = TouTiaoArticleParse.getMicroTouTiaoCrawler(user_id, endDate, null, max_behot_time);
List<TouTiaoArticle> ttlist = null;
if(dataMap!=null && !dataMap.isEmpty())
{
ttlist = (List<TouTiaoArticle>) dataMap.get("data");
max_behot_time = dataMap.get("max_behot_time")!=null?dataMap.get("max_behot_time").toString():null;
if (ttlist!=null && ttlist.size() > 0)
{
System.out.println(max_behot_time+"===="+ttlist.size());
for(TouTiaoArticle touTiaoArticle : ttlist){
Map map = JSONObject.toJavaObject((JSON)JSONObject.toJSON(touTiaoArticle), Map.class);
DBObject document = new BasicDBObject(map);
coll.save(document);
}
}
break;
}else{
continue;
}
} catch (Exception e) {
e.printStackTrace();
continue;
}
}
}
}
}
*/
\ No newline at end of file
......@@ -5,11 +5,12 @@
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.proxyip.util.Tools;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.tools.timeparse.TimeParse;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//import com.zhiwei.toutiao.bean.TouTiaoQuestionAnswer;
//import com.zhiwei.toutiao.parse.TouTiaoQuestionAnswerParse;
//import com.zhiwei.zhiweiTools.excel.PoiExcelUtil;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse;
//import com.zhiwei.toutiao.util.Tools;
//
///**
// * @ClassName: TouTiaoQuestionAnswerExample
......@@ -79,7 +80,6 @@
// nextPage = 1;
// }
// System.out.println(page+"=========="+nextPage+"============"+req_type);
// Tools.sleep(8000);
// }
// }
//
......
//package com.zhiwei.toutiao.test;
//
//import java.util.List;
//
//import org.junit.Test;
//
//import com.zhiwei.toutiao.bean.TouTiaoQuestion;
//import com.zhiwei.toutiao.parse.TouTiaoQuestionParse;
//import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
//
///**
// * @ClassName: TouTiaoQuestionExample
// * @Description: TODO(头条问答采集测试类)
// * @author hero
// * @date 2017年7月20日 下午3:06:51
// */
//public class TouTiaoQuestionExample {
//
//
//
// @Test
// public void touTiaoQuestionTest(){
// String word = "京东";
//
// String url = "https://www.wukong.com/wenda/web/search/question/brow/?search_text="+
// URLCodeUtil.getURLEncode(word, "UTF-8")+"&count=15";
//
// List<TouTiaoQuestion> list = TouTiaoQuestionParse.getSearchTouTiaoQuestion(url);
// System.out.println(list.size());
// for(TouTiaoQuestion question : list){
// System.out.println(question);
// }
// }
//
//}
package com.zhiwei.toutiao.test;
import java.util.List;
import java.util.Map;
import org.junit.jupiter.api.Test;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.Mongo;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.bean.ExcelResult;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.toutiao.bean.TouTiaoQuestion;
import com.zhiwei.toutiao.parse.TouTiaoQuestionParse;
/**
 * Manual driver for the Toutiao (Wukong) Q&A search crawler.
 *
 * <p>Reads search keywords from an Excel sheet, runs
 * {@code TouTiaoQuestionParse.getSearchTouTiaoQuestion} for each keyword and saves
 * every returned question, tagged with its keyword, into a MongoDB collection.
 *
 * @author hero
 * @date 2017-07-20 15:06:51
 */
public class TouTiaoQuestionExample {

    /** Registry address handed to {@code ProxyFactory.init}. */
    private static final String registry = "zookeeper://192.168.0.36:2181";

    /** Group name handed to {@code ProxyFactory.init}. */
    private static final String group = "local";

    /**
     * Initialises the proxy factory, opens the MongoDB connection, runs the crawl and
     * closes the connection again, even when the crawl throws.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ProxyFactory.init(registry, group, GroupType.PROVIDER);
        Mongo mongo = new Mongo("192.168.0.81", 27017);
        try {
            DB db = mongo.getDB("wukong");
            DBCollection coll = db.getCollection("wukong");
            touTiaoQuestionTest(coll);
        } finally {
            // Release the client's connections; previously the client was never closed.
            mongo.close();
        }
    }

    /**
     * Crawls the questions for every keyword in the Excel sheet and stores them.
     *
     * <p>Rows whose {@code 关键词} cell is missing are skipped. Each stored document is
     * the JSON form of the question plus a {@code word} field holding the keyword.
     *
     * @param coll collection that receives one document per question
     */
    public static void touTiaoQuestionTest(DBCollection coll) {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        ExcelResult excelResult = poi.importExcelResult("C:\\Users\\qq859\\Desktop\\悟空问答关键词.xlsx", 0);
        List<Map<String,Object>> dataList = excelResult.getBodyList();
        for (Map<String,Object> data : dataList) {
            Object keywordCell = data.get("关键词");
            if (keywordCell == null) {
                // A row without the keyword column cannot be searched; skip it
                // instead of failing the whole run with a NullPointerException.
                continue;
            }
            String word = keywordCell.toString();
            System.out.println("word================"+word);
            List<TouTiaoQuestion> list = TouTiaoQuestionParse.getSearchTouTiaoQuestion(word);
            System.out.println(list.size());
            for (TouTiaoQuestion question : list) {
                String jsonStr = JSONObject.toJSONString(question);
                // Converting a parsed JSON object yields string keys; the API only
                // exposes the raw Map type, hence the narrowly scoped suppression.
                @SuppressWarnings("unchecked")
                Map<String, Object> dataMap = JSONObject.toJavaObject(JSONObject.parseObject(jsonStr), Map.class);
                dataMap.put("word", word);
                coll.save(new BasicDBObject(dataMap));
            }
        }
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment