Commit 408ac5cd by [zhangzhiwei]

添加用户历史文章及关注列表新接口采集

parent a0aee201
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.0.9-SNAPSHOT</version> <version>0.1.0-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
......
...@@ -193,25 +193,28 @@ public class TouTiaoAccountParse { ...@@ -193,25 +193,28 @@ public class TouTiaoAccountParse {
headerMap = Tools.getTouTiaoHeader(); headerMap = Tools.getTouTiaoHeader();
headerMap.put("referer", "ihttps://www.toutiao.com/c/user/relation/"+ userid +"/?tab=following"); headerMap.put("referer", "ihttps://www.toutiao.com/c/user/relation/"+ userid +"/?tab=following");
headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"); headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
try { for(int i=0;i<3;i++){
String htmlBody = null; try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string(); String htmlBody = null;
if(htmlBody != null && htmlBody.contains("name")){ htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
JSONObject json = JSONObject.parseObject(htmlBody); if(htmlBody != null && htmlBody.contains("name")){
more = json.getBooleanValue("has_more"); JSONObject json = JSONObject.parseObject(htmlBody);
List<TouTiaoAccount> dataList = parseFans(json); more = json.getBooleanValue("has_more");
if(dataList!=null && !dataList.isEmpty()){ List<TouTiaoAccount> dataList = parseFans(json);
ttaList.addAll(dataList); if(dataList!=null && !dataList.isEmpty()){
ttaList.addAll(dataList);
}else{
more = false;
}
}else{ }else{
more = false; more = false;
continue;
} }
}else{ } catch (Exception e) {
logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
more = false; more = false;
continue;
} }
} catch (Exception e) {
logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
more = false;
return null;
} }
} }
return ttaList; return ttaList;
......
...@@ -14,6 +14,7 @@ package com.zhiwei.toutiao.parse; ...@@ -14,6 +14,7 @@ package com.zhiwei.toutiao.parse;
import java.io.IOException; import java.io.IOException;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
...@@ -59,7 +60,6 @@ public class TouTiaoArticleParse { ...@@ -59,7 +60,6 @@ public class TouTiaoArticleParse {
if(max_behot_time!=null){ if(max_behot_time!=null){
url = url + "&max_behot_time="+max_behot_time; url = url + "&max_behot_time="+max_behot_time;
} }
System.out.println("url=========="+url);
Map<String,String> headerMap = Tools.getTouTiaoHeader(); Map<String,String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", url); headerMap.put("Referer", url);
String htmlBody = null; String htmlBody = null;
...@@ -77,7 +77,7 @@ public class TouTiaoArticleParse { ...@@ -77,7 +77,7 @@ public class TouTiaoArticleParse {
logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace()); logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
throw e; throw e;
} }
return null; return Collections.emptyMap();
} }
@Deprecated @Deprecated
...@@ -89,7 +89,6 @@ public class TouTiaoArticleParse { ...@@ -89,7 +89,6 @@ public class TouTiaoArticleParse {
if(max_behot_time!=null){ if(max_behot_time!=null){
url = url + "&max_behot_time="+max_behot_time; url = url + "&max_behot_time="+max_behot_time;
} }
System.out.println("url=========="+url);
Map<String,String> headerMap = Tools.getTouTiaoHeader(); Map<String,String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", url); headerMap.put("Referer", url);
String htmlBody = null; String htmlBody = null;
...@@ -107,7 +106,7 @@ public class TouTiaoArticleParse { ...@@ -107,7 +106,7 @@ public class TouTiaoArticleParse {
logger.error("获取今日头条帐号数据连接超时", e); logger.error("获取今日头条帐号数据连接超时", e);
throw e; throw e;
} }
return null; return Collections.emptyMap();
} }
/** /**
...@@ -123,60 +122,66 @@ public class TouTiaoArticleParse { ...@@ -123,60 +122,66 @@ public class TouTiaoArticleParse {
Signature signature = new Signature(user_id, max_behot_time); Signature signature = new Signature(user_id, max_behot_time);
String as=signature.getAs(); String as=signature.getAs();
String cp=signature.getCp(); String cp=signature.getCp();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+signature; String _signature = signature.getSignature();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+_signature;
if(max_behot_time!=null){ if(max_behot_time!=null){
url = url + "&max_behot_time="+max_behot_time; url = url + "&max_behot_time="+max_behot_time;
} }
System.out.println("url=========="+url);
Map<String,String> headerMap = new HashMap<String,String>(); Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"); headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer","https://www.toutiao.com/c/user/"+user_id+"/"); headerMap.put("referer","https://www.toutiao.com/c/user/"+user_id+"/");
String htmlBody = null; String htmlBody = null;
try { for(int i=0;i<3;i++){
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); try {
if(htmlBody != null && htmlBody.contains("behot_time")){ htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
Map<String, Object> ttList = parseHtmlByAccount(user_id, htmlBody, endData); if(htmlBody != null && htmlBody.contains("behot_time")){
if(ttList!=null && ttList.size()>0){ Map<String, Object> ttList = parseHtmlByAccount(user_id, htmlBody, endData);
return ttList; if(ttList!=null && ttList.size()>0){
return ttList;
}
}else{
logger.info("数据为null");
continue;
} }
}else{ } catch (Exception e) {
logger.info("数据为null"); logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
throw e;
} }
} catch (Exception e) {
logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
throw e;
} }
return null; return Collections.emptyMap();
} }
public static Map<String, Object> getTouTiaoHistory(String user_id, String max_behot_time,Date endData, ProxyHolder proxy ) throws Exception{ public static Map<String, Object> getTouTiaoHistory(String user_id, String max_behot_time,Date endData, ProxyHolder proxy ) throws Exception{
Signature signature = new Signature(user_id, max_behot_time); Signature signature = new Signature(user_id, max_behot_time);
String as=signature.getAs(); String as=signature.getAs();
String cp=signature.getCp(); String cp=signature.getCp();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+signature; String _signature = signature.getSignature();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+_signature;
if(max_behot_time!=null){ if(max_behot_time!=null){
url = url + "&max_behot_time="+max_behot_time; url = url + "&max_behot_time="+max_behot_time;
} }
System.out.println("url=========="+url);
Map<String,String> headerMap = new HashMap<String,String>(); Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"); headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer","https://www.toutiao.com/c/user/"+user_id+"/"); headerMap.put("referer","https://www.toutiao.com/c/user/"+user_id+"/");
String htmlBody = null; String htmlBody = null;
try { for(int i=0;i<3;i++){
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); try {
if(htmlBody != null && htmlBody.contains("behot_time")){ htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
Map<String, Object> ttList = parseHtmlByAccount(user_id, htmlBody, endData); if(htmlBody != null && htmlBody.contains("behot_time")){
if(ttList!=null && ttList.size()>0){ Map<String, Object> ttList = parseHtmlByAccount(user_id, htmlBody, endData);
return ttList; if(ttList!=null && ttList.size()>0){
return ttList;
}
}else{
logger.info("数据为null");
continue;
} }
}else{ } catch (Exception e) {
logger.info("数据为null"); logger.error("获取今日头条帐号数据连接超时", e);
throw e;
} }
} catch (Exception e) {
logger.error("获取今日头条帐号数据连接超时", e);
throw e;
} }
return null; return Collections.emptyMap();
} }
......
...@@ -16,6 +16,9 @@ import java.util.Date; ...@@ -16,6 +16,9 @@ import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoArticle; import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.parse.TouTiaoArticleParse; import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
...@@ -26,25 +29,29 @@ import com.zhiwei.toutiao.parse.TouTiaoArticleParse; ...@@ -26,25 +29,29 @@ import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
* @date 2016年9月2日 上午11:48:51 * @date 2016年9月2日 上午11:48:51
*/ */
public class TouTiaoExample { public class TouTiaoExample {
private static final String registry = "zookeeper://192.168.0.36:2181";
private static final String group = "local";
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
long a = System.currentTimeMillis();
ProxyFactory.init(registry, group, GroupType.PROVIDER);
List<String> urlList = new ArrayList<String>(); List<String> urlList = new ArrayList<String>();
urlList.add("6075371636"); urlList.add("6075371636");
Date endTime = TimeParse.stringFormartDate("2018-10-01");
System.out.println(urlList.size());
Date endTime = TimeParse.stringFormartDate("2018-04-01");
for (String url : urlList) { for (String url : urlList) {
long a = System.currentTimeMillis();
String mid = url; String mid = url;
Long max_behot_time = 0L; Long max_behot_time = 0L;
List<TouTiaoArticle> list = new ArrayList<>();
boolean f = true; boolean f = true;
while (f) { while (f) {
Map<String, Object> dataMap = null; Map<String, Object> dataMap = null;
dataMap = TouTiaoArticleParse.getMicroTouTiaoCrawler(mid, endTime, null, max_behot_time+""); dataMap = TouTiaoArticleParse.getTouTiaoHistory(mid, max_behot_time+"", endTime, ProxyHolder.NAT_PROXY);
if (dataMap != null) { if (dataMap != null && !dataMap.isEmpty()) {
List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data"); List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
max_behot_time = (Long)dataMap.get("max_behot_time"); max_behot_time = (Long)dataMap.get("max_behot_time");
System.out.println(max_behot_time + "=======" + ttlist.size()); System.out.println(max_behot_time + "=======" + ttlist.size());
...@@ -52,16 +59,17 @@ public class TouTiaoExample { ...@@ -52,16 +59,17 @@ public class TouTiaoExample {
f = false; f = false;
} else { } else {
if (ttlist.size() > 0) { if (ttlist.size() > 0) {
for (TouTiaoArticle tt : ttlist) { list.addAll(ttlist);
System.out.println(tt);
}
} }
} }
}else{
f = false;
} }
} }
long b = System.currentTimeMillis();
System.out.println("一轮的采集时间为:" + (b - a) / 1000+" 数据量为" + list.size());
} }
long b = System.currentTimeMillis();
System.out.println("一轮的采集时间为:" + (b - a) / 1000);
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment