Commit 408ac5cd by [zhangzhiwei]

添加用户历史文章及关注列表新接口采集

parent a0aee201
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.0.9-SNAPSHOT</version> <version>0.1.0-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
......
...@@ -193,6 +193,7 @@ public class TouTiaoAccountParse { ...@@ -193,6 +193,7 @@ public class TouTiaoAccountParse {
headerMap = Tools.getTouTiaoHeader(); headerMap = Tools.getTouTiaoHeader();
headerMap.put("referer", "ihttps://www.toutiao.com/c/user/relation/"+ userid +"/?tab=following"); headerMap.put("referer", "ihttps://www.toutiao.com/c/user/relation/"+ userid +"/?tab=following");
headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"); headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
for(int i=0;i<3;i++){
try { try {
String htmlBody = null; String htmlBody = null;
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string(); htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
...@@ -207,11 +208,13 @@ public class TouTiaoAccountParse { ...@@ -207,11 +208,13 @@ public class TouTiaoAccountParse {
} }
}else{ }else{
more = false; more = false;
continue;
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace()); logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
more = false; more = false;
return null; continue;
}
} }
} }
return ttaList; return ttaList;
......
...@@ -14,6 +14,7 @@ package com.zhiwei.toutiao.parse; ...@@ -14,6 +14,7 @@ package com.zhiwei.toutiao.parse;
import java.io.IOException; import java.io.IOException;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
...@@ -59,7 +60,6 @@ public class TouTiaoArticleParse { ...@@ -59,7 +60,6 @@ public class TouTiaoArticleParse {
if(max_behot_time!=null){ if(max_behot_time!=null){
url = url + "&max_behot_time="+max_behot_time; url = url + "&max_behot_time="+max_behot_time;
} }
System.out.println("url=========="+url);
Map<String,String> headerMap = Tools.getTouTiaoHeader(); Map<String,String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", url); headerMap.put("Referer", url);
String htmlBody = null; String htmlBody = null;
...@@ -77,7 +77,7 @@ public class TouTiaoArticleParse { ...@@ -77,7 +77,7 @@ public class TouTiaoArticleParse {
logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace()); logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
throw e; throw e;
} }
return null; return Collections.emptyMap();
} }
@Deprecated @Deprecated
...@@ -89,7 +89,6 @@ public class TouTiaoArticleParse { ...@@ -89,7 +89,6 @@ public class TouTiaoArticleParse {
if(max_behot_time!=null){ if(max_behot_time!=null){
url = url + "&max_behot_time="+max_behot_time; url = url + "&max_behot_time="+max_behot_time;
} }
System.out.println("url=========="+url);
Map<String,String> headerMap = Tools.getTouTiaoHeader(); Map<String,String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", url); headerMap.put("Referer", url);
String htmlBody = null; String htmlBody = null;
...@@ -107,7 +106,7 @@ public class TouTiaoArticleParse { ...@@ -107,7 +106,7 @@ public class TouTiaoArticleParse {
logger.error("获取今日头条帐号数据连接超时", e); logger.error("获取今日头条帐号数据连接超时", e);
throw e; throw e;
} }
return null; return Collections.emptyMap();
} }
/** /**
...@@ -123,15 +122,16 @@ public class TouTiaoArticleParse { ...@@ -123,15 +122,16 @@ public class TouTiaoArticleParse {
Signature signature = new Signature(user_id, max_behot_time); Signature signature = new Signature(user_id, max_behot_time);
String as=signature.getAs(); String as=signature.getAs();
String cp=signature.getCp(); String cp=signature.getCp();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+signature; String _signature = signature.getSignature();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+_signature;
if(max_behot_time!=null){ if(max_behot_time!=null){
url = url + "&max_behot_time="+max_behot_time; url = url + "&max_behot_time="+max_behot_time;
} }
System.out.println("url=========="+url);
Map<String,String> headerMap = new HashMap<String,String>(); Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"); headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer","https://www.toutiao.com/c/user/"+user_id+"/"); headerMap.put("referer","https://www.toutiao.com/c/user/"+user_id+"/");
String htmlBody = null; String htmlBody = null;
for(int i=0;i<3;i++){
try { try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
if(htmlBody != null && htmlBody.contains("behot_time")){ if(htmlBody != null && htmlBody.contains("behot_time")){
...@@ -141,27 +141,30 @@ public class TouTiaoArticleParse { ...@@ -141,27 +141,30 @@ public class TouTiaoArticleParse {
} }
}else{ }else{
logger.info("数据为null"); logger.info("数据为null");
continue;
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace()); logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
throw e; throw e;
} }
return null; }
return Collections.emptyMap();
} }
public static Map<String, Object> getTouTiaoHistory(String user_id, String max_behot_time,Date endData, ProxyHolder proxy ) throws Exception{ public static Map<String, Object> getTouTiaoHistory(String user_id, String max_behot_time,Date endData, ProxyHolder proxy ) throws Exception{
Signature signature = new Signature(user_id, max_behot_time); Signature signature = new Signature(user_id, max_behot_time);
String as=signature.getAs(); String as=signature.getAs();
String cp=signature.getCp(); String cp=signature.getCp();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+signature; String _signature = signature.getSignature();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+_signature;
if(max_behot_time!=null){ if(max_behot_time!=null){
url = url + "&max_behot_time="+max_behot_time; url = url + "&max_behot_time="+max_behot_time;
} }
System.out.println("url=========="+url);
Map<String,String> headerMap = new HashMap<String,String>(); Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"); headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer","https://www.toutiao.com/c/user/"+user_id+"/"); headerMap.put("referer","https://www.toutiao.com/c/user/"+user_id+"/");
String htmlBody = null; String htmlBody = null;
for(int i=0;i<3;i++){
try { try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
if(htmlBody != null && htmlBody.contains("behot_time")){ if(htmlBody != null && htmlBody.contains("behot_time")){
...@@ -171,12 +174,14 @@ public class TouTiaoArticleParse { ...@@ -171,12 +174,14 @@ public class TouTiaoArticleParse {
} }
}else{ }else{
logger.info("数据为null"); logger.info("数据为null");
continue;
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("获取今日头条帐号数据连接超时", e); logger.error("获取今日头条帐号数据连接超时", e);
throw e; throw e;
} }
return null; }
return Collections.emptyMap();
} }
......
...@@ -16,6 +16,9 @@ import java.util.Date; ...@@ -16,6 +16,9 @@ import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoArticle; import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.parse.TouTiaoArticleParse; import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
...@@ -27,24 +30,28 @@ import com.zhiwei.toutiao.parse.TouTiaoArticleParse; ...@@ -27,24 +30,28 @@ import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
*/ */
public class TouTiaoExample { public class TouTiaoExample {
private static final String registry = "zookeeper://192.168.0.36:2181";
private static final String group = "local";
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
long a = System.currentTimeMillis();
List<String> urlList = new ArrayList<String>();
urlList.add("6075371636");
System.out.println(urlList.size()); ProxyFactory.init(registry, group, GroupType.PROVIDER);
Date endTime = TimeParse.stringFormartDate("2018-04-01"); List<String> urlList = new ArrayList<String>();
urlList.add("6075371636");
Date endTime = TimeParse.stringFormartDate("2018-10-01");
for (String url : urlList) { for (String url : urlList) {
long a = System.currentTimeMillis();
String mid = url; String mid = url;
Long max_behot_time = 0L; Long max_behot_time = 0L;
List<TouTiaoArticle> list = new ArrayList<>();
boolean f = true; boolean f = true;
while (f) { while (f) {
Map<String, Object> dataMap = null; Map<String, Object> dataMap = null;
dataMap = TouTiaoArticleParse.getMicroTouTiaoCrawler(mid, endTime, null, max_behot_time+""); dataMap = TouTiaoArticleParse.getTouTiaoHistory(mid, max_behot_time+"", endTime, ProxyHolder.NAT_PROXY);
if (dataMap != null) { if (dataMap != null && !dataMap.isEmpty()) {
List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data"); List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
max_behot_time = (Long)dataMap.get("max_behot_time"); max_behot_time = (Long)dataMap.get("max_behot_time");
System.out.println(max_behot_time + "=======" + ttlist.size()); System.out.println(max_behot_time + "=======" + ttlist.size());
...@@ -52,16 +59,17 @@ public class TouTiaoExample { ...@@ -52,16 +59,17 @@ public class TouTiaoExample {
f = false; f = false;
} else { } else {
if (ttlist.size() > 0) { if (ttlist.size() > 0) {
for (TouTiaoArticle tt : ttlist) { list.addAll(ttlist);
System.out.println(tt);
}
}
} }
} }
}else{
f = false;
} }
} }
long b = System.currentTimeMillis(); long b = System.currentTimeMillis();
System.out.println("一轮的采集时间为:" + (b - a) / 1000); System.out.println("一轮的采集时间为:" + (b - a) / 1000+" 数据量为" + list.size());
}
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment