Commit 408ac5cd by [zhangzhiwei]

添加用户历史文章及关注列表新接口采集

parent a0aee201
......@@ -9,7 +9,7 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.0.9-SNAPSHOT</version>
<version>0.1.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
......
......@@ -193,6 +193,7 @@ public class TouTiaoAccountParse {
headerMap = Tools.getTouTiaoHeader();
headerMap.put("referer", "ihttps://www.toutiao.com/c/user/relation/"+ userid +"/?tab=following");
headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
for(int i=0;i<3;i++){
try {
String htmlBody = null;
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
......@@ -207,11 +208,13 @@ public class TouTiaoAccountParse {
}
}else{
more = false;
continue;
}
} catch (Exception e) {
logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
more = false;
return null;
continue;
}
}
}
return ttaList;
......
......@@ -14,6 +14,7 @@ package com.zhiwei.toutiao.parse;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
......@@ -59,7 +60,6 @@ public class TouTiaoArticleParse {
if(max_behot_time!=null){
url = url + "&max_behot_time="+max_behot_time;
}
System.out.println("url=========="+url);
Map<String,String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", url);
String htmlBody = null;
......@@ -77,7 +77,7 @@ public class TouTiaoArticleParse {
logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
throw e;
}
return null;
return Collections.emptyMap();
}
@Deprecated
......@@ -89,7 +89,6 @@ public class TouTiaoArticleParse {
if(max_behot_time!=null){
url = url + "&max_behot_time="+max_behot_time;
}
System.out.println("url=========="+url);
Map<String,String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", url);
String htmlBody = null;
......@@ -107,7 +106,7 @@ public class TouTiaoArticleParse {
logger.error("获取今日头条帐号数据连接超时", e);
throw e;
}
return null;
return Collections.emptyMap();
}
/**
......@@ -123,15 +122,16 @@ public class TouTiaoArticleParse {
Signature signature = new Signature(user_id, max_behot_time);
String as=signature.getAs();
String cp=signature.getCp();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+signature;
String _signature = signature.getSignature();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+_signature;
if(max_behot_time!=null){
url = url + "&max_behot_time="+max_behot_time;
}
System.out.println("url=========="+url);
Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer","https://www.toutiao.com/c/user/"+user_id+"/");
String htmlBody = null;
for(int i=0;i<3;i++){
try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
if(htmlBody != null && htmlBody.contains("behot_time")){
......@@ -141,27 +141,30 @@ public class TouTiaoArticleParse {
}
}else{
logger.info("数据为null");
continue;
}
} catch (Exception e) {
logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
throw e;
}
return null;
}
return Collections.emptyMap();
}
public static Map<String, Object> getTouTiaoHistory(String user_id, String max_behot_time,Date endData, ProxyHolder proxy ) throws Exception{
Signature signature = new Signature(user_id, max_behot_time);
String as=signature.getAs();
String cp=signature.getCp();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+signature;
String _signature = signature.getSignature();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+_signature;
if(max_behot_time!=null){
url = url + "&max_behot_time="+max_behot_time;
}
System.out.println("url=========="+url);
Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer","https://www.toutiao.com/c/user/"+user_id+"/");
String htmlBody = null;
for(int i=0;i<3;i++){
try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
if(htmlBody != null && htmlBody.contains("behot_time")){
......@@ -171,12 +174,14 @@ public class TouTiaoArticleParse {
}
}else{
logger.info("数据为null");
continue;
}
} catch (Exception e) {
logger.error("获取今日头条帐号数据连接超时", e);
throw e;
}
return null;
}
return Collections.emptyMap();
}
......
......@@ -16,6 +16,9 @@ import java.util.Date;
import java.util.List;
import java.util.Map;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
......@@ -27,24 +30,28 @@ import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
*/
public class TouTiaoExample {
private static final String registry = "zookeeper://192.168.0.36:2181";
private static final String group = "local";
@SuppressWarnings("unchecked")
public static void main(String[] args) throws Exception {
long a = System.currentTimeMillis();
List<String> urlList = new ArrayList<String>();
urlList.add("6075371636");
System.out.println(urlList.size());
ProxyFactory.init(registry, group, GroupType.PROVIDER);
Date endTime = TimeParse.stringFormartDate("2018-04-01");
List<String> urlList = new ArrayList<String>();
urlList.add("6075371636");
Date endTime = TimeParse.stringFormartDate("2018-10-01");
for (String url : urlList) {
long a = System.currentTimeMillis();
String mid = url;
Long max_behot_time = 0L;
List<TouTiaoArticle> list = new ArrayList<>();
boolean f = true;
while (f) {
Map<String, Object> dataMap = null;
dataMap = TouTiaoArticleParse.getMicroTouTiaoCrawler(mid, endTime, null, max_behot_time+"");
if (dataMap != null) {
dataMap = TouTiaoArticleParse.getTouTiaoHistory(mid, max_behot_time+"", endTime, ProxyHolder.NAT_PROXY);
if (dataMap != null && !dataMap.isEmpty()) {
List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
max_behot_time = (Long)dataMap.get("max_behot_time");
System.out.println(max_behot_time + "=======" + ttlist.size());
......@@ -52,16 +59,17 @@ public class TouTiaoExample {
f = false;
} else {
if (ttlist.size() > 0) {
for (TouTiaoArticle tt : ttlist) {
System.out.println(tt);
}
}
list.addAll(ttlist);
}
}
}else{
f = false;
}
}
long b = System.currentTimeMillis();
System.out.println("一轮的采集时间为:" + (b - a) / 1000);
System.out.println("一轮的采集时间为:" + (b - a) / 1000+" 数据量为" + list.size());
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment