Commit 47079954 by [zhangzhiwei]

采集核心包升级

parent 3e39658a
......@@ -9,12 +9,12 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.0-SNAPSHOT</version>
<version>0.1.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.1.0-RELEASE</version>
<version>0.1.1-RELEASE</version>
</dependency>
</dependencies>
......
......@@ -124,9 +124,6 @@ public class TouTiaoArticleParse {
String cp=signature.getCp();
String _signature = signature.getSignature();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+_signature;
if(max_behot_time!=null){
url = url + "&max_behot_time="+max_behot_time;
}
Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer","https://www.toutiao.com/c/user/"+user_id+"/");
......@@ -152,28 +149,28 @@ public class TouTiaoArticleParse {
}
public static Map<String, Object> getTouTiaoHistory(String user_id, String max_behot_time,Date endData, ProxyHolder proxy ) throws Exception{
Signature signature = new Signature(user_id, max_behot_time);
String as=signature.getAs();
String cp=signature.getCp();
String _signature = signature.getSignature();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+_signature;
if(max_behot_time!=null){
url = url + "&max_behot_time="+max_behot_time;
}
Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer","https://www.toutiao.com/c/user/"+user_id+"/");
String htmlBody = null;
for(int i=0;i<3;i++){
Signature signature = new Signature(user_id, max_behot_time);
String as=signature.getAs();
String cp=signature.getCp();
String _signature = signature.getSignature();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+_signature;
logger.info("当前采集的历史文章链接:::{}", url);
Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer","https://www.toutiao.com/c/user/"+user_id+"/");
String htmlBody = null;
try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
if(htmlBody != null && htmlBody.contains("behot_time")){
Map<String, Object> ttList = parseHtmlByAccount(user_id, htmlBody, endData);
if(ttList!=null && ttList.size()>0){
return ttList;
}else{
break;
}
}else{
logger.info("数据为null");
logger.info("数据为null,获取到的文本为:::{}", htmlBody);
continue;
}
} catch (Exception e) {
......
......@@ -39,7 +39,7 @@ public class TouTiaoExample {
ProxyFactory.init(registry, group, GroupType.PROVIDER);
List<String> urlList = new ArrayList<String>();
urlList.add("6075371636");
urlList.add("1920576965");
Date endTime = TimeParse.stringFormartDate("2018-10-01");
for (String url : urlList) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment