Commit 47079954 by [zhangzhiwei]

采集核心包升级

parent 3e39658a
...@@ -9,12 +9,12 @@ ...@@ -9,12 +9,12 @@
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.1.0-SNAPSHOT</version> <version>0.1.1-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.1.0-RELEASE</version> <version>0.1.1-RELEASE</version>
</dependency> </dependency>
</dependencies> </dependencies>
......
...@@ -124,9 +124,6 @@ public class TouTiaoArticleParse { ...@@ -124,9 +124,6 @@ public class TouTiaoArticleParse {
String cp=signature.getCp(); String cp=signature.getCp();
String _signature = signature.getSignature(); String _signature = signature.getSignature();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+_signature; String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+_signature;
if(max_behot_time!=null){
url = url + "&max_behot_time="+max_behot_time;
}
Map<String,String> headerMap = new HashMap<String,String>(); Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"); headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer","https://www.toutiao.com/c/user/"+user_id+"/"); headerMap.put("referer","https://www.toutiao.com/c/user/"+user_id+"/");
...@@ -152,28 +149,28 @@ public class TouTiaoArticleParse { ...@@ -152,28 +149,28 @@ public class TouTiaoArticleParse {
} }
public static Map<String, Object> getTouTiaoHistory(String user_id, String max_behot_time,Date endData, ProxyHolder proxy ) throws Exception{ public static Map<String, Object> getTouTiaoHistory(String user_id, String max_behot_time,Date endData, ProxyHolder proxy ) throws Exception{
Signature signature = new Signature(user_id, max_behot_time);
String as=signature.getAs();
String cp=signature.getCp();
String _signature = signature.getSignature();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+_signature;
if(max_behot_time!=null){
url = url + "&max_behot_time="+max_behot_time;
}
Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer","https://www.toutiao.com/c/user/"+user_id+"/");
String htmlBody = null;
for(int i=0;i<3;i++){ for(int i=0;i<3;i++){
Signature signature = new Signature(user_id, max_behot_time);
String as=signature.getAs();
String cp=signature.getCp();
String _signature = signature.getSignature();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id="+user_id+"&max_behot_time="+max_behot_time+"&count=20&as="+as+"&cp="+cp+"&_signature="+_signature;
logger.info("当前采集的历史文章链接:::{}", url);
Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer","https://www.toutiao.com/c/user/"+user_id+"/");
String htmlBody = null;
try { try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
if(htmlBody != null && htmlBody.contains("behot_time")){ if(htmlBody != null && htmlBody.contains("behot_time")){
Map<String, Object> ttList = parseHtmlByAccount(user_id, htmlBody, endData); Map<String, Object> ttList = parseHtmlByAccount(user_id, htmlBody, endData);
if(ttList!=null && ttList.size()>0){ if(ttList!=null && ttList.size()>0){
return ttList; return ttList;
}else{
break;
} }
}else{ }else{
logger.info("数据为null"); logger.info("数据为null,获取到的文本为:::{}", htmlBody);
continue; continue;
} }
} catch (Exception e) { } catch (Exception e) {
......
...@@ -39,7 +39,7 @@ public class TouTiaoExample { ...@@ -39,7 +39,7 @@ public class TouTiaoExample {
ProxyFactory.init(registry, group, GroupType.PROVIDER); ProxyFactory.init(registry, group, GroupType.PROVIDER);
List<String> urlList = new ArrayList<String>(); List<String> urlList = new ArrayList<String>();
urlList.add("6075371636"); urlList.add("1920576965");
Date endTime = TimeParse.stringFormartDate("2018-10-01"); Date endTime = TimeParse.stringFormartDate("2018-10-01");
for (String url : urlList) { for (String url : urlList) {
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment