Commit f4ed3aa0 by zhiwei

升级核心包版本,默认代理改用晋豪的NAT

parent e9bfd2df
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>toutiao</artifactId> <artifactId>toutiao</artifactId>
<version>0.2.9-SNAPSHOT</version> <version>0.3.0-SNAPSHOT</version>
<dependencies> <dependencies>
<dependency> <dependency>
......
...@@ -21,6 +21,8 @@ import com.zhiwei.toutiao.bean.Signature; ...@@ -21,6 +21,8 @@ import com.zhiwei.toutiao.bean.Signature;
import com.zhiwei.toutiao.bean.TouTiaoAccount; import com.zhiwei.toutiao.bean.TouTiaoAccount;
import com.zhiwei.toutiao.util.Tools; import com.zhiwei.toutiao.util.Tools;
import okhttp3.Response;
/** /**
* @ClassName: TouTiaoAccountParse * @ClassName: TouTiaoAccountParse
* @Description: 今日头条帐号采集 * @Description: 今日头条帐号采集
...@@ -47,13 +49,12 @@ public class TouTiaoAccountParse { ...@@ -47,13 +49,12 @@ public class TouTiaoAccountParse {
headerMap = Tools.getTouTiaoHeader(); headerMap = Tools.getTouTiaoHeader();
TouTiaoAccount tta = null; TouTiaoAccount tta = null;
try { try {
String htmlBody = null; String htmlBody = downloadHtml(url, proxy, headerMap);
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
tta = parseHtmlByAccount(htmlBody, name, proxy); tta = parseHtmlByAccount(htmlBody, name, proxy);
if(tta == null){ if(tta == null){
url = "https://www.toutiao.com/api/search/content/?aid=24&offset=0&format=json&keyword="+URLCodeUtil.getURLEncode(name, "utf-8")+"&autoload=true&count=20&cur_tab=1&from=search_tab&pd=synthesis"; url = "https://www.toutiao.com/api/search/content/?aid=24&offset=0&format=json&keyword="+URLCodeUtil.getURLEncode(name, "utf-8")+"&autoload=true&count=20&cur_tab=1&from=search_tab&pd=synthesis";
headerMap.put("Referer","https://www.toutiao.com/search/?keyword="+URLCodeUtil.getURLEncode(name, "utf-8")); headerMap.put("Referer","https://www.toutiao.com/search/?keyword="+URLCodeUtil.getURLEncode(name, "utf-8"));
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string(); htmlBody = downloadHtml(url, proxy, headerMap);
if(htmlBody != null){ if(htmlBody != null){
tta = parseHtmlByAccount(htmlBody, name, proxy); tta = parseHtmlByAccount(htmlBody, name, proxy);
} }
...@@ -73,8 +74,7 @@ public class TouTiaoAccountParse { ...@@ -73,8 +74,7 @@ public class TouTiaoAccountParse {
headerMap = Tools.getTouTiaoHeader(); headerMap = Tools.getTouTiaoHeader();
TouTiaoAccount tta = null; TouTiaoAccount tta = null;
try { try {
String htmlBody = null; String htmlBody = downloadHtml(url, proxy, headerMap);
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
if(htmlBody != null){ if(htmlBody != null){
tta = parseAccountByUserId(htmlBody, user_id, proxy); tta = parseAccountByUserId(htmlBody, user_id, proxy);
} }
...@@ -106,8 +106,7 @@ public class TouTiaoAccountParse { ...@@ -106,8 +106,7 @@ public class TouTiaoAccountParse {
String url = "https://www.toutiao.com/search_content/?offset="+page*20+"&format=json&keyword="+URLCodeUtil.getURLEncode(word, "utf-8")+"&autoload=true&count=20&cur_tab=4&from=media"; String url = "https://www.toutiao.com/search_content/?offset="+page*20+"&format=json&keyword="+URLCodeUtil.getURLEncode(word, "utf-8")+"&autoload=true&count=20&cur_tab=4&from=media";
headerMap = Tools.getTouTiaoHeader(); headerMap = Tools.getTouTiaoHeader();
try { try {
String htmlBody = null; String htmlBody = downloadHtml(url, proxy, headerMap);
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
if(htmlBody != null){ if(htmlBody != null){
JSONObject json = JSONObject.parseObject(htmlBody); JSONObject json = JSONObject.parseObject(htmlBody);
list.addAll(parseHtmlByWord(json, proxy)); list.addAll(parseHtmlByWord(json, proxy));
...@@ -149,8 +148,7 @@ public class TouTiaoAccountParse { ...@@ -149,8 +148,7 @@ public class TouTiaoAccountParse {
headerMap.put("User-Agent", "Dalvik/2.1.0 (Linux; U; Android 8.1.0; MI 8 MIUI/V10.0.11.0.OEACNFH) NewsArticle/7.0.1 cronet/TTNetVersion:pre_blink_merge-277498-gd2bb364e 2018-08-24"); headerMap.put("User-Agent", "Dalvik/2.1.0 (Linux; U; Android 8.1.0; MI 8 MIUI/V10.0.11.0.OEACNFH) NewsArticle/7.0.1 cronet/TTNetVersion:pre_blink_merge-277498-gd2bb364e 2018-08-24");
headerMap.put("Host", "it-hl.snssdk.com"); headerMap.put("Host", "it-hl.snssdk.com");
try { try {
String htmlBody = null; String htmlBody = downloadHtml(url, proxy, headerMap);
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
if(htmlBody != null && htmlBody.contains("name")){ if(htmlBody != null && htmlBody.contains("name")){
JSONObject json = JSONObject.parseObject(htmlBody); JSONObject json = JSONObject.parseObject(htmlBody);
more = json.getJSONObject("data").getBooleanValue("has_more"); more = json.getJSONObject("data").getBooleanValue("has_more");
...@@ -493,5 +491,29 @@ public class TouTiaoAccountParse { ...@@ -493,5 +491,29 @@ public class TouTiaoAccountParse {
} }
return ttaList; return ttaList;
} }
/**
 * Downloads the body of {@code url} with a GET request, making up to three attempts.
 *
 * <p>When {@code proxy} is {@code null} the request is sent through
 * {@code ProxyHolder.NAT_PROXY} instead.
 *
 * @param url     address to fetch
 * @param proxy   proxy to route the request through, or {@code null} for the NAT default
 * @param headMap request headers handed to {@code RequestUtils.wrapGet}
 * @return the response body as a string, or {@code null} when every attempt threw
 */
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headMap) {
    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            Response response;
            // Use the headers the caller passed in; the previous code ignored this
            // parameter and read the class-level headerMap instead.
            if (proxy != null) {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), proxy);
            } else {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_PROXY);
            }
            return response.body().string();
        } catch (Exception e) {
            // Pass the exception itself as the trailing argument so the original stack
            // trace is logged; fillInStackTrace() would overwrite it with this catch site.
            logger.error("获取数据时出现问题,问题为:{}", e.toString(), e);
        }
    }
    return null;
}
} }
...@@ -33,6 +33,8 @@ import com.zhiwei.toutiao.bean.Signature; ...@@ -33,6 +33,8 @@ import com.zhiwei.toutiao.bean.Signature;
import com.zhiwei.toutiao.bean.TouTiaoArticle; import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools; import com.zhiwei.toutiao.util.Tools;
import okhttp3.Response;
/** /**
* @Description:头条帐号采集 * @Description:头条帐号采集
* @author hero * @author hero
...@@ -55,7 +57,6 @@ public class TouTiaoArticleParse { ...@@ -55,7 +57,6 @@ public class TouTiaoArticleParse {
* @return List<TouTiao> 返回类型 * @return List<TouTiao> 返回类型
* @throws Exception * @throws Exception
*/ */
@Deprecated
public static Map<String, Object> getTouTiaoList(String media_id, String max_behot_time, Date endData, Proxy proxy) public static Map<String, Object> getTouTiaoList(String media_id, String max_behot_time, Date endData, Proxy proxy)
throws Exception { throws Exception {
Signature signature = new Signature(); Signature signature = new Signature();
...@@ -66,9 +67,8 @@ public class TouTiaoArticleParse { ...@@ -66,9 +67,8 @@ public class TouTiaoArticleParse {
} }
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", url); headerMap.put("Referer", url);
String htmlBody = null;
try { try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); String htmlBody = downloadHtml(url, proxy, headerMap);
if (htmlBody != null && htmlBody.contains("behot_time")) { if (htmlBody != null && htmlBody.contains("behot_time")) {
Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData); Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
if (ttList != null && ttList.size() > 0) { if (ttList != null && ttList.size() > 0) {
...@@ -84,22 +84,19 @@ public class TouTiaoArticleParse { ...@@ -84,22 +84,19 @@ public class TouTiaoArticleParse {
return Collections.emptyMap(); return Collections.emptyMap();
} }
@Deprecated
public static Map<String, Object> getTouTiaoList(String media_id, Long max_behot_time, Date endData, public static Map<String, Object> getTouTiaoList(String media_id, String max_behot_time, Date endData, ProxyHolder proxy)
ProxyHolder proxy) throws Exception { throws Exception {
Signature signature = new Signature(); Signature signature = new Signature();
String as = signature.getAs(); String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + media_id + "&count=20&as="
String cp = signature.getCp(); + signature.getAs() + "&cp=" + signature.getCp();
String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + media_id + "&count=20&as=" + as + "&cp="
+ cp;
if (max_behot_time != null) { if (max_behot_time != null) {
url = url + "&max_behot_time=" + max_behot_time; url = url + "&max_behot_time=" + max_behot_time;
} }
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", url); headerMap.put("Referer", url);
String htmlBody = null;
try { try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
if (htmlBody != null && htmlBody.contains("behot_time")) { if (htmlBody != null && htmlBody.contains("behot_time")) {
Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData); Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
if (ttList != null && ttList.size() > 0) { if (ttList != null && ttList.size() > 0) {
...@@ -109,7 +106,7 @@ public class TouTiaoArticleParse { ...@@ -109,7 +106,7 @@ public class TouTiaoArticleParse {
logger.info("数据为null"); logger.info("数据为null");
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("获取今日头条帐号数据连接超时", e); logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
throw e; throw e;
} }
return Collections.emptyMap(); return Collections.emptyMap();
...@@ -134,14 +131,12 @@ public class TouTiaoArticleParse { ...@@ -134,14 +131,12 @@ public class TouTiaoArticleParse {
String _signature = signature.getSignature(); String _signature = signature.getSignature();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + user_id + "&max_behot_time=" String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + user_id + "&max_behot_time="
+ max_behot_time + "&count=20&as=" + as + "&cp=" + cp + "&_signature=" + _signature; + max_behot_time + "&count=20&as=" + as + "&cp=" + cp + "&_signature=" + _signature;
System.out.println(url);
Map<String, String> headerMap = new HashMap<String, String>(); Map<String, String> headerMap = new HashMap<String, String>();
headerMap.put("user-agent", headerMap.put("user-agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"); "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer", "https://www.toutiao.com/c/user/" + user_id + "/"); headerMap.put("referer", "https://www.toutiao.com/c/user/" + user_id + "/");
String htmlBody = null;
try { try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); String htmlBody = downloadHtml(url, proxy, headerMap);
if (htmlBody != null && htmlBody.contains("behot_time")) { if (htmlBody != null && htmlBody.contains("behot_time")) {
Map<String, Object> ttList = parseHtmlByAccount(user_id, htmlBody, endData); Map<String, Object> ttList = parseHtmlByAccount(user_id, htmlBody, endData);
if (ttList != null && ttList.size() > 0) { if (ttList != null && ttList.size() > 0) {
...@@ -360,9 +355,8 @@ public class TouTiaoArticleParse { ...@@ -360,9 +355,8 @@ public class TouTiaoArticleParse {
} }
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/"); headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/");
System.out.println(url);
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); String htmlBody = downloadHtml(url, proxy, headerMap);
if (htmlBody != null) { if (htmlBody != null) {
Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate); Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate);
if (dataMap != null && dataMap.size() > 0) { if (dataMap != null && dataMap.size() > 0) {
...@@ -386,7 +380,6 @@ public class TouTiaoArticleParse { ...@@ -386,7 +380,6 @@ public class TouTiaoArticleParse {
} }
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/"); headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/");
System.out.println(url);
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
if (htmlBody != null) { if (htmlBody != null) {
...@@ -479,4 +472,27 @@ public class TouTiaoArticleParse { ...@@ -479,4 +472,27 @@ public class TouTiaoArticleParse {
return map; return map;
} }
/**
 * Downloads the body of {@code url} with a GET request, making up to three attempts.
 *
 * <p>When {@code proxy} is {@code null} the request is sent through
 * {@code ProxyHolder.NAT_PROXY} instead.
 *
 * @param url     address to fetch
 * @param proxy   proxy to route the request through, or {@code null} for the NAT default
 * @param headMap request headers handed to {@code RequestUtils.wrapGet}
 * @return the response body as a string, or {@code null} when every attempt threw
 */
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headMap) {
    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            Response response;
            if (proxy != null) {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), proxy);
            } else {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_PROXY);
            }
            return response.body().string();
        } catch (Exception e) {
            // Pass the exception itself as the trailing argument so the original stack
            // trace is logged; fillInStackTrace() would overwrite it with this catch site.
            logger.error("获取数据时出现问题,问题为:{}", e.toString(), e);
        }
    }
    return null;
}
} }
package com.zhiwei.toutiao.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONException;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools;
/**
 * Collects Toutiao articles from a channel feed.
 *
 * <p>Downloads a channel feed URL and converts the JSON response into a list of
 * {@link TouTiaoArticle} objects plus the paging cursor for the next request.
 *
 * @author hero (created 2017-07-24 16:57:22)
 */
public class TouTiaoChannelParse {

    private static final Logger logger = LogManager.getLogger(TouTiaoChannelParse.class);

    /**
     * Fetches one page of a channel feed and parses it.
     *
     * @param url   channel feed URL; also sent as the {@code referer} header
     * @param proxy proxy handed to {@code HttpClientTemplateOK.get}
     * @return a map with key {@code "data"} (a {@code List<TouTiaoArticle>}) and key
     *         {@code "next"} (a {@code Long} cursor, possibly {@code null}); or
     *         {@code null} when the download returned a {@code null} body
     * @throws Exception whatever the HTTP call throws, rethrown after being logged
     */
    public static Map<String,Object> touTiaoChannel(String url,Proxy proxy) throws Exception{
        // Kept local: a shared static header map would be overwritten by concurrent calls.
        Map<String, String> headerMap = Tools.getTouTiaoChannelHeader();
        headerMap.put("referer", url);
        String htmlBody;
        try {
            htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
        } catch (Exception e) {
            // Log the exception as-is; fillInStackTrace() would replace the original trace.
            logger.error("获取数据连接出现问题:", e);
            throw e;
        }
        if (htmlBody == null) {
            return null;
        }
        return parseHtmlByChannel(htmlBody);
    }

    /**
     * Converts a channel feed JSON body into articles and the next-page cursor.
     *
     * <p>Entries that raise a {@link JSONException} while being read are skipped.
     * A body that parses to nothing, or that has no {@code data} array, yields an
     * empty article list rather than an exception.
     *
     * @param htmlBody raw JSON text of the feed response
     * @return a map with key {@code "data"} (list of articles) and key {@code "next"}
     *         (the {@code max_behot_time} cursor, or {@code null} when absent)
     */
    private static Map<String,Object> parseHtmlByChannel(String htmlBody){
        Map<String,Object> dataMap = new HashMap<String,Object>();
        List<TouTiaoArticle> ttList = new ArrayList<TouTiaoArticle>();
        Long next = null;

        JSONObject jsonObject = JSONObject.parseObject(htmlBody);
        if (jsonObject != null) {
            next = readNextCursor(jsonObject);
            JSONArray dataList = jsonObject.getJSONArray("data");
            if (dataList != null) {
                for (int i = 0; i < dataList.size(); i++) {
                    try {
                        JSONObject jso = dataList.getJSONObject(i);
                        if (jso == null) {
                            continue;
                        }
                        ttList.add(toArticle(jso));
                    } catch (JSONException ignored) {
                        // Deliberate best-effort: one malformed entry must not drop the page.
                    }
                }
            }
        }

        dataMap.put("data", ttList);
        dataMap.put("next", next);
        return dataMap;
    }

    /**
     * Reads {@code next.max_behot_time} from the feed root.
     *
     * @param root parsed feed response
     * @return the cursor value, or {@code null} when it is missing or unreadable
     */
    private static Long readNextCursor(JSONObject root) {
        try {
            JSONObject nextNode = root.getJSONObject("next");
            return nextNode == null ? null : nextNode.getLong("max_behot_time");
        } catch (RuntimeException e) {
            // The cursor is optional; an unreadable value is treated as "no next page".
            return null;
        }
    }

    /**
     * Builds a {@link TouTiaoArticle} from one feed entry.
     *
     * @param jso one element of the feed's {@code data} array
     * @return the article; its URL is {@code null} when the entry has no {@code group_id}
     */
    private static TouTiaoArticle toArticle(JSONObject jso) {
        // behot_time is multiplied by 1000 before being handed to TimeParse
        // (presumably seconds to milliseconds; confirm against the feed format).
        String time = String.valueOf(jso.getLongValue("behot_time") * 1000);
        String title = jso.getString("title");
        String content = jso.getString("abstract");
        String comment_count = String.valueOf(jso.getIntValue("comments_count"));
        String source = jso.getString("source");

        String url = null;
        String groupId = jso.getString("group_id");
        if (groupId != null) {
            url = "http://www.toutiao.com/a" + groupId + "/";
        }
        url = getUrl(url);

        Date date = TimeParse.stringFormartDate(time);
        return new TouTiaoArticle(url, title, null,source, date, content, comment_count, "-1", "-1", "-1","今日头条",null);
    }

    /**
     * Normalises a Toutiao article URL to the {@code www.toutiao.com/a<id>/} form.
     *
     * <p>Rewrites {@code group/} to {@code a} and {@code /item/} to {@code /i}, removes
     * {@code m.}, adds the {@code www.} host prefix when the URL has no {@code www},
     * and appends a trailing slash.
     *
     * @param url URL to normalise; may be {@code null} or empty
     * @return the normalised URL, or the input unchanged when it is {@code null} or empty
     */
    private static String getUrl(String url){
        if (url == null || url.isEmpty()) {
            return url;
        }
        // String.replace is a no-op when the target is absent, so no contains() guards.
        url = url.replace("group/", "a");
        url = url.replace("/item/", "/i");
        // NOTE(review): this removes every "m." occurrence, not only a leading mobile
        // host prefix; behaviour kept as-is, confirm whether that is intended.
        url = url.replace("m.", "");
        if (!url.contains("www")) {
            url = url.replace("toutiao.com", "www.toutiao.com");
        }
        if (!url.endsWith("/")) {
            url = url + "/";
        }
        return url;
    }
}
...@@ -15,11 +15,14 @@ import com.alibaba.fastjson.JSONArray; ...@@ -15,11 +15,14 @@ import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK; import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.toutiao.bean.TouTiaoComment; import com.zhiwei.toutiao.bean.TouTiaoComment;
import com.zhiwei.toutiao.util.Tools; import com.zhiwei.toutiao.util.Tools;
import okhttp3.Response;
/** /**
* @ClassName: TouTiaoComment * @ClassName: TouTiaoComment
* @Description: 今日头条评论数据 * @Description: 今日头条评论数据
...@@ -66,7 +69,7 @@ public class TouTiaoCommentParse { ...@@ -66,7 +69,7 @@ public class TouTiaoCommentParse {
headerMap.put("Host", "is.snssdk.com"); headerMap.put("Host", "is.snssdk.com");
for(int j=1; j<=3; j++){ for(int j=1; j<=3; j++){
try { try {
String htmlBody = HttpClientTemplateOK.get(urlNew, proxy,headerMap); String htmlBody = downloadHtml(url, proxy, headerMap);
if(htmlBody!=null) if(htmlBody!=null)
{ {
List<TouTiaoComment> commentes = analySisComment(htmlBody, url); List<TouTiaoComment> commentes = analySisComment(htmlBody, url);
...@@ -77,7 +80,7 @@ public class TouTiaoCommentParse { ...@@ -77,7 +80,7 @@ public class TouTiaoCommentParse {
} }
ZhiWeiTools.sleep(4000); ZhiWeiTools.sleep(4000);
break; break;
} catch (SocketTimeoutException e) { } catch (Exception e) {
continue; continue;
} }
} }
...@@ -137,8 +140,7 @@ public class TouTiaoCommentParse { ...@@ -137,8 +140,7 @@ public class TouTiaoCommentParse {
String urlNew = "http://www.toutiao.com/api/comment/list/?group_id="+group_id+"&item_id=0&count=20&offset=0"; String urlNew = "http://www.toutiao.com/api/comment/list/?group_id="+group_id+"&item_id=0&count=20&offset=0";
//设置头信息 //设置头信息
Map<String,String> headerMap = Tools.getTouTiaoHeader(); Map<String,String> headerMap = Tools.getTouTiaoHeader();
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(urlNew, headerMap),proxy).body().string(); String htmlBody = downloadHtml(urlNew, proxy, headerMap);
if(htmlBody!=null) if(htmlBody!=null)
{ {
try { try {
...@@ -174,7 +176,7 @@ public class TouTiaoCommentParse { ...@@ -174,7 +176,7 @@ public class TouTiaoCommentParse {
try { try {
//设置头信息 //设置头信息
Map<String,String> headerMap = Tools.getTouTiaoHeader(); Map<String,String> headerMap = Tools.getTouTiaoHeader();
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap),proxy).body().string(); String htmlBody = downloadHtml(url, proxy, headerMap);
if(htmlBody!=null && htmlBody.contains("commentInfo")) if(htmlBody!=null && htmlBody.contains("commentInfo"))
{ {
try { try {
...@@ -185,7 +187,6 @@ public class TouTiaoCommentParse { ...@@ -185,7 +187,6 @@ public class TouTiaoCommentParse {
} }
} }
} catch (Exception e) { } catch (Exception e) {
ZhiWeiTools.sleep(5000);
continue; continue;
} }
} }
...@@ -206,7 +207,7 @@ public class TouTiaoCommentParse { ...@@ -206,7 +207,7 @@ public class TouTiaoCommentParse {
try { try {
//设置头信息 //设置头信息
Map<String,String> headerMap = Tools.getTouTiaoHeader(); Map<String,String> headerMap = Tools.getTouTiaoHeader();
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap),proxy).body().string(); String htmlBody = downloadHtml(url, proxy, headerMap);
if(htmlBody!=null && htmlBody.contains("commentInfo")) if(htmlBody!=null && htmlBody.contains("commentInfo"))
{ {
try { try {
...@@ -217,7 +218,7 @@ public class TouTiaoCommentParse { ...@@ -217,7 +218,7 @@ public class TouTiaoCommentParse {
} }
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("解析头条评论数错误:::{}", e.fillInStackTrace());
} }
return 0; return 0;
} }
...@@ -238,7 +239,7 @@ public class TouTiaoCommentParse { ...@@ -238,7 +239,7 @@ public class TouTiaoCommentParse {
String urlNew = "http://www.toutiao.com/api/comment/list/?group_id="+group_id+"&item_id=0&count=20&offset=0"; String urlNew = "http://www.toutiao.com/api/comment/list/?group_id="+group_id+"&item_id=0&count=20&offset=0";
//设置头信息 //设置头信息
Map<String,String> headerMap = Tools.getTouTiaoHeader(); Map<String,String> headerMap = Tools.getTouTiaoHeader();
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(urlNew, headerMap),proxy).body().string(); String htmlBody = downloadHtml(urlNew, proxy, headerMap);
if(htmlBody!=null) if(htmlBody!=null)
{ {
try { try {
...@@ -253,7 +254,6 @@ public class TouTiaoCommentParse { ...@@ -253,7 +254,6 @@ public class TouTiaoCommentParse {
} }
} }
} catch (Exception e) { } catch (Exception e) {
ZhiWeiTools.sleep(5000);
continue; continue;
} }
} }
...@@ -301,7 +301,7 @@ public class TouTiaoCommentParse { ...@@ -301,7 +301,7 @@ public class TouTiaoCommentParse {
String groupId = null; String groupId = null;
Map<String,String> headerMap = Tools.getTouTiaoHeader(); Map<String,String> headerMap = Tools.getTouTiaoHeader();
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap),proxy).body().string(); String htmlBody = downloadHtml(url, proxy, headerMap);
if(htmlBody != null) if(htmlBody != null)
{ {
if(htmlBody.contains("groupId")) if(htmlBody.contains("groupId"))
...@@ -320,5 +320,27 @@ public class TouTiaoCommentParse { ...@@ -320,5 +320,27 @@ public class TouTiaoCommentParse {
return groupId; return groupId;
} }
/**
 * Downloads the body of {@code url} with a GET request, making up to three attempts.
 *
 * <p>When {@code proxy} is {@code null} the request is sent through
 * {@code ProxyHolder.NAT_PROXY} instead.
 *
 * @param url       address to fetch
 * @param proxy     proxy to route the request through, or {@code null} for the NAT default
 * @param headerMap request headers handed to {@code RequestUtils.wrapGet}
 * @return the response body as a string, or {@code null} when every attempt threw
 */
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headerMap) {
    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            Response response;
            if (proxy != null) {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy);
            } else {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_PROXY);
            }
            return response.body().string();
        } catch (Exception e) {
            // Pass the exception itself as the trailing argument so the original stack
            // trace is logged; fillInStackTrace() would overwrite it with this catch site.
            logger.error("获取数据时出现问题,问题为:{}", e.toString(), e);
        }
    }
    return null;
}
} }
...@@ -45,7 +45,6 @@ public class TouTiaoQuestionAnswerParse { ...@@ -45,7 +45,6 @@ public class TouTiaoQuestionAnswerParse {
JSONObject jsonObject = JSONObject.parseObject(htmlBody); JSONObject jsonObject = JSONObject.parseObject(htmlBody);
if(jsonObject.getJSONObject("data") != null){ if(jsonObject.getJSONObject("data") != null){
JSONObject data = jsonObject.getJSONObject("data"); JSONObject data = jsonObject.getJSONObject("data");
System.out.println(data.getIntValue("has_more"));
page++; page++;
JSONArray ans_list = data.getJSONArray("ans_list"); JSONArray ans_list = data.getJSONArray("ans_list");
for(int i= 0; i<ans_list.size(); i++){ for(int i= 0; i<ans_list.size(); i++){
......
package com.zhiwei.toutiao.parse; package com.zhiwei.toutiao.parse;
import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
...@@ -15,9 +16,12 @@ import com.alibaba.fastjson.JSONObject; ...@@ -15,9 +16,12 @@ import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoArticle; import com.zhiwei.toutiao.bean.TouTiaoArticle;
import okhttp3.Response;
/** /**
* @ClassName: TouTiaoSearch * @ClassName: TouTiaoSearch
* @Description: TODO(今日头条搜索采集解析程序) * @Description: TODO(今日头条搜索采集解析程序)
...@@ -39,10 +43,10 @@ public class TouTiaoSearchParse { ...@@ -39,10 +43,10 @@ public class TouTiaoSearchParse {
* @return List<TouTiaoArticle> 返回类型 * @return List<TouTiaoArticle> 返回类型
* @throws Exception * @throws Exception
*/ */
public static Map<String,Object> touTiaoSearchByWord(String url,ProxyHolder proxy) throws Exception{ public static Map<String,Object> touTiaoSearchByWord(String url,Proxy proxy) throws Exception{
String htmlBody = null; String htmlBody = null;
try { try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy).body().string(); htmlBody = downloadHtml(url, proxy, HeaderTool.getCommonHead());
if(htmlBody != null){ if(htmlBody != null){
Map<String,Object> dataMap = parseHtmlBySearch(htmlBody); Map<String,Object> dataMap = parseHtmlBySearch(htmlBody);
if(dataMap!=null && dataMap.size()>0){ if(dataMap!=null && dataMap.size()>0){
...@@ -135,9 +139,30 @@ public class TouTiaoSearchParse { ...@@ -135,9 +139,30 @@ public class TouTiaoSearchParse {
{ {
url = url+"/"; url = url+"/";
} }
return url; return url;
} }
/**
 * Downloads the body of {@code url} with a GET request, making up to three attempts.
 *
 * <p>When {@code proxy} is {@code null} the request is sent through
 * {@code ProxyHolder.NAT_PROXY} instead.
 *
 * @param url       address to fetch
 * @param proxy     proxy to route the request through, or {@code null} for the NAT default
 * @param headerMap request headers handed to {@code RequestUtils.wrapGet}
 * @return the response body as a string, or {@code null} when every attempt threw
 */
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headerMap) {
    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            Response response;
            if (proxy != null) {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy);
            } else {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_PROXY);
            }
            return response.body().string();
        } catch (Exception e) {
            // Pass the exception itself as the trailing argument so the original stack
            // trace is logged; fillInStackTrace() would overwrite it with this catch site.
            logger.error("获取数据时出现问题,问题为:{}", e.toString(), e);
        }
    }
    return null;
}
} }
...@@ -38,7 +38,6 @@ public class WangyiNewParse { ...@@ -38,7 +38,6 @@ public class WangyiNewParse {
while(finish) while(finish)
{ {
String url = "http://c.m.163.com/nc/subscribe/list/"+tid+"/all/"+page*20+"-20.html"; String url = "http://c.m.163.com/nc/subscribe/list/"+tid+"/all/"+page*20+"-20.html";
System.out.println(url);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
if(htmlBody!=null) if(htmlBody!=null)
{ {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment