Commit f4ed3aa0 by zhiwei

升级核心包版本及默认代理使用晋豪的NAT

parent e9bfd2df
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>toutiao</artifactId>
<version>0.2.9-SNAPSHOT</version>
<version>0.3.0-SNAPSHOT</version>
<dependencies>
<dependency>
......
......@@ -21,6 +21,8 @@ import com.zhiwei.toutiao.bean.Signature;
import com.zhiwei.toutiao.bean.TouTiaoAccount;
import com.zhiwei.toutiao.util.Tools;
import okhttp3.Response;
/**
* @ClassName: TouTiaoAccountParse
* @Description: 今日头条帐号采集
......@@ -47,13 +49,12 @@ public class TouTiaoAccountParse {
headerMap = Tools.getTouTiaoHeader();
TouTiaoAccount tta = null;
try {
String htmlBody = null;
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
String htmlBody = downloadHtml(url, proxy, headerMap);
tta = parseHtmlByAccount(htmlBody, name, proxy);
if(tta == null){
url = "https://www.toutiao.com/api/search/content/?aid=24&offset=0&format=json&keyword="+URLCodeUtil.getURLEncode(name, "utf-8")+"&autoload=true&count=20&cur_tab=1&from=search_tab&pd=synthesis";
headerMap.put("Referer","https://www.toutiao.com/search/?keyword="+URLCodeUtil.getURLEncode(name, "utf-8"));
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
htmlBody = downloadHtml(url, proxy, headerMap);
if(htmlBody != null){
tta = parseHtmlByAccount(htmlBody, name, proxy);
}
......@@ -73,8 +74,7 @@ public class TouTiaoAccountParse {
headerMap = Tools.getTouTiaoHeader();
TouTiaoAccount tta = null;
try {
String htmlBody = null;
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
String htmlBody = downloadHtml(url, proxy, headerMap);
if(htmlBody != null){
tta = parseAccountByUserId(htmlBody, user_id, proxy);
}
......@@ -106,8 +106,7 @@ public class TouTiaoAccountParse {
String url = "https://www.toutiao.com/search_content/?offset="+page*20+"&format=json&keyword="+URLCodeUtil.getURLEncode(word, "utf-8")+"&autoload=true&count=20&cur_tab=4&from=media";
headerMap = Tools.getTouTiaoHeader();
try {
String htmlBody = null;
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
String htmlBody = downloadHtml(url, proxy, headerMap);
if(htmlBody != null){
JSONObject json = JSONObject.parseObject(htmlBody);
list.addAll(parseHtmlByWord(json, proxy));
......@@ -149,8 +148,7 @@ public class TouTiaoAccountParse {
headerMap.put("User-Agent", "Dalvik/2.1.0 (Linux; U; Android 8.1.0; MI 8 MIUI/V10.0.11.0.OEACNFH) NewsArticle/7.0.1 cronet/TTNetVersion:pre_blink_merge-277498-gd2bb364e 2018-08-24");
headerMap.put("Host", "it-hl.snssdk.com");
try {
String htmlBody = null;
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
String htmlBody = downloadHtml(url, proxy, headerMap);
if(htmlBody != null && htmlBody.contains("name")){
JSONObject json = JSONObject.parseObject(htmlBody);
more = json.getJSONObject("data").getBooleanValue("has_more");
......@@ -493,5 +491,29 @@ public class TouTiaoAccountParse {
}
return ttaList;
}
/**
 * Downloads the body of {@code url} with a GET request, trying at most three times.
 *
 * <p>When {@code proxy} is {@code null} the request is routed through
 * {@code ProxyHolder.NAT_PROXY} instead.
 *
 * @param url     address to request
 * @param proxy   proxy to use, or {@code null} to fall back to the NAT proxy
 * @param headMap request headers handed to {@code RequestUtils.wrapGet}
 * @return the response body as text, or {@code null} when all three attempts threw
 */
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headMap) {
    for (int attempt = 1; attempt <= 3; attempt++) {
        try {
            // Build the request from the headers the caller passed in (headMap),
            // not from the class-level headerMap field.
            Response response;
            if (proxy != null) {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), proxy);
            } else {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_PROXY);
            }
            return response.body().string();
        } catch (Exception e) {
            // Log the exception as thrown: fillInStackTrace() would replace its original
            // stack trace with the stack of this catch block. The loop then retries.
            logger.error("获取数据时出现问题,问题为:{}", e);
        }
    }
    // Every attempt failed.
    return null;
}
}
......@@ -33,6 +33,8 @@ import com.zhiwei.toutiao.bean.Signature;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools;
import okhttp3.Response;
/**
* @Description:头条帐号采集
* @author hero
......@@ -55,7 +57,6 @@ public class TouTiaoArticleParse {
* @return List<TouTiao> 返回类型
* @throws Exception
*/
@Deprecated
public static Map<String, Object> getTouTiaoList(String media_id, String max_behot_time, Date endData, Proxy proxy)
throws Exception {
Signature signature = new Signature();
......@@ -66,9 +67,8 @@ public class TouTiaoArticleParse {
}
Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", url);
String htmlBody = null;
try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
String htmlBody = downloadHtml(url, proxy, headerMap);
if (htmlBody != null && htmlBody.contains("behot_time")) {
Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
if (ttList != null && ttList.size() > 0) {
......@@ -84,22 +84,19 @@ public class TouTiaoArticleParse {
return Collections.emptyMap();
}
@Deprecated
public static Map<String, Object> getTouTiaoList(String media_id, Long max_behot_time, Date endData,
ProxyHolder proxy) throws Exception {
public static Map<String, Object> getTouTiaoList(String media_id, String max_behot_time, Date endData, ProxyHolder proxy)
throws Exception {
Signature signature = new Signature();
String as = signature.getAs();
String cp = signature.getCp();
String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + media_id + "&count=20&as=" + as + "&cp="
+ cp;
String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + media_id + "&count=20&as="
+ signature.getAs() + "&cp=" + signature.getCp();
if (max_behot_time != null) {
url = url + "&max_behot_time=" + max_behot_time;
}
Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", url);
String htmlBody = null;
try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
if (htmlBody != null && htmlBody.contains("behot_time")) {
Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
if (ttList != null && ttList.size() > 0) {
......@@ -109,7 +106,7 @@ public class TouTiaoArticleParse {
logger.info("数据为null");
}
} catch (Exception e) {
logger.error("获取今日头条帐号数据连接超时", e);
logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
throw e;
}
return Collections.emptyMap();
......@@ -134,14 +131,12 @@ public class TouTiaoArticleParse {
String _signature = signature.getSignature();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + user_id + "&max_behot_time="
+ max_behot_time + "&count=20&as=" + as + "&cp=" + cp + "&_signature=" + _signature;
System.out.println(url);
Map<String, String> headerMap = new HashMap<String, String>();
headerMap.put("user-agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer", "https://www.toutiao.com/c/user/" + user_id + "/");
String htmlBody = null;
try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
String htmlBody = downloadHtml(url, proxy, headerMap);
if (htmlBody != null && htmlBody.contains("behot_time")) {
Map<String, Object> ttList = parseHtmlByAccount(user_id, htmlBody, endData);
if (ttList != null && ttList.size() > 0) {
......@@ -360,9 +355,8 @@ public class TouTiaoArticleParse {
}
Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/");
System.out.println(url);
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
String htmlBody = downloadHtml(url, proxy, headerMap);
if (htmlBody != null) {
Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate);
if (dataMap != null && dataMap.size() > 0) {
......@@ -386,7 +380,6 @@ public class TouTiaoArticleParse {
}
Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/");
System.out.println(url);
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
if (htmlBody != null) {
......@@ -479,4 +472,27 @@ public class TouTiaoArticleParse {
return map;
}
/**
 * Downloads the body of {@code url} with a GET request, trying at most three times.
 *
 * <p>When {@code proxy} is {@code null} the request is routed through
 * {@code ProxyHolder.NAT_PROXY} instead.
 *
 * @param url     address to request
 * @param proxy   proxy to use, or {@code null} to fall back to the NAT proxy
 * @param headMap request headers handed to {@code RequestUtils.wrapGet}
 * @return the response body as text, or {@code null} when all three attempts threw
 */
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headMap) {
    for (int attempt = 1; attempt <= 3; attempt++) {
        try {
            Response response;
            if (proxy != null) {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), proxy);
            } else {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_PROXY);
            }
            return response.body().string();
        } catch (Exception e) {
            // Log the exception as thrown: fillInStackTrace() would replace its original
            // stack trace with the stack of this catch block. The loop then retries.
            logger.error("获取数据时出现问题,问题为:{}", e);
        }
    }
    // Every attempt failed.
    return null;
}
}
package com.zhiwei.toutiao.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONException;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools;
/**
 * Collects Toutiao articles channel by channel.
 *
 * @author hero
 * @date 2017-07-24 16:57:22
 */
public class TouTiaoChannelParse {

    private static final Logger logger = LogManager.getLogger(TouTiaoChannelParse.class);

    /**
     * Downloads a channel feed and parses it into articles.
     *
     * @param url   feed address to request; it is also sent as the {@code referer} header
     * @param proxy proxy handed to {@code HttpClientTemplateOK.get}
     * @return a map holding the parsed articles under {@code "data"} and the paging cursor
     *         under {@code "next"}; {@code null} when the download produced no body
     * @throws Exception any exception raised by the download, rethrown after being logged
     */
    public static Map<String,Object> touTiaoChannel(String url,Proxy proxy) throws Exception{
        // Headers are kept in a local variable instead of a static field that every
        // call would reassign.
        Map<String, String> headers = Tools.getTouTiaoChannelHeader();
        headers.put("referer", url);
        String htmlBody;
        try {
            htmlBody = HttpClientTemplateOK.get(url, proxy, headers);
        } catch (Exception e) {
            // Pass the exception itself; fillInStackTrace() would overwrite its original trace.
            logger.error("获取数据连接出现问题:", e);
            throw e;
        }
        if (htmlBody == null) {
            return null;
        }
        return parseHtmlByChannel(htmlBody);
    }

    /**
     * Parses the JSON text of a channel feed.
     *
     * @param htmlBody JSON text of the feed
     * @return a map with {@code "data"} (a {@code List<TouTiaoArticle>}, possibly empty) and
     *         {@code "next"} (the {@code max_behot_time} of the {@code next} object, or
     *         {@code null} when it is absent or unreadable)
     */
    private static Map<String,Object> parseHtmlByChannel(String htmlBody){
        Map<String,Object> dataMap = new HashMap<String,Object>();
        List<TouTiaoArticle> ttList = new ArrayList<TouTiaoArticle>();
        Long next = null;

        JSONObject jsonObject = JSONObject.parseObject(htmlBody);
        // Guard against a null parse result (presumably possible for an empty body).
        if (jsonObject != null) {
            try {
                JSONObject nextObject = jsonObject.getJSONObject("next");
                if (nextObject != null) {
                    next = nextObject.getLong("max_behot_time");
                }
            } catch (Exception e) {
                // A "next" value that cannot be read is treated like a missing cursor.
                next = null;
            }

            JSONArray dataList = jsonObject.getJSONArray("data");
            int total = dataList == null ? 0 : dataList.size();
            for (int i = 0; i < total; i++) {
                JSONObject jso = dataList.getJSONObject(i);
                if (jso == null) {
                    continue;
                }
                try {
                    // behot_time is multiplied by 1000 before being handed to TimeParse.
                    String time = String.valueOf(jso.getLongValue("behot_time")*1000);
                    String title = jso.getString("title");
                    String content = jso.getString("abstract");
                    String comment_count = String.valueOf(jso.getIntValue("comments_count"));
                    String source = jso.getString("source");
                    String groupId = jso.getString("group_id");
                    // Entries without a group_id keep a null url instead of failing in getUrl.
                    String url = null;
                    if (groupId != null) {
                        url = getUrl("http://www.toutiao.com/a" + groupId + "/");
                    }
                    Date date = TimeParse.stringFormartDate(time);
                    TouTiaoArticle tt = new TouTiaoArticle(url, title, null,source, date, content, comment_count, "-1", "-1", "-1","今日头条",null);
                    ttList.add(tt);
                } catch (JSONException e) {
                    // Best effort: skip the entry that cannot be read, but leave a trace of it.
                    logger.warn("Skipping channel entry that could not be parsed", e);
                }
            }
        }

        dataMap.put("data", ttList);
        dataMap.put("next", next);
        return dataMap;
    }

    /**
     * Rewrites an article address into the {@code www.toutiao.com/.../} form.
     *
     * @param url address to rewrite; {@code null} and empty values are returned unchanged
     * @return the rewritten address, always ending with {@code /} for non-empty input
     */
    private static String getUrl(String url){
        if (url == null || url.isEmpty()) {
            return url;
        }
        // NOTE(review): the "m." removal hits every occurrence in the address, not only a
        // leading mobile host prefix - confirm inputs never contain "m." elsewhere.
        url = url.replace("group/", "a").replace("/item/", "/i").replace("m.", "");
        if (!url.contains("www")) {
            url = url.replace("toutiao.com", "www.toutiao.com");
        }
        if (!url.endsWith("/")) {
            url = url + "/";
        }
        return url;
    }
}
......@@ -15,11 +15,14 @@ import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.toutiao.bean.TouTiaoComment;
import com.zhiwei.toutiao.util.Tools;
import okhttp3.Response;
/**
* @ClassName: TouTiaoComment
* @Description: 今日头条评论数据
......@@ -66,7 +69,7 @@ public class TouTiaoCommentParse {
headerMap.put("Host", "is.snssdk.com");
for(int j=1; j<=3; j++){
try {
String htmlBody = HttpClientTemplateOK.get(urlNew, proxy,headerMap);
String htmlBody = downloadHtml(url, proxy, headerMap);
if(htmlBody!=null)
{
List<TouTiaoComment> commentes = analySisComment(htmlBody, url);
......@@ -77,7 +80,7 @@ public class TouTiaoCommentParse {
}
ZhiWeiTools.sleep(4000);
break;
} catch (SocketTimeoutException e) {
} catch (Exception e) {
continue;
}
}
......@@ -137,8 +140,7 @@ public class TouTiaoCommentParse {
String urlNew = "http://www.toutiao.com/api/comment/list/?group_id="+group_id+"&item_id=0&count=20&offset=0";
//设置头信息
Map<String,String> headerMap = Tools.getTouTiaoHeader();
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(urlNew, headerMap),proxy).body().string();
String htmlBody = downloadHtml(urlNew, proxy, headerMap);
if(htmlBody!=null)
{
try {
......@@ -174,7 +176,7 @@ public class TouTiaoCommentParse {
try {
//设置头信息
Map<String,String> headerMap = Tools.getTouTiaoHeader();
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap),proxy).body().string();
String htmlBody = downloadHtml(url, proxy, headerMap);
if(htmlBody!=null && htmlBody.contains("commentInfo"))
{
try {
......@@ -185,7 +187,6 @@ public class TouTiaoCommentParse {
}
}
} catch (Exception e) {
ZhiWeiTools.sleep(5000);
continue;
}
}
......@@ -206,7 +207,7 @@ public class TouTiaoCommentParse {
try {
//设置头信息
Map<String,String> headerMap = Tools.getTouTiaoHeader();
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap),proxy).body().string();
String htmlBody = downloadHtml(url, proxy, headerMap);
if(htmlBody!=null && htmlBody.contains("commentInfo"))
{
try {
......@@ -217,7 +218,7 @@ public class TouTiaoCommentParse {
}
}
} catch (Exception e) {
logger.error("解析头条评论数错误:::{}", e.fillInStackTrace());
}
return 0;
}
......@@ -238,7 +239,7 @@ public class TouTiaoCommentParse {
String urlNew = "http://www.toutiao.com/api/comment/list/?group_id="+group_id+"&item_id=0&count=20&offset=0";
//设置头信息
Map<String,String> headerMap = Tools.getTouTiaoHeader();
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(urlNew, headerMap),proxy).body().string();
String htmlBody = downloadHtml(urlNew, proxy, headerMap);
if(htmlBody!=null)
{
try {
......@@ -253,7 +254,6 @@ public class TouTiaoCommentParse {
}
}
} catch (Exception e) {
ZhiWeiTools.sleep(5000);
continue;
}
}
......@@ -301,7 +301,7 @@ public class TouTiaoCommentParse {
String groupId = null;
Map<String,String> headerMap = Tools.getTouTiaoHeader();
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap),proxy).body().string();
String htmlBody = downloadHtml(url, proxy, headerMap);
if(htmlBody != null)
{
if(htmlBody.contains("groupId"))
......@@ -320,5 +320,27 @@ public class TouTiaoCommentParse {
return groupId;
}
/**
 * Downloads the body of {@code url} with a GET request, trying at most three times.
 *
 * <p>When {@code proxy} is {@code null} the request is routed through
 * {@code ProxyHolder.NAT_PROXY} instead.
 *
 * @param url       address to request
 * @param proxy     proxy to use, or {@code null} to fall back to the NAT proxy
 * @param headerMap request headers handed to {@code RequestUtils.wrapGet}
 * @return the response body as text, or {@code null} when all three attempts threw
 */
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headerMap) {
    for (int attempt = 1; attempt <= 3; attempt++) {
        try {
            Response response;
            if (proxy != null) {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy);
            } else {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_PROXY);
            }
            return response.body().string();
        } catch (Exception e) {
            // Log the exception as thrown: fillInStackTrace() would replace its original
            // stack trace with the stack of this catch block. The loop then retries.
            logger.error("获取数据时出现问题,问题为:{}", e);
        }
    }
    // Every attempt failed.
    return null;
}
}
......@@ -45,7 +45,6 @@ public class TouTiaoQuestionAnswerParse {
JSONObject jsonObject = JSONObject.parseObject(htmlBody);
if(jsonObject.getJSONObject("data") != null){
JSONObject data = jsonObject.getJSONObject("data");
System.out.println(data.getIntValue("has_more"));
page++;
JSONArray ans_list = data.getJSONArray("ans_list");
for(int i= 0; i<ans_list.size(); i++){
......
package com.zhiwei.toutiao.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
......@@ -15,9 +16,12 @@ import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import okhttp3.Response;
/**
* @ClassName: TouTiaoSearch
* @Description: TODO(今日头条搜索采集解析程序)
......@@ -39,10 +43,10 @@ public class TouTiaoSearchParse {
* @return List<TouTiaoArticle> 返回类型
* @throws Exception
*/
public static Map<String,Object> touTiaoSearchByWord(String url,ProxyHolder proxy) throws Exception{
public static Map<String,Object> touTiaoSearchByWord(String url,Proxy proxy) throws Exception{
String htmlBody = null;
try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy).body().string();
htmlBody = downloadHtml(url, proxy, HeaderTool.getCommonHead());
if(htmlBody != null){
Map<String,Object> dataMap = parseHtmlBySearch(htmlBody);
if(dataMap!=null && dataMap.size()>0){
......@@ -135,9 +139,30 @@ public class TouTiaoSearchParse {
{
url = url+"/";
}
return url;
}
/**
 * Downloads the body of {@code url} with a GET request, trying at most three times.
 *
 * <p>When {@code proxy} is {@code null} the request is routed through
 * {@code ProxyHolder.NAT_PROXY} instead.
 *
 * @param url       address to request
 * @param proxy     proxy to use, or {@code null} to fall back to the NAT proxy
 * @param headerMap request headers handed to {@code RequestUtils.wrapGet}
 * @return the response body as text, or {@code null} when all three attempts threw
 */
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headerMap) {
    for (int attempt = 1; attempt <= 3; attempt++) {
        try {
            Response response;
            if (proxy != null) {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy);
            } else {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_PROXY);
            }
            return response.body().string();
        } catch (Exception e) {
            // Log the exception as thrown: fillInStackTrace() would replace its original
            // stack trace with the stack of this catch block. The loop then retries.
            logger.error("获取数据时出现问题,问题为:{}", e);
        }
    }
    // Every attempt failed.
    return null;
}
}
......@@ -38,7 +38,6 @@ public class WangyiNewParse {
while(finish)
{
String url = "http://c.m.163.com/nc/subscribe/list/"+tid+"/all/"+page*20+"-20.html";
System.out.println(url);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
if(htmlBody!=null)
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment