Commit 3a3a77d3 by zhiwei

升级采集核心包

parent b2cab5c7
......@@ -3,19 +3,19 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>toutiao</artifactId>
<version>0.3.3-SNAPSHOT</version>
<version>0.3.5-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.2-SNAPSHOT</version>
<version>0.1.3-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.3.0-RELEASE</version>
<version>0.3.6-RELEASE</version>
<scope>provided</scope>
</dependency>
</dependencies>
......
......@@ -34,7 +34,7 @@ public class TouTiaoAccountParse {
private TouTiaoAccountParse() {}
private static Map<String, String> headerMap;
private static Logger logger = LogManager.getLogger(TouTiaoAccountParse.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @Title: getTouTiaoAccountInfo
* @author hero
......@@ -506,7 +506,7 @@ public class TouTiaoAccountParse {
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_HEAVY_PROXY);
}
return response.body().string();
} catch (Exception e) {
......
......@@ -47,7 +47,8 @@ public class TouTiaoArticleParse {
}
private static Logger logger = LogManager.getLogger(TouTiaoArticleParse.class);
private static HttpBoot httpBoot = new HttpBoot(true);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/***
* 获取头条数据
......@@ -615,7 +616,7 @@ public class TouTiaoArticleParse {
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headMap), ProxyHolder.NAT_PROXY);
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headMap), ProxyHolder.NAT_HEAVY_PROXY);
}
return response.body().string();
} catch (Exception e) {
......
......@@ -31,7 +31,8 @@ import okhttp3.Response;
public class TouTiaoCommentParse {
private static Logger logger = LogManager.getLogger(TouTiaoCommentParse.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
*
......@@ -326,7 +327,7 @@ public class TouTiaoCommentParse {
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_HEAVY_PROXY);
}
return response.body().string();
} catch (Exception e) {
......
......@@ -11,9 +11,6 @@
*/
package com.zhiwei.toutiao.parse;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.Proxy.Type;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
......@@ -27,10 +24,9 @@ import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools;
/**
* @Description:
......@@ -39,38 +35,9 @@ import com.zhiwei.toutiao.util.Tools;
*/
public class TouTiaoParse {
private Map<String, String> headerMap ;
private static Logger logger = LogManager.getLogger(TouTiaoCommentParse.class);
/**
 * Downloads the Toutiao page at {@code url} and, when the response looks like
 * an article listing, hands it to {@code parseHtmlByAccount}.
 *
 * <p>The request headers come from {@code Tools.getTouTiaoHeader()} and are stored
 * in the {@code headerMap} field before the request is issued.
 *
 * @param url      address of the page to download
 * @param endData  passed through unchanged to {@code parseHtmlByAccount}
 * @param source   passed through unchanged to {@code parseHtmlByAccount}
 * @param hostname HTTP proxy host; when {@code null} the request is made with a
 *                 {@code null} proxy
 * @param host     HTTP proxy port, only used when {@code hostname} is not {@code null}
 * @return the map produced by {@code parseHtmlByAccount}, or {@code null} when the
 *         response body is {@code null} or does not contain the text {@code "abstract"}
 * @throws Exception propagated from the download or from the parsing step
 */
@Deprecated
public Map<String, Object> getTouTiaoList(String url,
        Date endData, String source,String hostname,int host) throws Exception {
    headerMap = Tools.getTouTiaoHeader();

    final String pageBody;
    if (hostname == null) {
        // No proxy host supplied: issue the request with a null proxy.
        pageBody = HttpClientTemplateOK.get(url, null,headerMap);
    } else {
        // Route the request through the caller-supplied HTTP proxy.
        Proxy viaProxy = new Proxy(Type.HTTP, new InetSocketAddress(hostname, host));
        pageBody = HttpClientTemplateOK.get(url, viaProxy,headerMap);
    }

    // Only responses containing the "abstract" marker are parsed.
    boolean looksLikeListing = pageBody != null && pageBody.contains("abstract");
    if (!looksLikeListing) {
        return null;
    }
    return parseHtmlByAccount(pageBody, endData, source);
}
/***
* 根据文章地址解析文章详情
*
* @Description:根据文章地址解析文章详情
......
package com.zhiwei.toutiao.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
......@@ -14,7 +13,9 @@ import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoQuestionAnswer;
import com.zhiwei.toutiao.util.Tools;
......@@ -29,8 +30,9 @@ public class TouTiaoQuestionAnswerParse {
private static Map<String, String> headerMap ;
private static Logger logger = LogManager.getLogger(TouTiaoQuestionAnswerParse.class);
public static Map<String,Object> getAnserList(String questionId,int page,int req_type,Proxy proxy){
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
public static Map<String,Object> getAnserList(String questionId,int page,int req_type){
String url = "https://www.wukong.com/wenda/web/question/loadmorev1/?qid="+questionId+"&count=20&req_type="+req_type+"&offset=" + page*20;
headerMap = Tools.getTouTiaoQuestionAnswerHeader();
......@@ -40,7 +42,7 @@ public class TouTiaoQuestionAnswerParse {
List<TouTiaoQuestionAnswer> anserList = new ArrayList<TouTiaoQuestionAnswer>();
try {
String htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(htmlBody != null){
JSONObject jsonObject = JSONObject.parseObject(htmlBody);
if(jsonObject.getJSONObject("data") != null){
......@@ -85,7 +87,7 @@ public class TouTiaoQuestionAnswerParse {
* @param @return 设定文件
* @return String 返回类型
*/
public String getAnswerCount(String questionId,Proxy proxy){
public String getAnswerCount(String questionId){
String result = null;
String url = "https://www.wukong.com/question/"+questionId+"/";
System.out.println(url);
......@@ -93,7 +95,7 @@ public class TouTiaoQuestionAnswerParse {
headerMap.put("referer", url);
try {
String htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(htmlBody != null){
Document document = Jsoup.parse(htmlBody);
String text = document.select("[class=question question-single]").text();
......
package com.zhiwei.toutiao.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
......@@ -11,7 +10,9 @@ import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoQuestion;
import com.zhiwei.toutiao.util.Tools;
......@@ -26,7 +27,9 @@ public class TouTiaoQuestionParse {
private static Map<String, String> headerMap;
private static Logger logger = LogManager.getLogger(TouTiaoQuestionParse.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @Title: getSearchTouTiaoQuestion
* @author hero
......@@ -38,13 +41,13 @@ public class TouTiaoQuestionParse {
* @return List<TouTiaoQuestion> 返回类型
* @throws Exception
*/
public static List<TouTiaoQuestion> getSearchTouTiaoQuestion(String url,Proxy proxy) throws Exception {
public static List<TouTiaoQuestion> getSearchTouTiaoQuestion(String url) throws Exception {
List<TouTiaoQuestion> questtionList = new ArrayList<TouTiaoQuestion>();
headerMap = Tools.getTouTiaoQuestionHeader();
headerMap.put("referer", url);
String htmlBody = null;
try {
htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),ProxyHolder.NAT_HEAVY_PROXY).body().string();
if (htmlBody != null) {
List<TouTiaoQuestion> ttList = parseHtmlByQuestion(htmlBody);
if (ttList != null && ttList.size() > 0) {
......
......@@ -31,7 +31,7 @@ import okhttp3.Response;
public class TouTiaoSearchParse {
private static Logger logger = LogManager.getLogger(TouTiaoSearchParse.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
......@@ -151,7 +151,7 @@ public class TouTiaoSearchParse {
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_HEAVY_PROXY);
}
return response.body().string();
} catch (Exception e) {
......
......@@ -21,7 +21,8 @@ import com.zhiwei.wangyi.bean.WangYiNews;
public class WangyiNewParse {
private static Logger logger = LogManager.getLogger(WangyiNewParse.class);
private static boolean finish = true;
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @Title: getWYHistory
* @Description: TODO(根据文章地址解析网易号历史文章)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment