Commit 9453f8d6 by zhiwei

修改http-core核心包,增加程序稳定性及处理乱码

parent cc347740
......@@ -61,12 +61,11 @@
</repository>
</distributionManagement>
<dependencies>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>zhiweiTools</artifactId>
<version>0.0.7-SNAPSHOT</version>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.0.4-SNAPSHOT</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
......@@ -9,24 +24,10 @@ import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
public class BaiduNewsCrawlerParse {
private static Logger logger = LoggerFactory.getLogger(BaiduNewsCrawlerParse.class);
private static Logger logger = LogManager.getLogger(BaiduNewsCrawlerParse.class);
private static final String pt = "百度新闻";
/**
......@@ -194,14 +195,14 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址
String url = getUrl(word, startTime, endTime, tn, page);
System.out.println(url);
headerMap.put("Host", "news.baidu.com");
headerMap.put("cookie",cookie);
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
return get(url, proxy, headerMap);
} catch (IOException e) {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap));
return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
......@@ -245,8 +246,9 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
return get(url, proxy, headerMap);
} catch (IOException e) {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
......@@ -268,8 +270,9 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
return get(url, proxy, headerMap);
} catch (IOException e) {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
......
package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.TiebaData;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
......@@ -9,23 +23,9 @@ import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.TiebaData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
public class BaiduTiebaCrawlerParse {
private static Logger logger = LoggerFactory.getLogger(BaiduTiebaCrawlerParse.class);
private static Logger logger = LogManager.getLogger(BaiduTiebaCrawlerParse.class);
/**
* @Title: getBaiduTiebaData
* @author hero
......@@ -107,7 +107,6 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
Map<String, Object> dataMap = analysisDataAnswer(htmlBody,aid);
List<TiebaData> dataList = (List<TiebaData>) dataMap.get("data");
list.addAll(dataList);
System.out.println(list.size());
more = (Boolean) dataMap.get("more");
} else {
more = false;
......@@ -218,8 +217,9 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
return get(url, proxy, headerMap);
} catch (IOException e) {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
......@@ -255,8 +255,9 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
return get(url, proxy, headerMap);
} catch (IOException e) {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
......@@ -341,14 +342,13 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
String url = null;
if (word != null) {
if(tiebaName!=null){
url = "http://tieba.baidu.com/f/search/res?isnew=1&kw="+URLCodeUtil.getURLEncode(tiebaName, "GBK")+"&qw="+
url = "http://tieba.baidu.com/f/search/res?isnew=1&kw="+ URLCodeUtil.getURLEncode(tiebaName, "GBK")+"&qw="+
URLCodeUtil.getURLEncode(word, "GBK")+"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
}else{
url = "http://tieba.baidu.com/f/search/res?isnew=1&kw=&qw="+
URLCodeUtil.getURLEncode(word, "GBK")+"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
}
}
System.out.println(url);
return url;
}
}
package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.DouBanData;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.DouBanData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class DoubanCrawlerParse extends HttpClientTemplateOK {
private static Logger logger = LoggerFactory.getLogger(DoubanCrawlerParse.class);
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class DoubanCrawlerParse {
/** Class-scoped Log4j 2 logger; named after DoubanCrawlerParse so log output is attributed to this crawler. */
private static Logger logger = LogManager.getLogger(DoubanCrawlerParse.class);
/**
*
* @Title: getDoubanData
......@@ -93,8 +93,9 @@ public class DoubanCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
return get(url, proxy, headerMap);
} catch (IOException e) {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
......@@ -114,8 +115,9 @@ public class DoubanCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
return get(url, proxy, headerMap);
} catch (IOException e) {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
......@@ -294,12 +296,11 @@ public class DoubanCrawlerParse extends HttpClientTemplateOK {
String url = null;
if (word != null) {
if(type.equals("topic")){
url = "https://www.douban.com/group/search?q="+URLCodeUtil.getURLEncode(word, "utf-8")+"&start="+page*50+"&cat=1013&sort=time";
url = "https://www.douban.com/group/search?q="+ URLCodeUtil.getURLEncode(word, "utf-8")+"&start="+page*50+"&cat=1013&sort=time";
}else if(type.equals("note")){
url = "https://www.douban.com/j/search?q="+URLCodeUtil.getURLEncode(word, "utf-8")+"&start="+page*20+"&cat=1015";
}
}
System.out.println(url);
return url;
}
......
package com.zhiwei.media_data_crawler.crawler;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class SoCrawlerParse extends HttpClientTemplateOK {
public class SoCrawlerParse {
private static Logger logger = LoggerFactory.getLogger(SoCrawlerParse.class);
private static Logger logger = LogManager.getLogger(SoCrawlerParse.class);
private static final String pt = "360网页";
/**
......@@ -99,7 +103,8 @@ public class SoCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
return get(url, proxy, headerMap);
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
} catch (Exception e) {
logger.error("获取360新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
......@@ -155,7 +160,6 @@ public class SoCrawlerParse extends HttpClientTemplateOK {
if(!element.attr("class").equals("res-list hasimg hasmediav")){
link = element.select("h3.res-title").select("a").attr("href");
title = element.select("h3.res-title").select("a").text();
System.out.println(title+"============"+link);
NewsData newsData = null;
String realUrl = link;
if(link.contains("www.so.com/link")) {
......@@ -257,7 +261,7 @@ public class SoCrawlerParse extends HttpClientTemplateOK {
private static String getUrl(String word, String site, String time ,int page) {
String url = null;
if (word != null) {
url = "https://www.so.com/s?q="+URLCodeUtil.getURLEncode(word, "utf-8");
url = "https://www.so.com/s?q="+ URLCodeUtil.getURLEncode(word, "utf-8");
if(site!=null) {
url = url + "+site%3A" + site;
}
......@@ -279,7 +283,8 @@ public class SoCrawlerParse extends HttpClientTemplateOK {
String url = null;
if(link != null) {
try {
String htmlBody = HttpClientTemplateOK.get(link, proxy, null);
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url),proxy,false);
String htmlBody = response.body().toString();
if(htmlBody!=null) {
url = htmlBody.split("window.location.replace\\(\"")[1].split("\"\\)")[0];
url = url.replaceAll("http", "https");
......
package com.zhiwei.media_data_crawler.crawler;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class SoNewsCrawlerParse extends HttpClientTemplateOK {
public class SoNewsCrawlerParse {
private static Logger logger = LoggerFactory.getLogger(SoNewsCrawlerParse.class);
private static Logger logger = LogManager.getLogger(SoNewsCrawlerParse.class);
private static final String pt = "360新闻";
/**
......@@ -133,7 +135,8 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
return get(url, proxy, headerMap);
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
} catch (Exception e) {
logger.error("获取360新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
......
package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
public class SougouNewsCrawlerParse {
private static Logger logger = LoggerFactory.getLogger(SougouNewsCrawlerParse.class);
/** Class-scoped Log4j 2 logger; named after SougouNewsCrawlerParse so log output is attributed to this crawler. */
private static Logger logger = LogManager.getLogger(SougouNewsCrawlerParse.class);
private static final String pt = "搜狗新闻";
......@@ -127,8 +128,9 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
//下载数据页面
for(int i = 1; i<=3; i++){
try {
return get(url, proxy, headerMap);
} catch (IOException e) {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
} catch (Exception e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
......@@ -150,8 +152,9 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
//下载数据页面
for(int i = 1; i<=3; i++){
try {
return get(url, proxy, headerMap);
} catch (IOException e) {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
} catch (Exception e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
......
package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.ZhiHuData;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.ZhiHuData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
import java.net.Proxy;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
private static Logger logger = LoggerFactory.getLogger(SougouZhihuCrawlerParse.class);
public class SougouZhihuCrawlerParse{
private static Logger logger = LogManager.getLogger(SougouZhihuCrawlerParse.class);
private static final String pt = "搜狗知乎";
......@@ -99,8 +96,9 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
//下载数据页面
for(int i = 1; i<=3; i++){
try {
return get(url, proxy, headerMap);
} catch (IOException e) {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
} catch (Exception e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
......@@ -125,8 +123,9 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
//下载数据页面
for(int i = 1; i<=3; i++){
try {
return get(url, proxy, headerMap);
} catch (IOException e) {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
} catch (Exception e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
......
package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.LunTanData;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.LunTanData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
import java.net.Proxy;
import java.util.*;
public class TianYaCrawlerParse extends HttpClientTemplateOK {
private static Logger logger = LoggerFactory.getLogger(TianYaCrawlerParse.class);
public class TianYaCrawlerParse {
private static Logger logger = LogManager.getLogger(TianYaCrawlerParse.class);
private static final String pt = "天涯论坛";
/**
* @Title: getBaiduTiebaData
......@@ -89,8 +86,9 @@ public class TianYaCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
return get(url, proxy, headerMap);
} catch (IOException e) {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
......@@ -172,7 +170,7 @@ public class TianYaCrawlerParse extends HttpClientTemplateOK {
private static String getUrl(String word, int page) {
String url = null;
if (word != null) {
url = "http://search.tianya.cn/bbs?q="+URLCodeUtil.getURLEncode(word, "utf-8")
url = "http://search.tianya.cn/bbs?q="+ URLCodeUtil.getURLEncode(word, "utf-8")
+"&s=4&f=0&pn="+page;
}
System.out.println(url);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment