Commit 9453f8d6 by zhiwei

修改http-core核心包,增加程序稳定性及处理乱码

parent cc347740
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
#####更新提示2018-03-06 #####更新提示2018-03-06
本次更新内容为添加搜狗知乎采集 本次更新内容为添加搜狗知乎采集
添加自助翻页功能,如使用请添加休眠时间,以下是使用例子,百度为例 添加自助翻页功能,如使用请添加休眠时间,以下是使用例子,百度为例
public static List<NewsData> getBaiduNewsData(String word, String startTime, String endTime, Proxy proxy) throws Exception { public static List<NewsData> getBaiduNewsData(String word, String startTime, String endTime, Proxy proxy) throws Exception {
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
int page = 0; int page = 0;
boolean more = true; boolean more = true;
......
...@@ -60,13 +60,12 @@ ...@@ -60,13 +60,12 @@
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url> <url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository> </repository>
</distributionManagement> </distributionManagement>
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiweiTools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.0.7-SNAPSHOT</version> <version>0.0.4-SNAPSHOT</version>
</dependency> </dependency>
</dependencies> </dependencies>
</project> </project>
\ No newline at end of file
package com.zhiwei.media_data_crawler.crawler; package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
...@@ -9,24 +24,10 @@ import java.util.Map; ...@@ -9,24 +24,10 @@ import java.util.Map;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { public class BaiduNewsCrawlerParse {
private static Logger logger = LoggerFactory.getLogger(BaiduNewsCrawlerParse.class); private static Logger logger = LogManager.getLogger(BaiduNewsCrawlerParse.class);
private static final String pt = "百度新闻"; private static final String pt = "百度新闻";
/** /**
...@@ -194,14 +195,14 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -194,14 +195,14 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址 // 获取链接地址
String url = getUrl(word, startTime, endTime, tn, page); String url = getUrl(word, startTime, endTime, tn, page);
System.out.println(url);
headerMap.put("Host", "news.baidu.com"); headerMap.put("Host", "news.baidu.com");
headerMap.put("cookie",cookie); headerMap.put("cookie",cookie);
// 下载数据页面 // 下载数据页面
for (int i = 1; i <= 3; i++) { for (int i = 1; i <= 3; i++) {
try { try {
return get(url, proxy, headerMap); Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap));
} catch (IOException e) { return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){ if(i==3){
throw e; throw e;
...@@ -245,8 +246,9 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -245,8 +246,9 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面 // 下载数据页面
for (int i = 1; i <= 3; i++) { for (int i = 1; i <= 3; i++) {
try { try {
return get(url, proxy, headerMap); Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
} catch (IOException e) { return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){ if(i==3){
throw e; throw e;
...@@ -268,8 +270,9 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -268,8 +270,9 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面 // 下载数据页面
for (int i = 1; i <= 3; i++) { for (int i = 1; i <= 3; i++) {
try { try {
return get(url, proxy, headerMap); Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
} catch (IOException e) { return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){ if(i==3){
throw e; throw e;
......
package com.zhiwei.media_data_crawler.crawler; package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.TiebaData;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
...@@ -9,23 +23,9 @@ import java.util.Map; ...@@ -9,23 +23,9 @@ import java.util.Map;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.jsoup.Jsoup; public class BaiduTiebaCrawlerParse {
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.TiebaData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
private static Logger logger = LoggerFactory.getLogger(BaiduTiebaCrawlerParse.class); private static Logger logger = LogManager.getLogger(BaiduTiebaCrawlerParse.class);
/** /**
* @Title: getBaiduTiebaData * @Title: getBaiduTiebaData
* @author hero * @author hero
...@@ -107,7 +107,6 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK { ...@@ -107,7 +107,6 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
Map<String, Object> dataMap = analysisDataAnswer(htmlBody,aid); Map<String, Object> dataMap = analysisDataAnswer(htmlBody,aid);
List<TiebaData> dataList = (List<TiebaData>) dataMap.get("data"); List<TiebaData> dataList = (List<TiebaData>) dataMap.get("data");
list.addAll(dataList); list.addAll(dataList);
System.out.println(list.size());
more = (Boolean) dataMap.get("more"); more = (Boolean) dataMap.get("more");
} else { } else {
more = false; more = false;
...@@ -218,8 +217,9 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK { ...@@ -218,8 +217,9 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面 // 下载数据页面
for (int i = 1; i <= 3; i++) { for (int i = 1; i <= 3; i++) {
try { try {
return get(url, proxy, headerMap); Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
} catch (IOException e) { return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){ if(i==3){
throw e; throw e;
...@@ -255,8 +255,9 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK { ...@@ -255,8 +255,9 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面 // 下载数据页面
for (int i = 1; i <= 3; i++) { for (int i = 1; i <= 3; i++) {
try { try {
return get(url, proxy, headerMap); Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
} catch (IOException e) { return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){ if(i==3){
throw e; throw e;
...@@ -341,14 +342,13 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK { ...@@ -341,14 +342,13 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
String url = null; String url = null;
if (word != null) { if (word != null) {
if(tiebaName!=null){ if(tiebaName!=null){
url = "http://tieba.baidu.com/f/search/res?isnew=1&kw="+URLCodeUtil.getURLEncode(tiebaName, "GBK")+"&qw="+ url = "http://tieba.baidu.com/f/search/res?isnew=1&kw="+ URLCodeUtil.getURLEncode(tiebaName, "GBK")+"&qw="+
URLCodeUtil.getURLEncode(word, "GBK")+"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="+page; URLCodeUtil.getURLEncode(word, "GBK")+"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
}else{ }else{
url = "http://tieba.baidu.com/f/search/res?isnew=1&kw=&qw="+ url = "http://tieba.baidu.com/f/search/res?isnew=1&kw=&qw="+
URLCodeUtil.getURLEncode(word, "GBK")+"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="+page; URLCodeUtil.getURLEncode(word, "GBK")+"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
} }
} }
System.out.println(url);
return url; return url;
} }
} }
package com.zhiwei.media_data_crawler.crawler; package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.DouBanData;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.jsoup.Jsoup; public class DoubanCrawlerParse {
import org.jsoup.nodes.Document; private static Logger logger = LogManager.getLogger(BaiduTiebaCrawlerParse.class);
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.DouBanData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class DoubanCrawlerParse extends HttpClientTemplateOK {
private static Logger logger = LoggerFactory.getLogger(DoubanCrawlerParse.class);
/** /**
* *
* @Title: getDoubanData * @Title: getDoubanData
...@@ -93,8 +93,9 @@ public class DoubanCrawlerParse extends HttpClientTemplateOK { ...@@ -93,8 +93,9 @@ public class DoubanCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面 // 下载数据页面
for (int i = 1; i <= 3; i++) { for (int i = 1; i <= 3; i++) {
try { try {
return get(url, proxy, headerMap); Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
} catch (IOException e) { return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){ if(i==3){
throw e; throw e;
...@@ -114,8 +115,9 @@ public class DoubanCrawlerParse extends HttpClientTemplateOK { ...@@ -114,8 +115,9 @@ public class DoubanCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面 // 下载数据页面
for (int i = 1; i <= 3; i++) { for (int i = 1; i <= 3; i++) {
try { try {
return get(url, proxy, headerMap); Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
} catch (IOException e) { return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){ if(i==3){
throw e; throw e;
...@@ -294,12 +296,11 @@ public class DoubanCrawlerParse extends HttpClientTemplateOK { ...@@ -294,12 +296,11 @@ public class DoubanCrawlerParse extends HttpClientTemplateOK {
String url = null; String url = null;
if (word != null) { if (word != null) {
if(type.equals("topic")){ if(type.equals("topic")){
url = "https://www.douban.com/group/search?q="+URLCodeUtil.getURLEncode(word, "utf-8")+"&start="+page*50+"&cat=1013&sort=time"; url = "https://www.douban.com/group/search?q="+ URLCodeUtil.getURLEncode(word, "utf-8")+"&start="+page*50+"&cat=1013&sort=time";
}else if(type.equals("note")){ }else if(type.equals("note")){
url = "https://www.douban.com/j/search?q="+URLCodeUtil.getURLEncode(word, "utf-8")+"&start="+page*20+"&cat=1015"; url = "https://www.douban.com/j/search?q="+URLCodeUtil.getURLEncode(word, "utf-8")+"&start="+page*20+"&cat=1015";
} }
} }
System.out.println(url);
return url; return url;
} }
......
package com.zhiwei.media_data_crawler.crawler; package com.zhiwei.media_data_crawler.crawler;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class SoCrawlerParse extends HttpClientTemplateOK { public class SoCrawlerParse {
private static Logger logger = LoggerFactory.getLogger(SoCrawlerParse.class); private static Logger logger = LogManager.getLogger(SoCrawlerParse.class);
private static final String pt = "360网页"; private static final String pt = "360网页";
/** /**
...@@ -99,7 +103,8 @@ public class SoCrawlerParse extends HttpClientTemplateOK { ...@@ -99,7 +103,8 @@ public class SoCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面 // 下载数据页面
for (int i = 1; i <= 3; i++) { for (int i = 1; i <= 3; i++) {
try { try {
return get(url, proxy, headerMap); Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
} catch (Exception e) { } catch (Exception e) {
logger.error("获取360新闻数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取360新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){ if(i==3){
...@@ -155,7 +160,6 @@ public class SoCrawlerParse extends HttpClientTemplateOK { ...@@ -155,7 +160,6 @@ public class SoCrawlerParse extends HttpClientTemplateOK {
if(!element.attr("class").equals("res-list hasimg hasmediav")){ if(!element.attr("class").equals("res-list hasimg hasmediav")){
link = element.select("h3.res-title").select("a").attr("href"); link = element.select("h3.res-title").select("a").attr("href");
title = element.select("h3.res-title").select("a").text(); title = element.select("h3.res-title").select("a").text();
System.out.println(title+"============"+link);
NewsData newsData = null; NewsData newsData = null;
String realUrl = link; String realUrl = link;
if(link.contains("www.so.com/link")) { if(link.contains("www.so.com/link")) {
...@@ -257,7 +261,7 @@ public class SoCrawlerParse extends HttpClientTemplateOK { ...@@ -257,7 +261,7 @@ public class SoCrawlerParse extends HttpClientTemplateOK {
private static String getUrl(String word, String site, String time ,int page) { private static String getUrl(String word, String site, String time ,int page) {
String url = null; String url = null;
if (word != null) { if (word != null) {
url = "https://www.so.com/s?q="+URLCodeUtil.getURLEncode(word, "utf-8"); url = "https://www.so.com/s?q="+ URLCodeUtil.getURLEncode(word, "utf-8");
if(site!=null) { if(site!=null) {
url = url + "+site%3A" + site; url = url + "+site%3A" + site;
} }
...@@ -279,7 +283,8 @@ public class SoCrawlerParse extends HttpClientTemplateOK { ...@@ -279,7 +283,8 @@ public class SoCrawlerParse extends HttpClientTemplateOK {
String url = null; String url = null;
if(link != null) { if(link != null) {
try { try {
String htmlBody = HttpClientTemplateOK.get(link, proxy, null); Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url),proxy,false);
String htmlBody = response.body().toString();
if(htmlBody!=null) { if(htmlBody!=null) {
url = htmlBody.split("window.location.replace\\(\"")[1].split("\"\\)")[0]; url = htmlBody.split("window.location.replace\\(\"")[1].split("\"\\)")[0];
url = url.replaceAll("http", "https"); url = url.replaceAll("http", "https");
......
package com.zhiwei.media_data_crawler.crawler; package com.zhiwei.media_data_crawler.crawler;
import java.net.Proxy; import com.zhiwei.crawler.core.HttpBoot;
import java.util.ArrayList; import com.zhiwei.crawler.core.RequestUtils;
import java.util.HashMap; import com.zhiwei.media_data_crawler.data.DataCrawler;
import java.util.List; import com.zhiwei.media_data_crawler.entity.NewsData;
import java.util.Map; import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.data.DataCrawler; import java.net.Proxy;
import com.zhiwei.media_data_crawler.entity.NewsData; import java.util.ArrayList;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool; import java.util.HashMap;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK; import java.util.List;
import com.zhiwei.zhiweiTools.timeParse.TimeParse; import java.util.Map;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class SoNewsCrawlerParse extends HttpClientTemplateOK { public class SoNewsCrawlerParse {
private static Logger logger = LoggerFactory.getLogger(SoNewsCrawlerParse.class); private static Logger logger = LogManager.getLogger(SoNewsCrawlerParse.class);
private static final String pt = "360新闻"; private static final String pt = "360新闻";
/** /**
...@@ -133,7 +135,8 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -133,7 +135,8 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面 // 下载数据页面
for (int i = 1; i <= 3; i++) { for (int i = 1; i <= 3; i++) {
try { try {
return get(url, proxy, headerMap); Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
} catch (Exception e) { } catch (Exception e) {
logger.error("获取360新闻数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取360新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){ if(i==3){
......
package com.zhiwei.media_data_crawler.crawler; package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.data.DataCrawler; public class SougouNewsCrawlerParse {
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class SougouNewsCrawlerParse extends HttpClientTemplateOK { private static Logger logger = LogManager.getLogger(BaiduTiebaCrawlerParse.class);
private static Logger logger = LoggerFactory.getLogger(SougouNewsCrawlerParse.class);
private static final String pt = "搜狗新闻"; private static final String pt = "搜狗新闻";
...@@ -127,8 +128,9 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -127,8 +128,9 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
//下载数据页面 //下载数据页面
for(int i = 1; i<=3; i++){ for(int i = 1; i<=3; i++){
try { try {
return get(url, proxy, headerMap); Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
} catch (IOException e) { return response.body().toString();
} catch (Exception e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){ if(i==3){
throw e; throw e;
...@@ -150,8 +152,9 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -150,8 +152,9 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
//下载数据页面 //下载数据页面
for(int i = 1; i<=3; i++){ for(int i = 1; i<=3; i++){
try { try {
return get(url, proxy, headerMap); Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
} catch (IOException e) { return response.body().toString();
} catch (Exception e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){ if(i==3){
throw e; throw e;
......
package com.zhiwei.media_data_crawler.crawler; package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException; import com.zhiwei.crawler.core.HttpBoot;
import java.net.Proxy; import com.zhiwei.crawler.core.RequestUtils;
import java.util.ArrayList; import com.zhiwei.media_data_crawler.data.DataCrawler;
import java.util.Date; import com.zhiwei.media_data_crawler.entity.ZhiHuData;
import java.util.HashMap; import com.zhiwei.tools.httpclient.HeaderTool;
import java.util.List; import com.zhiwei.tools.timeparse.TimeParse;
import java.util.Map; import com.zhiwei.tools.tools.URLCodeUtil;
import java.util.regex.Matcher; import com.zhiwei.tools.tools.ZhiWeiTools;
import java.util.regex.Pattern; import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.data.DataCrawler; import java.net.Proxy;
import com.zhiwei.media_data_crawler.entity.ZhiHuData; import java.util.*;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool; import java.util.regex.Matcher;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK; import java.util.regex.Pattern;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
public class SougouZhihuCrawlerParse{
private static Logger logger = LoggerFactory.getLogger(SougouZhihuCrawlerParse.class);
private static Logger logger = LogManager.getLogger(SougouZhihuCrawlerParse.class);
private static final String pt = "搜狗知乎"; private static final String pt = "搜狗知乎";
...@@ -99,8 +96,9 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -99,8 +96,9 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
//下载数据页面 //下载数据页面
for(int i = 1; i<=3; i++){ for(int i = 1; i<=3; i++){
try { try {
return get(url, proxy, headerMap); Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
} catch (IOException e) { return response.body().toString();
} catch (Exception e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){ if(i==3){
throw e; throw e;
...@@ -125,8 +123,9 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -125,8 +123,9 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
//下载数据页面 //下载数据页面
for(int i = 1; i<=3; i++){ for(int i = 1; i<=3; i++){
try { try {
return get(url, proxy, headerMap); Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
} catch (IOException e) { return response.body().toString();
} catch (Exception e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){ if(i==3){
throw e; throw e;
......
package com.zhiwei.media_data_crawler.crawler; package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException; import com.zhiwei.crawler.core.HttpBoot;
import java.net.Proxy; import com.zhiwei.crawler.core.RequestUtils;
import java.util.ArrayList; import com.zhiwei.media_data_crawler.data.DataCrawler;
import java.util.Date; import com.zhiwei.media_data_crawler.entity.LunTanData;
import java.util.HashMap; import com.zhiwei.tools.httpclient.HeaderTool;
import java.util.List; import com.zhiwei.tools.timeparse.TimeParse;
import java.util.Map; import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.data.DataCrawler; import java.net.Proxy;
import com.zhiwei.media_data_crawler.entity.LunTanData; import java.util.*;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class TianYaCrawlerParse extends HttpClientTemplateOK { public class TianYaCrawlerParse {
private static Logger logger = LoggerFactory.getLogger(TianYaCrawlerParse.class); private static Logger logger = LogManager.getLogger(TianYaCrawlerParse.class);
private static final String pt = "天涯论坛"; private static final String pt = "天涯论坛";
/** /**
* @Title: getBaiduTiebaData * @Title: getBaiduTiebaData
...@@ -89,8 +86,9 @@ public class TianYaCrawlerParse extends HttpClientTemplateOK { ...@@ -89,8 +86,9 @@ public class TianYaCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面 // 下载数据页面
for (int i = 1; i <= 3; i++) { for (int i = 1; i <= 3; i++) {
try { try {
return get(url, proxy, headerMap); Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
} catch (IOException e) { return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){ if(i==3){
throw e; throw e;
...@@ -172,7 +170,7 @@ public class TianYaCrawlerParse extends HttpClientTemplateOK { ...@@ -172,7 +170,7 @@ public class TianYaCrawlerParse extends HttpClientTemplateOK {
private static String getUrl(String word, int page) { private static String getUrl(String word, int page) {
String url = null; String url = null;
if (word != null) { if (word != null) {
url = "http://search.tianya.cn/bbs?q="+URLCodeUtil.getURLEncode(word, "utf-8") url = "http://search.tianya.cn/bbs?q="+ URLCodeUtil.getURLEncode(word, "utf-8")
+"&s=4&f=0&pn="+page; +"&s=4&f=0&pn="+page;
} }
System.out.println(url); System.out.println(url);
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment