Commit f4b4b293 by zhiwei

升级采集核心包版本，并默认使用晋豪提供的NAT代理

parent f0484148
......@@ -2,6 +2,7 @@ package com.zhiwei.media_data_crawler.crawler;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.tools.httpclient.HeaderTool;
......@@ -68,8 +69,8 @@ public class BaiduNewsCrawlerParse {
more = false;
}
page++;
if(DataCrawler.sleepTime==null){
ZhiWeiTools.sleep(3000);
if(DataCrawler.sleepTime != null ){
ZhiWeiTools.sleep(DataCrawler.sleepTime);
}
}
return list;
......@@ -201,8 +202,13 @@ public class BaiduNewsCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap));
return response.body().string();
Response response = null;
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
}
return response.body().string();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
......@@ -247,7 +253,12 @@ public class BaiduNewsCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = null;
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
}
return response.body().string();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
......@@ -271,7 +282,12 @@ public class BaiduNewsCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = null;
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
}
return response.body().string();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
......
......@@ -2,6 +2,7 @@ package com.zhiwei.media_data_crawler.crawler;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.TiebaData;
import com.zhiwei.tools.httpclient.HeaderTool;
......@@ -57,8 +58,8 @@ public class BaiduTiebaCrawlerParse {
more = false;
}
page++;
if(DataCrawler.sleepTime==null){
ZhiWeiTools.sleep(3000);
if(DataCrawler.sleepTime!=null){
ZhiWeiTools.sleep(DataCrawler.sleepTime);
}
}
return list;
......@@ -217,8 +218,13 @@ public class BaiduTiebaCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
Response response = null;
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
}
return response.body().string();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
......@@ -255,8 +261,12 @@ public class BaiduTiebaCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().string();
Response response = null;
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
}
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
......
......@@ -3,6 +3,7 @@ package com.zhiwei.media_data_crawler.crawler;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.DouBanData;
import com.zhiwei.tools.httpclient.HeaderTool;
......@@ -94,8 +95,12 @@ public class DoubanCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().string();
Response response = null;
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
}
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
......@@ -116,8 +121,12 @@ public class DoubanCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
Response response = null;
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
}
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
......@@ -193,8 +202,8 @@ public class DoubanCrawlerParse {
String content = document.select("div.topic-doc").select("div#link-report").select("div.topic-content").text();
douban.setContent(content);
}
if(DataCrawler.sleepTime==null){
ZhiWeiTools.sleep(5000);
if(DataCrawler.sleepTime!=null){
ZhiWeiTools.sleep(DataCrawler.sleepTime);
}
return douban;
} catch (Exception e) {
......@@ -273,8 +282,8 @@ public class DoubanCrawlerParse {
douban.setContent(content);
}
}
if(DataCrawler.sleepTime==null){
ZhiWeiTools.sleep(5000);
if(DataCrawler.sleepTime!=null){
ZhiWeiTools.sleep(DataCrawler.sleepTime);
}
return douban;
} catch (Exception e) {
......
......@@ -3,6 +3,7 @@ package com.zhiwei.media_data_crawler.crawler;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.tools.httpclient.HeaderTool;
......@@ -62,8 +63,8 @@ public class SoCrawlerParse {
more = false;
}
page++;
if(DataCrawler.sleepTime==null){
ZhiWeiTools.sleep(5000);
if(DataCrawler.sleepTime!=null){
ZhiWeiTools.sleep(DataCrawler.sleepTime);
}
}
return list;
......@@ -104,7 +105,12 @@ public class SoCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = null;
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
}
return response.body().string();
} catch (Exception e) {
logger.error("获取360新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
......
......@@ -2,6 +2,7 @@ package com.zhiwei.media_data_crawler.crawler;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.tools.httpclient.HeaderTool;
......@@ -59,8 +60,8 @@ public class SoNewsCrawlerParse {
more = false;
}
page++;
if(DataCrawler.sleepTime==null){
ZhiWeiTools.sleep(5000);
if(DataCrawler.sleepTime!=null){
ZhiWeiTools.sleep(DataCrawler.sleepTime);
}
}
return list;
......@@ -108,8 +109,8 @@ public class SoNewsCrawlerParse {
more = false;
}
page++;
if(DataCrawler.sleepTime==null){
ZhiWeiTools.sleep(5000);
if(DataCrawler.sleepTime!=null){
ZhiWeiTools.sleep(DataCrawler.sleepTime);
}
}
return list;
......@@ -136,7 +137,12 @@ public class SoNewsCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = null;
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
}
return response.body().string();
} catch (Exception e) {
logger.error("获取360新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
......
......@@ -2,6 +2,7 @@ package com.zhiwei.media_data_crawler.crawler;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.tools.httpclient.HeaderTool;
......@@ -9,6 +10,8 @@ import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
......@@ -63,8 +66,8 @@ public class SougouNewsCrawlerParse {
}
page++;
logger.info("采集到 {} 页 采集的数据量为 {}", page, list.size());
if(DataCrawler.sleepTime==null){
ZhiWeiTools.sleep(5000);
if(DataCrawler.sleepTime!=null){
ZhiWeiTools.sleep(DataCrawler.sleepTime);
}
}
return list;
......@@ -74,7 +77,7 @@ public class SougouNewsCrawlerParse {
public static Map<String,Object> getSougouNewsData(String word, Proxy proxy, int page) throws Exception{
String htmlBody = downloadHtml(word, 1, proxy, page);
if(htmlBody != null && !htmlBody.equals("")){
if(StringUtils.isBlank(htmlBody)){
return analysisData(htmlBody, proxy, word, "normal");
}
return null;
......@@ -101,8 +104,8 @@ public class SougouNewsCrawlerParse {
more = false;
}
page++;
if(DataCrawler.sleepTime==null){
ZhiWeiTools.sleep(5000);
if(DataCrawler.sleepTime!=null){
ZhiWeiTools.sleep(DataCrawler.sleepTime);
}
}
return list;
......@@ -130,7 +133,12 @@ public class SougouNewsCrawlerParse {
//下载数据页面
for(int i = 1; i<=3; i++){
try {
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = null;
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
}
return response.body().string();
} catch (Exception e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
......@@ -154,7 +162,12 @@ public class SougouNewsCrawlerParse {
//下载数据页面
for(int i = 1; i<=3; i++){
try {
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = null;
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
}
return response.body().string();
} catch (Exception e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
......
......@@ -2,6 +2,7 @@ package com.zhiwei.media_data_crawler.crawler;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.ZhiHuData;
import com.zhiwei.tools.httpclient.HeaderTool;
......@@ -9,6 +10,8 @@ import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
......@@ -59,8 +62,8 @@ public class SougouZhihuCrawlerParse{
}else{
more = false;
}
if(DataCrawler.sleepTime==null){
ZhiWeiTools.sleep(5000);
if(DataCrawler.sleepTime!=null){
ZhiWeiTools.sleep(DataCrawler.sleepTime);
}
page++;
}
......@@ -70,7 +73,7 @@ public class SougouZhihuCrawlerParse{
public static Map<String,Object> getSougouZhihuData(String word, Proxy proxy, int page) throws Exception{
String htmlBody = downloadHtml(word, proxy, page);
if(htmlBody != null && !htmlBody.equals("")){
if(StringUtils.isBlank(htmlBody)){
return analysisData(htmlBody, proxy, word);
}
return null;
......@@ -97,7 +100,12 @@ public class SougouZhihuCrawlerParse{
//下载数据页面
for(int i = 1; i<=3; i++){
try {
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = null;
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
}
return response.body().string();
} catch (Exception e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
......@@ -124,7 +132,12 @@ public class SougouZhihuCrawlerParse{
//下载数据页面
for(int i = 1; i<=3; i++){
try {
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = null;
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
}
return response.body().string();
} catch (Exception e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
......
......@@ -2,6 +2,7 @@ package com.zhiwei.media_data_crawler.crawler;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.LunTanData;
import com.zhiwei.tools.httpclient.HeaderTool;
......@@ -55,8 +56,8 @@ public class TianYaCrawlerParse {
more = false;
}
page++;
if(DataCrawler.sleepTime==null){
ZhiWeiTools.sleep(3000);
if(DataCrawler.sleepTime!=null){
ZhiWeiTools.sleep(DataCrawler.sleepTime);
}
}
......@@ -87,7 +88,12 @@ public class TianYaCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = null;
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_PROXY);
}
return response.body().string();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment