Commit 86d07d91 by zhiwei

添加采集时间间隔,避免频率过快造成IP封锁

parent 41547bad
......@@ -20,6 +20,7 @@ import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
......@@ -58,10 +59,22 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
more = false;
}
page++;
ZhiWeiTools.sleep(3000);
}
return list;
}
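/**
* Crawls Baidu News data, matching results by title.
*
* @author hero
* @param word the search keyword
* @param startTime lower bound of the crawl time window (presumably; expected format is not shown here - confirm against callers)
* @param endTime upper bound of the crawl time window (presumably; expected format is not shown here - confirm against callers)
* @param proxy the proxy passed along to the page requests
* @return the list of NewsData items collected
*/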
@SuppressWarnings("unchecked")
public static List<NewsData> getBaiduNewsDataByTitle(String word, String startTime, String endTime, Proxy proxy){
List<NewsData> list = new ArrayList<NewsData>();
......@@ -82,6 +95,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
more = false;
}
page++;
ZhiWeiTools.sleep(3000);
}
return list;
}
......@@ -215,6 +229,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
String otherLink = "http://news.baidu.com"+element.select("div.c-row").select("a.c-more_link").attr("href");
List<NewsData> otherDataList = getOherBaiduNewsData(otherLink, word, proxy);
list.addAll(otherDataList);
ZhiWeiTools.sleep(100);
}
} catch (Exception e) {
e.printStackTrace();
......
......@@ -17,6 +17,7 @@ import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class SoNewsCrawlerParse extends HttpClientTemplateOK {
......@@ -46,12 +47,14 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
if (htmlBody != null) {
Map<String, Object> dataMap = analysisData(htmlBody, proxy, word);
List<NewsData> dataList = (List<NewsData>) dataMap.get("data");
logger.info("当前采集页数:{},当前采集关键词:{},当页数据量{}", page, word, dataList.size());
list.addAll(dataList);
more = (Boolean) dataMap.get("more");
} else {
more = false;
}
page++;
ZhiWeiTools.sleep(5000);
}
return list;
}
......@@ -85,6 +88,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
more = false;
}
page++;
ZhiWeiTools.sleep(5000);
}
return list;
}
......
......@@ -17,6 +17,7 @@ import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
......@@ -54,6 +55,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
}else{
more = false;
}
ZhiWeiTools.sleep(5000);
page++;
}
return list;
......@@ -80,6 +82,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
more = false;
}
page++;
ZhiWeiTools.sleep(5000);
}
return list;
}
......
package com.zhiwei.media_data_crawler.test;
import java.util.List;
import org.junit.Test;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
/**
 * Manual smoke test for {@link DataCrawler}.
 *
 * <p>The test only prints what the crawler returns; it makes no assertions, so it
 * fails only if the call or the printing throws.
 */
public class DataCrawlerTest {

    /**
     * Calls {@code DataCrawler.getSoNewsData} for the keyword "京东" with a
     * {@code null} second argument (presumably "no proxy" - confirm against
     * {@code DataCrawler}) and writes every returned item to standard output.
     */
    @Test
    public void getSoNewsTest() {
        final String keyword = "京东";
        final List<NewsData> results = DataCrawler.getSoNewsData(keyword, null);

        // Walk the list by position and print each entry on its own line.
        final int total = results.size();
        for (int index = 0; index < total; index++) {
            System.out.println(results.get(index));
        }
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment