Commit 06f917df by zhiwei

添加单页数据返回说明,并更新使用文档

parent cb7cfb36
...@@ -3,7 +3,33 @@ ...@@ -3,7 +3,33 @@
#####更新提示2018-03-06 #####更新提示2018-03-06
本次更新内容为添加搜狗知乎采集 本次更新内容为添加搜狗知乎采集
添加自助翻页功能,如使用请添加休眠时间 添加自助翻页功能,如使用请添加休眠时间,以下是使用例子,百度为例
/**
 * Collects Baidu news results for a keyword by walking the result pages one at a time.
 *
 * <p>Paging stops as soon as one of the following happens: a page download returns
 * {@code null}, the parsed page does not report {@code more == true}, or the page index
 * would exceed 20. The page cap is enforced in the loop condition so that it cannot be
 * overridden by the parser's "more" flag.
 *
 * <p>A 3000 ms pause follows every page request to throttle the crawl.
 *
 * @param word      search keyword
 * @param startTime start of the time range, passed through to {@code downloadHtml}
 * @param endTime   end of the time range, passed through to {@code downloadHtml}
 * @param proxy     proxy used for the requests
 * @return all news items gathered from the visited pages; never {@code null}
 * @throws Exception if downloading or parsing a page fails
 */
public static List<NewsData> getBaiduNewsData(String word, String startTime, String endTime, Proxy proxy) throws Exception {
    // Highest page index that will be requested (page indices start at 0).
    final int maxPage = 20;
    List<NewsData> list = new ArrayList<NewsData>();
    int page = 0;
    boolean more = true;
    while (more && page <= maxPage) {
        String htmlBody = downloadHtml(word, startTime, endTime, proxy, "newsdy", page);
        if (htmlBody != null) {
            Map<String, Object> dataMap = analysisData(htmlBody, proxy, word);
            if (dataMap != null) {
                @SuppressWarnings("unchecked") // the "data" entry is read as a List<NewsData>
                List<NewsData> dataList = (List<NewsData>) dataMap.get("data");
                if (dataList != null) {
                    list.addAll(dataList);
                }
                // A missing "more" entry is treated as "no further pages" instead of
                // failing with a NullPointerException on unboxing.
                more = Boolean.TRUE.equals(dataMap.get("more"));
            } else {
                more = false;
            }
        } else {
            more = false;
        }
        page++;
        ZhiWeiTools.sleep(3000);
    }
    return list;
}
其它类的可看相应的源码,里面有休眠时间等设置
##### 摘要 ##### 摘要
> 这是一个基于OKHttp+Jsoup实现的网页抓取及解析功能的搜索引擎采集爬虫,目前包含:百度新闻、搜狗新闻、360新闻、搜狗知乎采集四种根据关键词采集功能 > 这是一个基于OKHttp+Jsoup实现的网页抓取及解析功能的搜索引擎采集爬虫,目前包含:百度新闻、搜狗新闻、360新闻、搜狗知乎采集四种根据关键词采集功能
...@@ -38,3 +64,6 @@ ...@@ -38,3 +64,6 @@
...@@ -8,7 +8,6 @@ import java.util.List; ...@@ -8,7 +8,6 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
...@@ -167,9 +166,13 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -167,9 +166,13 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
return get(url, proxy, headerMap); return get(url, proxy, headerMap);
} catch (IOException e) { } catch (IOException e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
}else{
continue; continue;
} }
} }
}
return null; return null;
} }
...@@ -186,9 +189,13 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -186,9 +189,13 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
return get(url, proxy, headerMap); return get(url, proxy, headerMap);
} catch (IOException e) { } catch (IOException e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
}else{
continue; continue;
} }
} }
}
return null; return null;
} }
......
...@@ -118,7 +118,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -118,7 +118,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件 * @param @return 设定文件
* @return String 返回类型 * @return String 返回类型
*/ */
private static String downloadHtml(String word, String tn, Proxy proxy, int page)throws Exception { private static String downloadHtml(String word, String tn, Proxy proxy, int page)throws IOException {
// 获取通用请求头 // 获取通用请求头
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址 // 获取链接地址
...@@ -131,9 +131,13 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -131,9 +131,13 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
return get(url, proxy, headerMap); return get(url, proxy, headerMap);
} catch (IOException e) { } catch (IOException e) {
logger.error("获取360新闻数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取360新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
}else{
continue; continue;
} }
} }
}
return null; return null;
} }
......
...@@ -123,9 +123,13 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -123,9 +123,13 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
return get(url, proxy, headerMap); return get(url, proxy, headerMap);
} catch (IOException e) { } catch (IOException e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
}else{
continue; continue;
} }
} }
}
return null; return null;
} }
...@@ -142,9 +146,13 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -142,9 +146,13 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
return get(url, proxy, headerMap); return get(url, proxy, headerMap);
} catch (IOException e) { } catch (IOException e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
}else{
continue; continue;
} }
} }
}
return null; return null;
} }
......
...@@ -99,9 +99,13 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -99,9 +99,13 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
return get(url, proxy, headerMap); return get(url, proxy, headerMap);
} catch (IOException e) { } catch (IOException e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
}else{
continue; continue;
} }
} }
}
return null; return null;
} }
...@@ -121,9 +125,13 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -121,9 +125,13 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
return get(url, proxy, headerMap); return get(url, proxy, headerMap);
} catch (IOException e) { } catch (IOException e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
}else{
continue; continue;
} }
} }
}
return null; return null;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment