Commit 38bcf00d by zhiwei

添加自助翻页功能,如使用请添加休眠时间

parent 0930c2aa
......@@ -32,9 +32,10 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy
* @param @return 设定文件
* @return List<NewsData> 返回类型
* @throws Exception
*/
@SuppressWarnings("unchecked")
public static List<NewsData> getSoNewsData(String word, Proxy proxy) {
public static List<NewsData> getSoNewsData(String word, Proxy proxy) throws Exception {
List<NewsData> list = new ArrayList<NewsData>();
int page = 1;
boolean more = true;
......@@ -59,6 +60,19 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
return list;
}
/**
 * Fetches one result page of the news search for {@code word} and parses it.
 *
 * <p>The caller picks the page number. This method itself adds no delay
 * between requests, so callers that page in a loop should sleep between calls.
 *
 * @param word  search keyword
 * @param proxy proxy passed through unchanged to the download and parse steps
 * @param page  page number to request
 * @return the map produced by {@code analysisData} (presumably the parsed items
 *         plus a has-more flag; confirm against {@code analysisData}), or
 *         {@code null} when {@code downloadHtml} returned {@code null}
 * @throws Exception whatever {@code downloadHtml} or {@code analysisData} throws
 */
public static Map<String,Object> getSoNewsData(String word, Proxy proxy, int page) throws Exception{
    final String pageSource = downloadHtml(word, "news", proxy, page);
    // Nothing was downloaded: report that as null rather than parsing.
    if (pageSource == null) {
        return null;
    }
    return analysisData(pageSource, proxy, word);
}
/**
* @Title: getSoNewsDataByTitle
* @author hero
......@@ -69,7 +83,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @return List<NewsData> 返回类型
*/
@SuppressWarnings("unchecked")
public static List<NewsData> getSoNewsDataByTitle(String word, Proxy proxy) {
public static List<NewsData> getSoNewsDataByTitle(String word, Proxy proxy) throws Exception{
List<NewsData> list = new ArrayList<NewsData>();
int page = 1;
boolean more = true;
......@@ -104,7 +118,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return String 返回类型
*/
private static String downloadHtml(String word, String tn, Proxy proxy, int page) {
private static String downloadHtml(String word, String tn, Proxy proxy, int page)throws Exception {
// 获取通用请求头
Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址
......@@ -138,7 +152,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* 设定文件
* @return Map<String,Object> 返回类型
*/
private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) {
private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String, Object> resultMap = new HashMap<String, Object>();
List<NewsData> list = new ArrayList<NewsData>();
boolean more = true;
......@@ -199,7 +213,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return Map<String,Object> 返回类型
*/
private static Map<String, Object> analysisDataByTitle(String htmlBody, Proxy proxy, String word) {
private static Map<String, Object> analysisDataByTitle(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String, Object> resultMap = new HashMap<String, Object>();
List<NewsData> list = new ArrayList<NewsData>();
boolean more = true;
......
......@@ -35,9 +35,10 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy
* @param @return 设定文件
* @return List<NewsData> 返回类型
* @throws Exception
*/
@SuppressWarnings("unchecked")
public static List<NewsData> getSougouNewsData(String word, Proxy proxy){
public static List<NewsData> getSougouNewsData(String word, Proxy proxy) throws Exception{
List<NewsData> list = new ArrayList<NewsData>();
int page = 1;
boolean more = true;
......@@ -62,8 +63,18 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
}
/**
 * Fetches one result page of the Sogou news search for {@code word} and parses it.
 *
 * <p>The page is downloaded with mode {@code 1} (NOTE(review): the meaning of
 * the mode value is defined by {@code downloadHtml}; confirm there). This
 * method itself adds no delay between requests, so callers that page in a loop
 * should sleep between calls.
 *
 * @param word  search keyword
 * @param proxy proxy passed through unchanged to the download and parse steps
 * @param page  page number to request
 * @return the map produced by {@code analysisData}, or {@code null} when the
 *         downloaded body is {@code null} or empty
 * @throws Exception whatever {@code downloadHtml} or {@code analysisData} throws
 */
public static Map<String,Object> getSougouNewsData(String word, Proxy proxy, int page) throws Exception{
    final String pageSource = downloadHtml(word, 1, proxy, page);
    // A missing or empty body means there is nothing to parse.
    if (pageSource == null || "".equals(pageSource)) {
        return null;
    }
    return analysisData(pageSource, proxy, word);
}
@SuppressWarnings("unchecked")
public static List<NewsData> getSougouNewsDataByTitle(String word, Proxy proxy){
public static List<NewsData> getSougouNewsDataByTitle(String word, Proxy proxy)throws Exception{
List<NewsData> list = new ArrayList<NewsData>();
int page = 0;
boolean more = true;
......@@ -99,7 +110,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return String 返回类型
*/
private static String downloadHtml(String word, int mode, Proxy proxy, int page) {
private static String downloadHtml(String word, int mode, Proxy proxy, int page) throws Exception{
//获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址
......@@ -118,7 +129,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
return null;
}
private static String downloadHtml(String url, Proxy proxy, int page) {
private static String downloadHtml(String url, Proxy proxy, int page) throws Exception{
//获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址
......@@ -149,7 +160,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return Map<String,Object> 返回类型
*/
private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word) {
private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String,Object> resultMap = new HashMap<String,Object>();
List<NewsData> list = new ArrayList<NewsData>();
boolean more = true;
......@@ -241,7 +252,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @return List<NewsData> 返回类型
*/
@SuppressWarnings("unchecked")
public static List<NewsData> getOherSougouNewsData(String url, String word, Proxy proxy){
public static List<NewsData> getOherSougouNewsData(String url, String word, Proxy proxy)throws Exception{
List<NewsData> list = new ArrayList<NewsData>();
int page = 1;
boolean more = true;
......
......@@ -38,9 +38,10 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy
* @param @return 设定文件
* @return List<ZhiHuData> 返回类型
* @throws Exception
*/
@SuppressWarnings("unchecked")
public static List<ZhiHuData> getSougouZhihuData(String word, Proxy proxy){
public static List<ZhiHuData> getSougouZhihuData(String word, Proxy proxy) throws Exception{
List<ZhiHuData> list = new ArrayList<ZhiHuData>();
int page = 1;
boolean more = true;
......@@ -66,6 +67,13 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
}
/**
 * Fetches one result page of the Sogou Zhihu search for {@code word} and parses it.
 *
 * <p>This method itself adds no delay between requests, so callers that page
 * in a loop should sleep between calls.
 *
 * @param word  search keyword
 * @param proxy proxy passed through unchanged to the download and parse steps
 * @param page  page number to request
 * @return the map produced by {@code analysisData}, or {@code null} when the
 *         downloaded body is {@code null} or empty
 * @throws Exception whatever {@code downloadHtml} or {@code analysisData} throws
 */
public static Map<String,Object> getSougouZhihuData(String word, Proxy proxy, int page) throws Exception{
    final String pageSource = downloadHtml(word, proxy, page);
    final boolean nothingToParse = pageSource == null || "".equals(pageSource);
    return nothingToParse ? null : analysisData(pageSource, proxy, word);
}
/**
*
......@@ -79,7 +87,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return String 返回类型
*/
private static String downloadHtml(String word, Proxy proxy, int page) {
private static String downloadHtml(String word, Proxy proxy, int page) throws Exception{
//获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址
......@@ -97,7 +105,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
return null;
}
private static String downloadHtml(String url, Proxy proxy, String type) {
private static String downloadHtml(String url, Proxy proxy, String type) throws Exception{
//获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址
......@@ -131,7 +139,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return Map<String,Object> 返回类型
*/
private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word) {
private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word)throws Exception {
Map<String,Object> resultMap = new HashMap<String,Object>();
List<ZhiHuData> list = new ArrayList<ZhiHuData>();
boolean more = true;
......@@ -203,7 +211,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return ZhiHuData 返回类型
*/
private static ZhiHuData analysisZhihuAnswer(String url, Proxy proxy,ZhiHuData zhihu){
private static ZhiHuData analysisZhihuAnswer(String url, Proxy proxy,ZhiHuData zhihu)throws Exception{
try {
String htmlBody = downloadHtml(url, proxy, "问答");
if(htmlBody != null){
......@@ -239,7 +247,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @return ZhiHuData 返回类型
*/
@SuppressWarnings("deprecation")
private static ZhiHuData analysisZhihuArticle(String url, Proxy proxy, ZhiHuData zhihu){
private static ZhiHuData analysisZhihuArticle(String url, Proxy proxy, ZhiHuData zhihu)throws Exception{
try {
String htmlBody = downloadHtml(url, proxy, "文章");
Document document = Jsoup.parse(htmlBody);
......
package com.zhiwei.media_data_crawler.test;
import java.net.Proxy;
import java.util.List;
import org.junit.Test;
import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.media_data_crawler.entity.ZhiHuData;
/**
 * Manual smoke test and usage demo for the {@code DataCrawler} entry points.
 *
 * <p>Only the Sogou Zhihu call is active; the other crawlers are kept as
 * commented-out usage examples.
 */
public class DataCrawlerTest {
    /**
     * Runs the Sogou Zhihu crawl for a fixed keyword and prints how many items came back.
     *
     * <p>Declared {@code throws Exception} because the crawler methods touched by
     * this change set now declare {@code throws Exception}; letting the exception
     * escape makes the test fail with the original cause.
     * NOTE(review): confirm whether {@code DataCrawler} propagates or catches it.
     *
     * @throws Exception if the crawl fails
     */
    @Test
    public void getSoNewsTest() throws Exception {
        String word = "马云"; // search keyword
        // The time window is only used by the commented-out Baidu demo below.
        String startTime = "2017-03-01 00:00:00"; // start time
        String endTime = "2017-03-01 23:59:59"; // end time
        Proxy proxy = null; // proxy IP; leave null when no proxy is needed
        // Baidu news crawl demo
        // List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
        // // Sogou news keyword crawl demo
        // List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
        // // 360 news crawl demo
        // List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
        // Sogou Zhihu crawl
        List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
        System.out.println(zhihuList.size());
    }
}
//package com.zhiwei.media_data_crawler.test;
//
//import java.net.Proxy;
//import java.util.List;
//
//import org.junit.Test;
//
//import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
//import com.zhiwei.media_data_crawler.data.DataCrawler;
//import com.zhiwei.media_data_crawler.entity.NewsData;
//import com.zhiwei.media_data_crawler.entity.ZhiHuData;
//
//public class DataCrawlerTest {
//
//
//
//
//
// @Test
// public void getSoNewsTest(){
// String word = "马云"; //关键词
// String startTime = "2017-03-01 00:00:00"; //开始时间
// String endTime = "2017-03-01 23:59:59"; //结束时间
// Proxy proxy = null; //代理IP,不用可不填写
// //百度新闻采集demo
//// List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
//// //搜狗新闻关键词采集demo
//// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
//// //360新闻采集demo
//// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
//// //搜狗知乎采集
//// List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
//// System.out.println(zhihuList.size());
//
// }
//
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment