Commit 38bcf00d by zhiwei

添加自助翻页功能,如使用请添加休眠时间

parent 0930c2aa
...@@ -32,9 +32,10 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -32,9 +32,10 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy * @param @param proxy
* @param @return 设定文件 * @param @return 设定文件
* @return List<NewsData> 返回类型 * @return List<NewsData> 返回类型
* @throws Exception
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<NewsData> getSoNewsData(String word, Proxy proxy) { public static List<NewsData> getSoNewsData(String word, Proxy proxy) throws Exception {
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
int page = 1; int page = 1;
boolean more = true; boolean more = true;
...@@ -59,6 +60,19 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -59,6 +60,19 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
return list; return list;
} }
/**
 * Downloads one result page for the given keyword and hands the HTML to
 * {@code analysisData}.
 *
 * <p>Unlike the two-argument overload, this variant does not loop over pages
 * itself; the caller picks the page number. NOTE: the commit that introduced
 * this method asks callers to add a sleep between successive page requests.
 *
 * @param word  search keyword
 * @param proxy proxy passed through to the download and parse steps
 * @param page  page number to fetch
 * @return whatever {@code analysisData} returns for the downloaded page, or
 *         {@code null} when the download produced no body
 * @throws Exception propagated from the download or parse step
 */
public static Map<String,Object> getSoNewsData(String word, Proxy proxy, int page) throws Exception{
    final String pageHtml = downloadHtml(word, "news", proxy, page);
    // Nothing was downloaded: signal "no data" with null.
    if (pageHtml == null) {
        return null;
    }
    return analysisData(pageHtml, proxy, word);
}
/** /**
* @Title: getSoNewsDataByTitle * @Title: getSoNewsDataByTitle
* @author hero * @author hero
...@@ -69,7 +83,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -69,7 +83,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @return List<NewsData> 返回类型 * @return List<NewsData> 返回类型
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<NewsData> getSoNewsDataByTitle(String word, Proxy proxy) { public static List<NewsData> getSoNewsDataByTitle(String word, Proxy proxy) throws Exception{
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
int page = 1; int page = 1;
boolean more = true; boolean more = true;
...@@ -104,7 +118,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -104,7 +118,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件 * @param @return 设定文件
* @return String 返回类型 * @return String 返回类型
*/ */
private static String downloadHtml(String word, String tn, Proxy proxy, int page) { private static String downloadHtml(String word, String tn, Proxy proxy, int page)throws Exception {
// 获取通用请求头 // 获取通用请求头
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址 // 获取链接地址
...@@ -138,7 +152,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -138,7 +152,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* 设定文件 * 设定文件
* @return Map<String,Object> 返回类型 * @return Map<String,Object> 返回类型
*/ */
private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) { private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String, Object> resultMap = new HashMap<String, Object>(); Map<String, Object> resultMap = new HashMap<String, Object>();
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
boolean more = true; boolean more = true;
...@@ -199,7 +213,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -199,7 +213,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件 * @param @return 设定文件
* @return Map<String,Object> 返回类型 * @return Map<String,Object> 返回类型
*/ */
private static Map<String, Object> analysisDataByTitle(String htmlBody, Proxy proxy, String word) { private static Map<String, Object> analysisDataByTitle(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String, Object> resultMap = new HashMap<String, Object>(); Map<String, Object> resultMap = new HashMap<String, Object>();
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
boolean more = true; boolean more = true;
......
...@@ -35,9 +35,10 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -35,9 +35,10 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy * @param @param proxy
* @param @return 设定文件 * @param @return 设定文件
* @return List<NewsData> 返回类型 * @return List<NewsData> 返回类型
* @throws Exception
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<NewsData> getSougouNewsData(String word, Proxy proxy){ public static List<NewsData> getSougouNewsData(String word, Proxy proxy) throws Exception{
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
int page = 1; int page = 1;
boolean more = true; boolean more = true;
...@@ -62,8 +63,18 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -62,8 +63,18 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
} }
/**
 * Downloads one result page for the given keyword and hands the HTML to
 * {@code analysisData}.
 *
 * <p>The page is requested with mode {@code 1}; the meaning of that mode is
 * defined by {@code downloadHtml} and is not visible here. The caller picks
 * the page number, so paging (and any delay between requests) is the caller's
 * responsibility.
 *
 * @param word  search keyword
 * @param proxy proxy passed through to the download and parse steps
 * @param page  page number to fetch
 * @return whatever {@code analysisData} returns for the downloaded page, or
 *         {@code null} when the downloaded body is {@code null} or empty
 * @throws Exception propagated from the download or parse step
 */
public static Map<String,Object> getSougouNewsData(String word, Proxy proxy, int page) throws Exception{
    final String pageHtml = downloadHtml(word, 1, proxy, page);
    // A missing or empty body means there is nothing to parse.
    if (pageHtml == null || pageHtml.isEmpty()) {
        return null;
    }
    return analysisData(pageHtml, proxy, word);
}
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<NewsData> getSougouNewsDataByTitle(String word, Proxy proxy){ public static List<NewsData> getSougouNewsDataByTitle(String word, Proxy proxy)throws Exception{
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
int page = 0; int page = 0;
boolean more = true; boolean more = true;
...@@ -99,7 +110,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -99,7 +110,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件 * @param @return 设定文件
* @return String 返回类型 * @return String 返回类型
*/ */
private static String downloadHtml(String word, int mode, Proxy proxy, int page) { private static String downloadHtml(String word, int mode, Proxy proxy, int page) throws Exception{
//获取通用请求头 //获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址 //获取链接地址
...@@ -118,7 +129,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -118,7 +129,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
return null; return null;
} }
private static String downloadHtml(String url, Proxy proxy, int page) { private static String downloadHtml(String url, Proxy proxy, int page) throws Exception{
//获取通用请求头 //获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址 //获取链接地址
...@@ -149,7 +160,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -149,7 +160,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件 * @param @return 设定文件
* @return Map<String,Object> 返回类型 * @return Map<String,Object> 返回类型
*/ */
private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word) { private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String,Object> resultMap = new HashMap<String,Object>(); Map<String,Object> resultMap = new HashMap<String,Object>();
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
boolean more = true; boolean more = true;
...@@ -241,7 +252,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -241,7 +252,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @return List<NewsData> 返回类型 * @return List<NewsData> 返回类型
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<NewsData> getOherSougouNewsData(String url, String word, Proxy proxy){ public static List<NewsData> getOherSougouNewsData(String url, String word, Proxy proxy)throws Exception{
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
int page = 1; int page = 1;
boolean more = true; boolean more = true;
......
...@@ -38,9 +38,10 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -38,9 +38,10 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy * @param @param proxy
* @param @return 设定文件 * @param @return 设定文件
* @return List<ZhiHuData> 返回类型 * @return List<ZhiHuData> 返回类型
* @throws Exception
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<ZhiHuData> getSougouZhihuData(String word, Proxy proxy){ public static List<ZhiHuData> getSougouZhihuData(String word, Proxy proxy) throws Exception{
List<ZhiHuData> list = new ArrayList<ZhiHuData>(); List<ZhiHuData> list = new ArrayList<ZhiHuData>();
int page = 1; int page = 1;
boolean more = true; boolean more = true;
...@@ -66,6 +67,13 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -66,6 +67,13 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
} }
/**
 * Downloads one result page for the given keyword and hands the HTML to
 * {@code analysisData}.
 *
 * <p>The caller picks the page number, so paging (and any delay between
 * requests) is the caller's responsibility.
 *
 * @param word  search keyword
 * @param proxy proxy passed through to the download and parse steps
 * @param page  page number to fetch
 * @return whatever {@code analysisData} returns for the downloaded page, or
 *         {@code null} when the downloaded body is {@code null} or empty
 * @throws Exception propagated from the download or parse step
 */
public static Map<String,Object> getSougouZhihuData(String word, Proxy proxy, int page) throws Exception{
    final String pageHtml = downloadHtml(word, proxy, page);
    final boolean nothingDownloaded = pageHtml == null || pageHtml.isEmpty();
    return nothingDownloaded ? null : analysisData(pageHtml, proxy, word);
}
/** /**
* *
...@@ -79,7 +87,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -79,7 +87,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件 * @param @return 设定文件
* @return String 返回类型 * @return String 返回类型
*/ */
private static String downloadHtml(String word, Proxy proxy, int page) { private static String downloadHtml(String word, Proxy proxy, int page) throws Exception{
//获取通用请求头 //获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址 //获取链接地址
...@@ -97,7 +105,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -97,7 +105,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
return null; return null;
} }
private static String downloadHtml(String url, Proxy proxy, String type) { private static String downloadHtml(String url, Proxy proxy, String type) throws Exception{
//获取通用请求头 //获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址 //获取链接地址
...@@ -131,7 +139,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -131,7 +139,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件 * @param @return 设定文件
* @return Map<String,Object> 返回类型 * @return Map<String,Object> 返回类型
*/ */
private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word) { private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word)throws Exception {
Map<String,Object> resultMap = new HashMap<String,Object>(); Map<String,Object> resultMap = new HashMap<String,Object>();
List<ZhiHuData> list = new ArrayList<ZhiHuData>(); List<ZhiHuData> list = new ArrayList<ZhiHuData>();
boolean more = true; boolean more = true;
...@@ -203,7 +211,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -203,7 +211,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件 * @param @return 设定文件
* @return ZhiHuData 返回类型 * @return ZhiHuData 返回类型
*/ */
private static ZhiHuData analysisZhihuAnswer(String url, Proxy proxy,ZhiHuData zhihu){ private static ZhiHuData analysisZhihuAnswer(String url, Proxy proxy,ZhiHuData zhihu)throws Exception{
try { try {
String htmlBody = downloadHtml(url, proxy, "问答"); String htmlBody = downloadHtml(url, proxy, "问答");
if(htmlBody != null){ if(htmlBody != null){
...@@ -239,7 +247,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -239,7 +247,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @return ZhiHuData 返回类型 * @return ZhiHuData 返回类型
*/ */
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
private static ZhiHuData analysisZhihuArticle(String url, Proxy proxy, ZhiHuData zhihu){ private static ZhiHuData analysisZhihuArticle(String url, Proxy proxy, ZhiHuData zhihu)throws Exception{
try { try {
String htmlBody = downloadHtml(url, proxy, "文章"); String htmlBody = downloadHtml(url, proxy, "文章");
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
......
package com.zhiwei.media_data_crawler.test; //package com.zhiwei.media_data_crawler.test;
//
import java.net.Proxy; //import java.net.Proxy;
import java.util.List; //import java.util.List;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse; //import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
import com.zhiwei.media_data_crawler.data.DataCrawler; //import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData; //import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.media_data_crawler.entity.ZhiHuData; //import com.zhiwei.media_data_crawler.entity.ZhiHuData;
//
public class DataCrawlerTest { //public class DataCrawlerTest {
//
//
//
//
//
@Test // @Test
public void getSoNewsTest(){ // public void getSoNewsTest(){
String word = "马云"; //关键词 // String word = "马云"; //关键词
String startTime = "2017-03-01 00:00:00"; //开始时间 // String startTime = "2017-03-01 00:00:00"; //开始时间
String endTime = "2017-03-01 23:59:59"; //结束时间 // String endTime = "2017-03-01 23:59:59"; //结束时间
Proxy proxy = null; //代理IP,不用可不填写 // Proxy proxy = null; //代理IP,不用可不填写
//百度新闻采集demo // //百度新闻采集demo
// List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy); //// List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
// //搜狗新闻关键词采集demo //// //搜狗新闻关键词采集demo
// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy); //// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
// //360新闻采集demo //// //360新闻采集demo
// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy); //// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
//搜狗知乎采集 //// //搜狗知乎采集
List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy); //// List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
System.out.println(zhihuList.size()); //// System.out.println(zhihuList.size());
//
} // }
//
} //}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment