Commit 38bcf00d by zhiwei

添加自助翻页功能,如使用请添加休眠时间

parent 0930c2aa
...@@ -27,35 +27,40 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -27,35 +27,40 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
private static Logger logger = LoggerFactory.getLogger(BaiduNewsCrawlerParse.class); private static Logger logger = LoggerFactory.getLogger(BaiduNewsCrawlerParse.class);
private static final String pt = "百度新闻"; private static final String pt = "百度新闻";
/** /**
* @Title: getBaiduNewsData * @Title: getBaiduNewsData
* @author hero * @author hero
* @Description: 采集百度新闻数据 * @Description: 采集百度新闻数据
* @param @param word * @param @param
* @param @param startTime * word
* @param @param endTime * @param @param
* @param @param proxy * startTime
* @param @return 设定文件 * @param @param
* endTime
* @param @param
* proxy
* @param @return
* 设定文件
* @return List<NewsData> 返回类型 * @return List<NewsData> 返回类型
* @throws Exception
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<NewsData> getBaiduNewsData(String word, String startTime, String endTime, Proxy proxy){ public static List<NewsData> getBaiduNewsData(String word, String startTime, String endTime, Proxy proxy) throws Exception {
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
int page = 0; int page = 0;
boolean more = true; boolean more = true;
while(more){ while (more) {
//最大页数为20 // 最大页数为20
if(page>20){ if (page > 20) {
more = false; more = false;
} }
String htmlBody = downloadHtml(word, startTime, endTime, proxy, "newsdy", page); String htmlBody = downloadHtml(word, startTime, endTime, proxy, "newsdy", page);
if(htmlBody != null){ if (htmlBody != null) {
Map<String,Object> dataMap = analysisData(htmlBody, proxy, word); Map<String, Object> dataMap = analysisData(htmlBody, proxy, word);
List<NewsData> dataList = (List<NewsData>)dataMap.get("data"); List<NewsData> dataList = (List<NewsData>) dataMap.get("data");
list.addAll(dataList); list.addAll(dataList);
more = (Boolean)dataMap.get("more"); more = (Boolean) dataMap.get("more");
}else{ } else {
more = false; more = false;
} }
page++; page++;
...@@ -65,33 +70,61 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -65,33 +70,61 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
} }
/** /**
* @Title: getBaiduNewsDataByTitle * @Title: getBaiduNewsData
* @author hero * @author hero
* @Description: 采集百度新闻数据,根据标题匹配 * @Description: 根据关键词获取数据
* @param @param word * @param @param word
* @param @param startTime * @param @param startTime
* @param @param endTime * @param @param endTime
* @param @param proxy * @param @param proxy
* @param @return 设定文件 * @param @param page
* @param @return
* @param @throws Exception 设定文件
* @return Map<String,Object> 返回类型
*/
public static Map<String, Object> getBaiduNewsData(String word, String startTime, String endTime, Proxy proxy,
        int page) throws Exception {
    // Fetch exactly one result page with tn=newsdy (full-text matching); paging is left to the caller.
    final String pageHtml = downloadHtml(word, startTime, endTime, proxy, "newsdy", page);
    // A null body means the download produced nothing, so there is nothing to parse.
    if (pageHtml == null) {
        return null;
    }
    return analysisData(pageHtml, proxy, word);
}
/**
* @Title: getBaiduNewsDataByTitle
* @author hero
* @Description: 采集百度新闻数据,根据标题匹配
* @param @param
* word
* @param @param
* startTime
* @param @param
* endTime
* @param @param
* proxy
* @param @return
* 设定文件
* @return List<NewsData> 返回类型 * @return List<NewsData> 返回类型
* @throws Exception
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<NewsData> getBaiduNewsDataByTitle(String word, String startTime, String endTime, Proxy proxy){ public static List<NewsData> getBaiduNewsDataByTitle(String word, String startTime, String endTime, Proxy proxy) throws Exception {
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
int page = 0; int page = 0;
boolean more = true; boolean more = true;
while(more){ while (more) {
//最大页数为20 // 最大页数为20
if(page>20){ if (page > 20) {
more = false; more = false;
} }
String htmlBody = downloadHtml(word, startTime, endTime, proxy, "newstitle", page); String htmlBody = downloadHtml(word, startTime, endTime, proxy, "newstitle", page);
if(htmlBody != null){ if (htmlBody != null) {
Map<String,Object> dataMap = analysisData(htmlBody, proxy, word); Map<String, Object> dataMap = analysisData(htmlBody, proxy, word);
List<NewsData> dataList = (List<NewsData>)dataMap.get("data"); List<NewsData> dataList = (List<NewsData>) dataMap.get("data");
list.addAll(dataList); list.addAll(dataList);
more = (Boolean)dataMap.get("more"); more = (Boolean) dataMap.get("more");
}else{ } else {
more = false; more = false;
} }
page++; page++;
...@@ -100,30 +133,36 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -100,30 +133,36 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
return list; return list;
} }
/** /**
* @Title: downloadHtml * @Title: downloadHtml
* @author hero * @author hero
* @Description: 获取数据流 * @Description: 获取数据流
* @param @param word * @param @param
* @param @param startTime * word
* @param @param endTime * @param @param
* @param @param proxy * startTime
* @param @param tn (tn=newsdy 为 全文匹配, tn=newstitle 为标题匹配) * @param @param
* @param @param page * endTime
* @param @return 设定文件 * @param @param
* proxy
* @param @param
* tn (tn=newsdy 为 全文匹配, tn=newstitle 为标题匹配)
* @param @param
* page
* @param @return
* 设定文件
* @return String 返回类型 * @return String 返回类型
*/ */
private static String downloadHtml(String word, String startTime, String endTime, Proxy proxy, String tn,int page) { private static String downloadHtml(String word, String startTime, String endTime, Proxy proxy, String tn,
//获取通用请求头 int page) throws Exception{
Map<String,String> headerMap = HeaderTool.getCommonHead(); // 获取通用请求头
//获取链接地址 Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址
String url = getUrl(word, startTime, endTime, tn, page); String url = getUrl(word, startTime, endTime, tn, page);
headerMap.put("Host", "news.baidu.com"); headerMap.put("Host", "news.baidu.com");
headerMap.put("Referer", url); headerMap.put("Referer", url);
//下载数据页面 // 下载数据页面
for(int i = 1; i<=3; i++){ for (int i = 1; i <= 3; i++) {
try { try {
return get(url, proxy, headerMap); return get(url, proxy, headerMap);
} catch (IOException e) { } catch (IOException e) {
...@@ -134,15 +173,15 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -134,15 +173,15 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
return null; return null;
} }
private static String downloadHtml(String url, Proxy proxy, int page) { private static String downloadHtml(String url, Proxy proxy, int page) throws Exception{
//获取通用请求头 // 获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
//获取链接地址 // 获取链接地址
url = url + "&pn="+page*30; url = url + "&pn=" + page * 30;
headerMap.put("Host", "news.baidu.com"); headerMap.put("Host", "news.baidu.com");
headerMap.put("Referer", url); headerMap.put("Referer", url);
//下载数据页面 // 下载数据页面
for(int i = 1; i<=3; i++){ for (int i = 1; i <= 3; i++) {
try { try {
return get(url, proxy, headerMap); return get(url, proxy, headerMap);
} catch (IOException e) { } catch (IOException e) {
...@@ -153,37 +192,36 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -153,37 +192,36 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
return null; return null;
} }
/** /**
* @Title: analysisData * @Title: analysisData
* @author hero * @author hero
* @Description: 解析百度新闻数据 * @Description: 解析百度新闻数据
* @param @param htmlBody * @param @param
* @param @param proxy * htmlBody
* @param @param word * @param @param
* @param @return 设定文件 * proxy
* @param @param
* word
* @param @return
* 设定文件
* @return Map<String,Object> 返回类型 * @return Map<String,Object> 返回类型
*/ */
private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word) { private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String,Object> resultMap = new HashMap<String,Object>(); Map<String, Object> resultMap = new HashMap<String, Object>();
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
boolean more = true; boolean more = true;
/** 解析页面 */ /** 解析页面 */
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
/**判断是否有下一页**/ /** 判断是否有下一页 **/
if(document.select("p#page") == null) if (document.select("p#page") == null) {
{
more = false; more = false;
}else } else {
{ if (!document.select("p#page").text().contains("下一页")) {
if(!document.select("p#page").text().contains("下一页"))
{
more = false; more = false;
} }
} }
//开始解析 // 开始解析
Elements elementes = document.select("div.result"); Elements elementes = document.select("div.result");
String time = null; String time = null;
String source = null; String source = null;
...@@ -195,13 +233,12 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -195,13 +233,12 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
String content = null; String content = null;
Pattern pattern = null; Pattern pattern = null;
Matcher matcher = null; Matcher matcher = null;
for (Element element : elementes) for (Element element : elementes) {
{
try { try {
link = element.select("h3.c-title").select("a").attr("href"); link = element.select("h3.c-title").select("a").attr("href");
title = element.select("h3.c-title").select("a").text(); title = element.select("h3.c-title").select("a").text();
soureAndtime = element.select("div.c-row").select("p.c-author").html(); soureAndtime = element.select("div.c-row").select("p.c-author").html();
/**截取时间*/ /** 截取时间 */
if (soureAndtime.contains("&nbsp;&nbsp;")) { if (soureAndtime.contains("&nbsp;&nbsp;")) {
String soureAndtimes[] = soureAndtime.split("&nbsp;&nbsp;"); String soureAndtimes[] = soureAndtime.split("&nbsp;&nbsp;");
time = soureAndtimes[1]; time = soureAndtimes[1];
...@@ -209,10 +246,10 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -209,10 +246,10 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
} else { } else {
time = element.select("div.c-row").select("p.c-author").text(); time = element.select("div.c-row").select("p.c-author").text();
} }
/**文章发布时间处理**/ /** 文章发布时间处理 **/
time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss") ; time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");
// 处理文章简介 // 处理文章简介
if(element.select("div.c-row")!=null){ if (element.select("div.c-row") != null) {
descript = element.select("div.c-row").text(); descript = element.select("div.c-row").text();
soureAndtimeText = element.select("div.c-row").select("p.c-author").text(); soureAndtimeText = element.select("div.c-row").select("p.c-author").text();
content = descript.substring(soureAndtimeText.length(), descript.length()); content = descript.substring(soureAndtimeText.length(), descript.length());
...@@ -220,14 +257,14 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -220,14 +257,14 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
matcher = pattern.matcher(content); matcher = pattern.matcher(content);
content = matcher.replaceAll("").replace("-", "").replace("百度快照", ""); content = matcher.replaceAll("").replace("-", "").replace("百度快照", "");
} }
//添加到数据集合中 // 添加到数据集合中
NewsData newsData = new NewsData(link, title, source, time, content, pt, word); NewsData newsData = new NewsData(link, title, source, time, content, pt, word);
list.add(newsData); list.add(newsData);
/**采集相同新闻链接**/ /** 采集相同新闻链接 **/
String otherUrl = element.select("div.c-row").select("a.c-more_link").attr("href"); String otherUrl = element.select("div.c-row").select("a.c-more_link").attr("href");
if(otherUrl!=null && !otherUrl.equals("")) if (otherUrl != null && !otherUrl.equals("")) {
{ String otherLink = "http://news.baidu.com"
String otherLink = "http://news.baidu.com" + element.select("div.c-row").select("a.c-more_link").attr("href"); + element.select("div.c-row").select("a.c-more_link").attr("href");
List<NewsData> otherDataList = getOherBaiduNewsData(otherLink, word, proxy); List<NewsData> otherDataList = getOherBaiduNewsData(otherLink, word, proxy);
list.addAll(otherDataList); list.addAll(otherDataList);
ZhiWeiTools.sleep(100); ZhiWeiTools.sleep(100);
...@@ -244,34 +281,37 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -244,34 +281,37 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
return resultMap; return resultMap;
} }
/** /**
* @Title: getOherBaiduNewsData * @Title: getOherBaiduNewsData
* @author hero * @author hero
* @Description: 解析相似新闻 * @Description: 解析相似新闻
* @param @param url * @param @param
* @param @param word * url
* @param @param proxy * @param @param
* @param @return 设定文件 * word
* @param @param
* proxy
* @param @return
* 设定文件
* @return List<NewsData> 返回类型 * @return List<NewsData> 返回类型
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<NewsData> getOherBaiduNewsData(String url, String word, Proxy proxy){ public static List<NewsData> getOherBaiduNewsData(String url, String word, Proxy proxy) throws Exception{
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
int page = 0; int page = 0;
boolean more = true; boolean more = true;
while(more){ while (more) {
//最大页数为20 // 最大页数为20
if(page>20){ if (page > 20) {
more = false; more = false;
} }
String htmlBody = downloadHtml(url, proxy, page); String htmlBody = downloadHtml(url, proxy, page);
if(htmlBody != null){ if (htmlBody != null) {
Map<String,Object> dataMap = analysisData(htmlBody, null, word); Map<String, Object> dataMap = analysisData(htmlBody, null, word);
List<NewsData> dataList = (List<NewsData>)dataMap.get("data"); List<NewsData> dataList = (List<NewsData>) dataMap.get("data");
list.addAll(dataList); list.addAll(dataList);
more = (Boolean)dataMap.get("more"); more = (Boolean) dataMap.get("more");
}else{ } else {
more = false; more = false;
} }
page++; page++;
...@@ -279,31 +319,36 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -279,31 +319,36 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
return list; return list;
} }
/** /**
* @Title: getUrl * @Title: getUrl
* @author hero * @author hero
* @Description: 获取链接 * @Description: 获取链接
* @param @param word * @param @param
* @param @param startTime * word
* @param @param endTime * @param @param
* @param @param page * startTime
* @param @return 设定文件 * @param @param
* endTime
* @param @param
* page
* @param @return
* 设定文件
* @return String 返回类型 * @return String 返回类型
*/ */
private static String getUrl(String word, String startTime, String endTime, String tn, int page){ private static String getUrl(String word, String startTime, String endTime, String tn, int page) {
long bt = 0; long bt = 0;
long et = 0; long et = 0;
String url = null; String url = null;
if(startTime!=null){ if (startTime != null) {
bt = TimeParse.stringFormartDate(startTime).getTime()/1000; bt = TimeParse.stringFormartDate(startTime).getTime() / 1000;
} }
if(endTime!=null){ if (endTime != null) {
et = TimeParse.stringFormartDate(endTime).getTime()/1000; et = TimeParse.stringFormartDate(endTime).getTime() / 1000;
} }
if(word!=null){ if (word != null) {
url = "http://news.baidu.com/ns?from=news&cl=2&bt=" + bt url = "http://news.baidu.com/ns?from=news&cl=2&bt=" + bt + "&et=" + et + "&q1="
+ "&et=" + et + "&q1=" +URLCodeUtil.getURLEncode(word, "utf-8") + "&q3=&q4=&tn="+ tn +"&ct=0&rn=50&clk=sortbytime&q6=&pn=" + page * 50; + URLCodeUtil.getURLEncode(word, "utf-8") + "&q3=&q4=&tn=" + tn
+ "&ct=0&rn=50&clk=sortbytime&q6=&pn=" + page * 50;
} }
return url; return url;
} }
......
...@@ -32,9 +32,10 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -32,9 +32,10 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy * @param @param proxy
* @param @return 设定文件 * @param @return 设定文件
* @return List<NewsData> 返回类型 * @return List<NewsData> 返回类型
* @throws Exception
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<NewsData> getSoNewsData(String word, Proxy proxy) { public static List<NewsData> getSoNewsData(String word, Proxy proxy) throws Exception {
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
int page = 1; int page = 1;
boolean more = true; boolean more = true;
...@@ -59,6 +60,19 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -59,6 +60,19 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
return list; return list;
} }
/**
 * Downloads a single result page for the keyword and parses it.
 *
 * @param word  search keyword
 * @param proxy proxy handed to both the download and the parse step
 * @param page  page number passed through to {@code downloadHtml}
 * @return the map built by {@code analysisData}, or {@code null} when no page body was downloaded
 * @throws Exception propagated from the download or parse step
 */
public static Map<String, Object> getSoNewsData(String word, Proxy proxy, int page) throws Exception {
    final String pageHtml = downloadHtml(word, "news", proxy, page);
    // No body downloaded: signal it to the caller with null instead of parsing.
    if (pageHtml == null) {
        return null;
    }
    return analysisData(pageHtml, proxy, word);
}
/** /**
* @Title: getSoNewsDataByTitle * @Title: getSoNewsDataByTitle
* @author hero * @author hero
...@@ -69,7 +83,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -69,7 +83,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @return List<NewsData> 返回类型 * @return List<NewsData> 返回类型
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<NewsData> getSoNewsDataByTitle(String word, Proxy proxy) { public static List<NewsData> getSoNewsDataByTitle(String word, Proxy proxy) throws Exception{
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
int page = 1; int page = 1;
boolean more = true; boolean more = true;
...@@ -104,7 +118,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -104,7 +118,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件 * @param @return 设定文件
* @return String 返回类型 * @return String 返回类型
*/ */
private static String downloadHtml(String word, String tn, Proxy proxy, int page) { private static String downloadHtml(String word, String tn, Proxy proxy, int page)throws Exception {
// 获取通用请求头 // 获取通用请求头
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址 // 获取链接地址
...@@ -138,7 +152,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -138,7 +152,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* 设定文件 * 设定文件
* @return Map<String,Object> 返回类型 * @return Map<String,Object> 返回类型
*/ */
private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) { private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String, Object> resultMap = new HashMap<String, Object>(); Map<String, Object> resultMap = new HashMap<String, Object>();
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
boolean more = true; boolean more = true;
...@@ -199,7 +213,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -199,7 +213,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件 * @param @return 设定文件
* @return Map<String,Object> 返回类型 * @return Map<String,Object> 返回类型
*/ */
private static Map<String, Object> analysisDataByTitle(String htmlBody, Proxy proxy, String word) { private static Map<String, Object> analysisDataByTitle(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String, Object> resultMap = new HashMap<String, Object>(); Map<String, Object> resultMap = new HashMap<String, Object>();
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
boolean more = true; boolean more = true;
......
...@@ -35,9 +35,10 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -35,9 +35,10 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy * @param @param proxy
* @param @return 设定文件 * @param @return 设定文件
* @return List<NewsData> 返回类型 * @return List<NewsData> 返回类型
* @throws Exception
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<NewsData> getSougouNewsData(String word, Proxy proxy){ public static List<NewsData> getSougouNewsData(String word, Proxy proxy) throws Exception{
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
int page = 1; int page = 1;
boolean more = true; boolean more = true;
...@@ -62,8 +63,18 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -62,8 +63,18 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
} }
/**
 * Downloads a single result page for the keyword and parses it.
 *
 * @param word  search keyword
 * @param proxy proxy handed to both the download and the parse step
 * @param page  page number passed through to {@code downloadHtml}
 * @return the map built by {@code analysisData}, or {@code null} when the downloaded body is
 *         {@code null} or empty
 * @throws Exception propagated from the download or parse step
 */
public static Map<String, Object> getSougouNewsData(String word, Proxy proxy, int page) throws Exception {
    // NOTE(review): mode 1 is passed as-is; its meaning is defined by downloadHtml — confirm there.
    final String pageHtml = downloadHtml(word, 1, proxy, page);
    // Treat a missing and an empty body the same way: nothing to parse.
    if (pageHtml == null || pageHtml.isEmpty()) {
        return null;
    }
    return analysisData(pageHtml, proxy, word);
}
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<NewsData> getSougouNewsDataByTitle(String word, Proxy proxy){ public static List<NewsData> getSougouNewsDataByTitle(String word, Proxy proxy)throws Exception{
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
int page = 0; int page = 0;
boolean more = true; boolean more = true;
...@@ -99,7 +110,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -99,7 +110,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件 * @param @return 设定文件
* @return String 返回类型 * @return String 返回类型
*/ */
private static String downloadHtml(String word, int mode, Proxy proxy, int page) { private static String downloadHtml(String word, int mode, Proxy proxy, int page) throws Exception{
//获取通用请求头 //获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址 //获取链接地址
...@@ -118,7 +129,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -118,7 +129,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
return null; return null;
} }
private static String downloadHtml(String url, Proxy proxy, int page) { private static String downloadHtml(String url, Proxy proxy, int page) throws Exception{
//获取通用请求头 //获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址 //获取链接地址
...@@ -149,7 +160,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -149,7 +160,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件 * @param @return 设定文件
* @return Map<String,Object> 返回类型 * @return Map<String,Object> 返回类型
*/ */
private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word) { private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String,Object> resultMap = new HashMap<String,Object>(); Map<String,Object> resultMap = new HashMap<String,Object>();
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
boolean more = true; boolean more = true;
...@@ -241,7 +252,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -241,7 +252,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @return List<NewsData> 返回类型 * @return List<NewsData> 返回类型
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<NewsData> getOherSougouNewsData(String url, String word, Proxy proxy){ public static List<NewsData> getOherSougouNewsData(String url, String word, Proxy proxy)throws Exception{
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<NewsData>();
int page = 1; int page = 1;
boolean more = true; boolean more = true;
......
...@@ -38,9 +38,10 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -38,9 +38,10 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy * @param @param proxy
* @param @return 设定文件 * @param @return 设定文件
* @return List<ZhiHuData> 返回类型 * @return List<ZhiHuData> 返回类型
* @throws Exception
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<ZhiHuData> getSougouZhihuData(String word, Proxy proxy){ public static List<ZhiHuData> getSougouZhihuData(String word, Proxy proxy) throws Exception{
List<ZhiHuData> list = new ArrayList<ZhiHuData>(); List<ZhiHuData> list = new ArrayList<ZhiHuData>();
int page = 1; int page = 1;
boolean more = true; boolean more = true;
...@@ -66,6 +67,13 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -66,6 +67,13 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
} }
/**
 * Downloads a single result page for the keyword and parses it.
 *
 * @param word  search keyword
 * @param proxy proxy handed to both the download and the parse step
 * @param page  page number passed through to {@code downloadHtml}
 * @return the map built by {@code analysisData}, or {@code null} when the downloaded body is
 *         {@code null} or empty
 * @throws Exception propagated from the download or parse step
 */
public static Map<String, Object> getSougouZhihuData(String word, Proxy proxy, int page) throws Exception {
    final String pageHtml = downloadHtml(word, proxy, page);
    // Treat a missing and an empty body the same way: nothing to parse.
    if (pageHtml == null || pageHtml.isEmpty()) {
        return null;
    }
    return analysisData(pageHtml, proxy, word);
}
/** /**
* *
...@@ -79,7 +87,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -79,7 +87,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件 * @param @return 设定文件
* @return String 返回类型 * @return String 返回类型
*/ */
private static String downloadHtml(String word, Proxy proxy, int page) { private static String downloadHtml(String word, Proxy proxy, int page) throws Exception{
//获取通用请求头 //获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址 //获取链接地址
...@@ -97,7 +105,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -97,7 +105,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
return null; return null;
} }
private static String downloadHtml(String url, Proxy proxy, String type) { private static String downloadHtml(String url, Proxy proxy, String type) throws Exception{
//获取通用请求头 //获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址 //获取链接地址
...@@ -131,7 +139,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -131,7 +139,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件 * @param @return 设定文件
* @return Map<String,Object> 返回类型 * @return Map<String,Object> 返回类型
*/ */
private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word) { private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word)throws Exception {
Map<String,Object> resultMap = new HashMap<String,Object>(); Map<String,Object> resultMap = new HashMap<String,Object>();
List<ZhiHuData> list = new ArrayList<ZhiHuData>(); List<ZhiHuData> list = new ArrayList<ZhiHuData>();
boolean more = true; boolean more = true;
...@@ -203,7 +211,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -203,7 +211,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件 * @param @return 设定文件
* @return ZhiHuData 返回类型 * @return ZhiHuData 返回类型
*/ */
private static ZhiHuData analysisZhihuAnswer(String url, Proxy proxy,ZhiHuData zhihu){ private static ZhiHuData analysisZhihuAnswer(String url, Proxy proxy,ZhiHuData zhihu)throws Exception{
try { try {
String htmlBody = downloadHtml(url, proxy, "问答"); String htmlBody = downloadHtml(url, proxy, "问答");
if(htmlBody != null){ if(htmlBody != null){
...@@ -239,7 +247,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK { ...@@ -239,7 +247,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @return ZhiHuData 返回类型 * @return ZhiHuData 返回类型
*/ */
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
private static ZhiHuData analysisZhihuArticle(String url, Proxy proxy, ZhiHuData zhihu){ private static ZhiHuData analysisZhihuArticle(String url, Proxy proxy, ZhiHuData zhihu)throws Exception{
try { try {
String htmlBody = downloadHtml(url, proxy, "文章"); String htmlBody = downloadHtml(url, proxy, "文章");
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
......
package com.zhiwei.media_data_crawler.test; //package com.zhiwei.media_data_crawler.test;
//
import java.net.Proxy; //import java.net.Proxy;
import java.util.List; //import java.util.List;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse; //import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
import com.zhiwei.media_data_crawler.data.DataCrawler; //import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData; //import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.media_data_crawler.entity.ZhiHuData; //import com.zhiwei.media_data_crawler.entity.ZhiHuData;
//
public class DataCrawlerTest { //public class DataCrawlerTest {
//
//
//
//
//
@Test // @Test
public void getSoNewsTest(){ // public void getSoNewsTest(){
String word = "马云"; //关键词 // String word = "马云"; //关键词
String startTime = "2017-03-01 00:00:00"; //开始时间 // String startTime = "2017-03-01 00:00:00"; //开始时间
String endTime = "2017-03-01 23:59:59"; //结束时间 // String endTime = "2017-03-01 23:59:59"; //结束时间
Proxy proxy = null; //代理IP,不用可不填写 // Proxy proxy = null; //代理IP,不用可不填写
//百度新闻采集demo // //百度新闻采集demo
// List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy); //// List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
// //搜狗新闻关键词采集demo //// //搜狗新闻关键词采集demo
// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy); //// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
// //360新闻采集demo //// //360新闻采集demo
// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy); //// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
//搜狗知乎采集 //// //搜狗知乎采集
List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy); //// List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
System.out.println(zhihuList.size()); //// System.out.println(zhihuList.size());
//
} // }
//
} //}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment