Commit 38bcf00d by zhiwei

添加自助翻页功能,如使用请添加休眠时间

parent 0930c2aa
......@@ -23,107 +23,146 @@ import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
private static Logger logger = LoggerFactory.getLogger(BaiduNewsCrawlerParse.class);
private static final String pt = "百度新闻";
/**
* @Title: getBaiduNewsData
* @author hero
* @Description: 采集百度新闻数据
* @param @param word
* @param @param startTime
* @param @param endTime
* @param @param proxy
* @param @return 设定文件
* @Title: getBaiduNewsData
* @author hero
* @Description: 采集百度新闻数据
* @param @param
* word
* @param @param
* startTime
* @param @param
* endTime
* @param @param
* proxy
* @param @return
* 设定文件
* @return List<NewsData> 返回类型
* @throws Exception
*/
@SuppressWarnings("unchecked")
public static List<NewsData> getBaiduNewsData(String word, String startTime, String endTime, Proxy proxy){
List<NewsData> list = new ArrayList<NewsData>();
int page = 0;
boolean more = true;
while(more){
//最大页数为20
if(page>20){
more = false;
}
String htmlBody = downloadHtml(word, startTime, endTime, proxy, "newsdy", page);
if(htmlBody != null){
Map<String,Object> dataMap = analysisData(htmlBody, proxy, word);
List<NewsData> dataList = (List<NewsData>)dataMap.get("data");
list.addAll(dataList);
more = (Boolean)dataMap.get("more");
}else{
more = false;
}
page++;
/**
 * Collects Baidu News results for a keyword using full-text matching
 * ({@code tn=newsdy}), walking the result pages in order from page index 0.
 *
 * <p>Paging stops when a page cannot be downloaded, when the parsed page
 * reports that there is no further page, or once page index 20 has been
 * fetched. After every fetched page the crawler pauses via
 * {@code ZhiWeiTools.sleep(3000)} (presumably milliseconds - confirm).
 *
 * @param word      keyword to search for
 * @param startTime start of the time window, forwarded to {@code downloadHtml}
 * @param endTime   end of the time window, forwarded to {@code downloadHtml}
 * @param proxy     proxy forwarded to the download and parse steps
 * @return every {@code NewsData} parsed from the fetched pages; never {@code null}
 * @throws Exception propagated from {@code downloadHtml} or {@code analysisData}
 */
public static List<NewsData> getBaiduNewsData(String word, String startTime, String endTime, Proxy proxy) throws Exception {
    List<NewsData> list = new ArrayList<NewsData>();
    int page = 0;
    boolean more = true;
    while (more) {
        // Hard cap on the page index. Merely setting more = false here is not
        // enough: the flag is reassigned from the parse result further down,
        // so the loop would keep paging for as long as a next page is reported.
        if (page > 20) {
            break;
        }
        String htmlBody = downloadHtml(word, startTime, endTime, proxy, "newsdy", page);
        if (htmlBody != null) {
            Map<String, Object> dataMap = analysisData(htmlBody, proxy, word);
            // analysisData stores the parsed items under "data" and the
            // next-page flag under "more".
            List<NewsData> dataList = (List<NewsData>) dataMap.get("data");
            list.addAll(dataList);
            more = (Boolean) dataMap.get("more");
        } else {
            // Download failed after its retries: stop paging.
            more = false;
        }
        page++;
        ZhiWeiTools.sleep(3000);
    }
    return list;
}
return list;
}
/**
* @Title: getBaiduNewsDataByTitle
* @Title: getBaiduNewsData
* @author hero
* @Description: 采集百度新闻数据,根据标题匹配
* @Description: 根据关键词获取数据
* @param @param word
* @param @param startTime
* @param @param endTime
* @param @param proxy
* @param @return 设定文件
* @param @param page
* @param @return
* @param @throws Exception 设定文件
* @return Map<String,Object> 返回类型
*/
/**
 * Fetches and parses a single Baidu News result page for a keyword using
 * full-text matching ({@code tn=newsdy}). Intended for callers that drive the
 * paging themselves; this method does not sleep between requests.
 *
 * @param word      keyword to search for
 * @param startTime start of the time window, forwarded to {@code downloadHtml}
 * @param endTime   end of the time window, forwarded to {@code downloadHtml}
 * @param proxy     proxy forwarded to the download and parse steps
 * @param page      zero-based page index (the URL builder turns it into the
 *                  offset {@code pn = page * 50})
 * @return the map produced by {@code analysisData} (parsed items under
 *         {@code "data"}, next-page flag under {@code "more"}), or
 *         {@code null} when the page could not be downloaded
 * @throws Exception propagated from {@code downloadHtml} or {@code analysisData}
 */
public static Map<String, Object> getBaiduNewsData(String word, String startTime, String endTime, Proxy proxy,
        int page) throws Exception {
    final String pageHtml = downloadHtml(word, startTime, endTime, proxy, "newsdy", page);
    // Nothing to parse when the download yielded no body.
    if (pageHtml == null) {
        return null;
    }
    return analysisData(pageHtml, proxy, word);
}
/**
* @Title: getBaiduNewsDataByTitle
* @author hero
* @Description: 采集百度新闻数据,根据标题匹配
* @param @param
* word
* @param @param
* startTime
* @param @param
* endTime
* @param @param
* proxy
* @param @return
* 设定文件
* @return List<NewsData> 返回类型
* @throws Exception
*/
@SuppressWarnings("unchecked")
public static List<NewsData> getBaiduNewsDataByTitle(String word, String startTime, String endTime, Proxy proxy){
List<NewsData> list = new ArrayList<NewsData>();
int page = 0;
boolean more = true;
while(more){
//最大页数为20
if(page>20){
more = false;
}
String htmlBody = downloadHtml(word, startTime, endTime, proxy, "newstitle", page);
if(htmlBody != null){
Map<String,Object> dataMap = analysisData(htmlBody, proxy, word);
List<NewsData> dataList = (List<NewsData>)dataMap.get("data");
list.addAll(dataList);
more = (Boolean)dataMap.get("more");
}else{
more = false;
}
page++;
ZhiWeiTools.sleep(3000);
}
return list;
/**
 * Collects Baidu News results whose title matches the keyword
 * ({@code tn=newstitle}), walking the result pages in order from page index 0.
 *
 * <p>Paging stops when a page cannot be downloaded, when the parsed page
 * reports that there is no further page, or once page index 20 has been
 * fetched. After every fetched page the crawler pauses via
 * {@code ZhiWeiTools.sleep(3000)} (presumably milliseconds - confirm).
 *
 * @param word      keyword to search for
 * @param startTime start of the time window, forwarded to {@code downloadHtml}
 * @param endTime   end of the time window, forwarded to {@code downloadHtml}
 * @param proxy     proxy forwarded to the download and parse steps
 * @return every {@code NewsData} parsed from the fetched pages; never {@code null}
 * @throws Exception propagated from {@code downloadHtml} or {@code analysisData}
 */
public static List<NewsData> getBaiduNewsDataByTitle(String word, String startTime, String endTime, Proxy proxy) throws Exception {
    List<NewsData> list = new ArrayList<NewsData>();
    int page = 0;
    boolean more = true;
    while (more) {
        // Hard cap on the page index. Merely setting more = false here is not
        // enough: the flag is reassigned from the parse result further down,
        // so the loop would keep paging for as long as a next page is reported.
        if (page > 20) {
            break;
        }
        String htmlBody = downloadHtml(word, startTime, endTime, proxy, "newstitle", page);
        if (htmlBody != null) {
            Map<String, Object> dataMap = analysisData(htmlBody, proxy, word);
            // analysisData stores the parsed items under "data" and the
            // next-page flag under "more".
            List<NewsData> dataList = (List<NewsData>) dataMap.get("data");
            list.addAll(dataList);
            more = (Boolean) dataMap.get("more");
        } else {
            // Download failed after its retries: stop paging.
            more = false;
        }
        page++;
        ZhiWeiTools.sleep(3000);
    }
    return list;
}
/**
* @Title: downloadHtml
* @author hero
* @Description: 获取数据流
* @param @param word
* @param @param startTime
* @param @param endTime
* @param @param proxy
* @param @param tn (tn=newsdy 为 全文匹配, tn=newstitle 为标题匹配)
* @param @param page
* @param @return 设定文件
* @Title: downloadHtml
* @author hero
* @Description: 获取数据流
* @param @param
* word
* @param @param
* startTime
* @param @param
* endTime
* @param @param
* proxy
* @param @param
* tn (tn=newsdy 为 全文匹配, tn=newstitle 为标题匹配)
* @param @param
* page
* @param @return
* 设定文件
* @return String 返回类型
*/
private static String downloadHtml(String word, String startTime, String endTime, Proxy proxy, String tn,int page) {
//获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址
private static String downloadHtml(String word, String startTime, String endTime, Proxy proxy, String tn,
int page) throws Exception{
// 获取通用请求头
Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址
String url = getUrl(word, startTime, endTime, tn, page);
headerMap.put("Host", "news.baidu.com");
headerMap.put("Referer", url);
//下载数据页面
for(int i = 1; i<=3; i++){
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
return get(url, proxy, headerMap);
} catch (IOException e) {
......@@ -133,16 +172,16 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
}
return null;
}
private static String downloadHtml(String url, Proxy proxy, int page) {
//获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址
url = url + "&pn="+page*30;
private static String downloadHtml(String url, Proxy proxy, int page) throws Exception{
// 获取通用请求头
Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址
url = url + "&pn=" + page * 30;
headerMap.put("Host", "news.baidu.com");
headerMap.put("Referer", url);
//下载数据页面
for(int i = 1; i<=3; i++){
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
return get(url, proxy, headerMap);
} catch (IOException e) {
......@@ -152,38 +191,37 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
}
return null;
}
/**
* @Title: analysisData
* @author hero
* @Description: 解析百度新闻数据
* @param @param htmlBody
* @param @param proxy
* @param @param word
* @param @return 设定文件
* @Title: analysisData
* @author hero
* @Description: 解析百度新闻数据
* @param @param
* htmlBody
* @param @param
* proxy
* @param @param
* word
* @param @return
* 设定文件
* @return Map<String,Object> 返回类型
*/
private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word) {
Map<String,Object> resultMap = new HashMap<String,Object>();
private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String, Object> resultMap = new HashMap<String, Object>();
List<NewsData> list = new ArrayList<NewsData>();
boolean more = true;
/** 解析页面 */
Document document = Jsoup.parse(htmlBody);
/**判断是否有下一页**/
if(document.select("p#page") == null)
{
/** 判断是否有下一页 **/
if (document.select("p#page") == null) {
more = false;
}else
{
if(!document.select("p#page").text().contains("下一页"))
{
} else {
if (!document.select("p#page").text().contains("下一页")) {
more = false;
}
}
//开始解析
// 开始解析
Elements elementes = document.select("div.result");
String time = null;
String source = null;
......@@ -195,13 +233,12 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
String content = null;
Pattern pattern = null;
Matcher matcher = null;
for (Element element : elementes)
{
for (Element element : elementes) {
try {
link = element.select("h3.c-title").select("a").attr("href");
title = element.select("h3.c-title").select("a").text();
soureAndtime = element.select("div.c-row").select("p.c-author").html();
/**截取时间*/
/** 截取时间 */
if (soureAndtime.contains("&nbsp;&nbsp;")) {
String soureAndtimes[] = soureAndtime.split("&nbsp;&nbsp;");
time = soureAndtimes[1];
......@@ -209,10 +246,10 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
} else {
time = element.select("div.c-row").select("p.c-author").text();
}
/**文章发布时间处理**/
time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss") ;
/** 文章发布时间处理 **/
time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");
// 处理文章简介
if(element.select("div.c-row")!=null){
if (element.select("div.c-row") != null) {
descript = element.select("div.c-row").text();
soureAndtimeText = element.select("div.c-row").select("p.c-author").text();
content = descript.substring(soureAndtimeText.length(), descript.length());
......@@ -220,14 +257,14 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
matcher = pattern.matcher(content);
content = matcher.replaceAll("").replace("-", "").replace("百度快照", "");
}
//添加到数据集合中
// 添加到数据集合中
NewsData newsData = new NewsData(link, title, source, time, content, pt, word);
list.add(newsData);
/**采集相同新闻链接**/
/** 采集相同新闻链接 **/
String otherUrl = element.select("div.c-row").select("a.c-more_link").attr("href");
if(otherUrl!=null && !otherUrl.equals(""))
{
String otherLink = "http://news.baidu.com" + element.select("div.c-row").select("a.c-more_link").attr("href");
if (otherUrl != null && !otherUrl.equals("")) {
String otherLink = "http://news.baidu.com"
+ element.select("div.c-row").select("a.c-more_link").attr("href");
List<NewsData> otherDataList = getOherBaiduNewsData(otherLink, word, proxy);
list.addAll(otherDataList);
ZhiWeiTools.sleep(100);
......@@ -240,70 +277,78 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
}
resultMap.put("data", list);
resultMap.put("more", more);
return resultMap;
}
/**
* @Title: getOherBaiduNewsData
* @author hero
* @Description: 解析相似新闻
* @param @param url
* @param @param word
* @param @param proxy
* @param @return 设定文件
* @Title: getOherBaiduNewsData
* @author hero
* @Description: 解析相似新闻
* @param @param
* url
* @param @param
* word
* @param @param
* proxy
* @param @return
* 设定文件
* @return List<NewsData> 返回类型
*/
@SuppressWarnings("unchecked")
public static List<NewsData> getOherBaiduNewsData(String url, String word, Proxy proxy){
List<NewsData> list = new ArrayList<NewsData>();
int page = 0;
boolean more = true;
while(more){
//最大页数为20
if(page>20){
more = false;
}
String htmlBody = downloadHtml(url, proxy, page);
if(htmlBody != null){
Map<String,Object> dataMap = analysisData(htmlBody, null, word);
List<NewsData> dataList = (List<NewsData>)dataMap.get("data");
list.addAll(dataList);
more = (Boolean)dataMap.get("more");
}else{
more = false;
}
page++;
}
return list;
/**
 * Collects the "similar news" listing behind a Baidu News "more" link,
 * walking its pages in order from page index 0.
 *
 * <p>Paging stops when a page cannot be downloaded, when the parsed page
 * reports that there is no further page, or once page index 20 has been
 * fetched. {@code analysisData} calls this method for every result that has
 * a "more" link, so an effective page cap is what keeps the total number of
 * requests bounded.
 *
 * @param url   listing URL; {@code downloadHtml} appends the paging offset
 *              ({@code &pn=page * 30})
 * @param word  keyword recorded on every parsed item
 * @param proxy proxy used for downloading the listing pages
 * @return every {@code NewsData} parsed from the fetched pages; never {@code null}
 * @throws Exception propagated from {@code downloadHtml} or {@code analysisData}
 */
public static List<NewsData> getOherBaiduNewsData(String url, String word, Proxy proxy) throws Exception {
    List<NewsData> list = new ArrayList<NewsData>();
    int page = 0;
    boolean more = true;
    while (more) {
        // Hard cap on the page index. Merely setting more = false here is not
        // enough: the flag is reassigned from the parse result further down,
        // so the loop would keep paging for as long as a next page is reported.
        if (page > 20) {
            break;
        }
        String htmlBody = downloadHtml(url, proxy, page);
        if (htmlBody != null) {
            // NOTE(review): null (not proxy) is handed to the parser, so any
            // nested "more" links it follows are fetched without the proxy.
            // Kept as in the original - confirm whether this is intentional.
            Map<String, Object> dataMap = analysisData(htmlBody, null, word);
            List<NewsData> dataList = (List<NewsData>) dataMap.get("data");
            list.addAll(dataList);
            more = (Boolean) dataMap.get("more");
        } else {
            // Download failed after its retries: stop paging.
            more = false;
        }
        page++;
    }
    return list;
}
/**
* @Title: getUrl
* @author hero
* @Title: getUrl
* @author hero
* @Description: 获取链接
* @param @param word
* @param @param startTime
* @param @param endTime
* @param @param page
* @param @return 设定文件
* @param @param
* word
* @param @param
* startTime
* @param @param
* endTime
* @param @param
* page
* @param @return
* 设定文件
* @return String 返回类型
*/
private static String getUrl(String word, String startTime, String endTime, String tn, int page){
private static String getUrl(String word, String startTime, String endTime, String tn, int page) {
long bt = 0;
long et = 0;
String url = null;
if(startTime!=null){
bt = TimeParse.stringFormartDate(startTime).getTime()/1000;
if (startTime != null) {
bt = TimeParse.stringFormartDate(startTime).getTime() / 1000;
}
if(endTime!=null){
et = TimeParse.stringFormartDate(endTime).getTime()/1000;
if (endTime != null) {
et = TimeParse.stringFormartDate(endTime).getTime() / 1000;
}
if(word!=null){
url = "http://news.baidu.com/ns?from=news&cl=2&bt=" + bt
+ "&et=" + et + "&q1=" +URLCodeUtil.getURLEncode(word, "utf-8") + "&q3=&q4=&tn="+ tn +"&ct=0&rn=50&clk=sortbytime&q6=&pn=" + page * 50;
if (word != null) {
url = "http://news.baidu.com/ns?from=news&cl=2&bt=" + bt + "&et=" + et + "&q1="
+ URLCodeUtil.getURLEncode(word, "utf-8") + "&q3=&q4=&tn=" + tn
+ "&ct=0&rn=50&clk=sortbytime&q6=&pn=" + page * 50;
}
return url;
}
......
......@@ -32,9 +32,10 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy
* @param @return 设定文件
* @return List<NewsData> 返回类型
* @throws Exception
*/
@SuppressWarnings("unchecked")
public static List<NewsData> getSoNewsData(String word, Proxy proxy) {
public static List<NewsData> getSoNewsData(String word, Proxy proxy) throws Exception {
List<NewsData> list = new ArrayList<NewsData>();
int page = 1;
boolean more = true;
......@@ -59,6 +60,19 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
return list;
}
/**
 * Fetches and parses a single 360 (So) news result page for a keyword.
 * Intended for callers that drive the paging themselves; this method does
 * not sleep between requests.
 *
 * @param word  keyword to search for
 * @param proxy proxy forwarded to the download and parse steps
 * @param page  page number to fetch (the bulk method in this class starts at 1)
 * @return the map produced by {@code analysisData} (presumably the parsed
 *         items plus a next-page flag - confirm against the parser), or
 *         {@code null} when the page could not be downloaded
 * @throws Exception propagated from {@code downloadHtml} or {@code analysisData}
 */
public static Map<String, Object> getSoNewsData(String word, Proxy proxy, int page) throws Exception {
    final String pageHtml = downloadHtml(word, "news", proxy, page);
    // Nothing to parse when the download yielded no body.
    if (pageHtml == null) {
        return null;
    }
    return analysisData(pageHtml, proxy, word);
}
/**
* @Title: getSoNewsDataByTitle
* @author hero
......@@ -69,7 +83,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @return List<NewsData> 返回类型
*/
@SuppressWarnings("unchecked")
public static List<NewsData> getSoNewsDataByTitle(String word, Proxy proxy) {
public static List<NewsData> getSoNewsDataByTitle(String word, Proxy proxy) throws Exception{
List<NewsData> list = new ArrayList<NewsData>();
int page = 1;
boolean more = true;
......@@ -104,7 +118,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return String 返回类型
*/
private static String downloadHtml(String word, String tn, Proxy proxy, int page) {
private static String downloadHtml(String word, String tn, Proxy proxy, int page)throws Exception {
// 获取通用请求头
Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址
......@@ -138,7 +152,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* 设定文件
* @return Map<String,Object> 返回类型
*/
private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) {
private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String, Object> resultMap = new HashMap<String, Object>();
List<NewsData> list = new ArrayList<NewsData>();
boolean more = true;
......@@ -199,7 +213,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return Map<String,Object> 返回类型
*/
private static Map<String, Object> analysisDataByTitle(String htmlBody, Proxy proxy, String word) {
private static Map<String, Object> analysisDataByTitle(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String, Object> resultMap = new HashMap<String, Object>();
List<NewsData> list = new ArrayList<NewsData>();
boolean more = true;
......
......@@ -35,9 +35,10 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy
* @param @return 设定文件
* @return List<NewsData> 返回类型
* @throws Exception
*/
@SuppressWarnings("unchecked")
public static List<NewsData> getSougouNewsData(String word, Proxy proxy){
public static List<NewsData> getSougouNewsData(String word, Proxy proxy) throws Exception{
List<NewsData> list = new ArrayList<NewsData>();
int page = 1;
boolean more = true;
......@@ -62,8 +63,18 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
}
/**
 * Fetches and parses a single Sogou news result page for a keyword.
 * Intended for callers that drive the paging themselves; this method does
 * not sleep between requests.
 *
 * @param word  keyword to search for
 * @param proxy proxy forwarded to the download and parse steps
 * @param page  page number to fetch (the bulk method in this class starts at 1)
 * @return the map produced by {@code analysisData} (presumably the parsed
 *         items plus a next-page flag - confirm against the parser), or
 *         {@code null} when the download returned {@code null} or an empty body
 * @throws Exception propagated from {@code downloadHtml} or {@code analysisData}
 */
public static Map<String, Object> getSougouNewsData(String word, Proxy proxy, int page) throws Exception {
    // Mode 1 is what the keyword (non-title) search passes to downloadHtml.
    final String pageHtml = downloadHtml(word, 1, proxy, page);
    // A missing or empty body means there is nothing to parse.
    if (pageHtml == null || pageHtml.isEmpty()) {
        return null;
    }
    return analysisData(pageHtml, proxy, word);
}
@SuppressWarnings("unchecked")
public static List<NewsData> getSougouNewsDataByTitle(String word, Proxy proxy){
public static List<NewsData> getSougouNewsDataByTitle(String word, Proxy proxy)throws Exception{
List<NewsData> list = new ArrayList<NewsData>();
int page = 0;
boolean more = true;
......@@ -99,7 +110,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return String 返回类型
*/
private static String downloadHtml(String word, int mode, Proxy proxy, int page) {
private static String downloadHtml(String word, int mode, Proxy proxy, int page) throws Exception{
//获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址
......@@ -118,7 +129,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
return null;
}
private static String downloadHtml(String url, Proxy proxy, int page) {
private static String downloadHtml(String url, Proxy proxy, int page) throws Exception{
//获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址
......@@ -149,7 +160,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return Map<String,Object> 返回类型
*/
private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word) {
private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String,Object> resultMap = new HashMap<String,Object>();
List<NewsData> list = new ArrayList<NewsData>();
boolean more = true;
......@@ -241,7 +252,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @return List<NewsData> 返回类型
*/
@SuppressWarnings("unchecked")
public static List<NewsData> getOherSougouNewsData(String url, String word, Proxy proxy){
public static List<NewsData> getOherSougouNewsData(String url, String word, Proxy proxy)throws Exception{
List<NewsData> list = new ArrayList<NewsData>();
int page = 1;
boolean more = true;
......
......@@ -38,9 +38,10 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy
* @param @return 设定文件
* @return List<ZhiHuData> 返回类型
* @throws Exception
*/
@SuppressWarnings("unchecked")
public static List<ZhiHuData> getSougouZhihuData(String word, Proxy proxy){
public static List<ZhiHuData> getSougouZhihuData(String word, Proxy proxy) throws Exception{
List<ZhiHuData> list = new ArrayList<ZhiHuData>();
int page = 1;
boolean more = true;
......@@ -66,6 +67,13 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
}
/**
 * Fetches and parses a single Sogou-Zhihu search result page for a keyword.
 * Intended for callers that drive the paging themselves; this method does
 * not sleep between requests.
 *
 * @param word  keyword to search for
 * @param proxy proxy forwarded to the download and parse steps
 * @param page  page number to fetch (the bulk method in this class starts at 1)
 * @return the map produced by {@code analysisData} (presumably the parsed
 *         items plus a next-page flag - confirm against the parser), or
 *         {@code null} when the download returned {@code null} or an empty body
 * @throws Exception propagated from {@code downloadHtml} or {@code analysisData}
 */
public static Map<String, Object> getSougouZhihuData(String word, Proxy proxy, int page) throws Exception {
    final String pageHtml = downloadHtml(word, proxy, page);
    // A missing or empty body means there is nothing to parse.
    if (pageHtml == null || pageHtml.isEmpty()) {
        return null;
    }
    return analysisData(pageHtml, proxy, word);
}
/**
*
......@@ -79,7 +87,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return String 返回类型
*/
private static String downloadHtml(String word, Proxy proxy, int page) {
private static String downloadHtml(String word, Proxy proxy, int page) throws Exception{
//获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址
......@@ -97,7 +105,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
return null;
}
private static String downloadHtml(String url, Proxy proxy, String type) {
private static String downloadHtml(String url, Proxy proxy, String type) throws Exception{
//获取通用请求头
Map<String,String> headerMap = HeaderTool.getCommonHead();
//获取链接地址
......@@ -131,7 +139,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return Map<String,Object> 返回类型
*/
private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word) {
private static Map<String,Object> analysisData(String htmlBody, Proxy proxy, String word)throws Exception {
Map<String,Object> resultMap = new HashMap<String,Object>();
List<ZhiHuData> list = new ArrayList<ZhiHuData>();
boolean more = true;
......@@ -203,7 +211,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return ZhiHuData 返回类型
*/
private static ZhiHuData analysisZhihuAnswer(String url, Proxy proxy,ZhiHuData zhihu){
private static ZhiHuData analysisZhihuAnswer(String url, Proxy proxy,ZhiHuData zhihu)throws Exception{
try {
String htmlBody = downloadHtml(url, proxy, "问答");
if(htmlBody != null){
......@@ -239,7 +247,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @return ZhiHuData 返回类型
*/
@SuppressWarnings("deprecation")
private static ZhiHuData analysisZhihuArticle(String url, Proxy proxy, ZhiHuData zhihu){
private static ZhiHuData analysisZhihuArticle(String url, Proxy proxy, ZhiHuData zhihu)throws Exception{
try {
String htmlBody = downloadHtml(url, proxy, "文章");
Document document = Jsoup.parse(htmlBody);
......
package com.zhiwei.media_data_crawler.test;
import java.net.Proxy;
import java.util.List;
import org.junit.Test;
import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.media_data_crawler.entity.ZhiHuData;
public class DataCrawlerTest {

    /**
     * Manual demo rather than an assertion-based test: runs the Sogou-Zhihu
     * crawl for a fixed keyword and prints how many items came back. The
     * other crawlers are kept as commented-out usage examples.
     */
    @Test
    public void getSoNewsTest(){
        String word = "马云"; // keyword
        String startTime = "2017-03-01 00:00:00"; // start of the time window
        String endTime = "2017-03-01 23:59:59"; // end of the time window
        Proxy proxy = null; // proxy; may be left null when no proxy is used

        // Baidu News crawl demo
        // List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
        // // Sogou News keyword crawl demo
        // List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
        // // 360 News crawl demo
        // List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);

        // Sogou-Zhihu crawl
        List<ZhiHuData> zhihuResults = DataCrawler.getSougouZhihuData(word, proxy);
        int resultCount = zhihuResults.size();
        System.out.println(resultCount);
    }
}
//package com.zhiwei.media_data_crawler.test;
//
//import java.net.Proxy;
//import java.util.List;
//
//import org.junit.Test;
//
//import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
//import com.zhiwei.media_data_crawler.data.DataCrawler;
//import com.zhiwei.media_data_crawler.entity.NewsData;
//import com.zhiwei.media_data_crawler.entity.ZhiHuData;
//
//public class DataCrawlerTest {
//
//
//
//
//
// @Test
// public void getSoNewsTest(){
// String word = "马云"; //关键词
// String startTime = "2017-03-01 00:00:00"; //开始时间
// String endTime = "2017-03-01 23:59:59"; //结束时间
// Proxy proxy = null; //代理IP,不用可不填写
// //百度新闻采集demo
//// List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
//// //搜狗新闻关键词采集demo
//// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
//// //360新闻采集demo
//// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
//// //搜狗知乎采集
//// List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
//// System.out.println(zhihuList.size());
//
// }
//
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment