Commit 8c543a2e by yangchen

新增百度资讯采集

parent 7f0418e6
......@@ -2,7 +2,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>media_data_crawler</artifactId>
<version>0.1.0-SNAPSHOT</version>
<version>0.1.1-SNAPSHOT</version>
<name>media_data_crawler</name>
<description>网媒数据抓取,包含百度新闻、搜狗新闻、360新闻、知乎回答列表等</description>
......@@ -16,7 +16,7 @@
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.3.6-RELEASE</version>
<version>0.5.2-RELEASE</version>
<scope>provided</scope>
</dependency>
</dependencies>
......
......@@ -564,6 +564,7 @@ public class BaiduNewsCrawlerParse {
et = TimeParse.stringFormartDate(endTime).getTime() / 1000;
}
if (word != null) {
// url = "https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=" + URLCodeUtil.getURLEncode(word, "UTF-8") + "&medium=0&pn=" + page*10;
url = "http://news.baidu.com/ns?from=news&cl=2&bt=" + bt + "&et=" + et + "&q1="
+ URLCodeUtil.getURLEncode(word, "utf-8") + "&q3=&q4=&tn=" + tn
+ "&ct=0&rn=50&clk=sortbytime&q6=&pn=" + page * 50;
......
......@@ -13,7 +13,33 @@ import com.zhiwei.media_data_crawler.entity.*;
public class DataCrawler {
public static Long sleepTime;
/**
 * Convenience entry point that forwards to
 * {@code BaiduInforCrawlerParse.getBaiduInforData(word, endTime)}.
 *
 * <p>Any {@link Exception} raised by the delegate is caught here: its stack
 * trace is printed via {@code printStackTrace()} and an empty list is
 * returned in place of a result, so callers never see the exception.
 *
 * @param word    forwarded unchanged to the parser; presumably the search
 *                keyword (TODO confirm against BaiduInforCrawlerParse)
 * @param endTime forwarded unchanged to the parser; presumably an upper time
 *                bound, expected format is not visible here (TODO confirm)
 * @return the list produced by the parser, or {@code Collections.emptyList()}
 *         when the parser throws an {@link Exception}
 */
public static List<NewsData> getBaiduInforData(String word,String endTime) {
    List<NewsData> collected;
    try {
        collected = BaiduInforCrawlerParse.getBaiduInforData(word, endTime);
    } catch (Exception failure) {
        // Best-effort wrapper: report the failure and fall back to "no data".
        failure.printStackTrace();
        collected = Collections.emptyList();
    }
    return collected;
}
/**
*
* @Title: getBaiduNewsData
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment