Commit 45483734 by yangchen

修改百度资讯接口 和 360搜索关键词采集

parent 8c543a2e
......@@ -155,6 +155,8 @@ public class BaiduInforCrawlerParse {
more = false;
}
}
// 开始解析
Elements elementes = document.select("div.result");
String time = null;
......@@ -179,6 +181,7 @@ public class BaiduInforCrawlerParse {
source = soureAndtimes[0];
} else {
time = element.select("div.c-row").select("p.c-author").text().trim();
source = element.select("a.c-showurl > span").text();
}
/** 文章发布时间处理 **/
time = time.replaceAll(" ", "");
......@@ -193,6 +196,9 @@ public class BaiduInforCrawlerParse {
matcher = pattern.matcher(content);
content = matcher.replaceAll("").replace("-", "").replace("百度快照", "");
}
if(Objects.nonNull(source)) {
source = source.replaceAll("<.*?>", "").trim();
}
// 添加到数据集合中
NewsData newsData = new NewsData(link, title, source, time, content, PT, word);
list.add(newsData);
......@@ -287,8 +293,10 @@ public class BaiduInforCrawlerParse {
String url = null;
if (word != null) {
if(Objects.nonNull(time)) {
// https://www.baidu.com/s?ie=utf-8&cl=2&medium=0&rtt=1&bsst=1&rsv_dl=news_t_sk&tn=news&word=http%3A%2F%2Fbaijiahao.baidu.com%2Fs%3Fid%3D1600799795509096909%26wfr%3Dspider%26for%3Dpc&rsv_sug3=2&rsv_sug4=221&rsv_sug1=1&rsv_n=2&rsv_sug2=0&inputT=601
// https://www.baidu.com/s?rn=50&ie=utf-8&cl=2&medium=0&rtt=4&bsst=1&rsv_dl=news_t_sk&tn=news&wd=%E6%B5%99%E6%B1%9F%E4%B8%B4%E6%B5%B7&tfflag=0&gpc=stf%3D1559318400%2C1561910400%7Cstftype%3D2
time = String.valueOf(TimeParse.stringFormartDate(time).getTime()/1000);
url = "http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=" + URLCodeUtil.getURLEncode(word, "UTF-8") + "&medium=0&rn=50&gpc=stf%3D1546272000%2C"+time+"%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=" + page*50;
url = "http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=" + URLCodeUtil.getURLEncode(word, "UTF-8") + "&medium=0&rn=50&gpc=stf%3D0%2C"+time+"%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=" + page*50;
}else {
url = "http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=" + URLCodeUtil.getURLEncode(word, "UTF-8") + "&medium=0&rn=50&pn=" + page*50;
}
......@@ -299,7 +307,7 @@ public class BaiduInforCrawlerParse {
//https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=%E6%B5%99%E6%B1%9F&medium=1&rn=50&gpc=stf%3D1546272000%2C1548950400%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=0
// public static void main(String[] args) throws Exception {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<NewsData> ndList = getBaiduInforData("腾讯");
// List<NewsData> ndList = getBaiduInforData("马云","2019-07-04 23:59:59");
// System.out.println(ndList.size());
// }
......
......@@ -43,14 +43,10 @@ public class SoNewsCrawlerParse {
*/
@SuppressWarnings("unchecked")
public static List<NewsData> getSoNewsData(String word, Proxy proxy) throws Exception {
List<NewsData> list = new ArrayList<NewsData>();
List<NewsData> list = new ArrayList<>();
int page = 1;
boolean more = true;
while (more) {
// 最大页数为50
if (page > 50) {
more = false;
}
String htmlBody = downloadHtml(word, "news", proxy, page);
if (htmlBody != null) {
Map<String, Object> dataMap = analysisData(htmlBody, proxy, word);
......@@ -65,6 +61,10 @@ public class SoNewsCrawlerParse {
if(DataCrawler.sleepTime!=null){
ZhiWeiTools.sleep(DataCrawler.sleepTime);
}
// 最大页数为50
if (page > 50) {
more = false;
}
}
return list;
}
......@@ -93,7 +93,7 @@ public class SoNewsCrawlerParse {
*/
@SuppressWarnings("unchecked")
public static List<NewsData> getSoNewsDataByTitle(String word, Proxy proxy) throws Exception{
List<NewsData> list = new ArrayList<NewsData>();
List<NewsData> list = new ArrayList<>();
int page = 1;
boolean more = true;
while (more) {
......@@ -174,8 +174,8 @@ public class SoNewsCrawlerParse {
* @return Map<String,Object> 返回类型
*/
private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String, Object> resultMap = new HashMap<String, Object>();
List<NewsData> list = new ArrayList<NewsData>();
Map<String, Object> resultMap = new HashMap<>();
List<NewsData> list = new ArrayList<>();
boolean more = true;
/** 解析页面 */
......@@ -200,8 +200,9 @@ public class SoNewsCrawlerParse {
if(!element.attr("class").equals("res-list hasimg hasmediav")){
link = element.select("h3").select("a").attr("href");
title = element.select("h3").select("a").text();
time = element.select("p.newsinfo").select("span.posttime").attr("title");
source = element.select("p.newsinfo").select("span.sitename").text();
// #news > li > div.info.b-info > span:nth-child(3)
time = element.select("div.info.b-info").select("span:nth-child(3)").text();
source = element.select("div.info.b-info").select("span.sitename").text();
/** 文章发布时间处理 **/
time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");
// 处理文章简介
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment