Commit 45483734 by yangchen

修改百度资讯接口 和 360搜索关键词采集

parent 8c543a2e
...@@ -49,7 +49,7 @@ public class BaiduInforCrawlerParse { ...@@ -49,7 +49,7 @@ public class BaiduInforCrawlerParse {
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<NewsData> getBaiduInforData(String word,String endTime) throws Exception { public static List<NewsData> getBaiduInforData(String word,String endTime) throws Exception {
List<NewsData> list = new ArrayList<>(); List<NewsData> list = new ArrayList<>();
GroupSync groupSync = new GroupSync(); GroupSync groupSync = new GroupSync();
for(int i = 0;i< 10;i++) { for(int i = 0;i< 10;i++) {
groupSync.add(); groupSync.add();
String url = getUrl(word, i,endTime); String url = getUrl(word, i,endTime);
...@@ -155,6 +155,8 @@ public class BaiduInforCrawlerParse { ...@@ -155,6 +155,8 @@ public class BaiduInforCrawlerParse {
more = false; more = false;
} }
} }
// 开始解析 // 开始解析
Elements elementes = document.select("div.result"); Elements elementes = document.select("div.result");
String time = null; String time = null;
...@@ -179,6 +181,7 @@ public class BaiduInforCrawlerParse { ...@@ -179,6 +181,7 @@ public class BaiduInforCrawlerParse {
source = soureAndtimes[0]; source = soureAndtimes[0];
} else { } else {
time = element.select("div.c-row").select("p.c-author").text().trim(); time = element.select("div.c-row").select("p.c-author").text().trim();
source = element.select("a.c-showurl > span").text();
} }
/** 文章发布时间处理 **/ /** 文章发布时间处理 **/
time = time.replaceAll(" ", ""); time = time.replaceAll(" ", "");
...@@ -193,6 +196,9 @@ public class BaiduInforCrawlerParse { ...@@ -193,6 +196,9 @@ public class BaiduInforCrawlerParse {
matcher = pattern.matcher(content); matcher = pattern.matcher(content);
content = matcher.replaceAll("").replace("-", "").replace("百度快照", ""); content = matcher.replaceAll("").replace("-", "").replace("百度快照", "");
} }
if(Objects.nonNull(source)) {
source = source.replaceAll("<.*?>", "").trim();
}
// 添加到数据集合中 // 添加到数据集合中
NewsData newsData = new NewsData(link, title, source, time, content, PT, word); NewsData newsData = new NewsData(link, title, source, time, content, PT, word);
list.add(newsData); list.add(newsData);
...@@ -287,8 +293,10 @@ public class BaiduInforCrawlerParse { ...@@ -287,8 +293,10 @@ public class BaiduInforCrawlerParse {
String url = null; String url = null;
if (word != null) { if (word != null) {
if(Objects.nonNull(time)) { if(Objects.nonNull(time)) {
// https://www.baidu.com/s?ie=utf-8&cl=2&medium=0&rtt=1&bsst=1&rsv_dl=news_t_sk&tn=news&word=http%3A%2F%2Fbaijiahao.baidu.com%2Fs%3Fid%3D1600799795509096909%26wfr%3Dspider%26for%3Dpc&rsv_sug3=2&rsv_sug4=221&rsv_sug1=1&rsv_n=2&rsv_sug2=0&inputT=601
// https://www.baidu.com/s?rn=50&ie=utf-8&cl=2&medium=0&rtt=4&bsst=1&rsv_dl=news_t_sk&tn=news&wd=%E6%B5%99%E6%B1%9F%E4%B8%B4%E6%B5%B7&tfflag=0&gpc=stf%3D1559318400%2C1561910400%7Cstftype%3D2
time = String.valueOf(TimeParse.stringFormartDate(time).getTime()/1000); time = String.valueOf(TimeParse.stringFormartDate(time).getTime()/1000);
url = "http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=" + URLCodeUtil.getURLEncode(word, "UTF-8") + "&medium=0&rn=50&gpc=stf%3D1546272000%2C"+time+"%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=" + page*50; url = "http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=" + URLCodeUtil.getURLEncode(word, "UTF-8") + "&medium=0&rn=50&gpc=stf%3D0%2C"+time+"%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=" + page*50;
}else { }else {
url = "http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=" + URLCodeUtil.getURLEncode(word, "UTF-8") + "&medium=0&rn=50&pn=" + page*50; url = "http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=" + URLCodeUtil.getURLEncode(word, "UTF-8") + "&medium=0&rn=50&pn=" + page*50;
} }
...@@ -299,7 +307,7 @@ public class BaiduInforCrawlerParse { ...@@ -299,7 +307,7 @@ public class BaiduInforCrawlerParse {
//https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=%E6%B5%99%E6%B1%9F&medium=1&rn=50&gpc=stf%3D1546272000%2C1548950400%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=0 //https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=%E6%B5%99%E6%B1%9F&medium=1&rn=50&gpc=stf%3D1546272000%2C1548950400%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=0
// public static void main(String[] args) throws Exception { // public static void main(String[] args) throws Exception {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER); // ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<NewsData> ndList = getBaiduInforData("腾讯"); // List<NewsData> ndList = getBaiduInforData("马云","2019-07-04 23:59:59");
// System.out.println(ndList.size()); // System.out.println(ndList.size());
// } // }
......
...@@ -30,7 +30,7 @@ public class SoNewsCrawlerParse { ...@@ -30,7 +30,7 @@ public class SoNewsCrawlerParse {
private static Logger logger = LogManager.getLogger(SoNewsCrawlerParse.class); private static Logger logger = LogManager.getLogger(SoNewsCrawlerParse.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static final String pt = "360新闻"; private static final String pt = "360新闻";
/** /**
* @Title: getSoNewsData * @Title: getSoNewsData
* @author hero * @author hero
...@@ -43,14 +43,10 @@ public class SoNewsCrawlerParse { ...@@ -43,14 +43,10 @@ public class SoNewsCrawlerParse {
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<NewsData> getSoNewsData(String word, Proxy proxy) throws Exception { public static List<NewsData> getSoNewsData(String word, Proxy proxy) throws Exception {
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<>();
int page = 1; int page = 1;
boolean more = true; boolean more = true;
while (more) { while (more) {
// 最大页数为50
if (page > 50) {
more = false;
}
String htmlBody = downloadHtml(word, "news", proxy, page); String htmlBody = downloadHtml(word, "news", proxy, page);
if (htmlBody != null) { if (htmlBody != null) {
Map<String, Object> dataMap = analysisData(htmlBody, proxy, word); Map<String, Object> dataMap = analysisData(htmlBody, proxy, word);
...@@ -65,6 +61,10 @@ public class SoNewsCrawlerParse { ...@@ -65,6 +61,10 @@ public class SoNewsCrawlerParse {
if(DataCrawler.sleepTime!=null){ if(DataCrawler.sleepTime!=null){
ZhiWeiTools.sleep(DataCrawler.sleepTime); ZhiWeiTools.sleep(DataCrawler.sleepTime);
} }
// 最大页数为50
if (page > 50) {
more = false;
}
} }
return list; return list;
} }
...@@ -93,7 +93,7 @@ public class SoNewsCrawlerParse { ...@@ -93,7 +93,7 @@ public class SoNewsCrawlerParse {
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<NewsData> getSoNewsDataByTitle(String word, Proxy proxy) throws Exception{ public static List<NewsData> getSoNewsDataByTitle(String word, Proxy proxy) throws Exception{
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<>();
int page = 1; int page = 1;
boolean more = true; boolean more = true;
while (more) { while (more) {
...@@ -174,8 +174,8 @@ public class SoNewsCrawlerParse { ...@@ -174,8 +174,8 @@ public class SoNewsCrawlerParse {
* @return Map<String,Object> 返回类型 * @return Map<String,Object> 返回类型
*/ */
private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{ private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{
Map<String, Object> resultMap = new HashMap<String, Object>(); Map<String, Object> resultMap = new HashMap<>();
List<NewsData> list = new ArrayList<NewsData>(); List<NewsData> list = new ArrayList<>();
boolean more = true; boolean more = true;
/** 解析页面 */ /** 解析页面 */
...@@ -200,8 +200,9 @@ public class SoNewsCrawlerParse { ...@@ -200,8 +200,9 @@ public class SoNewsCrawlerParse {
if(!element.attr("class").equals("res-list hasimg hasmediav")){ if(!element.attr("class").equals("res-list hasimg hasmediav")){
link = element.select("h3").select("a").attr("href"); link = element.select("h3").select("a").attr("href");
title = element.select("h3").select("a").text(); title = element.select("h3").select("a").text();
time = element.select("p.newsinfo").select("span.posttime").attr("title"); // #news > li > div.info.b-info > span:nth-child(3)
source = element.select("p.newsinfo").select("span.sitename").text(); time = element.select("div.info.b-info").select("span:nth-child(3)").text();
source = element.select("div.info.b-info").select("span.sitename").text();
/** 文章发布时间处理 **/ /** 文章发布时间处理 **/
time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss"); time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");
// 处理文章简介 // 处理文章简介
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment