Commit 8c543a2e by yangchen

新增百度资讯采集

parent 7f0418e6
......@@ -2,7 +2,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>media_data_crawler</artifactId>
<version>0.1.0-SNAPSHOT</version>
<version>0.1.1-SNAPSHOT</version>
<name>media_data_crawler</name>
<description>网媒数据抓取,包含百度新闻、搜狗新闻、360新闻、知乎回答列表等</description>
......@@ -16,7 +16,7 @@
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.3.6-RELEASE</version>
<version>0.5.2-RELEASE</version>
<scope>provided</scope>
</dependency>
</dependencies>
......
package com.zhiwei.media_data_crawler.crawler;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.zhiwei.async.GroupSync;
import com.zhiwei.async.TaskBoot;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
import okhttp3.Response;
public class BaiduInforCrawlerParse {
// Logger bound to this class, so log records are attributed to BaiduInforCrawlerParse.
private static final Logger logger = LogManager.getLogger(BaiduInforCrawlerParse.class);
// Shared HTTP client, built with 2 retries, a cookie jar and throwException(false).
private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).useCookieJar(true).throwException(false).build();
// Platform label stored on every NewsData created by this class.
private static final String PT = "百度资讯";
/**
 * Collects Baidu news-search ("百度资讯") results for a keyword.
 *
 * <p>Requests result pages 0-9 (page URLs come from {@code getUrl}), submitting one
 * {@code TaskBoot.blockingAsync} task per page, and then waits on a {@code GroupSync}
 * until every task has reported completion. A page that cannot be downloaded or parsed
 * is logged and skipped, so the result may be partial.
 *
 * @param word    search keyword; if {@code getUrl} yields no URL for it (a {@code null}
 *                keyword), an empty list is returned without issuing any request
 * @param endTime end of the search time range as passed on to {@code getUrl}, or
 *                {@code null} for no time filter
 * @return the items parsed from all pages, never {@code null}; the relative order of
 *         pages is not guaranteed if the page tasks run concurrently
 * @throws Exception if building a page URL or waiting for the page tasks fails
 */
@SuppressWarnings("unchecked")
public static List<NewsData> getBaiduInforData(String word,String endTime) throws Exception {
    final int pageCount = 10;
    List<NewsData> list = new ArrayList<>();
    GroupSync groupSync = new GroupSync();
    for (int i = 0; i < pageCount; i++) {
        // Build the URL before registering with the group, so a failure here cannot
        // leave the group waiting for a task that was never submitted.
        String url = getUrl(word, i, endTime);
        if (url == null) {
            // No URL can be built (null keyword): nothing to fetch for any page.
            return list;
        }
        groupSync.add();
        TaskBoot.blockingAsync(() -> {
            try {
                String htmlBody = downloadHtml(url);
                if (htmlBody != null) {
                    Map<String, Object> dataMap = analysisData(htmlBody, word);
                    List<NewsData> dataList = (List<NewsData>) dataMap.get("data");
                    logger.debug("collected {} items from {}", dataList.size(), url);
                    // ArrayList is not thread-safe and several page tasks may finish
                    // at the same time, so guard the shared result list.
                    synchronized (list) {
                        list.addAll(dataList);
                    }
                }
            } catch (Exception e) {
                // A failed page must not abort the others, but it must be visible.
                logger.error("failed to collect Baidu news page, url={}", url, e);
            } finally {
                // Always report completion, otherwise await() below never returns.
                groupSync.done();
            }
        });
    }
    groupSync.await();
    return list;
}
/**
 * Downloads a Baidu search result page.
 *
 * <p>Makes up to three attempts through {@code ProxyHolder.NAT_HEAVY_PROXY}. A body that
 * contains {@code location.href.replace} is discarded and the request is retried
 * (NOTE(review): presumably Baidu's redirect/verification page - confirm).
 *
 * @param url page URL; also sent as the {@code referer} header
 * @return the response body, or {@code null} when {@code url} is {@code null} or no
 *         attempt produced a usable body
 */
private static String downloadHtml(String url) {
    if (url == null) {
        // Nothing to request; avoids three failing attempts on a missing URL.
        return null;
    }
    // Start from the common request headers and add the Baidu-specific ones.
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    headerMap.put("Host", "www.baidu.com");
    headerMap.put("referer", url);
    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_HEAVY_PROXY)){
            String result = response.body().string();
            if(!result.contains("location.href.replace")) {
                return result;
            }
        } catch (Exception e) {
            // The message fills the placeholder; the trailing exception argument makes
            // the logger record the stack trace as well.
            logger.error("获取数据时出现问题,问题为:{}", e.getMessage(), e);
        }
    }
    return null;
}
/**
 * Parses one Baidu news result page into {@code NewsData} items.
 *
 * <p>Every {@code div.result} element yields one item. When a result carries a
 * {@code a.c-more_link} ("same news") link, that listing is fetched through
 * {@code getOherBaiduNewsData} and its items are appended too. A result that fails to
 * parse is logged and skipped; the remaining results are still processed.
 *
 * @param htmlBody HTML of the result page
 * @param word     search keyword stored on each item
 * @return a map with two entries: {@code "data"} (a {@code List<NewsData>}) and
 *         {@code "more"} (a {@code Boolean}, true when the {@code p#page} pager text
 *         contains "下一页")
 */
private static Map<String, Object> analysisData(String htmlBody, String word){
    Map<String, Object> resultMap = new HashMap<>();
    List<NewsData> list = new ArrayList<>();
    Document document = Jsoup.parse(htmlBody);
    // select() never returns null; a missing pager simply produces empty text.
    boolean more = document.select("p#page").text().contains("下一页");
    // Compiled once per page instead of once per result.
    Pattern sameNewsPattern = Pattern.compile("\\d*条相同新闻");
    for (Element element : document.select("div.result")) {
        // Per-result state is declared inside the loop so that a value parsed from one
        // result can never leak into the next one.
        String soureAndtime = null;
        try {
            Elements titleLink = element.select("h3.c-title").select("a");
            String link = titleLink.attr("href");
            String title = titleLink.text();
            Elements row = element.select("div.c-row");
            Elements author = row.select("p.c-author");
            soureAndtime = author.html();
            // The author line reads "<source>&nbsp;&nbsp;<time>"; without the separator
            // the whole text is treated as the time and the source stays null.
            String time;
            String source = null;
            if (soureAndtime.contains("&nbsp;&nbsp;")) {
                String[] soureAndtimes = soureAndtime.split("&nbsp;&nbsp;");
                time = soureAndtimes[1];
                source = soureAndtimes[0];
            } else {
                time = author.text().trim();
            }
            // Normalise the publish time to "yyyy-MM-dd HH:mm:ss".
            time = time.replaceAll(" ", "");
            time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");
            // Summary = row text minus the leading author line, the "N条相同新闻"
            // marker, dashes and the "百度快照" label.
            String descript = row.text();
            String soureAndtimeText = author.text();
            String content = descript.substring(soureAndtimeText.length(), descript.length());
            content = sameNewsPattern.matcher(content).replaceAll("").replace("-", "").replace("百度快照", "");
            list.add(new NewsData(link, title, source, time, content, PT, word));
            // Follow the "same news" link, if the result has one.
            String otherUrl = row.select("a.c-more_link").attr("href");
            if (!otherUrl.isEmpty()) {
                list.addAll(getOherBaiduNewsData("http://www.baidu.com" + otherUrl, word));
            }
        } catch (Exception e) {
            // Include the raw author markup that was being parsed; the trailing
            // exception argument makes the logger record the stack trace.
            logger.error("百度新闻数据解析时出现问题,soureAndtime={},问题为:{}", soureAndtime, e.getMessage(), e);
        }
    }
    resultMap.put("data", list);
    resultMap.put("more", more);
    return resultMap;
}
/**
 * Downloads one page of a "same news" listing.
 *
 * <p>Appends {@code &pn=<page * 10>} to {@code url} and makes up to three attempts
 * through {@code ProxyHolder.NAT_HEAVY_PROXY}; the first body received is returned.
 * NOTE(review): the Host header is fixed to news.baidu.com - confirm it matches the
 * host of the URLs passed in.
 *
 * @param url  listing URL without the paging parameter
 * @param page zero-based page index
 * @return the response body, or {@code null} if all attempts failed
 * @throws Exception declared by the signature; failures of individual request attempts
 *                   are caught and logged rather than rethrown
 */
private static String downloadHtml(String url, int page) throws Exception{
    // Start from the common request headers and add the Baidu-specific ones.
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    String pageUrl = url + "&pn=" + page * 10;
    headerMap.put("Host", "news.baidu.com");
    headerMap.put("Referer", pageUrl);
    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl,headerMap), ProxyHolder.NAT_HEAVY_PROXY)){
            return response.body().string();
        } catch (Exception e) {
            // Log the exception as caught: fillInStackTrace() would replace the original
            // trace with this logging site and hide where the failure happened.
            logger.error("获取数据时出现问题,问题为:{}", e.getMessage(), e);
        }
    }
    return null;
}
/**
 * Collects the items of a "same news" listing, following its pager.
 *
 * <p>Pages are fetched in order starting at 0. Collection stops when a page cannot be
 * downloaded, when the parsed page reports no next page, or after page index 30 -
 * whichever comes first. The page cap is enforced in the loop condition so that the
 * pager flag of the last parsed page cannot re-enable the loop.
 *
 * @param url  listing URL without the paging parameter
 * @param word search keyword stored on each item
 * @return all items parsed from the fetched pages, never {@code null}
 * @throws Exception if downloading a page fails with an exception that is not handled
 *                   by the download helper
 */
@SuppressWarnings("unchecked")
public static List<NewsData> getOherBaiduNewsData(String url, String word) throws Exception{
    // Highest page index that will be requested.
    final int maxPage = 30;
    List<NewsData> list = new ArrayList<>();
    boolean more = true;
    for (int page = 0; more && page <= maxPage; page++) {
        String htmlBody = downloadHtml(url, page);
        if (htmlBody == null) {
            // Download failed on every attempt: stop paging.
            break;
        }
        Map<String, Object> dataMap = analysisData(htmlBody, word);
        list.addAll((List<NewsData>) dataMap.get("data"));
        more = (Boolean) dataMap.get("more");
    }
    return list;
}
/**
 * Builds the Baidu news-search URL for one result page.
 *
 * <p>The URL asks for 50 results per page ({@code rn=50}) and uses {@code page * 50} as
 * the {@code pn} offset. When {@code time} is given it is parsed with
 * {@code TimeParse.stringFormartDate} and sent, in epoch seconds, as the end of the
 * {@code gpc} time range; the start of that range is the fixed value 1546272000.
 *
 * @param word keyword to search for; URL-encoded as UTF-8
 * @param page zero-based page index
 * @param time end of the time range, or {@code null} for no time filter
 * @return the page URL, or {@code null} when {@code word} is {@code null}
 */
private static String getUrl(String word,int page,String time) {
    if (word == null) {
        return null;
    }
    if (!Objects.nonNull(time)) {
        // No time filter requested.
        String plainPrefix = "http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=" + URLCodeUtil.getURLEncode(word, "UTF-8");
        return plainPrefix + "&medium=0&rn=50&pn=" + page*50;
    }
    // Convert the end time to epoch seconds before the keyword is encoded.
    String endSeconds = String.valueOf(TimeParse.stringFormartDate(time).getTime()/1000);
    String rangedPrefix = "http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=" + URLCodeUtil.getURLEncode(word, "UTF-8");
    String rangeFilter = "&medium=0&rn=50&gpc=stf%3D1546272000%2C" + endSeconds;
    return rangedPrefix + rangeFilter + "%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=" + page*50;
}
//https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=%E6%B5%99%E6%B1%9F&medium=1&rn=50&gpc=stf%3D1546272000%2C1548950400%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=0
// public static void main(String[] args) throws Exception {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<NewsData> ndList = getBaiduInforData("腾讯");
// System.out.println(ndList.size());
// }
}
......@@ -564,6 +564,7 @@ public class BaiduNewsCrawlerParse {
et = TimeParse.stringFormartDate(endTime).getTime() / 1000;
}
if (word != null) {
// url = "https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=" + URLCodeUtil.getURLEncode(word, "UTF-8") + "&medium=0&pn=" + page*10;
url = "http://news.baidu.com/ns?from=news&cl=2&bt=" + bt + "&et=" + et + "&q1="
+ URLCodeUtil.getURLEncode(word, "utf-8") + "&q3=&q4=&tn=" + tn
+ "&ct=0&rn=50&clk=sortbytime&q6=&pn=" + page * 50;
......
......@@ -31,6 +31,32 @@ public class DataCrawler {
* 设定文件
* @return List<NewsData> 返回类型
*/
/**
 * Collects Baidu news-search results for a keyword without propagating failures.
 *
 * <p>Delegates to {@code BaiduInforCrawlerParse.getBaiduInforData}. If that call throws,
 * the stack trace is printed and an empty list is returned instead.
 *
 * @param word    search keyword
 * @param endTime end-time argument handed to the delegate unchanged
 * @return the delegate's result, or an empty list when the delegate throws
 */
public static List<NewsData> getBaiduInforData(String word,String endTime) {
    List<NewsData> collected;
    try {
        collected = BaiduInforCrawlerParse.getBaiduInforData(word,endTime);
    } catch (Exception e) {
        e.printStackTrace();
        collected = Collections.emptyList();
    }
    return collected;
}
/**
*
* @Title: getBaiduNewsData
* @author hero
* @Description: 根据关键词和时间,全文匹配百度新闻数据
* @param @param
* word
* @param @param
* startTime
* @param @param
* endTime
* @param @param
* proxy
* @param @return
* 设定文件
* @return List<NewsData> 返回类型
*/
public static List<NewsData> getBaiduNewsData(String word, String startTime, String endTime, Proxy proxy
) {
try {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment