Commit 491f1e25 by zhiwei

添加360采集今日头条数据

parent 4986288a
package com.zhiwei.media_data_crawler.crawler;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class SoCrawlerParse extends HttpClientTemplateOK {
/** Class-wide SLF4J logger; final because it is never reassigned. */
private static final Logger logger = LoggerFactory.getLogger(SoCrawlerParse.class);
/** Label passed to the NewsData constructor for hits parsed directly from the 360 result list. */
private static final String pt = "360网页";
/**
 * Crawls 360 (so.com) search results for a keyword page by page and collects
 * every parsed item.
 *
 * <p>Paging starts at page 1 and stops when a page yields no HTML, when the parsed
 * page reports that there is no "next page" link, or once 50 pages have been fetched.
 *
 * @param word  search keyword
 * @param site  domain to restrict the search to; {@code null} means no restriction
 * @param time  time-range filter: d (1 day), w (1 week), m (1 month), 3m (3 months), y (1 year)
 * @param proxy proxy handed to the HTTP client
 * @return all items parsed from the fetched pages; never {@code null}
 * @throws Exception if a page still cannot be downloaded after the retries in downloadHtml
 */
public static List<NewsData> getSoData(String word, String site, String time, Proxy proxy) throws Exception {
    // Hard upper bound on the number of result pages fetched for one keyword.
    final int maxPage = 50;
    List<NewsData> list = new ArrayList<NewsData>();
    int page = 1;
    boolean more = true;
    // The cap is part of the loop condition so that the "more" flag reported by a
    // parsed page can no longer override it and keep the loop running past 50 pages.
    while (more && page <= maxPage) {
        String htmlBody = downloadHtml(word, site, time, proxy, page);
        if (htmlBody != null) {
            Map<String, Object> dataMap = analysisData(htmlBody, proxy, word);
            // analysisData always stores a List<NewsData> under "data".
            @SuppressWarnings("unchecked")
            List<NewsData> dataList = (List<NewsData>) dataMap.get("data");
            logger.info("当前采集页数:{},当前采集关键词:{},当页数据量{}", page, word, dataList.size());
            list.addAll(dataList);
            more = (Boolean) dataMap.get("more");
        } else {
            more = false;
        }
        page++;
        // Pause 5 seconds between requests when no sleep time is configured.
        // NOTE(review): presumably a non-null DataCrawler.sleepTime means throttling
        // is handled elsewhere - confirm against DataCrawler.
        if (DataCrawler.sleepTime == null) {
            ZhiWeiTools.sleep(5000);
        }
    }
    return list;
}
/**
 * Downloads and parses exactly one 360 result page.
 *
 * @param word  search keyword
 * @param site  domain to restrict the search to; may be {@code null}
 * @param time  time-range filter: d, w, m, 3m or y
 * @param proxy proxy handed to the HTTP client
 * @param page  page number to fetch
 * @return the map built by analysisData (keys "data" and "more"), or {@code null}
 *         when the download produced no HTML
 * @throws Exception if the download or the parsing fails
 */
public static Map<String,Object> getSoData(String word, String site, String time, Proxy proxy, int page) throws Exception{
    String pageHtml = downloadHtml(word, site, time, proxy, page);
    return pageHtml == null ? null : analysisData(pageHtml, proxy, word);
}
/**
 * Downloads one 360 search result page for the given keyword, retrying up to
 * three times.
 *
 * @param word  search keyword
 * @param site  domain to restrict the search to; may be {@code null}
 * @param time  time-range filter: d (1 day), w (1 week), m (1 month), 3m (3 months), y (1 year)
 * @param proxy proxy handed to the HTTP client
 * @param page  page number to fetch
 * @return the response body, or {@code null} when no URL could be built
 *         (keyword is {@code null}) or the HTTP client returned {@code null}
 * @throws Exception the failure of the third attempt when all three attempts fail
 */
private static String downloadHtml(String word, String site,String time, Proxy proxy, int page)throws Exception {
    String url = getUrl(word, site, time, page);
    if (url == null) {
        // getUrl yields null for a null keyword; there is nothing to request or retry.
        logger.warn("Skipping 360 download: no URL could be built for keyword {}", word);
        return null;
    }
    // Start from the shared request headers and add the so.com specific ones.
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    headerMap.put("Host", "www.so.com");
    headerMap.put("Referer", url);
    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            return get(url, proxy, headerMap);
        } catch (Exception e) {
            // Pass the exception itself as the trailing argument so SLF4J prints the
            // original stack trace; fillInStackTrace() would overwrite it with this frame.
            logger.error("获取360新闻数据时出现问题,问题为:{}", e.toString(), e);
            if (attempt == maxAttempts) {
                throw e;
            }
        }
    }
    // Not reachable: the last attempt either returns or rethrows. Required by the compiler.
    return null;
}
/**
 * Parses one 360 search result page into news items.
 *
 * <p>Hits whose link goes through {@code www.so.com/link} are resolved to their real
 * address first. Hits that point to {@code www.toutiao.com} are fetched and parsed by
 * getTouTiaoInfo; all other hits are built from the fields of the result list entry.
 * A hit that cannot be resolved or parsed is logged and skipped; the remaining hits
 * of the page are still processed.
 *
 * @param htmlBody HTML of the result page
 * @param proxy    proxy used for the follow-up requests
 * @param word     keyword the page was fetched for; stored on every item
 * @return a map with "data" (a {@code List<NewsData>}) and "more" (a {@code Boolean}
 *         telling whether the pager contains a "next page" entry)
 * @throws Exception declared for callers; per-hit failures are caught and logged
 */
private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception{
    Map<String, Object> resultMap = new HashMap<String, Object>();
    List<NewsData> list = new ArrayList<NewsData>();
    Document document = Jsoup.parse(htmlBody);
    // jsoup's select never returns null; a missing pager simply has empty text,
    // so a single contains() check covers both "no pager" and "no next page".
    boolean more = document.select("div#page").text().contains("下一页");
    Elements elementes = document.select("ul.result").select("li.res-list");
    logger.info("关键词:::{},抓取回来的数据量为:::{}",word, elementes.size());
    for (Element element : elementes) {
        try {
            // Entries with exactly this class string are skipped.
            // NOTE(review): presumably image/media promo entries - confirm.
            if (element.attr("class").equals("res-list hasimg hasmediav")) {
                continue;
            }
            Elements titleAnchor = element.select("h3.res-title").select("a");
            String link = titleAnchor.attr("href");
            String title = titleAnchor.text();
            logger.debug("{}============{}", title, link);
            String realUrl = link;
            if (link.contains("www.so.com/link")) {
                realUrl = getRealURL(link, proxy);
                if (realUrl == null) {
                    // Handle the unresolved redirect explicitly instead of via a caught NPE.
                    logger.warn("Could not resolve 360 redirect link, skipping hit: {}", link);
                    continue;
                }
            }
            NewsData newsData;
            if (realUrl.contains("www.toutiao.com")) {
                newsData = getTouTiaoInfo(realUrl, proxy, word);
            } else {
                String time = element.select("span.gray").text();
                String source = element.select("p.res-linkinfo").select("a.mingpian").text();
                // Normalise the publish time to yyyy-MM-dd HH:mm:ss.
                time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");
                // Summary text shown under the title.
                String content = element.select("[class=\"res-rich so-rich-news clearfix\"]").text();
                newsData = new NewsData(realUrl, title, source, time, content, pt, word);
            }
            // A null item (e.g. a deleted toutiao article) only skips this hit; it must
            // not abort the loop and drop the rest of the page.
            if (newsData != null) {
                list.add(newsData);
            }
        } catch (Exception e) {
            // Trailing throwable argument keeps the original stack trace in the log.
            logger.error("360新闻数据解析时出现问题,问题为:{}", e.toString(), e);
        }
    }
    resultMap.put("data", list);
    resultMap.put("more", more);
    return resultMap;
}
/**
 * Fetches a toutiao.com page and extracts the article or Q&amp;A data embedded in it.
 *
 * <p>Two page layouts are recognised: pages containing "question" (data taken from the
 * {@code __wenda_data} script variable) and pages containing {@code var BASE_DATA = }.
 *
 * @param url   toutiao page address
 * @param proxy proxy handed to the HTTP client
 * @param word  keyword stored on the resulting item
 * @return the parsed item, or {@code null} when the page could not be fetched,
 *         matches neither layout, or parsing fails
 */
private static NewsData getTouTiaoInfo(String url,Proxy proxy,String word) {
    try {
        Map<String,String> headMap = HeaderTool.getCommonHead();
        headMap.put("accept-encoding", "deflate, br");
        headMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36");
        String htmlBody = HttpClientTemplateOK.get(url, proxy, headMap);
        if (htmlBody == null) {
            return null;
        }
        if (htmlBody.contains("question")) {
            // Cut the JSON literal out of the script and restore the tail removed by the split.
            String html = htmlBody.split("var __wenda_data =")[1].split("\"err_tips\":\"\"};")[0]+"\"err_tips\":\"\"}";
            JSONObject question = JSONObject.parseObject(html)
                    .getJSONObject("data").getJSONArray("question").getJSONObject(0);
            JSONObject user = question.getJSONObject("user");
            String title = question.getString("title");
            String content = question.getJSONObject("content").getString("text");
            // NOTE(review): create_time is stored as-is; presumably not in the
            // yyyy-MM-dd HH:mm:ss form used for regular 360 hits - confirm.
            String time = question.getString("create_time");
            String source = user.getString("uname");
            String user_id = user.getString("user_id");
            String link = "https://www.toutiao.com/a" + question.getString("qid") + "/";
            return new NewsData(link, title, source, time, content, "头条问答", word, user_id);
        }
        if (htmlBody.contains("var BASE_DATA = ")) {
            // Keep everything up to "pgcInfo" and close the object with a dummy member.
            String html = htmlBody.split("var BASE_DATA = ")[1].split("pgcInfo")[0]+"data:{}}";
            JSONObject dataJson = JSONObject.parseObject(html);
            JSONObject articleInfo = dataJson.getJSONObject("articleInfo");
            JSONObject mediaInfo = dataJson.getJSONObject("mediaInfo");
            String title = articleInfo.getString("title");
            String content = articleInfo.getString("content");
            String time = articleInfo.getJSONObject("subInfo").getString("time");
            String source = mediaInfo.getString("name");
            String user_id = mediaInfo.getString("uid");
            String link = "https://www.toutiao.com/a" + articleInfo.getString("groupId") + "/";
            return new NewsData(link, title, source, time, content, "今日头条", word, user_id);
        }
        if (htmlBody.contains("404错误页")) {
            logger.info("{}:::数据有问题,该文章已被删除", url);
        } else {
            logger.info("{}:::数据有问题,页面中无文章信息,页面", url);
        }
    } catch (Exception e) {
        // Log through SLF4J with the exception as trailing argument (keeps the original
        // stack trace) instead of printStackTrace() plus fillInStackTrace().
        logger.error("{}:::拉取页面信息出现错误,错误为::{}", url, e.toString(), e);
    }
    return null;
}
/**
 * Builds the 360 search URL for a keyword.
 *
 * @param word keyword; URL-encoded as UTF-8
 * @param site domain to restrict the search to (appended as {@code site:<domain>});
 *             {@code null} means no restriction
 * @param time time-range filter: d (1 day), w (1 week), m (1 month), 3m (3 months),
 *             y (1 year); {@code null} leaves the {@code adv_t} parameter out
 * @param page page number, sent as {@code pn}
 * @return the search URL, or {@code null} when {@code word} is {@code null}
 */
private static String getUrl(String word, String site, String time ,int page) {
    if (word == null) {
        return null;
    }
    StringBuilder url = new StringBuilder("https://www.so.com/s?q=");
    url.append(URLCodeUtil.getURLEncode(word, "utf-8"));
    if (site != null) {
        // "%3A" is the encoded ':' of the site:<domain> search operator.
        url.append("+site%3A").append(site);
    }
    // NOTE(review): psid is a hard-coded value; presumably copied from a browser session - confirm it does not expire.
    url.append("&src=srp&fr=tab_news&psid=5fd92fac25104eda591f0de2029a346b");
    if (time != null) {
        // Omit the filter rather than sending the literal text "null".
        url.append("&adv_t=").append(time);
    }
    url.append("&pn=").append(page);
    String result = url.toString();
    logger.debug("360 search url: {}", result);
    return result;
}
/**
 * Resolves a 360 redirect link ({@code www.so.com/link...}) to the real target address.
 *
 * <p>The redirect page is fetched and the argument of its
 * {@code window.location.replace("...")} call is extracted. A target that starts with
 * {@code http://} is upgraded to {@code https://}; any other target is returned unchanged.
 *
 * @param link  360 redirect link; may be {@code null}
 * @param proxy proxy handed to the HTTP client
 * @return the real address, or {@code null} when {@code link} is {@code null}, the page
 *         could not be fetched, or it contains no redirect call
 */
private static String getRealURL(String link,Proxy proxy) {
    if (link == null) {
        return null;
    }
    final String marker = "window.location.replace(\"";
    final String insecureScheme = "http://";
    try {
        String htmlBody = HttpClientTemplateOK.get(link, proxy, null);
        if (htmlBody == null) {
            return null;
        }
        int start = htmlBody.indexOf(marker);
        if (start < 0) {
            logger.warn("No redirect target found in 360 link page: {}", link);
            return null;
        }
        start += marker.length();
        int end = htmlBody.indexOf("\")", start);
        if (end < 0) {
            logger.warn("Unterminated redirect target in 360 link page: {}", link);
            return null;
        }
        String url = htmlBody.substring(start, end);
        // Only rewrite the scheme prefix. A blanket replace of "http" would turn
        // "https://" into "httpss://" and corrupt any "http" later in the address.
        if (url.startsWith(insecureScheme)) {
            url = "https://" + url.substring(insecureScheme.length());
        }
        return url;
    } catch (Exception e) {
        // Trailing throwable argument makes SLF4J print the stack trace.
        logger.error("Failed to resolve 360 redirect link {}: {}", link, e.toString(), e);
        return null;
    }
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment