Commit f01e39b6 by leiliangliang

增加36氪人气榜和虎嗅热文推荐

parent b20cc34b
......@@ -5,7 +5,7 @@
<version>0.0.6-SNAPSHOT</version>
<name>各平台热搜榜单采集程序</name>
<description>各平台热搜榜单采集程序
目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序</description>
目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
......@@ -51,16 +51,16 @@
<artifactId>lombok</artifactId>
<version>1.18.8</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>org.quartz-scheduler</groupId>-->
<!-- <artifactId>quartz</artifactId>-->
<!-- <version>${quartz.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>org.quartz-scheduler</groupId>-->
<!-- <artifactId>quartz-jobs</artifactId>-->
<!-- <version>${quartz.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>org.quartz-scheduler</groupId>-->
<!-- <artifactId>quartz</artifactId>-->
<!-- <version>${quartz.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>org.quartz-scheduler</groupId>-->
<!-- <artifactId>quartz-jobs</artifactId>-->
<!-- <version>${quartz.version}</version>-->
<!-- </dependency>-->
<!-- Spring文件配置 -->
<dependency>
<groupId>org.springframework</groupId>
......@@ -119,6 +119,11 @@
<version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.6</version>
</dependency>
</dependencies>
......@@ -147,10 +152,10 @@
</filters>
<transformers>
<!-- <transformer-->
<!-- implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">-->
<!-- <mainClass>com.zhiwei.searchhotcrawler.run.HotSearchRun</mainClass>-->
<!-- </transformer>-->
<!-- <transformer-->
<!-- implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">-->
<!-- <mainClass>com.zhiwei.searchhotcrawler.run.HotSearchRun</mainClass>-->
<!-- </transformer>-->
<!-- 不覆盖同名文件,而是追加合并同名文件 -->
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/spring.handlers</resource>
......
......@@ -22,5 +22,7 @@ public enum HotSearchType {
腾讯较真榜,
脉脉热榜,
B站排行榜,
B站热搜
B站热搜,
人气榜36,
虎嗅热文推荐,
}
package com.zhiwei.searchhotcrawler.crawler;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.time.Duration;
import java.util.*;
/**
* @author ll
* @ClassName:HotSearch36KrCrawler
* @Description:
* @date 2021年5月21日 上午11:54:31
*/
@Log4j2
public class HotSearch36KrCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/**
* @return void 返回类型
* @Title: hotSearch36KrCrawler
* @author hero
* @Description: PC端36Kr人气榜采集
*/
public static List<HotSearchList> hotSearch36Kr(Date date) {
String url = "https://www.36kr.com/hot-list/catalog";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
log.error("解析36Kr人气榜时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("article-list")) {
return ansysData(htmlBody,date);
} else {
log.info("解析36Kr人气榜时出现解析错误,页面结构有问题");
}
return Collections.emptyList();
}
// public static List<HotSearchList> hotSearch36Kr(Date date) {
// String url = "https://www.36kr.com/hot-list/catalog";
// //建立一个新的客户端请求(创建HttpClient对象)
// CloseableHttpClient httpClient = HttpClients.createDefault();
// //创建请求对象实例
// HttpGet httpGet = new HttpGet(url);
// httpGet.addHeader("User-Agent", "spider");
// //获取响应的结果
// CloseableHttpResponse response = null;
// try {
// //调用HttpClient对象的execute方法发送请求
// response = httpClient.execute(httpGet);
//
// if (Objects.nonNull(response)) {
// //获取HttpEntity对象其中包含了响应内容(响应头)
// HttpEntity entity = response.getEntity();
//
// String htmlBody = EntityUtils.toString(entity);
// return ansysData(htmlBody,date);
// }
// } catch (Exception e) {
// e.printStackTrace();
// }
// return Collections.emptyList();
// }
/**
* 解析数据
*
* @param htmlBody
* @return
*/
private static List<HotSearchList> ansysData(String htmlBody, Date date) {
List<HotSearchList> list = new ArrayList<>();
String webSite = "https://www.36kr.com";
try {
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("div.article-list").first().select("div.article-wrapper");
if (Objects.nonNull(elements) && !elements.isEmpty()) {
// 获取排名rank
int rank = 0;
for (Element element : elements) {
try {
rank++;
// 获取关键词(String)
String keyWord = element.select("p.title-wrapper").select("a.article-item-title").text();
// logger.info("关键词:{}", kw);
// 获取关键词相关链接everurl(String)
String everurl = element.select("p.title-wrapper").select("a.article-item-title").attr("href");
// 获取搜索指数count(int)
String url = webSite + everurl;
String hot = null;
// 判断热度值所在的规则是否为null
if (!element.select("span").isEmpty()) {
hot = element.select("span").text();
}
Long count = 0L;
// 判断hot是否为空
if (StringUtils.isNotBlank(hot)) {
String[] hots = hot.split("热度");
String trim = hots[1].trim();
Double num = Double.valueOf(trim);
count = Math.round(num);
}
if (Objects.nonNull(rank)) {
if (count == 0) {
log.info(htmlBody);
log.info(hot);
log.info(element);
} else {
HotSearchList hotSearch = new HotSearchList(url, keyWord, count, rank, HotSearchType.人气榜36.name(), date);
list.add(hotSearch);
}
}
} catch (Exception e) {
log.error("解析36Kr人气榜时出现解析错误", e);
}
}
}
} catch (Exception e) {
log.error("解析36Kr人气榜时出现解析错误,数据不是json结构", e);
}
return list;
}
}
package com.zhiwei.searchhotcrawler.crawler;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.time.Duration;
import java.util.*;
/**
* @author: ll
* @ClassName: HuXiuHotSearchCrawler
* @Description: pc端虎嗅热文推荐采集
* @date: 2021年5月24日 下午16:35:31
* @Title: HuXiuHotSearchCrawler
*/
@Log4j2
public class HuXiuHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
public static List<HotSearchList> HuXiuHotArticleRecommended(Date date){
String url = "https://www.huxiu.com/";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
log.error("解析虎嗅热文推荐时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("hot__list")) {
return ansysData(htmlBody,date);
} else {
log.info("解析虎嗅热文推荐时出现解析错误,页面结构有问题");
}
return Collections.emptyList();
}
// String url="https://www.huxiu.com/";
// //创建客户端请求对象
// CloseableHttpClient httpClient = HttpClients.createDefault();
// //创建请求对象实例
// HttpGet httpGet = new HttpGet(url);
// //设置头信息
// httpGet.addHeader("User-Agent","spider");
//
// //获取响应结果
// try {
// CloseableHttpResponse response = httpClient.execute(httpGet);
// //判断响应结果是否为空
// if (Objects.nonNull(response)) {
// //获取HttpEntity对象其中包含了响应内容(响应头)
// HttpEntity entity = response.getEntity();
// String htmlBody = EntityUtils.toString(entity);
// return ansysData(htmlBody,date);
// }
// } catch (Exception e) {
// e.printStackTrace();
// }
//
// return Collections.emptyList();
// }
//解析页面数据
private static List<HotSearchList> ansysData(String htmlBody, Date date) {
ArrayList<HotSearchList> list = new ArrayList<>();
String webSite="https://www.huxiu.com";
try {
//获取Document文档对象
Document document = Jsoup.parse(htmlBody);
//获取元素集合
Elements elements = document.select("div.hot__list").select("div.focus-item");
if (Objects.nonNull(elements) && !elements.isEmpty()){
// 获取排名rank
Integer rank = 0;
for (Element element : elements) {
try {
rank++;
//获取关键词
String keyWord= element.select("p").text();
//获取关键词相关链接
String href = element.select("a.focus-item__left").attr("href");
String url=webSite+href;
//获取讨论量
String comment = element.select("i").first().text();
Long commentCount = Long.valueOf(comment);
String topicLead =null;
long count=0L;
HotSearchList hotSearchList = new HotSearchList(url, keyWord,count, rank,HotSearchType.虎嗅热文推荐.name(),commentCount, topicLead, date);
list.add(hotSearchList);
} catch (NumberFormatException e) {
log.error("解析虎嗅热文推荐时出现解析错误",e);
}
}
}
} catch (Exception e) {
log.error("解析虎嗅热文推荐时出现解析错误,数据不是json结构",e);
}
return list;
}
}
package com.zhiwei.searchhotcrawler.dao;
import com.mongodb.BasicDBObject;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler;
......@@ -52,6 +50,10 @@ public class HotSearchCacheDAO {
// if("今日头条热搜".equals(hotSearch.getType())){
// document.put("comment_count", hotSearch.getCommentCount());
// }
if("虎嗅热文推荐".equals(hotSearch.getType())){
document.put("comment_count", hotSearch.getCommentCount());
}
if("腾讯较真榜".equals(hotSearch.getType())){
document.put("topic_result",hotSearch.getTopicResult());
}
......@@ -125,7 +127,7 @@ public class HotSearchCacheDAO {
//计算上升速度
double riseSpeed = nowDoc.containsKey("riseSpeed")?nowDoc.getDouble("riseSpeed"):0.00;
if(nonNull(lastCount) && nowDoc.containsKey("firstCount")) {
long firstCount = Long.parseLong(nowDoc.get("firstCount").toString());
long firstCount = nowDoc.getLong("firstCount");
riseSpeed = ((double)(lastCount - firstCount)/(double)firstCount)*1000/((double)duration);
}
// endTime = getEndTime(type, new Date());
......@@ -181,6 +183,10 @@ public class HotSearchCacheDAO {
// if(readCount != null){
// nowDoc.put("readCount",readCount);
// }
if("虎嗅热文推荐".equals(type)){
nowDoc.put("comment_count",document.getLong("comment_count"));
}
if(topicResult != null){
nowDoc.put("topicResult",topicResult);
}
......@@ -207,7 +213,7 @@ public class HotSearchCacheDAO {
}
}
}catch (Exception e){
log.error("数据存储时出错:", e);
log.error("数据存储时出错:{}", e);
}
}
......
package com.zhiwei.searchhotcrawler.test;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.time.Duration;
import java.util.*;
/**
* @author ll
* @ClassName:HotSearch36KrCrawler
* @Description:
* @date 2021年5月21日 上午11:54:31
*/
@Log4j2
public class HotSearch36KrCrawlerTest {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/**
* @return void 返回类型
* @Title: hotSearch36KrCrawler
* @author hero
* @Description: PC端36Kr人气榜采集
*/
public static List<HotSearchList> hotSearch36Kr(Date date) {
String url = "https://www.36kr.com/hot-list/catalog";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
log.error("解析36Kr人气榜时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("article-list")) {
return ansysData(htmlBody,date);
} else {
log.info("解析36Kr人气榜时出现解析错误,页面结构有问题");
}
return Collections.emptyList();
}
// public static List<HotSearchList> hotSearch36Kr(Date date) {
// String url = "https://www.36kr.com/hot-list/catalog";
// //建立一个新的客户端请求(创建HttpClient对象)
// CloseableHttpClient httpClient = HttpClients.createDefault();
// //创建请求对象实例
// HttpGet httpGet = new HttpGet(url);
// httpGet.addHeader("User-Agent", "spider");
// //获取响应的结果
// CloseableHttpResponse response = null;
// try {
// //调用HttpClient对象的execute方法发送请求
// response = httpClient.execute(httpGet);
//
// if (Objects.nonNull(response)) {
// //获取HttpEntity对象其中包含了响应内容(响应头)
// HttpEntity entity = response.getEntity();
//
// String htmlBody = EntityUtils.toString(entity);
// return ansysData(htmlBody,date);
// }
// } catch (Exception e) {
// e.printStackTrace();
// }
// return Collections.emptyList();
// }
/**
* 解析数据
*
* @param htmlBody
* @return
*/
private static List<HotSearchList> ansysData(String htmlBody, Date date) {
List<HotSearchList> list = new ArrayList<>();
String webSite = "https://www.36kr.com";
try {
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("div.article-list").first().select("div.article-wrapper");
if (Objects.nonNull(elements) && !elements.isEmpty()) {
// 获取排名rank
int rank = 0;
for (Element element : elements) {
try {
rank++;
// 获取关键词(String)
String keyWord = element.select("p.title-wrapper").select("a.article-item-title").text();
// logger.info("关键词:{}", kw);
// 获取关键词相关链接everurl(String)
String everurl = element.select("p.title-wrapper").select("a.article-item-title").attr("href");
// 获取搜索指数count(int)
String url = webSite + everurl;
String hot = null;
// 判断热度值所在的规则是否为null
if (!element.select("span").isEmpty()) {
hot = element.select("span").text();
}
Long count = 0L;
// 判断hot是否为空
if (StringUtils.isNotBlank(hot)) {
String[] hots = hot.split("热度");
String trim = hots[1].trim();
Double num = Double.valueOf(trim);
count = Math.round(num);
}
if (Objects.nonNull(rank)) {
if (count == 0) {
log.info(htmlBody);
log.info(hot);
log.info(element);
} else {
HotSearchList hotSearch = new HotSearchList(url, keyWord, count, rank, HotSearchType.人气榜36.name(), date);
list.add(hotSearch);
}
}
} catch (Exception e) {
log.error("解析36Kr人气榜时出现解析错误", e);
}
}
}
} catch (Exception e) {
log.error("解析36Kr人气榜时出现解析错误,数据不是json结构", e);
}
return list;
}
}
package com.zhiwei.searchhotcrawler.test;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.time.Duration;
import java.util.*;
/**
* @author: ll
* @ClassName: HuXiuHotSearchCrawler
* @Description: pc端虎嗅热文推荐采集
* @date: 2021年5月24日 下午16:35:31
* @Title: HuXiuHotSearchCrawler
*/
@Log4j2
public class HuXiuHotSearchCrawlerTest {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
public static List<HotSearchList> HuXiuHotArticleRecommended(Date date){
String url = "https://www.huxiu.com/";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
log.error("解析虎嗅热文推荐时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("hot__list")) {
return ansysData(htmlBody,date);
} else {
log.info("解析虎嗅热文推荐时出现解析错误,页面结构有问题");
}
return Collections.emptyList();
}
// String url="https://www.huxiu.com/";
// //创建客户端请求对象
// CloseableHttpClient httpClient = HttpClients.createDefault();
// //创建请求对象实例
// HttpGet httpGet = new HttpGet(url);
// //设置头信息
// httpGet.addHeader("User-Agent","spider");
//
// //获取响应结果
// try {
// CloseableHttpResponse response = httpClient.execute(httpGet);
// //判断响应结果是否为空
// if (Objects.nonNull(response)) {
// //获取HttpEntity对象其中包含了响应内容(响应头)
// HttpEntity entity = response.getEntity();
// String htmlBody = EntityUtils.toString(entity);
// return ansysData(htmlBody,date);
// }
// } catch (Exception e) {
// e.printStackTrace();
// }
//
// return Collections.emptyList();
// }
//解析页面数据
private static List<HotSearchList> ansysData(String htmlBody, Date date) {
ArrayList<HotSearchList> list = new ArrayList<>();
String webSite="https://www.huxiu.com";
try {
//获取Document文档对象
Document document = Jsoup.parse(htmlBody);
//获取元素集合
Elements elements = document.select("div.hot__list").select("div.focus-item");
if (Objects.nonNull(elements) && !elements.isEmpty()){
// 获取排名rank
Integer rank = 0;
for (Element element : elements) {
try {
rank++;
//获取关键词
String keyWord= element.select("p").text();
//获取关键词相关链接
String href = element.select("a.focus-item__left").attr("href");
String url=webSite+href;
//获取讨论量
String comment = element.select("i").first().text();
Long commentCount = Long.valueOf(comment);
String topicLead =null;
long count=0L;
HotSearchList hotSearchList = new HotSearchList(url, keyWord,count, rank,HotSearchType.虎嗅热文推荐.name(),commentCount, topicLead, date);
list.add(hotSearchList);
} catch (NumberFormatException e) {
log.error("解析虎嗅热文推荐时出现解析错误",e);
}
}
}
} catch (Exception e) {
log.error("解析虎嗅热文推荐时出现解析错误,数据不是json结构",e);
}
return list;
}
}
......@@ -8,6 +8,8 @@ import com.zhiwei.searchhotcrawler.crawler.*;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.RedisDao;
import com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO;
import com.zhiwei.searchhotcrawler.crawler.HotSearch36KrCrawler;
import com.zhiwei.searchhotcrawler.crawler.HuXiuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.timer.TouTiaoExecutor;
import com.zhiwei.searchhotcrawler.util.DateUtils;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
......@@ -38,6 +40,30 @@ public class GatherTimer {
/** 知乎时事子分类 */
private String DEPTH = "depth";
/**
 * Scheduled collection of two rankings in a single run: first the Huxiu hot-article
 * recommendations, then the 36Kr popularity ranking.
 *
 * <p>Both crawls share one timestamp and each result list is handed to
 * {@code TipsUtils.addHotList} under its {@code HotSearchType} name. The trigger is the
 * cron expression on the annotation below, executed on the {@code myScheduler} executor.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ?")
public void crawlerHuXiu() {
    // --- Huxiu hot-article recommendations ---
    logger.info("虎嗅热文推荐开始采集...");
    final Date date = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> huXiuList = HuXiuHotSearchCrawler.HuXiuHotArticleRecommended(date);
    final Date huXiuLoggedAt = new Date();
    int huXiuSize = 0;
    if (huXiuList != null) {
        huXiuSize = huXiuList.size();
    }
    logger.info("{}, 虎嗅热文推荐此轮采集到的数据量为:{}", huXiuLoggedAt, Integer.valueOf(huXiuSize));
    TipsUtils.addHotList(HotSearchType.虎嗅热文推荐.name(), huXiuList);
    logger.info("虎嗅热文推荐采集结束...");

    // --- 36Kr popularity ranking (reuses the timestamp taken above) ---
    logger.info("36氪人气榜开始采集...");
    final List<HotSearchList> list36Kr = HotSearch36KrCrawler.hotSearch36Kr(date);
    final Date kr36LoggedAt = new Date();
    int kr36Size = 0;
    if (list36Kr != null) {
        kr36Size = list36Kr.size();
    }
    logger.info("{}, 36氪人气榜此轮采集到的数据量为:{}", kr36LoggedAt, Integer.valueOf(kr36Size));
    TipsUtils.addHotList(HotSearchType.人气榜36.name(), list36Kr);
    logger.info("36氪人气榜采集结束...");
}
/**
* 微博热搜的采集
*/
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment