Commit f671bae7 by 马黎滨

Merge branch 'mlbWork' into 'master'

定时器采集

See merge request !23
parents d5791e24 f3e0b6c2
......@@ -10,6 +10,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<spring.version>4.2.2.RELEASE</spring.version>
</properties>
<developers>
......@@ -55,6 +56,62 @@
<artifactId>crawler-core</artifactId>
<version>0.6.0.4-RELEASE</version>
</dependency>
<dependency>
<groupId>org.quartz-scheduler</groupId>
<artifactId>quartz</artifactId>
<version>${quartz.version}</version>
</dependency>
<dependency>
<groupId>org.quartz-scheduler</groupId>
<artifactId>quartz-jobs</artifactId>
<version>${quartz.version}</version>
</dependency>
<!-- Spring文件配置 -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-aop</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-beans</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-core</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-expression</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context-support</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-web</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-tx</artifactId>
<version>${spring.version}</version>
</dependency>
</dependencies>
<build>
......
......@@ -81,42 +81,42 @@ public class HotSearchList implements Serializable{
public HotSearchList(){}
public HotSearchList(String url, String name, Integer count,Boolean hot,Integer rank,String type,String icon){
public HotSearchList(String url, String name, Integer count,Boolean hot,Integer rank,String type,String icon,Date date){
this.id = name + "_" + new Date().getTime() + "_" + type;
this.url = url;
this.name = name;
this.count = count;
this.hot = hot;
this.rank = rank;
this.time = new Date();
this.day = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
this.time = date;
this.day = TimeParse.dateFormartString(date, "yyyy-MM-dd");
this.type = type;
this.icon = icon;
}
public HotSearchList(String url, String name, Integer count,Integer rank,String type){
public HotSearchList(String url, String name, Integer count,Integer rank,String type,Date date){
this.id = name + "_" + new Date().getTime()+ "_" + type;
this.url = url;
this.name = name;
this.count = count;
this.hot = true;
this.rank = rank;
this.time = new Date();
this.day = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
this.time = date;
this.day = TimeParse.dateFormartString(date, "yyyy-MM-dd");
this.type = type;
}
public HotSearchList(String url, String name, Integer count,Integer rank,String type, Integer commentCount, String topicLead){
public HotSearchList(String url, String name, Integer count,Integer rank,String type, Integer commentCount, String topicLead,Date date){
this.id = name + "_" + new Date().getTime()+ "_" + type;
this.url = url;
this.name = name;
this.count = count;
this.hot = true;
this.rank = rank;
this.time = new Date();
this.day = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
this.time = date;
this.day = TimeParse.dateFormartString(date, "yyyy-MM-dd");
this.type = type;
this.commentCount = commentCount;
this.topicLead = topicLead;
......
package com.zhiwei.searchhotcrawler.crawler;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.*;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
......@@ -39,7 +36,7 @@ public class BaiDuHotSearchCrawler {
* @Description: PC端百度风云榜采集
* @return List<HotSearchList> 返回类型
*/
public static List<HotSearchList> baiduHotSearch() {
public static List<HotSearchList> baiduHotSearch(Date date) {
String url = "http://top.baidu.com/buzz?b=1&fr=topindex";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
......@@ -49,7 +46,7 @@ public class BaiDuHotSearchCrawler {
log.error("解析百度风云榜时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("mainBody")) {
return ansysData(htmlBody);
return ansysData(htmlBody,date);
} else {
log.info("解析百度风云榜时出现解析错误,页面结构有问题");
}
......@@ -62,7 +59,7 @@ public class BaiDuHotSearchCrawler {
* @param htmlBody
* @return
*/
private static List<HotSearchList> ansysData(String htmlBody){
private static List<HotSearchList> ansysData(String htmlBody,Date date){
List<HotSearchList> list = new ArrayList<>();
try {
Document document = Jsoup.parse(htmlBody);
......@@ -110,7 +107,7 @@ public class BaiDuHotSearchCrawler {
log.info(hot);
log.info(element);
} else {
HotSearchList hotSearch = new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name());
HotSearchList hotSearch = new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name(),date);
list.add(hotSearch);
}
}
......
......@@ -2,6 +2,7 @@ package com.zhiwei.searchhotcrawler.crawler;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import lombok.extern.log4j.Log4j2;
......@@ -32,6 +33,8 @@ public class DouyinHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).build();
public static List<HotSearchList> list = new ArrayList<>();
/**
* @Title: getMobileDouyinHotList
* @author hero
......@@ -39,8 +42,8 @@ public class DouyinHotSearchCrawler {
* @param @return 设定文件
* @return List<HotSearchList> 返回类型
*/
public static List<HotSearchList> getMobileDouyinHotList(){
List<HotSearchList> list = new ArrayList<>();
public static List<HotSearchList> getMobileDouyinHotList(Date date){
//List<HotSearchList> list = new ArrayList<>();
String url = "https://api.amemv.com/aweme/v1/hot/search/list/";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
......@@ -69,7 +72,7 @@ public class DouyinHotSearchCrawler {
Integer hotValue = null;
hotValue = Integer.valueOf(hotValueStr);
// logger.info("热度为:::{}", hot_value);
HotSearchList douyin = new HotSearchList(null, word, hotValue, position, HotSearchType.抖音热搜.name());
HotSearchList douyin = new HotSearchList(null, word, hotValue, position, HotSearchType.抖音热搜.name(),date);
list.add(douyin);
}
}
......
......@@ -26,7 +26,7 @@ public class FengHuangSearchCrawler {
* 获取凤凰新闻热榜
* @return
*/
public static List<HotSearchList> getFengHuangHotList(){
public static List<HotSearchList> getFengHuangHotData(Date date){
log.info("凤凰新闻热榜开始采集");
List<HotSearchList> list = new ArrayList<>();
for(int page = 1; page <=2; page++) {
......@@ -50,7 +50,7 @@ public class FengHuangSearchCrawler {
Integer count = TipsUtils.getHotCount(hotValue);
Integer commentCount = jsonArray.getJSONObject(i).getIntValue("commentsall");
HotSearchList hotSearchList = new HotSearchList(fenghuangUrl,name,count,
rank,HotSearchType.凤凰新闻热榜.name(),commentCount,topicLead);
rank,HotSearchType.凤凰新闻热榜.name(),commentCount,topicLead,date);
list.add(hotSearchList);
}
}
......@@ -65,7 +65,7 @@ public class FengHuangSearchCrawler {
* 获取凤凰新闻热搜
* @return
*/
public static List<HotSearchList> getFengHuangHotData(){
public static List<HotSearchList> getFengHuangHotSearch(Date date){
log.info("凤凰新闻热搜开始采集");
List<HotSearchList> list = new ArrayList<>();
String url = "https://shankapi.ifeng.com/autumn/sogouSearchHotword";
......@@ -89,7 +89,7 @@ public class FengHuangSearchCrawler {
fenghuangUrl = "https://so.ifeng.com/?q="+id;
}
HotSearchList hotSearchList = new HotSearchList(fenghuangUrl, name, null, rank,
HotSearchType.凤凰新闻热搜.name());
HotSearchType.凤凰新闻热搜.name(),date);
list.add(hotSearchList);
}
}
......
package com.zhiwei.searchhotcrawler.crawler;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.*;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
......@@ -41,7 +37,7 @@ public class SougoHotSearchCrawler {
* @Description: TODO(PC端搜狗微信关键词采集)
* @return List<HotSearchList> 返回类型
*/
public static List<HotSearchList> sougoHotSearch() {
public static List<HotSearchList> sougoHotSearch(Date date) {
String url = "https://weixin.sogou.com";
List<HotSearchList> list = new ArrayList<>();
Map<String,String> headMap = HeaderTool.getCommonHead();
......@@ -75,7 +71,7 @@ public class SougoHotSearchCrawler {
String everurl = element.select("li").select("a").attr("href");
HotSearchList hotSearch = new HotSearchList(everurl, kw, null, rank, HotSearchType.搜狗微信热搜.name());
HotSearchList hotSearch = new HotSearchList(everurl, kw, null, rank, HotSearchType.搜狗微信热搜.name(),date);
if (Objects.nonNull(rank)) {
list.add(hotSearch);
......
......@@ -15,6 +15,7 @@ import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
......@@ -22,7 +23,7 @@ import java.util.List;
public class SouhuTopicCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
public static List<HotSearchList> getSouhuTopic(){
public static List<HotSearchList> getSouhuTopic(Date date){
List<HotSearchList> hotSearchLists = new ArrayList<>();
log.info("搜狐话题榜开始采集...");
JSONArray dataJson = null;
......@@ -43,10 +44,16 @@ public class SouhuTopicCrawler {
Integer rank = i+1;
String name = dataJson.getJSONObject(i).getJSONObject("eventNewsInfo").getString("title");
String hotValue = dataJson.getJSONObject(i).getString("value");
Integer count = TipsUtils.getHotCount(hotValue.substring(0,hotValue.indexOf("观点")));
Integer count = 0;
if(hotValue.contains("观点")) {
count = TipsUtils.getHotCount(hotValue.substring(0, hotValue.indexOf("观点")));
}else{
log.error("搜狐话题采集热度为空,采集结束");
return Collections.emptyList();
}
String souguUrl = dataJson.getJSONObject(i).getJSONObject("eventNewsInfo").getString("h5Link");
String icon = dataJson.getJSONObject(i).getJSONObject("attrInfo").getString("displayText");
HotSearchList hotSearchList = new HotSearchList(souguUrl,name,count,true,rank, HotSearchType.搜狐话题.name(),icon);
HotSearchList hotSearchList = new HotSearchList(souguUrl,name,count,true,rank, HotSearchType.搜狐话题.name(),icon,date);
hotSearchLists.add(hotSearchList);
}
log.info("{}, 此轮搜狐话题榜采集到的数据量为:{}", new Date(), Integer.valueOf(hotSearchLists != null ? hotSearchLists.size() : 0));
......
......@@ -26,7 +26,7 @@ public class TengXunCrawler {
* 腾讯热榜数据采集
* @return
*/
public static List<HotSearchList> getTengXunHotList() {
public static List<HotSearchList> getTengXunHotList(Date date) {
log.info("腾讯新闻热榜开始采集...");
List<HotSearchList> list = new ArrayList<>();
JSONArray dataJson = null;
......@@ -68,7 +68,7 @@ public class TengXunCrawler {
icon = "新";
}
}
HotSearchList hotSearchList = new HotSearchList(tengxunUrl, name, count, false, rank, HotSearchType.腾讯新闻.name(), icon);
HotSearchList hotSearchList = new HotSearchList(tengxunUrl, name, count, false, rank, HotSearchType.腾讯新闻.name(), icon,date);
list.add(hotSearchList);
}
}
......
......@@ -36,7 +36,7 @@ public class ToutiaoHotSearchCrawler {
* @Description: TODO(手机端Iphone 今日头条热搜采集)
* @return List<HotSearchList> 返回类型
*/
public static List<HotSearchList> toutiaoHotSearchByPhone(){
public static List<HotSearchList> toutiaoHotSearchByPhone(Date date){
String origin = "hot_board";
String jsUrl = "https://s3.pstatp.com/toutiao/feoffline/hot_list/resource/hot_list/js/index.45f50250.chunk.js";
Request jsRequest = RequestUtils.wrapGet(jsUrl);
......@@ -77,7 +77,7 @@ public class ToutiaoHotSearchCrawler {
String wordsType = word.getString("Label");
String icon = getIcon(wordsType);
HotSearchList hotSearch = new HotSearchList(link, name, hotCount, true, rank, HotSearchType.今日头条热搜.name(), icon);
HotSearchList hotSearch = new HotSearchList(link, name, hotCount, true, rank, HotSearchType.今日头条热搜.name(), icon,date);
result.add(hotSearch);
rank++;
} catch (Exception e) {
......
......@@ -32,7 +32,7 @@ public class WangYiHotSearchCrawler {
* 网易新闻实时热榜的采集
* @return
*/
public static List<HotSearchList> getWangYiHotSearch(){
public static List<HotSearchList> getWangYiHotSearch(Date date){
List<HotSearchList> hotSearchLists = new ArrayList<>();
log.info("网易新闻实时热榜开始采集");
String url = "https://v6-gw.m.163.com/nc-main/api/v1/hqc/no-repeat-hot-list";
......@@ -54,7 +54,7 @@ public class WangYiHotSearchCrawler {
int count = jsonObject.getJSONObject(i).getIntValue("hotValue");
String contentId = jsonObject.getJSONObject(i).getString("contentId");
String wangyiUrl = "https://c.m.163.com/news/a/" + contentId + ".html";
HotSearchList hotSearchList = new HotSearchList(wangyiUrl, name, count, rank, HotSearchType.网易热榜.name());
HotSearchList hotSearchList = new HotSearchList(wangyiUrl, name, count, rank, HotSearchType.网易热榜.name(),date);
hotSearchLists.add(hotSearchList);
}
log.info("{}, 此轮网易新闻热榜采集到的数据量为:{}", new Date(), Integer.valueOf(hotSearchLists != null ? hotSearchLists.size() : 0));
......@@ -71,7 +71,7 @@ public class WangYiHotSearchCrawler {
* 网易新闻跟帖热议的采集
* @return
*/
public static List<HotSearchList> getWangYicomment(){
public static List<HotSearchList> getWangYicomment(Date date){
List<HotSearchList> hotSearchLists = new ArrayList<>();
log.info("网易新闻跟贴热议开始采集");
String url = "https://v6-gw.m.163.com/gentie-web/api/v2/products/a2869674571f77b5a0867c3d71db5856/rankDocs/all/list?ibc=newsapph5&limit=30";
......@@ -93,7 +93,7 @@ public class WangYiHotSearchCrawler {
int count = jsonObject.getJSONObject(i).getIntValue("hotScore")*10000;
String contentId = jsonObject.getJSONObject(i).getString("docId");
String wangyiUrl = "https://c.m.163.com/news/a/" + contentId + ".html";
HotSearchList hotSearchList = new HotSearchList(wangyiUrl, name, count, rank, HotSearchType.网易跟帖热议.name());
HotSearchList hotSearchList = new HotSearchList(wangyiUrl, name, count, rank, HotSearchType.网易跟帖热议.name(),date);
hotSearchLists.add(hotSearchList);
}
log.info("{}, 此轮网易新闻跟贴热议采集到的数据量为:{}", new Date(), Integer.valueOf(hotSearchLists != null ? hotSearchLists.size() : 0));
......
......@@ -40,60 +40,60 @@ public class WeiboHotSearchCrawler {
* @Description: TODO(PC端微博热搜采集)
* @return void 返回类型
*/
public static List<HotSearchList> weiboHotSearch(){
String url = "https://s.weibo.com/top/summary?cate=realtimehot";
List<HotSearchList> list = new ArrayList<HotSearchList>();
for(int i =0; i<3; i++){
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
if(i==2){
return list;
}else{
continue;
}
}
if (htmlBody != null && htmlBody.contains("pl_top_realtimehot")) {
try {
// String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0];
// script = script.replace("(", "").replace(")", "");
// JSONObject json = JSONObject.parseObject(script);
// String html = json.getString("html");
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("div#pl_top_realtimehot").select("tbody").select("tr");
for (Element element : elements) {
try {
String id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href");
String name = element.select("td.td-02").select("a").text();
String num = !element.select("td.td-02").select("span").text().equals("") ? element.select("td.td-02").select("span").text() : "0";
String rank = !element.select("td[class=\"td-01 ranktop\"]").text().equals("") ? element.select("td[class=\"td-01 ranktop\"]").text() : "-1";
int hotCount = Integer.valueOf(num);
int rankCount = Integer.valueOf(rank);
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, true, rankCount, HotSearchType.微博热搜.name(), null);
list.add(hotSearch);
} catch (Exception e) {
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
log.error("解析微博时时热搜时出现解析错误", e);
continue;
}
}
} catch (Exception e) {
log.error("解析微博时时热搜时出现解析错误,数据不是json结构", e.fillInStackTrace());
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
return null;
}
} else {
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
}
break;
}
return list;
}
// public static List<HotSearchList> weiboHotSearch(){
// String url = "https://s.weibo.com/top/summary?cate=realtimehot";
//
// List<HotSearchList> list = new ArrayList<HotSearchList>();
// for(int i =0; i<3; i++){
// String htmlBody = null;
// Request request = RequestUtils.wrapGet(url);
// try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
// htmlBody = response.body().string();
// } catch (Exception e) {
// if(i==2){
// return list;
// }else{
// continue;
// }
// }
// if (htmlBody != null && htmlBody.contains("pl_top_realtimehot")) {
// try {
//// String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0];
//// script = script.replace("(", "").replace(")", "");
//// JSONObject json = JSONObject.parseObject(script);
//// String html = json.getString("html");
// Document document = Jsoup.parse(htmlBody);
// Elements elements = document.select("div#pl_top_realtimehot").select("tbody").select("tr");
// for (Element element : elements) {
// try {
// String id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href");
// String name = element.select("td.td-02").select("a").text();
// String num = !element.select("td.td-02").select("span").text().equals("") ? element.select("td.td-02").select("span").text() : "0";
// String rank = !element.select("td[class=\"td-01 ranktop\"]").text().equals("") ? element.select("td[class=\"td-01 ranktop\"]").text() : "-1";
//
// int hotCount = Integer.valueOf(num);
// int rankCount = Integer.valueOf(rank);
// HotSearchList hotSearch = new HotSearchList(id, name, hotCount, true, rankCount, HotSearchType.微博热搜.name(), null);
// list.add(hotSearch);
// } catch (Exception e) {
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
// log.error("解析微博时时热搜时出现解析错误", e);
// continue;
// }
// }
// } catch (Exception e) {
// log.error("解析微博时时热搜时出现解析错误,数据不是json结构", e.fillInStackTrace());
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
// return null;
// }
// } else {
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
// log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
// }
// break;
// }
// return list;
// }
......@@ -104,7 +104,7 @@ public class WeiboHotSearchCrawler {
* @Description: TODO(手机端Iphone 微博热搜采集)
* @return List<HotSearchList> 返回类型
*/
public static List<HotSearchList> weiboHotSearchByPhone(){
public static List<HotSearchList> weiboHotSearchByPhone(Date date){
String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583";
Map<String,String> headerMap = new HashMap<>();
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
......@@ -142,7 +142,7 @@ public class WeiboHotSearchCrawler {
icon = icon.split("_")[1].split(".png")[0];
}
String id = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon);
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
result.add(hotSearch);
rank++;
}
......
......@@ -130,7 +130,7 @@ public class WeiboTopicCrawler {
/**
* 微博平话题榜采集
*/
public static List<HotSearchList> startCrawlerByPhone(){
public static List<HotSearchList> startCrawlerByPhone(Date date){
List<HotSearchList> topicList = new ArrayList<>();
for(int page=1; page<=6; page++){
String pageUrl = "https://m.weibo.cn/api/container/getIndex?containerid=231648_-_2&page=" + page;
......@@ -146,7 +146,7 @@ public class WeiboTopicCrawler {
continue;
}
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) {
topicList.addAll(parseTopicHtml(htmlBody));
topicList.addAll(parseTopicHtml(htmlBody,date));
break;
} else {
log.info("下载榜单列表页面时数据格式错误,页面为:{}", htmlBody);
......@@ -157,7 +157,7 @@ public class WeiboTopicCrawler {
}
private static List<HotSearchList> parseTopicHtml(String htmlBody) {
private static List<HotSearchList> parseTopicHtml(String htmlBody,Date date) {
try {
JSONArray cards = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("cards");
if(Objects.nonNull(cards) && !cards.isEmpty()) {
......@@ -207,7 +207,7 @@ public class WeiboTopicCrawler {
}catch (Exception e){
e.printStackTrace();
}
HotSearchList topic = new HotSearchList(url, topicName, readNum, rank, HotSearchType.微博话题.name(), commentNum, description);
HotSearchList topic = new HotSearchList(url, topicName, readNum, rank, HotSearchType.微博话题.name(), commentNum, description,date);
topicList.add(topic);
}
return topicList;
......
......@@ -28,7 +28,7 @@ public class XinLangHotSearchCrawler {
* 新浪热榜的采集
* @return
*/
public static List<HotSearchList> getXinLangHotSearch(){
public static List<HotSearchList> getXinLangHotSearch(Date date){
List<HotSearchList> hotSearchLists = new ArrayList<>();
log.info("新浪热榜开始采集");
String url = "https://sinanews.sina.cn/h5/top_news_list.d.html";
......@@ -54,6 +54,12 @@ public class XinLangHotSearchCrawler {
String hotValue = jsonArray.getJSONObject(i).getString("hotValue");
Integer count = TipsUtils.getHotCount(hotValue);
String showTags = jsonArray.getJSONObject(i).getString("showTags");
String routeUri = jsonArray.getJSONObject(i).getString("routeUri");
String xinLangUrl = null;
if(routeUri.contains("groupId")){
xinLangUrl = "https://super.sina.cn/shequn/forum/detail_" +
routeUri.substring(routeUri.indexOf("groupId=")+8) + ".html";
}
String icon = null;
if (showTags.contains("新")) {
icon = "新";
......@@ -62,7 +68,7 @@ public class XinLangHotSearchCrawler {
} else if (showTags.contains("沸")) {
icon = "沸";
}
HotSearchList hotSearchList = new HotSearchList(null, name, count, true, rank, HotSearchType.新浪热榜.name(), icon);
HotSearchList hotSearchList = new HotSearchList(xinLangUrl, name, count, true, rank, HotSearchType.新浪热榜.name(), icon, date);
hotSearchLists.add(hotSearchList);
}
log.info("{}, 此轮新浪热榜采集到的数据量为:{}", new Date(), Integer.valueOf(hotSearchLists != null ? hotSearchLists.size() : 0));
......@@ -80,7 +86,7 @@ public class XinLangHotSearchCrawler {
* 新浪热点的采集
* @return
*/
public static List<HotSearchList> getXinLangHotSpot(){
public static List<HotSearchList> getXinLangHotSpot(Date date){
List<HotSearchList> hotSearchLists = new ArrayList<>();
log.info("新浪热点开始采集");
String url = "http://interface.sina.cn/wap_api/hot_rank_data.d.json";
......@@ -102,7 +108,7 @@ public class XinLangHotSearchCrawler {
String name = dataJson.getJSONObject(i).getString("title");
String xinlangUrl = dataJson.getJSONObject(i).getString("wapurl");
Integer hot = dataJson.getJSONObject(i).getIntValue("hot_value");
HotSearchList hotSearchList = new HotSearchList(xinlangUrl, name, hot, rank, HotSearchType.新浪热点.name());
HotSearchList hotSearchList = new HotSearchList(xinlangUrl, name, hot, rank, HotSearchType.新浪热点.name(),date);
hotSearchLists.add(hotSearchList);
}
log.info("{}, 此轮新浪热点采集到的数据量为:{}", new Date(), Integer.valueOf(hotSearchLists != null ? hotSearchLists.size() : 0));
......
......@@ -16,10 +16,7 @@ import okhttp3.Request;
import okhttp3.Response;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
@Log4j2
public class ZhihuChildHotSearchCrawler {
......@@ -32,7 +29,7 @@ public class ZhihuChildHotSearchCrawler {
* @param typeName
* @return
*/
public static List<HotSearchList> getZhihuTopicSearch(String type,String typeName) {
public static List<HotSearchList> getZhihuTopicSearch(String type, String typeName, Date date) {
List<HotSearchList> list = new ArrayList<>();
String url = "https://www.zhihu.com/api/v3/feed/topstory/hot-lists/"+type;
Map<String,String> headerMap = new HashMap<>();
......@@ -57,7 +54,7 @@ public class ZhihuChildHotSearchCrawler {
String hotCountString = jsonObject.getJSONObject("metrics_area").getString("text");
Integer count = TipsUtils.getHotCount(hotCountString.substring(0, hotCountString.indexOf("领域热度")));
String childUrl = jsonObject.getJSONObject("link").getString("url");
HotSearchList hotSearchList = new HotSearchList(childUrl, name, count, rank, HotSearchType.知乎热搜.name() + typeName + "分类");
HotSearchList hotSearchList = new HotSearchList(childUrl, name, count, rank, HotSearchType.知乎热搜.name() + typeName + "分类",date);
list.add(hotSearchList);
}
}
......
......@@ -2,6 +2,7 @@ package com.zhiwei.searchhotcrawler.crawler;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
......@@ -37,42 +38,42 @@ public class ZhihuHotSearchCrawler {
* @Description: 知乎热搜采集程序
* @return void 返回类型
*/
public static List<HotSearchList> getZhihuHotList(){
List<HotSearchList> list = null;
String url = "https://www.zhihu.com/api/v4/search/top_search";
String rerferer = "https://www.zhihu.com/search?type=content&q=%E5%BF%AB%E6%89%8B";
Map<String,String> headerMap = HeaderTool.getCommonHead();
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");
headerMap.put("Host", "www.zhihu.com");
headerMap.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8=");
headerMap.put("accept", "application/json, text/plain, */*");
headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
headerMap.put("Referer", rerferer);
Request request = RequestUtils.wrapGet(url, headerMap);
String htmlBody = null;
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
}catch (IOException e) {
log.debug("获取知乎热搜时出现问题:{}", e);
}
if (htmlBody != null && htmlBody.contains("words")) {
list = new ArrayList<>();
JSONObject topSearch = JSONObject.parseObject(htmlBody);
JSONArray words = topSearch.getJSONObject("top_search").getJSONArray("words");
String link = null;
String displayQuery = null;
String query = null;
for (int i = 0; i < words.size(); i++) {
JSONObject word = words.getJSONObject(i);
query = word.getString("query");
displayQuery = word.getString("display_query");
link = "https://www.zhihu.com/search?q=" + URLCodeUtil.getURLEncode(query, "utf-8") + "&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content";
HotSearchList zhihu = new HotSearchList(link, displayQuery, null, i, HotSearchType.知乎热搜.name());
list.add(zhihu);
}
}
return list;
}
// public static List<HotSearchList> getZhihuHotList(){
// List<HotSearchList> list = null;
// String url = "https://www.zhihu.com/api/v4/search/top_search";
// String rerferer = "https://www.zhihu.com/search?type=content&q=%E5%BF%AB%E6%89%8B";
// Map<String,String> headerMap = HeaderTool.getCommonHead();
// headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");
// headerMap.put("Host", "www.zhihu.com");
// headerMap.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8=");
// headerMap.put("accept", "application/json, text/plain, */*");
// headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
// headerMap.put("Referer", rerferer);
// Request request = RequestUtils.wrapGet(url, headerMap);
// String htmlBody = null;
// try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
// htmlBody = response.body().string();
// }catch (IOException e) {
// log.debug("获取知乎热搜时出现问题:{}", e);
// }
// if (htmlBody != null && htmlBody.contains("words")) {
// list = new ArrayList<>();
// JSONObject topSearch = JSONObject.parseObject(htmlBody);
// JSONArray words = topSearch.getJSONObject("top_search").getJSONArray("words");
// String link = null;
// String displayQuery = null;
// String query = null;
// for (int i = 0; i < words.size(); i++) {
// JSONObject word = words.getJSONObject(i);
// query = word.getString("query");
// displayQuery = word.getString("display_query");
// link = "https://www.zhihu.com/search?q=" + URLCodeUtil.getURLEncode(query, "utf-8") + "&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content";
// HotSearchList zhihu = new HotSearchList(link, displayQuery, null, i, HotSearchType.知乎热搜.name());
// list.add(zhihu);
// }
// }
// return list;
// }
......@@ -83,7 +84,7 @@ public class ZhihuHotSearchCrawler {
* @param @return 设定文件
* @return List<HotSearchList> 返回类型
*/
public static List<HotSearchList> getMobileZhihuHotList(){
public static List<HotSearchList> getMobileZhihuHotList(Date date){
List<HotSearchList> list = new ArrayList<>();
String url = "https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0";
Map<String,String> headerMap = HeaderTool.getCommonHead();
......@@ -127,7 +128,7 @@ public class ZhihuHotSearchCrawler {
} catch (Exception e) {
e.printStackTrace();
}
HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i + 1, HotSearchType.知乎热搜.name());
HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i + 1, HotSearchType.知乎热搜.name(),date);
list.add(zhihu);
}
}
......
......@@ -9,6 +9,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.Data;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
......@@ -19,16 +20,13 @@ import org.w3c.dom.Element;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.*;
@Log4j2
public class ZhihuTopicSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
public static List<HotSearchList> getZhihuTopicSearch(){
public static List<HotSearchList> getZhihuTopicSearch(Date date){
List<HotSearchList> list = new ArrayList<>();
String url = "https://www.zhihu.com/topsearch";
JSONObject jsonObject = null;
......@@ -52,7 +50,7 @@ public class ZhihuTopicSearchCrawler {
String name = data.getString("queryDisplay");
String realQuery = data.getString("realQuery");
String zhihuUrl = "https://www.zhihu.com/search?q=" + realQuery + "&utm_content=search_hot&type=content";
HotSearchList hotSearchList = new HotSearchList(zhihuUrl, name, null, rank, HotSearchType.知乎热搜榜单.name());
HotSearchList hotSearchList = new HotSearchList(zhihuUrl, name, null, rank, HotSearchType.知乎热搜榜单.name(),date);
list.add(hotSearchList);
}
return list;
......
......@@ -40,7 +40,10 @@ public class HotSearchCacheDAO {
document.put("rank", hotSearch.getRank());
document.put("type", hotSearch.getType());
document.put("icon", hotSearch.getIcon());
if("微博话题".equals(hotSearch.getType())){
document.put("topic_lead", hotSearch.getTopicLead());
document.put("comment_count", hotSearch.getCommentCount());
}
addAndUpdateData(document);
dataes.add(document);
});
......@@ -99,7 +102,7 @@ public class HotSearchCacheDAO {
//计算热搜时长
int duration = nowDoc.getInteger("duration");
int durationNow = getDuration(type, duration);
endTime = getEndTime(type, new Date());
// endTime = getEndTime(type, new Date());
//更新相应信息
nowDoc.put("endTime", endTime);
nowDoc.put("lastRank", lastRank);
......
......@@ -6,6 +6,8 @@ import com.zhiwei.searchhotcrawler.cache.CacheListener;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.timer.*;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
......@@ -16,6 +18,7 @@ public class HotSearchRun {
public static void main(String[] args) {
ApplicationContext context = new ClassPathXmlApplicationContext("applicationContext.xml");
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
ProxyFactory.init(simpleConfig);
......@@ -43,18 +46,18 @@ public class HotSearchRun {
//采集程序启动
new WeiboHotSearchRun().start();
new BaiduHotSearchRun().start();
// new SougoHotSearchRun().start();
new DouyinHotSearchRun().start();
// new ZhihuHotSearchRun().start();
new WeiboSuperTopicRun().start();
new WeiboTopicRun().start();
// new ToutiaoHotSearchRun().start();
// new ZhihuTopSearchRun().start();
new ZhihuChildHotSearchRun().start();
new ThreadOneRun().start();
// //抖音链接更新
new DouYinUrlHotSearchRun().start();
// new WeiboHotSearchRun().start();
// new BaiduHotSearchRun().start();
//// new SougoHotSearchRun().start();
// new DouyinHotSearchRun().start();
//// new ZhihuHotSearchRun().start();
// new WeiboSuperTopicRun().start();
// new WeiboTopicRun().start();
//// new ToutiaoHotSearchRun().start();
//// new ZhihuTopSearchRun().start();
// new ZhihuChildHotSearchRun().start();
// new ThreadOneRun().start();
//// //抖音链接更新
// new DouYinUrlHotSearchRun().start();
}
}
......@@ -42,32 +42,32 @@ public class BaiduHotSearchRun extends Thread{
private void getHotList() {
log.info("百度风云榜采集开始........");
// HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
List<HotSearchList> baiduList = BaiDuHotSearchCrawler.baiduHotSearch();
log.info("{}, 此轮百度风云榜采集到的数据量为:{}", new Date(), Integer.valueOf(baiduList != null ? baiduList.size() : 0));
// if(Objects.nonNull(list) && !list.isEmpty()) {
// List<Document> data = hotSearchCacheDAO.addData(list);
// hotSearchDAO.addHotSearchList(data);
// TipsUtils.recoveryTips("百度热搜",new Date());
// } else {
// TipsUtils.sendTips("百度热搜",new Date());
// }
TipsUtils.addHotList("百度热搜",baiduList);
log.info("百度风云榜采集结束........");
ZhiWeiTools.sleep(2000L);
log.info("搜狗微信采集开始........");
List<HotSearchList> sougouList = SougoHotSearchCrawler.sougoHotSearch();
log.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(sougouList != null ? sougouList.size() : 0));
TipsUtils.addHotList("搜狗微信热搜",sougouList);
log.info("搜狗微信采集结束........");
ZhiWeiTools.sleep(2000L);
log.info("知乎话题采集开始........");
List<HotSearchList> zhihuList = ZhihuHotSearchCrawler.getMobileZhihuHotList();
log.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(zhihuList != null ? zhihuList.size() : 0));
TipsUtils.addHotList("知乎热搜",zhihuList);
log.info("知乎话题采集结束........");
// log.info("百度风云榜采集开始........");
//// HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
//// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// List<HotSearchList> baiduList = BaiDuHotSearchCrawler.baiduHotSearch();
// log.info("{}, 此轮百度风云榜采集到的数据量为:{}", new Date(), Integer.valueOf(baiduList != null ? baiduList.size() : 0));
//// if(Objects.nonNull(list) && !list.isEmpty()) {
//// List<Document> data = hotSearchCacheDAO.addData(list);
//// hotSearchDAO.addHotSearchList(data);
//// TipsUtils.recoveryTips("百度热搜",new Date());
//// } else {
//// TipsUtils.sendTips("百度热搜",new Date());
//// }
// TipsUtils.addHotList("百度热搜",baiduList);
// log.info("百度风云榜采集结束........");
// ZhiWeiTools.sleep(2000L);
// log.info("搜狗微信采集开始........");
// List<HotSearchList> sougouList = SougoHotSearchCrawler.sougoHotSearch();
// log.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(sougouList != null ? sougouList.size() : 0));
// TipsUtils.addHotList("搜狗微信热搜",sougouList);
// log.info("搜狗微信采集结束........");
// ZhiWeiTools.sleep(2000L);
// log.info("知乎话题采集开始........");
// List<HotSearchList> zhihuList = ZhihuHotSearchCrawler.getMobileZhihuHotList();
// log.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(zhihuList != null ? zhihuList.size() : 0));
// TipsUtils.addHotList("知乎热搜",zhihuList);
// log.info("知乎话题采集结束........");
}
}
\ No newline at end of file
......@@ -38,24 +38,24 @@ public class DouYinUrlHotSearchRun extends Thread {
* @return void
*/
private void getUrlList() {
log.info("抖音链接更新开始........");
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
List<HotSearchList> list = DouyinHotSearchRun.list;
if(list != null && list.size()>0) {
for (int i = 0; i < list.size(); i++) {
String name = list.get(i).getName();
String id = name+"_"+list.get(i).getType();
String url = DouyinHotSearchCrawler.getDouyinUrl("https://aweme-hl.snssdk.com/aweme/v1/hot/search/video/list/?hotword="+name);
if(url != null) {
Document document = new Document();
document.put("id", id);
document.put("url", url);
hotSearchCacheDAO.updateDouyinUrl(document);
}
}
log.info("抖音链接更新结束........");
}else{
log.info("抖音链接更新失败,获取抖音数据为空");
}
// log.info("抖音链接更新开始........");
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// List<HotSearchList> list = DouyinHotSearchRun.list;
// if(list != null && list.size()>0) {
// for (int i = 0; i < list.size(); i++) {
// String name = list.get(i).getName();
// String id = name+"_"+list.get(i).getType();
// String url = DouyinHotSearchCrawler.getDouyinUrl("https://aweme-hl.snssdk.com/aweme/v1/hot/search/video/list/?hotword="+name);
// if(url != null) {
// Document document = new Document();
// document.put("id", id);
// document.put("url", url);
// hotSearchCacheDAO.updateDouyinUrl(document);
// }
// }
// log.info("抖音链接更新结束........");
// }else{
// log.info("抖音链接更新失败,获取抖音数据为空");
// }
}
}
......@@ -48,23 +48,23 @@ public class DouyinHotSearchRun extends Thread{
* @return void
*/
private void getHotList() {
log.info("抖音热搜榜采集开始........");
list = DouyinHotSearchCrawler.getMobileDouyinHotList();
log.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
TipsUtils.addHotList("抖音热搜",list);
log.info("抖音热搜榜采集结束........");
ZhiWeiTools.sleep(3000L);
log.info("今日头条热搜采集开始........");
List<HotSearchList> toutiaoList = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone();
log.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), Integer.valueOf(toutiaoList != null ? toutiaoList.size() : 0));
TipsUtils.addHotList(HotSearchType.今日头条热搜.name(),toutiaoList);
log.info("今日头条热搜采集结束........");
ZhiWeiTools.sleep(3000L);
log.info("知乎热搜榜单采集开始...");
List<HotSearchList> zhihuList = ZhihuTopicSearchCrawler.getZhihuTopicSearch();
log.info("{}, 知乎热搜榜单此轮采集到的数据量为:{}", new Date(), Integer.valueOf(zhihuList != null ? zhihuList.size() : 0));
TipsUtils.addHotList(HotSearchType.知乎热搜榜单.name(),zhihuList);
log.info("知乎热搜榜单采集结束........");
// log.info("抖音热搜榜采集开始........");
// list = DouyinHotSearchCrawler.getMobileDouyinHotList();
// log.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
// TipsUtils.addHotList("抖音热搜",list);
// log.info("抖音热搜榜采集结束........");
// ZhiWeiTools.sleep(3000L);
// log.info("今日头条热搜采集开始........");
// List<HotSearchList> toutiaoList = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone();
// log.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), Integer.valueOf(toutiaoList != null ? toutiaoList.size() : 0));
// TipsUtils.addHotList(HotSearchType.今日头条热搜.name(),toutiaoList);
// log.info("今日头条热搜采集结束........");
// ZhiWeiTools.sleep(3000L);
// log.info("知乎热搜榜单采集开始...");
// List<HotSearchList> zhihuList = ZhihuTopicSearchCrawler.getZhihuTopicSearch();
// log.info("{}, 知乎热搜榜单此轮采集到的数据量为:{}", new Date(), Integer.valueOf(zhihuList != null ? zhihuList.size() : 0));
// TipsUtils.addHotList(HotSearchType.知乎热搜榜单.name(),zhihuList);
// log.info("知乎热搜榜单采集结束........");
}
}
......@@ -39,19 +39,19 @@ public class SougoHotSearchRun extends Thread {
private void getHotList() {
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
log.info("搜狗微信采集开始........");
List<HotSearchList> list = SougoHotSearchCrawler.sougoHotSearch();
log.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if(list == null || list.size() == 0){
TipsUtils.sendTips("搜狗微信热搜",new Date());
}else {
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
TipsUtils.recoveryTips("搜狗微信热搜",new Date());
}
log.info("搜狗微信采集结束........");
// HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// log.info("搜狗微信采集开始........");
// List<HotSearchList> list = SougoHotSearchCrawler.sougoHotSearch();
// log.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
// if(list == null || list.size() == 0){
// TipsUtils.sendTips("搜狗微信热搜",new Date());
// }else {
// List<Document> data = hotSearchCacheDAO.addData(list);
// hotSearchDAO.addHotSearchList(data);
// TipsUtils.recoveryTips("搜狗微信热搜",new Date());
// }
// log.info("搜狗微信采集结束........");
}
}
......@@ -31,22 +31,22 @@ public class ThreadOneRun extends Thread {
}
private void getHotList(){
List<HotSearchList> tengXunlist = TengXunCrawler.getTengXunHotList();
TipsUtils.addHotList("腾讯新闻",tengXunlist);
ZhiWeiTools.sleep(1500L);
List<HotSearchList> xinLanglist = XinLangHotSearchCrawler.getXinLangHotSearch();
TipsUtils.addHotList("新浪热榜",xinLanglist);
ZhiWeiTools.sleep(1500L);
List<HotSearchList> souhuList = SouhuTopicCrawler.getSouhuTopic();
TipsUtils.addHotList("搜狐话题",souhuList);
ZhiWeiTools.sleep(1500L);
List<HotSearchList> xinLangHotList = XinLangHotSearchCrawler.getXinLangHotSpot();
TipsUtils.addHotList("新浪热点",xinLangHotList);
ZhiWeiTools.sleep(1500L);
List<HotSearchList> fengHuangHotList = FengHuangSearchCrawler.getFengHuangHotList();
TipsUtils.addHotList("凤凰新闻热榜",fengHuangHotList);
ZhiWeiTools.sleep(1500L);
List<HotSearchList> fengHuangHotDataList = FengHuangSearchCrawler.getFengHuangHotData();
TipsUtils.addHotList("凤凰新闻热搜",fengHuangHotDataList);
// List<HotSearchList> tengXunlist = TengXunCrawler.getTengXunHotList();
// TipsUtils.addHotList("腾讯新闻",tengXunlist);
// ZhiWeiTools.sleep(1500L);
// List<HotSearchList> xinLanglist = XinLangHotSearchCrawler.getXinLangHotSearch();
// TipsUtils.addHotList("新浪热榜",xinLanglist);
// ZhiWeiTools.sleep(1500L);
// List<HotSearchList> souhuList = SouhuTopicCrawler.getSouhuTopic();
// TipsUtils.addHotList("搜狐话题",souhuList);
// ZhiWeiTools.sleep(1500L);
// List<HotSearchList> xinLangHotList = XinLangHotSearchCrawler.getXinLangHotSpot();
// TipsUtils.addHotList("新浪热点",xinLangHotList);
// ZhiWeiTools.sleep(1500L);
// List<HotSearchList> fengHuangHotList = FengHuangSearchCrawler.getFengHuangHotList();
// TipsUtils.addHotList("凤凰新闻热榜",fengHuangHotList);
// ZhiWeiTools.sleep(1500L);
// List<HotSearchList> fengHuangHotDataList = FengHuangSearchCrawler.getFengHuangHotData();
// TipsUtils.addHotList("凤凰新闻热搜",fengHuangHotDataList);
}
}
......@@ -35,19 +35,19 @@ public class ToutiaoHotSearchRun extends Thread{
private void getHotList() {
log.info("今日头条热搜采集开始........");
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
List<HotSearchList> list = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone();
log.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if(list == null || list.size() == 0){
TipsUtils.sendTips("今日头条热搜",new Date());
}else {
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
TipsUtils.recoveryTips("今日头条热搜",new Date());
}
log.info("今日头条热搜采集结束........");
// log.info("今日头条热搜采集开始........");
// HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// List<HotSearchList> list = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone();
// log.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
// if(list == null || list.size() == 0){
// TipsUtils.sendTips("今日头条热搜",new Date());
// }else {
// List<Document> data = hotSearchCacheDAO.addData(list);
// hotSearchDAO.addHotSearchList(data);
// TipsUtils.recoveryTips("今日头条热搜",new Date());
// }
// log.info("今日头条热搜采集结束........");
}
}
......@@ -34,17 +34,17 @@ public class WeiboHotSearchRun extends Thread{
private void getHotList() {
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearchByPhone();
log.info("{}, 微博此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if(list == null || list.size() == 0){
TipsUtils.sendTips("微博热搜",new Date());
}else {
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
TipsUtils.recoveryTips("微博热搜",new Date());
}
// HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearchByPhone();
// log.info("{}, 微博此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
// if(list == null || list.size() == 0){
// TipsUtils.sendTips("微博热搜",new Date());
// }else {
// List<Document> data = hotSearchCacheDAO.addData(list);
// hotSearchDAO.addHotSearchList(data);
// TipsUtils.recoveryTips("微博热搜",new Date());
// }
}
}
......@@ -32,28 +32,28 @@ public class WeiboSuperTopicRun extends Thread{
private void getTopicList() {
WeiboSuperTopicDAO weiboTopicDAO = new WeiboSuperTopicDAO();
log.info("微博超话采集开始........");
List<WeiboSuperTopic> list = WeiboSuperTopicCrawler.startCrawler();
log.info("{}, 微博超话此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<Document> data = new ArrayList<>();
for(WeiboSuperTopic topic : list){
log.info("topic::::{}", topic);
Document doc = new Document();
doc.put("_id", topic.getId());
doc.put("name", topic.getTopicName());
doc.put("rank", topic.getRank());
doc.put("score_num", topic.getScore());
doc.put("fensi_num", topic.getFensi());
doc.put("post_num", topic.getPostNum());
doc.put("type", topic.getType());
doc.put("day", topic.getDay());
doc.put("time", topic.getTime());
doc.put("url", topic.getUrl());
data.add(doc);
}
weiboTopicDAO.addTopicList(data);
log.info("微博话题采集结束........");
// WeiboSuperTopicDAO weiboTopicDAO = new WeiboSuperTopicDAO();
// log.info("微博超话采集开始........");
// List<WeiboSuperTopic> list = WeiboSuperTopicCrawler.startCrawler();
// log.info("{}, 微博超话此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
// List<Document> data = new ArrayList<>();
// for(WeiboSuperTopic topic : list){
// log.info("topic::::{}", topic);
// Document doc = new Document();
// doc.put("_id", topic.getId());
// doc.put("name", topic.getTopicName());
// doc.put("rank", topic.getRank());
// doc.put("score_num", topic.getScore());
// doc.put("fensi_num", topic.getFensi());
// doc.put("post_num", topic.getPostNum());
// doc.put("type", topic.getType());
// doc.put("day", topic.getDay());
// doc.put("time", topic.getTime());
// doc.put("url", topic.getUrl());
// data.add(doc);
// }
// weiboTopicDAO.addTopicList(data);
// log.info("微博话题采集结束........");
}
}
......@@ -33,35 +33,35 @@ public class WeiboTopicRun extends Thread{
private void getTopicList() {
HotSearchListDAO weiboHotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
log.info("微博话题采集开始........");
List<HotSearchList> list = WeiboTopicCrawler.startCrawlerByPhone();
log.info("{}, 微博话题此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if(list == null || list.size() == 0){
TipsUtils.sendTips("微博话题",new Date());
}else{
TipsUtils.recoveryTips("微博话题",new Date());
}
List<Document> data = new ArrayList<>();
for(HotSearchList topic : list){
Document doc = new Document();
doc.put("_id", topic.getId());
doc.put("name", topic.getName());
doc.put("url", topic.getUrl());
doc.put("count", topic.getCount());
doc.put("hot", topic.getHot());
doc.put("day", topic.getDay());
doc.put("time", topic.getTime());
doc.put("rank", topic.getRank());
doc.put("type", topic.getType());
doc.put("topic_lead", topic.getTopicLead());
doc.put("comment_count", topic.getCommentCount());
data.add(doc);
hotSearchCacheDAO.addAndUpdateData(doc);
}
weiboHotSearchDAO.addHotSearchList(data);
log.info("微博话题采集结束........");
// HotSearchListDAO weiboHotSearchDAO = new HotSearchListDAO();
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// log.info("微博话题采集开始........");
// List<HotSearchList> list = WeiboTopicCrawler.startCrawlerByPhone();
// log.info("{}, 微博话题此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
// if(list == null || list.size() == 0){
// TipsUtils.sendTips("微博话题",new Date());
// }else{
// TipsUtils.recoveryTips("微博话题",new Date());
// }
// List<Document> data = new ArrayList<>();
// for(HotSearchList topic : list){
// Document doc = new Document();
// doc.put("_id", topic.getId());
// doc.put("name", topic.getName());
// doc.put("url", topic.getUrl());
// doc.put("count", topic.getCount());
// doc.put("hot", topic.getHot());
// doc.put("day", topic.getDay());
// doc.put("time", topic.getTime());
// doc.put("rank", topic.getRank());
// doc.put("type", topic.getType());
// doc.put("topic_lead", topic.getTopicLead());
// doc.put("comment_count", topic.getCommentCount());
// data.add(doc);
// hotSearchCacheDAO.addAndUpdateData(doc);
// }
// weiboHotSearchDAO.addHotSearchList(data);
// log.info("微博话题采集结束........");
}
}
......@@ -35,50 +35,50 @@ public class ZhihuChildHotSearchRun extends Thread {
}
private void getHotList() {
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
for (int i = 0; i < childType.size(); i++) {
String name = this.getTypeName(childType.get(i));
if (!"".equals(name)) {
log.info("知乎{}话题热榜采集开始...", name);
List<HotSearchList> list = ZhihuChildHotSearchCrawler.getZhihuTopicSearch(childType.get(i), name);
log.info("{}, 知乎{}话题此轮采集到的数据量为:{}", new Date(),name, Integer.valueOf(list != null ? list.size() : 0));
if (list == null || list.size() == 0) {
TipsUtils.sendTips("知乎热搜"+name+"分类", new Date());
}else {
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
TipsUtils.recoveryTips("知乎热搜"+name+"分类",new Date());
}
log.info("知乎{}话题热榜采集结束...", name);
ZhiWeiTools.sleep(3000);
}
}
//网易实时热榜采集
ZhiWeiTools.sleep(3000L);
List<HotSearchList> wangyiHotSearchList = WangYiHotSearchCrawler.getWangYiHotSearch();
TipsUtils.addHotList("网易热榜",wangyiHotSearchList);
//网易跟帖热议采集
ZhiWeiTools.sleep(3000L);
List<HotSearchList> wangyiComment = WangYiHotSearchCrawler.getWangYicomment();
TipsUtils.addHotList("网易跟帖热议",wangyiComment);
// HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// for (int i = 0; i < childType.size(); i++) {
// String name = this.getTypeName(childType.get(i));
// if (!"".equals(name)) {
// log.info("知乎{}话题热榜采集开始...", name);
// List<HotSearchList> list = ZhihuChildHotSearchCrawler.getZhihuTopicSearch(childType.get(i), name);
// log.info("{}, 知乎{}话题此轮采集到的数据量为:{}", new Date(),name, Integer.valueOf(list != null ? list.size() : 0));
// if (list == null || list.size() == 0) {
// TipsUtils.sendTips("知乎热搜"+name+"分类", new Date());
// }else {
// List<Document> data = hotSearchCacheDAO.addData(list);
// hotSearchDAO.addHotSearchList(data);
// TipsUtils.recoveryTips("知乎热搜"+name+"分类",new Date());
// }
// log.info("知乎{}话题热榜采集结束...", name);
// ZhiWeiTools.sleep(3000);
// }
// }
// //网易实时热榜采集
// ZhiWeiTools.sleep(3000L);
// List<HotSearchList> wangyiHotSearchList = WangYiHotSearchCrawler.getWangYiHotSearch();
// TipsUtils.addHotList("网易热榜",wangyiHotSearchList);
// //网易跟帖热议采集
// ZhiWeiTools.sleep(3000L);
// List<HotSearchList> wangyiComment = WangYiHotSearchCrawler.getWangYicomment();
// TipsUtils.addHotList("网易跟帖热议",wangyiComment);
}
private String getTypeName(String type){
String name;
switch (type) {
case "digital":
name = "数码";
break;
case "focus":
name = "国际";
break;
case "depth":
name = "时事";
break;
default:
name = "";
}
return name;
}
// private String getTypeName(String type){
// String name;
// switch (type) {
// case "digital":
// name = "数码";
// break;
// case "focus":
// name = "国际";
// break;
// case "depth":
// name = "时事";
// break;
// default:
// name = "";
// }
// return name;
// }
}
......@@ -39,20 +39,20 @@ public class ZhihuHotSearchRun extends Thread{
private void getHotList() {
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
log.info("知乎话题采集开始...,当前线程名字:{}", Thread.currentThread().getName());
// List<HotSearchList> list = ZhihuHotSearchCrawler.getZhihuHotList();
List<HotSearchList> list = ZhihuHotSearchCrawler.getMobileZhihuHotList();
log.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if(list == null || list.size() == 0){
TipsUtils.sendTips("知乎热搜",new Date());
}else {
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
TipsUtils.recoveryTips("知乎热搜",new Date());
}
log.info("知乎话题采集结束........");
// HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// log.info("知乎话题采集开始...,当前线程名字:{}", Thread.currentThread().getName());
//// List<HotSearchList> list = ZhihuHotSearchCrawler.getZhihuHotList();
// List<HotSearchList> list = ZhihuHotSearchCrawler.getMobileZhihuHotList();
// log.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
// if(list == null || list.size() == 0){
// TipsUtils.sendTips("知乎热搜",new Date());
// }else {
// List<Document> data = hotSearchCacheDAO.addData(list);
// hotSearchDAO.addHotSearchList(data);
// TipsUtils.recoveryTips("知乎热搜",new Date());
// }
// log.info("知乎话题采集结束........");
}
}
......@@ -32,18 +32,18 @@ public class ZhihuTopSearchRun extends Thread {
}
public void getHotList(){
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
log.info("知乎热搜采集开始...,当前线程名字:{}", Thread.currentThread().getName());
List<HotSearchList> list = ZhihuTopicSearchCrawler.getZhihuTopicSearch();
log.info("{}, 知乎热搜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if(list == null || list.size() == 0){
TipsUtils.sendTips("知乎热搜榜单",new Date());
}else {
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
TipsUtils.recoveryTips("知乎热搜榜单",new Date());
}
log.info("知乎热搜话题采集结束........");
// HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// log.info("知乎热搜采集开始...,当前线程名字:{}", Thread.currentThread().getName());
// List<HotSearchList> list = ZhihuTopicSearchCrawler.getZhihuTopicSearch();
// log.info("{}, 知乎热搜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
// if(list == null || list.size() == 0){
// TipsUtils.sendTips("知乎热搜榜单",new Date());
// }else {
// List<Document> data = hotSearchCacheDAO.addData(list);
// hotSearchDAO.addHotSearchList(data);
// TipsUtils.recoveryTips("知乎热搜榜单",new Date());
// }
// log.info("知乎热搜话题采集结束........");
}
}
package com.zhiwei.searchhotcrawler.timer.quartz;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
import com.zhiwei.searchhotcrawler.crawler.*;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO;
import com.zhiwei.searchhotcrawler.util.DateUtils;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import org.bson.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.scheduling.annotation.Async;
import org.springframework.scheduling.annotation.EnableAsync;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
@Component
@EnableScheduling
@EnableAsync
public class GatherTimer {
/** Logger shared by every collection task in this class. */
private static final Logger logger = LoggerFactory.getLogger(GatherTimer.class);
/** Zhihu sub-category key for the digital list. */
private static final String DIGITAL = "digital";
/** Zhihu sub-category key for the international list. */
private static final String FOCUS = "focus";
/** Zhihu sub-category key for the current-affairs list. */
private static final String DEPTH = "depth";
/**
 * Collects the Weibo hot-search list.
 *
 * <p>The cron expression fires at second 0 of every minute; the call is
 * dispatched through the executor qualified as {@code myScheduler}. The
 * crawl result is passed to {@code TipsUtils.addHotList} under the
 * {@code 微博热搜} type name.
 */
@Async("myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeiBo() {
    logger.info("微博热搜开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = WeiboHotSearchCrawler.weiboHotSearchByPhone(crawlTime);
    // A null crawl result is reported as zero items.
    final int itemCount = (hotItems == null) ? 0 : hotItems.size();
    logger.info("{}, 微博此轮采集到的数据量为:{}", new Date(), itemCount);
    TipsUtils.addHotList(HotSearchType.微博热搜.name(), hotItems);
    logger.info("微博热搜采集结束...");
}
/**
 * Collects the Toutiao hot-search list.
 *
 * <p>Scheduled for second 0 of every minute and run on the
 * {@code myScheduler} executor; the result is forwarded to
 * {@code TipsUtils.addHotList} under the {@code 今日头条热搜} type name.
 */
@Async("myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerTouTiao() {
    logger.info("今日头条热搜开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone(crawlTime);
    // A null crawl result is reported as zero items.
    int itemCount = 0;
    if (hotItems != null) {
        itemCount = hotItems.size();
    }
    logger.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), itemCount);
    TipsUtils.addHotList(HotSearchType.今日头条热搜.name(), hotItems);
    logger.info("今日头条热搜采集结束...");
}
/**
 * Collects the Baidu hot-search list.
 *
 * <p>Scheduled for second 0 of every minute and run on the
 * {@code myScheduler} executor; the result is forwarded to
 * {@code TipsUtils.addHotList} under the {@code 百度热搜} type name.
 */
@Async("myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerBaiDu() {
    logger.info("百度热搜开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = BaiDuHotSearchCrawler.baiduHotSearch(crawlTime);
    // A null crawl result is reported as zero items.
    final int itemCount = (hotItems == null) ? 0 : hotItems.size();
    logger.info("{}, 百度热搜此轮采集到的数据量为:{}", new Date(), itemCount);
    TipsUtils.addHotList(HotSearchType.百度热搜.name(), hotItems);
    logger.info("百度热搜采集结束...");
}
/**
 * Collects the Douyin hot-search list.
 *
 * <p>Scheduled for second 0 of every minute and run on the
 * {@code myScheduler} executor; the result is forwarded to
 * {@code TipsUtils.addHotList} under the {@code 抖音热搜} type name.
 */
@Async("myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerDouYin() {
    logger.info("抖音热搜开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = DouyinHotSearchCrawler.getMobileDouyinHotList(crawlTime);
    // A null crawl result is reported as zero items.
    int itemCount = 0;
    if (hotItems != null) {
        itemCount = hotItems.size();
    }
    logger.info("{}, 抖音热搜此轮采集到的数据量为:{}", new Date(), itemCount);
    TipsUtils.addHotList(HotSearchType.抖音热搜.name(), hotItems);
    logger.info("抖音热搜采集结束...");
}
/**
 * Refreshes the stored URL of every entry in the static
 * {@code DouyinHotSearchCrawler.list}.
 *
 * <p>Scheduled for second 0 of every fifth minute and run on the
 * {@code myScheduler} executor. For each entry a URL is looked up via
 * {@code DouyinHotSearchCrawler.getDouyinUrl}; when one is returned, a
 * document holding {@code id} (name + "_" + type) and {@code url} is passed
 * to {@code HotSearchCacheDAO.updateDouyinUrl}.
 */
@Async("myScheduler")
@Scheduled(cron = "0 0/5 * * * ? ")
public void updateDouYinUrl() {
    logger.info("抖音链接更新开始...");
    final HotSearchCacheDAO cacheDao = new HotSearchCacheDAO();
    final List<HotSearchList> entries = DouyinHotSearchCrawler.list;

    // Nothing to refresh when the shared list is missing or empty.
    if (entries == null || entries.isEmpty()) {
        logger.info("抖音链接更新失败,抖音热搜列表获取为空。");
        return;
    }

    for (int idx = 0; idx < entries.size(); idx++) {
        final HotSearchList entry = entries.get(idx);
        final String keyword = entry.getName();
        final String cacheId = keyword + "_" + entry.getType();
        // NOTE(review): the keyword is appended without URL-encoding — confirm
        // getDouyinUrl encodes the query itself.
        final String videoUrl = DouyinHotSearchCrawler.getDouyinUrl(
                "https://aweme-hl.snssdk.com/aweme/v1/hot/search/video/list/?hotword=" + keyword);
        if (videoUrl == null) {
            // No URL resolved for this keyword; leave the cached record alone.
            continue;
        }
        final Document update = new Document();
        update.put("id", cacheId);
        update.put("url", videoUrl);
        cacheDao.updateDouyinUrl(update);
    }
    logger.info("抖音链接更新结束");
}
/**
 * Collects the Zhihu hot list.
 *
 * <p>Scheduled for second 0 of every minute and run on the
 * {@code myScheduler} executor; the result is forwarded to
 * {@code TipsUtils.addHotList} under the {@code 知乎热搜} type name.
 */
@Async("myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerZhihu() {
    logger.info("知乎热搜开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = ZhihuHotSearchCrawler.getMobileZhihuHotList(crawlTime);
    // A null crawl result is reported as zero items.
    final int itemCount = (hotItems == null) ? 0 : hotItems.size();
    logger.info("{}, 知乎热搜此轮采集到的数据量为:{}", new Date(), itemCount);
    TipsUtils.addHotList(HotSearchType.知乎热搜.name(), hotItems);
    logger.info("知乎热搜采集结束...");
}
/**
 * Collects the Sogou WeChat hot-topic list.
 *
 * <p>Scheduled for second 0 of every minute and run on the
 * {@code myScheduler} executor; the result is forwarded to
 * {@code TipsUtils.addHotList} under the {@code 搜狗微信热搜} type name.
 */
@Async("myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeChat() {
    logger.info("搜狗微信热搜开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = SougoHotSearchCrawler.sougoHotSearch(crawlTime);
    // A null crawl result is reported as zero items.
    int itemCount = 0;
    if (hotItems != null) {
        itemCount = hotItems.size();
    }
    logger.info("{}, 搜狗微信热搜此轮采集到的数据量为:{}", new Date(), itemCount);
    TipsUtils.addHotList(HotSearchType.搜狗微信热搜.name(), hotItems);
    logger.info("搜狗微信热搜采集结束...");
}
/**
 * Collects the Weibo topic list.
 *
 * <p>Scheduled for second 0 of every minute and run on the
 * {@code myScheduler} executor; the result is forwarded to
 * {@code TipsUtils.addHotList} under the {@code 微博话题} type name.
 */
@Async("myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeiBoTopic() {
    logger.info("微博话题开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> topics = WeiboTopicCrawler.startCrawlerByPhone(crawlTime);
    // A null crawl result is reported as zero items.
    final int topicCount = (topics == null) ? 0 : topics.size();
    logger.info("{}, 微博话题此轮采集到的数据量为:{}", new Date(), topicCount);
    TipsUtils.addHotList(HotSearchType.微博话题.name(), topics);
    logger.info("微博话题采集结束...");
}
/**
 * Collects the Tencent News hot list.
 *
 * <p>Scheduled for second 10 of every minute and run on the
 * {@code myScheduler} executor; the result is forwarded to
 * {@code TipsUtils.addHotList} under the {@code 腾讯新闻} type name.
 */
@Async("myScheduler")
@Scheduled(cron = "10 * * * * ? ")
public void crawlerTengXun() {
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = TengXunCrawler.getTengXunHotList(crawlTime);
    TipsUtils.addHotList(HotSearchType.腾讯新闻.name(), hotItems);
}
/**
 * Collects the Sina hot-spot list.
 *
 * <p>Scheduled for second 10 of every minute and run on the
 * {@code myScheduler} executor; the result is forwarded to
 * {@code TipsUtils.addHotList} under the {@code 新浪热点} type name.
 */
@Async("myScheduler")
@Scheduled(cron = "10 * * * * ? ")
public void crawlerXinLangHotSpot() {
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotSpots = XinLangHotSearchCrawler.getXinLangHotSpot(crawlTime);
    TipsUtils.addHotList(HotSearchType.新浪热点.name(), hotSpots);
}
/**
 * Collects the Sina hot-ranking list.
 *
 * <p>Scheduled for second 10 of every minute and run on the
 * {@code myScheduler} executor; the result is forwarded to
 * {@code TipsUtils.addHotList} under the {@code 新浪热榜} type name.
 */
@Async("myScheduler")
@Scheduled(cron = "10 * * * * ? ")
public void crawlerXinLangHotSearch() {
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> ranking = XinLangHotSearchCrawler.getXinLangHotSearch(crawlTime);
    TipsUtils.addHotList(HotSearchType.新浪热榜.name(), ranking);
}
/**
 * Collects the NetEase News hot-ranking list.
 *
 * <p>Scheduled for second 10 of every minute and run on the
 * {@code myScheduler} executor; the result is forwarded to
 * {@code TipsUtils.addHotList} under the {@code 网易热榜} type name.
 */
@Async("myScheduler")
@Scheduled(cron = "10 * * * * ? ")
public void crawlerWangYiHotSearch() {
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> ranking = WangYiHotSearchCrawler.getWangYiHotSearch(crawlTime);
    TipsUtils.addHotList(HotSearchType.网易热榜.name(), ranking);
}
/**
 * Collects the NetEase News hot-comment ("跟帖热议") list.
 *
 * <p>Scheduled for second 10 of every minute and run on the
 * {@code myScheduler} executor; the result is forwarded to
 * {@code TipsUtils.addHotList} under the {@code 网易跟帖热议} type name.
 */
@Async("myScheduler")
@Scheduled(cron = "10 * * * * ? ")
public void crawlerWangYiHotComment() {
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotComments = WangYiHotSearchCrawler.getWangYicomment(crawlTime);
    TipsUtils.addHotList(HotSearchType.网易跟帖热议.name(), hotComments);
}
/**
 * Collects the Phoenix (ifeng) News hot-ranking list.
 *
 * <p>Scheduled for second 10 of every minute and run on the
 * {@code myScheduler} executor; the result is forwarded to
 * {@code TipsUtils.addHotList} under the {@code 凤凰新闻热榜} type name.
 *
 * <p>NOTE(review): the earlier thread-based code stored the result of
 * {@code getFengHuangHotData()} under "凤凰新闻热搜" rather than
 * "凤凰新闻热榜" — confirm the crawler methods were renamed and this pairing
 * is intended.
 */
@Async("myScheduler")
@Scheduled(cron = "10 * * * * ? ")
public void crawlerFengHuangHotData() {
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> ranking = FengHuangSearchCrawler.getFengHuangHotData(crawlTime);
    TipsUtils.addHotList(HotSearchType.凤凰新闻热榜.name(), ranking);
}
/**
 * Collects the Phoenix (ifeng) News hot-search list.
 *
 * <p>Scheduled for second 10 of every minute and run on the
 * {@code myScheduler} executor; the result is forwarded to
 * {@code TipsUtils.addHotList} under the {@code 凤凰新闻热搜} type name.
 */
@Async("myScheduler")
@Scheduled(cron = "10 * * * * ? ")
public void crawlerFengHuangHotSearch() {
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotSearches = FengHuangSearchCrawler.getFengHuangHotSearch(crawlTime);
    TipsUtils.addHotList(HotSearchType.凤凰新闻热搜.name(), hotSearches);
}
/**
 * Collects the Sohu topic list.
 *
 * <p>Scheduled for second 20 of every minute and run on the
 * {@code myScheduler} executor; the result is forwarded to
 * {@code TipsUtils.addHotList} under the {@code 搜狐话题} type name.
 */
@Async("myScheduler")
@Scheduled(cron = "20 * * * * ? ")
public void crawlerSouHuTopic() {
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> topics = SouhuTopicCrawler.getSouhuTopic(crawlTime);
    TipsUtils.addHotList(HotSearchType.搜狐话题.name(), topics);
}
/**
 * Collects the Zhihu top-search list.
 *
 * <p>Scheduled for second 20 of every minute and run on the
 * {@code myScheduler} executor; the result is forwarded to
 * {@code TipsUtils.addHotList} under the {@code 知乎热搜榜单} type name.
 */
@Async("myScheduler")
@Scheduled(cron = "20 * * * * ? ")
public void crawlerZhihuHotTopic() {
    logger.info("知乎热搜话题开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> topSearches = ZhihuTopicSearchCrawler.getZhihuTopicSearch(crawlTime);
    // A null crawl result is reported as zero items.
    int itemCount = 0;
    if (topSearches != null) {
        itemCount = topSearches.size();
    }
    logger.info("{}, 知乎热搜话题此轮采集到的数据量为:{}", new Date(), itemCount);
    TipsUtils.addHotList(HotSearchType.知乎热搜榜单.name(), topSearches);
    logger.info("知乎热搜话题采集结束...");
}
/**
 * Collects the Zhihu hot list for the digital sub-category.
 *
 * <p>Scheduled for second 20 of every minute and run on the
 * {@code myScheduler} executor; delegates to {@code crawlerZhiHuChild}
 * with the {@code DIGITAL} key.
 */
@Async("myScheduler")
@Scheduled(cron = "20 * * * * ? ")
public void crawlerZhiHuDigital() {
    crawlerZhiHuChild(DIGITAL);
}
/**
 * Collects the Zhihu hot-search list for the international category.
 * <p>
 * Triggered by the cron expression {@code "20 * * * * ? "} (second 20 of every minute) and
 * marked {@code @Async} with the {@code myScheduler} qualifier. Delegates to
 * {@code crawlerZhiHuChild} with the {@code FOCUS} type constant.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "20 * * * * ? ")
public void crawlerZhiHuFocus(){
    crawlerZhiHuChild(FOCUS);
}
/**
 * Collects the Zhihu hot-search list for the current-affairs category.
 * <p>
 * Triggered by the cron expression {@code "20 * * * * ? "} (second 20 of every minute) and
 * marked {@code @Async} with the {@code myScheduler} qualifier. Delegates to
 * {@code crawlerZhiHuChild} with the {@code DEPTH} type constant.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "20 * * * * ? ")
public void crawlerZhiHuDepth(){
    crawlerZhiHuChild(DEPTH);
}
/**
 * Collects the Weibo super-topic list and stores it through {@code WeiboSuperTopicDAO}.
 * <p>
 * Triggered by the cron expression {@code "0 0 0/3 * * ? "} (on the hour, every third hour
 * starting at 00:00) and marked {@code @Async} with the {@code myScheduler} qualifier.
 * Each crawled topic is converted into a {@code Document} keyed by the topic id and the
 * whole batch is passed to {@code addTopicList} in one call.
 * <p>
 * When the crawler returns {@code null} the round is logged as having collected 0 entries
 * and nothing is written, instead of failing with a {@code NullPointerException}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 0 0/3 * * ? ")
public void crawlerWeiBoSuperTopic(){
    logger.info("微博超话采集开始........");
    WeiboSuperTopicDAO weiboTopicDAO = new WeiboSuperTopicDAO();
    List<WeiboSuperTopic> list = WeiboSuperTopicCrawler.startCrawler();
    logger.info("{}, 微博超话此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));

    // The log line above already treats null as "nothing collected"; do the same here
    // rather than dereferencing a null list in the loop below.
    if (list == null) {
        logger.info("微博超话采集结束........");
        return;
    }

    List<Document> data = new ArrayList<>(list.size());
    for (WeiboSuperTopic topic : list) {
        logger.info("topic::::{}", topic);
        Document doc = new Document();
        doc.put("_id", topic.getId());
        doc.put("name", topic.getTopicName());
        doc.put("rank", topic.getRank());
        doc.put("score_num", topic.getScore());
        doc.put("fensi_num", topic.getFensi());
        doc.put("post_num", topic.getPostNum());
        doc.put("type", topic.getType());
        doc.put("day", topic.getDay());
        doc.put("time", topic.getTime());
        doc.put("url", topic.getUrl());
        data.add(doc);
    }
    weiboTopicDAO.addTopicList(data);
    // Closing message now names the same feed as the opening one (super topics).
    logger.info("微博超话采集结束........");
}
/**
 * Shared crawl routine for the Zhihu hot-search sub-categories.
 * <p>
 * Resolves the display name for {@code type} via {@code getTypeName}, crawls the category,
 * logs the number of entries collected (0 when the crawler returns {@code null}) and hands
 * the list to {@code TipsUtils.addHotList} under that display name.
 *
 * @param type category key passed through to the crawler and used to look up the display name
 */
private void crawlerZhiHuChild(String type){
    Date crawlTime = DateUtils.getMillSecondTime(new Date());
    String typeName = getTypeName(type);
    logger.info("知乎{}话题热榜采集开始...", typeName);

    List<HotSearchList> hotList = ZhihuChildHotSearchCrawler.getZhihuTopicSearch(type, typeName, crawlTime);

    // A null result is reported as zero collected entries.
    int collected;
    if (hotList == null) {
        collected = 0;
    } else {
        collected = hotList.size();
    }
    logger.info("{}, 知乎{}话题此轮采集到的数据量为:{}", new Date(), typeName, Integer.valueOf(collected));

    TipsUtils.addHotList(typeName, hotList);
    logger.info("知乎{}话题热榜采集结束...", typeName);
}
/**
 * Maps a Zhihu sub-category key to its Chinese display name.
 *
 * @param type category key; {@code "digital"}, {@code "focus"} and {@code "depth"} are recognised
 * @return the display name for the key, or an empty string for any other key
 */
private String getTypeName(String type){
    switch (type) {
        case "digital":
            return "数码";
        case "focus":
            return "国际";
        case "depth":
            return "时事";
        default:
            // Unrecognised keys map to an empty name.
            return "";
    }
}
}
package com.zhiwei.searchhotcrawler.util;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
/**
 * Static helpers for shifting, truncating and formatting {@link Date} values.
 * <p>
 * All calendar arithmetic and formatting goes through {@link Calendar#getInstance()} and
 * {@link SimpleDateFormat}, i.e. it uses the JVM's default time zone and locale. A fresh
 * calendar/formatter is created on every call, so no mutable state is shared between callers.
 */
public class DateUtils {

    /**
     * Returns the time obtained by adding {@code month} months to {@code date}.
     *
     * @param date  base time
     * @param month number of months to add; negative values go back in time
     * @return the shifted time
     */
    public static Date getMonthByMonth(Date date,Integer month){
        return shift(date, Calendar.MONTH, month);
    }

    /**
     * Returns the time one week (seven days) after {@code date}.
     *
     * @param date base time
     * @return the time one week later
     */
    public static Date getNextWeek(Date date){
        return shift(date, Calendar.WEEK_OF_MONTH, 1);
    }

    /**
     * Returns the time obtained by adding {@code hour} hours to {@code date}.
     *
     * @param date base time
     * @param hour number of hours to add; negative values go back in time
     * @return the shifted time
     */
    public static Date getDateByHour(Date date,Integer hour){
        return shift(date, Calendar.HOUR, hour);
    }

    /**
     * Returns the time obtained by adding {@code days} days to {@code date}.
     *
     * @param date base time
     * @param days number of days to add; negative values go back in time
     * @return the shifted time
     */
    public static Date getDateByDays(Date date, Integer days){
        return shift(date, Calendar.DATE, days);
    }

    /**
     * Returns the time obtained by adding {@code minutes} minutes to {@code date}.
     *
     * @param date    base time
     * @param minutes number of minutes to add; negative values go back in time
     * @return the shifted time
     */
    public static Date getDateByMinutes(Date date, Integer minutes){
        return shift(date, Calendar.MINUTE, minutes);
    }

    /**
     * Returns the time one year after {@code date}.
     *
     * @param date base time
     * @return the time one year later
     */
    public static Date getNextYear(Date date){
        return shift(date, Calendar.YEAR, 1);
    }

    /**
     * Returns the time one year before {@code date}.
     *
     * @param date base time
     * @return the time one year earlier
     */
    public static Date getLastYear(Date date){
        return shift(date, Calendar.YEAR, -1);
    }

    /**
     * Truncates {@code date} to the start of its hour.
     *
     * @param date time to truncate
     * @return the same hour with minute, second and millisecond set to zero
     */
    public static Date getDateToAccurateHour(Date date){
        Calendar calendar = Calendar.getInstance();
        calendar.setTime(date);
        calendar.set(Calendar.MINUTE,0);
        calendar.set(Calendar.SECOND,0);
        // Milliseconds must be cleared too, otherwise the result is not on the hour boundary.
        calendar.set(Calendar.MILLISECOND,0);
        return calendar.getTime();
    }

    /**
     * Formats the year of {@code date}.
     *
     * @param date time to format
     * @return the year formatted with the pattern {@code yyyy}
     */
    public static String getYearFormat(Date date){
        return format(date, "yyyy");
    }

    /**
     * Formats the year and month of {@code date}.
     *
     * @param date time to format
     * @return the value formatted with the pattern {@code yyyy年MM月}
     */
    public static String getMonthFormat(Date date){
        return format(date, "yyyy年MM月");
    }

    /**
     * Formats the year, month and day of {@code date}.
     *
     * @param date time to format
     * @return the value formatted with the pattern {@code yyyy-MM-dd}
     */
    public static String getDayFormat(Date date){
        return format(date, "yyyy-MM-dd");
    }

    /**
     * Formats {@code date} down to the hour.
     *
     * @param date time to format
     * @return the value formatted with the pattern {@code yyyy-MM-dd HH}
     */
    public static String getHourFormat(Date date){
        return format(date, "yyyy-MM-dd HH");
    }

    /**
     * Formats {@code date} down to the second.
     *
     * @param date time to format
     * @return the value formatted with the pattern {@code yyyy-MM-dd HH:mm:ss}
     */
    public static String getTimeFormat(Date date){
        return format(date, "yyyy-MM-dd HH:mm:ss");
    }

    /**
     * Returns the day strings for the seven days ending at {@code date}.
     *
     * @param date the most recent day of the range
     * @return seven {@code yyyy-MM-dd} strings, starting with {@code date} itself and going
     *         back one day per entry
     */
    public static List<String> getWeekTimeFormat(Date date){
        List<String> weekList = new ArrayList<>();
        Date day = date;
        for(int i=0; i<7; i++){
            weekList.add(getDayFormat(day));
            day = getDateByDays(day,-1);
        }
        return weekList;
    }

    /**
     * Clears the millisecond component of {@code date}.
     *
     * @param date time to truncate
     * @return the same time with milliseconds set to zero
     */
    public static Date getMillSecondTime(Date date){
        Calendar calendar = Calendar.getInstance();
        calendar.setTime(date);
        calendar.set(Calendar.MILLISECOND, 0);
        return calendar.getTime();
    }

    /** Adds {@code amount} units of the given {@link Calendar} field to {@code date}. */
    private static Date shift(Date date, int field, int amount){
        Calendar calendar = Calendar.getInstance();
        calendar.setTime(date);
        calendar.add(field, amount);
        return calendar.getTime();
    }

    /** Formats {@code date} with a fresh {@link SimpleDateFormat} for {@code pattern}. */
    private static String format(Date date, String pattern){
        return new SimpleDateFormat(pattern).format(date);
    }
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- Spring application context: turns on annotation processing, annotation-driven task scheduling and component scanning for the crawler package. -->
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:context="http://www.springframework.org/schema/context"
xmlns:aop="http://www.springframework.org/schema/aop" xmlns:tx="http://www.springframework.org/schema/tx"
xmlns:task="http://www.springframework.org/schema/task"
xsi:schemaLocation="http://www.springframework.org/schema/context
http://www.springframework.org/schema/context/spring-context-4.2.xsd
http://www.springframework.org/schema/beans
http://www.springframework.org/schema/beans/spring-beans-4.2.xsd
http://www.springframework.org/schema/tx
http://www.springframework.org/schema/tx/spring-tx-4.2.xsd
http://www.springframework.org/schema/aop
http://www.springframework.org/schema/aop/spring-aop-4.2.xsd
http://www.springframework.org/schema/task
http://www.springframework.org/schema/task/spring-task.xsd">
<!-- Enable the annotation processors -->
<context:annotation-config />
<!-- Annotation-based timers: scheduling annotations are driven by the "myScheduler" bean declared on the next line -->
<task:annotation-driven scheduler="myScheduler" />
<!-- Scheduler with a pool size of 50; its id is the name the crawler methods reference in @Async(value = "myScheduler") -->
<task:scheduler id="myScheduler" pool-size="50"/>
<!-- Enable automatic component scanning; the scan root is given by the base-package attribute -->
<context:component-scan base-package="com.zhiwei.searchhotcrawler" />
</beans>
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment