Commit d00d9860 by 马黎滨

Merge branch 'mlbWork' into 'master'

Mlb work

See merge request !7
parents d767f59c c209c204
......@@ -50,6 +50,11 @@
<artifactId>lombok</artifactId>
<version>1.18.8</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.6.0.4-RELEASE</version>
</dependency>
</dependencies>
<build>
......
......@@ -8,5 +8,6 @@ public enum HotSearchType {
搜狗微信热搜,
微博话题,
今日头条热搜,
知乎热搜榜单
知乎热搜榜单,
腾讯新闻
}
......@@ -7,6 +7,8 @@ import java.util.List;
import java.util.Objects;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
......@@ -29,7 +31,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
@Log4j2
public class BaiDuHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/**
* @Title: BaiDuHotSearchTest
......@@ -39,16 +41,18 @@ public class BaiDuHotSearchCrawler {
*/
public static List<HotSearchList> baiduHotSearch() {
String url = "http://top.baidu.com/buzz?b=1&fr=topindex";
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if (htmlBody != null && htmlBody.contains("mainBody")) {
return ansysData(htmlBody);
} else {
log.info("解析百度风云榜时出现解析错误,页面结构有问题");
}
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
log.error("解析百度风云榜时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("mainBody")) {
return ansysData(htmlBody);
} else {
log.info("解析百度风云榜时出现解析错误,页面结构有问题");
}
return Collections.emptyList();
}
......
......@@ -5,6 +5,8 @@ import java.util.ArrayList;
import java.util.List;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -28,7 +30,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
@Log4j2
public class DouyinHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).build();
/**
* @Title: getMobileDouyinHotList
......@@ -40,34 +42,36 @@ public class DouyinHotSearchCrawler {
public static List<HotSearchList> getMobileDouyinHotList(){
List<HotSearchList> list = null;
String url = "https://api.amemv.com/aweme/v1/hot/search/list/";
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("word_list")){
list = new ArrayList<>();
JSONObject data = JSONObject.parseObject(htmlBody);
JSONArray wordList = data.getJSONObject("data").getJSONArray("word_list");
String positionStr = null;
String word = null;
String hotValueStr = null;
for (int i = 0; i < wordList.size(); i++) {
JSONObject wl = wordList.getJSONObject(i);
//获取排名
positionStr = wl.getString("position");
Integer position = null;
position = Integer.valueOf(positionStr);
//获取关键词
word = wl.getString("word");
//获取热度值
hotValueStr =wl.getString("hot_value");
Integer hotValue = null;
hotValue = Integer.valueOf(hotValueStr);
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
}catch (IOException e) {
log.debug("获取抖音热搜榜时出现问题:{}", e);
}
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("word_list")) {
list = new ArrayList<>();
JSONObject data = JSONObject.parseObject(htmlBody);
JSONArray wordList = data.getJSONObject("data").getJSONArray("word_list");
String positionStr = null;
String word = null;
String hotValueStr = null;
for (int i = 0; i < wordList.size(); i++) {
JSONObject wl = wordList.getJSONObject(i);
//获取排名
positionStr = wl.getString("position");
Integer position = null;
position = Integer.valueOf(positionStr);
//获取关键词
word = wl.getString("word");
//获取热度值
hotValueStr = wl.getString("hot_value");
Integer hotValue = null;
hotValue = Integer.valueOf(hotValueStr);
// logger.info("热度为:::{}", hot_value);
HotSearchList douyin = new HotSearchList(null,word, hotValue, position,HotSearchType.抖音热搜.name());
list.add(douyin);
}
HotSearchList douyin = new HotSearchList(null, word, hotValue, position, HotSearchType.抖音热搜.name());
list.add(douyin);
}
} catch (IOException e) {
log.debug("获取抖音热搜榜时出现问题:{}", e);
}
return list;
}
......
......@@ -7,6 +7,8 @@ import java.util.Map;
import java.util.Objects;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
......@@ -31,7 +33,7 @@ import com.zhiwei.tools.httpclient.HeaderTool;
@Log4j2
public class SougoHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).build();
/**
* @Title: SougoHotSearchTest
......@@ -41,55 +43,57 @@ public class SougoHotSearchCrawler {
*/
public static List<HotSearchList> sougoHotSearch() {
String url = "https://weixin.sogou.com";
List<HotSearchList> list = new ArrayList<>();
Map<String,String> headMap = HeaderTool.getCommonHead();
Request request = RequestUtils.wrapGet(url, headMap);
for (int i = 0; i < 3; i++) {
String htmlBody = null;
try {
Map<String,String> headMap = HeaderTool.getCommonHead();
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if (htmlBody != null && htmlBody.contains("topwords")) {
try {
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("ol#topwords").select("li");
for (Element element : elements) {
try {
// 获取排名rank
String rankStr = null;
if (!element.select("li").select("i").isEmpty()) {
rankStr = element.select("li").select("i").text();
}
Integer rank = null;
if (StringUtils.isNoneBlank(rankStr)) {
rank = Integer.valueOf(rankStr);
}
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
}catch (Exception e) {
log.error("解析搜狗微信时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("topwords")) {
try {
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("ol#topwords").select("li");
for (Element element : elements) {
try {
// 获取排名rank
String rankStr = null;
if (!element.select("li").select("i").isEmpty()) {
rankStr = element.select("li").select("i").text();
}
Integer rank = null;
if (StringUtils.isNoneBlank(rankStr)) {
rank = Integer.valueOf(rankStr);
}
// 获取关键词(String)
String kw = element.select("li").select("a").attr("title");
// 获取关键词(String)
String kw = element.select("li").select("a").attr("title");
// logger.info("关键词:{}", kw);
String everurl = element.select("li").select("a").attr("href");
String everurl = element.select("li").select("a").attr("href");
HotSearchList hotSearch = new HotSearchList(everurl, kw, null, rank, HotSearchType.搜狗微信热搜.name());
HotSearchList hotSearch = new HotSearchList(everurl, kw, null, rank, HotSearchType.搜狗微信热搜.name());
if (Objects.nonNull(rank)) {
list.add(hotSearch);
}
} catch (Exception e) {
log.error("解析搜狗微信时出现解析错误", e);
if (Objects.nonNull(rank)) {
list.add(hotSearch);
}
} catch (Exception e) {
log.error("解析搜狗微信时出现解析错误", e);
}
} catch (Exception e) {
log.error("解析搜狗微信时出现解析错误,数据不是json结构", e.fillInStackTrace());
return Collections.emptyList();
}
} else {
log.info("解析搜狗微信时出现解析错误,页面结构有问题");
break;
} catch (Exception e) {
log.error("解析搜狗微信时出现解析错误,数据不是json结构", e.fillInStackTrace());
return Collections.emptyList();
}
break;
} catch (Exception e) {
log.error("解析搜狗微信时出现解析错误,页面结构有问题", e);
} else {
log.info("解析搜狗微信时出现解析错误,页面结构有问题");
}
}
return list;
}
......
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
@Log4j2
public class TengXunCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* 腾讯热榜数据采集
* @return
*/
public static List<HotSearchList> getTengXunHotList() {
log.info("腾讯新闻热榜开始采集...");
List<HotSearchList> list = new ArrayList<>();
JSONArray dataJson = null;
String htmlBody = null;
String url = "https://r.inews.qq.com/getWeiboRankingList?chlid=news_recommend_hot&appver=28_android_4.2.40&devid=&qn-rid=&qn-sig=f690e21095559203e3f55c42a04f8f15";
Request request = RequestUtils.wrapGet(url);
//采集为空最多重试3次
for (int t = 0; t < 3 && dataJson == null; t++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
e.printStackTrace();
}
if (htmlBody != null && htmlBody.contains("idlist")) {
JSONObject topSearch = JSONObject.parseObject(htmlBody);
dataJson = topSearch.getJSONArray("idlist").getJSONObject(0).getJSONArray("newslist");
for (int i = 1; i < dataJson.size(); i++) {
Integer rank = i;
String name = dataJson.getJSONObject(i).getString("title");
String tengxunUrl = "https://view.inews.qq.com/topic/" + dataJson.getJSONObject(i).getString("id");
Integer count = 0;
String icon = null;
if (dataJson.getJSONObject(i).containsKey("topic")) {
count = dataJson.getJSONObject(i).getJSONObject("topic").getIntValue("ranking_score");
if (dataJson.getJSONObject(i).getJSONObject("topic").containsKey("rec_icon")) {
icon = dataJson.getJSONObject(i).getJSONObject("topic").getString("rec_icon");
}
} else if (dataJson.getJSONObject(i).containsKey("hotEvent")) {
count = dataJson.getJSONObject(i).getJSONObject("hotEvent").getIntValue("hotScore");
if (dataJson.getJSONObject(i).getJSONObject("hotEvent").containsKey("rec_icon")) {
icon = dataJson.getJSONObject(i).getJSONObject("hotEvent").getString("rec_icon");
}
}
if (icon != null) {
if (icon.contains("11918331890")) {
icon = "热";
} else if (icon.contains("11918332271")) {
icon = "新";
}
}
HotSearchList hotSearchList = new HotSearchList(tengxunUrl, name, count, false, rank, HotSearchType.腾讯新闻.name(), icon);
list.add(hotSearchList);
}
}
ZhiWeiTools.sleep(3000L);
}
log.info("{}, 此轮腾讯新闻热榜采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
log.info("腾讯新闻采集结束");
return list;
}
}
......@@ -9,6 +9,8 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.tools.URLCodeUtil;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
......@@ -37,56 +39,60 @@ public class ToutiaoHotSearchCrawler {
public static List<HotSearchList> toutiaoHotSearchByPhone(){
String origin = "hot_board";
String jsUrl = "https://s3.pstatp.com/toutiao/feoffline/hot_list/resource/hot_list/js/index.45f50250.chunk.js";
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(jsUrl)).body().string();
if(htmlBody.contains("origin")){
String s = htmlBody.substring(htmlBody.indexOf("origin:")+"origin:".length());
origin = s.substring(1,s.indexOf("}")-1);
}
Request jsRequest = RequestUtils.wrapGet(jsUrl);
String jsBody = null;
try(Response response = httpBoot.syncCall(jsRequest)) {
jsBody = response.body().string();
} catch (IOException e) {
log.error("获取今日头条实时热搜头部信息标识失败",e);
}
if(jsBody != null && jsBody.contains("origin")){
String s = jsBody.substring(jsBody.indexOf("origin:")+"origin:".length());
origin = s.substring(1,s.indexOf("}")-1);
}
//采集头条内容
String url = "https://i.snssdk.com/hot-event/hot-board/?origin="+origin;
Map<String,String> headerMap = new HashMap<>();
headerMap.put("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1");
headerMap.put("referer","https://ib.snssdk.com/rogue/aladdin_landingpage/template/aladdin_landingpage/hot_words.html?isBrowser=true&traffic_source=");
String htmlBody = null;
Request request = RequestUtils.wrapGet(url, headerMap);
for(int count =0; count<=5; count++){
String url = "https://i.snssdk.com/hot-event/hot-board/?origin="+origin;
Map<String,String> headerMap = new HashMap<>();
headerMap.put("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1");
headerMap.put("referer","https://ib.snssdk.com/rogue/aladdin_landingpage/template/aladdin_landingpage/hot_words.html?isBrowser=true&traffic_source=");
String htmlBody;
try {
List<HotSearchList> result = new ArrayList<HotSearchList>();
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){
try {
JSONArray words = JSONObject.parseObject(htmlBody).getJSONArray("data");
int rank = 1;
for(int i=0;i<words.size();i++){
try {
JSONObject word = words.getJSONObject(i);
String name = word.getString("Title");
String link = "https://ib.snssdk.com/search/?keyword="+ URLCodeUtil.getURLEncode(name, "utf-8") +"&pd=synthesis&source=trending_list&traffic_source=";
Integer hotCount = word.getInteger("HotValue");
String wordsType = word.getString("Label");
String icon = getIcon(wordsType);
try(Response response = httpBoot.syncCall(request)) {
htmlBody = response.body().string();
} catch (IOException e1) {
log.error("解析今日头条实时热搜时出现连接失败",e1);
}
List<HotSearchList> result = new ArrayList<HotSearchList>();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) {
try {
JSONArray words = JSONObject.parseObject(htmlBody).getJSONArray("data");
int rank = 1;
for (int i = 0; i < words.size(); i++) {
try {
JSONObject word = words.getJSONObject(i);
String name = word.getString("Title");
String link = "https://ib.snssdk.com/search/?keyword=" + URLCodeUtil.getURLEncode(name, "utf-8") + "&pd=synthesis&source=trending_list&traffic_source=";
Integer hotCount = word.getInteger("HotValue");
String wordsType = word.getString("Label");
String icon = getIcon(wordsType);
HotSearchList hotSearch = new HotSearchList(link, name, hotCount, true, rank, HotSearchType.今日头条热搜.name(), icon);
result.add(hotSearch);
rank++;
} catch (Exception e) {
log.error("解析今日头条实时热搜时出现解析错误",e);
continue;
}
HotSearchList hotSearch = new HotSearchList(link, name, hotCount, true, rank, HotSearchType.今日头条热搜.name(), icon);
result.add(hotSearch);
rank++;
} catch (Exception e) {
log.error("解析今日头条实时热搜时出现解析错误", e);
continue;
}
return result;
} catch (Exception e) {
log.error("解析今日头条实时热搜时出现解析错误,数据不是json结构",e);
}
}else{
log.info("解析今日头条实时热搜时出现解析错误,页面结构有问题");
return result;
} catch (Exception e) {
log.error("解析今日头条实时热搜时出现解析错误,数据不是json结构", e);
}
} catch (IOException e1) {
log.error("解析今日头条实时热搜时出现连接失败",e1);
} else {
log.info("解析今日头条实时热搜时出现解析错误,页面结构有问题");
}
}
return Collections.emptyList();
}
......
......@@ -4,6 +4,8 @@ import java.io.IOException;
import java.util.*;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
......@@ -44,52 +46,52 @@ public class WeiboHotSearchCrawler {
List<HotSearchList> list = new ArrayList<HotSearchList>();
for(int i =0; i<3; i++){
String htmlBody = null;
try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(htmlBody!=null && htmlBody.contains("pl_top_realtimehot")){
try {
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
if(i==2){
return list;
}else{
continue;
}
}
if (htmlBody != null && htmlBody.contains("pl_top_realtimehot")) {
try {
// String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0];
// script = script.replace("(", "").replace(")", "");
// JSONObject json = JSONObject.parseObject(script);
// String html = json.getString("html");
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("div#pl_top_realtimehot").select("tbody").select("tr");
for(Element element : elements){
try {
String id = "http://s.weibo.com"+element.select("td.td-02").select("a").attr("href");
String name = element.select("td.td-02").select("a").text();
String num = !element.select("td.td-02").select("span").text().equals("")?element.select("td.td-02").select("span").text():"0";
String rank = !element.select("td[class=\"td-01 ranktop\"]").text().equals("")?element.select("td[class=\"td-01 ranktop\"]").text():"-1";
int hotCount = Integer.valueOf(num);
int rankCount = Integer.valueOf(rank);
HotSearchList hotSearch = new HotSearchList(id, name, hotCount,true, rankCount, HotSearchType.微博热搜.name(),null);
list.add(hotSearch);
} catch (Exception e) {
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
log.error("解析微博时时热搜时出现解析错误", e);
continue;
}
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("div#pl_top_realtimehot").select("tbody").select("tr");
for (Element element : elements) {
try {
String id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href");
String name = element.select("td.td-02").select("a").text();
String num = !element.select("td.td-02").select("span").text().equals("") ? element.select("td.td-02").select("span").text() : "0";
String rank = !element.select("td[class=\"td-01 ranktop\"]").text().equals("") ? element.select("td[class=\"td-01 ranktop\"]").text() : "-1";
int hotCount = Integer.valueOf(num);
int rankCount = Integer.valueOf(rank);
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, true, rankCount, HotSearchType.微博热搜.name(), null);
list.add(hotSearch);
} catch (Exception e) {
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
log.error("解析微博时时热搜时出现解析错误", e);
continue;
}
} catch (Exception e) {
log.error("解析微博时时热搜时出现解析错误,数据不是json结构",e.fillInStackTrace());
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
return null;
}
}else{
} catch (Exception e) {
log.error("解析微博时时热搜时出现解析错误,数据不是json结构", e.fillInStackTrace());
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
}
break;
} catch (Exception e) {
if(i==2){
return list;
}else{
continue;
return null;
}
} else {
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
}
break;
}
return list;
}
......@@ -103,61 +105,61 @@ public class WeiboHotSearchCrawler {
* @return void 返回类型
*/
public static List<HotSearchList> weiboHotSearchByPhone(){
String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583";
Map<String,String> headerMap = new HashMap<>();
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
String htmlBody = null;
Request request = RequestUtils.wrapGet(url, headerMap);
for(int count =0; count<=5; count++){
String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583";
Map<String,String> headerMap = new HashMap<>();
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
String htmlBody;
try {
List<HotSearchList> result = new ArrayList<HotSearchList>();
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")){
try {
JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data");
JSONArray cards = json.getJSONArray("cards");
int rank = 0;
for(int i=0;i<cards.size();i++){
try {
JSONObject card = cards.getJSONObject(i);
JSONArray cardGroup = card.getJSONArray("card_group");
if(Objects.nonNull(cardGroup) && !cardGroup.isEmpty()){
String title = card.getString("title");
boolean hot = true;
if(Objects.nonNull(title) && title.contains("实时上升热点")){
hot = false;
rank = 51;
}
for(int j=0; j<cardGroup.size(); j++){
JSONObject cardInfo = cardGroup.getJSONObject(j);
String name = cardInfo.getString("desc");
int hotCount = cardInfo.getIntValue("desc_extr");
String icon = cardInfo.getString("icon");
if(StringUtils.isNotBlank(icon)){
icon = icon.split("_")[1].split(".png")[0];
}
String id = "http://s.weibo.com/weibo/"+URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon);
result.add(hotSearch);
rank++;
try(Response response = httpBoot.syncCall(request)) {
htmlBody = response.body().string();
} catch (IOException e1) {
log.error("解析微博时时热搜时出现连接失败",e1);
}
List<HotSearchList> result = new ArrayList<HotSearchList>();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
try {
JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data");
JSONArray cards = json.getJSONArray("cards");
int rank = 0;
for (int i = 0; i < cards.size(); i++) {
try {
JSONObject card = cards.getJSONObject(i);
JSONArray cardGroup = card.getJSONArray("card_group");
if (Objects.nonNull(cardGroup) && !cardGroup.isEmpty()) {
String title = card.getString("title");
boolean hot = true;
if (Objects.nonNull(title) && title.contains("实时上升热点")) {
hot = false;
rank = 51;
}
for (int j = 0; j < cardGroup.size(); j++) {
JSONObject cardInfo = cardGroup.getJSONObject(j);
String name = cardInfo.getString("desc");
int hotCount = cardInfo.getIntValue("desc_extr");
String icon = cardInfo.getString("icon");
if (StringUtils.isNotBlank(icon)) {
icon = icon.split("_")[1].split(".png")[0];
}
}else{
log.info("card 数据结构为:{}", card);
String id = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon);
result.add(hotSearch);
rank++;
}
} catch (Exception e) {
log.error("解析微博时时热搜时出现解析错误",e);
continue;
} else {
log.info("card 数据结构为:{}", card);
}
} catch (Exception e) {
log.error("解析微博时时热搜时出现解析错误", e);
continue;
}
return result;
} catch (Exception e) {
log.error("解析微博时时热搜时出现解析错误,数据不是json结构",e);
}
}else{
log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
return result;
} catch (Exception e) {
log.error("解析微博时时热搜时出现解析错误,数据不是json结构", e);
}
} catch (IOException e1) {
log.error("解析微博时时热搜时出现连接失败",e1);
} else {
log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
}
}
return Collections.emptyList();
......
......@@ -10,6 +10,8 @@ import java.util.Objects;
import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -52,29 +54,29 @@ public class WeiboSuperTopicCrawler {
urlMap.put("明星上升", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=up&from=&wm=");
List<WeiboSuperTopic> topicList = new ArrayList<>();
for(Entry<String,String> entry : urlMap.entrySet()) {
String url = entry.getValue();
String type = entry.getKey();
for(int page= 1; page<=5; page++) {
String pageUrl = url + "&page=" + page;
Request request = RequestUtils.wrapGet(pageUrl, headMap);
String htmlBody = null;
//重试三次
for(int retryTimes = 1; retryTimes<=3; retryTimes++) {
try {
// System.out.println("pageUrl=========="+pageUrl);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc1")) {
topicList.addAll(parseTopicRankHtml(page, htmlBody, type));
break;
}else {
log.error("获取榜单列表页面时数据格式错误,页面为:{}", htmlBody);
}
} catch (Exception e) {
try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
}catch (Exception e) {
log.error("获取榜单列表页面时出现错误,错误为:{}", e);
continue;
}
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc1")) {
topicList.addAll(parseTopicRankHtml(page, htmlBody, type));
break;
} else {
log.error("获取榜单列表页面时数据格式错误,页面为:{}", htmlBody);
}
}
}
}
return topicList;
......@@ -136,23 +138,24 @@ public class WeiboSuperTopicCrawler {
*/
private static WeiboSuperTopic getTopicInfo(String id, WeiboSuperTopic topic) {
for(int retryTimes=1; retryTimes<=3; retryTimes++) {
try {
String url = "https://m.weibo.cn/api/container/getIndex?containerid="+ id;
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc_more")) {
String descMore = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("pageInfo").getJSONArray("desc_more").getString(0);
if(StringUtils.isNotBlank(descMore)) {
String readNum = descMore.replaceAll("阅读|帖子.*", "").trim();
String postNum = descMore.replaceAll(".*帖子|粉丝.*", "").trim();
topic.setPostNum(postNum);
topic.setReadNum(readNum);
return topic;
}
}
String url = "https://m.weibo.cn/api/container/getIndex?containerid="+ id;
Request request = RequestUtils.wrapGet(url);
String htmlBody = null;
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
log.error("解析榜单详情页面时出现错误,错误为:{}", e);
}
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc_more")) {
String descMore = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("pageInfo").getJSONArray("desc_more").getString(0);
if (StringUtils.isNotBlank(descMore)) {
String readNum = descMore.replaceAll("阅读|帖子.*", "").trim();
String postNum = descMore.replaceAll(".*帖子|粉丝.*", "").trim();
topic.setPostNum(postNum);
topic.setReadNum(readNum);
return topic;
}
}
}
return topic;
}
......
......@@ -11,6 +11,8 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
import com.zhiwei.tools.tools.URLCodeUtil;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
......@@ -131,21 +133,23 @@ public class WeiboTopicCrawler {
List<HotSearchList> topicList = new ArrayList<>();
for(int page=1; page<=6; page++){
String pageUrl = "https://api.weibo.cn/2/page?gsid=_2A25zJX_EDeRxGedH71YS8CzKzzmIHXVuc_QMrDV6PUJbkdANLXPbkWpNUK3OyitGCJsX8exvua-vfubUqCiaA4lb&from=10A1193010&c=iphone&s=2827eebe&count=20&containerid=106003type%253D25%2526t%253D3%2526disable_hot%253D1%2526filter_type%253Dtopicscene&page=" + page;
Request request = RequestUtils.wrapGet(pageUrl);
String htmlBody = null;
//重试三次
for(int retryTimes = 1; retryTimes<=5; retryTimes++) {
try {
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
// log.info("pageUrl::{}", pageUrl);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
topicList.addAll(parseTopicHtml(htmlBody));
break;
}else {
log.info("下载榜单列表页面时数据格式错误,页面为:{}", htmlBody);
}
htmlBody = response.body().string();
} catch (Exception e) {
log.error("下载榜单列表页面时出现错误,错误为:{}", e);
continue;
}
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
topicList.addAll(parseTopicHtml(htmlBody));
break;
} else {
log.info("下载榜单列表页面时数据格式错误,页面为:{}", htmlBody);
}
}
}
return topicList;
......
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@Log4j2
public class ZhihuChildHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* 知乎子级分类数据采集
* @param type
* @param typeName
* @return
*/
public static List<HotSearchList> getZhihuTopicSearch(String type,String typeName) {
List<HotSearchList> list = new ArrayList<>();
String url = "https://www.zhihu.com/api/v3/feed/topstory/hot-lists/"+type;
Map<String,String> headerMap = new HashMap<>();
headerMap.put("x-api-version","3.0.76");
JSONArray dataJson =null;
String htmlBody =null;
Request request = RequestUtils.wrapGet(url, headerMap);
//采集为空最多重试3次
for (int t = 0; t < 3 && dataJson == null; t++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
e.printStackTrace();
}
if (htmlBody != null && htmlBody.contains("data")) {
JSONObject topSearch = JSONObject.parseObject(htmlBody);
dataJson = topSearch.getJSONArray("data");
for (int i = 0; i < dataJson.size(); i++) {
JSONObject jsonObject = dataJson.getJSONObject(i).getJSONObject("target");
Integer rank = i + 1;
String name = jsonObject.getJSONObject("title_area").getString("text");
String hotCountString = jsonObject.getJSONObject("metrics_area").getString("text");
Integer count = getHotCount(hotCountString);
String childUrl = jsonObject.getJSONObject("link").getString("url");
HotSearchList hotSearchList = new HotSearchList(childUrl, name, count, rank, HotSearchType.知乎热搜.name() + typeName + "分类");
list.add(hotSearchList);
}
}
ZhiWeiTools.sleep(3000L);
}
return list;
}
/**
* 截取出热度值
* @param hotCountString
* @return
*/
private static Integer getHotCount(String hotCountString){
Integer count;
if(hotCountString.contains("万")){
hotCountString = hotCountString.replaceAll("万.*", "").trim();
count = (int)(Double.parseDouble(hotCountString)*10000);
}else if(hotCountString.contains("亿")){
hotCountString = hotCountString.replaceAll("亿.*", "").trim();
count = (int)(Double.parseDouble(hotCountString)*10000000);
}else{
count = Integer.getInteger(hotCountString.substring(0, hotCountString.indexOf("领域热度")));
}
return count;
}
}
......@@ -6,6 +6,8 @@ import java.util.List;
import java.util.Map;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -46,27 +48,28 @@ public class ZhihuHotSearchCrawler {
headerMap.put("accept", "application/json, text/plain, */*");
headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
headerMap.put("Referer", rerferer);
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(htmlBody != null && htmlBody.contains("words")){
list = new ArrayList<>();
JSONObject topSearch = JSONObject.parseObject(htmlBody);
JSONArray words = topSearch.getJSONObject("top_search").getJSONArray("words");
String link = null;
String displayQuery = null;
String query = null;
for (int i = 0; i < words.size(); i++) {
JSONObject word = words.getJSONObject(i);
query = word.getString("query");
displayQuery = word.getString("display_query");
link = "https://www.zhihu.com/search?q="+URLCodeUtil.getURLEncode(query, "utf-8")+"&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content";
HotSearchList zhihu = new HotSearchList(link, displayQuery, null, i, HotSearchType.知乎热搜.name());
list.add(zhihu);
}
}
} catch (IOException e) {
Request request = RequestUtils.wrapGet(url, headerMap);
String htmlBody = null;
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
}catch (IOException e) {
log.debug("获取知乎热搜时出现问题:{}", e);
return list;
}
if (htmlBody != null && htmlBody.contains("words")) {
list = new ArrayList<>();
JSONObject topSearch = JSONObject.parseObject(htmlBody);
JSONArray words = topSearch.getJSONObject("top_search").getJSONArray("words");
String link = null;
String displayQuery = null;
String query = null;
for (int i = 0; i < words.size(); i++) {
JSONObject word = words.getJSONObject(i);
query = word.getString("query");
displayQuery = word.getString("display_query");
link = "https://www.zhihu.com/search?q=" + URLCodeUtil.getURLEncode(query, "utf-8") + "&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content";
HotSearchList zhihu = new HotSearchList(link, displayQuery, null, i, HotSearchType.知乎热搜.name());
list.add(zhihu);
}
}
return list;
}
......@@ -81,7 +84,7 @@ public class ZhihuHotSearchCrawler {
* @return List<ZhihuHotSearch> 返回类型
*/
public static List<HotSearchList> getMobileZhihuHotList(){
List<HotSearchList> list = new ArrayList<>();;
List<HotSearchList> list = new ArrayList<>();
String url = "https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0";
Map<String,String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "api.zhihu.com");
......@@ -89,43 +92,44 @@ public class ZhihuHotSearchCrawler {
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36");
headerMap.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8=");
headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(htmlBody != null && htmlBody.contains("author")){
JSONObject topSearch = JSONObject.parseObject(htmlBody);
JSONArray dataJson = topSearch.getJSONArray("data");
String link = null;
String displayQuery = null;
Integer hotCount = null;
String hotText = null;
for (int i = 0; i < dataJson.size(); i++) {
JSONObject data = dataJson.getJSONObject(i).getJSONObject("target");
displayQuery = data.getString("title");
link = "https://www.zhihu.com/question/" + data.getLongValue("id");
hotText = dataJson.getJSONObject(i).getString("detail_text");
String htmlBody = null;
Request request = RequestUtils.wrapGet(url, headerMap);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.debug("获取知乎热搜时出现问题:{}", e);
return list;
}
if (htmlBody != null && htmlBody.contains("author")) {
JSONObject topSearch = JSONObject.parseObject(htmlBody);
JSONArray dataJson = topSearch.getJSONArray("data");
String link = null;
String displayQuery = null;
Integer hotCount = null;
String hotText = null;
for (int i = 0; i < dataJson.size(); i++) {
JSONObject data = dataJson.getJSONObject(i).getJSONObject("target");
displayQuery = data.getString("title");
link = "https://www.zhihu.com/question/" + data.getLongValue("id");
hotText = dataJson.getJSONObject(i).getString("detail_text");
//计算热度
try {
if(hotText.contains("万")){
hotText = hotText.replaceAll("万.*", "").trim();
hotCount = (int)(Double.parseDouble(hotText)*10000);
}else if(hotText.contains("亿")){
hotText = hotText.replaceAll("亿.*", "").trim();
hotCount = (int)(Double.parseDouble(hotText)*10000000);
}else{
hotCount = Integer.getInteger(hotText);
}
}catch (Exception e){
e.printStackTrace();
//计算热度
try {
if (hotText.contains("万")) {
hotText = hotText.replaceAll("万.*", "").trim();
hotCount = (int) (Double.parseDouble(hotText) * 10000);
} else if (hotText.contains("亿")) {
hotText = hotText.replaceAll("亿.*", "").trim();
hotCount = (int) (Double.parseDouble(hotText) * 10000000);
} else {
hotCount = Integer.getInteger(hotText);
}
HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i+1, HotSearchType.知乎热搜.name());
list.add(zhihu);
} catch (Exception e) {
e.printStackTrace();
}
HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i + 1, HotSearchType.知乎热搜.name());
list.add(zhihu);
}
} catch (IOException e) {
log.debug("获取知乎热搜时出现问题:{}", e);
return list;
}
return list;
}
......
......@@ -10,6 +10,8 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
......@@ -30,35 +32,34 @@ public class ZhihuTopicSearchCrawler {
List<HotSearchList> list = new ArrayList<>();
String url = "https://www.zhihu.com/topsearch";
JSONObject jsonObject = null;
try {
for(int t=0 ;t<3 && jsonObject== null;t++)
{
// ZhiWeiTools.sleep(10000L);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),
ProxyHolder.NAT_HEAVY_PROXY).body().string();
// log.info("页面内容获取:{}",htmlBody);
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
for (int t = 0; t < 3 && jsonObject == null; t++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error("知乎热搜页面连接异常", e);
}
if (htmlBody != null) {
Document document = Jsoup.parse(htmlBody);
String html = document.getElementsByTag("script").select("#js-initialData").html();
jsonObject = JSONObject.parseObject(html);
}
if(jsonObject != null) {
JSONArray dataJson = jsonObject.getJSONObject("initialState").getJSONObject("topsearch").getJSONArray("data");
for (int i = 0; i < dataJson.size(); i++) {
Integer rank = i + 1;
JSONObject data = dataJson.getJSONObject(i);
String name = data.getString("queryDisplay");
String realQuery = data.getString("realQuery");
String zhihuUrl = "https://www.zhihu.com/search?q=" + realQuery + "&utm_content=search_hot&type=content";
HotSearchList hotSearchList = new HotSearchList(zhihuUrl, name, null, rank, HotSearchType.知乎热搜榜单.name());
list.add(hotSearchList);
if (jsonObject != null) {
JSONArray dataJson = jsonObject.getJSONObject("initialState").getJSONObject("topsearch").getJSONArray("data");
for (int i = 0; i < dataJson.size(); i++) {
Integer rank = i + 1;
JSONObject data = dataJson.getJSONObject(i);
String name = data.getString("queryDisplay");
String realQuery = data.getString("realQuery");
String zhihuUrl = "https://www.zhihu.com/search?q=" + realQuery + "&utm_content=search_hot&type=content";
HotSearchList hotSearchList = new HotSearchList(zhihuUrl, name, null, rank, HotSearchType.知乎热搜榜单.name());
list.add(hotSearchList);
}
return list;
}
return list;
}else{
log.error("知乎热搜榜单页面获取异常,404");
log.error(jsonObject);
} else {
log.error("知乎热搜榜单页面获取异常");
}
} catch (IOException e) {
log.error("知乎热搜获取异常", e);
}
return Collections.emptyList();
}
......
......@@ -6,10 +6,9 @@ import java.util.Date;
import java.util.List;
import java.util.Objects;
import com.mongodb.client.ListIndexesIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.*;
import com.mongodb.client.model.IndexOptions;
import com.mongodb.client.model.Sorts;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import lombok.extern.log4j.Log4j2;
......@@ -64,5 +63,20 @@ public class HotSearchListDAO{
log.error("存储数据时出错,错误为:{}", e);
}
}
/**
 * Looks up the timestamp of the most recent stored record of the given type.
 *
 * <p>Queries the collection for documents whose {@code type} field equals the argument,
 * sorts them by {@code time} descending, and reads the {@code time} field of the first hit.
 *
 * @param type value to match against the {@code type} field
 * @return the {@code time} of the first document in that order, or {@code null} when
 *         nothing matches or the query fails
 */
public Date getLastTimeByType(String type){
    BasicDBObject filter = new BasicDBObject();
    filter.put("type", type);
    // MongoCursor is Closeable; try-with-resources releases it on every exit path.
    try (MongoCursor<Document> cursor = mongoCollection.find(filter)
            .sort(Sorts.orderBy(Sorts.descending("time"))).skip(0).limit(1).iterator()) {
        if (cursor.hasNext()) {
            return (Date) cursor.next().get("time");
        }
    } catch (Exception e) {
        // Pass the throwable as the trailing argument so the stack trace is logged.
        log.error("查询数据时出错,type:{}", type, e);
    }
    return null;
}
}
......@@ -52,5 +52,7 @@ public class HotSearchRun {
new WeiboTopicRun().start();
new ToutiaoHotSearchRun().start();
new ZhihuTopSearchRun().start();
new ZhihuChildHotSearchRun().start();
new ThreadOneRun().start();
}
}
......@@ -7,6 +7,7 @@ import java.util.Objects;
import java.util.concurrent.TimeUnit;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import org.slf4j.Logger;
......@@ -47,6 +48,8 @@ public class BaiduHotSearchRun extends Thread{
if(Objects.nonNull(list) && !list.isEmpty()) {
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
} else {
TipsUtils.sendTips("百度热搜",new Date());
}
log.info("百度风云榜采集结束........");
}
......
......@@ -6,6 +6,7 @@ import java.util.List;
import java.util.concurrent.TimeUnit;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import org.slf4j.Logger;
......@@ -47,6 +48,9 @@ public class DouyinHotSearchRun extends Thread{
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
List<HotSearchList> list = DouyinHotSearchCrawler.getMobileDouyinHotList();
log.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if(list == null || list.size() == 0){
TipsUtils.sendTips("抖音热搜",new Date());
}
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
log.info("抖音热搜榜采集结束........");
......
......@@ -7,6 +7,7 @@ import java.util.concurrent.TimeUnit;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import org.slf4j.Logger;
......@@ -43,6 +44,9 @@ public class SougoHotSearchRun extends Thread {
log.info("搜狗微信采集开始........");
List<HotSearchList> list = SougoHotSearchCrawler.sougoHotSearch();
log.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if(list == null || list.size() == 0){
TipsUtils.sendTips("搜狗微信热搜",new Date());
}
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
log.info("搜狗微信采集结束........");
......
package com.zhiwei.searchhotcrawler.timer;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.TengXunCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
@Log4j2
public class ThreadOneRun extends Thread {
@Override
public void run() {
boolean f = true;
while(f) {
try {
getHotList();
TimeUnit.MINUTES.sleep(1);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*1000);
}
ZhiWeiTools.sleep(50);
}
}
private void getHotList(){
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
List<HotSearchList> list = TengXunCrawler.getTengXunHotList();
if(list == null || list.size() == 0){
TipsUtils.sendTips("腾讯新闻",new Date());
} else {
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
}
}
}
......@@ -5,6 +5,7 @@ import com.zhiwei.searchhotcrawler.crawler.ToutiaoHotSearchCrawler;
import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
......@@ -39,6 +40,9 @@ public class ToutiaoHotSearchRun extends Thread{
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
List<HotSearchList> list = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone();
log.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if(list == null || list.size() == 0){
TipsUtils.sendTips("今日头条热搜",new Date());
}
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
log.info("今日头条热搜采集结束........");
......
......@@ -6,6 +6,7 @@ import java.util.List;
import java.util.concurrent.TimeUnit;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler;
......@@ -37,6 +38,9 @@ public class WeiboHotSearchRun extends Thread{
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearchByPhone();
log.info("{}, 微博此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if(list == null || list.size() == 0){
TipsUtils.sendTips("微博热搜",new Date());
}
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
}
......
......@@ -4,6 +4,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.WeiboTopicCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
......@@ -37,6 +38,9 @@ public class WeiboTopicRun extends Thread{
log.info("微博话题采集开始........");
List<HotSearchList> list = WeiboTopicCrawler.startCrawlerByPhone();
log.info("{}, 微博话题此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if(list == null || list.size() == 0){
TipsUtils.sendTips("微博话题",new Date());
}
List<Document> data = new ArrayList<>();
for(HotSearchList topic : list){
Document doc = new Document();
......
package com.zhiwei.searchhotcrawler.timer;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.ZhihuChildHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
@Log4j2
public class ZhihuChildHotSearchRun extends Thread {
private List<String> childType = Arrays.asList("digital","focus","depth");
@Override
public void run() {
boolean f = true;
while(f) {
try {
getHotList();
TimeUnit.MINUTES.sleep(1);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*60*1000);
}
ZhiWeiTools.sleep(50);
}
}
private void getHotList() {
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
for (int i = 0; i < childType.size(); i++) {
String name = this.getTypeName(childType.get(i));
if (!"".equals(name)) {
log.info("知乎{}话题热榜采集开始...", name);
List<HotSearchList> list = ZhihuChildHotSearchCrawler.getZhihuTopicSearch(childType.get(i), name);
log.info("{}, 知乎{}话题此轮采集到的数据量为:{}", new Date(),name, Integer.valueOf(list != null ? list.size() : 0));
if (list == null || list.size() == 0) {
TipsUtils.sendTips("知乎热搜"+name+"分类", new Date());
}
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
log.info("知乎{}话题热榜采集结束...", name);
ZhiWeiTools.sleep(3000);
}
}
}
private String getTypeName(String type){
String name;
switch (type) {
case "digital":
name = "数码";
break;
case "focus":
name = "国际";
break;
case "depth":
name = "时事";
break;
default:
name = "";
}
return name;
}
}
......@@ -6,6 +6,7 @@ import java.util.List;
import java.util.concurrent.TimeUnit;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import org.slf4j.Logger;
......@@ -44,6 +45,9 @@ public class ZhihuHotSearchRun extends Thread{
// List<HotSearchList> list = ZhihuHotSearchCrawler.getZhihuHotList();
List<HotSearchList> list = ZhihuHotSearchCrawler.getMobileZhihuHotList();
log.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if(list == null || list.size() == 0){
TipsUtils.sendTips("知乎热搜",new Date());
}
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
log.info("知乎话题采集结束........");
......
......@@ -4,6 +4,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.ZhihuTopicSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
......@@ -36,6 +37,9 @@ public class ZhihuTopSearchRun extends Thread {
log.info("知乎热搜采集开始...,当前线程名字:{}", Thread.currentThread().getName());
List<HotSearchList> list = ZhihuTopicSearchCrawler.getZhihuTopicSearch();
log.info("{}, 知乎热搜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if(list == null || list.size() == 0){
TipsUtils.sendTips("知乎热搜榜单",new Date());
}
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
log.info("知乎热搜话题采集结束........");
......
package com.zhiwei.searchhotcrawler.util;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.MediaType;
import okhttp3.Request;
import okhttp3.RequestBody;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Map;
/**
 * Utility for sending HTTP requests.
 */
public final class HttpClientUtils {
    private static final Logger LOGGER = LogManager.getLogger(HttpClientUtils.class);
    private static final HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(2).build();

    private HttpClientUtils() {
        // static utility, not meant to be instantiated
    }

    /**
     * Posts a JSON payload as UTF-8 without extra headers.
     *
     * @param url       target URL
     * @param jsonParam JSON text to send as the request body
     * @return the response body as text, or {@code null} when the request could not be completed
     */
    public static String sendPost(String url, String jsonParam){
        return sendPost(url, jsonParam, null, Charset.forName("UTF-8"));
    }

    /**
     * Posts a JSON payload.
     *
     * @param url       target URL; an empty or {@code null} URL is rejected
     * @param jsonParam JSON text to send as the request body
     * @param headers   extra request headers handed to the request builder; may be {@code null}
     * @param charset   charset declared in the {@code Content-Type}; when {@code null} no
     *                  charset is declared explicitly
     * @return the response body as text, or {@code null} when the URL is empty, no response
     *         body is available, or an I/O error occurs
     */
    public static String sendPost(String url, String jsonParam, Map<String, String> headers, final Charset charset) {
        if (StringUtils.isEmpty(url)) {
            LOGGER.error("URL can not be empty or null.");
            // Nothing sensible can be sent without a target.
            return null;
        }
        LOGGER.debug("Post Request:{}", url);
        MediaType jsonType = charset == null
                ? MediaType.get("application/json")
                : MediaType.get("application/json; charset=" + charset.name());
        Request request = RequestUtils.wrapPost(url, headers, RequestBody.create(jsonType, jsonParam));
        String result = null;
        try (Response response = httpBoot.syncCall(request)) {
            // NOTE(review): the client is built with throwException(false); presumably a failed
            // call can then surface as a null response instead of an exception — confirm.
            if (response != null && response.body() != null) {
                result = response.body().string();
            }
        } catch (IOException e) {
            LOGGER.error("http connection error :" + e.getMessage(), e);
        }
        return result;
    }
}
package com.zhiwei.searchhotcrawler.util;
import com.alibaba.fastjson.JSONObject;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Pushes messages to an Enterprise WeChat (WeCom) group robot through its webhook.
 *
 * @ClassName: QYWechatUtil
 * @author: 陈炜涛
 * @date: 2019-07-17 14:33:12
 *
 * @Copyright: 2019 www.zhiweidata.com
 */
public class QYWechatUtil {
    /** Webhook endpoint; the robot key is appended to it. */
    private static final String SEND_URL = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=";
    /** Markdown message type. */
    public static final String MSGTYPE_MARKDOWN = "markdown";
    /** Plain text message type. */
    public static final String MSGTYPE_TEXT = "text";
    /** Image message type; needs its own payload wrapper, which {@link #send} does not build. */
    public static final String MSGTYPE_IMAGE = "image";
    /** News (article) message type; needs its own payload wrapper, which {@link #send} does not build. */
    public static final String MSGTYPE_NEWS = "news";

    /**
     * Sends a message to the robot identified by {@code key}.
     *
     * <p>The payload has the shape {@code {"msgtype": T, T: {"content": ..., "mentioned_list": ...,
     * "mentioned_mobile_list": ...}}}; absent ({@code null}) parts are left out.
     *
     * @param key                 webhook key of the target robot
     * @param msgtype             message type; {@code null} or empty falls back to {@link #MSGTYPE_TEXT}
     * @param content             message content
     * @param mentionedList       ids of the members to mention; may be {@code null}
     * @param mentionedMobileList mobile numbers of the members to mention; may be {@code null}
     * @return whatever {@code HttpClientUtils.sendPost} returns for the request
     * @author: 陈炜涛
     * @date: 2019-07-17 14:56:40
     */
    public static String send(String key, String msgtype, String content, List<String> mentionedList,
            List<String> mentionedMobileList) {
        String type = msgtype != null && !msgtype.isEmpty() ? msgtype : MSGTYPE_TEXT;

        // The webhook expects snake_case field names for the mention lists, so the body is
        // assembled with explicit keys rather than relying on bean property names.
        Map<String, Object> body = new HashMap<>();
        if (content != null) {
            body.put("content", content);
        }
        if (mentionedList != null) {
            body.put("mentioned_list", mentionedList);
        }
        if (mentionedMobileList != null) {
            body.put("mentioned_mobile_list", mentionedMobileList);
        }

        Map<String, Object> dataMap = new HashMap<>();
        dataMap.put("msgtype", type);
        dataMap.put(type, body);
        return HttpClientUtils.sendPost(SEND_URL + key, JSONObject.toJSONString(dataMap));
    }
}
/**
 * Simple carrier for the body of a robot message: the content plus the two optional
 * lists of people to mention.
 *
 * @ClassName: Body
 * @author: 陈炜涛
 * @date: 2019-07-17 14:50:19
 *
 * @Copyright: 2019 www.zhiweidata.com
 */
class TextBody {
    /** Message content. */
    private String content;
    /** Ids of the people to notify. */
    private List<String> mentionedList;
    /** Mobile numbers of the people to notify. */
    private List<String> mentionedMobileList;

    /** Creates an empty body; every field starts out as {@code null}. */
    public TextBody() {
    }

    /**
     * Creates a fully populated body.
     *
     * @param content             message content
     * @param mentionedList       ids of the people to notify
     * @param mentionedMobileList mobile numbers of the people to notify
     */
    public TextBody(String content, List<String> mentionedList, List<String> mentionedMobileList) {
        this.content = content;
        this.mentionedList = mentionedList;
        this.mentionedMobileList = mentionedMobileList;
    }

    public String getContent() {
        return this.content;
    }

    public List<String> getMentionedList() {
        return this.mentionedList;
    }

    public List<String> getMentionedMobileList() {
        return this.mentionedMobileList;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public void setMentionedList(List<String> mentionedList) {
        this.mentionedList = mentionedList;
    }

    public void setMentionedMobileList(List<String> mentionedMobileList) {
        this.mentionedMobileList = mentionedMobileList;
    }

    /** Renders all three fields in a single bracketed line. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("TextBody [content=");
        text.append(content);
        text.append(", mentionedList=").append(mentionedList);
        text.append(", mentionedMobileList=").append(mentionedMobileList);
        return text.append("]").toString();
    }
}
package com.zhiwei.searchhotcrawler.util;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Date;
/**
 * Sends alert messages when a crawler has gone too long without storing data.
 */
public class TipsUtils {
    /** Minimum silence, in milliseconds (5 minutes), before an alert is sent. */
    private static final long timeDifference = 5 * 60 * 1000L;
    // NOTE(review): the robot webhook key is hard-coded in source; consider moving it to configuration.
    private static final String key = "a8e26ce3-8aaa-4d3e-bcf6-30b81526050b";
    private static final Logger logger = LoggerFactory.getLogger(TipsUtils.class);

    /**
     * Sends an alert when the newest stored record of {@code type} is more than five
     * minutes older than {@code time}.
     *
     * <p>When no stored record can be found for the type, the silence cannot be measured;
     * a warning is logged and no alert is sent.
     *
     * @param type record type whose latest stored timestamp is looked up
     * @param time moment of the empty collection round, compared against the stored timestamp
     */
    public static void sendTips(String type, Date time){
        HotSearchListDAO hotSearchListDAO = new HotSearchListDAO();
        // Fetch the latest stored timestamp to work out how long nothing has been collected.
        Date lastTime = hotSearchListDAO.getLastTimeByType(type);
        if (lastTime == null) {
            // The lookup returns null when nothing matches or the query fails.
            logger.warn("{}在数据库中没有历史采集记录,无法计算未采集时长,跳过预警", type);
            return;
        }
        long silentMillis = time.getTime() - lastTime.getTime();
        if (silentMillis > timeDifference) {
            // Send the alert, reporting the silence in whole minutes.
            String crawlerContent = String.format("%s已经连续%s分钟未采集到数据", type, silentMillis/1000/60);
            QYWechatUtil.send(key, QYWechatUtil.MSGTYPE_TEXT, crawlerContent,
                    null, null);
        }
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment