Commit cfb1f13a by 马黎滨

网易新闻实时热榜和跟贴热议采集

parent eb385cb2
...@@ -13,5 +13,7 @@ public enum HotSearchType { ...@@ -13,5 +13,7 @@ public enum HotSearchType {
新浪热榜, 新浪热榜,
新浪热点, 新浪热点,
搜狐话题, 搜狐话题,
凤凰新闻热榜 凤凰新闻热榜,
网易热榜,
网易跟帖热议
} }
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
* 网易新闻采集
*/
@Log4j2
public class WangYiHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).build();
/**
* 网易新闻实时热榜的采集
* @return
*/
public static List<HotSearchList> getWangYiHotSearch(){
List<HotSearchList> hotSearchLists = new ArrayList<>();
log.info("网易新闻实时热榜开始采集");
String url = "https://v6-gw.m.163.com/nc-main/api/v1/hqc/no-repeat-hot-list";
Request request = RequestUtils.wrapGet(url);
String htmlBody = null;
for(int t=0 ;t<3; t++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error("网易新闻实时热榜页面连接异常...", e);
}
if(htmlBody!=null && htmlBody.contains("data")) {
JSONObject bodyObject = JSONObject.parseObject(htmlBody).getJSONObject("data");
JSONArray jsonObject = bodyObject.getJSONArray("items");
if(jsonObject != null) {
for (int i = 0; i < jsonObject.size(); i++) {
int rank = i + 1;
String name = jsonObject.getJSONObject(i).getString("title");
int count = jsonObject.getJSONObject(i).getIntValue("hotValue");
String contentId = jsonObject.getJSONObject(i).getString("contentId");
String wangyiUrl = "https://c.m.163.com/news/a/" + contentId + ".html";
HotSearchList hotSearchList = new HotSearchList(wangyiUrl, name, count, rank, HotSearchType.网易热榜.name());
hotSearchLists.add(hotSearchList);
}
log.info("{}, 此轮网易新闻热榜采集到的数据量为:{}", new Date(), Integer.valueOf(hotSearchLists != null ? hotSearchLists.size() : 0));
log.info("网易新闻热榜采集结束");
return hotSearchLists;
}
}
ZhiWeiTools.sleep(3000L);
}
return hotSearchLists;
}
/**
* 网易新闻跟帖热议的采集
* @return
*/
public static List<HotSearchList> getWangYicomment(){
List<HotSearchList> hotSearchLists = new ArrayList<>();
log.info("网易新闻跟贴热议开始采集");
String url = "https://v6-gw.m.163.com/gentie-web/api/v2/products/a2869674571f77b5a0867c3d71db5856/rankDocs/all/list?ibc=newsapph5&limit=30";
Request request = RequestUtils.wrapGet(url);
String htmlBody = null;
for(int t=0 ;t<3; t++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error("网易新闻跟贴热议页面连接异常...", e);
}
if(htmlBody!=null && htmlBody.contains("data")) {
JSONObject bodyObject = JSONObject.parseObject(htmlBody).getJSONObject("data");
JSONArray jsonObject = bodyObject.getJSONArray("cmtDocs");
if(jsonObject != null) {
for (int i = 0; i < jsonObject.size(); i++) {
int rank = i + 1;
String name = jsonObject.getJSONObject(i).getString("doc_title");
int count = jsonObject.getJSONObject(i).getIntValue("hotScore")*10000;
String contentId = jsonObject.getJSONObject(i).getString("docId");
String wangyiUrl = "https://c.m.163.com/news/a/" + contentId + ".html";
HotSearchList hotSearchList = new HotSearchList(wangyiUrl, name, count, rank, HotSearchType.网易跟帖热议.name());
hotSearchLists.add(hotSearchList);
}
log.info("{}, 此轮网易新闻跟贴热议采集到的数据量为:{}", new Date(), Integer.valueOf(hotSearchLists != null ? hotSearchLists.size() : 0));
log.info("网易新闻跟贴热议采集结束");
return hotSearchLists;
}
}
ZhiWeiTools.sleep(3000L);
}
return hotSearchLists;
}
}
...@@ -50,11 +50,11 @@ public class HotSearchRun { ...@@ -50,11 +50,11 @@ public class HotSearchRun {
// new ZhihuHotSearchRun().start(); // new ZhihuHotSearchRun().start();
new WeiboSuperTopicRun().start(); new WeiboSuperTopicRun().start();
new WeiboTopicRun().start(); new WeiboTopicRun().start();
new ToutiaoHotSearchRun().start(); // new ToutiaoHotSearchRun().start();
new ZhihuTopSearchRun().start(); // new ZhihuTopSearchRun().start();
new ZhihuChildHotSearchRun().start(); new ZhihuChildHotSearchRun().start();
new ThreadOneRun().start(); new ThreadOneRun().start();
//抖音链接更新 // //抖音链接更新
new DouYinUrlHotSearchRun().start(); new DouYinUrlHotSearchRun().start();
} }
} }
...@@ -5,6 +5,9 @@ import java.util.Date; ...@@ -5,6 +5,9 @@ import java.util.Date;
import java.util.List; import java.util.List;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.crawler.ToutiaoHotSearchCrawler;
import com.zhiwei.searchhotcrawler.crawler.ZhihuTopicSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.util.TipsUtils; import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
...@@ -46,18 +49,22 @@ public class DouyinHotSearchRun extends Thread{ ...@@ -46,18 +49,22 @@ public class DouyinHotSearchRun extends Thread{
*/ */
private void getHotList() { private void getHotList() {
log.info("抖音热搜榜采集开始........"); log.info("抖音热搜榜采集开始........");
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
list = DouyinHotSearchCrawler.getMobileDouyinHotList(); list = DouyinHotSearchCrawler.getMobileDouyinHotList();
log.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0)); log.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
if(list == null || list.size() == 0){ TipsUtils.addHotList("抖音热搜",list);
TipsUtils.sendTips("抖音热搜",new Date());
}else {
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
TipsUtils.recoveryTips("抖音热搜",new Date());
}
log.info("抖音热搜榜采集结束........"); log.info("抖音热搜榜采集结束........");
ZhiWeiTools.sleep(3000L);
log.info("今日头条热搜采集开始........");
List<HotSearchList> toutiaoList = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone();
log.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), Integer.valueOf(toutiaoList != null ? toutiaoList.size() : 0));
TipsUtils.addHotList(HotSearchType.今日头条热搜.name(),toutiaoList);
log.info("今日头条热搜采集结束........");
ZhiWeiTools.sleep(3000L);
log.info("知乎热搜榜单采集开始...");
List<HotSearchList> zhihuList = ZhihuTopicSearchCrawler.getZhihuTopicSearch();
log.info("{}, 知乎热搜榜单此轮采集到的数据量为:{}", new Date(), Integer.valueOf(zhihuList != null ? zhihuList.size() : 0));
TipsUtils.addHotList(HotSearchType.知乎热搜榜单.name(),zhihuList);
log.info("知乎热搜榜单采集结束........");
} }
} }
package com.zhiwei.searchhotcrawler.timer; package com.zhiwei.searchhotcrawler.timer;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.FengHuangSearchCrawler; import com.zhiwei.searchhotcrawler.crawler.*;
import com.zhiwei.searchhotcrawler.crawler.SouhuTopicCrawler;
import com.zhiwei.searchhotcrawler.crawler.TengXunCrawler;
import com.zhiwei.searchhotcrawler.crawler.XinLangHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.searchhotcrawler.util.TipsUtils; import com.zhiwei.searchhotcrawler.util.TipsUtils;
...@@ -35,31 +32,18 @@ public class ThreadOneRun extends Thread { ...@@ -35,31 +32,18 @@ public class ThreadOneRun extends Thread {
private void getHotList(){ private void getHotList(){
List<HotSearchList> tengXunlist = TengXunCrawler.getTengXunHotList(); List<HotSearchList> tengXunlist = TengXunCrawler.getTengXunHotList();
addHotList("腾讯新闻",tengXunlist); TipsUtils.addHotList("腾讯新闻",tengXunlist);
ZhiWeiTools.sleep(3000L); ZhiWeiTools.sleep(3000L);
List<HotSearchList> xinLanglist = XinLangHotSearchCrawler.getXinLangHotSearch(); List<HotSearchList> xinLanglist = XinLangHotSearchCrawler.getXinLangHotSearch();
addHotList("新浪热榜",xinLanglist); TipsUtils.addHotList("新浪热榜",xinLanglist);
ZhiWeiTools.sleep(3000L); ZhiWeiTools.sleep(3000L);
List<HotSearchList> souhuList = SouhuTopicCrawler.getSouhuTopic(); List<HotSearchList> souhuList = SouhuTopicCrawler.getSouhuTopic();
addHotList("搜狐话题",souhuList); TipsUtils.addHotList("搜狐话题",souhuList);
ZhiWeiTools.sleep(3000L); ZhiWeiTools.sleep(3000L);
List<HotSearchList> xinLangHotList = XinLangHotSearchCrawler.getXinLangHotSpot(); List<HotSearchList> xinLangHotList = XinLangHotSearchCrawler.getXinLangHotSpot();
addHotList("新浪热点",xinLangHotList); TipsUtils.addHotList("新浪热点",xinLangHotList);
ZhiWeiTools.sleep(3000L); ZhiWeiTools.sleep(3000L);
List<HotSearchList> fengHuangHotList = FengHuangSearchCrawler.getFengHuangHotList(); List<HotSearchList> fengHuangHotList = FengHuangSearchCrawler.getFengHuangHotList();
addHotList("凤凰新闻热榜",fengHuangHotList); TipsUtils.addHotList("凤凰新闻热榜",fengHuangHotList);
}
private void addHotList(String type, List<HotSearchList> list){
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
if(list == null || list.size() == 0){
TipsUtils.sendTips(type,new Date());
} else {
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
TipsUtils.recoveryTips(type,new Date());
}
} }
} }
package com.zhiwei.searchhotcrawler.timer; package com.zhiwei.searchhotcrawler.timer;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.WangYiHotSearchCrawler;
import com.zhiwei.searchhotcrawler.crawler.ZhihuChildHotSearchCrawler; import com.zhiwei.searchhotcrawler.crawler.ZhihuChildHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
...@@ -53,6 +54,14 @@ public class ZhihuChildHotSearchRun extends Thread { ...@@ -53,6 +54,14 @@ public class ZhihuChildHotSearchRun extends Thread {
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(3000);
} }
} }
//网易实时热榜采集
ZhiWeiTools.sleep(3000L);
List<HotSearchList> wangyiHotSearchList = WangYiHotSearchCrawler.getWangYiHotSearch();
TipsUtils.addHotList("网易热榜",wangyiHotSearchList);
//网易跟帖热议采集
ZhiWeiTools.sleep(3000L);
List<HotSearchList> wangyiComment = WangYiHotSearchCrawler.getWangYicomment();
TipsUtils.addHotList("网易跟帖热议",wangyiComment);
} }
private String getTypeName(String type){ private String getTypeName(String type){
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment