Commit cb69d3bb by 马黎滨

Merge branch 'mlbWork' into 'master'

新浪,搜狐,凤凰采集

See merge request !16
parents fabaafbb c6957bee
...@@ -9,5 +9,9 @@ public enum HotSearchType { ...@@ -9,5 +9,9 @@ public enum HotSearchType {
微博话题, 微博话题,
今日头条热搜, 今日头条热搜,
知乎热搜榜单, 知乎热搜榜单,
腾讯新闻 腾讯新闻,
新浪热榜,
新浪热点,
搜狐话题,
凤凰新闻热榜
} }
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
 * Crawler for the ifeng (Phoenix News) hot list exposed at
 * {@code https://nine.ifeng.com/hotspotlist}.
 */
@Log4j2
public class FengHuangSearchCrawler {

    /** Number of list pages requested on every crawl. */
    private static final int PAGE_COUNT = 2;

    private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).build();

    /**
     * Fetches every page of the ifeng hot list and converts the entries into
     * {@link HotSearchList} records.
     *
     * <p>A page that cannot be downloaded or parsed is logged and skipped; the
     * remaining pages are still processed. Ranks continue across pages instead
     * of restarting at 1 on each page.
     *
     * @return the collected entries; never {@code null}, empty when nothing could be read
     */
    public static List<HotSearchList> getFengHuangHotList() {
        log.info("凤凰新闻热榜开始采集");
        List<HotSearchList> list = new ArrayList<>();
        for (int page = 1; page <= PAGE_COUNT; page++) {
            String url = "https://nine.ifeng.com/hotspotlist?gv=7.9.1&page=" + page;
            String htmlBody = fetchBody(url);
            if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("data")) {
                continue;
            }
            try {
                // Entries already collected from earlier pages shift the rank of this page.
                list.addAll(parsePage(htmlBody, list.size()));
            } catch (RuntimeException e) {
                // Malformed JSON or an unexpected field type must not abort the whole crawl.
                log.error("凤凰新闻热榜页面解析异常...", e);
            }
        }
        log.info("{}, 此轮凤凰新闻热榜采集到的数据量为:{}", new Date(), Integer.valueOf(list.size()));
        log.info("凤凰新闻热榜采集结束");
        return list;
    }

    /**
     * Downloads one page through the heavy NAT proxy.
     *
     * @param url page address
     * @return the response body, or {@code null} when the call failed or produced no body
     */
    private static String fetchBody(String url) {
        Request request = RequestUtils.wrapGet(url);
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            // NOTE(review): the client is built with throwException(false); presumably a failed
            // call can then hand back a null response - confirm against HttpBoot.
            if (response == null || response.body() == null) {
                return null;
            }
            return response.body().string();
        } catch (IOException e) {
            log.error("凤凰新闻热榜页面连接异常...", e);
            return null;
        }
    }

    /**
     * Converts the {@code data.list} array of one page into records.
     *
     * @param htmlBody   raw JSON text of the page
     * @param rankOffset number of entries collected from previous pages
     * @return the entries of this page; empty when the expected structure is missing
     */
    private static List<HotSearchList> parsePage(String htmlBody, int rankOffset) {
        List<HotSearchList> pageEntries = new ArrayList<>();
        JSONObject root = JSONObject.parseObject(htmlBody);
        JSONObject data = root == null ? null : root.getJSONObject("data");
        JSONArray jsonArray = data == null ? null : data.getJSONArray("list");
        if (jsonArray == null) {
            return pageEntries;
        }
        for (int i = 0; i < jsonArray.size(); i++) {
            JSONObject item = jsonArray.getJSONObject(i);
            JSONObject hotLabel = item == null ? null : item.getJSONObject("hotLabel");
            if (hotLabel == null) {
                // Name and heat value both live in hotLabel; without it there is nothing to record.
                continue;
            }
            JSONObject link = item.getJSONObject("link");
            Integer rank = rankOffset + i + 1;
            String name = hotLabel.getString("desp");
            String topicLead = item.getString("title");
            String fenghuangUrl = link == null ? null : link.getString("weburl");
            String hotValue = hotLabel.getString("hotGrade");
            Integer count = TipsUtils.getHotCount(hotValue);
            Integer commentCount = item.getIntValue("commentsall");
            pageEntries.add(new HotSearchList(fenghuangUrl, name, count,
                    rank, HotSearchType.凤凰新闻热榜.name(), commentCount, topicLead));
        }
        return pageEntries;
    }
}
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
@Log4j2
public class SouhuTopicCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
public static List<HotSearchList> getSouhuTopic(){
List<HotSearchList> hotSearchLists = new ArrayList<>();
log.info("搜狐话题榜开始采集...");
JSONArray dataJson = null;
String htmlBody = null;
String url = "https://api.k.sohu.com/api/news/moment/v2/list.go?pageSize=50";
Request request = RequestUtils.wrapGet(url);
for(int t=0; t<3&&dataJson==null; t++){
try(Response response = httpBoot.syncCall(request)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error("搜狐话题页面连接失败",e);
}
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){
JSONObject jsonObject = JSONObject.parseObject(htmlBody).getJSONObject("data");
dataJson = jsonObject.getJSONObject("topicList").getJSONArray("datas");
if(dataJson != null) {
for (int i = 0; i < dataJson.size();i++){
Integer rank = i+1;
String name = dataJson.getJSONObject(i).getJSONObject("eventNewsInfo").getString("title");
String hotValue = dataJson.getJSONObject(i).getString("value");
Integer count = TipsUtils.getHotCount(hotValue.substring(0,hotValue.indexOf("观点")));
String souguUrl = dataJson.getJSONObject(i).getJSONObject("eventNewsInfo").getString("h5Link");
String icon = dataJson.getJSONObject(i).getJSONObject("attrInfo").getString("displayText");
HotSearchList hotSearchList = new HotSearchList(souguUrl,name,count,true,rank, HotSearchType.搜狐话题.name(),icon);
hotSearchLists.add(hotSearchList);
}
log.info("{}, 此轮搜狐话题榜采集到的数据量为:{}", new Date(), Integer.valueOf(hotSearchLists != null ? hotSearchLists.size() : 0));
log.info("搜狐话题榜采集结束");
return hotSearchLists;
}
}
ZhiWeiTools.sleep(3000L);
}
return hotSearchLists;
}
}
...@@ -43,6 +43,7 @@ public class TengXunCrawler { ...@@ -43,6 +43,7 @@ public class TengXunCrawler {
if (htmlBody != null && htmlBody.contains("idlist")) { if (htmlBody != null && htmlBody.contains("idlist")) {
JSONObject topSearch = JSONObject.parseObject(htmlBody); JSONObject topSearch = JSONObject.parseObject(htmlBody);
dataJson = topSearch.getJSONArray("idlist").getJSONObject(0).getJSONArray("newslist"); dataJson = topSearch.getJSONArray("idlist").getJSONObject(0).getJSONArray("newslist");
if(dataJson != null) {
for (int i = 1; i < dataJson.size(); i++) { for (int i = 1; i < dataJson.size(); i++) {
Integer rank = i; Integer rank = i;
String name = dataJson.getJSONObject(i).getString("title"); String name = dataJson.getJSONObject(i).getString("title");
...@@ -71,6 +72,7 @@ public class TengXunCrawler { ...@@ -71,6 +72,7 @@ public class TengXunCrawler {
list.add(hotSearchList); list.add(hotSearchList);
} }
} }
}
ZhiWeiTools.sleep(3000L); ZhiWeiTools.sleep(3000L);
} }
log.info("{}, 此轮腾讯新闻热榜采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0)); log.info("{}, 此轮腾讯新闻热榜采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
......
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.*;
@Log4j2
public class XinLangHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* 新浪热榜的采集
* @return
*/
public static List<HotSearchList> getXinLangHotSearch(){
List<HotSearchList> hotSearchLists = new ArrayList<>();
log.info("新浪热榜开始采集");
String url = "https://sinanews.sina.cn/h5/top_news_list.d.html";
Request request = RequestUtils.wrapGet(url);
String htmlBody = null;
JSONObject jsonObject = null;
for(int t=0 ;t<3&&jsonObject==null; t++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error("新浪热榜页面连接异常...", e);
}
if(htmlBody!=null) {
Document document = Jsoup.parse(htmlBody);
String html = document.getElementsByTag("script").last().html();
jsonObject = JSONObject.parseObject(html.substring(html.indexOf("{"), html.length() - 1));
// log.info(jsonObject);
JSONArray jsonArray = jsonObject.getJSONObject("data").getJSONObject("data").getJSONArray("result");
if (jsonArray != null) {
for (int i = 0; i < jsonArray.size(); i++) {
String name = jsonArray.getJSONObject(i).getString("text");
Integer rank = i + 1;
String hotValue = jsonArray.getJSONObject(i).getString("hotValue");
Integer count = TipsUtils.getHotCount(hotValue);
String showTags = jsonArray.getJSONObject(i).getString("showTags");
String icon = null;
if (showTags.contains("新")) {
icon = "新";
} else if (showTags.contains("热")) {
icon = "热";
} else if (showTags.contains("沸")) {
icon = "沸";
}
HotSearchList hotSearchList = new HotSearchList(null, name, count, true, rank, HotSearchType.新浪热榜.name(), icon);
hotSearchLists.add(hotSearchList);
}
log.info("{}, 此轮新浪热榜采集到的数据量为:{}", new Date(), Integer.valueOf(hotSearchLists != null ? hotSearchLists.size() : 0));
log.info("新浪热榜采集结束");
return hotSearchLists;
}
}
ZhiWeiTools.sleep(3000L);
}
return hotSearchLists;
}
/**
* 新浪热点的采集
* @return
*/
public static List<HotSearchList> getXinLangHotSpot(){
List<HotSearchList> hotSearchLists = new ArrayList<>();
log.info("新浪热点开始采集");
String url = "http://interface.sina.cn/wap_api/hot_rank_data.d.json";
Request request = RequestUtils.wrapGet(url);
String htmlBody = null;
JSONArray dataJson = null;
for(int t=0 ;t<3&&dataJson==null; t++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error("新浪热点页面连接异常...", e);
}
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) {
JSONObject jsonObject = JSONObject.parseObject(htmlBody).getJSONObject("data");
dataJson = jsonObject.getJSONArray("lists");
if (dataJson != null) {
for (int i = 0; i < dataJson.size(); i++) {
Integer rank = i + 1;
String name = dataJson.getJSONObject(i).getString("title");
String xinlangUrl = dataJson.getJSONObject(i).getString("wapurl");
Integer hot = dataJson.getJSONObject(i).getIntValue("hot_value");
HotSearchList hotSearchList = new HotSearchList(xinlangUrl, name, hot, rank, HotSearchType.新浪热点.name());
hotSearchLists.add(hotSearchList);
}
log.info("{}, 此轮新浪热点采集到的数据量为:{}", new Date(), Integer.valueOf(hotSearchLists != null ? hotSearchLists.size() : 0));
log.info("新浪热点采集结束");
return hotSearchLists;
}
}
ZhiWeiTools.sleep(3000L);
}
return hotSearchLists;
}
}
...@@ -9,6 +9,7 @@ import com.zhiwei.crawler.proxy.ProxyHolder; ...@@ -9,6 +9,7 @@ import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
...@@ -54,7 +55,7 @@ public class ZhihuChildHotSearchCrawler { ...@@ -54,7 +55,7 @@ public class ZhihuChildHotSearchCrawler {
Integer rank = i + 1; Integer rank = i + 1;
String name = jsonObject.getJSONObject("title_area").getString("text"); String name = jsonObject.getJSONObject("title_area").getString("text");
String hotCountString = jsonObject.getJSONObject("metrics_area").getString("text"); String hotCountString = jsonObject.getJSONObject("metrics_area").getString("text");
Integer count = getHotCount(hotCountString); Integer count = TipsUtils.getHotCount(hotCountString.substring(0, hotCountString.indexOf("领域热度")));
String childUrl = jsonObject.getJSONObject("link").getString("url"); String childUrl = jsonObject.getJSONObject("link").getString("url");
HotSearchList hotSearchList = new HotSearchList(childUrl, name, count, rank, HotSearchType.知乎热搜.name() + typeName + "分类"); HotSearchList hotSearchList = new HotSearchList(childUrl, name, count, rank, HotSearchType.知乎热搜.name() + typeName + "分类");
list.add(hotSearchList); list.add(hotSearchList);
...@@ -64,23 +65,4 @@ public class ZhihuChildHotSearchCrawler { ...@@ -64,23 +65,4 @@ public class ZhihuChildHotSearchCrawler {
} }
return list; return list;
} }
/**
 * Extracts the numeric heat value from a label such as {@code "1.5万领域热度"}.
 *
 * <p>A number followed by 万 is multiplied by 10,000 and a number followed by 亿 by
 * 100,000,000; the fractional part of the product is truncated. Otherwise the text
 * before the "领域热度" marker (or the whole text when the marker is absent) is parsed
 * as a plain integer.
 *
 * @param hotCountString raw label text, may be {@code null}
 * @return the heat value; 0 when the input is {@code null} or holds no digits
 * @throws NumberFormatException if the numeric part is not a valid number
 */
private static Integer getHotCount(String hotCountString) {
    if (hotCountString == null) {
        return 0;
    }
    String text = hotCountString.trim();
    if (text.contains("万")) {
        String number = text.replaceAll("万.*", "").trim();
        return (int) (Double.parseDouble(number) * 10_000);
    }
    if (text.contains("亿")) {
        String number = text.replaceAll("亿.*", "").trim();
        // 1 亿 = 10^8
        return (int) (Double.parseDouble(number) * 100_000_000);
    }
    int markerIndex = text.indexOf("领域热度");
    String digits = (markerIndex >= 0 ? text.substring(0, markerIndex) : text).trim();
    if (digits.isEmpty()) {
        return 0;
    }
    // Integer.getInteger looks up a system property; the text itself has to be parsed.
    return Integer.valueOf(digits);
}
} }
package com.zhiwei.searchhotcrawler.timer; package com.zhiwei.searchhotcrawler.timer;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.FengHuangSearchCrawler;
import com.zhiwei.searchhotcrawler.crawler.SouhuTopicCrawler;
import com.zhiwei.searchhotcrawler.crawler.TengXunCrawler; import com.zhiwei.searchhotcrawler.crawler.TengXunCrawler;
import com.zhiwei.searchhotcrawler.crawler.XinLangHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.searchhotcrawler.util.TipsUtils; import com.zhiwei.searchhotcrawler.util.TipsUtils;
...@@ -15,13 +18,15 @@ import java.util.concurrent.TimeUnit; ...@@ -15,13 +18,15 @@ import java.util.concurrent.TimeUnit;
@Log4j2 @Log4j2
public class ThreadOneRun extends Thread { public class ThreadOneRun extends Thread {
private HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
private HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
@Override @Override
public void run() { public void run() {
boolean f = true; boolean f = true;
while(f) { while(f) {
try { try {
getHotList(); getHotList();
TimeUnit.MINUTES.sleep(1); TimeUnit.MINUTES.sleep(3);
} catch (Exception e) { } catch (Exception e) {
e.fillInStackTrace(); e.fillInStackTrace();
ZhiWeiTools.sleep(60*1000); ZhiWeiTools.sleep(60*1000);
...@@ -31,15 +36,30 @@ public class ThreadOneRun extends Thread { ...@@ -31,15 +36,30 @@ public class ThreadOneRun extends Thread {
} }
private void getHotList(){ private void getHotList(){
HotSearchListDAO hotSearchDAO = new HotSearchListDAO(); List<HotSearchList> tengXunlist = TengXunCrawler.getTengXunHotList();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO(); addHotList("腾讯新闻",tengXunlist);
List<HotSearchList> list = TengXunCrawler.getTengXunHotList(); ZhiWeiTools.sleep(3000L);
List<HotSearchList> xinLanglist = XinLangHotSearchCrawler.getXinLangHotSearch();
addHotList("新浪热榜",xinLanglist);
ZhiWeiTools.sleep(3000L);
List<HotSearchList> souhuList = SouhuTopicCrawler.getSouhuTopic();
addHotList("搜狐话题",souhuList);
ZhiWeiTools.sleep(3000L);
List<HotSearchList> xinLangHotList = XinLangHotSearchCrawler.getXinLangHotSpot();
addHotList("新浪热点",xinLangHotList);
ZhiWeiTools.sleep(3000L);
List<HotSearchList> fengHuangHotList = FengHuangSearchCrawler.getFengHuangHotList();
addHotList("凤凰新闻热榜",fengHuangHotList);
}
private void addHotList(String type, List<HotSearchList> list){
if(list == null || list.size() == 0){ if(list == null || list.size() == 0){
TipsUtils.sendTips("腾讯新闻",new Date()); TipsUtils.sendTips(type,new Date());
} else { } else {
List<Document> data = hotSearchCacheDAO.addData(list); List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data); hotSearchDAO.addHotSearchList(data);
TipsUtils.sendTips("腾讯新闻",new Date()); TipsUtils.recoveryTips(type,new Date());
} }
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment