Commit e03ea262 by 马黎滨

采集添加代理ip

parent a98a48ca
...@@ -3,6 +3,7 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -3,6 +3,7 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
...@@ -30,11 +31,11 @@ public class SouhuTopicCrawler { ...@@ -30,11 +31,11 @@ public class SouhuTopicCrawler {
String htmlBody = null; String htmlBody = null;
String url = "https://api.k.sohu.com/api/news/moment/v2/list.go?pageSize=50"; String url = "https://api.k.sohu.com/api/news/moment/v2/list.go?pageSize=50";
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
for(int t=0; t<3&&dataJson==null; t++){ for(int t=0; t<3 && dataJson==null; t++){
try(Response response = httpBoot.syncCall(request)) { try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string(); htmlBody = response.body().string();
} catch (IOException e) { } catch (IOException e) {
log.error("搜狐话题页面连接失败",e); log.error("搜狐话题页面连接失败",e.fillInStackTrace());
} }
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){ if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){
JSONObject jsonObject = JSONObject.parseObject(htmlBody).getJSONObject("data"); JSONObject jsonObject = JSONObject.parseObject(htmlBody).getJSONObject("data");
......
...@@ -41,7 +41,7 @@ public class ToutiaoHotSearchCrawler { ...@@ -41,7 +41,7 @@ public class ToutiaoHotSearchCrawler {
String jsUrl = "https://s3.pstatp.com/toutiao/feoffline/hot_list/resource/hot_list/js/index.45f50250.chunk.js"; String jsUrl = "https://s3.pstatp.com/toutiao/feoffline/hot_list/resource/hot_list/js/index.45f50250.chunk.js";
Request jsRequest = RequestUtils.wrapGet(jsUrl); Request jsRequest = RequestUtils.wrapGet(jsUrl);
String jsBody = null; String jsBody = null;
try(Response response = httpBoot.syncCall(jsRequest)) { try(Response response = httpBoot.syncCall(jsRequest,ProxyHolder.NAT_HEAVY_PROXY)) {
jsBody = response.body().string(); jsBody = response.body().string();
} catch (IOException e) { } catch (IOException e) {
log.error("获取今日头条实时热搜头部信息标识失败",e); log.error("获取今日头条实时热搜头部信息标识失败",e);
...@@ -55,7 +55,7 @@ public class ToutiaoHotSearchCrawler { ...@@ -55,7 +55,7 @@ public class ToutiaoHotSearchCrawler {
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
for(int count =0; count<=5; count++){ for(int count =0; count<=5; count++){
try(Response response = httpBoot.syncCall(request)) { try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string(); htmlBody = response.body().string();
} catch (IOException e1) { } catch (IOException e1) {
log.error("解析今日头条实时热搜时出现连接失败",e1); log.error("解析今日头条实时热搜时出现连接失败",e1);
......
...@@ -2,7 +2,9 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -2,7 +2,9 @@ package com.zhiwei.searchhotcrawler.crawler;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.*;
import java.util.stream.Collectors;
import com.alibaba.fastjson.JSON;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response; import okhttp3.Response;
...@@ -111,10 +113,10 @@ public class WeiboHotSearchCrawler { ...@@ -111,10 +113,10 @@ public class WeiboHotSearchCrawler {
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url, headerMap); Request request = RequestUtils.wrapGet(url, headerMap);
for(int count =0; count<=5; count++){ for(int count =0; count<=5; count++){
try(Response response = httpBoot.syncCall(request)) { try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string(); htmlBody = response.body().string();
} catch (IOException e1) { } catch (IOException e) {
log.error("解析微博时时热搜时出现连接失败",e1); log.error("解析微博时热搜时出现连接失败",e);
} }
List<HotSearchList> result = new ArrayList<HotSearchList>(); List<HotSearchList> result = new ArrayList<HotSearchList>();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) { if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
...@@ -122,17 +124,21 @@ public class WeiboHotSearchCrawler { ...@@ -122,17 +124,21 @@ public class WeiboHotSearchCrawler {
JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data"); JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data");
JSONArray cards = json.getJSONArray("cards"); JSONArray cards = json.getJSONArray("cards");
int rank = 0; int rank = 0;
for (int i = 0; i < cards.size(); i++) { // for (int i = 0; i < cards.size(); i++) {
try { try {
JSONObject card = cards.getJSONObject(i); JSONObject card = cards.getJSONObject(0);
JSONArray cardGroup = card.getJSONArray("card_group"); JSONArray cardGroup = card.getJSONArray("card_group");
JSONObject topCard =cardGroup.getJSONObject(0);
if(!topCard.containsKey("pic")){
rank = 1;
}
if (Objects.nonNull(cardGroup) && !cardGroup.isEmpty()) { if (Objects.nonNull(cardGroup) && !cardGroup.isEmpty()) {
String title = card.getString("title"); // String title = card.getString("title");
boolean hot = true; boolean hot = true;
if (Objects.nonNull(title) && title.contains("实时上升热点")) { // if (Objects.nonNull(title) && title.contains("实时上升热点")) {
hot = false; // hot = false;
rank = 51; // rank = 51;
} // }
for (int j = 0; j < cardGroup.size(); j++) { for (int j = 0; j < cardGroup.size(); j++) {
JSONObject cardInfo = cardGroup.getJSONObject(j); JSONObject cardInfo = cardGroup.getJSONObject(j);
String name = cardInfo.getString("desc"); String name = cardInfo.getString("desc");
...@@ -150,19 +156,62 @@ public class WeiboHotSearchCrawler { ...@@ -150,19 +156,62 @@ public class WeiboHotSearchCrawler {
log.info("card 数据结构为:{}", card); log.info("card 数据结构为:{}", card);
} }
} catch (Exception e) { } catch (Exception e) {
log.error("解析微博时热搜时出现解析错误", e); log.error("解析微博时热搜时出现解析错误", e);
continue; continue;
} }
} // }
return result; return result;
} catch (Exception e) { } catch (Exception e) {
log.error("解析微博时热搜时出现解析错误,数据不是json结构", e); log.error("解析微博时热搜时出现解析错误,数据不是json结构", e);
} }
} else { } else {
log.info("解析微博时热搜时出现解析错误,页面结构有问题"); log.info("解析微博时热搜时出现解析错误,页面结构有问题");
} }
} }
return Collections.emptyList(); return Collections.emptyList();
} }
/**
 * Collects the Weibo "pre-heat" board, i.e. the card titled "实时上升热点"
 * (real-time rising topics).
 *
 * <p>A single request is sent through {@code ProxyHolder.NAT_HEAVY_PROXY}. The second
 * entry of the top-level {@code cards} array is inspected; when its title is
 * "实时上升热点", every item of its {@code card_group} is converted into a
 * {@link HotSearchList}. The entries are then ordered by heat count (descending) and
 * ranked starting at 1.
 *
 * <p>Connection failures, bodies that are not valid JSON, an unexpected response layout
 * and individual malformed entries are logged instead of being propagated, so one bad
 * entry does not discard the rest of the board.
 *
 * @param date collection time handed to every created entry
 * @return the ranked entries, or an empty list when nothing could be collected;
 *         never {@code null}
 */
public static List<HotSearchList> weiboPreheatSearch(Date date){
    // NOTE(review): the guest gsid token is hard-coded in the URL; presumably it can expire — confirm.
    String url = "https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot";
    String htmlBody = null;
    Request request = RequestUtils.wrapGet(url);
    try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
        htmlBody = response.body().string();
    } catch (IOException e) {
        log.error("解析微博热搜时出现连接失败",e);
    }

    List<HotSearchList> result = new ArrayList<>();
    if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("cards")) {
        return result;
    }

    // Locate the card_group of the "rising topics" card. The substring check above does not
    // guarantee valid JSON or a top-level "cards" array, so every step is guarded.
    JSONArray cardGroup = null;
    try {
        JSONObject root = JSON.parseObject(htmlBody);
        JSONArray cardArray = Objects.isNull(root) ? null : root.getJSONArray("cards");
        if (Objects.nonNull(cardArray) && cardArray.size() > 1) {
            JSONObject risingCard = cardArray.getJSONObject(1);
            if (Objects.nonNull(risingCard) && "实时上升热点".equals(risingCard.getString("title"))) {
                cardGroup = risingCard.getJSONArray("card_group");
            }
        }
    } catch (Exception e) {
        log.error("解析微博预热榜时出现解析错误,数据不是json结构", e);
    }
    if (Objects.isNull(cardGroup)) {
        return result;
    }

    for (int i = 0; i < cardGroup.size(); i++) {
        try {
            JSONObject cardInfo = cardGroup.getJSONObject(i);
            String name = Objects.isNull(cardInfo) ? null : cardInfo.getString("desc");
            if (StringUtils.isBlank(name)) {
                // An entry without a title cannot be turned into a search URL; skip it.
                continue;
            }
            int hotCount = cardInfo.getIntValue("desc_extr");
            String weiboUrl = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
            result.add(new HotSearchList(weiboUrl,name,hotCount,null,HotSearchType.微博预热榜.name(),date));
        } catch (Exception e) {
            // A single malformed entry (e.g. non-numeric desc_extr) must not abort the whole board.
            log.error("解析微博预热榜条目时出现解析错误", e);
        }
    }

    // Order by heat (highest first), then assign ranks 1..n in that order.
    result.sort(Comparator.comparing(HotSearchList::getCount).reversed());
    int rank = 1;
    for (HotSearchList hotSearchList : result) {
        hotSearchList.setRank(rank);
        rank++;
    }
    return result;
}
} }
...@@ -139,7 +139,6 @@ public class WeiboTopicCrawler { ...@@ -139,7 +139,6 @@ public class WeiboTopicCrawler {
//重试三次 //重试三次
for(int retryTimes = 1; retryTimes<=5; retryTimes++) { for(int retryTimes = 1; retryTimes<=5; retryTimes++) {
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
// log.info("pageUrl::{}", pageUrl);
htmlBody = response.body().string(); htmlBody = response.body().string();
} catch (Exception e) { } catch (Exception e) {
log.error("下载榜单列表页面时出现错误,错误为:{}", e); log.error("下载榜单列表页面时出现错误,错误为:{}", e);
...@@ -159,8 +158,9 @@ public class WeiboTopicCrawler { ...@@ -159,8 +158,9 @@ public class WeiboTopicCrawler {
private static List<HotSearchList> parseTopicHtml(String htmlBody,Date date) { private static List<HotSearchList> parseTopicHtml(String htmlBody,Date date) {
try { try {
JSONArray cards = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("cards"); JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("cards");
if(Objects.nonNull(cards) && !cards.isEmpty()) { if(Objects.nonNull(jsonArray) && !jsonArray.isEmpty()) {
JSONArray cards = jsonArray.getJSONObject(0).getJSONArray("card_group");
List<HotSearchList> topicList = new ArrayList<>(); List<HotSearchList> topicList = new ArrayList<>();
Integer rank = null; Integer rank = null;
String topicName = null; String topicName = null;
...@@ -169,9 +169,8 @@ public class WeiboTopicCrawler { ...@@ -169,9 +169,8 @@ public class WeiboTopicCrawler {
Integer commentNum = null; Integer commentNum = null;
Integer readNum = null; Integer readNum = null;
String desc2 = null; String desc2 = null;
for(int i=0; i<cards.size(); i++) { for(int i=0; i<cards.size(); i++) {
JSONObject cardGroup = cards.getJSONObject(i).getJSONArray("card_group").getJSONObject(0); JSONObject cardGroup = cards.getJSONObject(i);
rank = cardGroup.getInteger("top_mark_text"); rank = cardGroup.getInteger("top_mark_text");
topicName = cardGroup.getString("title_sub"); topicName = cardGroup.getString("title_sub");
url = "https://s.weibo.com/weibo?q="+ URLCodeUtil.getURLEncode(topicName, "utf-8"); url = "https://s.weibo.com/weibo?q="+ URLCodeUtil.getURLEncode(topicName, "utf-8");
......
...@@ -33,8 +33,8 @@ public class TipsUtils { ...@@ -33,8 +33,8 @@ public class TipsUtils {
if (!typeTips.containsKey(type)) { if (!typeTips.containsKey(type)) {
//发送预警 //发送预警
String crawlerContent = String.format("%s数据采集异常", type); String crawlerContent = String.format("%s数据采集异常", type);
// QYWechatUtil.send(key, QYWechatUtil.MSGTYPE_TEXT, crawlerContent, QYWechatUtil.send(key, QYWechatUtil.MSGTYPE_TEXT, crawlerContent,
// null, null); null, null);
} }
typeTips.put(type, time); typeTips.put(type, time);
} }
...@@ -52,8 +52,8 @@ public class TipsUtils { ...@@ -52,8 +52,8 @@ public class TipsUtils {
typeTips.remove(type); typeTips.remove(type);
//发送恢复通知 //发送恢复通知
String crawlerContent = String.format("%s数据采集恢复正常", type); String crawlerContent = String.format("%s数据采集恢复正常", type);
// QYWechatUtil.send(key, QYWechatUtil.MSGTYPE_TEXT, crawlerContent, QYWechatUtil.send(key, QYWechatUtil.MSGTYPE_TEXT, crawlerContent,
// null, null); null, null);
} }
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment