Commit e03ea262 by 马黎滨

采集添加代理ip

parent a98a48ca
......@@ -3,6 +3,7 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
......@@ -30,11 +31,11 @@ public class SouhuTopicCrawler {
String htmlBody = null;
String url = "https://api.k.sohu.com/api/news/moment/v2/list.go?pageSize=50";
Request request = RequestUtils.wrapGet(url);
for(int t=0; t<3&&dataJson==null; t++){
try(Response response = httpBoot.syncCall(request)) {
for(int t=0; t<3 && dataJson==null; t++){
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error("搜狐话题页面连接失败",e);
log.error("搜狐话题页面连接失败",e.fillInStackTrace());
}
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){
JSONObject jsonObject = JSONObject.parseObject(htmlBody).getJSONObject("data");
......
......@@ -41,7 +41,7 @@ public class ToutiaoHotSearchCrawler {
String jsUrl = "https://s3.pstatp.com/toutiao/feoffline/hot_list/resource/hot_list/js/index.45f50250.chunk.js";
Request jsRequest = RequestUtils.wrapGet(jsUrl);
String jsBody = null;
try(Response response = httpBoot.syncCall(jsRequest)) {
try(Response response = httpBoot.syncCall(jsRequest,ProxyHolder.NAT_HEAVY_PROXY)) {
jsBody = response.body().string();
} catch (IOException e) {
log.error("获取今日头条实时热搜头部信息标识失败",e);
......@@ -55,7 +55,7 @@ public class ToutiaoHotSearchCrawler {
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
for(int count =0; count<=5; count++){
try(Response response = httpBoot.syncCall(request)) {
try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e1) {
log.error("解析今日头条实时热搜时出现连接失败",e1);
......
......@@ -2,7 +2,9 @@ package com.zhiwei.searchhotcrawler.crawler;
import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;
import com.alibaba.fastjson.JSON;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
......@@ -111,10 +113,10 @@ public class WeiboHotSearchCrawler {
String htmlBody = null;
Request request = RequestUtils.wrapGet(url, headerMap);
for(int count =0; count<=5; count++){
try(Response response = httpBoot.syncCall(request)) {
try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e1) {
log.error("解析微博时时热搜时出现连接失败",e1);
} catch (IOException e) {
log.error("解析微博时热搜时出现连接失败",e);
}
List<HotSearchList> result = new ArrayList<HotSearchList>();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
......@@ -122,17 +124,21 @@ public class WeiboHotSearchCrawler {
JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data");
JSONArray cards = json.getJSONArray("cards");
int rank = 0;
for (int i = 0; i < cards.size(); i++) {
// for (int i = 0; i < cards.size(); i++) {
try {
JSONObject card = cards.getJSONObject(i);
JSONObject card = cards.getJSONObject(0);
JSONArray cardGroup = card.getJSONArray("card_group");
JSONObject topCard =cardGroup.getJSONObject(0);
if(!topCard.containsKey("pic")){
rank = 1;
}
if (Objects.nonNull(cardGroup) && !cardGroup.isEmpty()) {
String title = card.getString("title");
// String title = card.getString("title");
boolean hot = true;
if (Objects.nonNull(title) && title.contains("实时上升热点")) {
hot = false;
rank = 51;
}
// if (Objects.nonNull(title) && title.contains("实时上升热点")) {
// hot = false;
// rank = 51;
// }
for (int j = 0; j < cardGroup.size(); j++) {
JSONObject cardInfo = cardGroup.getJSONObject(j);
String name = cardInfo.getString("desc");
......@@ -150,19 +156,62 @@ public class WeiboHotSearchCrawler {
log.info("card 数据结构为:{}", card);
}
} catch (Exception e) {
log.error("解析微博时热搜时出现解析错误", e);
log.error("解析微博时热搜时出现解析错误", e);
continue;
}
}
// }
return result;
} catch (Exception e) {
log.error("解析微博时热搜时出现解析错误,数据不是json结构", e);
log.error("解析微博时热搜时出现解析错误,数据不是json结构", e);
}
} else {
log.info("解析微博时热搜时出现解析错误,页面结构有问题");
log.info("解析微博时热搜时出现解析错误,页面结构有问题");
}
}
return Collections.emptyList();
}
/**
 * Collects the Weibo "pre-heat" list, i.e. the card titled "实时上升热点"
 * (real-time rising topics).
 *
 * <p>Performs a single GET of the guest page API through
 * {@code ProxyHolder.NAT_HEAVY_PROXY}, then reads the second element of the top-level
 * {@code cards} array. When that card is titled "实时上升热点" and carries a
 * {@code card_group}, each entry becomes one {@link HotSearchList}. Entries are sorted by
 * descending count and ranked starting at 1.
 *
 * @param date timestamp stored on every produced entry
 * @return the ranked entries; an empty mutable list when the request fails, the body does
 *         not have the expected JSON structure, or the rising-topics card is absent
 */
public static List<HotSearchList> weiboPreheatSearch(Date date){
    String url = "https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot";
    String htmlBody = null;
    Request request = RequestUtils.wrapGet(url);
    try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
        htmlBody = response.body().string();
    } catch (IOException e) {
        log.error("解析微博热搜时出现连接失败", e);
    }
    // Cheap pre-filter only: a substring match does not prove "cards" is a top-level array,
    // so the parser below still validates the structure.
    if (!StringUtils.isNotBlank(htmlBody) || !htmlBody.contains("cards")) {
        return new ArrayList<>();
    }
    try {
        return parsePreheatCards(htmlBody, date);
    } catch (RuntimeException e) {
        // The JSON library reports malformed bodies / wrong value types with unchecked
        // exceptions; log and degrade to "no data" like the other crawlers in this class
        // instead of letting one bad response abort the caller.
        log.error("解析微博预热榜时出现解析错误,数据不是预期的json结构", e);
        return new ArrayList<>();
    }
}

/**
 * Extracts and ranks the rising-topics entries from the raw response body.
 *
 * @param htmlBody non-blank response body
 * @param date timestamp stored on every produced entry
 * @return ranked entries, or an empty list when the expected card is not present
 */
private static List<HotSearchList> parsePreheatCards(String htmlBody, Date date) {
    List<HotSearchList> result = new ArrayList<>();
    JSONObject root = JSON.parseObject(htmlBody);
    JSONArray cardArray = root == null ? null : root.getJSONArray("cards");
    // The rising-topics card is read from index 1, so at least two cards are required.
    if (cardArray == null || cardArray.size() <= 1) {
        return result;
    }
    JSONObject risingCard = cardArray.getJSONObject(1);
    if (risingCard == null || !"实时上升热点".equals(risingCard.getString("title"))) {
        return result;
    }
    JSONArray cardGroup = risingCard.getJSONArray("card_group");
    if (cardGroup == null) {
        return result;
    }
    for (int i = 0; i < cardGroup.size(); i++) {
        JSONObject cardInfo = cardGroup.getJSONObject(i);
        if (cardInfo == null) {
            continue;
        }
        String name = cardInfo.getString("desc");
        // An entry without a topic name cannot be URL-encoded or shown; skip it.
        if (!StringUtils.isNotBlank(name)) {
            continue;
        }
        int hotCount = cardInfo.getIntValue("desc_extr");
        String weiboUrl = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
        result.add(new HotSearchList(weiboUrl, name, hotCount, null, HotSearchType.微博预热榜.name(), date));
    }
    // Order by count, highest first, then assign ranks 1..n in that order.
    result.sort(Comparator.comparing(HotSearchList::getCount).reversed());
    int rank = 1;
    for (HotSearchList hotSearchList : result) {
        hotSearchList.setRank(rank);
        rank++;
    }
    return result;
}
}
......@@ -139,7 +139,6 @@ public class WeiboTopicCrawler {
//Retry: up to five attempts
for(int retryTimes = 1; retryTimes<=5; retryTimes++) {
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
// log.info("pageUrl::{}", pageUrl);
htmlBody = response.body().string();
} catch (Exception e) {
log.error("下载榜单列表页面时出现错误,错误为:{}", e);
......@@ -159,8 +158,9 @@ public class WeiboTopicCrawler {
private static List<HotSearchList> parseTopicHtml(String htmlBody,Date date) {
try {
JSONArray cards = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("cards");
if(Objects.nonNull(cards) && !cards.isEmpty()) {
JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("cards");
if(Objects.nonNull(jsonArray) && !jsonArray.isEmpty()) {
JSONArray cards = jsonArray.getJSONObject(0).getJSONArray("card_group");
List<HotSearchList> topicList = new ArrayList<>();
Integer rank = null;
String topicName = null;
......@@ -169,9 +169,8 @@ public class WeiboTopicCrawler {
Integer commentNum = null;
Integer readNum = null;
String desc2 = null;
for(int i=0; i<cards.size(); i++) {
JSONObject cardGroup = cards.getJSONObject(i).getJSONArray("card_group").getJSONObject(0);
JSONObject cardGroup = cards.getJSONObject(i);
rank = cardGroup.getInteger("top_mark_text");
topicName = cardGroup.getString("title_sub");
url = "https://s.weibo.com/weibo?q="+ URLCodeUtil.getURLEncode(topicName, "utf-8");
......
......@@ -33,8 +33,8 @@ public class TipsUtils {
if (!typeTips.containsKey(type)) {
//发送预警
String crawlerContent = String.format("%s数据采集异常", type);
// QYWechatUtil.send(key, QYWechatUtil.MSGTYPE_TEXT, crawlerContent,
// null, null);
QYWechatUtil.send(key, QYWechatUtil.MSGTYPE_TEXT, crawlerContent,
null, null);
}
typeTips.put(type, time);
}
......@@ -52,8 +52,8 @@ public class TipsUtils {
typeTips.remove(type);
//发送恢复通知
String crawlerContent = String.format("%s数据采集恢复正常", type);
// QYWechatUtil.send(key, QYWechatUtil.MSGTYPE_TEXT, crawlerContent,
// null, null);
QYWechatUtil.send(key, QYWechatUtil.MSGTYPE_TEXT, crawlerContent,
null, null);
}
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment