Commit bd60cfd9 by leiliangliang

微博恢复到原来采集状态

parent e9f9cfb3
......@@ -125,102 +125,6 @@ public class WeiboHotSearchCrawler {
// }
/**
 * Collects the Weibo hot-search list through the api.weibo.cn guest page endpoint.
 *
 * <p>The request is attempted up to six times. The first attempt whose response parses
 * into a non-empty {@code card_group} produces the result; when every attempt fails an
 * empty list is returned. Each entry name is also added to the
 * {@code RedisConfig.WEIBO_HOTSEARCHIDS} set.
 *
 * @param date value passed unchanged to every created {@link HotSearchList}
 * @return the parsed entries in response order, or an empty list when no attempt succeeded
 * @author hero
 */
public static List<HotSearchList> weiboHotSearchByPhone(Date date) {
    // Alternative endpoint (mobile web container API):
    //String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583";
    // NOTE(review): the gsid query parameter looks like a guest session token embedded in
    // source - confirm whether it should live in configuration instead.
    String url = "https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot";
    Map<String, String> headerMap = new HashMap<>();
    headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
    Request request = RequestUtils.wrapGet(url, headerMap);
    // Compiled once instead of once per entry.
    Pattern nonDigits = Pattern.compile("[^0-9]");

    for (int attempt = 0; attempt <= 5; attempt++) {
        // Reset per attempt so a failed request never re-parses the previous attempt's body.
        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博时热搜时出现连接失败", e);
        }
        if (!StringUtils.isNotBlank(htmlBody) || !htmlBody.contains("cards")) {
            log.info("解析微博时热搜时出现解析错误,页面结构有问题");
            continue;
        }

        JSONArray cards;
        try {
            cards = JSONObject.parseObject(htmlBody).getJSONArray("cards");
        } catch (Exception e) {
            log.error("解析微博时热搜时出现解析错误,数据不是json结构", e);
            continue;
        }
        if (cards == null || cards.isEmpty()) {
            log.info("解析微博时热搜时出现解析错误,页面结构有问题");
            continue;
        }

        try {
            // Only the first card is read.
            JSONObject card = cards.getJSONObject(0);
            JSONArray cardGroup = card == null ? null : card.getJSONArray("card_group");
            // Checked before the first element is dereferenced; an empty group is retried.
            if (cardGroup == null || cardGroup.isEmpty()) {
                log.info("card 数据结构为:{}", card);
                continue;
            }

            // Ranking starts at 0 only when the first entry carries a "pic" key
            // (presumably a pinned entry - TODO confirm); otherwise it starts at 1.
            JSONObject topCard = cardGroup.getJSONObject(0);
            int rank = topCard.containsKey("pic") ? 0 : 1;
            boolean hot = true;
            List<HotSearchList> result = new ArrayList<>();

            for (int j = 0; j < cardGroup.size(); j++) {
                JSONObject cardInfo = cardGroup.getJSONObject(j);
                String name = cardInfo.getString("desc");
                String descExtr = cardInfo.getString("desc_extr");

                String heatLabel = null;
                Long hotCount = 0L;
                if (!StringUtils.isEmpty(descExtr)) {
                    // The heat value is every digit found in desc_extr, concatenated.
                    String digits = nonDigits.matcher(descExtr).replaceAll("").trim();
                    if (!digits.isEmpty()) {
                        try {
                            hotCount = Long.valueOf(digits);
                        } catch (NumberFormatException e) {
                            // Digit run too long for a long: keep the default instead of
                            // discarding the whole list.
                            log.warn("Unparsable desc_extr '{}' for hot search '{}'", descExtr, name);
                        }
                    }
                    String[] parts = descExtr.split(" ");
                    if (parts.length > 1) {
                        String leading = parts[0].trim();
                        // A non-numeric first token is treated as the heat label.
                        if (!StringUtils.isNumeric(leading)) {
                            heatLabel = leading;
                        }
                    }
                }

                String iconUrl = cardInfo.getString("icon");
                String icon = null;
                if (StringUtils.isNotBlank(iconUrl)) {
                    // Icon name = text after the first '_' and before ".png"; the dot is
                    // escaped so it is not treated as a regex wildcard.
                    String[] urlParts = iconUrl.split("_");
                    if (urlParts.length > 1) {
                        String[] nameParts = urlParts[1].split("\\.png");
                        if (nameParts.length > 0) {
                            icon = nameParts[0];
                        }
                    }
                }

                // Rebuild the link from the query part of "scheme"; fall back to the raw
                // scheme value when it has no query instead of throwing.
                String scheme = cardInfo.getString("scheme");
                String[] schemeParts = scheme == null ? new String[0] : scheme.split("[?]");
                String url1 = schemeParts.length > 1
                        ? "https://m.weibo.cn/search?" + schemeParts[1]
                        : scheme;

                HotSearchList hotSearch = new HotSearchList(url1, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
                hotSearch.setHeatLabel(heatLabel);
                if (Objects.nonNull(iconUrl)) {
                    hotSearch.setIconUrl(iconUrl);
                }
                result.add(hotSearch);
                rank++;
                redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS, name + "_微博热搜");
            }
            return result;
        } catch (Exception e) {
            // Any other structural surprise: log it and try the request again.
            log.error("解析微博时热搜时出现解析错误", e);
        }
    }
    return Collections.emptyList();
}
// /**
// * @return void 返回类型
// * @Title: weiboHotSearchByPhoneTest
......@@ -243,12 +147,11 @@ public class WeiboHotSearchCrawler {
// List<HotSearchList> result = new ArrayList<HotSearchList>();
// if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
// try {
// JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data");
// JSONArray cards = json.getJSONArray("cards");
// JSONArray json = JSONObject.parseObject(htmlBody).getJSONArray("cards");
// int rank = 0;
//// for (int i = 0; i < cards.size(); i++) {
// try {
// JSONObject card = cards.getJSONObject(0);
// JSONObject card = json.getJSONObject(0);
// JSONArray cardGroup = card.getJSONArray("card_group");
// JSONObject topCard = cardGroup.getJSONObject(0);
// if (!topCard.containsKey("pic")) {
......@@ -266,17 +169,23 @@ public class WeiboHotSearchCrawler {
// String name = cardInfo.getString("desc");
// String desc_extr = cardInfo.getString("desc_extr");
// String heatLabel=null;
// Long hotCount =null;
// if (Objects.nonNull(desc_extr)){
// Long hotCount =0L;
// if (!StringUtils.isEmpty(desc_extr)&&Objects.nonNull(desc_extr)){
// String regEx="[^0-9]";
// Pattern p = Pattern.compile(regEx);
// Matcher m = p.matcher(desc_extr);
// String num = m.replaceAll("").trim();
// hotCount = Long.valueOf(num);
// String[] split = desc_extr.split(" ");
// if (split.length>1){
// String heat= split[0].trim();
// boolean flag = StringUtils.isNumeric(heat);
// if (!flag){
// heatLabel= split[0].trim();
// hotCount= Long.valueOf(split[1].trim());
//
// }else {
// hotCount = cardInfo.getLongValue("desc_extr");
// }
// }
//
// }
// String iconUrl = cardInfo.getString("icon");
// String icon=null;
// if (StringUtils.isNotBlank(iconUrl)) {
......@@ -284,7 +193,8 @@ public class WeiboHotSearchCrawler {
// }
//// String id = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
// String id = cardInfo.getString("scheme");
// HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
// String url1 = "https://m.weibo.cn/search?"+id.split("[?]")[1];
// HotSearchList hotSearch = new HotSearchList(url1, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
// hotSearch.setHeatLabel(heatLabel);
// if (Objects.nonNull(iconUrl)){hotSearch.setIconUrl(iconUrl);}
// result.add(hotSearch);
......@@ -312,6 +222,96 @@ public class WeiboHotSearchCrawler {
// }
/**
 * Collects the Weibo hot-search list from the m.weibo.cn mobile container API.
 *
 * <p>The request is attempted up to six times. The first attempt whose response parses
 * into a non-empty {@code card_group} produces the result; when every attempt fails an
 * empty list is returned.
 *
 * @param date value passed unchanged to every created {@link HotSearchList}
 * @return the parsed entries in response order, or an empty list when no attempt succeeded
 * @author hero
 */
public static List<HotSearchList> weiboHotSearchByPhone(Date date) {
    String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583";
    // Alternative endpoint (Android guest API):
    //String url = "https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot";
    Map<String, String> headerMap = new HashMap<>();
    headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
    Request request = RequestUtils.wrapGet(url, headerMap);

    for (int attempt = 0; attempt <= 5; attempt++) {
        // Reset per attempt so a failed request never re-parses the previous attempt's body.
        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博时热搜时出现连接失败", e);
        }
        if (!StringUtils.isNotBlank(htmlBody) || !htmlBody.contains("cards")) {
            log.info("解析微博时热搜时出现解析错误,页面结构有问题");
            continue;
        }

        JSONArray cards;
        try {
            JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
            cards = data == null ? null : data.getJSONArray("cards");
        } catch (Exception e) {
            log.error("解析微博时热搜时出现解析错误,数据不是json结构", e);
            continue;
        }
        if (cards == null || cards.isEmpty()) {
            log.info("解析微博时热搜时出现解析错误,页面结构有问题");
            continue;
        }

        try {
            // Only the first card is read.
            JSONObject card = cards.getJSONObject(0);
            JSONArray cardGroup = card == null ? null : card.getJSONArray("card_group");
            // Checked before the first element is dereferenced; an empty group is retried.
            if (cardGroup == null || cardGroup.isEmpty()) {
                log.info("card 数据结构为:{}", card);
                continue;
            }

            // Ranking starts at 0 only when the first entry carries a "pic" key
            // (presumably a pinned entry - TODO confirm); otherwise it starts at 1.
            JSONObject topCard = cardGroup.getJSONObject(0);
            int rank = topCard.containsKey("pic") ? 0 : 1;
            boolean hot = true;
            List<HotSearchList> result = new ArrayList<>();

            for (int j = 0; j < cardGroup.size(); j++) {
                JSONObject cardInfo = cardGroup.getJSONObject(j);
                String name = cardInfo.getString("desc");
                String descExtr = cardInfo.getString("desc_extr");

                String heatLabel = null;
                Long hotCount = null;
                if (Objects.nonNull(descExtr)) {
                    String[] parts = descExtr.split(" ");
                    try {
                        if (parts.length > 1) {
                            // "<label> <count>" form.
                            heatLabel = parts[0].trim();
                            hotCount = Long.valueOf(parts[1].trim());
                        } else {
                            hotCount = cardInfo.getLongValue("desc_extr");
                        }
                    } catch (RuntimeException e) {
                        // Non-numeric heat text (NumberFormatException, or the JSON
                        // library's own unchecked conversion error): keep the entry with
                        // no count instead of discarding the whole list.
                        log.warn("Unparsable desc_extr '{}' for hot search '{}'", descExtr, name);
                    }
                }

                String iconUrl = cardInfo.getString("icon");
                String icon = null;
                if (StringUtils.isNotBlank(iconUrl)) {
                    // Icon name = text after the first '_' and before ".png"; the dot is
                    // escaped so it is not treated as a regex wildcard.
                    String[] urlParts = iconUrl.split("_");
                    if (urlParts.length > 1) {
                        String[] nameParts = urlParts[1].split("\\.png");
                        if (nameParts.length > 0) {
                            icon = nameParts[0];
                        }
                    }
                }

                // The "scheme" value is stored as-is as the entry's link.
                String id = cardInfo.getString("scheme");
                HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
                hotSearch.setHeatLabel(heatLabel);
                if (Objects.nonNull(iconUrl)) {
                    hotSearch.setIconUrl(iconUrl);
                }
                result.add(hotSearch);
                rank++;
                //redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS, name + "_微博热搜");
            }
            return result;
        } catch (Exception e) {
            // Any other structural surprise: log it and try the request again.
            log.error("解析微博时热搜时出现解析错误", e);
        }
    }
    return Collections.emptyList();
}
/**
* 微博预热榜(实时上升热点采集)
*
* @param date
......
......@@ -218,7 +218,7 @@ public class HotSearchCacheDAO {
nowDoc.put("pictureUrl",pictureUrl);
}
if("微博热搜".equals(type)){
//nowDoc = WeiboHotSearchCrawler.weiboUpdate(nowDoc);
nowDoc = WeiboHotSearchCrawler.weiboUpdate(nowDoc);
//更新微博话题贡献者,关于功能
Document documentPC = WeiboHotSearchCrawler.weiboUpdatePC(nowDoc);
if (documentPC.containsKey("分类")) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment