Commit bd60cfd9 by leiliangliang

微博恢复到原来采集状态

parent e9f9cfb3
...@@ -125,102 +125,6 @@ public class WeiboHotSearchCrawler { ...@@ -125,102 +125,6 @@ public class WeiboHotSearchCrawler {
// } // }
/**
 * Crawls the Weibo hot-search list through the api.weibo.cn guest page endpoint.
 *
 * <p>The request is attempted up to six times. An attempt is abandoned and the next one
 * started when the request fails, when the body is blank or lacks {@code "cards"}, or when
 * the first card cannot be parsed. Every parsed topic name is also added to the Redis set
 * {@code RedisConfig.WEIBO_HOTSEARCHIDS}.
 *
 * @param date value handed unchanged to every created {@code HotSearchList}
 * @return the entries built from the first card's {@code card_group}, or an empty list when
 *         no attempt produced a parsable response
 * @author hero
 */
public static List<HotSearchList> weiboHotSearchByPhone(Date date) {
    // Alternative m.weibo.cn endpoint, kept for reference:
    //String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583";
    String url = "https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot";
    Map<String, String> headerMap = new HashMap<>();
    headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
    Request request = RequestUtils.wrapGet(url, headerMap);
    // Compiled once instead of once per entry; removes every non-digit character.
    Pattern nonDigits = Pattern.compile("[^0-9]");

    for (int count = 0; count <= 5; count++) {
        // Declared per attempt so a failed request never re-parses the previous attempt's body.
        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博时热搜时出现连接失败", e);
        }
        if (!(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards"))) {
            log.info("解析微博时热搜时出现解析错误,页面结构有问题");
            continue;
        }

        List<HotSearchList> result = new ArrayList<HotSearchList>();
        try {
            JSONArray json = JSONObject.parseObject(htmlBody).getJSONArray("cards");
            int rank = 0;
            try {
                // Only the first card is processed.
                JSONObject card = json.getJSONObject(0);
                JSONArray cardGroup = card.getJSONArray("card_group");
                // Checked before the first element is read, so a missing or empty group is
                // reported through this message rather than through an exception stack trace.
                if (cardGroup == null || cardGroup.isEmpty()) {
                    log.info("card 数据结构为:{}", card);
                    continue;
                }
                // Numbering starts at 1 when the first entry carries no "pic" key, otherwise at 0.
                JSONObject topCard = cardGroup.getJSONObject(0);
                if (!topCard.containsKey("pic")) {
                    rank = 1;
                }
                boolean hot = true;
                for (int j = 0; j < cardGroup.size(); j++) {
                    JSONObject cardInfo = cardGroup.getJSONObject(j);
                    String name = cardInfo.getString("desc");
                    String descExtr = cardInfo.getString("desc_extr");
                    String heatLabel = null;
                    Long hotCount = 0L;
                    if (!StringUtils.isEmpty(descExtr)) {
                        // The heat count is whatever digits desc_extr contains; without any
                        // digit the count stays 0 instead of failing the whole batch.
                        String num = nonDigits.matcher(descExtr).replaceAll("").trim();
                        if (!num.isEmpty()) {
                            hotCount = Long.valueOf(num);
                        }
                        // A non-numeric first token of a multi-token value is kept as the label.
                        String[] split = descExtr.split(" ");
                        if (split.length > 1) {
                            String heat = split[0].trim();
                            if (!StringUtils.isNumeric(heat)) {
                                heatLabel = heat;
                            }
                        }
                    }

                    String iconUrl = cardInfo.getString("icon");
                    String icon = null;
                    if (StringUtils.isNotBlank(iconUrl)) {
                        // Icon name = text between the first and second '_' with a ".png"
                        // suffix removed; stays null when the URL does not have that shape.
                        String[] iconParts = iconUrl.split("_");
                        if (iconParts.length > 1) {
                            String[] nameParts = iconParts[1].split("\\.png");
                            if (nameParts.length > 0) {
                                icon = nameParts[0];
                            }
                        }
                    }

                    // The query string of "scheme" is re-attached to the m.weibo.cn search URL.
                    // A scheme that is missing or has no '?' throws here and the attempt is retried.
                    String id = cardInfo.getString("scheme");
                    String url1 = "https://m.weibo.cn/search?" + id.split("[?]")[1];
                    HotSearchList hotSearch = new HotSearchList(url1, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
                    hotSearch.setHeatLabel(heatLabel);
                    if (Objects.nonNull(iconUrl)) {
                        hotSearch.setIconUrl(iconUrl);
                    }
                    result.add(hotSearch);
                    rank++;
                    redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS, name + "_微博热搜");
                }
            } catch (Exception e) {
                log.error("解析微博时热搜时出现解析错误", e);
                continue;
            }
            return result;
        } catch (Exception e) {
            log.error("解析微博时热搜时出现解析错误,数据不是json结构", e);
        }
    }
    return Collections.emptyList();
}
// /** // /**
// * @return void 返回类型 // * @return void 返回类型
// * @Title: weiboHotSearchByPhoneTest // * @Title: weiboHotSearchByPhoneTest
...@@ -243,12 +147,11 @@ public class WeiboHotSearchCrawler { ...@@ -243,12 +147,11 @@ public class WeiboHotSearchCrawler {
// List<HotSearchList> result = new ArrayList<HotSearchList>(); // List<HotSearchList> result = new ArrayList<HotSearchList>();
// if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) { // if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
// try { // try {
// JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data"); // JSONArray json = JSONObject.parseObject(htmlBody).getJSONArray("cards");
// JSONArray cards = json.getJSONArray("cards");
// int rank = 0; // int rank = 0;
//// for (int i = 0; i < cards.size(); i++) { //// for (int i = 0; i < cards.size(); i++) {
// try { // try {
// JSONObject card = cards.getJSONObject(0); // JSONObject card = json.getJSONObject(0);
// JSONArray cardGroup = card.getJSONArray("card_group"); // JSONArray cardGroup = card.getJSONArray("card_group");
// JSONObject topCard = cardGroup.getJSONObject(0); // JSONObject topCard = cardGroup.getJSONObject(0);
// if (!topCard.containsKey("pic")) { // if (!topCard.containsKey("pic")) {
...@@ -266,16 +169,22 @@ public class WeiboHotSearchCrawler { ...@@ -266,16 +169,22 @@ public class WeiboHotSearchCrawler {
// String name = cardInfo.getString("desc"); // String name = cardInfo.getString("desc");
// String desc_extr = cardInfo.getString("desc_extr"); // String desc_extr = cardInfo.getString("desc_extr");
// String heatLabel=null; // String heatLabel=null;
// Long hotCount =null; // Long hotCount =0L;
// if (Objects.nonNull(desc_extr)){ // if (!StringUtils.isEmpty(desc_extr)&&Objects.nonNull(desc_extr)){
// String regEx="[^0-9]";
// Pattern p = Pattern.compile(regEx);
// Matcher m = p.matcher(desc_extr);
// String num = m.replaceAll("").trim();
// hotCount = Long.valueOf(num);
// String[] split = desc_extr.split(" "); // String[] split = desc_extr.split(" ");
// if (split.length>1){ // if (split.length>1){
// heatLabel= split[0].trim(); // String heat= split[0].trim();
// hotCount= Long.valueOf(split[1].trim()); // boolean flag = StringUtils.isNumeric(heat);
// // if (!flag){
// }else { // heatLabel= split[0].trim();
// hotCount = cardInfo.getLongValue("desc_extr"); // }
// } // }
//
// } // }
// String iconUrl = cardInfo.getString("icon"); // String iconUrl = cardInfo.getString("icon");
// String icon=null; // String icon=null;
...@@ -284,7 +193,8 @@ public class WeiboHotSearchCrawler { ...@@ -284,7 +193,8 @@ public class WeiboHotSearchCrawler {
// } // }
//// String id = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top"; //// String id = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
// String id = cardInfo.getString("scheme"); // String id = cardInfo.getString("scheme");
// HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date); // String url1 = "https://m.weibo.cn/search?"+id.split("[?]")[1];
// HotSearchList hotSearch = new HotSearchList(url1, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
// hotSearch.setHeatLabel(heatLabel); // hotSearch.setHeatLabel(heatLabel);
// if (Objects.nonNull(iconUrl)){hotSearch.setIconUrl(iconUrl);} // if (Objects.nonNull(iconUrl)){hotSearch.setIconUrl(iconUrl);}
// result.add(hotSearch); // result.add(hotSearch);
...@@ -312,6 +222,96 @@ public class WeiboHotSearchCrawler { ...@@ -312,6 +222,96 @@ public class WeiboHotSearchCrawler {
// } // }
/** /**
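* Crawls the Weibo hot-search list from the mobile (m.weibo.cn) container API.
*
* @param date value handed unchanged to every returned entry
* @return the parsed hot-search entries, or an empty list when every attempt fails
* @author hero
*/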
public static List<HotSearchList> weiboHotSearchByPhone(Date date) {
    // Fetches the hot-search list from the m.weibo.cn container API, trying up to six times.
    // An attempt is abandoned when the request fails, the body is blank or lacks "cards", or
    // the first card under "data" cannot be parsed. Returns the entries of that card's
    // "card_group", or an empty list when no attempt succeeds.
    String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583";
    // Alternative api.weibo.cn guest endpoint, kept for reference:
    //String url = "https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot";
    Map<String, String> headerMap = new HashMap<>();
    headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
    Request request = RequestUtils.wrapGet(url, headerMap);

    for (int count = 0; count <= 5; count++) {
        // Declared per attempt so a failed request never re-parses the previous attempt's body.
        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博时热搜时出现连接失败", e);
        }
        if (!(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards"))) {
            log.info("解析微博时热搜时出现解析错误,页面结构有问题");
            continue;
        }

        List<HotSearchList> result = new ArrayList<HotSearchList>();
        try {
            JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data");
            JSONArray cards = json.getJSONArray("cards");
            int rank = 0;
            try {
                // Only the first card is processed.
                JSONObject card = cards.getJSONObject(0);
                JSONArray cardGroup = card.getJSONArray("card_group");
                // Checked before the first element is read, so a missing or empty group is
                // reported through this message rather than through an exception stack trace.
                if (cardGroup == null || cardGroup.isEmpty()) {
                    log.info("card 数据结构为:{}", card);
                    continue;
                }
                // Numbering starts at 1 when the first entry carries no "pic" key, otherwise at 0.
                JSONObject topCard = cardGroup.getJSONObject(0);
                if (!topCard.containsKey("pic")) {
                    rank = 1;
                }
                boolean hot = true;
                for (int j = 0; j < cardGroup.size(); j++) {
                    JSONObject cardInfo = cardGroup.getJSONObject(j);
                    String name = cardInfo.getString("desc");
                    String descExtr = cardInfo.getString("desc_extr");
                    String heatLabel = null;
                    Long hotCount = null;
                    if (Objects.nonNull(descExtr)) {
                        String[] split = descExtr.split(" ");
                        try {
                            if (split.length > 1) {
                                // "<label> <count>": first token is the label, second the count.
                                heatLabel = split[0].trim();
                                hotCount = Long.valueOf(split[1].trim());
                            } else {
                                hotCount = cardInfo.getLongValue("desc_extr");
                            }
                        } catch (RuntimeException e) {
                            // One non-numeric heat value must not discard the whole list; the
                            // count stays null, the same value used when desc_extr is absent.
                            log.warn("unparsable desc_extr '{}' for weibo hot search '{}', heat count left empty", descExtr, name);
                        }
                    }

                    String iconUrl = cardInfo.getString("icon");
                    String icon = null;
                    if (StringUtils.isNotBlank(iconUrl)) {
                        // Icon name = text between the first and second '_' with a ".png"
                        // suffix removed; stays null when the URL does not have that shape.
                        String[] iconParts = iconUrl.split("_");
                        if (iconParts.length > 1) {
                            String[] nameParts = iconParts[1].split("\\.png");
                            if (nameParts.length > 0) {
                                icon = nameParts[0];
                            }
                        }
                    }

                    // The raw "scheme" value is stored as the entry's first constructor argument.
                    String id = cardInfo.getString("scheme");
                    HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
                    hotSearch.setHeatLabel(heatLabel);
                    if (Objects.nonNull(iconUrl)) {
                        hotSearch.setIconUrl(iconUrl);
                    }
                    result.add(hotSearch);
                    rank++;
                    // Redis registration is disabled in this variant:
                    //redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS, name + "_微博热搜");
                }
            } catch (Exception e) {
                log.error("解析微博时热搜时出现解析错误", e);
                continue;
            }
            return result;
        } catch (Exception e) {
            log.error("解析微博时热搜时出现解析错误,数据不是json结构", e);
        }
    }
    return Collections.emptyList();
}
/**
* 微博预热榜(实时上升热点采集) * 微博预热榜(实时上升热点采集)
* *
* @param date * @param date
......
...@@ -218,7 +218,7 @@ public class HotSearchCacheDAO { ...@@ -218,7 +218,7 @@ public class HotSearchCacheDAO {
nowDoc.put("pictureUrl",pictureUrl); nowDoc.put("pictureUrl",pictureUrl);
} }
if("微博热搜".equals(type)){ if("微博热搜".equals(type)){
//nowDoc = WeiboHotSearchCrawler.weiboUpdate(nowDoc); nowDoc = WeiboHotSearchCrawler.weiboUpdate(nowDoc);
//更新微博话题贡献者,关于功能 //更新微博话题贡献者,关于功能
Document documentPC = WeiboHotSearchCrawler.weiboUpdatePC(nowDoc); Document documentPC = WeiboHotSearchCrawler.weiboUpdatePC(nowDoc);
if (documentPC.containsKey("分类")) { if (documentPC.containsKey("分类")) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment