Commit 8ec17aa9 by leiliangliang

更新微博pc端采集程序

parent 5ab03924
......@@ -52,109 +52,105 @@ public class WeiboHotSearchCrawler {
// Shared WeiBoUserDao instance held by the crawler class; it is not referenced in the
// visible part of this file (presumably used for Weibo user persistence - TODO confirm).
static WeiBoUserDao weiBoUserDao = new WeiBoUserDao();
// Shared WeiBoMassageDao instance held by the crawler class; it is not referenced in the
// visible part of this file (presumably used for Weibo message persistence - TODO confirm).
static WeiBoMassageDao weiBoMassageDao = new WeiBoMassageDao();
// /**
// * @return void 返回类型
// * @Title: weiboHotSearchTest
// * @author hero
// * @Description: TODO(PC端微博热搜采集)
// */
// public static List<HotSearchList> weiboHotSearch() {
// String url = "https://s.weibo.com/top/summary?cate=realtimehot";
// Map<String, String> headerMap = new HashMap<>();
// headerMap.put("Cookie", "SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN");
// List<HotSearchList> list = new ArrayList<HotSearchList>();
// for (int i = 0; i < 3; i++) {
// String htmlBody = null;
// Request request = RequestUtils.wrapGet(url, headerMap);
// try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
// htmlBody = response.body().string();
// } catch (Exception e) {
// if (i == 2) {
// return list;
// } else {
// continue;
// }
// }
// if (htmlBody != null && htmlBody.contains("pl_top_realtimehot")) {
// try {
//// String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0];
//// script = script.replace("(", "").replace(")", "");
//// JSONObject json = JSONObject.parseObject(script);
//// String html =
// Date date = new Date();
// org.jsoup.nodes.Document document = Jsoup.parse(htmlBody);
// Elements elements = document.select("div#pl_top_realtimehot").select("tbody").select("tr");
// for (Element element : elements) {
// try {
// String id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href");
// String name = element.select("td.td-02").select("a").text();
// //String num = !element.select("td.td-02").select("span").text().equals("") ? element.select("td.td-02").select("span").text() : "0";
// String num = element.select("td.td-02").select("span").text();
// //String rank = !element.select("td[class=\"td-01 ranktop\"]").text().equals("") ? element.select("td[class=\"td-01 ranktop\"]").text() : "-1";
// //获取排名
// String rank = element.select("td.td-01").text();
// Integer rankCount = null;
// //默认推荐位排名为0 置顶为-1
// if ("•".equals(rank)) {
// rankCount = 0;
// id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href_to");
// } else if (StringUtils.isEmpty(rank)) {
// rankCount = -1;
// } else {
// rankCount = Integer.valueOf(rank);
// }
// //获取icon
// String text = element.select("td.td-03").text();
// String icon = null;
// if (StringUtils.isNotEmpty(text) && nonNull(text)) {
// if ("商".equals(text)) {
// icon = "jian";
// } else if ("新".equals(text)) {
// icon = "new";
// } else if ("热".equals(text)) {
// icon = "hot";
// } else if ("沸".equals(text)) {
// icon = "fei";
// } else if ("爆".equals(text)) {
// icon = "boom";
// }
// }
// //获取热度标签
// String heatLabel = null;
// //获取热度值 置顶 推荐位 默认值为0
// Long hotCount =0L;
// if (StringUtils.isNotEmpty(num) && Objects.nonNull(num)) {
// String[] split = num.split(" ");
// if (split.length > 1) {
// heatLabel = split[0].trim();
// hotCount = Long.valueOf(split[1].trim());
// }else {
// hotCount = Long.valueOf(num);
// }
// }
// // Long hotCount = Long.valueOf(num);
// HotSearchList hotSearch = new HotSearchList(id, name, hotCount, true, rankCount, HotSearchType.微博热搜.name(), icon, date);
// hotSearch.setHeatLabel(heatLabel);
// list.add(hotSearch);
// } catch (Exception e) {
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
// log.error("解析微博时时热搜时出现解析错误", e);
// continue;
// }
// }
// } catch (Exception e) {
// log.error("解析微博时时热搜时出现解析错误,数据不是json结构", e.fillInStackTrace());
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
// return null;
// }
// } else {
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
// log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
// }
// break;
// }
// return list;
// }
/**
 * Crawls the Weibo PC real-time hot-search page and converts each table row into a
 * {@code HotSearchList} entry.
 *
 * <p>The HTTP request is attempted up to three times; only a failed request triggers a retry.
 * As soon as a response body has been read it is processed exactly once:
 * <ul>
 *   <li>if the body does not contain the {@code pl_top_realtimehot} marker, an alert mail is
 *       sent and the (empty) result is returned;</li>
 *   <li>otherwise every {@code tr} of the hot-search table is parsed; rows that cannot be
 *       parsed are logged and skipped, and a single alert mail is sent for the whole crawl.</li>
 * </ul>
 *
 * @return the parsed entries in page order; an empty (or partially filled) list when the page
 *         could not be fetched or parsed. This method never returns {@code null}.
 * @author hero
 */
public static List<HotSearchList> weiboHotSearch() {
    final String linkPrefix = "http://s.weibo.com";
    final String alertSubject = "微博热搜采集出现问题";
    final String alertRecipient = "859548429@qq.com";
    final int maxAttempts = 3;

    String url = "https://s.weibo.com/top/summary?cate=realtimehot";
    Map<String, String> headerMap = new HashMap<>();
    // NOTE(review): this session cookie is hard-coded in source; consider loading it from
    // configuration so it can be rotated without a code change.
    headerMap.put("Cookie", "SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN");
    List<HotSearchList> list = new ArrayList<HotSearchList>();

    for (int i = 0; i < maxAttempts; i++) {
        String htmlBody = null;
        Request request = RequestUtils.wrapGet(url, headerMap);
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (Exception e) {
            // Record the failure instead of swallowing it, then retry; once the attempts are
            // exhausted the loop ends and the empty list is returned below.
            log.error("请求微博热搜页面失败,第" + (i + 1) + "次尝试", e);
            continue;
        }

        if (htmlBody == null || !htmlBody.contains("pl_top_realtimehot")) {
            // The page was fetched but does not have the expected structure: alert, no retry.
            SendMailWeibo.sendMail(alertSubject, alertRecipient);
            log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
            return list;
        }

        try {
            // One timestamp shared by every entry of this crawl.
            Date date = new Date();
            org.jsoup.nodes.Document document = Jsoup.parse(htmlBody);
            Elements elements = document.select("div#pl_top_realtimehot").select("tbody").select("tr");
            boolean rowFailed = false;
            for (Element element : elements) {
                try {
                    Elements link = element.select("td.td-02").select("a");
                    // Title of the hot-search entry.
                    String name = link.text();
                    // Raw heat text: either "<count>" or "<label> <count>".
                    String num = element.select("td.td-02").select("span").text();
                    // Rank cell: a number, a bullet for a recommended slot, or empty when pinned.
                    String rank = element.select("td.td-01").text();

                    String id;
                    Integer rankCount;
                    if ("•".equals(rank)) {
                        // Recommended slot: rank 0, and the link is taken from "href_to".
                        rankCount = 0;
                        id = linkPrefix + link.attr("href_to");
                    } else {
                        id = linkPrefix + link.attr("href");
                        // Pinned entries have no rank text and are stored as -1.
                        rankCount = StringUtils.isEmpty(rank) ? Integer.valueOf(-1) : Integer.valueOf(rank);
                    }

                    // Map the badge character of the third column to its icon code.
                    String text = element.select("td.td-03").text();
                    String icon = null;
                    if ("商".equals(text)) {
                        icon = "jian";
                    } else if ("新".equals(text)) {
                        icon = "new";
                    } else if ("热".equals(text)) {
                        icon = "hot";
                    } else if ("沸".equals(text)) {
                        icon = "fei";
                    } else if ("爆".equals(text)) {
                        icon = "boom";
                    }

                    // Heat label and value; pinned and recommended rows default to 0.
                    String heatLabel = null;
                    Long hotCount = 0L;
                    if (StringUtils.isNotEmpty(num)) {
                        String[] split = num.split(" ");
                        if (split.length > 1) {
                            heatLabel = split[0].trim();
                            hotCount = Long.valueOf(split[1].trim());
                        } else {
                            hotCount = Long.valueOf(num);
                        }
                    }

                    HotSearchList hotSearch = new HotSearchList(id, name, hotCount, true, rankCount, HotSearchType.微博热搜.name(), icon, date);
                    hotSearch.setHeatLabel(heatLabel);
                    list.add(hotSearch);
                } catch (Exception e) {
                    // Skip the broken row but remember it, so one alert covers the whole crawl
                    // instead of one mail per row.
                    rowFailed = true;
                    log.error("解析微博时时热搜时出现解析错误", e);
                }
            }
            if (rowFailed) {
                SendMailWeibo.sendMail(alertSubject, alertRecipient);
            }
        } catch (Exception e) {
            // Log the exception itself: fillInStackTrace() would replace its stack trace with
            // this logging site and hide where the failure actually happened.
            log.error("解析微博时时热搜时出现解析错误,数据不是json结构", e);
            SendMailWeibo.sendMail(alertSubject, alertRecipient);
        }
        // A body was received and processed; return what was parsed (never null).
        return list;
    }
    return list;
}
/**
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment