Commit 4987c3f7 by leiliangliang

搜狗微信热搜采集程序更新

parent e4f4c636
......@@ -40,10 +40,10 @@ public class SougoHotSearchCrawler {
* @return void 返回类型
*/
public static List<HotSearchList> sougoHotSearch(Date date) {
String url = "https://weixin.sogou.com";
//String url = "https://weixin.sogou.com";
String url = "https://weixin.sogou.com/pcindex/pc/web/web.js?";
List<HotSearchList> list = new ArrayList<>();
Map<String,String> headMap = HeaderTool.getCommonHead();
Request request = RequestUtils.wrapGet(url, headMap);
Request request = RequestUtils.wrapGet(url);
for (int i = 0; i < 3; i++) {
String htmlBody = null;
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
......@@ -55,36 +55,16 @@ public class SougoHotSearchCrawler {
}
if (htmlBody != null && htmlBody.contains("topwords")) {
try {
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("ol#topwords").select("li");
for (Element element : elements) {
try {
// 获取排名rank
String rankStr = null;
if (!element.select("li").select("i").isEmpty()) {
rankStr = element.select("li").select("i").text();
}
Integer rank = null;
if (StringUtils.isNoneBlank(rankStr)) {
rank = Integer.valueOf(rankStr);
}
// 获取关键词(String)
String kw = element.select("li").select("a").attr("title");
// log.info("关键词:{}", kw);
String everurl = element.select("li").select("a").attr("href");
int rank = 0;
JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("topwords");
for (Object object : jsonArray) {
rank++;
JSONObject json = (JSONObject)JSONObject.toJSON(object);
String kw = json.getString("word");
String everurl="https://weixin.sogou.com/weixin?type=2&query="+kw;
HotSearchList hotSearch = new HotSearchList(everurl, kw, null, rank, HotSearchType.搜狗微信热搜.name(),date);
if (Objects.nonNull(rank)) {
list.add(hotSearch);
}
} catch (Exception e) {
log.error("解析搜狗微信时出现解析错误", e);
}
}
break;
} catch (Exception e) {
log.error("解析搜狗微信时出现解析错误,数据不是json结构", e.fillInStackTrace());
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment