更新虎嗅采集程序

9a3f1625 · leiliangliang · 310520db · 9a3f1625
Commit 9a3f1625 authored Dec 22, 2021 by leiliangliang
Hide whitespace changes
Inline Side-by-side

Showing with 21 additions and 28 deletions

src/main/java/com/zhiwei/searchhotcrawler/crawler/HuXiuHotSearchCrawler.java
+21 -28

No files found.
--- a/src/main/java/com/zhiwei/searchhotcrawler/crawler/HuXiuHotSearchCrawler.java
+++ b/src/main/java/com/zhiwei/searchhotcrawler/crawler/HuXiuHotSearchCrawler.java
 package com.zhiwei.searchhotcrawler.crawler;

+import com.alibaba.fastjson.JSONArray;
+import com.alibaba.fastjson.JSONObject;
 import com.zhiwei.crawler.core.HttpBoot;
 import com.zhiwei.crawler.core.proxy.ProxyHolder;
 import com.zhiwei.crawler.core.utils.RequestUtils;
@@ -73,35 +75,26 @@ public class HuXiuHotSearchCrawler {
    //解析页面数据
    private static List<HotSearchList> ansysData(String htmlBody, Date date) {
        ArrayList<HotSearchList> list = new ArrayList<>();
-        String webSite="https://www.huxiu.com";
        try {
-            //获取Document文档对象
-            Document document = Jsoup.parse(htmlBody);
-            //获取元素集合
-            Elements elements = document.select("div.hot__list").select("div.focus-item");
-
-            if (Objects.nonNull(elements) && !elements.isEmpty()){
-                // 获取排名rank
-                Integer rank = 0;
-                for (Element element : elements) {
-                    try {
-                        rank++;
-                        //获取关键词
-                        String keyWord= element.select("p").text();
-                        //获取关键词相关链接
-                        String href = element.select("a.focus-item__left").attr("href");
-                        String url=webSite+href;
-                        //获取讨论量
-                        String comment = element.select("i").first().text();
-                        Long commentCount = Long.valueOf(comment);
-                        String topicLead =null;
-                        long count=0L;
-                        HotSearchList hotSearchList = new HotSearchList(url, keyWord,count, rank,HotSearchType.虎嗅热文推荐.name(),commentCount, topicLead, date);
-                        list.add(hotSearchList);
-                    } catch (NumberFormatException e) {
-                        log.error("解析虎嗅热文推荐时出现解析错误",e);
-                    }
-                }
+            String substring = htmlBody.substring(htmlBody.indexOf("articleHot") + 12, htmlBody.indexOf("momentList") - 2);
+            JSONArray arr = JSONObject.parseArray(substring);
+            //获取每个jsonObject对象的值
+            Integer rank = 0;
+            for (Object object : arr) {
+                rank++;
+                JSONObject json = (JSONObject)JSONObject.toJSON(object);
+                //获取标题
+                String title = json.getString("title");
+                //获取链接
+                String url = json.getString("share_url");
+                //获取讨论量
+                JSONObject countInfo = json.getJSONObject("count_info");
+                String commentnum = countInfo.getString("commentnum");
+                Long commentCount = Long.valueOf(commentnum);
+                String topicLead =null;
+                long count=0L;
+                HotSearchList hotSearchList = new HotSearchList(url, title,count, rank,HotSearchType.虎嗅热文推荐.name(),commentCount, topicLead, date);
+                list.add(hotSearchList);
            }
        } catch (Exception e) {
            log.error("解析虎嗅热文推荐时出现解析错误,数据不是json结构",e);