Merge branch 'working' into 'master'

Working. See merge request !218

Merge branch 'working' into 'master'
Working. See merge request !218
8f07a0cc · chenweitao · 3b1f63a6 · ab4d9e51 · 8f07a0cc · 8f07a0cc
Commit 8f07a0cc authored Nov 21, 2022 by chenweitao
5 changed files
--- a/src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
+++ b/src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
@@ -33,6 +33,7 @@ public class BaiDuHotSearchCrawler {
 //    private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
    /**
     * @return void 返回类型
     * @Title: BaiDuHotSearchTest

--- a/src/main/java/com/zhiwei/searchhotcrawler/crawler/KuaiShouHotSearchCrawler.java
+++ b/src/main/java/com/zhiwei/searchhotcrawler/crawler/KuaiShouHotSearchCrawler.java
@@ -35,7 +35,8 @@ public class KuaiShouHotSearchCrawler {
     * @Description: PC端36Kr人气榜采集
     */
    public static List<HotSearchList> KuaiShouHotSearchCrawler(Date date) {
-        String url = "https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791";
+        //String url = "https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791";
+        String url = "https://www.kuaishou.com/?isHome=1";
        String htmlBody = null;
        Request request = RequestUtils.wrapGet(url);
        Response response = HttpClientUtils.httpBoot.syncCall(request, ProxyServerSupplier.NAT_HEAVY_PROXY);

--- a/src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
+++ b/src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
@@ -8,6 +8,7 @@ import com.zhiwei.http.boot.Response;
 import com.zhiwei.http.proxy.ProxyServerSupplier;
 import com.zhiwei.http.proxy.ProxySupplier;
 import com.zhiwei.http.util.RequestUtils;
+import com.zhiwei.searchhotcrawler.util.DelTagsUtil;
 import com.zhiwei.searchhotcrawler.util.HttpClientUtils;
 import io.netty.handler.ssl.SslProvider;
 import lombok.extern.log4j.Log4j2;
@@ -144,10 +145,12 @@ public class ZhihuHotSearchCrawler {
                        String tog = nonNull(doc.get("tag")) ? doc.getString("tag") : null;
                        Long view = nonNull(doc.get("view")) ? Long.valueOf(doc.get("view").toString()) : null;
                        Long fans = nonNull(doc.get("fans")) ? Long.valueOf(doc.get("fans").toString()) : null;
+                        String topicLead = nonNull(doc.get("topicLead")) ? doc.getString("topicLead") : null;
                        HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i + 1, HotSearchType.知乎热搜.name(), date);
                        zhihu.setFans(fans);
                        zhihu.setView(view);
                        zhihu.setTag(tog);
+                        zhihu.setTopicLead(topicLead);
                        list.add(zhihu);
                    }
                    return list;
@@ -178,12 +181,18 @@ public class ZhihuHotSearchCrawler {
                log.error("单条知乎热搜数据页面连接失败", cause);
                return doc;
            } else {
+                String[] split = url.split("/");
+                String id= split[4];
                String htmlBody = response.bodyString();
                if (htmlBody != null && htmlBody.contains("QuestionHeader")) {
                    Document document = Jsoup.parse(htmlBody);
                    //获取标签
                    String label = "";
                    Elements select = document.select("div.QuestionHeader-topics").select("div.css-1gomreu");
+                    String substring = htmlBody.substring(htmlBody.indexOf("initialState") - 2, htmlBody.indexOf("subAppName") + 19);
+                    JSONObject jsonObject = JSONObject.parseObject(substring);
+                    String detail= jsonObject.getJSONObject("initialState").getJSONObject("entities").getJSONObject("questions").getJSONObject(id).getString("detail");
+                    String topicLead = DelTagsUtil.getTextFromHtml(detail);
                    for (Element element : select) {
                        String text = "`" + element.select("div.css-1gomreu").text() + ";";
                        label = label + text;
@@ -195,6 +204,7 @@ public class ZhihuHotSearchCrawler {
                    doc.put("fans", Long.valueOf(count[0].replaceAll(",", "").trim()));
                    //获取浏览量
                    doc.put("view", Long.valueOf(count[1].replaceAll(",", "").trim()));
+                    doc.put("topicLead",topicLead);
                    return doc;
                } else {
                    return doc;

--- a/src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+++ b/src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
@@ -114,6 +114,7 @@ public class HotSearchCacheDAO {
                document.put("tag", hotSearch.getTag());
                document.put("view", hotSearch.getView());
                document.put("fans", hotSearch.getFans());
+                document.put("topic_lead", hotSearch.getTopicLead());
            }
            if ("微博出圈榜".equals(hotSearch.getType())) {
@@ -143,6 +144,9 @@ public class HotSearchCacheDAO {
            if ("B站排行榜".equals(hotSearch.getType())) {
                document.remove("downtext");
            }
+            if ("知乎热搜".equals(hotSearch.getType())) {
+                document.remove("topic_lead");
+            }
            if (hotSearch.getType().contains("微博品牌")) {
                document.remove("readCount");
                document.remove("discussCount");

--- a/src/main/java/com/zhiwei/searchhotcrawler/util/DelTagsUtil.java
+++ b/src/main/java/com/zhiwei/searchhotcrawler/util/DelTagsUtil.java
+package com.zhiwei.searchhotcrawler.util;
+/**
+ * Reduces an HTML fragment to its bare text content by deleting script and
+ * style blocks, every remaining tag, and all whitespace.
+ *
+ * <p>Character entities are not decoded, with one exception: the
+ * non-breaking-space entity is dropped by {@link #getTextFromHtml(String)}.
+ * All methods are static and keep no mutable state.
+ */
+public final class DelTagsUtil {
+    /**
+     * A whole script element including its body. Case-insensitive so that an
+     * upper-case tag does not leave its JavaScript behind as "text".
+     */
+    private static final java.util.regex.Pattern SCRIPT_BLOCK =
+            java.util.regex.Pattern.compile("<script[^>]*?>[\\s\\S]*?<\\/script>",
+                    java.util.regex.Pattern.CASE_INSENSITIVE);
+    /**
+     * A whole style element including its body. Removed so CSS rules do not
+     * end up in the extracted text.
+     */
+    private static final java.util.regex.Pattern STYLE_BLOCK =
+            java.util.regex.Pattern.compile("<style[^>]*?>[\\s\\S]*?<\\/style>",
+                    java.util.regex.Pattern.CASE_INSENSITIVE);
+    /** Any single remaining tag; only the text between tags is kept. */
+    private static final java.util.regex.Pattern ANY_TAG =
+            java.util.regex.Pattern.compile("<[^>]+>");
+    /** One or more whitespace characters (space, tab, CR, LF, form feed, vertical tab). */
+    private static final java.util.regex.Pattern WHITESPACE =
+            java.util.regex.Pattern.compile("\\s+");
+    /**
+     * Space-like leftovers that the whitespace class does not match: the
+     * non-breaking-space entity and the U+00A0 character. A plain space is
+     * listed as well to mirror the original post-processing step.
+     */
+    private static final java.util.regex.Pattern SPACE_LEFTOVERS =
+            java.util.regex.Pattern.compile("&nbsp;|[ \\u00A0]");
+
+    /** Static utility holder; not meant to be instantiated. */
+    private DelTagsUtil() {
+    }
+
+    /**
+     * Strips script blocks, style blocks, all other tags and all whitespace
+     * from the given HTML.
+     *
+     * @param htmlStr HTML source; may be {@code null}
+     * @return the remaining text with no whitespace, or an empty string when
+     *         {@code htmlStr} is {@code null} or empty
+     */
+    public static String delHtmlTags(String htmlStr) {
+        // The crawler feeds this from an optional JSON field, so a missing
+        // value must not blow up with a NullPointerException.
+        if (htmlStr == null || htmlStr.isEmpty()) {
+            return "";
+        }
+        // Order matters: script/style bodies must go before the generic tag
+        // pass, otherwise only their tags vanish and the code/CSS stays.
+        String text = SCRIPT_BLOCK.matcher(htmlStr).replaceAll("");
+        text = STYLE_BLOCK.matcher(text).replaceAll("");
+        text = ANY_TAG.matcher(text).replaceAll("");
+        text = WHITESPACE.matcher(text).replaceAll("");
+        // trim() additionally drops leading/trailing control characters that
+        // the whitespace class does not cover.
+        return text.trim();
+    }
+
+    /**
+     * Extracts the text of an HTML fragment: everything
+     * {@link #delHtmlTags(String)} removes, plus non-breaking spaces (entity
+     * or U+00A0 character).
+     *
+     * @param htmlStr HTML source; may be {@code null}
+     * @return the text content without any spacing, or an empty string when
+     *         {@code htmlStr} is {@code null} or empty
+     */
+    public static String getTextFromHtml(String htmlStr) {
+        String text = delHtmlTags(htmlStr);
+        return SPACE_LEFTOVERS.matcher(text).replaceAll("");
+    }
+}