新增B站标签采集和知乎热搜标签采集

1fd52a37 · leiliangliang · d59803e9 · 1fd52a37 · 1fd52a37 · 1fd52a37
Commit 1fd52a37 authored Jan 10, 2022 by leiliangliang
5 changed files
--- a/pom.xml
+++ b/pom.xml
@@ -48,7 +48,12 @@
 			<artifactId>crawler-core</artifactId>
 			<version>0.6.7.4-SNAPSHOT</version>
 		</dependency>
-
+		<!-- https://mvnrepository.com/artifact/org.conscrypt/conscrypt-openjdk-uber -->
+		<dependency>
+			<groupId>org.conscrypt</groupId>
+			<artifactId>conscrypt-openjdk-uber</artifactId>
+			<version>2.5.2</version>
+		</dependency>
 		<!-- 日志依赖 -->
 		<!-- https://mvnrepository.com/artifact/org.apache.logging.log4j/log4j-core -->
 		<dependency>

--- a/src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
+++ b/src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
@@ -94,7 +94,7 @@ public class HotSearchList implements Serializable{
 	private String topicResult;

 	/**
-	 * 观看数（目前近B站排行榜及综合热门使用）
+	 * 观看数（目前仅B站排行榜及综合热门、知乎浏览量使用）
 	 */
 	private Long view;

@@ -122,6 +122,16 @@ public class HotSearchList implements Serializable{
 	 * 内容
 	 */
 	private String content;
+
+	/**
+	 * 粉丝数（目前仅B站排行榜和知乎热搜使用）
+	 */
+	private Long fans;
+
+	/**
+	 * 标签（目前仅B站排行榜和知乎热搜使用）
+	 */
+	private String tag;
 	public HotSearchList(){}

 	public HotSearchList(String url, String name, Long count,Boolean hot,Integer rank,String type,String icon,Date date){

--- a/src/main/java/com/zhiwei/searchhotcrawler/crawler/BililiCrawler.java
+++ b/src/main/java/com/zhiwei/searchhotcrawler/crawler/BililiCrawler.java
@@ -7,19 +7,21 @@ import com.zhiwei.crawler.core.proxy.ProxyHolder;
 import com.zhiwei.crawler.core.utils.RequestUtils;
 import com.zhiwei.searchhotcrawler.bean.HotSearchList;
 import com.zhiwei.searchhotcrawler.bean.HotSearchType;
-import com.zhiwei.searchhotcrawler.util.TipsUtils;
 import com.zhiwei.tools.tools.URLCodeUtil;
 import com.zhiwei.tools.tools.ZhiWeiTools;
 import lombok.extern.log4j.Log4j2;
 import okhttp3.Request;
 import okhttp3.Response;
 import org.apache.commons.lang3.StringUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;

 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.Date;
 import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;

 @Log4j2
 public class BililiCrawler {
@@ -32,6 +34,7 @@ public class BililiCrawler {
     */
    public static List<HotSearchList> getBilibiliHotSearch(Date date){
        List<HotSearchList> hotSearchLists = new ArrayList<>();
+        ExecutorService executor = Executors.newFixedThreadPool(10);
        log.info("bilibili排行榜开始采集...");
        JSONArray dataJson = null;
        String htmlBody = null;
@@ -43,38 +46,108 @@ public class BililiCrawler {
            } catch (IOException e) {
                log.error("B站排行榜页面连接失败",e.fillInStackTrace());
            }
-            if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){
-                JSONObject jsonObject = JSONObject.parseObject(htmlBody).getJSONObject("data");
-                dataJson = jsonObject.getJSONArray("list");
-                if(dataJson != null) {
-                    for (int i=0; i<dataJson.size(); i++) {
-                        JSONObject data = dataJson.getJSONObject(i);
-                        int rank = i+1;
-                        String name = data.getString("title");
-                        String topicLead = data.getString("desc");
-                        long count = data.getLongValue("score");
-                        String bvid = data.getString("bvid");
-                        String pic = data.getString("pic");
-                        String bUrl = "https://www.bilibili.com/video/"+bvid;
-                        Long view = null;
-                        Long barrage = null;
-                        if(data.containsKey("stat")) {
-                            JSONObject stat = data.getJSONObject("stat");
-                            view = stat.getLongValue("view");
-                            barrage = stat.getLongValue("danmaku");
+            try {
+                if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){
+                    JSONObject jsonObject = JSONObject.parseObject(htmlBody).getJSONObject("data");
+                    dataJson = jsonObject.getJSONArray("list");
+                    if(dataJson != null) {
+                        for (int i=0; i<dataJson.size(); i++) {
+                            JSONObject data = dataJson.getJSONObject(i);
+                            int rank = i+1;
+                            String name = data.getString("title");
+                            String topicLead = data.getString("desc");
+                            long count = data.getLongValue("score");
+                            String bvid = data.getString("bvid");
+                            String pic = data.getString("pic");
+                            String bUrl = "https://www.bilibili.com/video/"+bvid;
+                            Long view = null;
+                            Long barrage = null;
+                            if(data.containsKey("stat")) {
+                                JSONObject stat = data.getJSONObject("stat");
+                                view = stat.getLongValue("view");
+                                barrage = stat.getLongValue("danmaku");
+                            }
+                            //获取UP主（视频作者）名称
+                            String downtext=null;
+                            if(data.containsKey("owner")) {
+                                JSONObject stat = data.getJSONObject("owner");
+                                downtext = stat.getString("name");
+                            }
+                            HotSearchList hotSearchList = new HotSearchList(bUrl,name,topicLead,count,null,date,rank,HotSearchType.B站排行榜.name(),view,barrage,pic);
+                            hotSearchList.setDowntext(downtext);
+                            executor.execute(new Runnable() {
+                                @Override
+                                public void run() {
+                                    HotSearchList tag = getTag(bUrl, hotSearchList);
+                                    hotSearchLists.add(tag);
+                                }
+                            });
                        }
-                        HotSearchList hotSearchList = new HotSearchList(bUrl,name,topicLead,count,null,date,rank,HotSearchType.B站排行榜.name(),view,barrage,pic);
-                        hotSearchLists.add(hotSearchList);
+                        //等待多线程任务全部执行完毕；超过指定时间也结束等待
+                        executor.shutdown();
+                        long time=0L;
+                        while (true){
+                            if (executor.isTerminated()){
+                                break;
+                            }
+                            try {
+                                Thread.sleep(3000);
+                                time=3000+time;
+                                if (time>50000){
+                                    break;
+                                }
+                            } catch (InterruptedException e) {
+                                e.printStackTrace();
+                            }
+                        }
+
                    }
                }
+            } catch (Exception e) {
+                log.error("B站排行榜页面解析异常:{}",e);
            }
-            ZhiWeiTools.sleep(3000L);
        }
        log.info("{}, B站排行榜此轮采集到的数据量为:{}", new Date(), hotSearchLists != null ? hotSearchLists.size() : 0);
        log.info("B站排行榜采集结束");
        return hotSearchLists;
    }

+    /**
+     * Enriches a Bilibili ranking entry with the tags and the uploader's follower count scraped from its video page.
+     * @param url           video page URL to fetch
+     * @param hotSearchList entry to enrich; mutated in place
+     * @return the same {@code hotSearchList} instance; fields are left untouched when the page cannot be fetched or parsed
+     */
+    private static HotSearchList getTag(String url,HotSearchList hotSearchList) {
+        Request request = RequestUtils.wrapGet(url);
+        try {
+            // NOTE(review): JVM-wide property, re-set on every call, and it enables SSLv3; kept for compatibility — confirm it is still needed.
+            System.setProperty("https.protocols", "TLSv1,TLSv1.1,TLSv1.2,SSLv3");
+            // try-with-resources guarantees the response is closed on every path
+            try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
+                String htmlBody = response.body().string();
+                if (htmlBody != null && htmlBody.contains("v-wrap")) {
+                    Document document = Jsoup.parse(htmlBody);
+                    // joined text of all li.tag elements, re-delimited on spaces into "`a;`b;`c;"
+                    hotSearchList.setTag(("`"+document.select("li.tag").text()+";").replaceAll(" ", ";`"));
+                    if (htmlBody.contains("v_upinfo")) {
+                        // the third space-separated token of the follow button text is read as the follower count
+                        String[] parts = document.select("div.follow-btn").select("span").text().split(" ");
+                        if (parts.length > 2) {
+                            String fan = parts[2];
+                            hotSearchList.setFans(fan.contains("万")
+                                    ? (long) (Double.parseDouble(fan.replace("万", "")) * 10000) : Long.parseLong(fan));
+                        }
+                    }
+                }
+            }
+        } catch (Exception e) {
+            // url fills the placeholder, leaving the throwable as the trailing argument so its stack trace is logged
+            log.error("单条B站排行榜数据页面连接失败:{}", url, e);
+        }
+        return hotSearchList;
+    }
+
    /**
     * B站热搜的采集
     * @param date

--- a/src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
+++ b/src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
 package com.zhiwei.searchhotcrawler.crawler;

 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Date;
-import java.util.List;
-import java.util.Map;
+import java.util.*;

+import com.zhiwei.crawler.core.config.SslProvider;
 import lombok.extern.log4j.Log4j2;
 import okhttp3.Request;
 import okhttp3.Response;
@@ -20,6 +18,12 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
 import com.zhiwei.searchhotcrawler.bean.HotSearchType;
 import com.zhiwei.tools.httpclient.HeaderTool;
 import com.zhiwei.tools.tools.URLCodeUtil;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import static java.util.Objects.nonNull;

 /**
 * @ClassName: ZhihuHotCrawler
@@ -30,7 +34,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
 @Log4j2
 public class ZhihuHotSearchCrawler {

-	private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
+	private static HttpBoot httpBoot = new HttpBoot.Builder().sslProvider(SslProvider.CONSCRYPT).retryTimes(3).build();
 	/**
 	 * @Title: getZhihuHotList
 	 * @author hero
@@ -100,37 +104,84 @@ public class ZhihuHotSearchCrawler {
 			log.debug("获取知乎热搜时出现问题:{}", e);
 			return list;
 		}
-		if (htmlBody != null && htmlBody.contains("author")) {
-			JSONObject topSearch = JSONObject.parseObject(htmlBody);
-			JSONArray dataJson = topSearch.getJSONArray("data");
-			String link = null;
-			String displayQuery = null;
-			Long hotCount = null;
-			String hotText = null;
-			for (int i = 0; i < dataJson.size(); i++) {
-				JSONObject data = dataJson.getJSONObject(i).getJSONObject("target");
-				displayQuery = data.getString("title");
-				link = "https://www.zhihu.com/question/" + data.getLongValue("id");
-				hotText = dataJson.getJSONObject(i).getString("detail_text");
+		try {
+			if (htmlBody != null && htmlBody.contains("author")) {
+				JSONObject topSearch = JSONObject.parseObject(htmlBody);
+				JSONArray dataJson = topSearch.getJSONArray("data");
+				String link = null;
+				String displayQuery = null;
+				Long hotCount = null;
+				String hotText = null;
+				for (int i = 0; i < dataJson.size(); i++) {
+					JSONObject data = dataJson.getJSONObject(i).getJSONObject("target");
+					displayQuery = data.getString("title");
+					link = "https://www.zhihu.com/question/" + data.getLongValue("id");
+					hotText = dataJson.getJSONObject(i).getString("detail_text");

-				//计算热度
-				try {
-					if (hotText.contains("万")) {
-						hotText = hotText.replaceAll("万.*", "").trim();
-						hotCount = (long) (Double.parseDouble(hotText) * 10000);
-					} else if (hotText.contains("亿")) {
-						hotText = hotText.replaceAll("亿.*", "").trim();
-						hotCount = (long) (Double.parseDouble(hotText) * 100000000);
-					} else {
-						hotCount = Long.getLong(hotText);
+					//计算热度
+					try {
+						if (hotText.contains("万")) {
+							hotText = hotText.replaceAll("万.*", "").trim();
+							hotCount = (long) (Double.parseDouble(hotText) * 10000);
+						} else if (hotText.contains("亿")) {
+							hotText = hotText.replaceAll("亿.*", "").trim();
+							hotCount = (long) (Double.parseDouble(hotText) * 100000000);
+						} else {
+							hotCount = Long.getLong(hotText);
+						}
+					} catch (Exception e) {
+						e.printStackTrace();
 					}
-				} catch (Exception e) {
-					e.printStackTrace();
+					org.bson.Document doc = getTag(link);
+					String tog = nonNull(doc.get("tag")) ? doc.getString("tag") : null;
+					Long view = nonNull(doc.get("view")) ? Long.valueOf(doc.get("view").toString()) : null;
+					Long fans = nonNull(doc.get("fans")) ? Long.valueOf(doc.get("fans").toString()) : null;
+					HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i + 1, HotSearchType.知乎热搜.name(),date);
+					zhihu.setFans(fans);
+					zhihu.setView(view);
+					zhihu.setTag(tog);
+					list.add(zhihu);
 				}
-				HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i + 1, HotSearchType.知乎热搜.name(),date);
-				list.add(zhihu);
 			}
+		} catch (Exception e) {
+			log.info("知乎热搜解析异常",e);
 		}
 		return list;
 	}
+
+	/**
+	 * Loads the desktop question page and scrapes its tags, follower count and view count.
+	 *
+	 * @param url question page URL
+	 * @return a document that always carries the keys "tag", "view" and "fans"; a value stays
+	 *         {@code null} when the page was not fetched or that field was not parsed
+	 */
+	private static org.bson.Document getTag(String url) {
+		// seed every key with null so the returned document has the same shape on all paths
+		org.bson.Document result = new org.bson.Document();
+		result.put("tag",null);
+		result.put("view",null);
+		result.put("fans",null);
+		// NOTE(review): a fixed _xsrf cookie is sent with every request — presumably required by the page; confirm
+		Map<String,String> headers = HeaderTool.getCommonHead();
+		headers.put("cookie", "_xsrf=7NFWM5qBcOutfs8MaW7bhQQH65t3Xia4");
+		Request request = RequestUtils.wrapGet(url,headers);
+		try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
+			String htmlBody = response.body().string();
+			if (htmlBody == null || !htmlBody.contains("QuestionHeader")) {
+				return result;
+			}
+			Document page = Jsoup.parse(htmlBody);
+			// tags: joined text of every div.Tag element, re-delimited on spaces into "`a;`b;"
+			String tagText = ("`"+page.select("div.Tag").text()+";").replaceAll(" ", ";`");
+			result.put("tag",tagText.trim());
+			// the NumberBoard <strong> texts are read in order: follower count first, then view count
+			String[] numbers = page.select("div.NumberBoard-itemInner").select("strong").text().split(" ");
+			result.put("fans",Long.valueOf(numbers[0].replaceAll(",","").trim()));
+			result.put("view",Long.valueOf(numbers[1].replaceAll(",","").trim()));
+		} catch (Exception e) {
+			log.error("单条知乎热搜数据页面连接失败",e);
+		}
+		return result;
+	}
 }
--- a/src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+++ b/src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
@@ -96,6 +96,9 @@ public class HotSearchCacheDAO {
                document.put("view", hotSearch.getView());
                document.put("barrage", hotSearch.getBarrage());
                document.put("pictureUrl", hotSearch.getPictureUrl());
+                document.put("tag", hotSearch.getTag());
+                document.put("downtext", hotSearch.getDowntext());
+                document.put("fans", hotSearch.getFans());
            }
            if ("B站综合热门".equals(hotSearch.getType())) {
                document.put("heatLabel", hotSearch.getHeatLabel());
@@ -103,6 +106,11 @@ public class HotSearchCacheDAO {
                document.put("pictureUrl", hotSearch.getPictureUrl());
                document.put("commentCount", hotSearch.getCommentCount());
            }
+            if ("知乎热搜".equals(hotSearch.getType())) {
+                document.put("tag", hotSearch.getTag());
+                document.put("view", hotSearch.getView());
+                document.put("fans", hotSearch.getFans());
+            }
            addAndUpdateData(document);
            if ("百度热搜".equals(hotSearch.getType())) {
                document.remove("topic_lead");
@@ -113,6 +121,9 @@ public class HotSearchCacheDAO {
            if ("网易热榜".equals(hotSearch.getType())) {
                document.remove("downtext");
            }
+            if ("B站排行榜".equals(hotSearch.getType())) {
+                document.remove("downtext");
+            }
            dataes.add(document);
        }
        return dataes;
@@ -278,6 +289,13 @@ public class HotSearchCacheDAO {
                    if ("B站综合热门".equals(type)) {
                        nowDoc.put("pictureUrl", pictureUrl);
                    }
+                    if ("知乎热搜".equals(type)) {
+                        nowDoc.put("tag", nonNull(document.get("tag")) ? document.getString("tag") : null);
+                    }
+                    if ("B站排行榜".equals(type)) {
+                        nowDoc.put("tag", nonNull(document.get("tag")) ? document.getString("tag") : null);
+                        nowDoc.put("downtext", nonNull(document.get("downtext")) ? document.getString("downtext") : null);
+                    }
                    if ("微博热搜".equals(type)) {
                        nowDoc = WeiboHotSearchCrawler.weiboUpdate(nowDoc);
                        //更新微博话题贡献者,关于功能