Commit 1fd52a37 by leiliangliang

新增B站标签采集和知乎热搜标签采集

parent d59803e9
......@@ -48,7 +48,12 @@
<artifactId>crawler-core</artifactId>
<version>0.6.7.4-SNAPSHOT</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.conscrypt/conscrypt-openjdk-uber -->
<dependency>
<groupId>org.conscrypt</groupId>
<artifactId>conscrypt-openjdk-uber</artifactId>
<version>2.5.2</version>
</dependency>
<!-- 日志依赖 -->
<!-- https://mvnrepository.com/artifact/org.apache.logging.log4j/log4j-core -->
<dependency>
......
......@@ -94,7 +94,7 @@ public class HotSearchList implements Serializable{
private String topicResult;
/**
* 观看数(目前近B站排行榜及综合热门使用)
* 观看数(目前仅B站排行榜及综合热门,知乎浏览量使用)
*/
private Long view;
......@@ -122,6 +122,16 @@ public class HotSearchList implements Serializable{
* 内容
*/
private String content;
/**
* 粉丝数(目前仅B站排行榜和知乎热搜使用)
*/
private Long fans;
/**
* 标签(目前仅B站排行榜和知乎热搜使用)
*/
private String tag;
public HotSearchList(){}
public HotSearchList(String url, String name, Long count,Boolean hot,Integer rank,String type,String icon,Date date){
......
......@@ -7,19 +7,21 @@ import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
@Log4j2
public class BililiCrawler {
......@@ -32,6 +34,7 @@ public class BililiCrawler {
*/
public static List<HotSearchList> getBilibiliHotSearch(Date date){
List<HotSearchList> hotSearchLists = new ArrayList<>();
ExecutorService executor = Executors.newFixedThreadPool(10);
log.info("bilibili排行榜开始采集...");
JSONArray dataJson = null;
String htmlBody = null;
......@@ -43,38 +46,108 @@ public class BililiCrawler {
} catch (IOException e) {
log.error("B站排行榜页面连接失败",e.fillInStackTrace());
}
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){
JSONObject jsonObject = JSONObject.parseObject(htmlBody).getJSONObject("data");
dataJson = jsonObject.getJSONArray("list");
if(dataJson != null) {
for (int i=0; i<dataJson.size(); i++) {
JSONObject data = dataJson.getJSONObject(i);
int rank = i+1;
String name = data.getString("title");
String topicLead = data.getString("desc");
long count = data.getLongValue("score");
String bvid = data.getString("bvid");
String pic = data.getString("pic");
String bUrl = "https://www.bilibili.com/video/"+bvid;
Long view = null;
Long barrage = null;
if(data.containsKey("stat")) {
JSONObject stat = data.getJSONObject("stat");
view = stat.getLongValue("view");
barrage = stat.getLongValue("danmaku");
try {
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){
JSONObject jsonObject = JSONObject.parseObject(htmlBody).getJSONObject("data");
dataJson = jsonObject.getJSONArray("list");
if(dataJson != null) {
for (int i=0; i<dataJson.size(); i++) {
JSONObject data = dataJson.getJSONObject(i);
int rank = i+1;
String name = data.getString("title");
String topicLead = data.getString("desc");
long count = data.getLongValue("score");
String bvid = data.getString("bvid");
String pic = data.getString("pic");
String bUrl = "https://www.bilibili.com/video/"+bvid;
Long view = null;
Long barrage = null;
if(data.containsKey("stat")) {
JSONObject stat = data.getJSONObject("stat");
view = stat.getLongValue("view");
barrage = stat.getLongValue("danmaku");
}
//获取主持人
String downtext=null;
if(data.containsKey("owner")) {
JSONObject stat = data.getJSONObject("owner");
downtext = stat.getString("name");
}
HotSearchList hotSearchList = new HotSearchList(bUrl,name,topicLead,count,null,date,rank,HotSearchType.B站排行榜.name(),view,barrage,pic);
hotSearchList.setDowntext(downtext);
executor.execute(new Runnable() {
@Override
public void run() {
HotSearchList tag = getTag(bUrl, hotSearchList);
hotSearchLists.add(tag);
}
});
}
HotSearchList hotSearchList = new HotSearchList(bUrl,name,topicLead,count,null,date,rank,HotSearchType.B站排行榜.name(),view,barrage,pic);
hotSearchLists.add(hotSearchList);
//进行多线程任务是否执行完毕 如到达指定时间也结束循环
executor.shutdown();
long time=0L;
while (true){
if (executor.isTerminated()){
break;
}
try {
Thread.sleep(3000);
time=3000+time;
if (time>50000){
break;
}
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
} catch (Exception e) {
log.error("B站排行榜页面解析异常:{}",e);
}
ZhiWeiTools.sleep(3000L);
}
log.info("{}, B站排行榜此轮采集到的数据量为:{}", new Date(), hotSearchLists != null ? hotSearchLists.size() : 0);
log.info("B站排行榜采集结束");
return hotSearchLists;
}
/**
 * Fetches one Bilibili video page and enriches the given entry with the page's tags and,
 * when the page exposes it, the uploader's follower count.
 *
 * <p>Best-effort: every failure is logged and the entry is returned with whatever fields
 * could be filled in; this method never throws.
 *
 * @param url           URL of the video page to fetch
 * @param hotSearchList entry to enrich; it is mutated in place
 * @return the same {@code hotSearchList} instance that was passed in
 */
private static HotSearchList getTag(String url,HotSearchList hotSearchList) {
    Request request = RequestUtils.wrapGet(url);
    try {
        // NOTE(review): this is a JVM-wide property re-set on every call, and it enables the
        // legacy SSLv3/TLSv1 protocols — confirm it is still required.
        System.setProperty("https.protocols", "TLSv1,TLSv1.1,TLSv1.2,SSLv3");
        // try-with-resources so the response body/connection is always released.
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            String htmlBody = response.body().string();
            if (htmlBody == null || !htmlBody.contains("v-wrap")) {
                return hotSearchList;
            }
            Document document = Jsoup.parse(htmlBody);
            // Tags are stored as "`tag1;`tag2;": the space-joined text of all li.tag nodes
            // is wrapped and every space becomes a ";`" separator.
            String tags = "`" + document.select("li.tag").text() + ";";
            hotSearchList.setTag(tags.replaceAll(" ", ";`"));
            // The follower count is only looked up when the uploader panel is present.
            if (htmlBody.contains("v_upinfo")) {
                String followText = document.select("div.follow-btn").select("span").text();
                Long fanCount = parseBilibiliFanCount(followText);
                if (fanCount != null) {
                    hotSearchList.setFans(fanCount);
                }
            }
            return hotSearchList;
        }
    } catch (Exception e) {
        // Pass the throwable as a trailing argument so the stack trace is logged as well.
        log.error("单条B站排行榜数据页面连接失败:{}", e.toString(), e);
        return hotSearchList;
    }
}

/**
 * Parses the follower count out of the follow-button text.
 *
 * @param followText space-joined text of the follow button's span elements
 * @return the follower count, or {@code null} when the text has fewer than three tokens
 *         or the token is not a number
 */
private static Long parseBilibiliFanCount(String followText) {
    if (StringUtils.isBlank(followText)) {
        return null;
    }
    String[] tokens = followText.split(" ");
    // Assumes the third space-separated token carries the count — TODO confirm against page markup.
    if (tokens.length < 3) {
        return null;
    }
    String fan = tokens[2].trim();
    try {
        if (fan.contains("万")) {
            // "万" is the ten-thousand unit, e.g. "12.3万" -> 123000.
            return (long) (Double.parseDouble(fan.replace("万", "").trim()) * 10000);
        }
        return Long.valueOf(fan);
    } catch (NumberFormatException e) {
        log.warn("B站粉丝数解析失败:{}", fan);
        return null;
    }
}
/**
* B站热搜的采集
* @param date
......
package com.zhiwei.searchhotcrawler.crawler;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.*;
import com.zhiwei.crawler.core.config.SslProvider;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
......@@ -20,6 +18,12 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.URLCodeUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import static java.util.Objects.nonNull;
/**
* @ClassName: ZhihuHotCrawler
......@@ -30,7 +34,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
@Log4j2
public class ZhihuHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static HttpBoot httpBoot = new HttpBoot.Builder().sslProvider(SslProvider.CONSCRYPT).retryTimes(3).build();
/**
* @Title: getZhihuHotList
* @author hero
......@@ -100,37 +104,84 @@ public class ZhihuHotSearchCrawler {
log.debug("获取知乎热搜时出现问题:{}", e);
return list;
}
if (htmlBody != null && htmlBody.contains("author")) {
JSONObject topSearch = JSONObject.parseObject(htmlBody);
JSONArray dataJson = topSearch.getJSONArray("data");
String link = null;
String displayQuery = null;
Long hotCount = null;
String hotText = null;
for (int i = 0; i < dataJson.size(); i++) {
JSONObject data = dataJson.getJSONObject(i).getJSONObject("target");
displayQuery = data.getString("title");
link = "https://www.zhihu.com/question/" + data.getLongValue("id");
hotText = dataJson.getJSONObject(i).getString("detail_text");
try {
if (htmlBody != null && htmlBody.contains("author")) {
JSONObject topSearch = JSONObject.parseObject(htmlBody);
JSONArray dataJson = topSearch.getJSONArray("data");
String link = null;
String displayQuery = null;
Long hotCount = null;
String hotText = null;
for (int i = 0; i < dataJson.size(); i++) {
JSONObject data = dataJson.getJSONObject(i).getJSONObject("target");
displayQuery = data.getString("title");
link = "https://www.zhihu.com/question/" + data.getLongValue("id");
hotText = dataJson.getJSONObject(i).getString("detail_text");
//计算热度
try {
if (hotText.contains("万")) {
hotText = hotText.replaceAll("万.*", "").trim();
hotCount = (long) (Double.parseDouble(hotText) * 10000);
} else if (hotText.contains("亿")) {
hotText = hotText.replaceAll("亿.*", "").trim();
hotCount = (long) (Double.parseDouble(hotText) * 100000000);
} else {
hotCount = Long.getLong(hotText);
//计算热度
try {
if (hotText.contains("万")) {
hotText = hotText.replaceAll("万.*", "").trim();
hotCount = (long) (Double.parseDouble(hotText) * 10000);
} else if (hotText.contains("亿")) {
hotText = hotText.replaceAll("亿.*", "").trim();
hotCount = (long) (Double.parseDouble(hotText) * 100000000);
} else {
hotCount = Long.getLong(hotText);
}
} catch (Exception e) {
e.printStackTrace();
}
} catch (Exception e) {
e.printStackTrace();
org.bson.Document doc = getTag(link);
String tog = nonNull(doc.get("tag")) ? doc.getString("tag") : null;
Long view = nonNull(doc.get("view")) ? Long.valueOf(doc.get("view").toString()) : null;
Long fans = nonNull(doc.get("fans")) ? Long.valueOf(doc.get("fans").toString()) : null;
HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i + 1, HotSearchType.知乎热搜.name(),date);
zhihu.setFans(fans);
zhihu.setView(view);
zhihu.setTag(tog);
list.add(zhihu);
}
HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i + 1, HotSearchType.知乎热搜.name(),date);
list.add(zhihu);
}
} catch (Exception e) {
log.info("知乎热搜解析异常",e);
}
return list;
}
/**
 * Fetches the desktop (PC) page of a Zhihu question and extracts its tags, follower count
 * and view count.
 *
 * <p>Best-effort: on any failure the error is logged and the document is returned with the
 * values gathered so far; keys that could not be resolved stay {@code null}.
 *
 * @param url URL of the question page
 * @return a document that always contains the keys {@code tag}, {@code view} and
 *         {@code fans}; each value is {@code null} when it could not be extracted
 */
private static org.bson.Document getTag(String url) {
    org.bson.Document doc = new org.bson.Document();
    doc.put("tag", null);
    // view count
    doc.put("view", null);
    // follower count
    doc.put("fans", null);
    Map<String,String> headers = HeaderTool.getCommonHead();
    headers.put("cookie", "_xsrf=7NFWM5qBcOutfs8MaW7bhQQH65t3Xia4");
    Request request = RequestUtils.wrapGet(url, headers);
    try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
        String htmlBody = response.body().string();
        if (htmlBody == null || !htmlBody.contains("QuestionHeader")) {
            return doc;
        }
        Document document = Jsoup.parse(htmlBody);
        // Tags are stored as "`tag1;`tag2;": the space-joined text of all div.Tag nodes
        // is wrapped and every space becomes a ";`" separator.
        String content = "`" + document.select("div.Tag").text() + ";";
        doc.put("tag", content.replaceAll(" ", ";`").trim());
        // The number board's <strong> texts are read as: first = followers, second = views.
        String strong = document.select("div.NumberBoard-itemInner").select("strong").text();
        String[] count = strong.split(" ");
        if (count.length > 0) {
            doc.put("fans", parseZhihuBoardNumber(count[0]));
        }
        if (count.length > 1) {
            doc.put("view", parseZhihuBoardNumber(count[1]));
        }
        return doc;
    } catch (Exception e) {
        log.error("单条知乎热搜数据页面连接失败",e);
        return doc;
    }
}

/**
 * Parses one number-board value such as {@code "1,234"}.
 *
 * @param raw text of a single number-board entry
 * @return the parsed value, or {@code null} when the text is empty or not a plain integer
 */
private static Long parseZhihuBoardNumber(String raw) {
    // Thousands separators are stripped before parsing.
    String digits = raw.replace(",", "").trim();
    if (digits.isEmpty()) {
        return null;
    }
    try {
        return Long.valueOf(digits);
    } catch (NumberFormatException e) {
        log.warn("知乎关注数/浏览量解析失败:{}", raw);
        return null;
    }
}
}
......@@ -96,6 +96,9 @@ public class HotSearchCacheDAO {
document.put("view", hotSearch.getView());
document.put("barrage", hotSearch.getBarrage());
document.put("pictureUrl", hotSearch.getPictureUrl());
document.put("tag", hotSearch.getTag());
document.put("downtext", hotSearch.getDowntext());
document.put("fans", hotSearch.getFans());
}
if ("B站综合热门".equals(hotSearch.getType())) {
document.put("heatLabel", hotSearch.getHeatLabel());
......@@ -103,6 +106,11 @@ public class HotSearchCacheDAO {
document.put("pictureUrl", hotSearch.getPictureUrl());
document.put("commentCount", hotSearch.getCommentCount());
}
if ("知乎热搜".equals(hotSearch.getType())) {
document.put("tag", hotSearch.getTag());
document.put("view", hotSearch.getView());
document.put("fans", hotSearch.getFans());
}
addAndUpdateData(document);
if ("百度热搜".equals(hotSearch.getType())) {
document.remove("topic_lead");
......@@ -113,6 +121,9 @@ public class HotSearchCacheDAO {
if ("网易热榜".equals(hotSearch.getType())) {
document.remove("downtext");
}
if ("B站排行榜".equals(hotSearch.getType())) {
document.remove("downtext");
}
dataes.add(document);
}
return dataes;
......@@ -278,6 +289,13 @@ public class HotSearchCacheDAO {
if ("B站综合热门".equals(type)) {
nowDoc.put("pictureUrl", pictureUrl);
}
if ("知乎热搜".equals(type)) {
nowDoc.put("tag", nonNull(document.get("tag")) ? document.getString("tag") : null);
}
if ("B站排行榜".equals(type)) {
nowDoc.put("tag", nonNull(document.get("tag")) ? document.getString("tag") : null);
nowDoc.put("downtext", nonNull(document.get("downtext")) ? document.getString("downtext") : null);
}
if ("微博热搜".equals(type)) {
nowDoc = WeiboHotSearchCrawler.weiboUpdate(nowDoc);
//更新微博话题贡献者,关于功能
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment