Commit 8f07a0cc by chenweitao

Merge branch 'working' into 'master'

Working

See merge request !218
parents 3b1f63a6 ab4d9e51
...@@ -33,6 +33,7 @@ public class BaiDuHotSearchCrawler { ...@@ -33,6 +33,7 @@ public class BaiDuHotSearchCrawler {
// private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build(); // private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/** /**
* @return void 返回类型 * @return void 返回类型
* @Title: BaiDuHotSearchTest * @Title: BaiDuHotSearchTest
......
...@@ -35,7 +35,8 @@ public class KuaiShouHotSearchCrawler { ...@@ -35,7 +35,8 @@ public class KuaiShouHotSearchCrawler {
* @Description: PC端36Kr人气榜采集 * @Description: PC端36Kr人气榜采集
*/ */
public static List<HotSearchList> KuaiShouHotSearchCrawler(Date date) { public static List<HotSearchList> KuaiShouHotSearchCrawler(Date date) {
String url = "https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791"; //String url = "https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791";
String url = "https://www.kuaishou.com/?isHome=1";
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
Response response = HttpClientUtils.httpBoot.syncCall(request, ProxyServerSupplier.NAT_HEAVY_PROXY); Response response = HttpClientUtils.httpBoot.syncCall(request, ProxyServerSupplier.NAT_HEAVY_PROXY);
......
...@@ -8,6 +8,7 @@ import com.zhiwei.http.boot.Response; ...@@ -8,6 +8,7 @@ import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxyServerSupplier; import com.zhiwei.http.proxy.ProxyServerSupplier;
import com.zhiwei.http.proxy.ProxySupplier; import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils; import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.util.DelTagsUtil;
import com.zhiwei.searchhotcrawler.util.HttpClientUtils; import com.zhiwei.searchhotcrawler.util.HttpClientUtils;
import io.netty.handler.ssl.SslProvider; import io.netty.handler.ssl.SslProvider;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
...@@ -144,10 +145,12 @@ public class ZhihuHotSearchCrawler { ...@@ -144,10 +145,12 @@ public class ZhihuHotSearchCrawler {
String tog = nonNull(doc.get("tag")) ? doc.getString("tag") : null; String tog = nonNull(doc.get("tag")) ? doc.getString("tag") : null;
Long view = nonNull(doc.get("view")) ? Long.valueOf(doc.get("view").toString()) : null; Long view = nonNull(doc.get("view")) ? Long.valueOf(doc.get("view").toString()) : null;
Long fans = nonNull(doc.get("fans")) ? Long.valueOf(doc.get("fans").toString()) : null; Long fans = nonNull(doc.get("fans")) ? Long.valueOf(doc.get("fans").toString()) : null;
String topicLead = nonNull(doc.get("topicLead")) ? doc.getString("topicLead") : null;
HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i + 1, HotSearchType.知乎热搜.name(), date); HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i + 1, HotSearchType.知乎热搜.name(), date);
zhihu.setFans(fans); zhihu.setFans(fans);
zhihu.setView(view); zhihu.setView(view);
zhihu.setTag(tog); zhihu.setTag(tog);
zhihu.setTopicLead(topicLead);
list.add(zhihu); list.add(zhihu);
} }
return list; return list;
...@@ -178,12 +181,18 @@ public class ZhihuHotSearchCrawler { ...@@ -178,12 +181,18 @@ public class ZhihuHotSearchCrawler {
log.error("单条知乎热搜数据页面连接失败", cause); log.error("单条知乎热搜数据页面连接失败", cause);
return doc; return doc;
} else { } else {
String[] split = url.split("/");
String id= split[4];
String htmlBody = response.bodyString(); String htmlBody = response.bodyString();
if (htmlBody != null && htmlBody.contains("QuestionHeader")) { if (htmlBody != null && htmlBody.contains("QuestionHeader")) {
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
//获取标签 //获取标签
String label = ""; String label = "";
Elements select = document.select("div.QuestionHeader-topics").select("div.css-1gomreu"); Elements select = document.select("div.QuestionHeader-topics").select("div.css-1gomreu");
String substring = htmlBody.substring(htmlBody.indexOf("initialState") - 2, htmlBody.indexOf("subAppName") + 19);
JSONObject jsonObject = JSONObject.parseObject(substring);
String detail= jsonObject.getJSONObject("initialState").getJSONObject("entities").getJSONObject("questions").getJSONObject(id).getString("detail");
String topicLead = DelTagsUtil.getTextFromHtml(detail);
for (Element element : select) { for (Element element : select) {
String text = "`" + element.select("div.css-1gomreu").text() + ";"; String text = "`" + element.select("div.css-1gomreu").text() + ";";
label = label + text; label = label + text;
...@@ -195,6 +204,7 @@ public class ZhihuHotSearchCrawler { ...@@ -195,6 +204,7 @@ public class ZhihuHotSearchCrawler {
doc.put("fans", Long.valueOf(count[0].replaceAll(",", "").trim())); doc.put("fans", Long.valueOf(count[0].replaceAll(",", "").trim()));
//获取浏览量 //获取浏览量
doc.put("view", Long.valueOf(count[1].replaceAll(",", "").trim())); doc.put("view", Long.valueOf(count[1].replaceAll(",", "").trim()));
doc.put("topicLead",topicLead);
return doc; return doc;
} else { } else {
return doc; return doc;
......
...@@ -114,6 +114,7 @@ public class HotSearchCacheDAO { ...@@ -114,6 +114,7 @@ public class HotSearchCacheDAO {
document.put("tag", hotSearch.getTag()); document.put("tag", hotSearch.getTag());
document.put("view", hotSearch.getView()); document.put("view", hotSearch.getView());
document.put("fans", hotSearch.getFans()); document.put("fans", hotSearch.getFans());
document.put("topic_lead", hotSearch.getTopicLead());
} }
if ("微博出圈榜".equals(hotSearch.getType())) { if ("微博出圈榜".equals(hotSearch.getType())) {
...@@ -143,6 +144,9 @@ public class HotSearchCacheDAO { ...@@ -143,6 +144,9 @@ public class HotSearchCacheDAO {
if ("B站排行榜".equals(hotSearch.getType())) { if ("B站排行榜".equals(hotSearch.getType())) {
document.remove("downtext"); document.remove("downtext");
} }
if ("知乎热搜".equals(hotSearch.getType())) {
document.remove("topic_lead");
}
if (hotSearch.getType().contains("微博品牌")) { if (hotSearch.getType().contains("微博品牌")) {
document.remove("readCount"); document.remove("readCount");
document.remove("discussCount"); document.remove("discussCount");
......
package com.zhiwei.searchhotcrawler.util;
/**
 * Strips HTML markup from page content and returns the remaining plain text.
 *
 * <p>All methods are static and stateless. The regular expressions are compiled
 * once and reused, because {@link java.util.regex.Pattern} instances are immutable
 * and safe to share.
 */
public class DelTagsUtil {

    /** Whole {@code <script>} element including its body; case-insensitive so {@code <SCRIPT>} is caught too. */
    private static final java.util.regex.Pattern SCRIPT_PATTERN = java.util.regex.Pattern.compile(
            "<script[^>]*?>[\\s\\S]*?</script\\s*>", java.util.regex.Pattern.CASE_INSENSITIVE);

    /** Whole {@code <style>} element including its body; case-insensitive for the same reason. */
    private static final java.util.regex.Pattern STYLE_PATTERN = java.util.regex.Pattern.compile(
            "<style[^>]*?>[\\s\\S]*?</style\\s*>", java.util.regex.Pattern.CASE_INSENSITIVE);

    /** Any remaining single tag such as {@code <p>} or {@code </div>}. */
    private static final java.util.regex.Pattern TAG_PATTERN = java.util.regex.Pattern.compile("<[^>]+>");

    /** Runs of whitespace: spaces, tabs, carriage returns, line feeds. */
    private static final java.util.regex.Pattern WHITESPACE_PATTERN = java.util.regex.Pattern.compile("\\s+");

    /**
     * Removes script elements, style elements, all other tags and all whitespace
     * from the given HTML.
     *
     * @param htmlStr HTML source; may be {@code null}
     * @return the text content with tags and whitespace removed, or an empty
     *         string when {@code htmlStr} is {@code null} or empty
     */
    public static String delHtmlTags(String htmlStr) {
        if (htmlStr == null || htmlStr.isEmpty()) {
            return "";
        }
        // Script and style bodies must go first: once the generic tag pass has
        // removed their delimiters, their content can no longer be told apart from text.
        String text = SCRIPT_PATTERN.matcher(htmlStr).replaceAll("");
        text = STYLE_PATTERN.matcher(text).replaceAll("");
        text = TAG_PATTERN.matcher(text).replaceAll("");
        text = WHITESPACE_PATTERN.matcher(text).replaceAll("");
        // trim() also drops leading/trailing control characters that \s does not match.
        return text.trim();
    }

    /**
     * Extracts the plain text of an HTML fragment: strips tags and whitespace via
     * {@link #delHtmlTags(String)}, then removes non-breaking spaces in both
     * entity form ({@code &nbsp;}) and character form (U+00A0).
     *
     * @param htmlStr HTML source; may be {@code null}
     * @return the plain text, or an empty string when {@code htmlStr} is {@code null} or empty
     */
    public static String getTextFromHtml(String htmlStr) {
        String text = delHtmlTags(htmlStr);
        // \s does not match U+00A0, and the entity form is plain characters to the
        // tag regexes, so both have to be removed explicitly here.
        return text.replace("&nbsp;", "").replace("\u00A0", "");
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment