Commit 64992203 by chenweitao

Merge branch 'working' into 'master'

百度热搜的更新

See merge request !96
parents c4c91711 d544547c
...@@ -12,6 +12,7 @@ import okhttp3.Response; ...@@ -12,6 +12,7 @@ import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
...@@ -21,115 +22,158 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList; ...@@ -21,115 +22,158 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
/** /**
* @author hero
* @ClassName:BaiDuHotSearch * @ClassName:BaiDuHotSearch
* @Description: TODO(百度风云榜热搜采集) * @Description: TODO(百度风云榜热搜采集)
* @author hero
* @date 2019年7月10日 上午10:54:31 * @date 2019年7月10日 上午10:54:31
*/ */
@Log4j2 @Log4j2
public class BaiDuHotSearchCrawler { public class BaiDuHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build(); private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/**
 * Downloads the Baidu hot-search board page and parses it into hot-search entries.
 *
 * <p>The request is sent with {@code ProxyHolder.NAT_HEAVY_PROXY}. The body is handed to
 * {@code ansysNewData} only when it contains the {@code container-bg_lQ801} marker;
 * otherwise an info line is logged and an empty list is returned. Any exception raised
 * while performing the call or reading the body is logged and leads to the same empty result.
 *
 * @param date timestamp forwarded unchanged to {@code ansysNewData}
 * @return the entries produced by {@code ansysNewData}, or an empty list when the page
 *         could not be read or does not contain the expected marker
 */
public static List<HotSearchList> baiduHotSearch(Date date) {
    String url = "http://top.baidu.com/buzz?b=1&fr=topindex";
    // Built outside the try block so that a failure here still propagates to the caller.
    Request pageRequest = RequestUtils.wrapGet(url);
    String page = null;
    try (Response pageResponse = httpBoot.syncCall(pageRequest, ProxyHolder.NAT_HEAVY_PROXY)) {
        page = pageResponse.body().string();
    } catch (Exception e) {
        log.error("解析百度风云榜时出现解析错误,页面结构有问题", e);
    }

    boolean hasBoardMarker = page != null && page.contains("container-bg_lQ801");
    if (!hasBoardMarker) {
        log.info("解析百度风云榜时出现解析错误,页面结构有问题");
        return Collections.emptyList();
    }
    return ansysNewData(page, date);
}
/**
* 更新解析
*
* @param htmlBody
* @param date
* @return
*/
/** private static List<HotSearchList> ansysNewData(String htmlBody, Date date) {
* @Title: BaiDuHotSearchTest List<HotSearchList> list = new ArrayList<>();
* @author hero try {
* @Description: PC端百度风云榜采集 Document document = Jsoup.parse(htmlBody);
* @return void 返回类型 Elements elements = document.select("div.category-wrap_iQLoo");
*/ if (Objects.nonNull(elements) && !elements.isEmpty()) {
public static List<HotSearchList> baiduHotSearch(Date date) { for (Element element : elements) {
String url = "http://top.baidu.com/buzz?b=1&fr=topindex"; try {
String htmlBody = null; //获取排名
Request request = RequestUtils.wrapGet(url); String strRank = element.select("a.img-wrapper_29V76").select("div.index_1Ew5p").text();
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Integer rank = Integer.valueOf(strRank);
htmlBody = response.body().string(); //获取标题
} catch (Exception e) { String strTitle = element.select("a.title_dIF3B").text();
log.error("解析百度风云榜时出现解析错误,页面结构有问题", e); String title = strTitle.split(" ")[0];
} //获取链接
if (htmlBody != null && htmlBody.contains("mainBody")) { String url = element.select("div.content_1YWBm").select("a.title_dIF3B").attr("href");
return ansysData(htmlBody,date); //获取内容
} else { String content = element.select("div.small_Uvkd3").text();
log.info("解析百度风云榜时出现解析错误,页面结构有问题"); //获取搜索指数
} String strCount = element.select("div.hot-index_1Bl1a").text();
return Collections.emptyList(); Long count = Long.valueOf(strCount);
} HotSearchList hotSearch = new HotSearchList(url,title, count, rank, HotSearchType.百度热搜.name(), date);
hotSearch.setTopicLead(content);
list.add(hotSearch);
} catch (Exception e) {
log.error("解析百度风云榜时出现解析错误", e);
}
}
}
} catch (Exception e) {
log.error("解析百度风云榜时出现解析错误,数据不是json结构", e);
}
return list;
}
/** /**
* 解析数据 * 解析数据
* @param htmlBody *
* @return * @param htmlBody
*/ * @return
private static List<HotSearchList> ansysData(String htmlBody,Date date){ */
List<HotSearchList> list = new ArrayList<>(); private static List<HotSearchList> ansysData(String htmlBody, Date date) {
try { List<HotSearchList> list = new ArrayList<>();
Document document = Jsoup.parse(htmlBody); try {
Elements elements = document.select("table.list-table").select("tr"); Document document = Jsoup.parse(htmlBody);
if (Objects.nonNull(elements) && !elements.isEmpty()) { Elements elements = document.select("table.list-table").select("tr");
elements.forEach(element -> { if (Objects.nonNull(elements) && !elements.isEmpty()) {
try { elements.forEach(element -> {
// 获取排名rank try {
String rankStr = null; // 获取排名rank
// 根据网页标签,给rankStr做判断 String rankStr = null;
if (!element.select("td.first").select("span.num-top").isEmpty()) { // 根据网页标签,给rankStr做判断
rankStr = element.select("td.first").select("span.num-top").text(); if (!element.select("td.first").select("span.num-top").isEmpty()) {
} else if (!element.select("td.first").select("span.num-normal").isEmpty()) { rankStr = element.select("td.first").select("span.num-top").text();
rankStr = element.select("td.first").select("span.num-normal").text(); } else if (!element.select("td.first").select("span.num-normal").isEmpty()) {
} rankStr = element.select("td.first").select("span.num-normal").text();
Integer rank = null; }
// 判断rankStr是否为空 Integer rank = null;
if (StringUtils.isNoneBlank(rankStr)) { // 判断rankStr是否为空
rank = Integer.valueOf(rankStr); if (StringUtils.isNoneBlank(rankStr)) {
} rank = Integer.valueOf(rankStr);
}
// 获取关键词相关链接everurl(String) // 获取关键词相关链接everurl(String)
String everurl = element.select("td.keyword").select("a.list-title").attr("href"); String everurl = element.select("td.keyword").select("a.list-title").attr("href");
// 获取关键词(String) // 获取关键词(String)
String kw = element.select("td.keyword").select("a.list-title").text(); String kw = element.select("td.keyword").select("a.list-title").text();
// logger.info("关键词:{}", kw); // logger.info("关键词:{}", kw);
//从连接中获取正确编码关键词 //从连接中获取正确编码关键词
try{ try {
if (!everurl.isEmpty()){ if (!everurl.isEmpty()) {
kw = URLDecoder.decode(everurl.substring(everurl.indexOf("&wd=")+4).split("&")[0], "GB2312" ); kw = URLDecoder.decode(everurl.substring(everurl.indexOf("&wd=") + 4).split("&")[0], "GB2312");
} }
}catch (Exception e1){ } catch (Exception e1) {
log.error("解析百度风云榜,地址",e1); log.error("解析百度风云榜,地址", e1);
} }
// 获取搜索指数count(int) // 获取搜索指数count(int)
String hot = null; String hot = null;
// 判断热度值所在的规则是否为null // 判断热度值所在的规则是否为null
if (!element.select("td.last").select("span.icon-fall").isEmpty()) { if (!element.select("td.last").select("span.icon-fall").isEmpty()) {
hot = element.select("td.last").select("span.icon-fall").text(); hot = element.select("td.last").select("span.icon-fall").text();
} else if (!element.select("td.last").select("span.icon-rise").isEmpty()) { } else if (!element.select("td.last").select("span.icon-rise").isEmpty()) {
hot = element.select("td.last").select("span.icon-rise").text(); hot = element.select("td.last").select("span.icon-rise").text();
} } else if (!element.select("td.last").select("span.icon-fair").isEmpty()) {
else if (!element.select("td.last").select("span.icon-fair").isEmpty()) { hot = element.select("td.last").select("span.icon-fair").text();
hot = element.select("td.last").select("span.icon-fair").text(); }
} long count = 0;
long count = 0; // 判断hot是否为空
// 判断hot是否为空 if (StringUtils.isNotBlank(hot)) {
if (StringUtils.isNotBlank(hot)) { count = Integer.valueOf(hot);
count = Integer.valueOf(hot); }
} if (Objects.nonNull(rank)) {
if (Objects.nonNull(rank)) { if (count == 0) {
if(count == 0){ log.info(htmlBody);
log.info(htmlBody); log.info(hot);
log.info(hot); log.info(element);
log.info(element); } else {
} else { HotSearchList hotSearch = new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name(), date);
HotSearchList hotSearch = new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name(),date); list.add(hotSearch);
list.add(hotSearch); }
} }
} } catch (Exception e) {
} catch (Exception e) { log.error("解析百度风云榜时出现解析错误", e);
log.error("解析百度风云榜时出现解析错误", e); }
} });
}); }
} } catch (Exception e) {
} catch (Exception e) { log.error("解析百度风云榜时出现解析错误,数据不是json结构", e);
log.error("解析百度风云榜时出现解析错误,数据不是json结构", e); }
} return list;
return list; }
}
} }
...@@ -53,6 +53,9 @@ public class HotSearchCacheDAO { ...@@ -53,6 +53,9 @@ public class HotSearchCacheDAO {
if("虎嗅热文推荐".equals(hotSearch.getType())){ if("虎嗅热文推荐".equals(hotSearch.getType())){
document.put("comment_count", hotSearch.getCommentCount()); document.put("comment_count", hotSearch.getCommentCount());
} }
if("百度热搜".equals(hotSearch.getType())){
document.put("topic_lead", hotSearch.getTopicLead());
}
if("腾讯较真榜".equals(hotSearch.getType())){ if("腾讯较真榜".equals(hotSearch.getType())){
document.put("topic_result",hotSearch.getTopicResult()); document.put("topic_result",hotSearch.getTopicResult());
...@@ -64,6 +67,9 @@ public class HotSearchCacheDAO { ...@@ -64,6 +67,9 @@ public class HotSearchCacheDAO {
document.put("pictureUrl",hotSearch.getPictureUrl()); document.put("pictureUrl",hotSearch.getPictureUrl());
} }
addAndUpdateData(document); addAndUpdateData(document);
if("百度热搜".equals(hotSearch.getType())){
document.remove("topic_lead");
}
dataes.add(document); dataes.add(document);
}); });
return dataes; return dataes;
......
package hotSaerchTest; package hotSaerchTest;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.client.MongoCollection;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyFactory; import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.proxy.config.SimpleConfig; import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.config.ProxyConfig; import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.searchhotcrawler.test.KuaiShouHotSearchCrawlerTest; import com.zhiwei.searchhotcrawler.test.KuaiShouHotSearchCrawlerTest;
import com.zhiwei.searchhotcrawler.test.TaoBaoHotSearchCrawlerTest;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.bson.Document;
import org.junit.Test; import org.junit.Test;
import org.junit.runner.RunWith; import org.junit.runner.RunWith;
import org.springframework.test.context.ContextConfiguration; import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import java.io.IOException;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import static com.ibm.icu.util.LocalePriorityList.add;
import static java.util.Objects.nonNull;
/** /**
* @author ll * @author ll
* @date 2021/6/10 6:30 * @date 2021/6/10 6:30
...@@ -24,6 +41,7 @@ import java.util.List; ...@@ -24,6 +41,7 @@ import java.util.List;
{"classpath:applicationContext.xml"}) {"classpath:applicationContext.xml"})
public class HotSearchTest { public class HotSearchTest {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/** /**
* 测试快手热榜采集 * 测试快手热榜采集
...@@ -40,4 +58,106 @@ public class HotSearchTest { ...@@ -40,4 +58,106 @@ public class HotSearchTest {
System.out.println(hotSearchLists.size()); System.out.println(hotSearchLists.size());
} }
/**
 * Fetches one Weibo topic detail payload, copies the lead text, read/discussion counts,
 * picture URL and host text into a document, stores it through {@code ad} and prints it.
 */
@Test
public void WeiBoUpdate() {
    SimpleConfig proxySettings = SimpleConfig.builder()
            .registry(ProxyConfig.registry)
            .group(ProxyConfig.group)
            .appId(10000013)
            .appName("hotsearch")
            .build();
    ProxyFactory.init(proxySettings);

    Document topicDoc = new Document();
    // Alternative sample topic:
    //String url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26t%3D10%26q%3D%23我国新冠疫苗接种剂次超9亿%23";
    String url = "https://m.weibo.cn/api/container/getIndex?containerid=231522type%3D1%26q%3D%23可口可乐回应C罗拒绝与可乐同框%23";
    Request detailRequest = RequestUtils.wrapGet(url);
    String payload = null;
    try (Response detailResponse = httpBoot.syncCall(detailRequest, ProxyHolder.NAT_HEAVY_PROXY)) {
        payload = detailResponse.body().string();
    } catch (IOException e) {
        log.error("解析微博热搜详情页面时出现连接失败", e);
    }

    if (payload != null && payload.contains("data")) {
        JSONObject cardInfo = JSONObject.parseObject(payload).getJSONObject("data").getJSONObject("cardlistInfo");

        if (cardInfo.containsKey("desc")) {
            String lead = cardInfo.getString("desc");
            if (!"".equals(lead)) {
                topicDoc.put("topicLead", lead);
            }
        }

        if (cardInfo.containsKey("cardlist_head_cards")) {
            JSONObject headCard = cardInfo.getJSONArray("cardlist_head_cards").getJSONObject(0);
            if (headCard.containsKey("head_data")) {
                JSONObject headData = headCard.getJSONObject("head_data");
                // midtext carries both counters in one string; the two regex passes
                // cut out the part after "阅读" and the part after "讨论" respectively.
                String midText = headData.getString("midtext");
                String readText = midText.replaceAll("阅读", "").replaceAll("讨论.*", "").trim();
                String discussText = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim();
                String portraitUrl = headData.getString("portrait_url");
                topicDoc.put("readCount", TipsUtils.getHotCount(readText));
                topicDoc.put("discussCount", TipsUtils.getHotCount(discussText));
                topicDoc.put("pictureUrl", portraitUrl);

                if (headData.containsKey("downtext")) {
                    String hostText = headData.getString("downtext");
                    if (!"".equals(hostText)) {
                        topicDoc.put("downtext", hostText.replaceAll("主持人:", ""));
                    }
                }
            }
        }
    }

    ad(topicDoc);
    System.out.println(topicDoc);
}
/**
 * Re-writes the known fields of {@code nowDoc} with their expected types and inserts the
 * document into the collection named by {@code DBConfig.searchCacheCollName}.
 *
 * <p>String fields are read back through {@code getString}. The two counters are
 * converted to {@code Long} (or {@code null}) only when both keys are present.
 *
 * @param nowDoc document to normalise and insert; modified in place
 */
private void ad(Document nowDoc) {
    MongoCollection collection = MongoDBTemplate.getCollection(DBConfig.dbName, DBConfig.searchCacheCollName);

    if (nowDoc.containsKey("topicLead")) {
        nowDoc.put("topicLead", nowDoc.getString("topicLead"));
    }

    boolean hasBothCounters = nowDoc.containsKey("readCount") && nowDoc.containsKey("discussCount");
    if (hasBothCounters) {
        Object rawRead = nowDoc.get("readCount");
        Long readCount = null;
        if (nonNull(rawRead)) {
            readCount = Long.valueOf(rawRead.toString());
        }
        nowDoc.put("readCount", readCount);

        Object rawDiscuss = nowDoc.get("discussCount");
        Long discussCount = null;
        if (nonNull(rawDiscuss)) {
            discussCount = Long.valueOf(rawDiscuss.toString());
        }
        nowDoc.put("discussCount", discussCount);
    }

    for (String textKey : new String[] {"pictureUrl", "downtext"}) {
        if (nowDoc.containsKey(textKey)) {
            nowDoc.put(textKey, nowDoc.getString(textKey));
        }
    }

    collection.insertOne(nowDoc);
}
/**
 * Runs the Taobao hot-search crawl once after initialising the proxy factory,
 * then prints the returned list and its size.
 */
@Test
public void taoBaoTestCrawler() {
    ProxyFactory.init(
            SimpleConfig.builder()
                    .registry(ProxyConfig.registry)
                    .group(ProxyConfig.group)
                    .appId(10000013)
                    .appName("hotsearch")
                    .build());

    List<HotSearchList> crawled = TaoBaoHotSearchCrawlerTest.taoBaoHotSearch(new Date());
    System.out.println(crawled);
    System.out.println(crawled.size());
}
/**
 * Runs the Baidu hot-search crawl once after initialising the proxy factory,
 * then prints the returned list and its size.
 */
@Test
public void baiDuTestCrawler() {
    ProxyFactory.init(
            SimpleConfig.builder()
                    .registry(ProxyConfig.registry)
                    .group(ProxyConfig.group)
                    .appId(10000013)
                    .appName("hotsearch")
                    .build());

    List<HotSearchList> crawled = BaiDuHotSearchCrawler.baiduHotSearch(new Date());
    System.out.println(crawled);
    System.out.println(crawled.size());
}
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment