Commit 9a3f1625 by leiliangliang

更新虎嗅采集程序

parent 310520db
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
......@@ -73,35 +75,26 @@ public class HuXiuHotSearchCrawler {
//解析页面数据
private static List<HotSearchList> ansysData(String htmlBody, Date date) {
ArrayList<HotSearchList> list = new ArrayList<>();
String webSite="https://www.huxiu.com";
try {
//获取Document文档对象
Document document = Jsoup.parse(htmlBody);
//获取元素集合
Elements elements = document.select("div.hot__list").select("div.focus-item");
if (Objects.nonNull(elements) && !elements.isEmpty()){
// 获取排名rank
String substring = htmlBody.substring(htmlBody.indexOf("articleHot") + 12, htmlBody.indexOf("momentList") - 2);
JSONArray arr = JSONObject.parseArray(substring);
//获取每个jsonObject对象的值
Integer rank = 0;
for (Element element : elements) {
try {
for (Object object : arr) {
rank++;
//获取关键词
String keyWord= element.select("p").text();
//获取关键词相关链接
String href = element.select("a.focus-item__left").attr("href");
String url=webSite+href;
JSONObject json = (JSONObject)JSONObject.toJSON(object);
//获取标题
String title = json.getString("title");
//获取链接
String url = json.getString("share_url");
//获取讨论量
String comment = element.select("i").first().text();
Long commentCount = Long.valueOf(comment);
JSONObject countInfo = json.getJSONObject("count_info");
String commentnum = countInfo.getString("commentnum");
Long commentCount = Long.valueOf(commentnum);
String topicLead =null;
long count=0L;
HotSearchList hotSearchList = new HotSearchList(url, keyWord,count, rank,HotSearchType.虎嗅热文推荐.name(),commentCount, topicLead, date);
HotSearchList hotSearchList = new HotSearchList(url, title,count, rank,HotSearchType.虎嗅热文推荐.name(),commentCount, topicLead, date);
list.add(hotSearchList);
} catch (NumberFormatException e) {
log.error("解析虎嗅热文推荐时出现解析错误",e);
}
}
}
} catch (Exception e) {
log.error("解析虎嗅热文推荐时出现解析错误,数据不是json结构",e);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment