Commit 8d85b0c2 by chenweitao

Merge branch 'working' into 'master'

Working

See merge request !89
parents 51117558 c495fcc6
......@@ -71,6 +71,32 @@
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.6.7.2-RELEASE</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.20</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
<version>5.3.6</version>
<scope>test</scope>
</dependency>
</dependencies>
<properties>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
......
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>searchhotcrawler</artifactId>
<version>0.0.6-SNAPSHOT</version>
<name>各平台热搜榜单采集程序</name>
<description>各平台热搜榜单采集程序
目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序</description>
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>searchhotcrawler</artifactId>
<version>0.0.6-SNAPSHOT</version>
<name>各平台热搜榜单采集程序</name>
<description>各平台热搜榜单采集程序
目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序</description>
<properties>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<spring.version>4.2.2.RELEASE</spring.version>
<spring.version>4.2.2.RELEASE</spring.version>
</properties>
<developers>
......@@ -21,108 +21,113 @@
</developer>
</developers>
<dependencies>
<dependencies>
<!-- 数据解析jar -->
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>3.12.2</version>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>3.12.2</version>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>sendmail</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.6-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.6.7.4-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.8</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>org.quartz-scheduler</groupId>-->
<!-- <artifactId>quartz</artifactId>-->
<!-- <version>${quartz.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>org.quartz-scheduler</groupId>-->
<!-- <artifactId>quartz-jobs</artifactId>-->
<!-- <version>${quartz.version}</version>-->
<!-- </dependency>-->
<!-- Spring文件配置 -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-aop</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-beans</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-core</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-expression</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context-support</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-web</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-tx</artifactId>
<version>${spring.version}</version>
</dependency>
<!-- redis写 -->
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.8.1</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>sendmail</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.6-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.6.7.4-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.8</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>org.quartz-scheduler</groupId>-->
<!-- <artifactId>quartz</artifactId>-->
<!-- <version>${quartz.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>org.quartz-scheduler</groupId>-->
<!-- <artifactId>quartz-jobs</artifactId>-->
<!-- <version>${quartz.version}</version>-->
<!-- </dependency>-->
<!-- Spring文件配置 -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-aop</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-beans</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-core</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-expression</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context-support</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-web</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-tx</artifactId>
<version>${spring.version}</version>
</dependency>
<!-- redis写 -->
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.8.1</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.6</version>
</dependency>
</dependencies>
</dependencies>
<build>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
......@@ -147,10 +152,10 @@
</filters>
<transformers>
<!-- <transformer-->
<!-- implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">-->
<!-- <mainClass>com.zhiwei.searchhotcrawler.run.HotSearchRun</mainClass>-->
<!-- </transformer>-->
<!-- <transformer-->
<!-- implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">-->
<!-- <mainClass>com.zhiwei.searchhotcrawler.run.HotSearchRun</mainClass>-->
<!-- </transformer>-->
<!-- 不覆盖同名文件,而是追加合并同名文件 -->
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/spring.handlers</resource>
......
......@@ -22,5 +22,7 @@ public enum HotSearchType {
腾讯较真榜,
脉脉热榜,
B站排行榜,
B站热搜
B站热搜,
人气榜36,
虎嗅热文推荐,
}
package com.zhiwei.searchhotcrawler.bean;
/**
* @ClassName: WeiBoMassage
* @Description: 微博主要信息
* @author ll
* @date 2021年5月27日 下午2:26:11
*/
import lombok.Data;
import lombok.ToString;
import java.io.Serializable;
import java.util.Date;
import java.util.List;
/**
 * Core data of one Weibo post collected for a hot-search topic.
 *
 * <p>Accessors, {@code equals}, {@code hashCode} and {@code toString} are generated by Lombok.
 */
@Data
@ToString
public class WeiBoMassage implements Serializable {

    private static final long serialVersionUID = 5640606453392799871L;

    /** Primary key; the full constructor builds it as {@code mid_<hot-search type name>_topic}. */
    private String id;

    /** Id of the user who published the post. */
    private String userId;

    /** Post content. */
    private String text;

    /** Display name of the user who published the post. */
    private String userName;

    /** Value of the post's {@code mid} field. */
    private String mid;

    /** Time the post was created. */
    private Date creatTime;

    /** Time the post was last edited. */
    private Date editTime;

    /** Value of the card's {@code card_type} field. */
    private Integer cardType;

    /** Display type of the card. */
    private Integer showType;

    /** Number of reposts. */
    private Long repostCount;

    /** Number of comments. */
    private Long commentCount;

    /** Number of likes. */
    private Long attitudeCount;

    /** Number of plays. */
    private Long playCount;

    /** Picture URLs attached to the post. */
    private List<String> pictureUrlList;

    /** Source of the post. */
    private String source;

    /** Type of the post. */
    private String type;

    /** Hot-search topic the post belongs to. */
    private String topic;

    /** Whether the post is a repost. */
    private Integer forward;

    /** Repost only: {@code mid} of the original post. */
    private String root_mid;

    /** Repost only: user id of the original post's author. */
    private String root_id;

    /** Repost only: user name of the original post's author. */
    private String root_name;

    /** Repost only: text of the original post. */
    private String root_text;

    /** Repost only: source of the original post. */
    private String root_source;

    /** Creates an empty instance; all fields stay {@code null}. */
    public WeiBoMassage() {
    }

    /**
     * Creates a post record and derives its primary key from {@code mid} and {@code topic}.
     *
     * <p>Play count, picture URLs and the repost fields are not set here.
     */
    public WeiBoMassage(String userId, String text, String userName, String mid,
                        Date creatTime, Date editTime, Integer cardType, Integer showType, Long repostCount,
                        Long commentCount, Long attitudeCount, String source, String type, String topic) {
        // Identity of the post.
        this.mid = mid;
        this.topic = topic;
        this.id = String.join("_", String.valueOf(mid), HotSearchType.微博热搜.name(), String.valueOf(topic));

        // Author and content.
        this.userId = userId;
        this.userName = userName;
        this.text = text;
        this.source = source;
        this.type = type;

        // Timestamps and card metadata.
        this.creatTime = creatTime;
        this.editTime = editTime;
        this.cardType = cardType;
        this.showType = showType;

        // Interaction counters.
        this.repostCount = repostCount;
        this.commentCount = commentCount;
        this.attitudeCount = attitudeCount;
    }
}
package com.zhiwei.searchhotcrawler.bean;
/**
* @ClassName: WeiBoUser
* @Description: 微博用户
* @author ll
* @date 2021年5月27日 下午3:26:11
*/
import lombok.Data;
import lombok.ToString;
import java.io.Serializable;
import java.util.Date;
/**
 * A Weibo user collected for a hot-search topic.
 *
 * <p>Accessors, {@code equals}, {@code hashCode} and {@code toString} are generated by Lombok.
 */
@Data
@ToString
public class WeiBoUser implements Serializable {

    private static final long serialVersionUID = -2856936638431788899L;

    /** Primary key; the full constructor builds it as {@code userId_<hot-search type name>_topic}. */
    private String id;

    /** Id of the user. */
    private String userId;

    /** Verification (attestation) text of the user. */
    private String attestationMassage;

    /** Display name of the user. */
    private String userName;

    /** Hot-search topic the user was found under. */
    private String topic;

    /** Timestamp attached to the record. */
    private Date time;

    /** Number of followers. */
    private Long followerCount;

    /** Creates an empty instance; all fields stay {@code null}. */
    public WeiBoUser() {
    }

    /** Creates a user record and derives its primary key from {@code userId} and {@code topic}. */
    public WeiBoUser(String userId, String attestationMassage, String userName, String topic, Date time, Long followerCount) {
        this.userId = userId;
        this.userName = userName;
        this.attestationMassage = attestationMassage;
        this.followerCount = followerCount;
        this.topic = topic;
        this.time = time;
        this.id = String.join("_", String.valueOf(userId), HotSearchType.微博热搜.name(), String.valueOf(topic));
    }
}
......@@ -19,6 +19,9 @@ public class DBConfig {
searchCacheCollName = conf.getProperty("searchCacheCollName");
topicCollName = conf.getProperty("topicCollName");
collWechatUserName = conf.getProperty("collWechatUserName");
weiBoMassageCollName = conf.getProperty("weiBoMassageCollName");
weiBoUserCollName = conf.getProperty("weiBoUserCollName");
} catch (Exception e) {
e.printStackTrace();
}
......@@ -32,4 +35,6 @@ public class DBConfig {
public static String searchCacheCollName;
public static String topicCollName;
public static String collWechatUserName;
public static String weiBoMassageCollName;
public static String weiBoUserCollName;
}
package com.zhiwei.searchhotcrawler.crawler;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.time.Duration;
import java.util.*;
/**
* @author ll
* @ClassName:HotSearch36KrCrawler
* @Description:
* @date 2021年5月21日 上午11:54:31
*/
@Log4j2
public class HotSearch36KrCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/**
* @return void 返回类型
* @Title: hotSearch36KrCrawler
* @author hero
* @Description: PC端36Kr人气榜采集
*/
public static List<HotSearchList> hotSearch36Kr(Date date) {
String url = "https://www.36kr.com/hot-list/catalog";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
log.error("解析36Kr人气榜时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("article-list")) {
return ansysData(htmlBody,date);
} else {
log.info("解析36Kr人气榜时出现解析错误,页面结构有问题");
}
return Collections.emptyList();
}
// public static List<HotSearchList> hotSearch36Kr(Date date) {
// String url = "https://www.36kr.com/hot-list/catalog";
// //建立一个新的客户端请求(创建HttpClient对象)
// CloseableHttpClient httpClient = HttpClients.createDefault();
// //创建请求对象实例
// HttpGet httpGet = new HttpGet(url);
// httpGet.addHeader("User-Agent", "spider");
// //获取响应的结果
// CloseableHttpResponse response = null;
// try {
// //调用HttpClient对象的execute方法发送请求
// response = httpClient.execute(httpGet);
//
// if (Objects.nonNull(response)) {
// //获取HttpEntity对象其中包含了响应内容(响应头)
// HttpEntity entity = response.getEntity();
//
// String htmlBody = EntityUtils.toString(entity);
// return ansysData(htmlBody,date);
// }
// } catch (Exception e) {
// e.printStackTrace();
// }
// return Collections.emptyList();
// }
/**
* 解析数据
*
* @param htmlBody
* @return
*/
private static List<HotSearchList> ansysData(String htmlBody, Date date) {
List<HotSearchList> list = new ArrayList<>();
String webSite = "https://www.36kr.com";
try {
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("div.article-list").first().select("div.article-wrapper");
if (Objects.nonNull(elements) && !elements.isEmpty()) {
// 获取排名rank
int rank = 0;
for (Element element : elements) {
try {
rank++;
// 获取关键词(String)
String keyWord = element.select("p.title-wrapper").select("a.article-item-title").text();
// logger.info("关键词:{}", kw);
// 获取关键词相关链接everurl(String)
String everurl = element.select("p.title-wrapper").select("a.article-item-title").attr("href");
// 获取搜索指数count(int)
String url = webSite + everurl;
String hot = null;
// 判断热度值所在的规则是否为null
if (!element.select("span").isEmpty()) {
hot = element.select("span").text();
}
Long count = 0L;
// 判断hot是否为空
if (StringUtils.isNotBlank(hot)) {
String[] hots = hot.split("热度");
String trim = hots[1].trim();
Double num = Double.valueOf(trim);
count = Math.round(num);
}
if (Objects.nonNull(rank)) {
if (count == 0) {
log.info(htmlBody);
log.info(hot);
log.info(element);
} else {
HotSearchList hotSearch = new HotSearchList(url, keyWord, count, rank, HotSearchType.人气榜36.name(), date);
list.add(hotSearch);
}
}
} catch (Exception e) {
log.error("解析36Kr人气榜时出现解析错误", e);
}
}
}
} catch (Exception e) {
log.error("解析36Kr人气榜时出现解析错误,数据不是json结构", e);
}
return list;
}
}
package com.zhiwei.searchhotcrawler.crawler;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.time.Duration;
import java.util.*;
/**
* @author: ll
* @ClassName: HuXiuHotSearchCrawler
* @Description: pc端虎嗅热文推荐采集
* @date: 2021年5月24日 下午16:35:31
* @Title: HuXiuHotSearchCrawler
*/
@Log4j2
public class HuXiuHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
public static List<HotSearchList> HuXiuHotArticleRecommended(Date date){
String url = "https://www.huxiu.com/";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
log.error("解析虎嗅热文推荐时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("hot__list")) {
return ansysData(htmlBody,date);
} else {
log.info("解析虎嗅热文推荐时出现解析错误,页面结构有问题");
}
return Collections.emptyList();
}
// String url="https://www.huxiu.com/";
// //创建客户端请求对象
// CloseableHttpClient httpClient = HttpClients.createDefault();
// //创建请求对象实例
// HttpGet httpGet = new HttpGet(url);
// //设置头信息
// httpGet.addHeader("User-Agent","spider");
//
// //获取响应结果
// try {
// CloseableHttpResponse response = httpClient.execute(httpGet);
// //判断响应结果是否为空
// if (Objects.nonNull(response)) {
// //获取HttpEntity对象其中包含了响应内容(响应头)
// HttpEntity entity = response.getEntity();
// String htmlBody = EntityUtils.toString(entity);
// return ansysData(htmlBody,date);
// }
// } catch (Exception e) {
// e.printStackTrace();
// }
//
// return Collections.emptyList();
// }
//解析页面数据
private static List<HotSearchList> ansysData(String htmlBody, Date date) {
ArrayList<HotSearchList> list = new ArrayList<>();
String webSite="https://www.huxiu.com";
try {
//获取Document文档对象
Document document = Jsoup.parse(htmlBody);
//获取元素集合
Elements elements = document.select("div.hot__list").select("div.focus-item");
if (Objects.nonNull(elements) && !elements.isEmpty()){
// 获取排名rank
Integer rank = 0;
for (Element element : elements) {
try {
rank++;
//获取关键词
String keyWord= element.select("p").text();
//获取关键词相关链接
String href = element.select("a.focus-item__left").attr("href");
String url=webSite+href;
//获取讨论量
String comment = element.select("i").first().text();
Long commentCount = Long.valueOf(comment);
String topicLead =null;
long count=0L;
HotSearchList hotSearchList = new HotSearchList(url, keyWord,count, rank,HotSearchType.虎嗅热文推荐.name(),commentCount, topicLead, date);
list.add(hotSearchList);
} catch (NumberFormatException e) {
log.error("解析虎嗅热文推荐时出现解析错误",e);
}
}
}
} catch (Exception e) {
log.error("解析虎嗅热文推荐时出现解析错误,数据不是json结构",e);
}
return list;
}
}
package com.zhiwei.searchhotcrawler.crawler;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.stream.Collectors;
import com.alibaba.fastjson.JSON;
import com.zhiwei.searchhotcrawler.bean.HotSearchCache;
import com.zhiwei.searchhotcrawler.bean.*;
import com.zhiwei.searchhotcrawler.config.RedisConfig;
import com.zhiwei.searchhotcrawler.dao.RedisDao;
import com.zhiwei.searchhotcrawler.dao.WeiBoMassageDao;
import com.zhiwei.searchhotcrawler.dao.WeiBoUserDao;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
......@@ -25,12 +29,12 @@ import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.mail.SendMailWeibo;
import com.zhiwei.tools.tools.URLCodeUtil;
import org.springframework.beans.factory.annotation.Autowired;
import static java.util.Objects.nonNull;
/**
* @ClassName: WeiboHotSearch
* @Description: 微博实时热搜采集
......@@ -169,6 +173,7 @@ public class WeiboHotSearchCrawler {
continue;
}
// }
return result;
} catch (Exception e) {
log.error("解析微博时热搜时出现解析错误,数据不是json结构", e);
......@@ -242,6 +247,7 @@ public class WeiboHotSearchCrawler {
}
if (htmlBody != null && htmlBody.contains("data")) {
JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("cardlistInfo");
List<JSONObject> cardsJsons = (List<JSONObject>)JSONObject.parseObject(htmlBody).getJSONObject("data").get("cards");
if(json.containsKey("desc")){
String topicLead = json.getString("desc");
if(!"".equals(topicLead)) {
......@@ -266,12 +272,275 @@ public class WeiboHotSearchCrawler {
}
}
}
//调用weiBoMassageDao添加数据
WeiBoMassageDao weiBoMassageDao = new WeiBoMassageDao();
//解析cards,获取热门微博、人物
for (JSONObject jsonObject : cardsJsons) {
if (nonNull(jsonObject) && !jsonObject.isEmpty()) {
if (jsonObject.containsKey("mblog")) {
if (jsonObject.getJSONObject("mblog").containsKey("title")) {
WeiBoMassage weiBoMassage = analysisWeiboMBlog(jsonObject, document.getString("name"));
if (Objects.nonNull(weiBoMassage)) {
weiBoMassageDao.addWeiBoMassage(weiBoMassage);
}
}
} else if (jsonObject.containsKey("card_group")) {
JSONArray cardGroup = jsonObject.getJSONArray("card_group");
WeiBoMassage weiBoMassage = analysisWeiboMassage(cardGroup, document.getString("name"));
if (Objects.nonNull(weiBoMassage)) {
weiBoMassageDao.addWeiBoMassage(weiBoMassage);
}
analysisWeiBoUsers(cardGroup, document.getString("name"));
}
} else {
log.info("获取数据失败");
}
}
return document;
}
}
return null;
}
/**
 * Finds the first card in a card group whose {@code mblog} has a {@code title} and converts
 * that card into a {@link WeiBoMassage}.
 *
 * @param cardGroup cards to scan, in order
 * @param topic     hot-search topic passed on to the conversion
 * @return the converted post, or {@code null} when no card qualifies
 */
public static WeiBoMassage analysisWeiboMassage(JSONArray cardGroup, String topic) {
    int index = 0;
    while (index < cardGroup.size()) {
        JSONObject card = cardGroup.getJSONObject(index);
        index++;
        if (!card.containsKey("mblog")) {
            continue;
        }
        if (!card.getJSONObject("mblog").containsKey("title")) {
            continue;
        }
        // Only the first qualifying card is used.
        return analysisWeiboMBlog(card, topic);
    }
    return null;
}
/**
 * Collects the Weibo users shown in a card group and stores each of them through
 * {@link WeiBoUserDao}.
 *
 * <p>Two card layouts are handled: {@code card_type} 3 with a {@code users} array and
 * {@code card_type} 10 with a single {@code user} object. Other cards are ignored.
 *
 * @param cardGroup cards to scan
 * @param topic     hot-search topic stored on every user record
 */
public static void analysisWeiBoUsers(JSONArray cardGroup, String topic) {
    WeiBoUserDao weiBoUserDao = new WeiBoUserDao();
    Date date = new Date();
    for (int i = 0; i < cardGroup.size(); i++) {
        JSONObject card = cardGroup.getJSONObject(i);
        if (card == null) {
            continue;
        }
        // Compare as text so a card without card_type is skipped instead of throwing.
        String cardType = card.getString("card_type");
        if ("3".equals(cardType)) {
            JSONArray users = card.getJSONArray("users");
            if (users == null) {
                continue;
            }
            for (int j = 0; j < users.size(); j++) {
                saveWeiBoUser(users.getJSONObject(j), topic, date, weiBoUserDao);
            }
        } else if ("10".equals(cardType)) {
            saveWeiBoUser(card.getJSONObject("user"), topic, date, weiBoUserDao);
        }
    }
}

/**
 * Converts one user JSON object into a {@link WeiBoUser} and stores it.
 *
 * @param user         user object from the card, may be {@code null}
 * @param topic        hot-search topic stored on the record
 * @param date         timestamp stored on the record
 * @param weiBoUserDao DAO used to store the record
 */
private static void saveWeiBoUser(JSONObject user, String topic, Date date, WeiBoUserDao weiBoUserDao) {
    if (user == null) {
        log.info("未采集到用户信息");
        return;
    }
    String userId = user.getString("id");
    String userName = user.getString("screen_name");
    String attestationMassage = user.getString("verified_reason");
    Long followerCount = parseFollowerCount(user.getString("followers_count"));
    weiBoUserDao.addWeiBoUser(new WeiBoUser(userId, attestationMassage, userName, topic, date, followerCount));
}

/**
 * Converts a follower-count text into a number.
 *
 * <p>Accepts plain integers ("12345") and values with a Chinese unit, including decimals
 * ("123.4万", "1.2亿"); anything after the unit is ignored. The fractional part of the final
 * result is dropped.
 *
 * @param raw text of the {@code followers_count} field, may be {@code null}
 * @return the parsed count, or {@code null} when the text is missing or not a number
 */
private static Long parseFollowerCount(String raw) {
    if (raw == null) {
        return null;
    }
    String number = raw.trim();
    long unit = 1L;
    int yiIndex = number.indexOf("亿");
    int wanIndex = number.indexOf("万");
    if (yiIndex >= 0) {
        unit = 100_000_000L;
        number = number.substring(0, yiIndex);
    } else if (wanIndex >= 0) {
        unit = 10_000L;
        number = number.substring(0, wanIndex);
    }
    try {
        // BigDecimal keeps decimals such as "123.4" exact before scaling by the unit.
        return new java.math.BigDecimal(number.trim())
                .multiply(java.math.BigDecimal.valueOf(unit))
                .longValue();
    } catch (NumberFormatException e) {
        log.warn("无法解析粉丝数: {}", raw);
        return null;
    }
}
/**
 * Converts one card containing an {@code mblog} object into a {@link WeiBoMassage}.
 *
 * <p>When the post is a repost ({@code retweeted_status} present) the original post's mid,
 * author, text and source are copied into the {@code root_*} fields, {@code forward} is set
 * to 1, and pictures / play count are read from the original post. Otherwise {@code forward}
 * is 0 and they are read from the post itself. Missing optional fields yield {@code null}
 * values instead of exceptions.
 *
 * @param jsonObject card holding {@code mblog}, {@code card_type} and {@code show_type}
 * @param topic      hot-search topic stored on the record
 * @return the converted post, or {@code null} when the card has no {@code mblog} object
 */
public static WeiBoMassage analysisWeiboMBlog(JSONObject jsonObject, String topic) {
    JSONObject mblog = jsonObject.getJSONObject("mblog");
    if (mblog == null) {
        return null;
    }

    JSONObject title = mblog.getJSONObject("title");
    String type = title == null ? null : title.getString("text");
    Integer cardType = parseWeiboInteger(jsonObject.getString("card_type"));
    Integer showType = parseWeiboInteger(jsonObject.getString("show_type"));

    // Interaction counters may arrive as plain numbers or with a unit such as "1.2万".
    Long attitudeCount = parseWeiboCount(mblog.getString("attitudes_count"));
    Long commentCount = parseWeiboCount(mblog.getString("comments_count"));
    Long repostCount = parseWeiboCount(mblog.getString("reposts_count"));

    // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
    SimpleDateFormat dateFormat = new SimpleDateFormat("EEE MMM dd HH:mm:ss z yyyy", java.util.Locale.US);
    Date createTime = parseWeiboDate(dateFormat, mblog.getString("created_at"));
    Date editTime = parseWeiboDate(dateFormat, mblog.getString("edit_at"));

    String mid = mblog.getString("mid");
    JSONObject user = mblog.getJSONObject("user");
    String userId = user == null ? null : user.getString("id");
    String userName = user == null ? null : user.getString("screen_name");
    String source = mblog.getString("source");

    // Markup is stripped only when the text actually contains a tag opener.
    String rawText = mblog.getString("text");
    String content = (rawText != null && rawText.contains("<")) ? stripHtml(rawText) : rawText;

    WeiBoMassage weiBoMassage = new WeiBoMassage(userId, content, userName, mid, createTime, editTime,
            cardType, showType, repostCount, commentCount, attitudeCount, source, type, topic);

    // Pictures and play count are read from the original post when this one is a repost.
    JSONObject mediaSource = mblog;
    JSONObject retweeted = mblog.getJSONObject("retweeted_status");
    if (retweeted == null) {
        weiBoMassage.setForward(0);
    } else {
        // The original post's user can be absent; keep the root user fields null then.
        JSONObject rootUser = retweeted.getJSONObject("user");
        weiBoMassage.setRoot_mid(retweeted.getString("mid"));
        weiBoMassage.setRoot_id(rootUser == null ? null : rootUser.getString("id"));
        weiBoMassage.setRoot_source(retweeted.getString("source"));
        weiBoMassage.setRoot_text(stripHtml(retweeted.getString("text")));
        weiBoMassage.setRoot_name(rootUser == null ? null : rootUser.getString("screen_name"));
        weiBoMassage.setForward(1);
        mediaSource = retweeted;
    }

    List<String> pictureUrlList = new ArrayList<>();
    Long playCount = null;
    JSONArray picIds = mediaSource.getJSONArray("pic_ids");
    if (picIds != null && picIds.size() > 0) {
        JSONArray pics = mediaSource.getJSONArray("pics");
        if (pics != null) {
            for (int i = 0; i < pics.size(); i++) {
                JSONObject pic = pics.getJSONObject(i);
                if (pic != null) {
                    pictureUrlList.add(pic.getString("url"));
                }
            }
        }
    } else {
        JSONObject pageInfo = mediaSource.getJSONObject("page_info");
        if (pageInfo != null && pageInfo.containsKey("play_count")) {
            playCount = parseWeiboCount(pageInfo.getString("play_count"));
        }
    }
    weiBoMassage.setPlayCount(playCount);
    weiBoMassage.setPictureUrlList(pictureUrlList);
    return weiBoMassage;
}

/**
 * Accepted count texts: an optionally signed number with optional decimals, an optional
 * unit (万 or 亿), then optionally "+" or "次" followed by any text. Examples: "1234",
 * "1.2万", "100万+", "3亿", "5678次播放".
 */
private static final java.util.regex.Pattern WEIBO_COUNT_PATTERN =
        java.util.regex.Pattern.compile("^\\s*(-?\\d+(?:\\.\\d+)?)\\s*([万亿])?\\s*(?:\\+|次.*)?\\s*$");

/**
 * Converts a Weibo counter text into a number; the fractional part of the result is dropped.
 *
 * @param raw counter text, may be {@code null}
 * @return the parsed count, or {@code null} when the text is missing or has an unknown format
 */
private static Long parseWeiboCount(String raw) {
    if (raw == null) {
        return null;
    }
    java.util.regex.Matcher matcher = WEIBO_COUNT_PATTERN.matcher(raw);
    if (!matcher.matches()) {
        log.warn("无法解析微博计数: {}", raw);
        return null;
    }
    long unit = 1L;
    if ("万".equals(matcher.group(2))) {
        unit = 10_000L;
    } else if ("亿".equals(matcher.group(2))) {
        unit = 100_000_000L;
    }
    // BigDecimal keeps decimals such as "1.2" exact before scaling by the unit.
    return new java.math.BigDecimal(matcher.group(1))
            .multiply(java.math.BigDecimal.valueOf(unit))
            .longValue();
}

/**
 * Parses an integer field that is delivered as text.
 *
 * @param raw field text, may be {@code null}
 * @return the parsed value, or {@code null} when the text is missing or not an integer
 */
private static Integer parseWeiboInteger(String raw) {
    if (raw == null || raw.trim().isEmpty()) {
        return null;
    }
    try {
        return Integer.valueOf(raw.trim());
    } catch (NumberFormatException e) {
        log.warn("无法解析微博整数字段: {}", raw);
        return null;
    }
}

/**
 * Parses a timestamp with the given format.
 *
 * @param format format to parse with
 * @param text   timestamp text, may be {@code null}
 * @return the parsed date, or {@code null} when the text is missing or does not match
 */
private static Date parseWeiboDate(SimpleDateFormat format, String text) {
    if (text == null) {
        return null;
    }
    try {
        return format.parse(text);
    } catch (ParseException e) {
        log.error("创建时间和编辑时间解析异常", e);
        return null;
    }
}

/**
 * Returns the visible text of an HTML fragment.
 *
 * @param html fragment to convert, may be {@code null}
 * @return the text without markup, or {@code null} when {@code html} is {@code null}
 */
private static String stripHtml(String html) {
    return html == null ? null : Jsoup.parse(html).text();
}
// /**
// * 微博更新历史数据
// * @param hotSearch
......
package com.zhiwei.searchhotcrawler.dao;
import com.mongodb.BasicDBObject;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler;
......@@ -38,7 +36,7 @@ public class HotSearchCacheDAO {
document.put("_id", hotSearch.getId());
document.put("name", hotSearch.getName());
document.put("url", hotSearch.getUrl());
document.put("count", hotSearch.getCount());
document.put("count", hotSearch.getCount());
document.put("hot", hotSearch.getHot());
document.put("day", hotSearch.getDay());
document.put("time", hotSearch.getTime());
......@@ -52,6 +50,10 @@ public class HotSearchCacheDAO {
// if("今日头条热搜".equals(hotSearch.getType())){
// document.put("comment_count", hotSearch.getCommentCount());
// }
if("虎嗅热文推荐".equals(hotSearch.getType())){
document.put("comment_count", hotSearch.getCommentCount());
}
if("腾讯较真榜".equals(hotSearch.getType())){
document.put("topic_result",hotSearch.getTopicResult());
}
......@@ -125,7 +127,7 @@ public class HotSearchCacheDAO {
//计算上升速度
double riseSpeed = nowDoc.containsKey("riseSpeed")?nowDoc.getDouble("riseSpeed"):0.00;
if(nonNull(lastCount) && nowDoc.containsKey("firstCount")) {
long firstCount = Long.parseLong(nowDoc.get("firstCount").toString());
long firstCount = nowDoc.getLong("firstCount");
riseSpeed = ((double)(lastCount - firstCount)/(double)firstCount)*1000/((double)duration);
}
// endTime = getEndTime(type, new Date());
......@@ -181,6 +183,10 @@ public class HotSearchCacheDAO {
// if(readCount != null){
// nowDoc.put("readCount",readCount);
// }
if("虎嗅热文推荐".equals(type)){
nowDoc.put("comment_count",document.getLong("comment_count"));
}
if(topicResult != null){
nowDoc.put("topicResult",topicResult);
}
......@@ -207,7 +213,7 @@ public class HotSearchCacheDAO {
}
}
}catch (Exception e){
log.error("数据存储时出错:", e);
log.error("数据存储时出错:{}", e);
}
}
......
package com.zhiwei.searchhotcrawler.dao;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.zhiwei.searchhotcrawler.bean.WeiBoMassage;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
/**
*微博信息入库
*/
@Log4j2
public class WeiBoMassageDao {
public static MongoDatabase mongoDatabase = MongoDBTemplate.getDB(DBConfig.dbName);
public static MongoCollection mongoCollection;
public WeiBoMassageDao() {
String collName = DBConfig.weiBoMassageCollName;
mongoCollection = mongoDatabase.getCollection(collName);
//给数据表创建索引
MongoDBTemplate.createIndex(DBConfig.dbName, collName);
}
/**
* 添加数据入库
* @param weiBoMassage
*/
public void addWeiBoMassage(WeiBoMassage weiBoMassage){
log.info("weiBoMassage对象开始转document对象");
Document document = new Document();
document.put("_id",weiBoMassage.getId());
document.put("userId",weiBoMassage.getUserId());
document.put("text",weiBoMassage.getText());
document.put("userName",weiBoMassage.getUserName());
document.put("mid",weiBoMassage.getMid());
document.put("creatTime",weiBoMassage.getCreatTime());
if (Objects.nonNull(weiBoMassage.getEditTime())){
document.put("editTime",weiBoMassage.getEditTime());
}
document.put("cardType",weiBoMassage.getCardType());
document.put("showType",weiBoMassage.getShowType());
document.put("repostCount",weiBoMassage.getRepostCount());
document.put("commentCount",weiBoMassage.getCommentCount());
document.put("attitudeCount",weiBoMassage.getAttitudeCount());
if (Objects.nonNull(weiBoMassage.getPlayCount())){
document.put("playCount",weiBoMassage.getPlayCount());
}
if (weiBoMassage.getPictureUrlList().size()!=0){
document.put("pictureUrlList",weiBoMassage.getPictureUrlList());
}
document.put("source",weiBoMassage.getSource());
document.put("type",weiBoMassage.getType());
document.put("topic",weiBoMassage.getTopic());
document.put("forward",weiBoMassage.getForward());
if (0!=weiBoMassage.getForward()){
document.put("root_mid",weiBoMassage.getRoot_mid());
document.put("root_id",weiBoMassage.getRoot_id());
document.put("root_name",weiBoMassage.getRoot_name());
document.put("root_text",weiBoMassage.getRoot_text());
document.put("root_source",weiBoMassage.getRoot_source());
}
log.info("weiBoMassage对象转document对象完成");
try {
mongoCollection.insertOne(document);
log.info("数据插入成功");
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}",e);
}
}
}
package com.zhiwei.searchhotcrawler.dao;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.zhiwei.searchhotcrawler.bean.WeiBoMassage;
import com.zhiwei.searchhotcrawler.bean.WeiBoUser;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
@Log4j2
public class WeiBoUserDao {
public static MongoDatabase mongoDatabase = MongoDBTemplate.getDB(DBConfig.dbName);
public static MongoCollection mongoCollection;
public WeiBoUserDao() {
String collName = DBConfig.weiBoUserCollName;
mongoCollection = mongoDatabase.getCollection(collName);
//给数据表创建索引
MongoDBTemplate.createIndex(DBConfig.dbName, collName);
}
/**
* 添加数据入库
* @param weiBoUser
*/
public void addWeiBoUser(WeiBoUser weiBoUser){
log.info("WeiBoUser对象开始转document对象");
Document document = new Document();
document.put("_id",weiBoUser.getId());
document.put("userId",weiBoUser.getUserId());
if (Objects.nonNull(weiBoUser.getAttestationMassage())){
document.put("attestationMassage",weiBoUser.getAttestationMassage());
}
document.put("userName",weiBoUser.getUserName());
document.put("topic",weiBoUser.getTopic());
document.put("time",weiBoUser.getTime());
document.put("followerCount",weiBoUser.getFollowerCount());
log.info("WeiBoUser对象转document对象完成");
try {
mongoCollection.insertOne(document);
log.info("数据插入成功");
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}",e);
}
}
}
package com.zhiwei.searchhotcrawler.test;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.time.Duration;
import java.util.*;
/**
 * Crawler for the 36Kr popularity list on the PC site.
 *
 * <p>Downloads the hot-list page through the shared {@code HttpBoot} client and turns every
 * article entry into a {@code HotSearchList} record.
 *
 * @author ll
 * @date 2021-05-21
 */
@Log4j2
public class HotSearch36KrCrawlerTest {

    private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();

    /**
     * Fetches the 36Kr hot-list page and parses it.
     *
     * @param date timestamp stamped on every returned record
     * @return the parsed entries, or an empty list when the page could not be fetched or does
     *         not contain the {@code article-list} marker
     */
    public static List<HotSearchList> hotSearch36Kr(Date date) {
        String url = "https://www.36kr.com/hot-list/catalog";
        String htmlBody = null;
        Request request = RequestUtils.wrapGet(url);
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (Exception e) {
            log.error("解析36Kr人气榜时出现解析错误,页面结构有问题", e);
        }
        if (htmlBody != null && htmlBody.contains("article-list")) {
            return ansysData(htmlBody, date);
        } else {
            log.info("解析36Kr人气榜时出现解析错误,页面结构有问题");
        }
        return Collections.emptyList();
    }

    /**
     * Parses the hot-list HTML into records.
     *
     * <p>The rank is the 1-based position of the article in the list. Entries whose heat value
     * is missing or zero are logged and left out of the result.
     *
     * @param htmlBody HTML of the hot-list page
     * @param date     timestamp stamped on every returned record
     * @return the parsed entries; never {@code null}
     */
    private static List<HotSearchList> ansysData(String htmlBody, Date date) {
        List<HotSearchList> list = new ArrayList<>();
        String webSite = "https://www.36kr.com";
        try {
            Document document = Jsoup.parse(htmlBody);
            // Fix: first() returns null when the container is absent; bail out explicitly
            // instead of relying on the outer catch to absorb the NullPointerException.
            Element articleList = document.select("div.article-list").first();
            if (articleList == null) {
                log.info("解析36Kr人气榜时出现解析错误,页面结构有问题");
                return list;
            }
            Elements elements = articleList.select("div.article-wrapper");
            int rank = 0;
            for (Element element : elements) {
                try {
                    rank++;
                    Elements titleLink = element.select("p.title-wrapper").select("a.article-item-title");
                    // Article title.
                    String keyWord = titleLink.text();
                    // Site-relative article link, turned into an absolute URL.
                    String url = webSite + titleLink.attr("href");
                    // The heat value lives in the entry's span text, after the "热度" label.
                    String hot = null;
                    Elements spans = element.select("span");
                    if (!spans.isEmpty()) {
                        hot = spans.text();
                    }
                    Long count = 0L;
                    if (StringUtils.isNotBlank(hot)) {
                        String[] hots = hot.split("热度");
                        // Fix: text without a value after the label used to raise
                        // ArrayIndexOutOfBoundsException; treat it as "no heat value".
                        if (hots.length > 1) {
                            count = Math.round(Double.parseDouble(hots[1].trim()));
                        }
                    }
                    // The former Objects.nonNull(rank) check was always true for a primitive int.
                    if (count == 0) {
                        log.info(htmlBody);
                        log.info(hot);
                        log.info(element);
                    } else {
                        HotSearchList hotSearch = new HotSearchList(url, keyWord, count, rank, HotSearchType.人气榜36.name(), date);
                        list.add(hotSearch);
                    }
                } catch (Exception e) {
                    // One malformed entry must not stop the remaining entries from being parsed.
                    log.error("解析36Kr人气榜时出现解析错误", e);
                }
            }
        } catch (Exception e) {
            log.error("解析36Kr人气榜时出现解析错误,数据不是json结构", e);
        }
        return list;
    }
}
package com.zhiwei.searchhotcrawler.test;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.time.Duration;
import java.util.*;
/**
* @author: ll
* @ClassName: HuXiuHotSearchCrawler
* @Description: pc端虎嗅热文推荐采集
* @date: 2021年5月24日 下午16:35:31
* @Title: HuXiuHotSearchCrawler
*/
@Log4j2
public class HuXiuHotSearchCrawlerTest {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
public static List<HotSearchList> HuXiuHotArticleRecommended(Date date){
String url = "https://www.huxiu.com/";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
log.error("解析虎嗅热文推荐时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("hot__list")) {
return ansysData(htmlBody,date);
} else {
log.info("解析虎嗅热文推荐时出现解析错误,页面结构有问题");
}
return Collections.emptyList();
}
// String url="https://www.huxiu.com/";
// //创建客户端请求对象
// CloseableHttpClient httpClient = HttpClients.createDefault();
// //创建请求对象实例
// HttpGet httpGet = new HttpGet(url);
// //设置头信息
// httpGet.addHeader("User-Agent","spider");
//
// //获取响应结果
// try {
// CloseableHttpResponse response = httpClient.execute(httpGet);
// //判断响应结果是否为空
// if (Objects.nonNull(response)) {
// //获取HttpEntity对象其中包含了响应内容(响应头)
// HttpEntity entity = response.getEntity();
// String htmlBody = EntityUtils.toString(entity);
// return ansysData(htmlBody,date);
// }
// } catch (Exception e) {
// e.printStackTrace();
// }
//
// return Collections.emptyList();
// }
//解析页面数据
private static List<HotSearchList> ansysData(String htmlBody, Date date) {
ArrayList<HotSearchList> list = new ArrayList<>();
String webSite="https://www.huxiu.com";
try {
//获取Document文档对象
Document document = Jsoup.parse(htmlBody);
//获取元素集合
Elements elements = document.select("div.hot__list").select("div.focus-item");
if (Objects.nonNull(elements) && !elements.isEmpty()){
// 获取排名rank
Integer rank = 0;
for (Element element : elements) {
try {
rank++;
//获取关键词
String keyWord= element.select("p").text();
//获取关键词相关链接
String href = element.select("a.focus-item__left").attr("href");
String url=webSite+href;
//获取讨论量
String comment = element.select("i").first().text();
Long commentCount = Long.valueOf(comment);
String topicLead =null;
long count=0L;
HotSearchList hotSearchList = new HotSearchList(url, keyWord,count, rank,HotSearchType.虎嗅热文推荐.name(),commentCount, topicLead, date);
list.add(hotSearchList);
} catch (NumberFormatException e) {
log.error("解析虎嗅热文推荐时出现解析错误",e);
}
}
}
} catch (Exception e) {
log.error("解析虎嗅热文推荐时出现解析错误,数据不是json结构",e);
}
return list;
}
}
......@@ -8,6 +8,8 @@ import com.zhiwei.searchhotcrawler.crawler.*;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.RedisDao;
import com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO;
import com.zhiwei.searchhotcrawler.crawler.HotSearch36KrCrawler;
import com.zhiwei.searchhotcrawler.crawler.HuXiuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.timer.TouTiaoExecutor;
import com.zhiwei.searchhotcrawler.util.DateUtils;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
......@@ -38,6 +40,30 @@ public class GatherTimer {
/** 知乎时事子分类 */
private String DEPTH = "depth";
/**
 * Scheduled job that crawls the Huxiu recommended-articles list and then the 36Kr popularity
 * list, handing each result to {@code TipsUtils.addHotList}. Both crawls share one timestamp.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ?")
public void crawlerHuXiu() {
    // Huxiu recommended articles.
    logger.info("虎嗅热文推荐开始采集...");
    Date date = DateUtils.getMillSecondTime(new Date());
    List<HotSearchList> huXiuList = HuXiuHotSearchCrawler.HuXiuHotArticleRecommended(date);
    int huXiuSize = 0;
    if (huXiuList != null) {
        huXiuSize = huXiuList.size();
    }
    logger.info("{}, 虎嗅热文推荐此轮采集到的数据量为:{}", new Date(), Integer.valueOf(huXiuSize));
    TipsUtils.addHotList(HotSearchType.虎嗅热文推荐.name(), huXiuList);
    logger.info("虎嗅热文推荐采集结束...");

    // 36Kr popularity list, stamped with the same crawl time.
    logger.info("36氪人气榜开始采集...");
    List<HotSearchList> list36Kr = HotSearch36KrCrawler.hotSearch36Kr(date);
    int size36Kr = 0;
    if (list36Kr != null) {
        size36Kr = list36Kr.size();
    }
    logger.info("{}, 36氪人气榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(size36Kr));
    TipsUtils.addHotList(HotSearchType.人气榜36.name(), list36Kr);
    logger.info("36氪人气榜采集结束...");
}
/**
* 微博热搜的采集
*/
......
......@@ -4,12 +4,19 @@ package weiboTest;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.config.RedisConfig;
import com.zhiwei.searchhotcrawler.bean.WeiBoMassage;
import com.zhiwei.searchhotcrawler.bean.WeiBoUser;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.dao.WeiBoMassageDao;
import com.zhiwei.searchhotcrawler.dao.WeiBoUserDao;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
......@@ -20,9 +27,14 @@ import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.TimeUnit;
import static java.util.Objects.nonNull;
/**
* @author cwt
......@@ -31,69 +43,161 @@ import java.util.*;
@Log4j2
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations =
{ "classpath:applicationContext.xml" })
public class WeiboHotSearchTest{
{"classpath:applicationContext.xml"})
public class WeiboHotSearchTest {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
@Test
public void test(){
Document document = Jsoup.parse("a href=\"https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23&extparam=%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23&luicode=10000011&lfid=100103type%3D1%26t%3D10%26q%3D%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23\" data-hide=\"\"><span class=\"surl-text\">#邓伦讲戏专业#</span></a><a href=\"https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E6%9E%81%E9%99%90%E6%8C%91%E6%88%98%23&luicode=10000011&lfid=100103type%3D1%26t%3D10%26q%3D%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23\" data-hide=\"\"><span class=\"surl-text\">#极限挑战#</span></a> <a href='/n/邓伦'>@邓伦</a> 和<a href='/n/景甜'>@景甜</a> 改编《甄嬛传》剧本,伦伦认真讲戏的样子让人瞬间穿越到拍摄现场。看来戏瘾上身的邓伦还过了一把导演的瘾,这专业的模样要不要考虑跨界当当导演呀~<span class=\"url-icon\"><img alt=[哈哈] src=\"https://h5.sinaimg.cn/m/emoticon/icon/default/d_haha-0ec05e6dad.png\" style=\"width:1em; height:1em;\" /></span><a data-url=\"http://t.cn/A6VJPN9w\" href=\"https://video.weibo.com/show?fid=1034:4640837901156490\" data-hide=\"\"><span class='url-icon'><img style='width: 1rem;height: 1rem' src='https://h5.sinaimg.cn/upload/2015/09/25/3/timeline_card_small_video_default.png'></span><span class=\"surl-text\">东方卫视极限挑战的微博视频</span></a>");
public void test() {
Document document = Jsoup.parse("<a href=\\\"https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E5%91%A8%E6%9F%AF%E5%AE%87%E7%88%B8%E7%88%B8%23&extparam=%23%E5%91%A8%E6%9F%AF%E5%AE%87%E7%88%B8%E7%88%B8%23&luicode=10000011&lfid=231522type%3D1%26q%3D%23%E5%91%A8%E6%9F%AF%E5%AE%87%E7%88%B8%E7%88%B8%23\\\" data-hide=\\\"\\\"><span class=\\\"surl-text\\\">#周柯宇爸爸#</span></a> \uD83E\uDDD0<a href=\\\"https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E5%91%A8%E6%9F%AF%E5%AE%87%E4%BC%A0%E9%94%80%E4%B9%8B%E5%AD%90%23&extparam=%23%E5%91%A8%E6%9F%AF%E5%AE%87%E4%BC%A0%E9%94%80%E4%B9%8B%E5%AD%90%23&luicode=10000011&lfid=231522type%3D1%26q%3D%23%E5%91%A8%E6%9F%AF%E5%AE%87%E7%88%B8%E7%88%B8%23\\\" data-hide=\\\"\\\"><span class=\\\"surl-text\\\">#周柯宇传销之子#</span></a> <br />周柯宇粉丝今天懂法了吗?没有我一会再来普法。周柯宇粉丝为传销洗地,周柯宇偶像失格,周柯宇粉丝素质低下,道德沦丧 \"\n");
System.out.println(document.text());
}
@Test
public void testHotWeibo(){
Date date = new Date();
List<HotSearchList> hotSearchLists = weiboHotSearchByPhone(date);
for (HotSearchList hotSearchList : hotSearchLists) {
public void test1() {
String url = "<a href";
System.out.println(url.startsWith("<"));
}
@Test
public void testHotWeibo() {
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
ProxyFactory.init(simpleConfig);
while(true) {
try {
Date date = new Date();
List<HotSearchList> hotSearchLists = weiboHotSearchByPhone(date);
for (HotSearchList hotSearchList : hotSearchLists) {
try {
org.bson.Document document = new org.bson.Document();
//System.out.println(hotSearchList);
document.put("url", hotSearchList.getUrl());
document.put("name", hotSearchList.getName());
test12(document);
} catch (Exception e) {
log.info("数据解析异常",e);
}
}
log.info("本轮微博话题采集解析完毕");
log.info(hotSearchLists.size());
TimeUnit.MINUTES.sleep(1);
} catch (Exception e) {
log.info("微博热搜采集异常",e);
ZhiWeiTools.sleep(60*60*1000);
}
ZhiWeiTools.sleep(50);
}
//
// Date date = new Date();
// List<HotSearchList> hotSearchLists = weiboHotSearchByPhone(date);
// for (HotSearchList hotSearchList : hotSearchLists) {
// System.out.println(hotSearchList);
// }
}
//org.bson.Document document
// @Test
/**
 * Fetches the detail page of one Weibo hot-search topic, adds the topic lead, read count,
 * discussion count, picture URL and host text to {@code document}, and stores the posts and
 * users found in the page's cards.
 *
 * <p>The request is attempted up to three times; an attempt counts as failed when the body is
 * missing or has no usable {@code data} object.
 *
 * @param document hot-search record; its string fields {@code "name"} and {@code "url"} are read
 */
public void test12(org.bson.Document document) {
    SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
            .group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
    ProxyFactory.init(simpleConfig);
    String topic = document.getString("name");
    log.info("更新微博热搜{}导语阅读量和讨论量", topic);

    // Only the first query parameter (the containerid) of the topic URL is forwarded.
    String topicUrl = document.getString("url");
    int queryStart = topicUrl.indexOf("?") + 1;
    int queryEnd = topicUrl.indexOf("&");
    // Fix: a URL without '&' used to make substring() throw; use the rest of the URL instead.
    String query = queryEnd >= queryStart
            ? topicUrl.substring(queryStart, queryEnd)
            : topicUrl.substring(queryStart);
    String url = "https://m.weibo.cn/api/container/getIndex?" + query;

    String htmlBody = null;
    Request request = RequestUtils.wrapGet(url);
    for (int count = 0; count <= 2; count++) {
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博热搜详情页面时出现连接失败", e);
        }
        if (htmlBody == null || !htmlBody.contains("data")) {
            continue;
        }
        JSONObject dataJson = JSONObject.parseObject(htmlBody).getJSONObject("data");
        if (dataJson == null) {
            // Fix: "data" may appear in the body without being a usable object; retry.
            continue;
        }

        // cardlistInfo carries the topic lead plus the read and discussion counts.
        JSONObject cardlistInfoJson = dataJson.getJSONObject("cardlistInfo");
        if (cardlistInfoJson != null) {
            if (cardlistInfoJson.containsKey("desc")) {
                String topicLead = cardlistInfoJson.getString("desc");
                if (!"".equals(topicLead)) {
                    document.put("topicLead", topicLead);
                }
            }
            if (cardlistInfoJson.containsKey("cardlist_head_cards")) {
                JSONArray headCards = cardlistInfoJson.getJSONArray("cardlist_head_cards");
                JSONObject readJson = (headCards == null || headCards.isEmpty()) ? null : headCards.getJSONObject(0);
                JSONObject headData = readJson == null ? null : readJson.getJSONObject("head_data");
                if (headData != null) {
                    String midText = headData.getString("midtext");
                    if (midText != null) {
                        // midtext holds "阅读<reads> 讨论<discussions> 详情…"; cut out both numbers.
                        String read = midText.replaceAll("阅读", "").replaceAll("讨论.*", "").trim();
                        String discussCount = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim();
                        document.put("readCount", TipsUtils.getHotCount(read));
                        document.put("discussCount", TipsUtils.getHotCount(discussCount));
                    }
                    document.put("pictureUrl", headData.getString("portrait_url"));
                    if (headData.containsKey("downtext")) {
                        String downtext = headData.getString("downtext");
                        if (downtext != null && !"".equals(downtext)) {
                            document.put("downtext", downtext.replaceAll("主持人:", ""));
                        }
                    }
                }
            }
        }

        WeiBoMassageDao weiBoMassageDao = new WeiBoMassageDao();
        // Fix: read the cards as a JSONArray instead of an unchecked List cast, and tolerate
        // a response without a "cards" array.
        JSONArray cardsJsons = dataJson.getJSONArray("cards");
        int cardTotal = cardsJsons == null ? 0 : cardsJsons.size();
        for (int i = 0; i < cardTotal; i++) {
            JSONObject jsonObject = cardsJsons.getJSONObject(i);
            if (nonNull(jsonObject) && !jsonObject.isEmpty()) {
                if (jsonObject.containsKey("mblog")) {
                    // A post card is stored only when its mblog carries a "title".
                    if (jsonObject.getJSONObject("mblog").containsKey("title")) {
                        WeiBoMassage weiBoMassage = analysisWeiboMBlog(jsonObject, topic);
                        if (Objects.nonNull(weiBoMassage)) {
                            weiBoMassageDao.addWeiBoMassage(weiBoMassage);
                        }
                    }
                } else if (jsonObject.containsKey("card_group")) {
                    // A grouped card may hold one titled post and any number of user cards.
                    JSONArray cardGroup = jsonObject.getJSONArray("card_group");
                    WeiBoMassage weiBoMassage = analysisWeiboMassage(cardGroup, topic);
                    if (Objects.nonNull(weiBoMassage)) {
                        weiBoMassageDao.addWeiBoMassage(weiBoMassage);
                    }
                    analysisWeiBoUsers(cardGroup, topic);
                }
            } else {
                log.info("获取数据失败");
            }
        }
        // The page was processed; no further attempts are needed.
        return;
    }
}
/**
* 微博热搜数据更新导语,阅读量,讨论量
*
* @param document
* @return
*/
public static org.bson.Document weiboUpdate(org.bson.Document document) {
log.info("更新微博热搜{}导语阅读量和讨论量",document.getString("name"));
String url = "https://m.weibo.cn/api/container/getIndex?"+ document.getString("url").substring(
document.getString("url").indexOf("?")+1,document.getString("url").indexOf("&"));
log.info("更新微博热搜{}导语阅读量和讨论量", document.getString("name"));
String url = "https://m.weibo.cn/api/container/getIndex?" + document.getString("url").substring(
document.getString("url").indexOf("?") + 1, document.getString("url").indexOf("&"));
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
for(int count =0; count<=5; count++) {
for (int count = 0; count <= 2; count++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
......@@ -104,13 +208,13 @@ public class WeiboHotSearchTest{
JSONObject cardlistInfoJson = dataJson.getJSONObject("cardlistInfo");
List<JSONObject> cardsJsons = (List<JSONObject>) dataJson.get("cards");
//解析cardlistInfo,讨论、导语、阅读
if(cardlistInfoJson.containsKey("desc")){
if (cardlistInfoJson.containsKey("desc")) {
String topicLead = cardlistInfoJson.getString("desc");
if(!"".equals(topicLead)) {
if (!"".equals(topicLead)) {
document.put("topicLead", topicLead);
}
}
if(cardlistInfoJson.containsKey("cardlist_head_cards")){
if (cardlistInfoJson.containsKey("cardlist_head_cards")) {
JSONObject readJson = cardlistInfoJson.getJSONArray("cardlist_head_cards").getJSONObject(0);
if (readJson.containsKey("head_data")) {
String midText = readJson.getJSONObject("head_data").getString("midtext");
......@@ -119,63 +223,304 @@ public class WeiboHotSearchTest{
String pictureUrl = readJson.getJSONObject("head_data").getString("portrait_url");
document.put("readCount", TipsUtils.getHotCount(read));
document.put("discussCount", TipsUtils.getHotCount(discussCount));
document.put("pictureUrl",pictureUrl);
if (readJson.getJSONObject("head_data").containsKey("downtext")){
document.put("pictureUrl", pictureUrl);
if (readJson.getJSONObject("head_data").containsKey("downtext")) {
String downtext = readJson.getJSONObject("head_data").getString("downtext");
if(!"".equals(downtext)) {
document.put("downtext",downtext.replaceAll("主持人:",""));
if (!"".equals(downtext)) {
document.put("downtext", downtext.replaceAll("主持人:", ""));
}
}
}
}
//调用weiBoMassageDao添加数据
WeiBoMassageDao weiBoMassageDao = new WeiBoMassageDao();
//解析cards,获取热门微博、人物
for (JSONObject jsonObject : cardsJsons) {
if (nonNull(jsonObject) && !jsonObject.isEmpty()) {
if (jsonObject.containsKey("mblog")) {
if (jsonObject.getJSONObject("mblog").containsKey("title")) {
WeiBoMassage weiBoMassage = analysisWeiboMBlog(jsonObject, document.getString("name"));
if (Objects.nonNull(weiBoMassage)) {
weiBoMassageDao.addWeiBoMassage(weiBoMassage);
}
}
} else if (jsonObject.containsKey("card_group")) {
JSONArray cardGroup = jsonObject.getJSONArray("card_group");
WeiBoMassage weiBoMassage = analysisWeiboMassage(cardGroup, document.getString("name"));
if (Objects.nonNull(weiBoMassage)) {
weiBoMassageDao.addWeiBoMassage(weiBoMassage);
}
analysisWeiBoUsers(cardGroup, document.getString("name"));
}
} else {
log.info("获取数据失败");
}
}
return document;
}
}
return null;
}
/**
 * Finds the first card in the group that holds a titled post and parses it.
 *
 * @param cardGroup the {@code card_group} array of a result card
 * @param topic     hot-search topic the post was found under
 * @return the parsed post, or {@code null} when no card has an {@code mblog} with a
 *         {@code title}
 */
public static WeiBoMassage analysisWeiboMassage(JSONArray cardGroup, String topic) {
    int position = 0;
    while (position < cardGroup.size()) {
        JSONObject card = cardGroup.getJSONObject(position);
        position++;
        if (!card.containsKey("mblog")) {
            continue;
        }
        if (!card.getJSONObject("mblog").containsKey("title")) {
            continue;
        }
        // Only the first titled post of the group is parsed.
        return analysisWeiboMBlog(card, topic);
    }
    return null;
}
/**
 * Extracts the users referenced by a card group and stores each of them through
 * {@code WeiBoUserDao}.
 *
 * <p>Cards with {@code card_type} 3 are read for a {@code users} array and cards with
 * {@code card_type} 10 for a single {@code user} object; all other cards are ignored.
 *
 * @param cardGroup the {@code card_group} array of a result card
 * @param topic     hot-search topic the users were found under
 */
public static void analysisWeiBoUsers(JSONArray cardGroup, String topic) {
    WeiBoUserDao weiBoUserDao = new WeiBoUserDao();
    Date date = new Date();
    for (int i = 0; i < cardGroup.size(); i++) {
        JSONObject card = cardGroup.getJSONObject(i);
        if (card == null) {
            continue;
        }
        // Fix: getIntValue yields 0 for a missing card_type instead of failing on null.
        int cardType = card.getIntValue("card_type");
        if (cardType == 3) {
            JSONArray users = card.getJSONArray("users");
            if (users == null) {
                continue;
            }
            for (int j = 0; j < users.size(); j++) {
                storeWeiBoUser(weiBoUserDao, users.getJSONObject(j), topic, date);
            }
        } else if (cardType == 10) {
            if (card.containsKey("user")) {
                storeWeiBoUser(weiBoUserDao, card.getJSONObject("user"), topic, date);
            }
        }
    }
}

/** Builds a {@code WeiBoUser} from one user JSON object and hands it to the DAO. */
private static void storeWeiBoUser(WeiBoUserDao weiBoUserDao, JSONObject user, String topic, Date date) {
    if (user == null) {
        log.info("未采集到用户信息");
        return;
    }
    String userId = user.getString("id");
    String userName = user.getString("screen_name");
    // Verification text of the account; may be absent.
    String attestationMassage = user.getString("verified_reason");
    Long followerCount = parseFollowerCount(user.getString("followers_count"));
    weiBoUserDao.addWeiBoUser(new WeiBoUser(userId, attestationMassage, userName, topic, date, followerCount));
}

/**
 * Parses a follower count that is either a plain number or a number followed by the unit
 * "万" (x10,000) or "亿" (x100,000,000).
 *
 * <p>Fix: values with a decimal part such as "1.2万" used to fail in {@code Long.valueOf};
 * they are now scaled through {@code BigDecimal}. NOTE(review): the "亿" unit is assumed to
 * occur in this field - confirm against real responses.
 *
 * @return the count, or {@code null} when the text is missing or blank
 * @throws NumberFormatException when the numeric part is not a number
 */
private static Long parseFollowerCount(String text) {
    if (text == null || text.trim().isEmpty()) {
        return null;
    }
    String value = text.trim();
    long unit = 1L;
    int unitAt = value.indexOf("万");
    if (unitAt >= 0) {
        unit = 10000L;
    } else {
        unitAt = value.indexOf("亿");
        if (unitAt >= 0) {
            unit = 100000000L;
        }
    }
    String digits = unitAt >= 0 ? value.substring(0, unitAt).trim() : value;
    return new java.math.BigDecimal(digits).multiply(java.math.BigDecimal.valueOf(unit)).longValue();
}
public JSONObject analysisWeiboSon(JSONObject readJson){
/**
 * Parses one post card into a {@code WeiBoMassage}.
 *
 * <p>Reads the post's type label, counters, timestamps, author, source and text from the
 * card's {@code mblog}. When the post is a repost ({@code retweeted_status} present), the
 * original post's ids, author, source and text are copied to the {@code root_*} fields and
 * {@code forward} is set to 1; pictures and play count are then taken from the original post.
 *
 * @param jsonObject card object; its {@code mblog} must contain a {@code title}
 * @param topic      hot-search topic the post was found under
 * @return the parsed post
 */
public static WeiBoMassage analysisWeiboMBlog(JSONObject jsonObject, String topic) {
    JSONObject mblog = jsonObject.getJSONObject("mblog");
    String type = mblog.getJSONObject("title").getString("text");
    // Fix: getInteger returns null for a missing field instead of failing in Integer.valueOf.
    Integer cardType = jsonObject.getInteger("card_type");
    Integer showType = jsonObject.getInteger("show_type");

    // Like, comment and repost counters; each may use the "万" unit.
    Long attitudeCount = parseWeiboCount(mblog.getString("attitudes_count"));
    Long commentCount = parseWeiboCount(mblog.getString("comments_count"));
    Long repostCount = parseWeiboCount(mblog.getString("reposts_count"));

    Date createTime = null;
    Date editTime = null;
    try {
        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("EEE MMM dd HH:mm:ss z yyyy", java.util.Locale.US);
        String createdAt = mblog.getString("created_at");
        if (createdAt != null) {
            createTime = simpleDateFormat.parse(createdAt);
        }
        String editAt = mblog.getString("edit_at");
        if (editAt != null) {
            editTime = simpleDateFormat.parse(editAt);
        }
    } catch (ParseException e) {
        log.error("创建时间和编辑时间解析异常", e);
    }

    String mid = mblog.getString("mid");
    JSONObject author = mblog.getJSONObject("user");
    String userId = author == null ? null : author.getString("id");
    String userName = author == null ? null : author.getString("screen_name");
    String source = mblog.getString("source");

    // Post text; markup is stripped only when the text contains a tag.
    String content = mblog.getString("text");
    if (content != null && content.contains("<")) {
        content = Jsoup.parse(content).text();
    }

    // Fix: a stray "return null;" stood here and made everything below unreachable.
    WeiBoMassage weiBoMassage = new WeiBoMassage(userId, content, userName, mid, createTime, editTime, cardType, showType,
            repostCount, commentCount, attitudeCount, source, type, topic);
    // forward = 0 marks an original post, forward = 1 a repost.
    weiBoMassage.setForward(0);

    JSONObject weiboJson = mblog;
    JSONObject retweeted = mblog.getJSONObject("retweeted_status");
    if (mblog.containsKey("retweeted_status") && retweeted != null) {
        weiboJson = retweeted;
        String rootText = weiboJson.getString("text");
        // Fix: the original post may come without a "user" object or text; guard both.
        JSONObject rootUser = weiboJson.getJSONObject("user");
        weiBoMassage.setRoot_mid(weiboJson.getString("mid"));
        weiBoMassage.setRoot_id(rootUser == null ? null : rootUser.getString("id"));
        weiBoMassage.setRoot_source(weiboJson.getString("source"));
        weiBoMassage.setRoot_text(rootText == null ? null : Jsoup.parse(rootText).text());
        weiBoMassage.setRoot_name(rootUser == null ? null : rootUser.getString("screen_name"));
        weiBoMassage.setForward(1);
    }

    List<String> pictureUrlList = new ArrayList<>();
    Long playCount = null;
    // A post carries either pictures or a page_info block that may hold a play count.
    JSONArray picIds = weiboJson.getJSONArray("pic_ids");
    if (picIds != null && !picIds.isEmpty()) {
        JSONArray pics = weiboJson.getJSONArray("pics");
        int picTotal = pics == null ? 0 : pics.size();
        for (int i = 0; i < picTotal; i++) {
            pictureUrlList.add(pics.getJSONObject(i).getString("url"));
        }
    } else if (weiboJson.containsKey("page_info")) {
        JSONObject pageInfo = weiboJson.getJSONObject("page_info");
        String play = pageInfo == null ? null : pageInfo.getString("play_count");
        // As before, a play count is parsed only when it carries a unit or the "次" suffix.
        if (play != null && (play.contains("万") || play.contains("亿") || play.contains("次"))) {
            playCount = parseWeiboCount(play);
        }
    }
    weiBoMassage.setPlayCount(playCount);
    weiBoMassage.setPictureUrlList(pictureUrlList);
    return weiBoMassage;
}

/**
 * Parses a counter that is a plain number, a number followed by the unit "万" (x10,000) or
 * "亿" (x100,000,000), or a number followed by the suffix "次".
 *
 * <p>Fix: values with a decimal part such as "1.2万" used to fail in {@code Long.valueOf};
 * they are now scaled through {@code BigDecimal}. NOTE(review): the "亿" unit is assumed to
 * occur in these fields - confirm against real responses.
 *
 * @return the count, or {@code null} when the text is missing or has no numeric part
 * @throws NumberFormatException when the numeric part is not a number
 */
private static Long parseWeiboCount(String text) {
    if (text == null) {
        return null;
    }
    String value = text.trim();
    long unit = 1L;
    int end = value.length();
    if (value.indexOf("万") >= 0) {
        unit = 10000L;
        end = value.indexOf("万");
    } else if (value.indexOf("亿") >= 0) {
        unit = 100000000L;
        end = value.indexOf("亿");
    } else if (value.indexOf("次") >= 0) {
        end = value.indexOf("次");
    }
    String digits = value.substring(0, end).trim();
    if (digits.isEmpty()) {
        return null;
    }
    return new java.math.BigDecimal(digits).multiply(java.math.BigDecimal.valueOf(unit)).longValue();
}
/**
* @return void 返回类型
* @Title: weiboHotSearchByPhoneTest
* @author hero
* @Description: TODO(手机端Iphone 微博热搜采集)
* @return void 返回类型
*/
public static List<HotSearchList> weiboHotSearchByPhone(Date date){
public static List<HotSearchList> weiboHotSearchByPhone(Date date) {
String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583";
Map<String,String> headerMap = new HashMap<>();
Map<String, String> headerMap = new HashMap<>();
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
String htmlBody = null;
Request request = RequestUtils.wrapGet(url, headerMap);
for(int count =0; count<=5; count++){
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
for (int count = 0; count <= 5; count++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error("解析微博时热搜时出现连接失败",e);
log.error("解析微博时热搜时出现连接失败", e);
}
List<HotSearchList> result = new ArrayList<HotSearchList>();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
......@@ -187,11 +532,11 @@ public class WeiboHotSearchTest{
try {
JSONObject card = cards.getJSONObject(0);
JSONArray cardGroup = card.getJSONArray("card_group");
JSONObject topCard =cardGroup.getJSONObject(0);
if(!topCard.containsKey("pic")){
JSONObject topCard = cardGroup.getJSONObject(0);
if (!topCard.containsKey("pic")) {
rank = 1;
}
if (Objects.nonNull(cardGroup) && !cardGroup.isEmpty()) {
if (nonNull(cardGroup) && !cardGroup.isEmpty()) {
// String title = card.getString("title");
boolean hot = true;
// if (Objects.nonNull(title) && title.contains("实时上升热点")) {
......@@ -232,4 +577,5 @@ public class WeiboHotSearchTest{
return Collections.emptyList();
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment