Commit cb6bcd76 by zhiwei

添加微博话题采集,并添加lombok

parent a9966f9d
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>searchhotcrawler</artifactId>
<version>0.0.6-SNAPSHOT</version>
<name>各平台热搜榜单采集程序</name>
<description>各平台热搜榜单采集程序
目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<developers>
<developer>
<id>Bewilder</id>
<name>zhiwei zhang</name>
<email>zhangzhiwei@zhiweidata.com</email>
</developer>
</developers>
<dependencies>
<!-- 数据解析jar -->
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>3.6.3</version>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>sendmail</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.5-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.5.5.6-SNAPSHOT</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.2</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>com.zhiwei.searchhotcrawler.run.HotSearchRun</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-source-plugin</artifactId>
<version>2.4</version>
<configuration>
<attach>true</attach>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- 解决maven test命令时console出现中文乱码乱码 -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<configuration>
<forkMode>once</forkMode>
<argLine>-Dfile.encoding=UTF-8</argLine>
<skipTests>true</skipTests>
</configuration>
</plugin>
</plugins>
</build>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>searchhotcrawler</artifactId>
<version>0.0.6-SNAPSHOT</version>
<name>各平台热搜榜单采集程序</name>
<description>各平台热搜榜单采集程序
目前包含:1.微博实时热搜采集程序、2.知乎热搜采集程序</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<developers>
<developer>
<id>Bewilder</id>
<name>zhiwei zhang</name>
<email>zhangzhiwei@zhiweidata.com</email>
</developer>
</developers>
<dependencies>
<!-- 数据解析jar -->
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>3.6.3</version>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>sendmail</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.6-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.6.0.4-RELEASE</version>
</dependency>
<!-- Lombok is an annotation processor that is only needed while compiling; the "provided" scope keeps it off the runtime classpath and out of the shaded jar -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.8</version>
<scope>provided</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.2</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>com.zhiwei.searchhotcrawler.run.HotSearchRun</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-source-plugin</artifactId>
<version>2.4</version>
<configuration>
<attach>true</attach>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Force UTF-8 in the forked test JVM so Chinese console output of "mvn test" is not garbled -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<configuration>
<!-- forkCount=1 with reuseForks=true is the documented replacement for the deprecated forkMode=once -->
<forkCount>1</forkCount>
<reuseForks>true</reuseForks>
<argLine>-Dfile.encoding=UTF-8</argLine>
<skipTests>true</skipTests>
</configuration>
</plugin>
</plugins>
</build>
</project>
\ No newline at end of file
......@@ -10,36 +10,80 @@ import java.io.Serializable;
import java.util.Date;
import com.zhiwei.tools.timeparse.TimeParse;
import lombok.Data;
import lombok.ToString;
@Data
@ToString
public class HotSearchList implements Serializable{
private static final long serialVersionUID = 2076919584659821600L;
private String id; //主键
private String url; //消息链接
private String name; //热搜关键词
private Integer count; //时时热搜量
private Boolean hot; //状态(true 为热搜; false为时时上升)
private String day; //天
private Date time; //时间
private Integer changeCount; //据上分钟变化量
private Integer rank; //排名
private String type; //分类
private String icon; //热搜类型
/**
* 主键
*/
private String id;
/**
* 消息链接
*/
private String url;
/**
* 热搜关键词
*/
private String name;
/**
* 热搜或话题导语
*/
private String topicLead;
/**
* 时时热搜量
*/
private Integer count;
/**
* 状态(true 为热搜; false为时时上升)
*/
private Boolean hot;
/**
* 天
*/
private String day;
/**
* 时间
*/
private Date time;
/**
* 据上分钟变化量
*/
private Integer changeCount;
/**
* 排名
*/
private Integer rank;
/**
* 分类
*/
private String type;
/**
* 热搜类型
*/
private String icon;
/**
* 话题讨论量
*/
private Integer commentCount;
public HotSearchList(){}
public HotSearchList(String url, String name, Integer count,Boolean hot,Integer rank,String type,String icon){
......@@ -67,122 +111,20 @@ public class HotSearchList implements Serializable{
this.day = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
this.type = type;
}
@Override
public String toString(){
return "new HotSearchList["
+ "id = " + id
+ ", url = " + url
+ ", name = " + name
+ ", count = " + count
+ ", time = " + time
+ ", hot = " + hot
+ ", rank = " + rank
+ ", day = " + day
+ ", changeCount = " + changeCount
+ ", type = " + type
+ ", icon = " + icon
+ "]";
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
public HotSearchList(String url, String name, Integer count,Integer rank,String type, Integer commentCount, String topicLead){
this.id = name + "_" + new Date().getTime();
this.url = url;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public Integer getCount() {
return count;
}
public void setCount(Integer count) {
this.count = count;
}
public Date getTime() {
return time;
}
public void setTime(Date time) {
this.time = time;
}
public Integer getChangeCount() {
return changeCount;
}
public void setChangeCount(Integer changeCount) {
this.changeCount = changeCount;
}
public static long getSerialversionuid() {
return serialVersionUID;
}
public Boolean isHot() {
return hot;
}
public void setHot(Boolean hot) {
this.hot = hot;
}
public Boolean getHot() {
return hot;
}
public String getIcon() {
return icon;
}
public void setIcon(String icon) {
this.icon = icon;
}
public String getDay() {
return day;
}
public void setDay(String day) {
this.day = day;
}
public Integer getRank() {
return rank;
}
public void setRank(Integer rank) {
this.hot = true;
this.rank = rank;
}
public String getType() {
return type;
}
public void setType(String type) {
this.time = new Date();
this.day = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
this.type = type;
this.commentCount = commentCount;
this.topicLead = topicLead;
}
}
package com.zhiwei.searchhotcrawler.bean;
/**
 * Labels for the ranking sources handled by the crawlers.
 *
 * <p>The constant names are the Chinese platform labels themselves; the crawlers pass
 * {@code name()} as the {@code type} of the records they create, so renaming a constant
 * changes the stored type string.
 */
public enum HotSearchType {
    /** Baidu hot search. */
    百度热搜,
    /** Weibo hot search. */
    微博热搜,
    /** Zhihu hot search. */
    知乎热搜,
    /** Douyin hot search. */
    抖音热搜,
    /** Sogou WeChat hot search. */
    搜狗微信热搜;
}
package com.zhiwei.searchhotcrawler.bean;
/**
 * Labels for the ranking sources handled by the crawlers.
 *
 * <p>The constant names are the Chinese platform labels themselves; the crawlers pass
 * {@code name()} as the {@code type} of the records they create, so renaming a constant
 * changes the stored type string.
 */
public enum HotSearchType {
    /** Baidu hot search. */
    百度热搜,
    /** Weibo hot search. */
    微博热搜,
    /** Zhihu hot search. */
    知乎热搜,
    /** Douyin hot search. */
    抖音热搜,
    /** Sogou WeChat hot search. */
    搜狗微信热搜,
    /** Weibo topics. */
    微博话题;
}
package com.zhiwei.searchhotcrawler.bean;
import java.util.Date;
import com.zhiwei.tools.timeparse.TimeParse;
import lombok.Data;
import lombok.ToString;
/**
 * Bean describing one entry of a Weibo topic ranking.
 *
 * <p>Accessors are generated by Lombok ({@code @Data}); several fields are additionally
 * declared {@code public} and can be accessed directly.
 *
 * @author Bewilder ZW
 */
@Data
@ToString
public class WeiboSuperTopic {

    /** Primary key; the six-argument constructor builds it as {@code topicName_type_day}. */
    private String id;

    /** Topic link. */
    public String url;

    /** Topic name. */
    public String topicName;

    /** Position of the topic in the ranking. */
    public Integer rank;

    /** Topic influence score. */
    public String score;

    /** Number of topic followers ("fensi"). */
    public String fensi;

    /** Number of reads of the topic. */
    public String readNum;

    /** Number of posts in the topic. */
    public String postNum;

    /** Ranking list type. */
    public String type;

    /** Day of creation, formatted as {@code yyyy-MM-dd}. */
    private String day;

    /** Creation timestamp. */
    private Date time;

    /** Creates an empty instance; all fields stay {@code null}. */
    public WeiboSuperTopic() {}

    /**
     * Creates an entry stamped with the current time.
     *
     * <p>{@code readNum} and {@code postNum} are not set here and stay {@code null}.
     *
     * @param url       topic link
     * @param topicName topic name
     * @param rank      position in the ranking
     * @param score     topic influence score
     * @param fensi     number of followers
     * @param type      ranking list type
     */
    public WeiboSuperTopic(String url, String topicName, Integer rank, String score,
            String fensi, String type) {
        this.url = url;
        this.topicName = topicName;
        this.rank = rank;
        this.score = score;
        this.fensi = fensi;
        this.type = type;
        // Read the clock only once: deriving day from time guarantees that day (and the id
        // built from it) can never disagree with time when construction straddles midnight.
        this.time = new Date();
        this.day = TimeParse.dateFormartString(this.time, "yyyy-MM-dd");
        this.id = topicName + "_" + type + "_" + this.day;
    }
}
package com.zhiwei.searchhotcrawler.bean;
import java.util.Date;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * Plain bean describing one entry of a Weibo topic ranking.
 *
 * <p>Several fields are declared {@code public} in addition to having accessors.
 *
 * @author Bewilder ZW
 */
public class WeiboTopic {

    /** Primary key; the six-argument constructor builds it as {@code topicName_type_day}. */
    private String id;
    /** Topic link. */
    public String url;
    /** Topic name. */
    public String topicName;
    /** Position of the topic in the ranking. */
    public Integer rank;
    /** Topic influence score. */
    public String score;
    /** Number of topic followers ("fensi"). */
    public String fensi;
    /** Number of reads of the topic. */
    public String readNum;
    /** Number of posts in the topic. */
    public String postNum;
    /** Ranking list type. */
    public String type;
    /** Day of creation, formatted as {@code yyyy-MM-dd}. */
    private String day;
    /** Creation timestamp. */
    private Date time;

    /** Creates an empty instance; all fields stay {@code null}. */
    public WeiboTopic() {}

    /**
     * Creates an entry stamped with the current time.
     *
     * <p>{@code readNum} and {@code postNum} are not set here and stay {@code null}.
     *
     * @param url       topic link
     * @param topicName topic name
     * @param rank      position in the ranking
     * @param score     topic influence score
     * @param fensi     number of followers
     * @param type      ranking list type
     */
    public WeiboTopic(String url, String topicName, Integer rank, String score,
            String fensi, String type) {
        this.url = url;
        this.topicName = topicName;
        this.rank = rank;
        this.score = score;
        this.fensi = fensi;
        this.type = type;
        // Read the clock only once: deriving day from time guarantees that day (and the id
        // built from it) can never disagree with time when construction straddles midnight.
        this.time = new Date();
        this.day = TimeParse.dateFormartString(this.time, "yyyy-MM-dd");
        this.id = topicName + "_" + type + "_" + this.day;
    }

    /**
     * Renders the ranking-related fields; {@code id}, {@code day} and {@code time} are
     * not part of the output.
     */
    @Override
    public String toString() {
        return "new WeiboTopic["
                + "topicName = " + topicName
                + ", rank = " + rank
                + ", score = " + score
                + ", fensi = " + fensi
                + ", type = " + type
                + ", readNum = " + readNum
                + ", postNum = " + postNum
                + ", url = " + url
                + "]";
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTopicName() {
        return topicName;
    }

    public void setTopicName(String topicName) {
        this.topicName = topicName;
    }

    public Integer getRank() {
        return rank;
    }

    public void setRank(Integer rank) {
        this.rank = rank;
    }

    public String getScore() {
        return score;
    }

    public void setScore(String score) {
        this.score = score;
    }

    public String getFensi() {
        return fensi;
    }

    public void setFensi(String fensi) {
        this.fensi = fensi;
    }

    public String getReadNum() {
        return readNum;
    }

    public void setReadNum(String readNum) {
        this.readNum = readNum;
    }

    public String getPostNum() {
        return postNum;
    }

    public void setPostNum(String postNum) {
        this.postNum = postNum;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getDay() {
        return day;
    }

    public void setDay(String day) {
        this.day = day;
    }

    public Date getTime() {
        return time;
    }

    public void setTime(Date time) {
        this.time = time;
    }
}
package com.zhiwei.searchhotcrawler.cache;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Background sweeper that removes timed-out entries from {@code CacheManager}.
 */
public class CacheListener {

    Logger logger = LoggerFactory.getLogger(CacheListener.class);

    /**
     * Starts a new thread that loops forever: each pass clears every key that
     * {@code CacheManager.isTimeOut} reports as expired, then pauses via
     * {@code ZhiWeiTools.sleep(500)} (presumably milliseconds; confirm against ZhiWeiTools).
     * Every call starts an additional thread, and the loop has no exit condition.
     */
    public void startListen() {
        Runnable sweeper = () -> {
            for (;;) {
                boolean hasEntries = CacheManager.caches != null && CacheManager.caches.size() > 0;
                if (hasEntries) {
                    for (String cacheKey : CacheManager.getAllKeys()) {
                        if (!CacheManager.isTimeOut(cacheKey)) {
                            continue;
                        }
                        CacheManager.clearByKey(cacheKey);
                        logger.info(cacheKey + "缓存被清除");
                    }
                }
                ZhiWeiTools.sleep(500);
            }
        };
        new Thread(sweeper).start();
    }
}
package com.zhiwei.searchhotcrawler.cache;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Background sweeper that removes timed-out entries from {@code CacheManager}.
 */
@Log4j2
public class CacheListener {

    /**
     * Starts the cache listener: a new thread that loops forever, clearing on each pass
     * every key that {@code CacheManager.isTimeOut} reports as expired and then pausing via
     * {@code ZhiWeiTools.sleep(500)} (presumably milliseconds; confirm against ZhiWeiTools).
     * Every call starts an additional thread, and the loop has no exit condition.
     */
    public void startListen() {
        Runnable sweeper = () -> {
            for (;;) {
                boolean hasEntries = CacheManager.caches != null && CacheManager.caches.size() > 0;
                if (hasEntries) {
                    for (String cacheKey : CacheManager.getAllKeys()) {
                        if (!CacheManager.isTimeOut(cacheKey)) {
                            continue;
                        }
                        CacheManager.clearByKey(cacheKey);
                        log.info(cacheKey + "缓存被清除");
                    }
                }
                ZhiWeiTools.sleep(500);
            }
        };
        new Thread(sweeper).start();
    }
}
package com.zhiwei.searchhotcrawler.crawler;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
/**
* @ClassName:BaiDuHotSearch
* @Description: TODO(百度风云榜热搜采集)
* @author hero
* @date 2019年7月10日 上午10:54:31
*/
public class BaiDuHotSearchCrawler {
private static Logger logger = LoggerFactory.getLogger(BaiDuHotSearchCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
 * Fetches the Baidu top-buzz page (PC site) through the heavy NAT proxy and turns it
 * into hot-search records.
 *
 * @return the records produced by {@code ansysData}; an empty list when the request or
 *         parsing throws, or when the body does not contain the {@code mainBody} marker
 * @author hero
 */
public static List<HotSearchList> baiduHotSearch() {
    final String url = "http://top.baidu.com/buzz?b=1&fr=topindex";
    try {
        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
        boolean looksLikeRanking = htmlBody != null && htmlBody.contains("mainBody");
        if (!looksLikeRanking) {
            logger.info("解析百度风云榜时出现解析错误,页面结构有问题");
            return Collections.emptyList();
        }
        return ansysData(htmlBody);
    } catch (Exception e) {
        logger.error("解析百度风云榜时出现解析错误,页面结构有问题", e);
        return Collections.emptyList();
    }
}
/**
 * Parses the ranking table of the Baidu top-buzz page.
 *
 * <p>Each {@code tr} of {@code table.list-table} is handled on its own: a row that throws
 * is logged and skipped, and a row without a rank badge is ignored.
 *
 * @param htmlBody HTML of the ranking page
 * @return the parsed records, possibly empty; never {@code null}
 */
private static List<HotSearchList> ansysData(String htmlBody){
    List<HotSearchList> parsed = new ArrayList<>();
    try {
        Elements rows = Jsoup.parse(htmlBody).select("table.list-table").select("tr");
        if (Objects.isNull(rows) || rows.isEmpty()) {
            return parsed;
        }
        rows.forEach(row -> {
            try {
                // Rank: shown in a "num-top" badge or else in a "num-normal" badge
                Elements rankCell = row.select("td.first");
                String rankStr = null;
                Elements topBadge = rankCell.select("span.num-top");
                if (!topBadge.isEmpty()) {
                    rankStr = topBadge.text();
                } else {
                    Elements normalBadge = rankCell.select("span.num-normal");
                    if (!normalBadge.isEmpty()) {
                        rankStr = normalBadge.text();
                    }
                }
                Integer rank = StringUtils.isNoneBlank(rankStr) ? Integer.valueOf(rankStr) : null;

                // Keyword and the link behind it
                Elements titleLink = row.select("td.keyword").select("a.list-title");
                String kw = titleLink.text();
                String everurl = titleLink.attr("href");

                // Search index: carried by a falling or else a rising trend icon
                Elements heatCell = row.select("td.last");
                String hot = null;
                Elements falling = heatCell.select("span.icon-fall");
                if (!falling.isEmpty()) {
                    hot = falling.text();
                } else {
                    Elements rising = heatCell.select("span.icon-rise");
                    if (!rising.isEmpty()) {
                        hot = rising.text();
                    }
                }
                int count = StringUtils.isNotBlank(hot) ? Integer.valueOf(hot) : 0;

                // Rows without a rank produce no record
                if (rank != null) {
                    parsed.add(new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name()));
                }
            } catch (Exception e) {
                logger.error("解析百度风云榜时出现解析错误", e);
            }
        });
    } catch (Exception e) {
        logger.error("解析百度风云榜时出现解析错误,数据不是json结构", e);
    }
    return parsed;
}
package com.zhiwei.searchhotcrawler.crawler;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
/**
 * Crawler for the Baidu top-buzz hot-search ranking.
 *
 * @author hero
 */
@Log4j2
public class BaiDuHotSearchCrawler {

    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /**
     * Fetches the Baidu top-buzz page (PC site) through the heavy NAT proxy and turns it
     * into hot-search records.
     *
     * @return the records produced by {@code ansysData}; an empty list when the request or
     *         parsing throws, or when the body does not contain the {@code mainBody} marker
     * @author hero
     */
    public static List<HotSearchList> baiduHotSearch() {
        final String url = "http://top.baidu.com/buzz?b=1&fr=topindex";
        try {
            String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
            boolean looksLikeRanking = htmlBody != null && htmlBody.contains("mainBody");
            if (!looksLikeRanking) {
                log.info("解析百度风云榜时出现解析错误,页面结构有问题");
                return Collections.emptyList();
            }
            return ansysData(htmlBody);
        } catch (Exception e) {
            log.error("解析百度风云榜时出现解析错误,页面结构有问题", e);
            return Collections.emptyList();
        }
    }

    /**
     * Parses the ranking table of the Baidu top-buzz page.
     *
     * <p>Each {@code tr} of {@code table.list-table} is handled on its own: a row that
     * throws is logged and skipped, and a row without a rank badge is ignored.
     *
     * @param htmlBody HTML of the ranking page
     * @return the parsed records, possibly empty; never {@code null}
     */
    private static List<HotSearchList> ansysData(String htmlBody){
        List<HotSearchList> parsed = new ArrayList<>();
        try {
            Elements rows = Jsoup.parse(htmlBody).select("table.list-table").select("tr");
            if (Objects.isNull(rows) || rows.isEmpty()) {
                return parsed;
            }
            rows.forEach(row -> {
                try {
                    // Rank: shown in a "num-top" badge or else in a "num-normal" badge
                    Elements rankCell = row.select("td.first");
                    String rankStr = null;
                    Elements topBadge = rankCell.select("span.num-top");
                    if (!topBadge.isEmpty()) {
                        rankStr = topBadge.text();
                    } else {
                        Elements normalBadge = rankCell.select("span.num-normal");
                        if (!normalBadge.isEmpty()) {
                            rankStr = normalBadge.text();
                        }
                    }
                    Integer rank = StringUtils.isNoneBlank(rankStr) ? Integer.valueOf(rankStr) : null;

                    // Keyword and the link behind it
                    Elements titleLink = row.select("td.keyword").select("a.list-title");
                    String kw = titleLink.text();
                    String everurl = titleLink.attr("href");

                    // Search index: carried by a falling or else a rising trend icon
                    Elements heatCell = row.select("td.last");
                    String hot = null;
                    Elements falling = heatCell.select("span.icon-fall");
                    if (!falling.isEmpty()) {
                        hot = falling.text();
                    } else {
                        Elements rising = heatCell.select("span.icon-rise");
                        if (!rising.isEmpty()) {
                            hot = rising.text();
                        }
                    }
                    int count = StringUtils.isNotBlank(hot) ? Integer.valueOf(hot) : 0;

                    // Rows without a rank produce no record
                    if (rank != null) {
                        parsed.add(new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name()));
                    }
                } catch (Exception e) {
                    log.error("解析百度风云榜时出现解析错误", e);
                }
            });
        } catch (Exception e) {
            log.error("解析百度风云榜时出现解析错误,数据不是json结构", e);
        }
        return parsed;
    }
}
\ No newline at end of file
package com.zhiwei.searchhotcrawler.crawler;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
/**
 * Crawler for the Douyin hot-search ranking.
 *
 * @author win 10
 */
public class DouyinHotSearchCrawler {

    // Logger must be bound to this class; it was previously created for ZhihuHotSearchCrawler
    private static Logger logger = LoggerFactory.getLogger(DouyinHotSearchCrawler.class);

    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /**
     * Fetches the mobile Douyin hot-search list and converts every entry of
     * {@code data.word_list} into a record.
     *
     * <p>An entry that cannot be converted (for example a missing or non-numeric
     * {@code position} or {@code hot_value}) is logged and skipped instead of aborting the
     * whole list. Request and JSON failures are logged and do not propagate.
     *
     * @return the parsed records; {@code null} when the request failed or the response does
     *         not contain {@code word_list} (kept for callers that test for {@code null})
     * @author hero
     */
    public static List<HotSearchList> getMobileDouyinHotList(){
        List<HotSearchList> list = null;
        String url = "https://api.amemv.com/aweme/v1/hot/search/list/";
        try {
            String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
            if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("word_list")) {
                list = new ArrayList<>();
                JSONObject data = JSONObject.parseObject(htmlBody);
                JSONArray wordList = data.getJSONObject("data").getJSONArray("word_list");
                for (int i = 0; i < wordList.size(); i++) {
                    try {
                        JSONObject wl = wordList.getJSONObject(i);
                        // Rank of the entry
                        Integer position = Integer.valueOf(wl.getString("position"));
                        // Keyword
                        String word = wl.getString("word");
                        // Heat value
                        Integer hotValue = Integer.valueOf(wl.getString("hot_value"));
                        list.add(new HotSearchList(null, word, hotValue, position, HotSearchType.抖音热搜.name()));
                    } catch (RuntimeException e) {
                        // One malformed entry must not discard the remaining ones
                        logger.error("获取抖音热搜榜时出现问题:{}", e.getMessage(), e);
                    }
                }
            }
        } catch (IOException | RuntimeException e) {
            // The message fills the placeholder, the exception itself supplies the stack trace
            logger.error("获取抖音热搜榜时出现问题:{}", e.getMessage(), e);
        }
        return list;
    }
}
package com.zhiwei.searchhotcrawler.crawler;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
/**
* @className DouyinHotSearchCrawler
* @Description:抖音热搜榜采集程序
* @author win 10
* @date:2019年07月11日 上午10:26:21
*/
@Log4j2
public class DouyinHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @Title: getMobileDouyinHotList
* @author hero
* @Description: 移动端抖音热搜榜
* @param @return 设定文件
* @return List<ZhihuHotSearch> 返回类型
*/
public static List<HotSearchList> getMobileDouyinHotList(){
List<HotSearchList> list = null;
String url = "https://api.amemv.com/aweme/v1/hot/search/list/";
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("word_list")){
list = new ArrayList<>();
JSONObject data = JSONObject.parseObject(htmlBody);
JSONArray wordList = data.getJSONObject("data").getJSONArray("word_list");
String positionStr = null;
String word = null;
String hotValueStr = null;
for (int i = 0; i < wordList.size(); i++) {
JSONObject wl = wordList.getJSONObject(i);
//获取排名
positionStr = wl.getString("position");
Integer position = null;
position = Integer.valueOf(positionStr);
//获取关键词
word = wl.getString("word");
//获取热度值
hotValueStr =wl.getString("hot_value");
Integer hotValue = null;
hotValue = Integer.valueOf(hotValueStr);
// logger.info("热度为:::{}", hot_value);
HotSearchList douyin = new HotSearchList(null,word, hotValue, position,HotSearchType.抖音热搜.name());
list.add(douyin);
}
}
} catch (IOException e) {
log.debug("获取抖音热搜榜时出现问题:{}", e);
}
return list;
}
}
package com.zhiwei.searchhotcrawler.crawler;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.httpclient.HeaderTool;
/**
 * Crawler for the hot keywords shown on the Sogou WeChat search page.
 *
 * @author hero
 */
public class SougoHotSearchCrawler {

    private static Logger logger = LoggerFactory.getLogger(SougoHotSearchCrawler.class);

    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /**
     * Fetches the Sogou WeChat page (PC site) and extracts the {@code ol#topwords} entries.
     *
     * <p>The request is attempted up to three times, but only a thrown exception leads to
     * another attempt: any response that was received ends the loop, even when it lacks the
     * {@code topwords} marker.
     *
     * @return the parsed records (entries without a rank are left out); an empty list when
     *         nothing could be fetched or the page could not be processed
     * @author hero
     */
    public static List<HotSearchList> sougoHotSearch() {
        String url = "https://weixin.sogou.com";
        List<HotSearchList> list = new ArrayList<>();
        for (int i = 0; i < 3; i++) {
            String htmlBody = null;
            try {
                Map<String,String> headMap = HeaderTool.getCommonHead();
                htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
                if (htmlBody != null && htmlBody.contains("topwords")) {
                    try {
                        Document document = Jsoup.parse(htmlBody);
                        Elements elements = document.select("ol#topwords").select("li");
                        for (Element element : elements) {
                            try {
                                // Rank is the text of the <i> badge, when present
                                String rankStr = null;
                                if (!element.select("li").select("i").isEmpty()) {
                                    rankStr = element.select("li").select("i").text();
                                }
                                Integer rank = null;
                                if (StringUtils.isNoneBlank(rankStr)) {
                                    rank = Integer.valueOf(rankStr);
                                }
                                // Keyword and its link
                                String kw = element.select("li").select("a").text();
                                String everurl = element.select("li").select("a").attr("href");
                                HotSearchList hotSearch = new HotSearchList(everurl, kw, null, rank, HotSearchType.搜狗微信热搜.name());
                                if (Objects.nonNull(rank)) {
                                    list.add(hotSearch);
                                }
                            } catch (Exception e) {
                                // A single broken entry is skipped
                                logger.error("解析搜狗微信时出现解析错误", e);
                            }
                        }
                    } catch (Exception e) {
                        // Log the exception as thrown: fillInStackTrace() would replace its
                        // stack trace with this line and hide where the failure happened.
                        logger.error("解析搜狗微信时出现解析错误,数据不是json结构", e);
                        return Collections.emptyList();
                    }
                } else {
                    logger.info("解析搜狗微信时出现解析错误,页面结构有问题");
                }
                // A response was received, so no further attempt is made
                break;
            } catch (Exception e) {
                logger.error("解析搜狗微信时出现解析错误,页面结构有问题", e);
            }
        }
        return list;
    }
}
package com.zhiwei.searchhotcrawler.crawler;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.httpclient.HeaderTool;
/**
* @ClassName:SougoHotSearch
* @Description: TODO(搜狗微信关键词采集)
* @author hero
* @date 2019年7月10日 上午10:54:31
*/
@Log4j2
public class SougoHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @Title: SougoHotSearchTest
* @author hero
* @Description: TODO(PC端搜狗微信关键词采集)
* @return void 返回类型
*/
public static List<HotSearchList> sougoHotSearch() {
String url = "https://weixin.sogou.com";
List<HotSearchList> list = new ArrayList<>();
for (int i = 0; i < 3; i++) {
String htmlBody = null;
try {
Map<String,String> headMap = HeaderTool.getCommonHead();
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if (htmlBody != null && htmlBody.contains("topwords")) {
try {
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("ol#topwords").select("li");
for (Element element : elements) {
try {
// 获取排名rank
String rankStr = null;
if (!element.select("li").select("i").isEmpty()) {
rankStr = element.select("li").select("i").text();
}
Integer rank = null;
if (StringUtils.isNoneBlank(rankStr)) {
rank = Integer.valueOf(rankStr);
}
// 获取关键词(String)
String kw = element.select("li").select("a").text();
// logger.info("关键词:{}", kw);
String everurl = element.select("li").select("a").attr("href");
HotSearchList hotSearch = new HotSearchList(everurl, kw, null, rank, HotSearchType.搜狗微信热搜.name());
if (Objects.nonNull(rank)) {
list.add(hotSearch);
}
} catch (Exception e) {
log.error("解析搜狗微信时出现解析错误", e);
}
}
} catch (Exception e) {
log.error("解析搜狗微信时出现解析错误,数据不是json结构", e.fillInStackTrace());
return Collections.emptyList();
}
} else {
log.info("解析搜狗微信时出现解析错误,页面结构有问题");
}
break;
} catch (Exception e) {
log.error("解析搜狗微信时出现解析错误,页面结构有问题", e);
}
}
return list;
}
}
......@@ -7,6 +7,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
......@@ -27,13 +28,13 @@ import com.zhiwei.tools.tools.URLCodeUtil;
/**
* @ClassName: WeiboHotSearch
* @Description: TODO(微博实时热搜采集)
* @Description: 微博实时热搜采集
* @author hero
* @date 2017年9月15日 上午10:54:31
*/
@Log4j2
public class WeiboHotSearchCrawler {
private static Logger logger = LoggerFactory.getLogger(WeiboHotSearchCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @Title: weiboHotSearchTest
......@@ -70,18 +71,18 @@ public class WeiboHotSearchCrawler {
list.add(hotSearch);
} catch (Exception e) {
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
logger.error("解析微博时时热搜时出现解析错误", e);
log.error("解析微博时时热搜时出现解析错误", e);
continue;
}
}
} catch (Exception e) {
logger.error("解析微博时时热搜时出现解析错误,数据不是json结构",e.fillInStackTrace());
log.error("解析微博时时热搜时出现解析错误,数据不是json结构",e.fillInStackTrace());
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
return null;
}
}else{
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
logger.info("解析微博时时热搜时出现解析错误,页面结构有问题");
log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
}
break;
} catch (Exception e) {
......@@ -138,25 +139,25 @@ public class WeiboHotSearchCrawler {
}
String id = "http://s.weibo.com/weibo/"+URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon);
logger.info("采集到的数据:::{}", hotSearch);
log.info("采集到的数据:::{}", hotSearch);
result.add(hotSearch);
rank++;
}
} catch (Exception e) {
logger.error("解析微博时时热搜时出现解析错误",e);
log.error("解析微博时时热搜时出现解析错误",e);
continue;
}
}
return result;
} catch (Exception e) {
logger.error("解析微博时时热搜时出现解析错误,数据不是json结构",e);
log.error("解析微博时时热搜时出现解析错误,数据不是json结构",e);
return Collections.emptyList();
}
}else{
logger.info("解析微博时时热搜时出现解析错误,页面结构有问题");
log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
}
} catch (IOException e1) {
logger.error("解析微博时时热搜时出现连接失败",e1);
log.error("解析微博时时热搜时出现连接失败",e1);
return Collections.emptyList();
}
return Collections.emptyList();
......
package com.zhiwei.searchhotcrawler.crawler;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.WeiboTopic;
/**
 * Crawler for the Weibo celebrity topic ranking lists.
 *
 * <p>For each of the three ranking variants (star, potential, rising) it requests
 * pages 1..5 of the ranking endpoint, turns every entry into a {@link WeiboTopic}
 * and then enriches the entry with the read/post counters taken from the topic's
 * detail endpoint.
 *
 * @author Bewilder ZW
 * @date 2019-09-27 15:01:34
 */
public class WeiboHuatiCrawler {

    private static final Logger logger = LoggerFactory.getLogger(WeiboHuatiCrawler.class);

    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /** Number of ranking pages requested per ranking variant. */
    private static final int MAX_PAGE = 5;

    /** Number of attempts made for each HTTP request before giving up. */
    private static final int MAX_ATTEMPTS = 3;

    /** Rank offset added per page; {@code toprank} in the payload is treated as page-relative. */
    private static final int PAGE_SIZE = 20;

    /** Headers sent with every ranking-list request. */
    private static final Map<String, String> headMap = new HashMap<>();

    static {
        headMap.put("X-Requested-With", "XMLHttpRequest");
        headMap.put("Referer", "https://huati.weibo.cn/discovery/super?extparam=ctg1_2%7Cscorll_1&luicode=10000011&lfid=100803_-_super&sourceType=weixin");
        headMap.put("Host", "huati.weibo.cn");
    }

    /**
     * Crawls all celebrity ranking variants.
     *
     * @return every topic that could be parsed; empty (never {@code null}) when
     *         nothing could be fetched
     */
    public static List<WeiboTopic> startCrawler() {
        Map<String, String> urlMap = new HashMap<>();
        urlMap.put("明星", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=star&from=&wm=");
        urlMap.put("明星潜力", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=potential&from=&wm=");
        urlMap.put("明星上升", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=up&from=&wm=");

        List<WeiboTopic> topicList = new ArrayList<>();
        for (Entry<String, String> entry : urlMap.entrySet()) {
            String type = entry.getKey();
            String url = entry.getValue();
            for (int page = 1; page <= MAX_PAGE; page++) {
                String pageUrl = url + "&page=" + page;
                // Retry the same page until a usable body arrives or the attempts run out.
                for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
                    try {
                        logger.info("pageUrl=========={}", pageUrl);
                        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
                        if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc1")) {
                            topicList.addAll(parseTopicRankHtml(page, htmlBody, type));
                            break;
                        }
                        logger.error("获取榜单列表页面时数据格式错误,页面为:{}", htmlBody);
                    } catch (Exception e) {
                        // Pass the throwable as the trailing argument so the stack trace is logged.
                        logger.error("获取榜单列表页面时出现错误,错误为:{}", e.getMessage(), e);
                    }
                }
            }
        }
        return topicList;
    }

    /**
     * Parses one ranking page.
     *
     * <p>A malformed entry is logged and skipped so that it does not discard the
     * rest of the page.
     *
     * @param page     1-based page number, used to derive the rank offset
     * @param htmlBody JSON response body of the ranking endpoint
     * @param type     ranking variant label stored on every topic
     * @return the parsed topics, or an empty list when the page has no entries or is not valid JSON
     */
    private static List<WeiboTopic> parseTopicRankHtml(int page, String htmlBody, String type) {
        try {
            JSONArray list = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("list");
            if (Objects.isNull(list) || list.isEmpty()) {
                return Collections.emptyList();
            }
            int rankOffset = (page - 1) * PAGE_SIZE;
            List<WeiboTopic> topicList = new ArrayList<>(list.size());
            for (int i = 0; i < list.size(); i++) {
                try {
                    JSONObject data = list.getJSONObject(i);
                    Integer toprank = rankOffset + data.getInteger("toprank");
                    String topicName = data.getString("display_name");
                    String id = data.getString("page_id");
                    String score = data.getString("score");
                    String desc1 = data.getString("desc1");
                    // Strip everything up to the "影响力" marker and the "粉丝" label, keeping the remainder.
                    String fensi = desc1.replaceAll(".*影响力|粉丝", "").trim();
                    String url = data.getString("link");
                    WeiboTopic topic = new WeiboTopic(url, topicName, toprank, score, fensi, type);
                    topic = getTopicInfo(id, topic);
                    logger.info("topic====={}", topic);
                    topicList.add(topic);
                } catch (Exception e) {
                    logger.error("解析榜单列表页面时出现错误,错误为:{}", e.getMessage(), e);
                }
            }
            return topicList;
        } catch (Exception e) {
            logger.error("解析榜单列表页面时出现错误,错误为:{}", e.getMessage(), e);
        }
        return Collections.emptyList();
    }

    /**
     * Fetches the detail endpoint of a single topic and copies the read and post
     * counters onto {@code topic}.
     *
     * @param id    container id of the topic ({@code page_id} from the ranking list)
     * @param topic topic to enrich
     * @return the same {@code topic} instance, enriched when the detail data could
     *         be read within the allowed attempts, otherwise unchanged
     */
    private static WeiboTopic getTopicInfo(String id, WeiboTopic topic) {
        if (StringUtils.isBlank(id)) {
            // Without an id the detail URL is meaningless; skip the network round trips.
            return topic;
        }
        String url = "https://m.weibo.cn/api/container/getIndex?containerid=" + id;
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
                if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc_more")) {
                    String descMore = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("pageInfo").getJSONArray("desc_more").getString(0);
                    if (StringUtils.isNotBlank(descMore)) {
                        // The text is expected to read "阅读<n>帖子<m>粉丝<k>"; each regex keeps one counter.
                        String readNum = descMore.replaceAll("阅读|帖子.*", "").trim();
                        String postNum = descMore.replaceAll(".*帖子|粉丝.*", "").trim();
                        topic.setPostNum(postNum);
                        topic.setReadNum(readNum);
                        return topic;
                    }
                }
            } catch (Exception e) {
                logger.error("解析榜单详情页面时出现错误,错误为:{}", e.getMessage(), e);
            }
        }
        return topic;
    }
}
package com.zhiwei.searchhotcrawler.crawler;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
/**
*
* @ClassName: WeiboSuperTopicCrawler
* @Description: 微博超话榜单采集(明星)
* @author Bewilder ZW
* @date 2019年9月27日 下午3:01:34
*/
@Log4j2
public class WeiboSuperTopicCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static Map<String,String> headMap = new HashMap<>();
static {
headMap.put("X-Requested-With", "XMLHttpRequest");
headMap.put("Referer", "https://huati.weibo.cn/discovery/super?extparam=ctg1_2%7Cscorll_1&luicode=10000011&lfid=100803_-_super&sourceType=weixin");
headMap.put("Host", "huati.weibo.cn");
}
/**
*
* 开始采集明星话题
* @return void
*/
public static List<WeiboSuperTopic> startCrawler() {
Map<String,String> urlMap = new HashMap<>();
urlMap.put("明星", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=star&from=&wm=");
urlMap.put("明星潜力", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=potential&from=&wm=");
urlMap.put("明星上升", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=up&from=&wm=");
List<WeiboSuperTopic> topicList = new ArrayList<>();
for(Entry<String,String> entry : urlMap.entrySet()) {
String url = entry.getValue();
String type = entry.getKey();
for(int page= 1; page<=5; page++) {
String pageUrl = url + "&page=" + page;
//重试三次
for(int retryTimes = 1; retryTimes<=3; retryTimes++) {
try {
System.out.println("pageUrl=========="+pageUrl);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc1")) {
topicList.addAll(parseTopicRankHtml(page, htmlBody, type));
break;
}else {
log.error("获取榜单列表页面时数据格式错误,页面为:{}", htmlBody);
}
} catch (Exception e) {
log.error("获取榜单列表页面时出现错误,错误为:{}", e);
continue;
}
}
}
}
return topicList;
}
/**
*
* 解析话题榜单
* @param htmlBody
* @param type
* @return void
*/
private static List<WeiboSuperTopic> parseTopicRankHtml(int page,String htmlBody, String type) {
try {
JSONArray list = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("list");
if(Objects.nonNull(list) && !list.isEmpty()) {
page = (page-1)*20;
List<WeiboSuperTopic> topicList = new ArrayList<>();
Integer toprank = null;
String topicName = null;
String id = null;
String score = null;
String desc1 = null;
String fensi = null;
String url = null;
for(int i=0;i<list.size();i++) {
JSONObject data = list.getJSONObject(i);
toprank = page + data.getInteger("toprank");
topicName = data.getString("display_name");
id = data.getString("page_id");
score = data.getString("score");
desc1 = data.getString("desc1");
fensi = desc1.replaceAll(".*影响力|粉丝", "").trim();
url = data.getString("link");
WeiboSuperTopic topic = new WeiboSuperTopic(url, topicName, toprank, score, fensi, type);
topic = getTopicInfo(id, topic);
System.out.println("topic====="+topic);
topicList.add(topic);
}
return topicList;
}
} catch (Exception e) {
log.error("解析榜单列表页面时出现错误,错误为:{}", e);
}
return Collections.emptyList();
}
/**
*
* 根据单一话题id获取话题阅读数及发帖数
* @param id
* @param topic
* @return
* @return WeiboTopic
*/
private static WeiboSuperTopic getTopicInfo(String id, WeiboSuperTopic topic) {
for(int retryTimes=1; retryTimes<=3; retryTimes++) {
try {
String url = "https://m.weibo.cn/api/container/getIndex?containerid="+ id;
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc_more")) {
String descMore = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("pageInfo").getJSONArray("desc_more").getString(0);
if(StringUtils.isNotBlank(descMore)) {
String readNum = descMore.replaceAll("阅读|帖子.*", "").trim();
String postNum = descMore.replaceAll(".*帖子|粉丝.*", "").trim();
topic.setPostNum(postNum);
topic.setReadNum(readNum);
return topic;
}
}
} catch (Exception e) {
log.error("解析榜单详情页面时出现错误,错误为:{}", e);
}
}
return topic;
}
}
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
import com.zhiwei.tools.tools.URLCodeUtil;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
*
* @ClassName: WeiboTopicCrawler
* @Description: 微博话题榜单采集
* @author Bewilder ZW
*/
@Log4j2
public class WeiboTopicCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static Map<String,String> headMap = new HashMap<>();
static {
headMap.put("Host", "simg.s.weibo.com");
headMap.put("User-Agent", "Weibo/40651 CFNetwork/978.0.7 Darwin/18.6.0");
}
// /**
// *
// * 开始采集明星话题
// * @return void
// */
// public static List<HotSearchList> startCrawler() {
// List<HotSearchList> topicList = new ArrayList<>();
// for(int page=1; page<=7; page++){
// String pageUrl = "https://d.weibo.com/231650_ctg1_-_all?pids=Pl_Discover_Pt6Rank__4&cfs=920&Pl_Discover_Pt6Rank__4_filter=&Pl_Discover_Pt6Rank__4_page=" + page;
// //重试三次
// for(int retryTimes = 1; retryTimes<=3; retryTimes++) {
// try {
// String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
// if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("pl.content.miniTab.index")) {
// log.info("pageUrl::{}", pageUrl);
// topicList.addAll(parseTopicRankHtml(htmlBody));
// break;
// }else {
// log.error("获取榜单列表页面时数据格式错误,页面为:{}", htmlBody);
// }
// } catch (Exception e) {
// log.error("获取榜单列表页面时出现错误,错误为:{}", e);
// continue;
// }
// }
// }
// return topicList;
// }
//
// /**
// *
// * 解析话题榜单
// * @param htmlBody
// * @return void
// */
// private static List<HotSearchList> parseTopicRankHtml(String htmlBody) {
// try {
// String script = "{\"ns\":\"pl.content.miniTab.index\""+ htmlBody.split("FM.view\\(\\{\"ns\":\"pl.content.miniTab.index\"")[1].split("\\)<\\/script>")[0];
// JSONObject json = JSONObject.parseObject(script);
// String html = json.getString("html");
//
// Elements elements = Jsoup.parse(html).select("div.text_box");
// if(Objects.nonNull(elements) && !elements.isEmpty()) {
// List<HotSearchList> topicList = new ArrayList<>();
// String rankString;
// Integer rank = null;
// String topicName = null;
// String url = null;
// String topicType = null;
// String description = null;
// Integer readNum = null;
// String author = null;
//
// for(Element element : elements) {
// rankString = element.select("div[class=\"title W_autocut\"]").text();
// Matcher matcher = Pattern.compile("\\d+").matcher(rankString);
// while (matcher.find()){
// rank = Integer.valueOf(matcher.group());
// }
// topicName = element.select("div[class=\"title W_autocut\"]").select("a.S_txt1").text();
// url = element.select("div[class=\"title W_autocut\"]").select("a.S_txt1").attr("href");
// topicType = element.select("a[class=\"W_btn_b W_btn_tag\"]").text();
// description = element.select("div.subtitle").text();
// String readNumString = element.select("span.number").text();
// if(readNumString.contains("万")){
// readNumString = readNumString.split("万")[0];
// readNum = Integer.valueOf(readNumString.split("万")[0])*10000;;
// }
// if(readNumString.contains("亿")){
// readNum = Integer.valueOf(readNumString.split("亿")[0])*100000000;
// }
// author = element.select("a[class=\"tlink S_txt1\"]").text();
// HotSearchList topic = new HotSearchList(url, topicName, readNum, rank, HotSearchType.微博话题.name(), author, topicType, description);
// log.info("topic::::" + topic);
// topicList.add(topic);
// }
// return topicList;
// }else{
// log.info("html:{}",html);
// }
// } catch (Exception e) {
// log.error("解析榜单列表页面时出现错误,错误为:{}", e);
// }
// return Collections.emptyList();
// }
/**
* 微博平话题榜采集
*/
public static List<HotSearchList> startCrawlerByPhone(){
List<HotSearchList> topicList = new ArrayList<>();
for(int page=1; page<=7; page++){
String pageUrl = "https://api.weibo.cn/2/page?gsid=_2A25zJX_EDeRxGedH71YS8CzKzzmIHXVuc_QMrDV6PUJbkdANLXPbkWpNUK3OyitGCJsX8exvua-vfubUqCiaA4lb&from=10A1193010&c=iphone&s=2827eebe&count=20&containerid=106003type%253D25%2526t%253D3%2526disable_hot%253D1%2526filter_type%253Dtopicscene&page=" + page;
//重试三次
for(int retryTimes = 1; retryTimes<=5; retryTimes++) {
try {
log.info("pageUrl::{}", pageUrl);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("top_mark_text")) {
topicList.addAll(parseTopicHtml(htmlBody));
break;
}else {
log.info("下载榜单列表页面时数据格式错误,页面为:{}", htmlBody);
}
} catch (Exception e) {
log.error("下载榜单列表页面时出现错误,错误为:{}", e);
continue;
}
}
}
return topicList;
}
private static List<HotSearchList> parseTopicHtml(String htmlBody) {
try {
JSONArray cards = JSONObject.parseObject(htmlBody).getJSONArray("cards");
if(Objects.nonNull(cards) && !cards.isEmpty()) {
List<HotSearchList> topicList = new ArrayList<>();
Integer rank = null;
String topicName = null;
String url = null;
String description = null;
Integer commentNum = null;
Integer readNum = null;
String desc2 = null;
for(int i=0; i<cards.size(); i++) {
JSONObject cardGroup = cards.getJSONObject(i).getJSONArray("card_group").getJSONObject(0);
rank = cardGroup.getInteger("top_mark_text");
topicName = cardGroup.getString("title_sub");
url = "https://s.weibo.com/weibo?q="+ URLCodeUtil.getURLEncode(topicName, "utf-8");
description = cardGroup.getString("desc1");
desc2 = cardGroup.getString("desc2");
String commentNumStr = desc2.replaceAll("讨论.*", "").trim();
String readNumStr = desc2.replaceAll(".*讨论|阅读", "").trim();
try {
if(commentNumStr.contains("万")){
commentNumStr = commentNumStr.replaceAll("万", "");
commentNum = (int)(Double.parseDouble(commentNumStr)*10000);
}else if(commentNumStr.contains("亿")){
commentNumStr = commentNumStr.replaceAll("亿", "");
commentNum = (int)(Double.parseDouble(commentNumStr)*10000000);
}else{
commentNum = Integer.getInteger(commentNumStr);
}
if(readNumStr.contains("万")){
readNumStr = readNumStr.replaceAll("万", "");
readNum = (int)(Double.parseDouble(readNumStr)*10000);
}else if(readNumStr.contains("亿")){
readNumStr = readNumStr.replaceAll("亿", "");
readNum = (int)(Double.parseDouble(readNumStr)*10000000);
}else{
readNum = Integer.getInteger(readNumStr);
}
}catch (Exception e){
e.printStackTrace();
}
HotSearchList topic = new HotSearchList(url, topicName, readNum, rank, HotSearchType.微博热搜.name(), commentNum, description);
log.info("topic::::" + topic);
topicList.add(topic);
}
return topicList;
}else{
log.info("html:{}",htmlBody);
}
} catch (Exception e) {
log.error("解析榜单列表页面时出现错误,错误为:{}", e);
}
return Collections.emptyList();
}
}
package com.zhiwei.searchhotcrawler.crawler;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.URLCodeUtil;
/**
 * Crawler for the Zhihu hot-search lists (web search box and mobile hot list).
 *
 * @author hero
 * @date 2017-09-15 10:54:31
 */
public class ZhihuHotSearchCrawler {

    private static final Logger logger = LoggerFactory.getLogger(ZhihuHotSearchCrawler.class);

    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /**
     * Fetches the hot-search words shown by the Zhihu web search box.
     *
     * <p>Ranks are the 0-based position of each word in the response.
     *
     * @author hero
     * @return the hot-search entries; {@code null} when the request fails or the
     *         body does not contain the expected {@code words} data (kept for
     *         backward compatibility with existing callers); possibly partial when
     *         parsing fails midway
     */
    public static List<HotSearchList> getZhihuHotList() {
        List<HotSearchList> list = null;
        String url = "https://www.zhihu.com/api/v4/search/top_search";
        String referer = "https://www.zhihu.com/search?type=content&q=%E5%BF%AB%E6%89%8B";
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");
        headerMap.put("Host", "www.zhihu.com");
        headerMap.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8=");
        headerMap.put("accept", "application/json, text/plain, */*");
        // NOTE(review): hard-coded credential; consider moving it to configuration.
        headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
        headerMap.put("Referer", referer);
        try {
            String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
            if (htmlBody != null && htmlBody.contains("words")) {
                list = new ArrayList<>();
                JSONObject topSearch = JSONObject.parseObject(htmlBody);
                JSONArray words = topSearch.getJSONObject("top_search").getJSONArray("words");
                if (words != null) {
                    for (int i = 0; i < words.size(); i++) {
                        JSONObject word = words.getJSONObject(i);
                        String query = word.getString("query");
                        String displayQuery = word.getString("display_query");
                        String link = "https://www.zhihu.com/search?q=" + URLCodeUtil.getURLEncode(query, "utf-8") + "&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content";
                        list.add(new HotSearchList(link, displayQuery, null, i, HotSearchType.知乎热搜.name()));
                    }
                }
            }
        } catch (Exception e) {
            // Covers I/O failures as well as unexpected JSON shapes, which previously
            // escaped to the caller; logged at error level with the stack trace.
            logger.error("获取知乎热搜时出现问题:{}", e.getMessage(), e);
        }
        return list;
    }

    /**
     * Fetches the Zhihu mobile hot list (up to 50 questions).
     *
     * <p>Ranks are the 0-based position of each entry in the response.
     *
     * @author hero
     * @return the hot-list entries; empty (never {@code null}) when the request
     *         fails or the body is not in the expected shape
     */
    public static List<HotSearchList> getMobileZhihuHotList() {
        List<HotSearchList> list = new ArrayList<>();
        String url = "https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0";
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        headerMap.put("Host", "api.zhihu.com");
        headerMap.put("Referer", url);
        headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36");
        headerMap.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8=");
        // NOTE(review): hard-coded credential; consider moving it to configuration.
        headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
        try {
            String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
            if (htmlBody != null && htmlBody.contains("author")) {
                JSONArray words = JSONObject.parseObject(htmlBody).getJSONArray("data");
                if (words != null) {
                    for (int i = 0; i < words.size(); i++) {
                        JSONObject word = words.getJSONObject(i).getJSONObject("target");
                        if (word == null) {
                            // Entry without a target object carries no question data.
                            continue;
                        }
                        String displayQuery = word.getString("title");
                        String link = "https://www.zhihu.com/question/" + word.getLongValue("id");
                        list.add(new HotSearchList(link, displayQuery, null, i, HotSearchType.知乎热搜.name()));
                    }
                }
            }
        } catch (Exception e) {
            logger.error("获取知乎热搜时出现问题:{}", e.getMessage(), e);
        }
        return list;
    }
}
package com.zhiwei.searchhotcrawler.crawler;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.URLCodeUtil;
/**
* @ClassName: ZhihuHotCrawler
* @Description: TODO(知乎热搜采集程序)
* @author hero
* @date 2017年9月15日 上午10:54:31
*/
@Log4j2
public class ZhihuHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @Title: getZhihuHotList
* @author hero
* @Description: 知乎热搜采集程序
* @return void 返回类型
*/
public static List<HotSearchList> getZhihuHotList(){
List<HotSearchList> list = null;
String url = "https://www.zhihu.com/api/v4/search/top_search";
String rerferer = "https://www.zhihu.com/search?type=content&q=%E5%BF%AB%E6%89%8B";
Map<String,String> headerMap = HeaderTool.getCommonHead();
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");
headerMap.put("Host", "www.zhihu.com");
headerMap.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8=");
headerMap.put("accept", "application/json, text/plain, */*");
headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
headerMap.put("Referer", rerferer);
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(htmlBody != null && htmlBody.contains("words")){
list = new ArrayList<>();
JSONObject topSearch = JSONObject.parseObject(htmlBody);
JSONArray words = topSearch.getJSONObject("top_search").getJSONArray("words");
String link = null;
String displayQuery = null;
String query = null;
for (int i = 0; i < words.size(); i++) {
JSONObject word = words.getJSONObject(i);
query = word.getString("query");
displayQuery = word.getString("display_query");
link = "https://www.zhihu.com/search?q="+URLCodeUtil.getURLEncode(query, "utf-8")+"&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content";
HotSearchList zhihu = new HotSearchList(link, displayQuery, null, i, HotSearchType.知乎热搜.name());
list.add(zhihu);
}
}
} catch (IOException e) {
log.debug("获取知乎热搜时出现问题:{}", e);
return list;
}
return list;
}
/**
* @Title: getMobileZhihuHotList
* @author hero
* @Description: 移動端知乎熱搜榜
* @param @return 设定文件
* @return List<ZhihuHotSearch> 返回类型
*/
public static List<HotSearchList> getMobileZhihuHotList(){
List<HotSearchList> list = new ArrayList<>();;
String url = "https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0";
Map<String,String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "api.zhihu.com");
headerMap.put("Referer", url);
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36");
headerMap.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8=");
headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(htmlBody != null && htmlBody.contains("author")){
JSONObject topSearch = JSONObject.parseObject(htmlBody);
JSONArray words = topSearch.getJSONArray("data");
String link = null;
String displayQuery = null;
for (int i = 0; i < words.size(); i++) {
JSONObject word = words.getJSONObject(i).getJSONObject("target");
displayQuery = word.getString("title");
link = "https://www.zhihu.com/question/"+word.getLongValue("id");
HotSearchList zhihu = new HotSearchList(link, displayQuery, null, i, HotSearchType.知乎热搜.name());
list.add(zhihu);
}
}
} catch (IOException e) {
log.debug("获取知乎热搜时出现问题:{}", e);
return list;
}
return list;
}
}
package com.zhiwei.searchhotcrawler.dao;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.cache.CacheManager;
import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * MongoDB access object for hot-search records.
 *
 * <p>Records are stored in one collection per month, named
 * {@code Config.searchCollName + yyyy + "_" + MM}, chosen when the DAO is constructed.
 */
public class HotSearchListDAO extends MongoDBTemplate {

    private static final Logger logger = LoggerFactory.getLogger(HotSearchListDAO.class);

    /** Fields that get a descending single-field index named {@code <field>_desc}. */
    private static final String[] INDEXED_FIELDS = {"count", "time", "rank", "name", "type"};

    /**
     * Selects the current month's collection and requests its indexes.
     * Index creation failures are logged and do not abort construction.
     */
    public HotSearchListDAO() {
        super();
        super.setDbName(Config.dbName);
        String time = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
        String year = time.substring(0, 4);
        String month = time.substring(5, 7);
        super.setCollName(Config.searchCollName + year + "_" + month);
        try {
            for (String field : INDEXED_FIELDS) {
                super.getReadColl().createIndex(new BasicDBObject(field, -1), new BasicDBObject("name", field + "_desc"));
            }
        } catch (Exception e) {
            logger.error("创建索引时出错,错误为:{}", e.getMessage(), e);
        }
    }

    /**
     * Inserts a batch of documents. Failures are logged, not thrown.
     *
     * @param list documents to insert
     */
    public void addHotSearchList(List<DBObject> list) {
        try {
            this.getReadColl().insert(list);
        } catch (Exception e) {
            logger.error("存储数据时出错,错误为:{}", e.getMessage(), e);
        }
    }

    /**
     * Inserts a single document. Failures are logged, not thrown.
     *
     * @param doc document to insert
     */
    public void addHotSearch(DBObject doc) {
        try {
            this.getReadColl().insert(doc);
        } catch (Exception e) {
            logger.error("存储数据时出错,错误为:{}", e.getMessage(), e);
        }
    }

    /**
     * Computes how much the count of an entry changed since its most recent stored record.
     *
     * @author hero
     * @param weiboHotSearch freshly crawled entry
     * @return current count minus the stored count of the latest record with the
     *         same name; {@code 0} when no such record exists, it has no count,
     *         or the lookup fails
     */
    public int getChangeCount(HotSearchList weiboHotSearch) {
        int result = 0;
        DBObject query = new BasicDBObject();
        query.put("name", weiboHotSearch.getName());
        DBObject sort = new BasicDBObject();
        sort.put("time", -1);
        // try-with-resources closes the cursor even when iteration throws.
        try (DBCursor cur = this.getReadColl().find(query).sort(sort).limit(1)) {
            if (cur.hasNext()) {
                DBObject doc = cur.next();
                if (doc.get("count") != null) {
                    result = weiboHotSearch.getCount() - Integer.valueOf(doc.get("count").toString());
                }
            }
        } catch (Exception e) {
            logger.error("查询数据时出错,错误为:{}", e.getMessage(), e);
        }
        return result;
    }

    /**
     * Returns entries of the given type that appeared within the last hour
     * (records stored with {@code changeCount == 0}) and have not been returned before.
     *
     * <p>Every returned name is put into {@link CacheManager} for 48 hours, so
     * later calls skip it.
     *
     * @author hero
     * @param type hot-search type to filter on
     * @return the new entries; empty when none are found or the lookup fails
     */
    public List<DBObject> getHotOneHour(String type) {
        List<DBObject> list = new ArrayList<>();
        Date date = new Date(new Date().getTime() - 60 * 60 * 1000);
        DBObject query = new BasicDBObject();
        query.put("time", new BasicDBObject("$gte", date));
        query.put("changeCount", 0);
        query.put("type", type);
        // try-with-resources closes the cursor even when iteration throws.
        try (DBCursor cur = this.getReadColl().find(query)) {
            while (cur.hasNext()) {
                DBObject doc = cur.next();
                String name = doc.get("name").toString();
                if (CacheManager.getCacheByKey(name) == null) {
                    CacheManager.putCache(name, doc, 48 * 60 * 60 * 1000);
                    list.add(doc);
                }
            }
        } catch (Exception e) {
            logger.error("查询数据时出错,错误为:{}", e.getMessage(), e);
        }
        return list;
    }
}
package com.zhiwei.searchhotcrawler.dao;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Objects;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.cache.CacheManager;
import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.tools.timeparse.TimeParse;
@Log4j2
public class HotSearchListDAO extends MongoDBTemplate{
public HotSearchListDAO() {
super();
super.setDbName(Config.dbName);
String time = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
String year = time.substring(0,4);
String month = time.substring(5,7);
String collName = Config.searchCollName + year + "_" + month;
super.setCollName(collName);
//给数据表创建索引
createIndex();
}
/**
* 初次创建表及创建相应的索引
*/
private void createIndex(){
List<DBObject> indexList = this.getReadColl().getIndexInfo();
if(Objects.isNull(indexList) && indexList.isEmpty()){
DBObject countIndexDoc = new BasicDBObject();
countIndexDoc.put("count", -1);
DBObject timeIndexDoc = new BasicDBObject();
timeIndexDoc.put("time", -1);
DBObject rankIndexDoc = new BasicDBObject();
rankIndexDoc.put("rank", -1);
DBObject nameIndexDoc = new BasicDBObject();
nameIndexDoc.put("name", -1);
DBObject typeIndexDoc = new BasicDBObject();
typeIndexDoc.put("type", -1);
try {
super.getReadColl().createIndex(countIndexDoc, new BasicDBObject("name", "count_desc"));
super.getReadColl().createIndex(timeIndexDoc, new BasicDBObject("name", "time_desc"));
super.getReadColl().createIndex(rankIndexDoc, new BasicDBObject("name", "rank_desc"));
super.getReadColl().createIndex(nameIndexDoc, new BasicDBObject("name", "name_desc"));
super.getReadColl().createIndex(typeIndexDoc, new BasicDBObject("name", "type_desc"));
} catch (Exception e) {
e.printStackTrace();
}
}
}
/**
* 添加数据入库
* @param list
*/
public void addHotSearchList(List<DBObject> list){
try {
this.getReadColl().insert(list);
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
}
public void addHotSearch(DBObject doc){
try {
this.getReadColl().insert(doc);
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
}
/**
* 查询据上次变化量
* @Title: getChangeCount
* @author hero
* @param @param weiboHotSearch
* @param @return 设定文件
* @return int 返回类型
*/
public int getChangeCount(HotSearchList weiboHotSearch){
int result = 0;
DBObject query = new BasicDBObject();
query.put("name", weiboHotSearch.getName());
DBObject sort = new BasicDBObject();
sort.put("time", -1);
try {
DBCursor cur = this.getReadColl().find(query).sort(sort).limit(1);
while(cur.hasNext()){
DBObject doc = cur.next();
if(doc.get("count")!=null) {
result = weiboHotSearch.getCount() - Integer.valueOf(doc.get("count").toString());
break;
}
}
cur.close();
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
return result;
}
return result;
}
/**
* @Title: getWeiboHotOneHour
* @author hero
* @Description: 查询最近1小时内新增的微博热搜
* @param @return 设定文件
* @return List<DBObject> 返回类型
*/
public List<DBObject> getHotOneHour(String type){
List<DBObject> list = new ArrayList<>();
Date date = new Date((new Date().getTime()-60*60*1000));
DBObject query = new BasicDBObject();
query.put("time", new BasicDBObject("$gte", date));
query.put("changeCount", 0);
query.put("type", type);
try {
DBCursor cur = this.getReadColl().find(query);
while(cur.hasNext()){
DBObject doc = cur.next();
String name = doc.get("name").toString();
if(CacheManager.getCacheByKey(name)==null){
CacheManager.putCache(name, doc, 48*60*60*1000);
list.add(doc);
}
}
cur.close();
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
return list;
}
}
package com.zhiwei.searchhotcrawler.dao;
import java.util.Collections;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
/**
 * DAO for WeChat user groups, stored in the collection named by
 * {@code Config.collWechatUserName} of database {@code Config.dbName}.
 */
public class WechatUserDao extends MongoDBTemplate{

    // Bound to this class so log lines are attributed to the DAO rather than to the crawler.
    private static final Logger logger = LoggerFactory.getLogger(WechatUserDao.class);

    /** Maximum number of save attempts made by {@link #addWechatUser}. */
    private static final int MAX_SAVE_ATTEMPTS = 3;

    public WechatUserDao() {
        super();
        super.setDbName(Config.dbName);
        super.setCollName(Config.collWechatUserName);
    }

    /**
     * Saves (inserts or replaces) the user list of one group. The document id is
     * {@code groupId + "-" + groupName}. The save is attempted up to three times; every
     * failed attempt is logged and nothing is thrown if all attempts fail.
     *
     * @param userlist  user identifiers belonging to the group
     * @param groupName name of the group
     * @param groupId   numeric id of the group
     */
    public void addWechatUser(List<String> userlist, String groupName, Integer groupId){
        DBObject doc = new BasicDBObject();
        doc.put("_id", groupId+"-"+groupName);
        doc.put("groupId", groupId);
        doc.put("groupName", groupName);
        doc.put("user", userlist);
        for (int attempt = 0; attempt < MAX_SAVE_ATTEMPTS; attempt++) {
            try {
                this.getReadColl().save(doc);
                return;
            } catch (Exception e) {
                logger.error("存储数据时出错,错误为:{}", e.getMessage(), e);
            }
        }
    }

    /**
     * Looks up the users of the group with the given name.
     *
     * @param group value of the {@code groupName} field to search for
     * @return the stored {@code user} value of the first matching document (may be
     *         {@code null} if that document has no such field); an empty list when no
     *         document matches or the query fails
     */
    @SuppressWarnings("unchecked")
    public List<String> getWechatUserByGroup(String group){
        try {
            DBObject found = this.getReadColl().findOne(new BasicDBObject("groupName", group));
            if (found != null) {
                return (List<String>) found.get("user");
            }
        } catch (Exception e) {
            // This is a read path, so report a query failure rather than a storage failure.
            logger.error("查询数据时出错,错误为:{}", e.getMessage(), e);
        }
        return Collections.emptyList();
    }
}
package com.zhiwei.searchhotcrawler.dao;
import java.util.Collections;
import java.util.List;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
@Log4j2
public class WechatUserDao extends MongoDBTemplate{
public WechatUserDao() {
super();
super.setDbName(Config.dbName);
super.setCollName(Config.collWechatUserName);
}
/**
* 添加分组用户
* @param userlist
* @param groupName
* @param groupId
*/
public void addWechatUser(List<String> userlist, String groupName, Integer groupId){
for(int i=0; i<3; i++){
try {
DBObject doc = new BasicDBObject();
doc.put("_id", groupId+"-"+groupName);
doc.put("groupId", groupId);
doc.put("groupName", groupName);
doc.put("user", userlist);
this.getReadColl().save(doc);
break;
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
}
}
/**
* 根据分组名称查询分组用户
* @param group
* @return
*/
@SuppressWarnings("unchecked")
public List<String> getWechatUserByGroup(String group){
try {
DBObject query = new BasicDBObject();
query.put("groupName", group);
DBObject doc = this.getReadColl().findOne(query);
if(doc != null){
return (List<String>)doc.get("user");
}
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
return Collections.emptyList();
}
}
package com.zhiwei.searchhotcrawler.dao;
import java.util.Date;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * DAO for Weibo topic records. Documents go into a per-month collection named
 * {@code Config.topicCollName + yyyy + "_" + MM}, resolved once when the DAO is constructed.
 */
public class WeiboTopicDAO extends MongoDBTemplate{

    private static final Logger logger = LoggerFactory.getLogger(WeiboTopicDAO.class);

    /**
     * Selects the collection for the current month and requests descending single-field
     * indexes on {@code score_num}, {@code time}, {@code rank}, {@code name} and {@code type}.
     * Index creation is best-effort: the first failure is logged and the remaining indexes
     * are not attempted.
     */
    public WeiboTopicDAO() {
        super();
        super.setDbName(Config.dbName);
        String time = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
        String year = time.substring(0,4);
        String month = time.substring(5,7);
        super.setCollName(Config.topicCollName + year + "_" + month);
        try {
            createDescendingIndex("score_num", "score_desc");
            createDescendingIndex("time", "time_desc");
            createDescendingIndex("rank", "rank_desc");
            createDescendingIndex("name", "name_desc");
            createDescendingIndex("type", "type_desc");
        } catch (Exception e) {
            // Routed through the logger instead of printStackTrace so it reaches the log files.
            logger.error("创建索引时出错,错误为:{}", e.getMessage(), e);
        }
    }

    /** Creates a descending index on one field under the given index name. */
    private void createDescendingIndex(String field, String indexName) {
        this.getReadColl().createIndex(new BasicDBObject(field, -1), new BasicDBObject("name", indexName));
    }

    /**
     * Inserts a batch of documents. Failures are logged and swallowed.
     *
     * @param list documents to insert
     */
    public void addTopicList(List<DBObject> list){
        try {
            this.getReadColl().insert(list);
        } catch (Exception e) {
            logger.error("存储数据时出错,错误为:{}", e.getMessage(), e);
        }
    }

    /**
     * Inserts a single document. Failures are logged and swallowed.
     *
     * @param doc document to insert
     */
    public void addTopic(DBObject doc){
        try {
            this.getReadColl().insert(doc);
        } catch (Exception e) {
            logger.error("存储数据时出错,错误为:{}", e.getMessage(), e);
        }
    }
}
package com.zhiwei.searchhotcrawler.dao;
import java.util.Date;
import java.util.List;
import java.util.Objects;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.tools.timeparse.TimeParse;
@Log4j2
public class WeiboSuperTopicDAO extends MongoDBTemplate{
public WeiboSuperTopicDAO() {
super();
super.setDbName(Config.dbName);
String time = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
String year = time.substring(0,4);
String month = time.substring(5,7);
String collName = Config.topicCollName + year + "_" + month;
super.setCollName(collName);
createIndex();
}
/**
* 初次创建表及创建相应的索引
*/
private void createIndex(){
List<DBObject> indexList = this.getReadColl().getIndexInfo();
if(Objects.isNull(indexList) && indexList.isEmpty()){
DBObject countIndexDoc = new BasicDBObject();
countIndexDoc.put("score_num", -1);
DBObject timeIndexDoc = new BasicDBObject();
timeIndexDoc.put("time", -1);
DBObject rankIndexDoc = new BasicDBObject();
rankIndexDoc.put("rank", -1);
DBObject nameIndexDoc = new BasicDBObject();
nameIndexDoc.put("name", -1);
DBObject typeIndexDoc = new BasicDBObject();
typeIndexDoc.put("type", -1);
try {
super.getReadColl().createIndex(countIndexDoc, new BasicDBObject("name", "score_desc"));
super.getReadColl().createIndex(timeIndexDoc, new BasicDBObject("name", "time_desc"));
super.getReadColl().createIndex(rankIndexDoc, new BasicDBObject("name", "rank_desc"));
super.getReadColl().createIndex(nameIndexDoc, new BasicDBObject("name", "name_desc"));
super.getReadColl().createIndex(typeIndexDoc, new BasicDBObject("name", "type_desc"));
} catch (Exception e) {
e.printStackTrace();
}
}
}
/**
* 添加数据入库
* @param list
*/
public void addTopicList(List<DBObject> list){
try {
this.getReadColl().insert(list);
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
}
public void addTopic(DBObject doc){
try {
this.getReadColl().insert(doc);
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
}
}
package com.zhiwei.searchhotcrawler.run;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.cache.CacheListener;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.DouyinHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.SendWeiboHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.SendZhihuHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.SougoHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.UpdateWechatUserRun;
import com.zhiwei.searchhotcrawler.timer.WeiboHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.WeiboTopicRun;
import com.zhiwei.searchhotcrawler.timer.ZhihuHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.*;
import com.zhiwei.tools.tools.ZhiWeiTools;
import java.util.concurrent.Executors;
......@@ -23,8 +15,10 @@ import java.util.concurrent.TimeUnit;
public class HotSearchRun {
public static void main(String[] args) {
ProxyFactory.init(ProxyConfig.registry, ProxyConfig.group, GroupType.PROVIDER, 10000013);
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
ProxyFactory.init(simpleConfig);
new UpdateWechatUserRun().start();
ZhiWeiTools.sleep(10000);
......@@ -51,6 +45,7 @@ public class HotSearchRun {
new SougoHotSearchRun().start();
new DouyinHotSearchRun().start();
new ZhihuHotSearchRun().start();
new WeiboSuperTopicRun().start();
new WeiboTopicRun().start();
//推送程序启动
new SendWeiboHotSearchRun().start();
......
package com.zhiwei.searchhotcrawler.test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Map;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.Mongo;
import com.mongodb.MongoClient;
import com.mongodb.MongoCredential;
import com.mongodb.ServerAddress;
import com.mongodb.WriteResult;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * One-off maintenance script: removes documents from the {@code hot_search_list2019_09}
 * collection whose {@code time} falls inside each window returned by
 * {@code TimeParse.getTimeMap} for 2019-10-01 through 2019-10-09.
 * <p>
 * WARNING: running this deletes data.
 */
public class HotSearchListTest{

    public static void main(String[] args) {
        MongoCredential credential = MongoCredential.createCredential(Config.userName, Config.authDB, Config.userPwd.toCharArray());
        ServerAddress address = new ServerAddress(Config.mongoIp, Config.mongoPort);
        Mongo mongo = new MongoClient(address, Arrays.asList(credential));
        try {
            DB db = mongo.getDB("hot_search_list");
            // NOTE(review): the target collection is fixed to 2019_09 while the windows below
            // cover 2019-10; the per-window collName is only printed — confirm this is intended.
            DBCollection coll = db.getCollection("hot_search_list2019_09");

            // presumably one map entry (start -> end) per day — verify against TimeParse.getTimeMap
            Map<String,String> timLine = TimeParse.getTimeMap("2019-10-01 00:00:00", "2019-10-09 23:59:59", "dd", 1);
            timLine.forEach((start, end) -> {
                String year = end.substring(0,4);
                String month = end.substring(5,7);
                Date startDate = TimeParse.stringFormartDate(start);
                Date endDate = TimeParse.stringFormartDate(end);

                String collName = "hot_search_list"+year+"_"+month;
                System.out.println("collName=========="+collName);

                DBObject query = new BasicDBObject("time",
                        new BasicDBObject("$gte",startDate).append("$lte", endDate));
                System.out.println(query);
                WriteResult wr = coll.remove(query);
                System.out.println("========"+wr.getN());
            });
        } finally {
            // Release the connection even when one of the deletes throws.
            mongo.close();
        }
    }
}
package com.zhiwei.searchhotcrawler.test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Map;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.Mongo;
import com.mongodb.MongoClient;
import com.mongodb.MongoCredential;
import com.mongodb.ServerAddress;
import com.mongodb.WriteResult;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.tools.timeparse.TimeParse;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scratch crawler for ZAKER: fetches the app landing page through the proxy pool, follows
 * every link matched by {@code div.titlebar>a}, and prints the {@code href} and text of each
 * {@code div#infinite_scroll>a} element found on the linked page.
 */
public class HotSearchListTest{

    /** Endpoint that the landing page and the per-label links are built from. */
    private static final String APP_URL = "http://app.myzaker.com/news/app.php";

    public static void main(String[] args) {
        SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
                .group(ProxyConfig.group).appId(10000013).appName("zzw").build();
        ProxyFactory.init(simpleConfig);

        HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
        try{
            String htmlBody = fetch(httpBoot, APP_URL + "?f=");
            Elements elements = Jsoup.parse(htmlBody).select("div.titlebar>a");
            for(Element element : elements){
                String lableUrl = APP_URL + element.attr("href");
                System.out.println("lableUrl========="+lableUrl);
                String htmlBodyLable = fetch(httpBoot, lableUrl);
                Elements elementsLable = Jsoup.parse(htmlBodyLable).select("div#infinite_scroll>a");
                for(Element elementLable : elementsLable){
                    System.out.println(elementLable.attr("href") + "=============" + elementLable.text());
                }
            }
        }catch (Exception e){
            // Console-only scratch tool: the first failure ends the run and is printed.
            e.printStackTrace();
        }
    }

    /** Issues a GET for {@code url} via the NAT heavy proxy and returns the response body text. */
    private static String fetch(HttpBoot httpBoot, String url) throws Exception {
        return httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
    }
}
......@@ -6,6 +6,7 @@ import java.util.List;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -16,10 +17,9 @@ import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class BaiduHotSearchRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(BaiduHotSearchRun.class);
@Override
public void run() {
boolean f = true;
......@@ -37,10 +37,10 @@ public class BaiduHotSearchRun extends Thread{
private void getHotList() {
logger.info("百度风云榜采集开始........");
log.info("百度风云榜采集开始........");
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
List<HotSearchList> list = BaiDuHotSearchCrawler.baiduHotSearch();
logger.info("{}, 此轮百度风云榜采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
log.info("{}, 此轮百度风云榜采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> saveDataList = new ArrayList<>();
if(Objects.nonNull(list) && !list.isEmpty()) {
list.forEach(baiduHotSearch ->{
......@@ -59,7 +59,7 @@ public class BaiduHotSearchRun extends Thread{
});
}
hotSearchDAO.addHotSearchList(saveDataList);
logger.info("百度风云榜采集结束........");
log.info("百度风云榜采集结束........");
}
}
\ No newline at end of file
......@@ -5,6 +5,7 @@ import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -15,10 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.DouyinHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class DouyinHotSearchRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(DouyinHotSearchRun.class);
@Override
public void run() {
boolean f = true;
......@@ -40,10 +40,10 @@ public class DouyinHotSearchRun extends Thread{
* @return void
*/
private void getHotList() {
logger.info("抖音热搜榜采集开始........");
log.info("抖音热搜榜采集开始........");
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
List<HotSearchList> list = DouyinHotSearchCrawler.getMobileDouyinHotList();
logger.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
log.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
for(HotSearchList douyinHotSearch : list){
int changeCount = hotSearchDAO.getChangeCount(douyinHotSearch);
......@@ -60,7 +60,7 @@ public class DouyinHotSearchRun extends Thread{
data.add(douyin);
hotSearchDAO.addHotSearch(douyin);
}
logger.info("抖音热搜榜采集结束........");
log.info("抖音热搜榜采集结束........");
}
}
package com.zhiwei.searchhotcrawler.timer;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.dao.WechatUserDao;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.searchhotcrawler.util.Template;
import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
import com.zhiwei.searchhotcrawler.util.WechatConstant;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Background thread that pushes newly appeared Weibo hot-search entries to WeChat users as
 * template messages. One round runs per loop iteration, followed by a pause of
 * {@code 1 * 60 * 60 * 1000} passed to {@code ZhiWeiTools.sleep} (presumably milliseconds,
 * i.e. one hour). Pushing only happens while the local hour of day is 7 through 22.
 */
public class SendWeiboHotSearchRun extends Thread {

    private static final Logger logger = LoggerFactory.getLogger(SendWeiboHotSearchRun.class);

    /** Name of the WeChat group whose members receive the notifications. */
    private static final String USER_GROUP = "weibohot";

    private static WechatUserDao wechatUserDao = new WechatUserDao();

    private HotSearchListDAO hotSearchDAO = new HotSearchListDAO();

    /**
     * Runs forever: each round sends one message per new entry, or a single
     * "no data" message when nothing new was found. Failures are logged and the loop goes on.
     */
    @Override
    public void run() {
        while (true) {
            try {
                int hour = Calendar.getInstance().get(Calendar.HOUR_OF_DAY);
                logger.info("微博推送,当前系统时间为:{}", hour);
                if (hour > 6 && hour < 23) {
                    List<DBObject> list = hotSearchDAO.getHotOneHour(HotSearchType.微博热搜.name());
                    if (list != null && !list.isEmpty()) {
                        for (DBObject weibo : list) {
                            String title = weibo.get("name").toString();
                            String time = TimeParse.dateFormartString((Date) weibo.get("time"), "yyyy-MM-dd HH:mm:ss");
                            // A missing url must not abort the batch; the template simply gets no link.
                            Object link = weibo.get("url");
                            sendTemplateByUserIds(title, time, link == null ? null : link.toString());
                        }
                    } else {
                        logger.info("微博最近一小时无数据");
                        sendTemplateByUserIds("最近一小时无数据",
                                TimeParse.dateFormartString(new Date(), "yyyy-MM-dd HH:mm:ss"), null);
                    }
                }
            } catch (Exception e) {
                // Logged at error level with the original throwable; fillInStackTrace() would
                // overwrite the trace with this catch site and hide where the failure happened.
                logger.error("微博热搜推送出现问题,问题为:::{}", e.getMessage(), e);
            }
            ZhiWeiTools.sleep(1 * 60 * 60 * 1000);
        }
    }

    /**
     * Sends one template message to every user returned by {@link #getUserList()}.
     *
     * @param title text placed in {@code keyword1}
     * @param time  text placed in {@code keyword3}
     * @param url   link attached to the message; not set on the template when {@code null}
     */
    public static void sendTemplateByUserIds(String title, String time, String url) {
        Map<String, Object> dataMap = new HashMap<String, Object>();
        dataMap.put("first", plainField("您好,有一条来自微博热搜榜的预警通知。"));
        dataMap.put("keyword1", coloredField(title));
        dataMap.put("keyword2", coloredField("微博热搜榜"));
        dataMap.put("keyword3", coloredField(time));
        dataMap.put("remark", plainField("知微情报监测服务"));

        List<String> userList = getUserList();
        if (userList == null || userList.isEmpty()) {
            logger.info("拉取微博用户列表失败");
            return;
        }
        for (String openId : userList) {
            Template template = new Template();
            template.setTouser(openId);
            if (url != null) {
                template.setUrl(url);
            }
            template.setTemplate_id(WechatConstant.WECHAT_TEMPLATEID_EARLY_IT);
            template.setData(dataMap);
            JSONObject templateJson = (JSONObject) JSONObject.toJSON(template);
            WechatCodeUtil.sendDataJson(templateJson);
        }
    }

    /** Builds a template field that carries only a {@code value}. */
    private static JSONObject plainField(String value) {
        JSONObject field = new JSONObject();
        field.put("value", value);
        return field;
    }

    /** Builds a template field with a {@code value} and the fixed color {@code #173177}. */
    private static JSONObject coloredField(String value) {
        JSONObject field = plainField(value);
        field.put("color", "#173177");
        return field;
    }

    /**
     * Returns the recipients of group {@code weibohot}: the list stored via
     * {@code WechatUserDao}, or, when that is {@code null} or empty, the list obtained from
     * {@code WechatCodeUtil.getUserListByGroupName}.
     *
     * @return the recipient ids; may be {@code null} or empty if both sources yield nothing
     */
    public static List<String> getUserList() {
        List<String> userList = wechatUserDao.getWechatUserByGroup(USER_GROUP);
        // The DAO reports "not found" and failures as an empty list, so a null-only check
        // would never reach the fallback.
        if (userList == null || userList.isEmpty()) {
            userList = WechatCodeUtil.getUserListByGroupName(USER_GROUP);
        }
        return userList;
    }
}
package com.zhiwei.searchhotcrawler.timer;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.dao.WechatUserDao;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.searchhotcrawler.util.Template;
import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
import com.zhiwei.searchhotcrawler.util.WechatConstant;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class SendWeiboHotSearchRun extends Thread {
private HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
private static WechatUserDao wechatUserDao = new WechatUserDao();
@Override
public void run() {
while (true) {
try {
Calendar calendar = Calendar.getInstance();
int hour = calendar.get(Calendar.HOUR_OF_DAY);
log.info("微博推送,当前系统时间为:" + hour);
if (hour > 6 && hour < 23) {
List<DBObject> list = hotSearchDAO.getHotOneHour(HotSearchType.微博热搜.name());
if (list != null && !list.isEmpty()) {
for (DBObject weibo : list) {
String title = weibo.get("name").toString();
String time = TimeParse.dateFormartString((Date) weibo.get("time"), "yyyy-MM-dd HH:mm:ss");
String url = weibo.get("url").toString();
sendTemplateByUserIds(title, time, url);
}
} else {
log.info("微博最近一小时无数据");
sendTemplateByUserIds("最近一小时无数据",
TimeParse.dateFormartString(new Date(), "yyyy-MM-dd HH:mm:ss"), null);
}
}
ZhiWeiTools.sleep(1 * 60 * 60 * 1000);
} catch (Exception e) {
log.debug("微博热搜推送出现问题,问题为:::{}", e.fillInStackTrace());
ZhiWeiTools.sleep(1 * 60 * 60 * 1000);
continue;
}
}
}
/**
* @Title: sendTemplateByUserIds
* @author hero
* @Description: 发送模版消息
* @param @param
* microTouTiao
* @param @param
* userList 设定文件
* @return void 返回类型
*/
public static void sendTemplateByUserIds(String title, String time, String url) {
Map<String, Object> dataMap = new HashMap<String, Object>();
JSONObject first = new JSONObject();
first.put("value", "您好,有一条来自微博热搜榜的预警通知。");
dataMap.put("first", first);
JSONObject keyword1 = new JSONObject();
keyword1.put("value", title);
keyword1.put("color", "#173177");
dataMap.put("keyword1", keyword1);
JSONObject keyword2 = new JSONObject();
keyword2.put("value", "微博热搜榜");
keyword2.put("color", "#173177");
dataMap.put("keyword2", keyword2);
JSONObject keyword3 = new JSONObject();
keyword3.put("value", time);
keyword3.put("color", "#173177");
dataMap.put("keyword3", keyword3);
JSONObject remark = new JSONObject();
remark.put("value", "知微情报监测服务");
dataMap.put("remark", remark);
List<String> userList = getUserList();
if (userList != null && userList.size() > 0) {
for (String openId : userList) {
Template template = new Template();
template.setTouser(openId);
if (url != null) {
template.setUrl(url);
}
template.setTemplate_id(WechatConstant.WECHAT_TEMPLATEID_EARLY_IT);
template.setData(dataMap);
JSONObject templateJson = (JSONObject) JSONObject.toJSON(template);
WechatCodeUtil.sendDataJson(templateJson);
}
} else {
log.info("拉取微博用户列表失败");
}
}
/**
* @Title: getUserList
* @author hero
* @Description: 用户列表
* @param @param
* projectName
* @param @return
* 设定文件
* @return List<String> 返回类型
*/
public static List<String> getUserList() {
List<String> userList = wechatUserDao.getWechatUserByGroup("weibohot");
if(userList==null){
userList = WechatCodeUtil.getUserListByGroupName("weibohot");
}
return userList;
}
}
package com.zhiwei.searchhotcrawler.timer;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.searchhotcrawler.dao.WechatUserDao;
import com.zhiwei.searchhotcrawler.util.Template;
import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
import com.zhiwei.searchhotcrawler.util.WechatConstant;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Background thread that pushes newly appeared Zhihu hot-search entries to WeChat users as
 * template messages. One round runs per loop iteration, followed by a pause of
 * {@code 1*60*60*1000} passed to {@code ZhiWeiTools.sleep} (presumably milliseconds, i.e. one
 * hour). Pushing only happens while the local hour of day is 7 through 22.
 */
public class SendZhihuHotSearchRun extends Thread{

    private static final Logger logger = LoggerFactory.getLogger(SendZhihuHotSearchRun.class);

    /** Name of the WeChat group whose members receive the notifications. */
    private static final String USER_GROUP = "LP组";

    private static WechatUserDao wechatUserDao = new WechatUserDao();

    private HotSearchListDAO hotSearchDAO = new HotSearchListDAO();

    /**
     * Runs forever: each round sends one message per new entry, or a single
     * "no data" message when nothing new was found. Failures are logged and the loop goes on.
     */
    @Override
    public void run() {
        while(true) {
            try {
                int hour = Calendar.getInstance().get(Calendar.HOUR_OF_DAY);
                logger.info("知乎推送,当前系统时间为:{}", hour);
                if(hour > 6 && hour < 23){
                    List<DBObject> list = hotSearchDAO.getHotOneHour(HotSearchType.知乎热搜.name());
                    if(list != null && !list.isEmpty()){
                        // The hour was already checked above on the same snapshot, so no
                        // per-item re-check is needed.
                        for(DBObject zhihu : list){
                            String title = zhihu.get("display_query").toString();
                            String time = TimeParse.dateFormartString((Date)zhihu.get("time"), "yyyy-MM-dd HH:mm:ss");
                            String url = zhihu.get("_id").toString();
                            sendTemplateByUserIds(title, time, url);
                        }
                    }else{
                        logger.info("知乎最近一小时无数据");
                        sendTemplateByUserIds("最近一小时无数据", TimeParse.dateFormartString(new Date(), "yyyy-MM-dd HH:mm:ss"), null);
                    }
                }
            } catch (Exception e) {
                // Logged at error level with the original throwable; fillInStackTrace() would
                // overwrite the trace with this catch site and hide where the failure happened.
                logger.error("知乎热搜推送出现问题,问题为:::{}", e.getMessage(), e);
            }
            ZhiWeiTools.sleep(1*60*60*1000);
        }
    }

    /**
     * Sends one template message to every user returned by {@code getUserList()}.
     *
     * @param title text placed in {@code keyword1}
     * @param time  text placed in {@code keyword3}
     * @param url   link attached to the message; not set on the template when {@code null}
     */
    public static void sendTemplateByUserIds(String title,String time, String url) {
        Map<String, Object> dataMap = new HashMap<>();
        dataMap.put("first", plainField("您好,有一条来自知乎热搜榜的预警通知。"));
        dataMap.put("keyword1", coloredField(title));
        dataMap.put("keyword2", coloredField("知乎热搜榜"));
        dataMap.put("keyword3", coloredField(time));
        dataMap.put("remark", plainField("知微情报监测服务"));

        List<String> userList = getUserList();
        if(userList == null || userList.isEmpty()) {
            logger.info("知乎推送拉取用户列表失败");
            return;
        }
        for (String openId : userList) {
            Template template = new Template();
            template.setTouser(openId);
            if(url != null){
                template.setUrl(url);
            }
            template.setTemplate_id(WechatConstant.WECHAT_TEMPLATEID_EARLY_IT);
            template.setData(dataMap);
            JSONObject templateJson = (JSONObject)JSONObject.toJSON(template);
            WechatCodeUtil.sendDataJson(templateJson);
        }
    }

    /** Builds a template field that carries only a {@code value}. */
    private static JSONObject plainField(String value) {
        JSONObject field = new JSONObject();
        field.put("value", value);
        return field;
    }

    /** Builds a template field with a {@code value} and the fixed color {@code #173177}. */
    private static JSONObject coloredField(String value) {
        JSONObject field = plainField(value);
        field.put("color", "#173177");
        return field;
    }

    /**
     * Returns the recipients of group {@code LP组}: the list stored via {@code WechatUserDao},
     * or, when that is {@code null} or empty, the list obtained from
     * {@code WechatCodeUtil.getUserListByGroupName}.
     *
     * @return the recipient ids; may be {@code null} or empty if both sources yield nothing
     */
    private static List<String> getUserList()
    {
        List<String> userList = wechatUserDao.getWechatUserByGroup(USER_GROUP);
        // The DAO reports "not found" and failures as an empty list, so a null-only check
        // would never reach the fallback.
        if(userList == null || userList.isEmpty()){
            userList = WechatCodeUtil.getUserListByGroupName(USER_GROUP);
        }
        return userList;
    }
}
package com.zhiwei.searchhotcrawler.timer;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.searchhotcrawler.dao.WechatUserDao;
import com.zhiwei.searchhotcrawler.util.Template;
import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
import com.zhiwei.searchhotcrawler.util.WechatConstant;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class SendZhihuHotSearchRun extends Thread{
private HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
private static WechatUserDao wechatUserDao = new WechatUserDao();
/**
 * Runs forever: each round, while the local hour of day is 7 through 22, sends one template
 * message per new Zhihu hot-search entry, or a single "no data" message when nothing new was
 * found. Failures are logged and the loop continues after the pause.
 */
@Override
public void run() {
    while(true) {
        try {
            int hour = Calendar.getInstance().get(Calendar.HOUR_OF_DAY);
            log.info("知乎推送,当前系统时间为:{}", hour);
            if(hour > 6 && hour < 23){
                List<DBObject> list = hotSearchDAO.getHotOneHour(HotSearchType.知乎热搜.name());
                if(list != null && !list.isEmpty()){
                    // The hour was already checked above on the same snapshot, so no
                    // per-item re-check is needed.
                    for(DBObject zhihu : list){
                        String title = zhihu.get("display_query").toString();
                        String time = TimeParse.dateFormartString((Date)zhihu.get("time"), "yyyy-MM-dd HH:mm:ss");
                        String url = zhihu.get("_id").toString();
                        sendTemplateByUserIds(title, time, url);
                    }
                }else{
                    log.info("知乎最近一小时无数据");
                    sendTemplateByUserIds("最近一小时无数据", TimeParse.dateFormartString(new Date(), "yyyy-MM-dd HH:mm:ss"), null);
                }
            }
        } catch (Exception e) {
            // Logged at error level with the original throwable; fillInStackTrace() would
            // overwrite the trace with this catch site and hide where the failure happened.
            log.error("知乎热搜推送出现问题,问题为:::{}", e.getMessage(), e);
        }
        ZhiWeiTools.sleep(1*60*60*1000);
    }
}
/**
* @Title: sendTemplateByUserIds
* @author hero
* @Description: 发送模版消息
* @param @param microTouTiao
* @param @param userList 设定文件
* @return void 返回类型
*/
public static void sendTemplateByUserIds(String title,String time, String url) {
Map<String, Object> dataMap = new HashMap<>();
JSONObject first = new JSONObject();
first.put("value", "您好,有一条来自知乎热搜榜的预警通知。");
dataMap.put("first", first);
JSONObject keyword1 = new JSONObject();
keyword1.put("value", title);
keyword1.put("color", "#173177");
dataMap.put("keyword1", keyword1);
JSONObject keyword2 = new JSONObject();
keyword2.put("value", "知乎热搜榜");
keyword2.put("color", "#173177");
dataMap.put("keyword2", keyword2);
JSONObject keyword3 = new JSONObject();
keyword3.put("value", time);
keyword3.put("color", "#173177");
dataMap.put("keyword3", keyword3);
JSONObject remark = new JSONObject();
remark.put("value", "知微情报监测服务");
dataMap.put("remark", remark);
List<String> userList = getUserList();
if(userList!=null && !userList.isEmpty()) {
for (String openId : userList) {
Template template = new Template();
template.setTouser(openId);
if(url!=null){
template.setUrl(url);
}
template.setTemplate_id(WechatConstant.WECHAT_TEMPLATEID_EARLY_IT);
template.setData(dataMap);
JSONObject templateJson = (JSONObject)JSONObject.toJSON(template);
WechatCodeUtil.sendDataJson(templateJson);
}
}else {
log.info("知乎推送拉取用户列表失败");
}
}
/**
* @Title: getUserList
* @author hero
* @Description: 用户列表
* @param @param projectName
* @param @return 设定文件
* @return List<String> 返回类型
*/
private static List<String> getUserList()
{
List<String> userList = wechatUserDao.getWechatUserByGroup("LP组");
if(userList==null){
userList = WechatCodeUtil.getUserListByGroupName("LP组");
}
return userList;
}
}
......@@ -5,6 +5,7 @@ import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -15,9 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class SougoHotSearchRun extends Thread {
private static Logger logger = LoggerFactory.getLogger(SougoHotSearchRun.class);
@Override
public void run() {
boolean f = true;
......@@ -36,9 +37,9 @@ public class SougoHotSearchRun extends Thread {
private void getHotList() {
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
logger.info("搜狗微信采集开始........");
log.info("搜狗微信采集开始........");
List<HotSearchList> list = SougoHotSearchCrawler.sougoHotSearch();
logger.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
log.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
for(HotSearchList sougoHotSearch : list){
DBObject doc = new BasicDBObject();
......@@ -52,7 +53,7 @@ public class SougoHotSearchRun extends Thread {
data.add(doc);
}
hotSearchDAO.addHotSearchList(data);
logger.info("搜狗微信采集结束........");
log.info("搜狗微信采集结束........");
}
}
package com.zhiwei.searchhotcrawler.timer;
import java.util.Calendar;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.searchhotcrawler.dao.WechatUserDao;
import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Background thread that periodically copies the WeChat account's user
 * groups into storage.
 *
 * Each cycle, when the current hour is after 06:59, it reads the group map
 * from {@code WechatCodeUtil.getAllGroupIp()}, fetches the users of every
 * group and hands each non-empty list to {@code WechatUserDao.addWechatUser}.
 * The thread then sleeps for one hour, whether or not the cycle succeeded.
 */
public class UpdateWechatUserRun extends Thread{

    /** Pause between two refresh cycles, in milliseconds. */
    private static final int ONE_HOUR_MILLIS = 60 * 60 * 1000;

    private WechatUserDao wechatUserDao = new WechatUserDao();

    private static Logger logger = LoggerFactory.getLogger(UpdateWechatUserRun.class);

    /**
     * Runs the hourly refresh loop forever. Any exception raised during a
     * cycle is logged with its stack trace and the loop continues after the
     * usual one-hour pause.
     */
    @Override
    public void run() {
        logger.info("开始更新用户数据");
        while (true) {
            try {
                int hour = Calendar.getInstance().get(Calendar.HOUR_OF_DAY);
                if (hour > 6) {
                    refreshAllGroups();
                }
                ZhiWeiTools.sleep(ONE_HOUR_MILLIS);
            } catch (Exception e) {
                // Error level plus the exception itself keeps the original
                // stack trace; the message now names this job instead of the
                // Zhihu push it was copied from.
                logger.error("更新微信用户数据出现问题,问题为:::{}", e.toString(), e);
                ZhiWeiTools.sleep(ONE_HOUR_MILLIS);
            }
        }
    }

    /**
     * Stores the user list of every group reported by
     * {@code WechatCodeUtil.getAllGroupIp()}. A missing group map or a missing
     * user list is treated as "nothing to store" rather than as an error.
     */
    private void refreshAllGroups() {
        Map<String,Integer> groupMap = WechatCodeUtil.getAllGroupIp();
        // Null must be checked before the map is dereferenced.
        int groupCount = groupMap == null ? 0 : groupMap.size();
        logger.info("此公众号的分组数量为:::{}", groupCount);
        if (groupCount == 0) {
            return;
        }
        for (Entry<String,Integer> group : groupMap.entrySet()) {
            logger.info("此公众号的分组名称及IP为:::{},{}", group.getKey(), group.getValue());
            List<String> userList = WechatCodeUtil.getUserListByGroupId(group.getValue());
            int userCount = userList == null ? 0 : userList.size();
            logger.info("{},此分组下的用户数量为::{}", group.getKey(), userCount);
            if (userCount > 0) {
                wechatUserDao.addWechatUser(userList, group.getKey(), group.getValue());
            }
        }
    }
}
package com.zhiwei.searchhotcrawler.timer;
import java.util.Calendar;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.searchhotcrawler.dao.WechatUserDao;
import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class UpdateWechatUserRun extends Thread{
private WechatUserDao wechatUserDao = new WechatUserDao();
@Override
public void run() {
log.info("开始更新用户数据");
while(true) {
try {
Calendar calendar = Calendar.getInstance();
int hour = calendar.get(Calendar.HOUR_OF_DAY);
if(hour > 6 ){
Map<String,Integer> groupMap = WechatCodeUtil.getAllGroupIp();
log.info("此公众号的分组数量为:::{}", groupMap.size());
if(!groupMap.isEmpty() && groupMap!=null){
for(Entry<String,Integer> group : groupMap.entrySet()){
log.info("此公众号的分组名称及IP为:::{},{}", group.getKey(), group.getValue());
List<String> userList = WechatCodeUtil.getUserListByGroupId(group.getValue());
log.info("{},此分组下的用户数量为::{}", group.getKey(), userList.size());
if(userList!=null && !userList.isEmpty()){
wechatUserDao.addWechatUser(userList, group.getKey(), group.getValue());
}
}
}
}
ZhiWeiTools.sleep(1*60*60*1000);
} catch (Exception e) {
log.debug("知乎热搜推送出现问题,问题为:::{}",e.fillInStackTrace());
ZhiWeiTools.sleep(1*60*60*1000);
continue;
}
}
}
}
......@@ -5,6 +5,7 @@ import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -15,10 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class WeiboHotSearchRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(WeiboHotSearchRun.class);
@Override
public void run() {
boolean f = true;
......@@ -36,11 +36,11 @@ public class WeiboHotSearchRun extends Thread{
private void getHotList() {
logger.info("微博话题采集开始........");
log.info("微博话题采集开始........");
HotSearchListDAO weiboHotSearchDAO = new HotSearchListDAO();
// List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearch();
List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearchByPhone();
logger.info("{}, 微博此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
log.info("{}, 微博此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
for(HotSearchList weiboHotSearch : list){
int changeCount = weiboHotSearchDAO.getChangeCount(weiboHotSearch);
......@@ -49,7 +49,7 @@ public class WeiboHotSearchRun extends Thread{
doc.put("name", weiboHotSearch.getName());
doc.put("url", weiboHotSearch.getUrl());
doc.put("count", weiboHotSearch.getCount());
doc.put("hot", weiboHotSearch.isHot());
doc.put("hot", weiboHotSearch.getHot());
doc.put("day", weiboHotSearch.getDay());
doc.put("time", weiboHotSearch.getTime());
doc.put("changeCount", changeCount);
......@@ -59,7 +59,7 @@ public class WeiboHotSearchRun extends Thread{
data.add(doc);
}
weiboHotSearchDAO.addHotSearchList(data);
logger.info("微博话题采集结束........");
log.info("微博话题采集结束........");
}
}
package com.zhiwei.searchhotcrawler.timer;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
import com.zhiwei.searchhotcrawler.crawler.WeiboSuperTopicCrawler;
import com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class WeiboSuperTopicRun extends Thread{
@Override
public void run() {
boolean f = true;
while(f) {
try {
getTopicList();
TimeUnit.HOURS.sleep(3);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*60*1000);
}
ZhiWeiTools.sleep(50);
}
}
private void getTopicList() {
WeiboSuperTopicDAO weiboTopicDAO = new WeiboSuperTopicDAO();
log.info("微博超话采集开始........");
List<WeiboSuperTopic> list = WeiboSuperTopicCrawler.startCrawler();
log.info("{}, 微博超话此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
for(WeiboSuperTopic topic : list){
log.info("topic::::{}", topic);
DBObject doc = new BasicDBObject();
doc.put("_id", topic.getId());
doc.put("name", topic.getTopicName());
doc.put("rank", topic.getRank());
doc.put("score_num", topic.getScore());
doc.put("fensi_num", topic.getFensi());
doc.put("post_num", topic.getPostNum());
doc.put("type", topic.getType());
doc.put("day", topic.getDay());
doc.put("time", topic.getTime());
doc.put("url", topic.getUrl());
data.add(doc);
}
weiboTopicDAO.addTopicList(data);
log.info("微博话题采集结束........");
}
}
package com.zhiwei.searchhotcrawler.timer;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.WeiboTopicCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.WeiboTopic;
import com.zhiwei.searchhotcrawler.crawler.WeiboHuatiCrawler;
import com.zhiwei.searchhotcrawler.dao.WeiboTopicDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class WeiboTopicRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(WeiboTopicRun.class);
@Override
public void run() {
boolean f = true;
while(f) {
try {
getTopicList();
TimeUnit.HOURS.sleep(3);
TimeUnit.MINUTES.sleep(3);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*60*1000);
......@@ -36,28 +31,29 @@ public class WeiboTopicRun extends Thread{
private void getTopicList() {
WeiboTopicDAO weiboTopicDAO = new WeiboTopicDAO();
logger.info("微博超话采集开始........");
List<WeiboTopic> list = WeiboHuatiCrawler.startCrawler();
logger.info("{}, 微博超话此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
HotSearchListDAO weiboHotSearchDAO = new HotSearchListDAO();
log.info("微博话题采集开始........");
List<HotSearchList> list = WeiboTopicCrawler.startCrawlerByPhone();
log.info("{}, 微博话题此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
for(WeiboTopic topic : list){
logger.info("topic::::{}", topic);
for(HotSearchList topic : list){
log.info("topic::::{}", topic);
DBObject doc = new BasicDBObject();
doc.put("_id", topic.getId());
doc.put("name", topic.getTopicName());
doc.put("rank", topic.getRank());
doc.put("score_num", topic.getScore());
doc.put("fensi_num", topic.getFensi());
doc.put("post_num", topic.getPostNum());
doc.put("type", topic.getType());
doc.put("name", topic.getName());
doc.put("url", topic.getUrl());
doc.put("count", topic.getCount());
doc.put("hot", topic.getHot());
doc.put("day", topic.getDay());
doc.put("time", topic.getTime());
doc.put("url", topic.getUrl());
doc.put("rank", topic.getRank());
doc.put("type", topic.getType());
doc.put("topic_lead", topic.getTopicLead());
doc.put("comment_count", topic.getCommentCount());
data.add(doc);
}
weiboTopicDAO.addTopicList(data);
logger.info("微博话题采集结束........");
weiboHotSearchDAO.addHotSearchList(data);
log.info("微博话题采集结束........");
}
}
......@@ -4,6 +4,7 @@ import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -14,10 +15,9 @@ import com.zhiwei.searchhotcrawler.crawler.ZhihuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class ZhihuHotSearchRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(ZhihuHotSearchRun.class);
@Override
public void run() {
boolean f = true;
......@@ -32,24 +32,22 @@ public class ZhihuHotSearchRun extends Thread{
ZhiWeiTools.sleep(50);
}
}
private void getHotList() {
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
logger.info("知乎话题采集开始...,当前线程名字:{}", Thread.currentThread().getName());
log.info("知乎话题采集开始...,当前线程名字:{}", Thread.currentThread().getName());
List<HotSearchList> list = ZhihuHotSearchCrawler.getZhihuHotList();
List<HotSearchList> mobilelist = ZhihuHotSearchCrawler.getMobileZhihuHotList();
list.addAll(mobilelist);
logger.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
log.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
for(HotSearchList zhihuHotSearch : list){
DBObject zhihu = new BasicDBObject();
zhihu.put("_id", zhihuHotSearch.getId());
zhihu.put("name", zhihuHotSearch.getName());
zhihu.put("url", zhihuHotSearch.getUrl());
zhihu.put("count", zhihuHotSearch.getCount());
zhihu.put("hot", zhihuHotSearch.isHot());
zhihu.put("hot", zhihuHotSearch.getHot());
zhihu.put("day", zhihuHotSearch.getDay());
zhihu.put("time", zhihuHotSearch.getTime());
zhihu.put("changeCount", 0);
......@@ -57,7 +55,7 @@ public class ZhihuHotSearchRun extends Thread{
zhihu.put("type", zhihuHotSearch.getType());
hotSearchDAO.addHotSearch(zhihu);
}
logger.info("知乎话题采集结束........");
log.info("知乎话题采集结束........");
}
}
#mongoIp=202.107.192.94
mongoIp=192.168.0.101
mongoPort=30000
#mongoIp=192.168.0.81
#mongoPort=27017
db.username=datapush
db.paasword=4d8ce5c42073c
db.certifiedDB=admin
dbName=hot_search_list
searchCollName=hot_search_list
topicCollName=topic_list
#mongoIp=202.107.192.94
mongoIp=192.168.0.101
mongoPort=30000
#mongoIp=192.168.0.81
#mongoPort=27017
db.username=searchhotcrawleruser
db.paasword=searchhotcrawler1q2w3e4r
db.certifiedDB=admin
dbName=hot_search_list
searchCollName=hot_search_list
topicCollName=topic_list
collWechatUserName=wechat_user
\ No newline at end of file
registry=zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182
group=hangzhou
########################################################
#registry=zookeeper://192.168.0.36:2181
#registry=zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181
#group=local
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment