Commit cb6bcd76 by zhiwei

添加微博话题采集,并添加lombok

parent a9966f9d
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>searchhotcrawler</artifactId> <artifactId>searchhotcrawler</artifactId>
<version>0.0.6-SNAPSHOT</version> <version>0.0.6-SNAPSHOT</version>
<name>各平台热搜榜单采集程序</name> <name>各平台热搜榜单采集程序</name>
<description>各平台热搜榜单采集程序 <description>各平台热搜榜单采集程序
目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序</description> 目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序</description>
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties> </properties>
<developers> <developers>
<developer> <developer>
<id>Bewilder</id> <id>Bewilder</id>
<name>zhiwei zhang</name> <name>zhiwei zhang</name>
<email>zhangzhiwei@zhiweidata.com</email> <email>zhangzhiwei@zhiweidata.com</email>
</developer> </developer>
</developers> </developers>
<dependencies> <dependencies>
<!-- 数据解析jar --> <!-- 数据解析jar -->
<dependency> <dependency>
<groupId>org.mongodb</groupId> <groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId> <artifactId>mongo-java-driver</artifactId>
<version>3.6.3</version> <version>3.6.3</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>sendmail</artifactId> <artifactId>sendmail</artifactId>
<version>0.0.1-SNAPSHOT</version> <version>0.0.1-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.1.5-SNAPSHOT</version> <version>0.1.6-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.5.5.6-SNAPSHOT</version> <version>0.6.0.4-RELEASE</version>
</dependency> </dependency>
</dependencies> <dependency>
<groupId>org.projectlombok</groupId>
<build> <artifactId>lombok</artifactId>
<plugins> <version>1.18.8</version>
<plugin> </dependency>
<groupId>org.apache.maven.plugins</groupId> </dependencies>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.2</version> <build>
<executions> <plugins>
<execution> <plugin>
<phase>package</phase> <groupId>org.apache.maven.plugins</groupId>
<goals> <artifactId>maven-shade-plugin</artifactId>
<goal>shade</goal> <version>2.4.2</version>
</goals> <executions>
<configuration> <execution>
<filters> <phase>package</phase>
<filter> <goals>
<artifact>*:*</artifact> <goal>shade</goal>
<excludes> </goals>
<exclude>META-INF/*.SF</exclude> <configuration>
<exclude>META-INF/*.DSA</exclude> <filters>
<exclude>META-INF/*.RSA</exclude> <filter>
</excludes> <artifact>*:*</artifact>
</filter> <excludes>
</filters> <exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<transformers> <exclude>META-INF/*.RSA</exclude>
<transformer </excludes>
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> </filter>
<mainClass>com.zhiwei.searchhotcrawler.run.HotSearchRun</mainClass> </filters>
</transformer>
</transformers> <transformers>
</configuration> <transformer
</execution> implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
</executions> <mainClass>com.zhiwei.searchhotcrawler.run.HotSearchRun</mainClass>
</plugin> </transformer>
</transformers>
<plugin> </configuration>
<artifactId>maven-source-plugin</artifactId> </execution>
<version>2.4</version> </executions>
<configuration> </plugin>
<attach>true</attach>
</configuration> <plugin>
<executions> <artifactId>maven-source-plugin</artifactId>
<execution> <version>2.4</version>
<phase>compile</phase> <configuration>
<goals> <attach>true</attach>
<goal>jar</goal> </configuration>
</goals> <executions>
</execution> <execution>
</executions> <phase>compile</phase>
</plugin> <goals>
<goal>jar</goal>
<!-- 解决maven test命令时console出现中文乱码乱码 --> </goals>
<plugin> </execution>
<groupId>org.apache.maven.plugins</groupId> </executions>
<artifactId>maven-surefire-plugin</artifactId> </plugin>
<version>2.19.1</version>
<configuration> <!-- 解决maven test命令时console出现中文乱码乱码 -->
<forkMode>once</forkMode> <plugin>
<argLine>-Dfile.encoding=UTF-8</argLine> <groupId>org.apache.maven.plugins</groupId>
<skipTests>true</skipTests> <artifactId>maven-surefire-plugin</artifactId>
</configuration> <version>2.19.1</version>
</plugin> <configuration>
</plugins> <forkMode>once</forkMode>
</build> <argLine>-Dfile.encoding=UTF-8</argLine>
<skipTests>true</skipTests>
</configuration>
</plugin>
</plugins>
</build>
</project> </project>
\ No newline at end of file
...@@ -10,36 +10,80 @@ import java.io.Serializable; ...@@ -10,36 +10,80 @@ import java.io.Serializable;
import java.util.Date; import java.util.Date;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import lombok.Data;
import lombok.ToString;
@Data
@ToString
public class HotSearchList implements Serializable{ public class HotSearchList implements Serializable{
private static final long serialVersionUID = 2076919584659821600L; private static final long serialVersionUID = 2076919584659821600L;
private String id; //主键
private String url; //消息链接
private String name; //热搜关键词
private Integer count; //时时热搜量
private Boolean hot; //状态(true 为热搜; false为时时上升)
private String day; //天
private Date time; //时间
private Integer changeCount; //据上分钟变化量
private Integer rank; //排名
private String type; //分类
private String icon; //热搜类型 /**
* 主键
*/
private String id;
/**
* 消息链接
*/
private String url;
/**
* 热搜关键词
*/
private String name;
/**
* 热搜或话题导语
*/
private String topicLead;
/**
* 时时热搜量
*/
private Integer count;
/**
* 状态(true 为热搜; false为时时上升)
*/
private Boolean hot;
/**
* 天
*/
private String day;
/**
* 时间
*/
private Date time;
/**
* 据上分钟变化量
*/
private Integer changeCount;
/**
* 排名
*/
private Integer rank;
/**
* 分类
*/
private String type;
/**
* 热搜类型
*/
private String icon;
/**
* 话题讨论量
*/
private Integer commentCount;
public HotSearchList(){} public HotSearchList(){}
public HotSearchList(String url, String name, Integer count,Boolean hot,Integer rank,String type,String icon){ public HotSearchList(String url, String name, Integer count,Boolean hot,Integer rank,String type,String icon){
...@@ -67,122 +111,20 @@ public class HotSearchList implements Serializable{ ...@@ -67,122 +111,20 @@ public class HotSearchList implements Serializable{
this.day = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd"); this.day = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
this.type = type; this.type = type;
} }
@Override
public String toString(){
return "new HotSearchList["
+ "id = " + id
+ ", url = " + url
+ ", name = " + name
+ ", count = " + count
+ ", time = " + time
+ ", hot = " + hot
+ ", rank = " + rank
+ ", day = " + day
+ ", changeCount = " + changeCount
+ ", type = " + type
+ ", icon = " + icon
+ "]";
}
public String getId() {
return id;
}
public void setId(String id) { public HotSearchList(String url, String name, Integer count,Integer rank,String type, Integer commentCount, String topicLead){
this.id = id; this.id = name + "_" + new Date().getTime();
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url; this.url = url;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name; this.name = name;
}
public Integer getCount() {
return count;
}
public void setCount(Integer count) {
this.count = count; this.count = count;
} this.hot = true;
public Date getTime() {
return time;
}
public void setTime(Date time) {
this.time = time;
}
public Integer getChangeCount() {
return changeCount;
}
public void setChangeCount(Integer changeCount) {
this.changeCount = changeCount;
}
public static long getSerialversionuid() {
return serialVersionUID;
}
public Boolean isHot() {
return hot;
}
public void setHot(Boolean hot) {
this.hot = hot;
}
public Boolean getHot() {
return hot;
}
public String getIcon() {
return icon;
}
public void setIcon(String icon) {
this.icon = icon;
}
public String getDay() {
return day;
}
public void setDay(String day) {
this.day = day;
}
public Integer getRank() {
return rank;
}
public void setRank(Integer rank) {
this.rank = rank; this.rank = rank;
} this.time = new Date();
this.day = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
public String getType() {
return type;
}
public void setType(String type) {
this.type = type; this.type = type;
this.commentCount = commentCount;
this.topicLead = topicLead;
} }
} }
package com.zhiwei.searchhotcrawler.bean; package com.zhiwei.searchhotcrawler.bean;
public enum HotSearchType { public enum HotSearchType {
百度热搜, 百度热搜,
微博热搜, 微博热搜,
知乎热搜, 知乎热搜,
抖音热搜, 抖音热搜,
搜狗微信热搜 搜狗微信热搜,
} 微博话题
}
package com.zhiwei.searchhotcrawler.bean;
import java.util.Date;
import com.zhiwei.tools.timeparse.TimeParse;
import lombok.Data;
import lombok.ToString;
/**
 * A single Weibo super-topic ranking entry.
 *
 * <p>Accessors, {@code equals}, {@code hashCode} and {@code toString} are generated by
 * Lombok's {@code @Data}; several fields are additionally declared {@code public}
 * and are kept that way so existing direct field access keeps working.
 *
 * @author Bewilder ZW
 */
@Data
@ToString
public class WeiboSuperTopic {

    /** Primary key; the six-argument constructor builds it as {@code topicName_type_day}. */
    private String id;

    /** Topic link. */
    public String url;

    /** Topic name. */
    public String topicName;

    /** Topic rank. */
    public Integer rank;

    /** Topic influence score. */
    public String score;

    /** Topic follower count. */
    public String fensi;

    /** Topic read count; not populated by the constructors. */
    public String readNum;

    /** Topic post count; not populated by the constructors. */
    public String postNum;

    /** Ranking list type. */
    public String type;

    /** Day the entry was constructed, formatted with the pattern {@code yyyy-MM-dd}. */
    private String day;

    /** Timestamp taken when the entry was constructed. */
    private Date time;

    /** Creates an empty entry; every field is left {@code null}. */
    public WeiboSuperTopic() {}

    /**
     * Creates an entry stamped with the current time.
     *
     * @param url       topic link
     * @param topicName topic name
     * @param rank      topic rank
     * @param score     topic influence score
     * @param fensi     topic follower count
     * @param type      ranking list type
     */
    public WeiboSuperTopic(String url, String topicName, Integer rank, String score,
            String fensi, String type) {
        this.url = url;
        this.topicName = topicName;
        this.rank = rank;
        this.score = score;
        this.fensi = fensi;
        this.type = type;
        // Take the timestamp once so that time, day and the day-based id always
        // agree, even when construction happens right at midnight.
        Date now = new Date();
        this.time = now;
        this.day = TimeParse.dateFormartString(now, "yyyy-MM-dd");
        this.id = topicName + "_" + type + "_" + day;
    }
}
package com.zhiwei.searchhotcrawler.bean;
import java.util.Date;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * A single Weibo topic ranking entry.
 *
 * <p>Several fields are declared {@code public} in addition to having accessors;
 * they are kept that way so existing direct field access keeps working.
 *
 * @author Bewilder ZW
 */
public class WeiboTopic {

    /** Primary key; the six-argument constructor builds it as {@code topicName_type_day}. */
    private String id;
    /** Topic link. */
    public String url;
    /** Topic name. */
    public String topicName;
    /** Topic rank. */
    public Integer rank;
    /** Topic influence score. */
    public String score;
    /** Topic follower count. */
    public String fensi;
    /** Topic read count; not populated by the constructors. */
    public String readNum;
    /** Topic post count; not populated by the constructors. */
    public String postNum;
    /** Ranking list type. */
    public String type;
    /** Day the entry was constructed, formatted with the pattern {@code yyyy-MM-dd}. */
    private String day;
    /** Timestamp taken when the entry was constructed. */
    private Date time;

    /** Creates an empty entry; every field is left {@code null}. */
    public WeiboTopic() {}

    /**
     * Creates an entry stamped with the current time.
     *
     * @param url       topic link
     * @param topicName topic name
     * @param rank      topic rank
     * @param score     topic influence score
     * @param fensi     topic follower count
     * @param type      ranking list type
     */
    public WeiboTopic(String url, String topicName, Integer rank, String score,
            String fensi, String type) {
        this.url = url;
        this.topicName = topicName;
        this.rank = rank;
        this.score = score;
        this.fensi = fensi;
        this.type = type;
        // Take the timestamp once so that time, day and the day-based id always
        // agree, even when construction happens right at midnight.
        Date now = new Date();
        this.time = now;
        this.day = TimeParse.dateFormartString(now, "yyyy-MM-dd");
        this.id = topicName + "_" + type + "_" + day;
    }

    /**
     * Renders the ranking fields; {@code id}, {@code day} and {@code time} are not included.
     */
    @Override
    public String toString() {
        return "new WeiboTopic["
                + "topicName = " + topicName
                + ", rank = " + rank
                + ", score = " + score
                + ", fensi = " + fensi
                + ", type = " + type
                + ", readNum = " + readNum
                + ", postNum = " + postNum
                + ", url = " + url
                + "]";
    }

    public String getUrl() {
        return url;
    }

    public String getTopicName() {
        return topicName;
    }

    public Integer getRank() {
        return rank;
    }

    public String getScore() {
        return score;
    }

    public String getFensi() {
        return fensi;
    }

    public String getReadNum() {
        return readNum;
    }

    public String getPostNum() {
        return postNum;
    }

    public String getType() {
        return type;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public void setTopicName(String topicName) {
        this.topicName = topicName;
    }

    public void setRank(Integer rank) {
        this.rank = rank;
    }

    public void setScore(String score) {
        this.score = score;
    }

    public void setFensi(String fensi) {
        this.fensi = fensi;
    }

    public void setReadNum(String readNum) {
        this.readNum = readNum;
    }

    public void setPostNum(String postNum) {
        this.postNum = postNum;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getId() {
        return id;
    }

    public String getDay() {
        return day;
    }

    public Date getTime() {
        return time;
    }

    public void setId(String id) {
        this.id = id;
    }

    public void setDay(String day) {
        this.day = day;
    }

    public void setTime(Date time) {
        this.time = time;
    }
}
package com.zhiwei.searchhotcrawler.cache; package com.zhiwei.searchhotcrawler.cache;
import org.slf4j.Logger; import lombok.extern.log4j.Log4j2;
import org.slf4j.LoggerFactory; import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.tools.tools.ZhiWeiTools;
public class CacheListener { @Log4j2
Logger logger = LoggerFactory.getLogger(CacheListener.class); public class CacheListener {
public void startListen() { /**
new Thread(){ * 开启缓存监听
public void run() { */
while (true) { public void startListen() {
if(CacheManager.caches!=null && CacheManager.caches.size()>0){ new Thread(){
for(String key : CacheManager.getAllKeys()) { public void run() {
if (CacheManager.isTimeOut(key)) { while (true) {
CacheManager.clearByKey(key); if(CacheManager.caches!=null && CacheManager.caches.size()>0){
logger.info(key + "缓存被清除"); for(String key : CacheManager.getAllKeys()) {
} if (CacheManager.isTimeOut(key)) {
} CacheManager.clearByKey(key);
} log.info(key + "缓存被清除");
ZhiWeiTools.sleep(500); }
} }
} }
}.start(); ZhiWeiTools.sleep(500);
} }
} }
}.start();
}
}
package com.zhiwei.searchhotcrawler.crawler; package com.zhiwei.searchhotcrawler.crawler;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import org.apache.commons.lang3.StringUtils; import lombok.extern.log4j.Log4j2;
import org.jsoup.Jsoup; import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document; import org.jsoup.Jsoup;
import org.jsoup.select.Elements; import org.jsoup.nodes.Document;
import org.slf4j.Logger; import org.jsoup.select.Elements;
import org.slf4j.LoggerFactory; import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
/**
* @ClassName:BaiDuHotSearch /**
* @Description: TODO(百度风云榜热搜采集) * @ClassName:BaiDuHotSearch
* @author hero * @Description: TODO(百度风云榜热搜采集)
* @date 2019年7月10日 上午10:54:31 * @author hero
*/ * @date 2019年7月10日 上午10:54:31
public class BaiDuHotSearchCrawler { */
@Log4j2
private static Logger logger = LoggerFactory.getLogger(BaiDuHotSearchCrawler.class); public class BaiDuHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @Title: BaiDuHotSearchTest /**
* @author hero * @Title: BaiDuHotSearchTest
* @Description: TODO(PC端百度风云榜采集) * @author hero
* @param 设定文件 * @Description: PC端百度风云榜采集
* @return void 返回类型 * @return void 返回类型
*/ */
public static List<HotSearchList> baiduHotSearch() { public static List<HotSearchList> baiduHotSearch() {
String url = "http://top.baidu.com/buzz?b=1&fr=topindex"; String url = "http://top.baidu.com/buzz?b=1&fr=topindex";
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if (htmlBody != null && htmlBody.contains("mainBody")) { if (htmlBody != null && htmlBody.contains("mainBody")) {
return ansysData(htmlBody); return ansysData(htmlBody);
} else { } else {
logger.info("解析百度风云榜时出现解析错误,页面结构有问题"); log.info("解析百度风云榜时出现解析错误,页面结构有问题");
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("解析百度风云榜时出现解析错误,页面结构有问题", e); log.error("解析百度风云榜时出现解析错误,页面结构有问题", e);
} }
return Collections.emptyList(); return Collections.emptyList();
} }
/** /**
* 解析数据 * 解析数据
* @param htmlBody * @param htmlBody
* @return * @return
*/ */
private static List<HotSearchList> ansysData(String htmlBody){ private static List<HotSearchList> ansysData(String htmlBody){
List<HotSearchList> list = new ArrayList<>(); List<HotSearchList> list = new ArrayList<>();
try { try {
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("table.list-table").select("tr"); Elements elements = document.select("table.list-table").select("tr");
if (Objects.nonNull(elements) && !elements.isEmpty()) { if (Objects.nonNull(elements) && !elements.isEmpty()) {
elements.forEach(element -> { elements.forEach(element -> {
try { try {
// 获取排名rank // 获取排名rank
String rankStr = null; String rankStr = null;
// 根据网页标签,给rankStr做判断 // 根据网页标签,给rankStr做判断
if (!element.select("td.first").select("span.num-top").isEmpty()) { if (!element.select("td.first").select("span.num-top").isEmpty()) {
rankStr = element.select("td.first").select("span.num-top").text(); rankStr = element.select("td.first").select("span.num-top").text();
} else if (!element.select("td.first").select("span.num-normal").isEmpty()) { } else if (!element.select("td.first").select("span.num-normal").isEmpty()) {
rankStr = element.select("td.first").select("span.num-normal").text(); rankStr = element.select("td.first").select("span.num-normal").text();
} }
Integer rank = null; Integer rank = null;
// 判断rankStr是否为空 // 判断rankStr是否为空
if (StringUtils.isNoneBlank(rankStr)) { if (StringUtils.isNoneBlank(rankStr)) {
rank = Integer.valueOf(rankStr); rank = Integer.valueOf(rankStr);
} }
// 获取关键词(String) // 获取关键词(String)
String kw = element.select("td.keyword").select("a.list-title").text(); String kw = element.select("td.keyword").select("a.list-title").text();
// logger.info("关键词:{}", kw); // logger.info("关键词:{}", kw);
// 获取关键词相关链接everurl(String) // 获取关键词相关链接everurl(String)
String everurl = element.select("td.keyword").select("a.list-title").attr("href"); String everurl = element.select("td.keyword").select("a.list-title").attr("href");
// 获取搜索指数count(int) // 获取搜索指数count(int)
String hot = null; String hot = null;
// 判断热度值所在的规则是否为null // 判断热度值所在的规则是否为null
if (!element.select("td.last").select("span.icon-fall").isEmpty()) { if (!element.select("td.last").select("span.icon-fall").isEmpty()) {
hot = element.select("td.last").select("span.icon-fall").text(); hot = element.select("td.last").select("span.icon-fall").text();
} else if (!element.select("td.last").select("span.icon-rise").isEmpty()) { } else if (!element.select("td.last").select("span.icon-rise").isEmpty()) {
hot = element.select("td.last").select("span.icon-rise").text(); hot = element.select("td.last").select("span.icon-rise").text();
} }
int count = 0; int count = 0;
// 判断hot是否为空 // 判断hot是否为空
if (StringUtils.isNotBlank(hot)) { if (StringUtils.isNotBlank(hot)) {
count = Integer.valueOf(hot); count = Integer.valueOf(hot);
} }
if (Objects.nonNull(rank)) { if (Objects.nonNull(rank)) {
HotSearchList hotSearch = new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name()); HotSearchList hotSearch = new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name());
list.add(hotSearch); list.add(hotSearch);
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("解析百度风云榜时出现解析错误", e); log.error("解析百度风云榜时出现解析错误", e);
} }
}); });
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("解析百度风云榜时出现解析错误,数据不是json结构", e); log.error("解析百度风云榜时出现解析错误,数据不是json结构", e);
} }
return list; return list;
} }
} }
\ No newline at end of file
package com.zhiwei.searchhotcrawler.crawler; package com.zhiwei.searchhotcrawler.crawler;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.commons.lang3.StringUtils; import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger; import org.apache.commons.lang3.StringUtils;
import org.slf4j.LoggerFactory; import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONArray;
import com.zhiwei.crawler.core.HttpBoot; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
/**
* @className DouyinHotSearchCrawler /**
* @Description:抖音热搜榜采集程序 * @className DouyinHotSearchCrawler
* @author win 10 * @Description:抖音热搜榜采集程序
* @date:2019年07月11日 上午10:26:21 * @author win 10
*/ * @date:2019年07月11日 上午10:26:21
public class DouyinHotSearchCrawler { */
@Log4j2
private static Logger logger = LoggerFactory.getLogger(ZhihuHotSearchCrawler.class); public class DouyinHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @Title: getMobileDouyinHotList /**
* @author hero * @Title: getMobileDouyinHotList
* @Description: 移动端抖音热搜榜 * @author hero
* @param @return 设定文件 * @Description: 移动端抖音热搜榜
* @return List<ZhihuHotSearch> 返回类型 * @param @return 设定文件
*/ * @return List<ZhihuHotSearch> 返回类型
public static List<HotSearchList> getMobileDouyinHotList(){ */
List<HotSearchList> list = null; public static List<HotSearchList> getMobileDouyinHotList(){
String url = "https://api.amemv.com/aweme/v1/hot/search/list/"; List<HotSearchList> list = null;
try { String url = "https://api.amemv.com/aweme/v1/hot/search/list/";
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string(); try {
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("word_list")){ String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
list = new ArrayList<>(); if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("word_list")){
JSONObject data = JSONObject.parseObject(htmlBody); list = new ArrayList<>();
JSONArray wordList = data.getJSONObject("data").getJSONArray("word_list"); JSONObject data = JSONObject.parseObject(htmlBody);
String positionStr = null; JSONArray wordList = data.getJSONObject("data").getJSONArray("word_list");
String word = null; String positionStr = null;
String hotValueStr = null; String word = null;
for (int i = 0; i < wordList.size(); i++) { String hotValueStr = null;
JSONObject wl = wordList.getJSONObject(i); for (int i = 0; i < wordList.size(); i++) {
//获取排名 JSONObject wl = wordList.getJSONObject(i);
positionStr = wl.getString("position"); //获取排名
Integer position = null; positionStr = wl.getString("position");
position = Integer.valueOf(positionStr); Integer position = null;
//获取关键词 position = Integer.valueOf(positionStr);
word = wl.getString("word"); //获取关键词
//获取热度值 word = wl.getString("word");
hotValueStr =wl.getString("hot_value"); //获取热度值
Integer hotValue = null; hotValueStr =wl.getString("hot_value");
hotValue = Integer.valueOf(hotValueStr); Integer hotValue = null;
// logger.info("热度为:::{}", hot_value); hotValue = Integer.valueOf(hotValueStr);
HotSearchList douyin = new HotSearchList(null,word, hotValue, position,HotSearchType.抖音热搜.name()); // logger.info("热度为:::{}", hot_value);
list.add(douyin); HotSearchList douyin = new HotSearchList(null,word, hotValue, position,HotSearchType.抖音热搜.name());
} list.add(douyin);
} }
} catch (IOException e) { }
logger.debug("获取抖音热搜榜时出现问题:{}", e); } catch (IOException e) {
} log.debug("获取抖音热搜榜时出现问题:{}", e);
return list; }
} return list;
}
}
}
package com.zhiwei.searchhotcrawler.crawler; package com.zhiwei.searchhotcrawler.crawler;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import org.apache.commons.lang3.StringUtils; import lombok.extern.log4j.Log4j2;
import org.jsoup.Jsoup; import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document; import org.jsoup.Jsoup;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Document;
import org.jsoup.select.Elements; import org.jsoup.nodes.Element;
import org.slf4j.Logger; import org.jsoup.select.Elements;
import org.slf4j.LoggerFactory; import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.tools.httpclient.HeaderTool; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.httpclient.HeaderTool;
/**
* @ClassName:SougoHotSearch /**
* @Description: TODO(搜狗微信关键词采集) * @ClassName:SougoHotSearch
* @author hero * @Description: TODO(搜狗微信关键词采集)
* @date 2019年7月10日 上午10:54:31 * @author hero
*/ * @date 2019年7月10日 上午10:54:31
public class SougoHotSearchCrawler { */
@Log4j2
private static Logger logger = LoggerFactory.getLogger(SougoHotSearchCrawler.class); public class SougoHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @Title: SougoHotSearchTest /**
* @author hero * @Title: SougoHotSearchTest
* @Description: TODO(PC端搜狗微信关键词采集) * @author hero
* @param 设定文件 * @Description: TODO(PC端搜狗微信关键词采集)
* @return void 返回类型 * @return void 返回类型
*/ */
public static List<HotSearchList> sougoHotSearch() { public static List<HotSearchList> sougoHotSearch() {
String url = "https://weixin.sogou.com"; String url = "https://weixin.sogou.com";
List<HotSearchList> list = new ArrayList<>(); List<HotSearchList> list = new ArrayList<>();
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
String htmlBody = null; String htmlBody = null;
try { try {
Map<String,String> headMap = HeaderTool.getCommonHead(); Map<String,String> headMap = HeaderTool.getCommonHead();
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string(); htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if (htmlBody != null && htmlBody.contains("topwords")) { if (htmlBody != null && htmlBody.contains("topwords")) {
try { try {
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("ol#topwords").select("li"); Elements elements = document.select("ol#topwords").select("li");
for (Element element : elements) { for (Element element : elements) {
try { try {
// 获取排名rank // 获取排名rank
String rankStr = null; String rankStr = null;
if (!element.select("li").select("i").isEmpty()) { if (!element.select("li").select("i").isEmpty()) {
rankStr = element.select("li").select("i").text(); rankStr = element.select("li").select("i").text();
} }
Integer rank = null; Integer rank = null;
if (StringUtils.isNoneBlank(rankStr)) { if (StringUtils.isNoneBlank(rankStr)) {
rank = Integer.valueOf(rankStr); rank = Integer.valueOf(rankStr);
} }
// 获取关键词(String) // 获取关键词(String)
String kw = element.select("li").select("a").text(); String kw = element.select("li").select("a").text();
// logger.info("关键词:{}", kw); // logger.info("关键词:{}", kw);
String everurl = element.select("li").select("a").attr("href"); String everurl = element.select("li").select("a").attr("href");
HotSearchList hotSearch = new HotSearchList(everurl, kw, null, rank, HotSearchType.搜狗微信热搜.name()); HotSearchList hotSearch = new HotSearchList(everurl, kw, null, rank, HotSearchType.搜狗微信热搜.name());
if (Objects.nonNull(rank)) { if (Objects.nonNull(rank)) {
list.add(hotSearch); list.add(hotSearch);
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("解析搜狗微信时出现解析错误", e); log.error("解析搜狗微信时出现解析错误", e);
} }
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("解析搜狗微信时出现解析错误,数据不是json结构", e.fillInStackTrace()); log.error("解析搜狗微信时出现解析错误,数据不是json结构", e.fillInStackTrace());
return Collections.emptyList(); return Collections.emptyList();
} }
} else { } else {
logger.info("解析搜狗微信时出现解析错误,页面结构有问题"); log.info("解析搜狗微信时出现解析错误,页面结构有问题");
} }
break; break;
} catch (Exception e) { } catch (Exception e) {
logger.error("解析搜狗微信时出现解析错误,页面结构有问题", e); log.error("解析搜狗微信时出现解析错误,页面结构有问题", e);
} }
} }
return list; return list;
} }
} }
...@@ -7,6 +7,7 @@ import java.util.HashMap; ...@@ -7,6 +7,7 @@ import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
...@@ -27,13 +28,13 @@ import com.zhiwei.tools.tools.URLCodeUtil; ...@@ -27,13 +28,13 @@ import com.zhiwei.tools.tools.URLCodeUtil;
/** /**
* @ClassName: WeiboHotSearch * @ClassName: WeiboHotSearch
* @Description: TODO(微博实时热搜采集) * @Description: 微博实时热搜采集
* @author hero * @author hero
* @date 2017年9月15日 上午10:54:31 * @date 2017年9月15日 上午10:54:31
*/ */
@Log4j2
public class WeiboHotSearchCrawler { public class WeiboHotSearchCrawler {
private static Logger logger = LoggerFactory.getLogger(WeiboHotSearchCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/** /**
* @Title: weiboHotSearchTest * @Title: weiboHotSearchTest
...@@ -70,18 +71,18 @@ public class WeiboHotSearchCrawler { ...@@ -70,18 +71,18 @@ public class WeiboHotSearchCrawler {
list.add(hotSearch); list.add(hotSearch);
} catch (Exception e) { } catch (Exception e) {
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com"); SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
logger.error("解析微博时时热搜时出现解析错误", e); log.error("解析微博时时热搜时出现解析错误", e);
continue; continue;
} }
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("解析微博时时热搜时出现解析错误,数据不是json结构",e.fillInStackTrace()); log.error("解析微博时时热搜时出现解析错误,数据不是json结构",e.fillInStackTrace());
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com"); SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
return null; return null;
} }
}else{ }else{
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com"); SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
logger.info("解析微博时时热搜时出现解析错误,页面结构有问题"); log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
} }
break; break;
} catch (Exception e) { } catch (Exception e) {
...@@ -138,25 +139,25 @@ public class WeiboHotSearchCrawler { ...@@ -138,25 +139,25 @@ public class WeiboHotSearchCrawler {
} }
String id = "http://s.weibo.com/weibo/"+URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top"; String id = "http://s.weibo.com/weibo/"+URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon); HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon);
logger.info("采集到的数据:::{}", hotSearch); log.info("采集到的数据:::{}", hotSearch);
result.add(hotSearch); result.add(hotSearch);
rank++; rank++;
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("解析微博时时热搜时出现解析错误",e); log.error("解析微博时时热搜时出现解析错误",e);
continue; continue;
} }
} }
return result; return result;
} catch (Exception e) { } catch (Exception e) {
logger.error("解析微博时时热搜时出现解析错误,数据不是json结构",e); log.error("解析微博时时热搜时出现解析错误,数据不是json结构",e);
return Collections.emptyList(); return Collections.emptyList();
} }
}else{ }else{
logger.info("解析微博时时热搜时出现解析错误,页面结构有问题"); log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
} }
} catch (IOException e1) { } catch (IOException e1) {
logger.error("解析微博时时热搜时出现连接失败",e1); log.error("解析微博时时热搜时出现连接失败",e1);
return Collections.emptyList(); return Collections.emptyList();
} }
return Collections.emptyList(); return Collections.emptyList();
......
package com.zhiwei.searchhotcrawler.crawler; package com.zhiwei.searchhotcrawler.crawler;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.Objects; import java.util.Objects;
import org.apache.commons.lang3.StringUtils; import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
import org.slf4j.Logger; import lombok.extern.log4j.Log4j2;
import org.slf4j.LoggerFactory; import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import com.alibaba.fastjson.JSONArray; import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.alibaba.fastjson.JSONArray;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.searchhotcrawler.bean.WeiboTopic; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
/**
* /**
* @ClassName: WeiboHuatiCrawler *
* @Description: 微博话题榜单采集(明星) * @ClassName: WeiboSuperTopicCrawler
* @author Bewilder ZW * @Description: 微博超话榜单采集(明星)
* @date 2019年9月27日 下午3:01:34 * @author Bewilder ZW
*/ * @date 2019年9月27日 下午3:01:34
public class WeiboHuatiCrawler { */
@Log4j2
private static Logger logger = LoggerFactory.getLogger(WeiboHuatiCrawler.class); public class WeiboSuperTopicCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static Map<String,String> headMap = new HashMap<>(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static Map<String,String> headMap = new HashMap<>();
static {
headMap.put("X-Requested-With", "XMLHttpRequest"); static {
headMap.put("Referer", "https://huati.weibo.cn/discovery/super?extparam=ctg1_2%7Cscorll_1&luicode=10000011&lfid=100803_-_super&sourceType=weixin"); headMap.put("X-Requested-With", "XMLHttpRequest");
headMap.put("Host", "huati.weibo.cn"); headMap.put("Referer", "https://huati.weibo.cn/discovery/super?extparam=ctg1_2%7Cscorll_1&luicode=10000011&lfid=100803_-_super&sourceType=weixin");
headMap.put("Host", "huati.weibo.cn");
}
}
/**
* /**
* 开始采集明星话题 *
* @return void * 开始采集明星话题
*/ * @return void
public static List<WeiboTopic> startCrawler() { */
Map<String,String> urlMap = new HashMap<>(); public static List<WeiboSuperTopic> startCrawler() {
urlMap.put("明星", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=star&from=&wm="); Map<String,String> urlMap = new HashMap<>();
urlMap.put("明星潜力", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=potential&from=&wm="); urlMap.put("明星", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=star&from=&wm=");
urlMap.put("明星上升", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=up&from=&wm="); urlMap.put("明星潜力", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=potential&from=&wm=");
urlMap.put("明星上升", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=up&from=&wm=");
List<WeiboTopic> topicList = new ArrayList<>();
List<WeiboSuperTopic> topicList = new ArrayList<>();
for(Entry<String,String> entry : urlMap.entrySet()) {
String url = entry.getValue(); for(Entry<String,String> entry : urlMap.entrySet()) {
String type = entry.getKey(); String url = entry.getValue();
for(int page= 1; page<=5; page++) { String type = entry.getKey();
String pageUrl = url + "&page=" + page; for(int page= 1; page<=5; page++) {
//重试三次 String pageUrl = url + "&page=" + page;
for(int retryTimes = 1; retryTimes<=3; retryTimes++) { //重试三次
try { for(int retryTimes = 1; retryTimes<=3; retryTimes++) {
System.out.println("pageUrl=========="+pageUrl); try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string(); System.out.println("pageUrl=========="+pageUrl);
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc1")) { String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
topicList.addAll(parseTopicRankHtml(page, htmlBody, type)); if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc1")) {
break; topicList.addAll(parseTopicRankHtml(page, htmlBody, type));
}else { break;
logger.error("获取榜单列表页面时数据格式错误,页面为:{}", htmlBody); }else {
} log.error("获取榜单列表页面时数据格式错误,页面为:{}", htmlBody);
} catch (Exception e) { }
logger.error("获取榜单列表页面时出现错误,错误为:{}", e); } catch (Exception e) {
continue; log.error("获取榜单列表页面时出现错误,错误为:{}", e);
} continue;
} }
}
}
} }
return topicList; }
} return topicList;
}
/**
* /**
* 解析话题榜单 *
* @param htmlBody * 解析话题榜单
* @param type * @param htmlBody
* @return void * @param type
*/ * @return void
private static List<WeiboTopic> parseTopicRankHtml(int page,String htmlBody, String type) { */
try { private static List<WeiboSuperTopic> parseTopicRankHtml(int page,String htmlBody, String type) {
JSONArray list = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("list"); try {
if(Objects.nonNull(list) && !list.isEmpty()) { JSONArray list = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("list");
page = (page-1)*20; if(Objects.nonNull(list) && !list.isEmpty()) {
page = (page-1)*20;
List<WeiboTopic> topicList = new ArrayList<>();
Integer toprank = null; List<WeiboSuperTopic> topicList = new ArrayList<>();
String topicName = null; Integer toprank = null;
String id = null; String topicName = null;
String score = null; String id = null;
String desc1 = null; String score = null;
String fensi = null; String desc1 = null;
String url = null; String fensi = null;
for(int i=0;i<list.size();i++) { String url = null;
JSONObject data = list.getJSONObject(i); for(int i=0;i<list.size();i++) {
toprank = page + data.getInteger("toprank"); JSONObject data = list.getJSONObject(i);
topicName = data.getString("display_name"); toprank = page + data.getInteger("toprank");
id = data.getString("page_id"); topicName = data.getString("display_name");
score = data.getString("score"); id = data.getString("page_id");
desc1 = data.getString("desc1"); score = data.getString("score");
fensi = desc1.replaceAll(".*影响力|粉丝", "").trim(); desc1 = data.getString("desc1");
url = data.getString("link"); fensi = desc1.replaceAll(".*影响力|粉丝", "").trim();
url = data.getString("link");
WeiboTopic topic = new WeiboTopic(url, topicName, toprank, score, fensi, type);
topic = getTopicInfo(id, topic); WeiboSuperTopic topic = new WeiboSuperTopic(url, topicName, toprank, score, fensi, type);
System.out.println("topic====="+topic); topic = getTopicInfo(id, topic);
topicList.add(topic); System.out.println("topic====="+topic);
} topicList.add(topic);
return topicList; }
} return topicList;
} catch (Exception e) { }
logger.error("解析榜单列表页面时出现错误,错误为:{}", e); } catch (Exception e) {
} log.error("解析榜单列表页面时出现错误,错误为:{}", e);
return Collections.emptyList(); }
} return Collections.emptyList();
}
/**
* /**
* 根据单一话题id获取话题阅读数及发帖数 *
* @param id * 根据单一话题id获取话题阅读数及发帖数
* @param topic * @param id
* @return * @param topic
* @return WeiboTopic * @return
*/ * @return WeiboTopic
private static WeiboTopic getTopicInfo(String id, WeiboTopic topic) { */
for(int retryTimes=1; retryTimes<=3; retryTimes++) { private static WeiboSuperTopic getTopicInfo(String id, WeiboSuperTopic topic) {
try { for(int retryTimes=1; retryTimes<=3; retryTimes++) {
String url = "https://m.weibo.cn/api/container/getIndex?containerid="+ id; try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string(); String url = "https://m.weibo.cn/api/container/getIndex?containerid="+ id;
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc_more")) { String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
String descMore = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("pageInfo").getJSONArray("desc_more").getString(0); if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc_more")) {
if(StringUtils.isNotBlank(descMore)) { String descMore = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("pageInfo").getJSONArray("desc_more").getString(0);
String readNum = descMore.replaceAll("阅读|帖子.*", "").trim(); if(StringUtils.isNotBlank(descMore)) {
String postNum = descMore.replaceAll(".*帖子|粉丝.*", "").trim(); String readNum = descMore.replaceAll("阅读|帖子.*", "").trim();
String postNum = descMore.replaceAll(".*帖子|粉丝.*", "").trim();
topic.setPostNum(postNum);
topic.setReadNum(readNum); topic.setPostNum(postNum);
return topic; topic.setReadNum(readNum);
} return topic;
} }
} catch (Exception e) { }
logger.error("解析榜单详情页面时出现错误,错误为:{}", e); } catch (Exception e) {
} log.error("解析榜单详情页面时出现错误,错误为:{}", e);
} }
return topic; }
} return topic;
}
}
}
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
import com.zhiwei.tools.tools.URLCodeUtil;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
*
* @ClassName: WeiboTopicCrawler
* @Description: 微博话题榜单采集
* @author Bewilder ZW
*/
@Log4j2
public class WeiboTopicCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static Map<String,String> headMap = new HashMap<>();
static {
headMap.put("Host", "simg.s.weibo.com");
headMap.put("User-Agent", "Weibo/40651 CFNetwork/978.0.7 Darwin/18.6.0");
}
// /**
// *
// * 开始采集明星话题
// * @return void
// */
// public static List<HotSearchList> startCrawler() {
// List<HotSearchList> topicList = new ArrayList<>();
// for(int page=1; page<=7; page++){
// String pageUrl = "https://d.weibo.com/231650_ctg1_-_all?pids=Pl_Discover_Pt6Rank__4&cfs=920&Pl_Discover_Pt6Rank__4_filter=&Pl_Discover_Pt6Rank__4_page=" + page;
// //重试三次
// for(int retryTimes = 1; retryTimes<=3; retryTimes++) {
// try {
// String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
// if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("pl.content.miniTab.index")) {
// log.info("pageUrl::{}", pageUrl);
// topicList.addAll(parseTopicRankHtml(htmlBody));
// break;
// }else {
// log.error("获取榜单列表页面时数据格式错误,页面为:{}", htmlBody);
// }
// } catch (Exception e) {
// log.error("获取榜单列表页面时出现错误,错误为:{}", e);
// continue;
// }
// }
// }
// return topicList;
// }
//
// /**
// *
// * 解析话题榜单
// * @param htmlBody
// * @return void
// */
// private static List<HotSearchList> parseTopicRankHtml(String htmlBody) {
// try {
// String script = "{\"ns\":\"pl.content.miniTab.index\""+ htmlBody.split("FM.view\\(\\{\"ns\":\"pl.content.miniTab.index\"")[1].split("\\)<\\/script>")[0];
// JSONObject json = JSONObject.parseObject(script);
// String html = json.getString("html");
//
// Elements elements = Jsoup.parse(html).select("div.text_box");
// if(Objects.nonNull(elements) && !elements.isEmpty()) {
// List<HotSearchList> topicList = new ArrayList<>();
// String rankString;
// Integer rank = null;
// String topicName = null;
// String url = null;
// String topicType = null;
// String description = null;
// Integer readNum = null;
// String author = null;
//
// for(Element element : elements) {
// rankString = element.select("div[class=\"title W_autocut\"]").text();
// Matcher matcher = Pattern.compile("\\d+").matcher(rankString);
// while (matcher.find()){
// rank = Integer.valueOf(matcher.group());
// }
// topicName = element.select("div[class=\"title W_autocut\"]").select("a.S_txt1").text();
// url = element.select("div[class=\"title W_autocut\"]").select("a.S_txt1").attr("href");
// topicType = element.select("a[class=\"W_btn_b W_btn_tag\"]").text();
// description = element.select("div.subtitle").text();
// String readNumString = element.select("span.number").text();
// if(readNumString.contains("万")){
// readNumString = readNumString.split("万")[0];
// readNum = Integer.valueOf(readNumString.split("万")[0])*10000;;
// }
// if(readNumString.contains("亿")){
// readNum = Integer.valueOf(readNumString.split("亿")[0])*100000000;
// }
// author = element.select("a[class=\"tlink S_txt1\"]").text();
// HotSearchList topic = new HotSearchList(url, topicName, readNum, rank, HotSearchType.微博话题.name(), author, topicType, description);
// log.info("topic::::" + topic);
// topicList.add(topic);
// }
// return topicList;
// }else{
// log.info("html:{}",html);
// }
// } catch (Exception e) {
// log.error("解析榜单列表页面时出现错误,错误为:{}", e);
// }
// return Collections.emptyList();
// }
/**
* 微博平话题榜采集
*/
public static List<HotSearchList> startCrawlerByPhone(){
List<HotSearchList> topicList = new ArrayList<>();
for(int page=1; page<=7; page++){
String pageUrl = "https://api.weibo.cn/2/page?gsid=_2A25zJX_EDeRxGedH71YS8CzKzzmIHXVuc_QMrDV6PUJbkdANLXPbkWpNUK3OyitGCJsX8exvua-vfubUqCiaA4lb&from=10A1193010&c=iphone&s=2827eebe&count=20&containerid=106003type%253D25%2526t%253D3%2526disable_hot%253D1%2526filter_type%253Dtopicscene&page=" + page;
//重试三次
for(int retryTimes = 1; retryTimes<=5; retryTimes++) {
try {
log.info("pageUrl::{}", pageUrl);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("top_mark_text")) {
topicList.addAll(parseTopicHtml(htmlBody));
break;
}else {
log.info("下载榜单列表页面时数据格式错误,页面为:{}", htmlBody);
}
} catch (Exception e) {
log.error("下载榜单列表页面时出现错误,错误为:{}", e);
continue;
}
}
}
return topicList;
}
private static List<HotSearchList> parseTopicHtml(String htmlBody) {
try {
JSONArray cards = JSONObject.parseObject(htmlBody).getJSONArray("cards");
if(Objects.nonNull(cards) && !cards.isEmpty()) {
List<HotSearchList> topicList = new ArrayList<>();
Integer rank = null;
String topicName = null;
String url = null;
String description = null;
Integer commentNum = null;
Integer readNum = null;
String desc2 = null;
for(int i=0; i<cards.size(); i++) {
JSONObject cardGroup = cards.getJSONObject(i).getJSONArray("card_group").getJSONObject(0);
rank = cardGroup.getInteger("top_mark_text");
topicName = cardGroup.getString("title_sub");
url = "https://s.weibo.com/weibo?q="+ URLCodeUtil.getURLEncode(topicName, "utf-8");
description = cardGroup.getString("desc1");
desc2 = cardGroup.getString("desc2");
String commentNumStr = desc2.replaceAll("讨论.*", "").trim();
String readNumStr = desc2.replaceAll(".*讨论|阅读", "").trim();
try {
if(commentNumStr.contains("万")){
commentNumStr = commentNumStr.replaceAll("万", "");
commentNum = (int)(Double.parseDouble(commentNumStr)*10000);
}else if(commentNumStr.contains("亿")){
commentNumStr = commentNumStr.replaceAll("亿", "");
commentNum = (int)(Double.parseDouble(commentNumStr)*10000000);
}else{
commentNum = Integer.getInteger(commentNumStr);
}
if(readNumStr.contains("万")){
readNumStr = readNumStr.replaceAll("万", "");
readNum = (int)(Double.parseDouble(readNumStr)*10000);
}else if(readNumStr.contains("亿")){
readNumStr = readNumStr.replaceAll("亿", "");
readNum = (int)(Double.parseDouble(readNumStr)*10000000);
}else{
readNum = Integer.getInteger(readNumStr);
}
}catch (Exception e){
e.printStackTrace();
}
HotSearchList topic = new HotSearchList(url, topicName, readNum, rank, HotSearchType.微博热搜.name(), commentNum, description);
log.info("topic::::" + topic);
topicList.add(topic);
}
return topicList;
}else{
log.info("html:{}",htmlBody);
}
} catch (Exception e) {
log.error("解析榜单列表页面时出现错误,错误为:{}", e);
}
return Collections.emptyList();
}
}
package com.zhiwei.searchhotcrawler.crawler; package com.zhiwei.searchhotcrawler.crawler;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.slf4j.Logger; import lombok.extern.log4j.Log4j2;
import org.slf4j.LoggerFactory; import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONArray;
import com.zhiwei.crawler.core.HttpBoot; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.tools.httpclient.HeaderTool; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.URLCodeUtil;
/**
* @ClassName: ZhihuHotCrawler /**
* @Description: TODO(知乎热搜采集程序) * @ClassName: ZhihuHotCrawler
* @author hero * @Description: TODO(知乎热搜采集程序)
* @date 2017年9月15日 上午10:54:31 * @author hero
*/ * @date 2017年9月15日 上午10:54:31
public class ZhihuHotSearchCrawler { */
@Log4j2
private static Logger logger = LoggerFactory.getLogger(ZhihuHotSearchCrawler.class); public class ZhihuHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/** private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
* @Title: getZhihuHotList /**
* @author hero * @Title: getZhihuHotList
* @Description: 知乎热搜采集程序 * @author hero
* @param 设定文件 * @Description: 知乎热搜采集程序
* @return void 返回类型 * @return void 返回类型
*/ */
public static List<HotSearchList> getZhihuHotList(){ public static List<HotSearchList> getZhihuHotList(){
List<HotSearchList> list = null; List<HotSearchList> list = null;
String url = "https://www.zhihu.com/api/v4/search/top_search"; String url = "https://www.zhihu.com/api/v4/search/top_search";
String rerferer = "https://www.zhihu.com/search?type=content&q=%E5%BF%AB%E6%89%8B"; String rerferer = "https://www.zhihu.com/search?type=content&q=%E5%BF%AB%E6%89%8B";
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String,String> headerMap = HeaderTool.getCommonHead();
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"); headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");
headerMap.put("Host", "www.zhihu.com"); headerMap.put("Host", "www.zhihu.com");
headerMap.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8="); headerMap.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8=");
headerMap.put("accept", "application/json, text/plain, */*"); headerMap.put("accept", "application/json, text/plain, */*");
headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20"); headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
headerMap.put("Referer", rerferer); headerMap.put("Referer", rerferer);
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(htmlBody != null && htmlBody.contains("words")){ if(htmlBody != null && htmlBody.contains("words")){
list = new ArrayList<>(); list = new ArrayList<>();
JSONObject topSearch = JSONObject.parseObject(htmlBody); JSONObject topSearch = JSONObject.parseObject(htmlBody);
JSONArray words = topSearch.getJSONObject("top_search").getJSONArray("words"); JSONArray words = topSearch.getJSONObject("top_search").getJSONArray("words");
String link = null; String link = null;
String displayQuery = null; String displayQuery = null;
String query = null; String query = null;
for (int i = 0; i < words.size(); i++) { for (int i = 0; i < words.size(); i++) {
JSONObject word = words.getJSONObject(i); JSONObject word = words.getJSONObject(i);
query = word.getString("query"); query = word.getString("query");
displayQuery = word.getString("display_query"); displayQuery = word.getString("display_query");
link = "https://www.zhihu.com/search?q="+URLCodeUtil.getURLEncode(query, "utf-8")+"&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content"; link = "https://www.zhihu.com/search?q="+URLCodeUtil.getURLEncode(query, "utf-8")+"&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content";
HotSearchList zhihu = new HotSearchList(link, displayQuery, null, i, HotSearchType.知乎热搜.name()); HotSearchList zhihu = new HotSearchList(link, displayQuery, null, i, HotSearchType.知乎热搜.name());
list.add(zhihu); list.add(zhihu);
} }
} }
} catch (IOException e) { } catch (IOException e) {
logger.debug("获取知乎热搜时出现问题:{}", e); log.debug("获取知乎热搜时出现问题:{}", e);
return list; return list;
} }
return list; return list;
} }
/** /**
* @Title: getMobileZhihuHotList * @Title: getMobileZhihuHotList
* @author hero * @author hero
* @Description: 移動端知乎熱搜榜 * @Description: 移動端知乎熱搜榜
* @param @return 设定文件 * @param @return 设定文件
* @return List<ZhihuHotSearch> 返回类型 * @return List<ZhihuHotSearch> 返回类型
*/ */
public static List<HotSearchList> getMobileZhihuHotList(){ public static List<HotSearchList> getMobileZhihuHotList(){
List<HotSearchList> list = new ArrayList<>();; List<HotSearchList> list = new ArrayList<>();;
String url = "https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0"; String url = "https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0";
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String,String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "api.zhihu.com"); headerMap.put("Host", "api.zhihu.com");
headerMap.put("Referer", url); headerMap.put("Referer", url);
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"); headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36");
headerMap.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8="); headerMap.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8=");
headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20"); headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(htmlBody != null && htmlBody.contains("author")){ if(htmlBody != null && htmlBody.contains("author")){
JSONObject topSearch = JSONObject.parseObject(htmlBody); JSONObject topSearch = JSONObject.parseObject(htmlBody);
JSONArray words = topSearch.getJSONArray("data"); JSONArray words = topSearch.getJSONArray("data");
String link = null; String link = null;
String displayQuery = null; String displayQuery = null;
for (int i = 0; i < words.size(); i++) { for (int i = 0; i < words.size(); i++) {
JSONObject word = words.getJSONObject(i).getJSONObject("target"); JSONObject word = words.getJSONObject(i).getJSONObject("target");
displayQuery = word.getString("title"); displayQuery = word.getString("title");
link = "https://www.zhihu.com/question/"+word.getLongValue("id"); link = "https://www.zhihu.com/question/"+word.getLongValue("id");
HotSearchList zhihu = new HotSearchList(link, displayQuery, null, i, HotSearchType.知乎热搜.name()); HotSearchList zhihu = new HotSearchList(link, displayQuery, null, i, HotSearchType.知乎热搜.name());
list.add(zhihu); list.add(zhihu);
} }
} }
} catch (IOException e) { } catch (IOException e) {
logger.debug("获取知乎热搜时出现问题:{}", e); log.debug("获取知乎热搜时出现问题:{}", e);
return list; return list;
} }
return list; return list;
} }
} }
package com.zhiwei.searchhotcrawler.dao; package com.zhiwei.searchhotcrawler.dao;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Objects;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import com.mongodb.BasicDBObject; import org.slf4j.LoggerFactory;
import com.mongodb.DBCursor;
import com.mongodb.DBObject; import com.mongodb.BasicDBObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.mongodb.DBCursor;
import com.zhiwei.searchhotcrawler.cache.CacheManager; import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.config.Config; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate; import com.zhiwei.searchhotcrawler.cache.CacheManager;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
public class HotSearchListDAO extends MongoDBTemplate{ import com.zhiwei.tools.timeparse.TimeParse;
private static Logger logger = LoggerFactory.getLogger(HotSearchListDAO.class);
@Log4j2
public HotSearchListDAO() { public class HotSearchListDAO extends MongoDBTemplate{
super();
super.setDbName(Config.dbName); public HotSearchListDAO() {
String time = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd"); super();
String year = time.substring(0,4); super.setDbName(Config.dbName);
String month = time.substring(5,7); String time = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
String collName = Config.searchCollName + year + "_" + month; String year = time.substring(0,4);
super.setCollName(collName); String month = time.substring(5,7);
DBObject countIndexDoc = new BasicDBObject(); String collName = Config.searchCollName + year + "_" + month;
countIndexDoc.put("count", -1); super.setCollName(collName);
DBObject timeIndexDoc = new BasicDBObject();
timeIndexDoc.put("time", -1); //给数据表创建索引
DBObject rankIndexDoc = new BasicDBObject(); createIndex();
rankIndexDoc.put("rank", -1);
DBObject nameIndexDoc = new BasicDBObject(); }
nameIndexDoc.put("name", -1);
DBObject typeIndexDoc = new BasicDBObject();
typeIndexDoc.put("type", -1); /**
try { * 初次创建表及创建相应的索引
super.getReadColl().createIndex(countIndexDoc, new BasicDBObject("name", "count_desc")); */
super.getReadColl().createIndex(timeIndexDoc, new BasicDBObject("name", "time_desc")); private void createIndex(){
super.getReadColl().createIndex(rankIndexDoc, new BasicDBObject("name", "rank_desc")); List<DBObject> indexList = this.getReadColl().getIndexInfo();
super.getReadColl().createIndex(nameIndexDoc, new BasicDBObject("name", "name_desc")); if(Objects.isNull(indexList) && indexList.isEmpty()){
super.getReadColl().createIndex(typeIndexDoc, new BasicDBObject("name", "type_desc")); DBObject countIndexDoc = new BasicDBObject();
} catch (Exception e) { countIndexDoc.put("count", -1);
e.printStackTrace(); DBObject timeIndexDoc = new BasicDBObject();
} timeIndexDoc.put("time", -1);
} DBObject rankIndexDoc = new BasicDBObject();
rankIndexDoc.put("rank", -1);
/** DBObject nameIndexDoc = new BasicDBObject();
* 添加数据入库 nameIndexDoc.put("name", -1);
* @param list DBObject typeIndexDoc = new BasicDBObject();
*/ typeIndexDoc.put("type", -1);
public void addHotSearchList(List<DBObject> list){ try {
try { super.getReadColl().createIndex(countIndexDoc, new BasicDBObject("name", "count_desc"));
this.getReadColl().insert(list); super.getReadColl().createIndex(timeIndexDoc, new BasicDBObject("name", "time_desc"));
} catch (Exception e) { super.getReadColl().createIndex(rankIndexDoc, new BasicDBObject("name", "rank_desc"));
logger.error("存储数据时出错,错误为:{}", e); super.getReadColl().createIndex(nameIndexDoc, new BasicDBObject("name", "name_desc"));
} super.getReadColl().createIndex(typeIndexDoc, new BasicDBObject("name", "type_desc"));
} } catch (Exception e) {
e.printStackTrace();
public void addHotSearch(DBObject doc){ }
try { }
this.getReadColl().insert(doc); }
} catch (Exception e) {
logger.error("存储数据时出错,错误为:{}", e); /**
} * 添加数据入库
} * @param list
*/
/** public void addHotSearchList(List<DBObject> list){
* 查询据上次变化量 try {
* @Title: getChangeCount this.getReadColl().insert(list);
* @author hero } catch (Exception e) {
* @param @param weiboHotSearch log.error("存储数据时出错,错误为:{}", e);
* @param @return 设定文件 }
* @return int 返回类型 }
*/
public int getChangeCount(HotSearchList weiboHotSearch){ public void addHotSearch(DBObject doc){
int result = 0; try {
DBObject query = new BasicDBObject(); this.getReadColl().insert(doc);
query.put("name", weiboHotSearch.getName()); } catch (Exception e) {
DBObject sort = new BasicDBObject(); log.error("存储数据时出错,错误为:{}", e);
sort.put("time", -1); }
try { }
DBCursor cur = this.getReadColl().find(query).sort(sort).limit(1);
while(cur.hasNext()){ /**
DBObject doc = cur.next(); * 查询据上次变化量
if(doc.get("count")!=null) { * @Title: getChangeCount
result = weiboHotSearch.getCount() - Integer.valueOf(doc.get("count").toString()); * @author hero
break; * @param @param weiboHotSearch
} * @param @return 设定文件
} * @return int 返回类型
cur.close(); */
} catch (Exception e) { public int getChangeCount(HotSearchList weiboHotSearch){
logger.error("存储数据时出错,错误为:{}", e); int result = 0;
return result; DBObject query = new BasicDBObject();
} query.put("name", weiboHotSearch.getName());
return result; DBObject sort = new BasicDBObject();
} sort.put("time", -1);
try {
DBCursor cur = this.getReadColl().find(query).sort(sort).limit(1);
/** while(cur.hasNext()){
* @Title: getWeiboHotOneHour DBObject doc = cur.next();
* @author hero if(doc.get("count")!=null) {
* @Description: 查询最近1小时内新增的微博热搜 result = weiboHotSearch.getCount() - Integer.valueOf(doc.get("count").toString());
* @param @return 设定文件 break;
* @return List<DBObject> 返回类型 }
*/ }
public List<DBObject> getHotOneHour(String type){ cur.close();
List<DBObject> list = new ArrayList<>(); } catch (Exception e) {
Date date = new Date((new Date().getTime()-60*60*1000)); log.error("存储数据时出错,错误为:{}", e);
DBObject query = new BasicDBObject(); return result;
query.put("time", new BasicDBObject("$gte", date)); }
query.put("changeCount", 0); return result;
query.put("type", type); }
try {
DBCursor cur = this.getReadColl().find(query); /**
while(cur.hasNext()){ * @Title: getWeiboHotOneHour
DBObject doc = cur.next(); * @author hero
String name = doc.get("name").toString(); * @Description: 查询最近1小时内新增的微博热搜
if(CacheManager.getCacheByKey(name)==null){ * @param @return 设定文件
CacheManager.putCache(name, doc, 48*60*60*1000); * @return List<DBObject> 返回类型
list.add(doc); */
} public List<DBObject> getHotOneHour(String type){
} List<DBObject> list = new ArrayList<>();
cur.close(); Date date = new Date((new Date().getTime()-60*60*1000));
} catch (Exception e) { DBObject query = new BasicDBObject();
logger.error("存储数据时出错,错误为:{}", e); query.put("time", new BasicDBObject("$gte", date));
} query.put("changeCount", 0);
return list; query.put("type", type);
}
try {
DBCursor cur = this.getReadColl().find(query);
} while(cur.hasNext()){
DBObject doc = cur.next();
String name = doc.get("name").toString();
if(CacheManager.getCacheByKey(name)==null){
CacheManager.putCache(name, doc, 48*60*60*1000);
list.add(doc);
}
}
cur.close();
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
return list;
}
}
package com.zhiwei.searchhotcrawler.dao; package com.zhiwei.searchhotcrawler.dao;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import org.slf4j.Logger; import lombok.extern.log4j.Log4j2;
import org.slf4j.LoggerFactory; import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject; import com.mongodb.BasicDBObject;
import com.zhiwei.searchhotcrawler.config.Config; import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler; import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate; import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
public class WechatUserDao extends MongoDBTemplate{
private static Logger logger = LoggerFactory.getLogger(BaiDuHotSearchCrawler.class); @Log4j2
public class WechatUserDao extends MongoDBTemplate{
public WechatUserDao() {
super(); public WechatUserDao() {
super.setDbName(Config.dbName); super();
super.setCollName(Config.collWechatUserName); super.setDbName(Config.dbName);
} super.setCollName(Config.collWechatUserName);
}
/**
* 添加分组用户 /**
* @param userlist * 添加分组用户
* @param groupName * @param userlist
* @param groupId * @param groupName
*/ * @param groupId
public void addWechatUser(List<String> userlist, String groupName, Integer groupId){ */
for(int i=0; i<3; i++){ public void addWechatUser(List<String> userlist, String groupName, Integer groupId){
try { for(int i=0; i<3; i++){
DBObject doc = new BasicDBObject(); try {
doc.put("_id", groupId+"-"+groupName); DBObject doc = new BasicDBObject();
doc.put("groupId", groupId); doc.put("_id", groupId+"-"+groupName);
doc.put("groupName", groupName); doc.put("groupId", groupId);
doc.put("user", userlist); doc.put("groupName", groupName);
this.getReadColl().save(doc); doc.put("user", userlist);
break; this.getReadColl().save(doc);
} catch (Exception e) { break;
logger.error("存储数据时出错,错误为:{}", e); } catch (Exception e) {
} log.error("存储数据时出错,错误为:{}", e);
} }
} }
}
/**
* 根据分组名称查询分组用户 /**
* @param group * 根据分组名称查询分组用户
* @return * @param group
*/ * @return
@SuppressWarnings("unchecked") */
public List<String> getWechatUserByGroup(String group){ @SuppressWarnings("unchecked")
try { public List<String> getWechatUserByGroup(String group){
DBObject query = new BasicDBObject(); try {
query.put("groupName", group); DBObject query = new BasicDBObject();
DBObject doc = this.getReadColl().findOne(query); query.put("groupName", group);
if(doc != null){ DBObject doc = this.getReadColl().findOne(query);
return (List<String>)doc.get("user"); if(doc != null){
} return (List<String>)doc.get("user");
} catch (Exception e) { }
logger.error("存储数据时出错,错误为:{}", e); } catch (Exception e) {
} log.error("存储数据时出错,错误为:{}", e);
return Collections.emptyList(); }
} return Collections.emptyList();
}
}
}
package com.zhiwei.searchhotcrawler.dao; package com.zhiwei.searchhotcrawler.dao;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Objects;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import com.mongodb.BasicDBObject; import org.slf4j.LoggerFactory;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.config.Config; import com.mongodb.BasicDBObject;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate; import com.mongodb.DBObject;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
public class WeiboTopicDAO extends MongoDBTemplate{ import com.zhiwei.tools.timeparse.TimeParse;
private static Logger logger = LoggerFactory.getLogger(WeiboTopicDAO.class);
@Log4j2
public WeiboTopicDAO() { public class WeiboSuperTopicDAO extends MongoDBTemplate{
super();
super.setDbName(Config.dbName); public WeiboSuperTopicDAO() {
String time = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd"); super();
String year = time.substring(0,4); super.setDbName(Config.dbName);
String month = time.substring(5,7); String time = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
String collName = Config.topicCollName + year + "_" + month; String year = time.substring(0,4);
super.setCollName(collName); String month = time.substring(5,7);
String collName = Config.topicCollName + year + "_" + month;
DBObject countIndexDoc = new BasicDBObject(); super.setCollName(collName);
countIndexDoc.put("score_num", -1);
DBObject timeIndexDoc = new BasicDBObject(); createIndex();
timeIndexDoc.put("time", -1); }
DBObject rankIndexDoc = new BasicDBObject();
rankIndexDoc.put("rank", -1);
DBObject nameIndexDoc = new BasicDBObject(); /**
nameIndexDoc.put("name", -1); * 初次创建表及创建相应的索引
DBObject typeIndexDoc = new BasicDBObject(); */
typeIndexDoc.put("type", -1); private void createIndex(){
try { List<DBObject> indexList = this.getReadColl().getIndexInfo();
super.getReadColl().createIndex(countIndexDoc, new BasicDBObject("name", "score_desc")); if(Objects.isNull(indexList) && indexList.isEmpty()){
super.getReadColl().createIndex(timeIndexDoc, new BasicDBObject("name", "time_desc")); DBObject countIndexDoc = new BasicDBObject();
super.getReadColl().createIndex(rankIndexDoc, new BasicDBObject("name", "rank_desc")); countIndexDoc.put("score_num", -1);
super.getReadColl().createIndex(nameIndexDoc, new BasicDBObject("name", "name_desc")); DBObject timeIndexDoc = new BasicDBObject();
super.getReadColl().createIndex(typeIndexDoc, new BasicDBObject("name", "type_desc")); timeIndexDoc.put("time", -1);
} catch (Exception e) { DBObject rankIndexDoc = new BasicDBObject();
e.printStackTrace(); rankIndexDoc.put("rank", -1);
} DBObject nameIndexDoc = new BasicDBObject();
} nameIndexDoc.put("name", -1);
DBObject typeIndexDoc = new BasicDBObject();
/** typeIndexDoc.put("type", -1);
* 添加数据入库 try {
* @param list super.getReadColl().createIndex(countIndexDoc, new BasicDBObject("name", "score_desc"));
*/ super.getReadColl().createIndex(timeIndexDoc, new BasicDBObject("name", "time_desc"));
public void addTopicList(List<DBObject> list){ super.getReadColl().createIndex(rankIndexDoc, new BasicDBObject("name", "rank_desc"));
try { super.getReadColl().createIndex(nameIndexDoc, new BasicDBObject("name", "name_desc"));
this.getReadColl().insert(list); super.getReadColl().createIndex(typeIndexDoc, new BasicDBObject("name", "type_desc"));
} catch (Exception e) { } catch (Exception e) {
logger.error("存储数据时出错,错误为:{}", e); e.printStackTrace();
} }
} }
}
public void addTopic(DBObject doc){
try {
this.getReadColl().insert(doc); /**
} catch (Exception e) { * 添加数据入库
logger.error("存储数据时出错,错误为:{}", e); * @param list
} */
} public void addTopicList(List<DBObject> list){
try {
this.getReadColl().insert(list);
} catch (Exception e) {
} log.error("存储数据时出错,错误为:{}", e);
}
}
public void addTopic(DBObject doc){
try {
this.getReadColl().insert(doc);
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
}
}
package com.zhiwei.searchhotcrawler.run; package com.zhiwei.searchhotcrawler.run;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory; import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.cache.CacheListener; import com.zhiwei.searchhotcrawler.cache.CacheListener;
import com.zhiwei.searchhotcrawler.config.ProxyConfig; import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun; import com.zhiwei.searchhotcrawler.timer.*;
import com.zhiwei.searchhotcrawler.timer.DouyinHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.SendWeiboHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.SendZhihuHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.SougoHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.UpdateWechatUserRun;
import com.zhiwei.searchhotcrawler.timer.WeiboHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.WeiboTopicRun;
import com.zhiwei.searchhotcrawler.timer.ZhihuHotSearchRun;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
...@@ -23,8 +15,10 @@ import java.util.concurrent.TimeUnit; ...@@ -23,8 +15,10 @@ import java.util.concurrent.TimeUnit;
public class HotSearchRun { public class HotSearchRun {
public static void main(String[] args) { public static void main(String[] args) {
ProxyFactory.init(ProxyConfig.registry, ProxyConfig.group, GroupType.PROVIDER, 10000013); SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
ProxyFactory.init(simpleConfig);
new UpdateWechatUserRun().start(); new UpdateWechatUserRun().start();
ZhiWeiTools.sleep(10000); ZhiWeiTools.sleep(10000);
...@@ -51,6 +45,7 @@ public class HotSearchRun { ...@@ -51,6 +45,7 @@ public class HotSearchRun {
new SougoHotSearchRun().start(); new SougoHotSearchRun().start();
new DouyinHotSearchRun().start(); new DouyinHotSearchRun().start();
new ZhihuHotSearchRun().start(); new ZhihuHotSearchRun().start();
new WeiboSuperTopicRun().start();
new WeiboTopicRun().start(); new WeiboTopicRun().start();
//推送程序启动 //推送程序启动
new SendWeiboHotSearchRun().start(); new SendWeiboHotSearchRun().start();
......
package com.zhiwei.searchhotcrawler.test; package com.zhiwei.searchhotcrawler.test;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.mongodb.BasicDBObject; import com.mongodb.BasicDBObject;
import com.mongodb.DB; import com.mongodb.DB;
import com.mongodb.DBCollection; import com.mongodb.DBCollection;
import com.mongodb.DBCursor; import com.mongodb.DBCursor;
import com.mongodb.DBObject; import com.mongodb.DBObject;
import com.mongodb.Mongo; import com.mongodb.Mongo;
import com.mongodb.MongoClient; import com.mongodb.MongoClient;
import com.mongodb.MongoCredential; import com.mongodb.MongoCredential;
import com.mongodb.ServerAddress; import com.mongodb.ServerAddress;
import com.mongodb.WriteResult; import com.mongodb.WriteResult;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.searchhotcrawler.config.Config; import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
public class HotSearchListTest{ import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.config.Config;
public static void main(String[] args) { import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.tools.timeparse.TimeParse;
MongoCredential credential = MongoCredential.createCredential(Config.userName, Config.authDB, Config.userPwd.toCharArray()); import org.jsoup.Jsoup;
ServerAddress address = new ServerAddress(Config.mongoIp, Config.mongoPort); import org.jsoup.nodes.Element;
Mongo mongo = new MongoClient(address, Arrays.asList(credential)); import org.jsoup.select.Elements;
DB db = mongo.getDB("hot_search_list"); public class HotSearchListTest{
DBCollection coll = db.getCollection("hot_search_list2019_09");
// MongoCredential credentialNew = MongoCredential.createCredential("datapush", "admin", "4d8ce5c42073c".toCharArray()); public static void main(String[] args) {
// ServerAddress addressNew = new ServerAddress(Config.mongoIp, Config.mongoPort);
// Mongo mongoNew = new MongoClient(address, Arrays.asList(credentialNew)); SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
// DB dbNew = mongoNew.getDB("hot_search_list"); .group(ProxyConfig.group).appId(10000013).appName("zzw").build();
ProxyFactory.init(simpleConfig);
Map<String,String> timLine = TimeParse.getTimeMap("2019-10-01 00:00:00", "2019-10-09 23:59:59", "dd", 1);
String url = "http://app.myzaker.com/news/app.php?f=";
timLine.forEach((start, end) ->{ HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
try{
String year = end.substring(0,4); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
String month = end.substring(5,7); Elements elements = Jsoup.parse(htmlBody).select("div.titlebar>a");
Date startDate = TimeParse.stringFormartDate(start); for(Element element : elements){
Date endDate = TimeParse.stringFormartDate(end); String lableUrl = "http://app.myzaker.com/news/app.php" + element.attr("href");
System.out.println("lableUrl========="+lableUrl);
String collName = "hot_search_list"+year+"_"+month; String htmlBodyLable = httpBoot.syncCall(RequestUtils.wrapGet(lableUrl), ProxyHolder.NAT_HEAVY_PROXY).body().string();
System.out.println("collName=========="+collName); Elements elementsLable = Jsoup.parse(htmlBodyLable).select("div#infinite_scroll>a");
// DBCollection collNew = dbNew.getCollection(collName);
// DBObject countIndexDoc = new BasicDBObject(); for(Element elementLable : elementsLable){
// countIndexDoc.put("count", -1); System.out.println(elementLable.attr("href") + "=============" + elementLable.text());
// DBObject timeIndexDoc = new BasicDBObject(); }
// timeIndexDoc.put("time", -1); }
// DBObject rankIndexDoc = new BasicDBObject();
// rankIndexDoc.put("rank", -1); }catch (Exception e){
// DBObject nameIndexDoc = new BasicDBObject(); e.printStackTrace();
// nameIndexDoc.put("name", -1); }
// DBObject typeIndexDoc = new BasicDBObject();
// typeIndexDoc.put("type", -1);
// try {
// collNew.createIndex(countIndexDoc, new BasicDBObject("name", "count_desc")); // MongoCredential credential = MongoCredential.createCredential(Config.userName, Config.authDB, Config.userPwd.toCharArray());
// collNew.createIndex(timeIndexDoc, new BasicDBObject("name", "time_desc")); // ServerAddress address = new ServerAddress(Config.mongoIp, Config.mongoPort);
// collNew.createIndex(rankIndexDoc, new BasicDBObject("name", "rank_desc")); // Mongo mongo = new MongoClient(address, Arrays.asList(credential));
// collNew.createIndex(nameIndexDoc, new BasicDBObject("name", "name_desc")); //
// collNew.createIndex(typeIndexDoc, new BasicDBObject("name", "type_desc")); // DB db = mongo.getDB("hot_search_list");
// } catch (Exception e) { // DBCollection coll = db.getCollection("hot_search_list2019_09");
// e.printStackTrace(); //
// } //// MongoCredential credentialNew = MongoCredential.createCredential("datapush", "admin", "4d8ce5c42073c".toCharArray());
//// ServerAddress addressNew = new ServerAddress(Config.mongoIp, Config.mongoPort);
DBObject query = new BasicDBObject(new BasicDBObject("time", //// Mongo mongoNew = new MongoClient(address, Arrays.asList(credentialNew));
new BasicDBObject("$gte",startDate).append("$lte", endDate))); //// DB dbNew = mongoNew.getDB("hot_search_list");
System.out.println(query); //
WriteResult wr = coll.remove(query); // Map<String,String> timLine = TimeParse.getTimeMap("2019-10-01 00:00:00", "2019-10-09 23:59:59", "dd", 1);
System.out.println("========"+wr.getN()); //
// int i = 0; // timLine.forEach((start, end) ->{
// DBCursor cur = coll.remove(query); //
// System.out.println(query +"======="+ cur.count()); // String year = end.substring(0,4);
// List<DBObject> dataList = new ArrayList<>(); // String month = end.substring(5,7);
// while(cur.hasNext()) { // Date startDate = TimeParse.stringFormartDate(start);
// DBObject doc = cur.next(); // Date endDate = TimeParse.stringFormartDate(end);
// try { //
//// collNew.save(doc); // String collName = "hot_search_list"+year+"_"+month;
// i++; // System.out.println("collName=========="+collName);
// coll.remove(doc); //// DBCollection collNew = dbNew.getCollection(collName);
// } catch (Exception e2) { //// DBObject countIndexDoc = new BasicDBObject();
// e2.printStackTrace(); //// countIndexDoc.put("count", -1);
// } //// DBObject timeIndexDoc = new BasicDBObject();
// dataList.add(doc); //// timeIndexDoc.put("time", -1);
// } //// DBObject rankIndexDoc = new BasicDBObject();
// System.out.println(collName +"数据量大小" +dataList.size()); //// rankIndexDoc.put("rank", -1);
// cur.close(); //// DBObject nameIndexDoc = new BasicDBObject();
// if(!dataList.isEmpty()) { //// nameIndexDoc.put("name", -1);
// try { //// DBObject typeIndexDoc = new BasicDBObject();
// collNew.insert(dataList); //// typeIndexDoc.put("type", -1);
// } catch (Exception e) { //// try {
// e.printStackTrace(); //// collNew.createIndex(countIndexDoc, new BasicDBObject("name", "count_desc"));
// } //// collNew.createIndex(timeIndexDoc, new BasicDBObject("name", "time_desc"));
// } //// collNew.createIndex(rankIndexDoc, new BasicDBObject("name", "rank_desc"));
}); //// collNew.createIndex(nameIndexDoc, new BasicDBObject("name", "name_desc"));
mongo.close(); //// collNew.createIndex(typeIndexDoc, new BasicDBObject("name", "type_desc"));
} //// } catch (Exception e) {
//// e.printStackTrace();
//// }
//
} // DBObject query = new BasicDBObject(new BasicDBObject("time",
// new BasicDBObject("$gte",startDate).append("$lte", endDate)));
// System.out.println(query);
// WriteResult wr = coll.remove(query);
// System.out.println("========"+wr.getN());
//// int i = 0;
//// DBCursor cur = coll.remove(query);
//// System.out.println(query +"======="+ cur.count());
//// List<DBObject> dataList = new ArrayList<>();
//// while(cur.hasNext()) {
//// DBObject doc = cur.next();
//// try {
////// collNew.save(doc);
//// i++;
//// coll.remove(doc);
//// } catch (Exception e2) {
//// e2.printStackTrace();
//// }
//// dataList.add(doc);
//// }
//// System.out.println(collName +"数据量大小" +dataList.size());
//// cur.close();
//// if(!dataList.isEmpty()) {
//// try {
//// collNew.insert(dataList);
//// } catch (Exception e) {
//// e.printStackTrace();
//// }
//// }
// });
// mongo.close();
}
}
...@@ -6,6 +6,7 @@ import java.util.List; ...@@ -6,6 +6,7 @@ import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
...@@ -16,10 +17,9 @@ import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler; ...@@ -16,10 +17,9 @@ import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class BaiduHotSearchRun extends Thread{ public class BaiduHotSearchRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(BaiduHotSearchRun.class);
@Override @Override
public void run() { public void run() {
boolean f = true; boolean f = true;
...@@ -37,10 +37,10 @@ public class BaiduHotSearchRun extends Thread{ ...@@ -37,10 +37,10 @@ public class BaiduHotSearchRun extends Thread{
private void getHotList() { private void getHotList() {
logger.info("百度风云榜采集开始........"); log.info("百度风云榜采集开始........");
HotSearchListDAO hotSearchDAO = new HotSearchListDAO(); HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
List<HotSearchList> list = BaiDuHotSearchCrawler.baiduHotSearch(); List<HotSearchList> list = BaiDuHotSearchCrawler.baiduHotSearch();
logger.info("{}, 此轮百度风云榜采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0)); log.info("{}, 此轮百度风云榜采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> saveDataList = new ArrayList<>(); List<DBObject> saveDataList = new ArrayList<>();
if(Objects.nonNull(list) && !list.isEmpty()) { if(Objects.nonNull(list) && !list.isEmpty()) {
list.forEach(baiduHotSearch ->{ list.forEach(baiduHotSearch ->{
...@@ -59,7 +59,7 @@ public class BaiduHotSearchRun extends Thread{ ...@@ -59,7 +59,7 @@ public class BaiduHotSearchRun extends Thread{
}); });
} }
hotSearchDAO.addHotSearchList(saveDataList); hotSearchDAO.addHotSearchList(saveDataList);
logger.info("百度风云榜采集结束........"); log.info("百度风云榜采集结束........");
} }
} }
\ No newline at end of file
...@@ -5,6 +5,7 @@ import java.util.Date; ...@@ -5,6 +5,7 @@ import java.util.Date;
import java.util.List; import java.util.List;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
...@@ -15,10 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.DouyinHotSearchCrawler; ...@@ -15,10 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.DouyinHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class DouyinHotSearchRun extends Thread{ public class DouyinHotSearchRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(DouyinHotSearchRun.class);
@Override @Override
public void run() { public void run() {
boolean f = true; boolean f = true;
...@@ -40,10 +40,10 @@ public class DouyinHotSearchRun extends Thread{ ...@@ -40,10 +40,10 @@ public class DouyinHotSearchRun extends Thread{
* @return void * @return void
*/ */
private void getHotList() { private void getHotList() {
logger.info("抖音热搜榜采集开始........"); log.info("抖音热搜榜采集开始........");
HotSearchListDAO hotSearchDAO = new HotSearchListDAO(); HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
List<HotSearchList> list = DouyinHotSearchCrawler.getMobileDouyinHotList(); List<HotSearchList> list = DouyinHotSearchCrawler.getMobileDouyinHotList();
logger.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0)); log.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>(); List<DBObject> data = new ArrayList<>();
for(HotSearchList douyinHotSearch : list){ for(HotSearchList douyinHotSearch : list){
int changeCount = hotSearchDAO.getChangeCount(douyinHotSearch); int changeCount = hotSearchDAO.getChangeCount(douyinHotSearch);
...@@ -60,7 +60,7 @@ public class DouyinHotSearchRun extends Thread{ ...@@ -60,7 +60,7 @@ public class DouyinHotSearchRun extends Thread{
data.add(douyin); data.add(douyin);
hotSearchDAO.addHotSearch(douyin); hotSearchDAO.addHotSearch(douyin);
} }
logger.info("抖音热搜榜采集结束........"); log.info("抖音热搜榜采集结束........");
} }
} }
package com.zhiwei.searchhotcrawler.timer; package com.zhiwei.searchhotcrawler.timer;
import java.util.Calendar; import java.util.Calendar;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.slf4j.Logger; import lombok.extern.log4j.Log4j2;
import org.slf4j.LoggerFactory; import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.DBObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.searchhotcrawler.dao.WechatUserDao; import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.dao.WechatUserDao;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.Template; import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.searchhotcrawler.util.WechatCodeUtil; import com.zhiwei.searchhotcrawler.util.Template;
import com.zhiwei.searchhotcrawler.util.WechatConstant; import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.searchhotcrawler.util.WechatConstant;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
public class SendWeiboHotSearchRun extends Thread {
private HotSearchListDAO hotSearchDAO = new HotSearchListDAO(); @Log4j2
private static WechatUserDao wechatUserDao = new WechatUserDao(); public class SendWeiboHotSearchRun extends Thread {
private static Logger logger = LoggerFactory.getLogger(SendWeiboHotSearchRun.class); private HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
@Override private static WechatUserDao wechatUserDao = new WechatUserDao();
public void run() { @Override
while (true) { public void run() {
try { while (true) {
Calendar calendar = Calendar.getInstance(); try {
int hour = calendar.get(Calendar.HOUR_OF_DAY); Calendar calendar = Calendar.getInstance();
logger.info("微博推送,当前系统时间为:" + hour); int hour = calendar.get(Calendar.HOUR_OF_DAY);
if (hour > 6 && hour < 23) { log.info("微博推送,当前系统时间为:" + hour);
List<DBObject> list = hotSearchDAO.getHotOneHour(HotSearchType.微博热搜.name()); if (hour > 6 && hour < 23) {
if (list != null && !list.isEmpty()) { List<DBObject> list = hotSearchDAO.getHotOneHour(HotSearchType.微博热搜.name());
for (DBObject weibo : list) { if (list != null && !list.isEmpty()) {
String title = weibo.get("name").toString(); for (DBObject weibo : list) {
String time = TimeParse.dateFormartString((Date) weibo.get("time"), "yyyy-MM-dd HH:mm:ss"); String title = weibo.get("name").toString();
String url = weibo.get("url").toString(); String time = TimeParse.dateFormartString((Date) weibo.get("time"), "yyyy-MM-dd HH:mm:ss");
sendTemplateByUserIds(title, time, url); String url = weibo.get("url").toString();
} sendTemplateByUserIds(title, time, url);
} else { }
logger.info("微博最近一小时无数据"); } else {
sendTemplateByUserIds("最近一小时无数据", log.info("微博最近一小时无数据");
TimeParse.dateFormartString(new Date(), "yyyy-MM-dd HH:mm:ss"), null); sendTemplateByUserIds("最近一小时无数据",
} TimeParse.dateFormartString(new Date(), "yyyy-MM-dd HH:mm:ss"), null);
} }
ZhiWeiTools.sleep(1 * 60 * 60 * 1000); }
} catch (Exception e) { ZhiWeiTools.sleep(1 * 60 * 60 * 1000);
logger.debug("微博热搜推送出现问题,问题为:::{}", e.fillInStackTrace()); } catch (Exception e) {
ZhiWeiTools.sleep(1 * 60 * 60 * 1000); log.debug("微博热搜推送出现问题,问题为:::{}", e.fillInStackTrace());
continue; ZhiWeiTools.sleep(1 * 60 * 60 * 1000);
} continue;
} }
} }
}
/**
* @Title: sendTemplateByUserIds /**
* @author hero * @Title: sendTemplateByUserIds
* @Description: 发送模版消息 * @author hero
* @param @param * @Description: 发送模版消息
* microTouTiao * @param @param
* @param @param * microTouTiao
* userList 设定文件 * @param @param
* @return void 返回类型 * userList 设定文件
*/ * @return void 返回类型
public static void sendTemplateByUserIds(String title, String time, String url) { */
Map<String, Object> dataMap = new HashMap<String, Object>(); public static void sendTemplateByUserIds(String title, String time, String url) {
JSONObject first = new JSONObject(); Map<String, Object> dataMap = new HashMap<String, Object>();
first.put("value", "您好,有一条来自微博热搜榜的预警通知。"); JSONObject first = new JSONObject();
dataMap.put("first", first); first.put("value", "您好,有一条来自微博热搜榜的预警通知。");
JSONObject keyword1 = new JSONObject(); dataMap.put("first", first);
keyword1.put("value", title); JSONObject keyword1 = new JSONObject();
keyword1.put("color", "#173177"); keyword1.put("value", title);
dataMap.put("keyword1", keyword1); keyword1.put("color", "#173177");
JSONObject keyword2 = new JSONObject(); dataMap.put("keyword1", keyword1);
keyword2.put("value", "微博热搜榜"); JSONObject keyword2 = new JSONObject();
keyword2.put("color", "#173177"); keyword2.put("value", "微博热搜榜");
dataMap.put("keyword2", keyword2); keyword2.put("color", "#173177");
JSONObject keyword3 = new JSONObject(); dataMap.put("keyword2", keyword2);
keyword3.put("value", time); JSONObject keyword3 = new JSONObject();
keyword3.put("color", "#173177"); keyword3.put("value", time);
dataMap.put("keyword3", keyword3); keyword3.put("color", "#173177");
JSONObject remark = new JSONObject(); dataMap.put("keyword3", keyword3);
remark.put("value", "知微情报监测服务"); JSONObject remark = new JSONObject();
dataMap.put("remark", remark); remark.put("value", "知微情报监测服务");
List<String> userList = getUserList(); dataMap.put("remark", remark);
if (userList != null && userList.size() > 0) { List<String> userList = getUserList();
for (String openId : userList) { if (userList != null && userList.size() > 0) {
Template template = new Template(); for (String openId : userList) {
template.setTouser(openId); Template template = new Template();
if (url != null) { template.setTouser(openId);
template.setUrl(url); if (url != null) {
} template.setUrl(url);
template.setTemplate_id(WechatConstant.WECHAT_TEMPLATEID_EARLY_IT); }
template.setData(dataMap); template.setTemplate_id(WechatConstant.WECHAT_TEMPLATEID_EARLY_IT);
template.setData(dataMap);
JSONObject templateJson = (JSONObject) JSONObject.toJSON(template);
WechatCodeUtil.sendDataJson(templateJson); JSONObject templateJson = (JSONObject) JSONObject.toJSON(template);
} WechatCodeUtil.sendDataJson(templateJson);
} else { }
logger.info("拉取微博用户列表失败"); } else {
} log.info("拉取微博用户列表失败");
} }
}
/**
* @Title: getUserList /**
* @author hero * @Title: getUserList
* @Description: 用户列表 * @author hero
* @param @param * @Description: 用户列表
* projectName * @param @param
* @param @return * projectName
* 设定文件 * @param @return
* @return List<String> 返回类型 * 设定文件
*/ * @return List<String> 返回类型
public static List<String> getUserList() { */
List<String> userList = wechatUserDao.getWechatUserByGroup("weibohot"); public static List<String> getUserList() {
if(userList==null){ List<String> userList = wechatUserDao.getWechatUserByGroup("weibohot");
userList = WechatCodeUtil.getUserListByGroupName("weibohot"); if(userList==null){
} userList = WechatCodeUtil.getUserListByGroupName("weibohot");
return userList; }
} return userList;
} }
}
package com.zhiwei.searchhotcrawler.timer; package com.zhiwei.searchhotcrawler.timer;
import java.util.Calendar; import java.util.Calendar;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.slf4j.Logger; import lombok.extern.log4j.Log4j2;
import org.slf4j.LoggerFactory; import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.DBObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.dao.WechatUserDao; import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.searchhotcrawler.util.Template; import com.zhiwei.searchhotcrawler.dao.WechatUserDao;
import com.zhiwei.searchhotcrawler.util.WechatCodeUtil; import com.zhiwei.searchhotcrawler.util.Template;
import com.zhiwei.searchhotcrawler.util.WechatConstant; import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.searchhotcrawler.util.WechatConstant;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
public class SendZhihuHotSearchRun extends Thread{
private HotSearchListDAO hotSearchDAO = new HotSearchListDAO(); @Log4j2
private static WechatUserDao wechatUserDao = new WechatUserDao(); public class SendZhihuHotSearchRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(SendZhihuHotSearchRun.class); private HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
@Override private static WechatUserDao wechatUserDao = new WechatUserDao();
public void run() { @Override
public void run() {
while(true) {
try { while(true) {
Calendar calendar = Calendar.getInstance(); try {
int hour = calendar.get(Calendar.HOUR_OF_DAY); Calendar calendar = Calendar.getInstance();
logger.info("知乎推送,当前系统时间为:"+hour); int hour = calendar.get(Calendar.HOUR_OF_DAY);
if(hour > 6 && hour <23){ log.info("知乎推送,当前系统时间为:"+hour);
List<DBObject> list = hotSearchDAO.getHotOneHour(HotSearchType.知乎热搜.name()); if(hour > 6 && hour <23){
if(list!=null && !list.isEmpty()){ List<DBObject> list = hotSearchDAO.getHotOneHour(HotSearchType.知乎热搜.name());
for(DBObject zhihu : list){ if(list!=null && !list.isEmpty()){
String title = zhihu.get("display_query").toString(); for(DBObject zhihu : list){
String time = TimeParse.dateFormartString((Date)zhihu.get("time"), "yyyy-MM-dd HH:mm:ss"); String title = zhihu.get("display_query").toString();
String url = zhihu.get("_id").toString(); String time = TimeParse.dateFormartString((Date)zhihu.get("time"), "yyyy-MM-dd HH:mm:ss");
if(calendar.get(Calendar.HOUR_OF_DAY) > 6 && calendar.get(Calendar.HOUR_OF_DAY) < 23){ String url = zhihu.get("_id").toString();
sendTemplateByUserIds(title, time, url); if(calendar.get(Calendar.HOUR_OF_DAY) > 6 && calendar.get(Calendar.HOUR_OF_DAY) < 23){
} sendTemplateByUserIds(title, time, url);
} }
}else{ }
logger.info("知乎最近一小时无数据"); }else{
sendTemplateByUserIds("最近一小时无数据", TimeParse.dateFormartString(new Date(), "yyyy-MM-dd HH:mm:ss"), null); log.info("知乎最近一小时无数据");
} sendTemplateByUserIds("最近一小时无数据", TimeParse.dateFormartString(new Date(), "yyyy-MM-dd HH:mm:ss"), null);
} }
ZhiWeiTools.sleep(1*60*60*1000); }
} catch (Exception e) { ZhiWeiTools.sleep(1*60*60*1000);
logger.debug("知乎热搜推送出现问题,问题为:::{}",e.fillInStackTrace()); } catch (Exception e) {
ZhiWeiTools.sleep(1*60*60*1000); log.debug("知乎热搜推送出现问题,问题为:::{}",e.fillInStackTrace());
} ZhiWeiTools.sleep(1*60*60*1000);
} }
} }
}
/**
* @Title: sendTemplateByUserIds /**
* @author hero * @Title: sendTemplateByUserIds
* @Description: 发送模版消息 * @author hero
* @param @param microTouTiao * @Description: 发送模版消息
* @param @param userList 设定文件 * @param @param microTouTiao
* @return void 返回类型 * @param @param userList 设定文件
*/ * @return void 返回类型
public static void sendTemplateByUserIds(String title,String time, String url) { */
public static void sendTemplateByUserIds(String title,String time, String url) {
Map<String, Object> dataMap = new HashMap<>();
JSONObject first = new JSONObject(); Map<String, Object> dataMap = new HashMap<>();
first.put("value", "您好,有一条来自知乎热搜榜的预警通知。"); JSONObject first = new JSONObject();
dataMap.put("first", first); first.put("value", "您好,有一条来自知乎热搜榜的预警通知。");
JSONObject keyword1 = new JSONObject(); dataMap.put("first", first);
keyword1.put("value", title); JSONObject keyword1 = new JSONObject();
keyword1.put("color", "#173177"); keyword1.put("value", title);
dataMap.put("keyword1", keyword1); keyword1.put("color", "#173177");
JSONObject keyword2 = new JSONObject(); dataMap.put("keyword1", keyword1);
keyword2.put("value", "知乎热搜榜"); JSONObject keyword2 = new JSONObject();
keyword2.put("color", "#173177"); keyword2.put("value", "知乎热搜榜");
dataMap.put("keyword2", keyword2); keyword2.put("color", "#173177");
JSONObject keyword3 = new JSONObject(); dataMap.put("keyword2", keyword2);
keyword3.put("value", time); JSONObject keyword3 = new JSONObject();
keyword3.put("color", "#173177"); keyword3.put("value", time);
dataMap.put("keyword3", keyword3); keyword3.put("color", "#173177");
JSONObject remark = new JSONObject(); dataMap.put("keyword3", keyword3);
remark.put("value", "知微情报监测服务"); JSONObject remark = new JSONObject();
dataMap.put("remark", remark); remark.put("value", "知微情报监测服务");
dataMap.put("remark", remark);
List<String> userList = getUserList();
if(userList!=null && !userList.isEmpty()) { List<String> userList = getUserList();
for (String openId : userList) { if(userList!=null && !userList.isEmpty()) {
Template template = new Template(); for (String openId : userList) {
template.setTouser(openId); Template template = new Template();
if(url!=null){ template.setTouser(openId);
template.setUrl(url); if(url!=null){
} template.setUrl(url);
template.setTemplate_id(WechatConstant.WECHAT_TEMPLATEID_EARLY_IT); }
template.setData(dataMap); template.setTemplate_id(WechatConstant.WECHAT_TEMPLATEID_EARLY_IT);
template.setData(dataMap);
JSONObject templateJson = (JSONObject)JSONObject.toJSON(template);
WechatCodeUtil.sendDataJson(templateJson); JSONObject templateJson = (JSONObject)JSONObject.toJSON(template);
} WechatCodeUtil.sendDataJson(templateJson);
}else { }
logger.info("知乎推送拉取用户列表失败"); }else {
} log.info("知乎推送拉取用户列表失败");
}
}
}
/**
* @Title: getUserList /**
* @author hero * @Title: getUserList
* @Description: 用户列表 * @author hero
* @param @param projectName * @Description: 用户列表
* @param @return 设定文件 * @param @param projectName
* @return List<String> 返回类型 * @param @return 设定文件
*/ * @return List<String> 返回类型
private static List<String> getUserList() */
{ private static List<String> getUserList()
List<String> userList = wechatUserDao.getWechatUserByGroup("LP组"); {
if(userList==null){ List<String> userList = wechatUserDao.getWechatUserByGroup("LP组");
userList = WechatCodeUtil.getUserListByGroupName("LP组"); if(userList==null){
} userList = WechatCodeUtil.getUserListByGroupName("LP组");
return userList; }
} return userList;
}
}
}
...@@ -5,6 +5,7 @@ import java.util.Date; ...@@ -5,6 +5,7 @@ import java.util.Date;
import java.util.List; import java.util.List;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
...@@ -15,9 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler; ...@@ -15,9 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class SougoHotSearchRun extends Thread { public class SougoHotSearchRun extends Thread {
private static Logger logger = LoggerFactory.getLogger(SougoHotSearchRun.class);
@Override @Override
public void run() { public void run() {
boolean f = true; boolean f = true;
...@@ -36,9 +37,9 @@ public class SougoHotSearchRun extends Thread { ...@@ -36,9 +37,9 @@ public class SougoHotSearchRun extends Thread {
private void getHotList() { private void getHotList() {
HotSearchListDAO hotSearchDAO = new HotSearchListDAO(); HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
logger.info("搜狗微信采集开始........"); log.info("搜狗微信采集开始........");
List<HotSearchList> list = SougoHotSearchCrawler.sougoHotSearch(); List<HotSearchList> list = SougoHotSearchCrawler.sougoHotSearch();
logger.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0)); log.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>(); List<DBObject> data = new ArrayList<>();
for(HotSearchList sougoHotSearch : list){ for(HotSearchList sougoHotSearch : list){
DBObject doc = new BasicDBObject(); DBObject doc = new BasicDBObject();
...@@ -52,7 +53,7 @@ public class SougoHotSearchRun extends Thread { ...@@ -52,7 +53,7 @@ public class SougoHotSearchRun extends Thread {
data.add(doc); data.add(doc);
} }
hotSearchDAO.addHotSearchList(data); hotSearchDAO.addHotSearchList(data);
logger.info("搜狗微信采集结束........"); log.info("搜狗微信采集结束........");
} }
} }
package com.zhiwei.searchhotcrawler.timer; package com.zhiwei.searchhotcrawler.timer;
import java.util.Calendar; import java.util.Calendar;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import org.slf4j.Logger; import lombok.extern.log4j.Log4j2;
import org.slf4j.LoggerFactory; import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.searchhotcrawler.dao.WechatUserDao;
import com.zhiwei.searchhotcrawler.util.WechatCodeUtil; import com.zhiwei.searchhotcrawler.dao.WechatUserDao;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
public class UpdateWechatUserRun extends Thread{
private WechatUserDao wechatUserDao = new WechatUserDao(); @Log4j2
private static Logger logger = LoggerFactory.getLogger(UpdateWechatUserRun.class); public class UpdateWechatUserRun extends Thread{
@Override private WechatUserDao wechatUserDao = new WechatUserDao();
public void run() { @Override
logger.info("开始更新用户数据"); public void run() {
while(true) { log.info("开始更新用户数据");
try { while(true) {
Calendar calendar = Calendar.getInstance(); try {
int hour = calendar.get(Calendar.HOUR_OF_DAY); Calendar calendar = Calendar.getInstance();
if(hour > 6 ){ int hour = calendar.get(Calendar.HOUR_OF_DAY);
Map<String,Integer> groupMap = WechatCodeUtil.getAllGroupIp(); if(hour > 6 ){
logger.info("此公众号的分组数量为:::{}", groupMap.size()); Map<String,Integer> groupMap = WechatCodeUtil.getAllGroupIp();
if(!groupMap.isEmpty() && groupMap!=null){ log.info("此公众号的分组数量为:::{}", groupMap.size());
for(Entry<String,Integer> group : groupMap.entrySet()){ if(!groupMap.isEmpty() && groupMap!=null){
logger.info("此公众号的分组名称及IP为:::{},{}", group.getKey(), group.getValue()); for(Entry<String,Integer> group : groupMap.entrySet()){
List<String> userList = WechatCodeUtil.getUserListByGroupId(group.getValue()); log.info("此公众号的分组名称及IP为:::{},{}", group.getKey(), group.getValue());
logger.info("{},此分组下的用户数量为::{}", group.getKey(), userList.size()); List<String> userList = WechatCodeUtil.getUserListByGroupId(group.getValue());
if(userList!=null && !userList.isEmpty()){ log.info("{},此分组下的用户数量为::{}", group.getKey(), userList.size());
wechatUserDao.addWechatUser(userList, group.getKey(), group.getValue()); if(userList!=null && !userList.isEmpty()){
} wechatUserDao.addWechatUser(userList, group.getKey(), group.getValue());
} }
} }
} }
ZhiWeiTools.sleep(1*60*60*1000); }
} catch (Exception e) { ZhiWeiTools.sleep(1*60*60*1000);
logger.debug("知乎热搜推送出现问题,问题为:::{}",e.fillInStackTrace()); } catch (Exception e) {
ZhiWeiTools.sleep(1*60*60*1000); log.debug("知乎热搜推送出现问题,问题为:::{}",e.fillInStackTrace());
continue; ZhiWeiTools.sleep(1*60*60*1000);
} continue;
} }
} }
}
}
}
...@@ -5,6 +5,7 @@ import java.util.Date; ...@@ -5,6 +5,7 @@ import java.util.Date;
import java.util.List; import java.util.List;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
...@@ -15,10 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler; ...@@ -15,10 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class WeiboHotSearchRun extends Thread{ public class WeiboHotSearchRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(WeiboHotSearchRun.class);
@Override @Override
public void run() { public void run() {
boolean f = true; boolean f = true;
...@@ -36,11 +36,11 @@ public class WeiboHotSearchRun extends Thread{ ...@@ -36,11 +36,11 @@ public class WeiboHotSearchRun extends Thread{
private void getHotList() { private void getHotList() {
logger.info("微博话题采集开始........"); log.info("微博话题采集开始........");
HotSearchListDAO weiboHotSearchDAO = new HotSearchListDAO(); HotSearchListDAO weiboHotSearchDAO = new HotSearchListDAO();
// List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearch(); // List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearch();
List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearchByPhone(); List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearchByPhone();
logger.info("{}, 微博此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0)); log.info("{}, 微博此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>(); List<DBObject> data = new ArrayList<>();
for(HotSearchList weiboHotSearch : list){ for(HotSearchList weiboHotSearch : list){
int changeCount = weiboHotSearchDAO.getChangeCount(weiboHotSearch); int changeCount = weiboHotSearchDAO.getChangeCount(weiboHotSearch);
...@@ -49,7 +49,7 @@ public class WeiboHotSearchRun extends Thread{ ...@@ -49,7 +49,7 @@ public class WeiboHotSearchRun extends Thread{
doc.put("name", weiboHotSearch.getName()); doc.put("name", weiboHotSearch.getName());
doc.put("url", weiboHotSearch.getUrl()); doc.put("url", weiboHotSearch.getUrl());
doc.put("count", weiboHotSearch.getCount()); doc.put("count", weiboHotSearch.getCount());
doc.put("hot", weiboHotSearch.isHot()); doc.put("hot", weiboHotSearch.getHot());
doc.put("day", weiboHotSearch.getDay()); doc.put("day", weiboHotSearch.getDay());
doc.put("time", weiboHotSearch.getTime()); doc.put("time", weiboHotSearch.getTime());
doc.put("changeCount", changeCount); doc.put("changeCount", changeCount);
...@@ -59,7 +59,7 @@ public class WeiboHotSearchRun extends Thread{ ...@@ -59,7 +59,7 @@ public class WeiboHotSearchRun extends Thread{
data.add(doc); data.add(doc);
} }
weiboHotSearchDAO.addHotSearchList(data); weiboHotSearchDAO.addHotSearchList(data);
logger.info("微博话题采集结束........"); log.info("微博话题采集结束........");
} }
} }
package com.zhiwei.searchhotcrawler.timer;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
import com.zhiwei.searchhotcrawler.crawler.WeiboSuperTopicCrawler;
import com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class WeiboSuperTopicRun extends Thread{
@Override
public void run() {
boolean f = true;
while(f) {
try {
getTopicList();
TimeUnit.HOURS.sleep(3);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*60*1000);
}
ZhiWeiTools.sleep(50);
}
}
private void getTopicList() {
WeiboSuperTopicDAO weiboTopicDAO = new WeiboSuperTopicDAO();
log.info("微博超话采集开始........");
List<WeiboSuperTopic> list = WeiboSuperTopicCrawler.startCrawler();
log.info("{}, 微博超话此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
for(WeiboSuperTopic topic : list){
log.info("topic::::{}", topic);
DBObject doc = new BasicDBObject();
doc.put("_id", topic.getId());
doc.put("name", topic.getTopicName());
doc.put("rank", topic.getRank());
doc.put("score_num", topic.getScore());
doc.put("fensi_num", topic.getFensi());
doc.put("post_num", topic.getPostNum());
doc.put("type", topic.getType());
doc.put("day", topic.getDay());
doc.put("time", topic.getTime());
doc.put("url", topic.getUrl());
data.add(doc);
}
weiboTopicDAO.addTopicList(data);
log.info("微博话题采集结束........");
}
}
package com.zhiwei.searchhotcrawler.timer; package com.zhiwei.searchhotcrawler.timer;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.WeiboTopicCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import org.slf4j.Logger; @Log4j2
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.WeiboTopic;
import com.zhiwei.searchhotcrawler.crawler.WeiboHuatiCrawler;
import com.zhiwei.searchhotcrawler.dao.WeiboTopicDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
public class WeiboTopicRun extends Thread{ public class WeiboTopicRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(WeiboTopicRun.class);
@Override @Override
public void run() { public void run() {
boolean f = true; boolean f = true;
while(f) { while(f) {
try { try {
getTopicList(); getTopicList();
TimeUnit.HOURS.sleep(3); TimeUnit.MINUTES.sleep(3);
} catch (Exception e) { } catch (Exception e) {
e.fillInStackTrace(); e.fillInStackTrace();
ZhiWeiTools.sleep(60*60*1000); ZhiWeiTools.sleep(60*60*1000);
...@@ -36,28 +31,29 @@ public class WeiboTopicRun extends Thread{ ...@@ -36,28 +31,29 @@ public class WeiboTopicRun extends Thread{
private void getTopicList() { private void getTopicList() {
WeiboTopicDAO weiboTopicDAO = new WeiboTopicDAO(); HotSearchListDAO weiboHotSearchDAO = new HotSearchListDAO();
logger.info("微博超话采集开始........"); log.info("微博话题采集开始........");
List<WeiboTopic> list = WeiboHuatiCrawler.startCrawler(); List<HotSearchList> list = WeiboTopicCrawler.startCrawlerByPhone();
logger.info("{}, 微博超话此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0)); log.info("{}, 微博话题此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>(); List<DBObject> data = new ArrayList<>();
for(WeiboTopic topic : list){ for(HotSearchList topic : list){
logger.info("topic::::{}", topic); log.info("topic::::{}", topic);
DBObject doc = new BasicDBObject(); DBObject doc = new BasicDBObject();
doc.put("_id", topic.getId()); doc.put("_id", topic.getId());
doc.put("name", topic.getTopicName()); doc.put("name", topic.getName());
doc.put("rank", topic.getRank()); doc.put("url", topic.getUrl());
doc.put("score_num", topic.getScore()); doc.put("count", topic.getCount());
doc.put("fensi_num", topic.getFensi()); doc.put("hot", topic.getHot());
doc.put("post_num", topic.getPostNum());
doc.put("type", topic.getType());
doc.put("day", topic.getDay()); doc.put("day", topic.getDay());
doc.put("time", topic.getTime()); doc.put("time", topic.getTime());
doc.put("url", topic.getUrl()); doc.put("rank", topic.getRank());
doc.put("type", topic.getType());
doc.put("topic_lead", topic.getTopicLead());
doc.put("comment_count", topic.getCommentCount());
data.add(doc); data.add(doc);
} }
weiboTopicDAO.addTopicList(data); weiboHotSearchDAO.addHotSearchList(data);
logger.info("微博话题采集结束........"); log.info("微博话题采集结束........");
} }
} }
...@@ -4,6 +4,7 @@ import java.util.Date; ...@@ -4,6 +4,7 @@ import java.util.Date;
import java.util.List; import java.util.List;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
...@@ -14,10 +15,9 @@ import com.zhiwei.searchhotcrawler.crawler.ZhihuHotSearchCrawler; ...@@ -14,10 +15,9 @@ import com.zhiwei.searchhotcrawler.crawler.ZhihuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class ZhihuHotSearchRun extends Thread{ public class ZhihuHotSearchRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(ZhihuHotSearchRun.class);
@Override @Override
public void run() { public void run() {
boolean f = true; boolean f = true;
...@@ -32,24 +32,22 @@ public class ZhihuHotSearchRun extends Thread{ ...@@ -32,24 +32,22 @@ public class ZhihuHotSearchRun extends Thread{
ZhiWeiTools.sleep(50); ZhiWeiTools.sleep(50);
} }
} }
private void getHotList() { private void getHotList() {
HotSearchListDAO hotSearchDAO = new HotSearchListDAO(); HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
logger.info("知乎话题采集开始...,当前线程名字:{}", Thread.currentThread().getName()); log.info("知乎话题采集开始...,当前线程名字:{}", Thread.currentThread().getName());
List<HotSearchList> list = ZhihuHotSearchCrawler.getZhihuHotList(); List<HotSearchList> list = ZhihuHotSearchCrawler.getZhihuHotList();
List<HotSearchList> mobilelist = ZhihuHotSearchCrawler.getMobileZhihuHotList(); List<HotSearchList> mobilelist = ZhihuHotSearchCrawler.getMobileZhihuHotList();
list.addAll(mobilelist); list.addAll(mobilelist);
logger.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0)); log.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
for(HotSearchList zhihuHotSearch : list){ for(HotSearchList zhihuHotSearch : list){
DBObject zhihu = new BasicDBObject(); DBObject zhihu = new BasicDBObject();
zhihu.put("_id", zhihuHotSearch.getId()); zhihu.put("_id", zhihuHotSearch.getId());
zhihu.put("name", zhihuHotSearch.getName()); zhihu.put("name", zhihuHotSearch.getName());
zhihu.put("url", zhihuHotSearch.getUrl()); zhihu.put("url", zhihuHotSearch.getUrl());
zhihu.put("count", zhihuHotSearch.getCount()); zhihu.put("count", zhihuHotSearch.getCount());
zhihu.put("hot", zhihuHotSearch.isHot()); zhihu.put("hot", zhihuHotSearch.getHot());
zhihu.put("day", zhihuHotSearch.getDay()); zhihu.put("day", zhihuHotSearch.getDay());
zhihu.put("time", zhihuHotSearch.getTime()); zhihu.put("time", zhihuHotSearch.getTime());
zhihu.put("changeCount", 0); zhihu.put("changeCount", 0);
...@@ -57,7 +55,7 @@ public class ZhihuHotSearchRun extends Thread{ ...@@ -57,7 +55,7 @@ public class ZhihuHotSearchRun extends Thread{
zhihu.put("type", zhihuHotSearch.getType()); zhihu.put("type", zhihuHotSearch.getType());
hotSearchDAO.addHotSearch(zhihu); hotSearchDAO.addHotSearch(zhihu);
} }
logger.info("知乎话题采集结束........"); log.info("知乎话题采集结束........");
} }
} }
#mongoIp=202.107.192.94 #mongoIp=202.107.192.94
mongoIp=192.168.0.101 mongoIp=192.168.0.101
mongoPort=30000 mongoPort=30000
#mongoIp=192.168.0.81 #mongoIp=192.168.0.81
#mongoPort=27017 #mongoPort=27017
db.username=datapush db.username=searchhotcrawleruser
db.paasword=4d8ce5c42073c db.paasword=searchhotcrawler1q2w3e4r
db.certifiedDB=admin db.certifiedDB=admin
dbName=hot_search_list dbName=hot_search_list
searchCollName=hot_search_list searchCollName=hot_search_list
topicCollName=topic_list topicCollName=topic_list
collWechatUserName=wechat_user collWechatUserName=wechat_user
\ No newline at end of file
registry=zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182 registry=zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182
group=hangzhou group=hangzhou
######################################################## ########################################################
#registry=zookeeper://192.168.0.36:2181 #registry=zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181
#group=local #group=local
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment