Commit cb6bcd76 by zhiwei

添加微博话题采集,并添加lombok

parent a9966f9d
......@@ -38,12 +38,17 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.5-SNAPSHOT</version>
<version>0.1.6-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.5.5.6-SNAPSHOT</version>
<version>0.6.0.4-RELEASE</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.8</version>
</dependency>
</dependencies>
......
......@@ -10,35 +10,79 @@ import java.io.Serializable;
import java.util.Date;
import com.zhiwei.tools.timeparse.TimeParse;
import lombok.Data;
import lombok.ToString;
@Data
@ToString
public class HotSearchList implements Serializable{
private static final long serialVersionUID = 2076919584659821600L;
private String id; //主键
/**
* 主键
*/
private String id;
private String url; //消息链接
/**
* 消息链接
*/
private String url;
private String name; //热搜关键词
/**
* 热搜关键词
*/
private String name;
private Integer count; //时时热搜量
/**
* 热搜或话题导语
*/
private String topicLead;
private Boolean hot; //状态(true 为热搜; false为时时上升)
/**
* 时时热搜量
*/
private Integer count;
private String day; //天
/**
* 状态(true 为热搜; false为时时上升)
*/
private Boolean hot;
private Date time; //时间
/**
* 天
*/
private String day;
private Integer changeCount; //据上分钟变化量
/**
* 时间
*/
private Date time;
private Integer rank; //排名
/**
* 据上分钟变化量
*/
private Integer changeCount;
private String type; //分类
/**
* 排名
*/
private Integer rank;
private String icon; //热搜类型
/**
* 分类
*/
private String type;
/**
* 热搜类型
*/
private String icon;
/**
* 话题讨论量
*/
private Integer commentCount;
public HotSearchList(){}
......@@ -69,120 +113,18 @@ public class HotSearchList implements Serializable{
}
@Override
public String toString(){
return "new HotSearchList["
+ "id = " + id
+ ", url = " + url
+ ", name = " + name
+ ", count = " + count
+ ", time = " + time
+ ", hot = " + hot
+ ", rank = " + rank
+ ", day = " + day
+ ", changeCount = " + changeCount
+ ", type = " + type
+ ", icon = " + icon
+ "]";
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
public HotSearchList(String url, String name, Integer count,Integer rank,String type, Integer commentCount, String topicLead){
this.id = name + "_" + new Date().getTime();
this.url = url;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public Integer getCount() {
return count;
}
public void setCount(Integer count) {
this.count = count;
}
public Date getTime() {
return time;
}
public void setTime(Date time) {
this.time = time;
}
public Integer getChangeCount() {
return changeCount;
}
public void setChangeCount(Integer changeCount) {
this.changeCount = changeCount;
}
public static long getSerialversionuid() {
return serialVersionUID;
}
public Boolean isHot() {
return hot;
}
public void setHot(Boolean hot) {
this.hot = hot;
}
public Boolean getHot() {
return hot;
}
public String getIcon() {
return icon;
}
public void setIcon(String icon) {
this.icon = icon;
}
public String getDay() {
return day;
}
public void setDay(String day) {
this.day = day;
}
public Integer getRank() {
return rank;
}
public void setRank(Integer rank) {
this.hot = true;
this.rank = rank;
}
public String getType() {
return type;
}
public void setType(String type) {
this.time = new Date();
this.day = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
this.type = type;
this.commentCount = commentCount;
this.topicLead = topicLead;
}
}
......@@ -5,5 +5,6 @@ public enum HotSearchType {
微博热搜,
知乎热搜,
抖音热搜,
搜狗微信热搜
搜狗微信热搜,
微博话题
}
package com.zhiwei.searchhotcrawler.bean;
import java.util.Date;
import com.zhiwei.tools.timeparse.TimeParse;
import lombok.Data;
import lombok.ToString;
/**
 * Data bean describing one entry of a Weibo super-topic ranking list.
 *
 * <p>Accessors, {@code equals}/{@code hashCode} and {@code toString} are
 * generated by Lombok via {@code @Data}.
 *
 * @author Bewilder ZW
 * @date 2019-09-27 15:33:08
 */
@Data
@ToString
public class WeiboSuperTopic {

    /**
     * Primary key; the full constructor builds it as {@code topicName_type_day}.
     */
    private String id;

    /**
     * Topic link.
     */
    public String url;

    /**
     * Topic name.
     */
    public String topicName;

    /**
     * Topic rank.
     */
    public Integer rank;

    /**
     * Topic influence score.
     */
    public String score;

    /**
     * Topic follower count.
     */
    public String fensi;

    /**
     * Topic read count; not set by the constructors.
     */
    public String readNum;

    /**
     * Topic post count; not set by the constructors.
     */
    public String postNum;

    /**
     * Ranking list type.
     */
    public String type;

    /**
     * Day of creation, formatted as {@code yyyy-MM-dd}.
     */
    private String day;

    /**
     * Creation timestamp.
     */
    private Date time;

    /**
     * Creates an empty bean; every field is left {@code null}.
     */
    public WeiboSuperTopic() {}

    /**
     * Creates a ranking entry stamped with the current time.
     *
     * @param url       topic link
     * @param topicName topic name
     * @param rank      topic rank
     * @param score     topic influence score
     * @param fensi     topic follower count
     * @param type      ranking list type
     */
    public WeiboSuperTopic(String url, String topicName, Integer rank, String score,
            String fensi, String type) {
        // Read the clock exactly once so that `time` and `day` (and therefore
        // `id`) can never disagree when construction straddles midnight.
        Date now = new Date();
        this.url = url;
        this.topicName = topicName;
        this.rank = rank;
        this.score = score;
        this.fensi = fensi;
        this.type = type;
        this.time = now;
        this.day = TimeParse.dateFormartString(now, "yyyy-MM-dd");
        this.id = topicName + "_" + type + "_" + this.day;
    }
}
package com.zhiwei.searchhotcrawler.bean;
import java.util.Date;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * Data bean describing one entry of a Weibo topic ranking list.
 *
 * @author Bewilder ZW
 * @date 2019-09-27 15:33:08
 */
public class WeiboTopic {

    /** Primary key; the full constructor builds it as {@code topicName_type_day}. */
    private String id;
    /** Topic link. */
    public String url;
    /** Topic name. */
    public String topicName;
    /** Topic rank. */
    public Integer rank;
    /** Topic influence score. */
    public String score;
    /** Topic follower count. */
    public String fensi;
    /** Topic read count; not set by the constructors. */
    public String readNum;
    /** Topic post count; not set by the constructors. */
    public String postNum;
    /** Ranking list type. */
    public String type;
    /** Day of creation, formatted as {@code yyyy-MM-dd}. */
    private String day;
    /** Creation timestamp. */
    private Date time;

    /**
     * Creates an empty bean; every field is left {@code null}.
     */
    public WeiboTopic() {}

    /**
     * Creates a ranking entry stamped with the current time.
     *
     * @param url       topic link
     * @param topicName topic name
     * @param rank      topic rank
     * @param score     topic influence score
     * @param fensi     topic follower count
     * @param type      ranking list type
     */
    public WeiboTopic(String url, String topicName, Integer rank, String score,
            String fensi, String type) {
        // Read the clock exactly once so that `time` and `day` (and therefore
        // `id`) can never disagree when construction straddles midnight.
        Date now = new Date();
        this.url = url;
        this.topicName = topicName;
        this.rank = rank;
        this.score = score;
        this.fensi = fensi;
        this.type = type;
        this.time = now;
        this.day = TimeParse.dateFormartString(now, "yyyy-MM-dd");
        this.id = topicName + "_" + type + "_" + this.day;
    }

    /**
     * Renders the ranking fields; {@code id}, {@code day} and {@code time}
     * are not part of the output.
     */
    @Override
    public String toString() {
        return "new WeiboTopic["
                + "topicName = " + topicName
                + ", rank = " + rank
                + ", score = " + score
                + ", fensi = " + fensi
                + ", type = " + type
                + ", readNum = " + readNum
                + ", postNum = " + postNum
                + ", url = " + url
                + "]";
    }

    public String getUrl() {
        return url;
    }

    public String getTopicName() {
        return topicName;
    }

    public Integer getRank() {
        return rank;
    }

    public String getScore() {
        return score;
    }

    public String getFensi() {
        return fensi;
    }

    public String getReadNum() {
        return readNum;
    }

    public String getPostNum() {
        return postNum;
    }

    public String getType() {
        return type;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public void setTopicName(String topicName) {
        this.topicName = topicName;
    }

    public void setRank(Integer rank) {
        this.rank = rank;
    }

    public void setScore(String score) {
        this.score = score;
    }

    public void setFensi(String fensi) {
        this.fensi = fensi;
    }

    public void setReadNum(String readNum) {
        this.readNum = readNum;
    }

    public void setPostNum(String postNum) {
        this.postNum = postNum;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getId() {
        return id;
    }

    public String getDay() {
        return day;
    }

    public Date getTime() {
        return time;
    }

    public void setId(String id) {
        this.id = id;
    }

    public void setDay(String day) {
        this.day = day;
    }

    public void setTime(Date time) {
        this.time = time;
    }
}
package com.zhiwei.searchhotcrawler.cache;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class CacheListener {
Logger logger = LoggerFactory.getLogger(CacheListener.class);
/**
* 开启缓存监听
*/
public void startListen() {
new Thread(){
public void run() {
......@@ -17,7 +20,7 @@ public class CacheListener {
for(String key : CacheManager.getAllKeys()) {
if (CacheManager.isTimeOut(key)) {
CacheManager.clearByKey(key);
logger.info(key + "缓存被清除");
log.info(key + "缓存被清除");
}
}
}
......
......@@ -5,6 +5,7 @@ import java.util.Collections;
import java.util.List;
import java.util.Objects;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
......@@ -24,16 +25,15 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
* @author hero
* @date 2019年7月10日 上午10:54:31
*/
@Log4j2
public class BaiDuHotSearchCrawler {
private static Logger logger = LoggerFactory.getLogger(BaiDuHotSearchCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @Title: BaiDuHotSearchTest
* @author hero
* @Description: TODO(PC端百度风云榜采集)
* @param 设定文件
* @Description: PC端百度风云榜采集
* @return void 返回类型
*/
public static List<HotSearchList> baiduHotSearch() {
......@@ -43,10 +43,10 @@ public class BaiDuHotSearchCrawler {
if (htmlBody != null && htmlBody.contains("mainBody")) {
return ansysData(htmlBody);
} else {
logger.info("解析百度风云榜时出现解析错误,页面结构有问题");
log.info("解析百度风云榜时出现解析错误,页面结构有问题");
}
} catch (Exception e) {
logger.error("解析百度风云榜时出现解析错误,页面结构有问题", e);
log.error("解析百度风云榜时出现解析错误,页面结构有问题", e);
}
return Collections.emptyList();
}
......@@ -101,12 +101,12 @@ public class BaiDuHotSearchCrawler {
list.add(hotSearch);
}
} catch (Exception e) {
logger.error("解析百度风云榜时出现解析错误", e);
log.error("解析百度风云榜时出现解析错误", e);
}
});
}
} catch (Exception e) {
logger.error("解析百度风云榜时出现解析错误,数据不是json结构", e);
log.error("解析百度风云榜时出现解析错误,数据不是json结构", e);
}
return list;
}
......
......@@ -4,6 +4,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -24,9 +25,9 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
* @author win 10
* @date:2019年07月11日 上午10:26:21
*/
@Log4j2
public class DouyinHotSearchCrawler {
private static Logger logger = LoggerFactory.getLogger(ZhihuHotSearchCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
......@@ -66,7 +67,7 @@ public class DouyinHotSearchCrawler {
}
}
} catch (IOException e) {
logger.debug("获取抖音热搜榜时出现问题:{}", e);
log.debug("获取抖音热搜榜时出现问题:{}", e);
}
return list;
}
......
......@@ -6,6 +6,7 @@ import java.util.List;
import java.util.Map;
import java.util.Objects;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
......@@ -27,16 +28,15 @@ import com.zhiwei.tools.httpclient.HeaderTool;
* @author hero
* @date 2019年7月10日 上午10:54:31
*/
@Log4j2
public class SougoHotSearchCrawler {
private static Logger logger = LoggerFactory.getLogger(SougoHotSearchCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @Title: SougoHotSearchTest
* @author hero
* @Description: TODO(PC端搜狗微信关键词采集)
* @param 设定文件
* @return void 返回类型
*/
public static List<HotSearchList> sougoHotSearch() {
......@@ -76,19 +76,19 @@ public class SougoHotSearchCrawler {
list.add(hotSearch);
}
} catch (Exception e) {
logger.error("解析搜狗微信时出现解析错误", e);
log.error("解析搜狗微信时出现解析错误", e);
}
}
} catch (Exception e) {
logger.error("解析搜狗微信时出现解析错误,数据不是json结构", e.fillInStackTrace());
log.error("解析搜狗微信时出现解析错误,数据不是json结构", e.fillInStackTrace());
return Collections.emptyList();
}
} else {
logger.info("解析搜狗微信时出现解析错误,页面结构有问题");
log.info("解析搜狗微信时出现解析错误,页面结构有问题");
}
break;
} catch (Exception e) {
logger.error("解析搜狗微信时出现解析错误,页面结构有问题", e);
log.error("解析搜狗微信时出现解析错误,页面结构有问题", e);
}
}
return list;
......
......@@ -7,6 +7,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
......@@ -27,13 +28,13 @@ import com.zhiwei.tools.tools.URLCodeUtil;
/**
* @ClassName: WeiboHotSearch
* @Description: TODO(微博实时热搜采集)
* @Description: 微博实时热搜采集
* @author hero
* @date 2017年9月15日 上午10:54:31
*/
@Log4j2
public class WeiboHotSearchCrawler {
private static Logger logger = LoggerFactory.getLogger(WeiboHotSearchCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @Title: weiboHotSearchTest
......@@ -70,18 +71,18 @@ public class WeiboHotSearchCrawler {
list.add(hotSearch);
} catch (Exception e) {
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
logger.error("解析微博时时热搜时出现解析错误", e);
log.error("解析微博时时热搜时出现解析错误", e);
continue;
}
}
} catch (Exception e) {
logger.error("解析微博时时热搜时出现解析错误,数据不是json结构",e.fillInStackTrace());
log.error("解析微博时时热搜时出现解析错误,数据不是json结构",e.fillInStackTrace());
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
return null;
}
}else{
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
logger.info("解析微博时时热搜时出现解析错误,页面结构有问题");
log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
}
break;
} catch (Exception e) {
......@@ -138,25 +139,25 @@ public class WeiboHotSearchCrawler {
}
String id = "http://s.weibo.com/weibo/"+URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon);
logger.info("采集到的数据:::{}", hotSearch);
log.info("采集到的数据:::{}", hotSearch);
result.add(hotSearch);
rank++;
}
} catch (Exception e) {
logger.error("解析微博时时热搜时出现解析错误",e);
log.error("解析微博时时热搜时出现解析错误",e);
continue;
}
}
return result;
} catch (Exception e) {
logger.error("解析微博时时热搜时出现解析错误,数据不是json结构",e);
log.error("解析微博时时热搜时出现解析错误,数据不是json结构",e);
return Collections.emptyList();
}
}else{
logger.info("解析微博时时热搜时出现解析错误,页面结构有问题");
log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
}
} catch (IOException e1) {
logger.error("解析微博时时热搜时出现连接失败",e1);
log.error("解析微博时时热搜时出现连接失败",e1);
return Collections.emptyList();
}
return Collections.emptyList();
......
......@@ -8,6 +8,8 @@ import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -17,18 +19,17 @@ import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.WeiboTopic;
/**
*
* @ClassName: WeiboHuatiCrawler
* @Description: 微博话题榜单采集(明星)
* @ClassName: WeiboSuperTopicCrawler
* @Description: 微博超话榜单采集(明星)
* @author Bewilder ZW
* @date 2019年9月27日 下午3:01:34
*/
public class WeiboHuatiCrawler {
@Log4j2
public class WeiboSuperTopicCrawler {
private static Logger logger = LoggerFactory.getLogger(WeiboHuatiCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static Map<String,String> headMap = new HashMap<>();
......@@ -44,13 +45,13 @@ public class WeiboHuatiCrawler {
* 开始采集明星话题
* @return void
*/
public static List<WeiboTopic> startCrawler() {
public static List<WeiboSuperTopic> startCrawler() {
Map<String,String> urlMap = new HashMap<>();
urlMap.put("明星", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=star&from=&wm=");
urlMap.put("明星潜力", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=potential&from=&wm=");
urlMap.put("明星上升", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=up&from=&wm=");
List<WeiboTopic> topicList = new ArrayList<>();
List<WeiboSuperTopic> topicList = new ArrayList<>();
for(Entry<String,String> entry : urlMap.entrySet()) {
String url = entry.getValue();
......@@ -66,10 +67,10 @@ public class WeiboHuatiCrawler {
topicList.addAll(parseTopicRankHtml(page, htmlBody, type));
break;
}else {
logger.error("获取榜单列表页面时数据格式错误,页面为:{}", htmlBody);
log.error("获取榜单列表页面时数据格式错误,页面为:{}", htmlBody);
}
} catch (Exception e) {
logger.error("获取榜单列表页面时出现错误,错误为:{}", e);
log.error("获取榜单列表页面时出现错误,错误为:{}", e);
continue;
}
}
......@@ -87,13 +88,13 @@ public class WeiboHuatiCrawler {
* @param type
* @return void
*/
private static List<WeiboTopic> parseTopicRankHtml(int page,String htmlBody, String type) {
private static List<WeiboSuperTopic> parseTopicRankHtml(int page,String htmlBody, String type) {
try {
JSONArray list = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("list");
if(Objects.nonNull(list) && !list.isEmpty()) {
page = (page-1)*20;
List<WeiboTopic> topicList = new ArrayList<>();
List<WeiboSuperTopic> topicList = new ArrayList<>();
Integer toprank = null;
String topicName = null;
String id = null;
......@@ -111,7 +112,7 @@ public class WeiboHuatiCrawler {
fensi = desc1.replaceAll(".*影响力|粉丝", "").trim();
url = data.getString("link");
WeiboTopic topic = new WeiboTopic(url, topicName, toprank, score, fensi, type);
WeiboSuperTopic topic = new WeiboSuperTopic(url, topicName, toprank, score, fensi, type);
topic = getTopicInfo(id, topic);
System.out.println("topic====="+topic);
topicList.add(topic);
......@@ -119,7 +120,7 @@ public class WeiboHuatiCrawler {
return topicList;
}
} catch (Exception e) {
logger.error("解析榜单列表页面时出现错误,错误为:{}", e);
log.error("解析榜单列表页面时出现错误,错误为:{}", e);
}
return Collections.emptyList();
}
......@@ -134,7 +135,7 @@ public class WeiboHuatiCrawler {
* @return
* @return WeiboTopic
*/
private static WeiboTopic getTopicInfo(String id, WeiboTopic topic) {
private static WeiboSuperTopic getTopicInfo(String id, WeiboSuperTopic topic) {
for(int retryTimes=1; retryTimes<=3; retryTimes++) {
try {
String url = "https://m.weibo.cn/api/container/getIndex?containerid="+ id;
......@@ -151,7 +152,7 @@ public class WeiboHuatiCrawler {
}
}
} catch (Exception e) {
logger.error("解析榜单详情页面时出现错误,错误为:{}", e);
log.error("解析榜单详情页面时出现错误,错误为:{}", e);
}
}
return topic;
......
......@@ -5,6 +5,7 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -24,15 +25,14 @@ import com.zhiwei.tools.tools.URLCodeUtil;
* @author hero
* @date 2017年9月15日 上午10:54:31
*/
@Log4j2
public class ZhihuHotSearchCrawler {
private static Logger logger = LoggerFactory.getLogger(ZhihuHotSearchCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @Title: getZhihuHotList
* @author hero
* @Description: 知乎热搜采集程序
* @param 设定文件
* @return void 返回类型
*/
public static List<HotSearchList> getZhihuHotList(){
......@@ -65,7 +65,7 @@ public class ZhihuHotSearchCrawler {
}
}
} catch (IOException e) {
logger.debug("获取知乎热搜时出现问题:{}", e);
log.debug("获取知乎热搜时出现问题:{}", e);
return list;
}
return list;
......@@ -106,7 +106,7 @@ public class ZhihuHotSearchCrawler {
}
}
} catch (IOException e) {
logger.debug("获取知乎热搜时出现问题:{}", e);
log.debug("获取知乎热搜时出现问题:{}", e);
return list;
}
return list;
......
......@@ -4,7 +4,9 @@ package com.zhiwei.searchhotcrawler.dao;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Objects;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -17,8 +19,8 @@ import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.tools.timeparse.TimeParse;
@Log4j2
public class HotSearchListDAO extends MongoDBTemplate{
private static Logger logger = LoggerFactory.getLogger(HotSearchListDAO.class);
public HotSearchListDAO() {
super();
......@@ -28,6 +30,19 @@ public class HotSearchListDAO extends MongoDBTemplate{
String month = time.substring(5,7);
String collName = Config.searchCollName + year + "_" + month;
super.setCollName(collName);
//给数据表创建索引
createIndex();
}
/**
* 初次创建表及创建相应的索引
*/
private void createIndex(){
List<DBObject> indexList = this.getReadColl().getIndexInfo();
// Build the indexes only when none are reported yet. The null check must
// short-circuit with "||": with "&&" a null list raised an NPE in isEmpty()
// and a non-null list made the condition always false.
if(Objects.isNull(indexList) || indexList.isEmpty()){
DBObject countIndexDoc = new BasicDBObject();
countIndexDoc.put("count", -1);
DBObject timeIndexDoc = new BasicDBObject();
......@@ -48,6 +63,7 @@ public class HotSearchListDAO extends MongoDBTemplate{
e.printStackTrace();
}
}
}
/**
* 添加数据入库
......@@ -57,7 +73,7 @@ public class HotSearchListDAO extends MongoDBTemplate{
try {
this.getReadColl().insert(list);
} catch (Exception e) {
logger.error("存储数据时出错,错误为:{}", e);
log.error("存储数据时出错,错误为:{}", e);
}
}
......@@ -65,7 +81,7 @@ public class HotSearchListDAO extends MongoDBTemplate{
try {
this.getReadColl().insert(doc);
} catch (Exception e) {
logger.error("存储数据时出错,错误为:{}", e);
log.error("存储数据时出错,错误为:{}", e);
}
}
......@@ -94,7 +110,7 @@ public class HotSearchListDAO extends MongoDBTemplate{
}
cur.close();
} catch (Exception e) {
logger.error("存储数据时出错,错误为:{}", e);
log.error("存储数据时出错,错误为:{}", e);
return result;
}
return result;
......@@ -128,7 +144,7 @@ public class HotSearchListDAO extends MongoDBTemplate{
}
cur.close();
} catch (Exception e) {
logger.error("存储数据时出错,错误为:{}", e);
log.error("存储数据时出错,错误为:{}", e);
}
return list;
}
......
......@@ -3,6 +3,7 @@ package com.zhiwei.searchhotcrawler.dao;
import java.util.Collections;
import java.util.List;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -12,8 +13,8 @@ import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
@Log4j2
public class WechatUserDao extends MongoDBTemplate{
private static Logger logger = LoggerFactory.getLogger(BaiDuHotSearchCrawler.class);
public WechatUserDao() {
super();
......@@ -39,7 +40,7 @@ public class WechatUserDao extends MongoDBTemplate{
this.getReadColl().save(doc);
break;
} catch (Exception e) {
logger.error("存储数据时出错,错误为:{}", e);
log.error("存储数据时出错,错误为:{}", e);
}
}
}
......@@ -61,7 +62,7 @@ public class WechatUserDao extends MongoDBTemplate{
return (List<String>)doc.get("user");
}
} catch (Exception e) {
logger.error("存储数据时出错,错误为:{}", e);
log.error("存储数据时出错,错误为:{}", e);
}
return Collections.emptyList();
}
......
......@@ -3,7 +3,9 @@ package com.zhiwei.searchhotcrawler.dao;
import java.util.Date;
import java.util.List;
import java.util.Objects;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -13,10 +15,10 @@ import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.tools.timeparse.TimeParse;
public class WeiboTopicDAO extends MongoDBTemplate{
private static Logger logger = LoggerFactory.getLogger(WeiboTopicDAO.class);
@Log4j2
public class WeiboSuperTopicDAO extends MongoDBTemplate{
public WeiboTopicDAO() {
public WeiboSuperTopicDAO() {
super();
super.setDbName(Config.dbName);
String time = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
......@@ -25,6 +27,16 @@ public class WeiboTopicDAO extends MongoDBTemplate{
String collName = Config.topicCollName + year + "_" + month;
super.setCollName(collName);
createIndex();
}
/**
* 初次创建表及创建相应的索引
*/
private void createIndex(){
List<DBObject> indexList = this.getReadColl().getIndexInfo();
// Build the indexes only when none are reported yet. The null check must
// short-circuit with "||": with "&&" a null list raised an NPE in isEmpty()
// and a non-null list made the condition always false.
if(Objects.isNull(indexList) || indexList.isEmpty()){
DBObject countIndexDoc = new BasicDBObject();
countIndexDoc.put("score_num", -1);
DBObject timeIndexDoc = new BasicDBObject();
......@@ -45,6 +57,8 @@ public class WeiboTopicDAO extends MongoDBTemplate{
e.printStackTrace();
}
}
}
/**
* 添加数据入库
......@@ -54,7 +68,7 @@ public class WeiboTopicDAO extends MongoDBTemplate{
try {
this.getReadColl().insert(list);
} catch (Exception e) {
logger.error("存储数据时出错,错误为:{}", e);
log.error("存储数据时出错,错误为:{}", e);
}
}
......@@ -62,7 +76,7 @@ public class WeiboTopicDAO extends MongoDBTemplate{
try {
this.getReadColl().insert(doc);
} catch (Exception e) {
logger.error("存储数据时出错,错误为:{}", e);
log.error("存储数据时出错,错误为:{}", e);
}
}
......
package com.zhiwei.searchhotcrawler.run;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.cache.CacheListener;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.DouyinHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.SendWeiboHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.SendZhihuHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.SougoHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.UpdateWechatUserRun;
import com.zhiwei.searchhotcrawler.timer.WeiboHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.WeiboTopicRun;
import com.zhiwei.searchhotcrawler.timer.ZhihuHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.*;
import com.zhiwei.tools.tools.ZhiWeiTools;
import java.util.concurrent.Executors;
......@@ -24,7 +16,9 @@ public class HotSearchRun {
public static void main(String[] args) {
ProxyFactory.init(ProxyConfig.registry, ProxyConfig.group, GroupType.PROVIDER, 10000013);
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
ProxyFactory.init(simpleConfig);
new UpdateWechatUserRun().start();
ZhiWeiTools.sleep(10000);
......@@ -51,6 +45,7 @@ public class HotSearchRun {
new SougoHotSearchRun().start();
new DouyinHotSearchRun().start();
new ZhihuHotSearchRun().start();
new WeiboSuperTopicRun().start();
new WeiboTopicRun().start();
//推送程序启动
new SendWeiboHotSearchRun().start();
......
......@@ -16,90 +16,125 @@ import com.mongodb.MongoClient;
import com.mongodb.MongoCredential;
import com.mongodb.ServerAddress;
import com.mongodb.WriteResult;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.tools.timeparse.TimeParse;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class HotSearchListTest{
public static void main(String[] args) {
MongoCredential credential = MongoCredential.createCredential(Config.userName, Config.authDB, Config.userPwd.toCharArray());
ServerAddress address = new ServerAddress(Config.mongoIp, Config.mongoPort);
Mongo mongo = new MongoClient(address, Arrays.asList(credential));
DB db = mongo.getDB("hot_search_list");
DBCollection coll = db.getCollection("hot_search_list2019_09");
// MongoCredential credentialNew = MongoCredential.createCredential("datapush", "admin", "4d8ce5c42073c".toCharArray());
// ServerAddress addressNew = new ServerAddress(Config.mongoIp, Config.mongoPort);
// Mongo mongoNew = new MongoClient(address, Arrays.asList(credentialNew));
// DB dbNew = mongoNew.getDB("hot_search_list");
Map<String,String> timLine = TimeParse.getTimeMap("2019-10-01 00:00:00", "2019-10-09 23:59:59", "dd", 1);
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
.group(ProxyConfig.group).appId(10000013).appName("zzw").build();
ProxyFactory.init(simpleConfig);
String url = "http://app.myzaker.com/news/app.php?f=";
HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
try{
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
Elements elements = Jsoup.parse(htmlBody).select("div.titlebar>a");
for(Element element : elements){
String lableUrl = "http://app.myzaker.com/news/app.php" + element.attr("href");
System.out.println("lableUrl========="+lableUrl);
String htmlBodyLable = httpBoot.syncCall(RequestUtils.wrapGet(lableUrl), ProxyHolder.NAT_HEAVY_PROXY).body().string();
Elements elementsLable = Jsoup.parse(htmlBodyLable).select("div#infinite_scroll>a");
for(Element elementLable : elementsLable){
System.out.println(elementLable.attr("href") + "=============" + elementLable.text());
}
}
timLine.forEach((start, end) ->{
}catch (Exception e){
e.printStackTrace();
}
String year = end.substring(0,4);
String month = end.substring(5,7);
Date startDate = TimeParse.stringFormartDate(start);
Date endDate = TimeParse.stringFormartDate(end);
String collName = "hot_search_list"+year+"_"+month;
System.out.println("collName=========="+collName);
// DBCollection collNew = dbNew.getCollection(collName);
// DBObject countIndexDoc = new BasicDBObject();
// countIndexDoc.put("count", -1);
// DBObject timeIndexDoc = new BasicDBObject();
// timeIndexDoc.put("time", -1);
// DBObject rankIndexDoc = new BasicDBObject();
// rankIndexDoc.put("rank", -1);
// DBObject nameIndexDoc = new BasicDBObject();
// nameIndexDoc.put("name", -1);
// DBObject typeIndexDoc = new BasicDBObject();
// typeIndexDoc.put("type", -1);
// try {
// collNew.createIndex(countIndexDoc, new BasicDBObject("name", "count_desc"));
// collNew.createIndex(timeIndexDoc, new BasicDBObject("name", "time_desc"));
// collNew.createIndex(rankIndexDoc, new BasicDBObject("name", "rank_desc"));
// collNew.createIndex(nameIndexDoc, new BasicDBObject("name", "name_desc"));
// collNew.createIndex(typeIndexDoc, new BasicDBObject("name", "type_desc"));
// } catch (Exception e) {
// e.printStackTrace();
// }
DBObject query = new BasicDBObject(new BasicDBObject("time",
new BasicDBObject("$gte",startDate).append("$lte", endDate)));
System.out.println(query);
WriteResult wr = coll.remove(query);
System.out.println("========"+wr.getN());
// int i = 0;
// DBCursor cur = coll.remove(query);
// System.out.println(query +"======="+ cur.count());
// List<DBObject> dataList = new ArrayList<>();
// while(cur.hasNext()) {
// DBObject doc = cur.next();
// try {
//// collNew.save(doc);
// i++;
// coll.remove(doc);
// } catch (Exception e2) {
// e2.printStackTrace();
// }
// dataList.add(doc);
// }
// System.out.println(collName +"数据量大小" +dataList.size());
// cur.close();
// if(!dataList.isEmpty()) {
// try {
// collNew.insert(dataList);
// } catch (Exception e) {
// e.printStackTrace();
// }
// }
});
mongo.close();
// MongoCredential credential = MongoCredential.createCredential(Config.userName, Config.authDB, Config.userPwd.toCharArray());
// ServerAddress address = new ServerAddress(Config.mongoIp, Config.mongoPort);
// Mongo mongo = new MongoClient(address, Arrays.asList(credential));
//
// DB db = mongo.getDB("hot_search_list");
// DBCollection coll = db.getCollection("hot_search_list2019_09");
//
//// MongoCredential credentialNew = MongoCredential.createCredential("datapush", "admin", "4d8ce5c42073c".toCharArray());
//// ServerAddress addressNew = new ServerAddress(Config.mongoIp, Config.mongoPort);
//// Mongo mongoNew = new MongoClient(address, Arrays.asList(credentialNew));
//// DB dbNew = mongoNew.getDB("hot_search_list");
//
// Map<String,String> timLine = TimeParse.getTimeMap("2019-10-01 00:00:00", "2019-10-09 23:59:59", "dd", 1);
//
// timLine.forEach((start, end) ->{
//
// String year = end.substring(0,4);
// String month = end.substring(5,7);
// Date startDate = TimeParse.stringFormartDate(start);
// Date endDate = TimeParse.stringFormartDate(end);
//
// String collName = "hot_search_list"+year+"_"+month;
// System.out.println("collName=========="+collName);
//// DBCollection collNew = dbNew.getCollection(collName);
//// DBObject countIndexDoc = new BasicDBObject();
//// countIndexDoc.put("count", -1);
//// DBObject timeIndexDoc = new BasicDBObject();
//// timeIndexDoc.put("time", -1);
//// DBObject rankIndexDoc = new BasicDBObject();
//// rankIndexDoc.put("rank", -1);
//// DBObject nameIndexDoc = new BasicDBObject();
//// nameIndexDoc.put("name", -1);
//// DBObject typeIndexDoc = new BasicDBObject();
//// typeIndexDoc.put("type", -1);
//// try {
//// collNew.createIndex(countIndexDoc, new BasicDBObject("name", "count_desc"));
//// collNew.createIndex(timeIndexDoc, new BasicDBObject("name", "time_desc"));
//// collNew.createIndex(rankIndexDoc, new BasicDBObject("name", "rank_desc"));
//// collNew.createIndex(nameIndexDoc, new BasicDBObject("name", "name_desc"));
//// collNew.createIndex(typeIndexDoc, new BasicDBObject("name", "type_desc"));
//// } catch (Exception e) {
//// e.printStackTrace();
//// }
//
// DBObject query = new BasicDBObject(new BasicDBObject("time",
// new BasicDBObject("$gte",startDate).append("$lte", endDate)));
// System.out.println(query);
// WriteResult wr = coll.remove(query);
// System.out.println("========"+wr.getN());
//// int i = 0;
//// DBCursor cur = coll.remove(query);
//// System.out.println(query +"======="+ cur.count());
//// List<DBObject> dataList = new ArrayList<>();
//// while(cur.hasNext()) {
//// DBObject doc = cur.next();
//// try {
////// collNew.save(doc);
//// i++;
//// coll.remove(doc);
//// } catch (Exception e2) {
//// e2.printStackTrace();
//// }
//// dataList.add(doc);
//// }
//// System.out.println(collName +"数据量大小" +dataList.size());
//// cur.close();
//// if(!dataList.isEmpty()) {
//// try {
//// collNew.insert(dataList);
//// } catch (Exception e) {
//// e.printStackTrace();
//// }
//// }
// });
// mongo.close();
}
......
......@@ -6,6 +6,7 @@ import java.util.List;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -16,10 +17,9 @@ import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class BaiduHotSearchRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(BaiduHotSearchRun.class);
@Override
public void run() {
boolean f = true;
......@@ -37,10 +37,10 @@ public class BaiduHotSearchRun extends Thread{
private void getHotList() {
logger.info("百度风云榜采集开始........");
log.info("百度风云榜采集开始........");
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
List<HotSearchList> list = BaiDuHotSearchCrawler.baiduHotSearch();
logger.info("{}, 此轮百度风云榜采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
log.info("{}, 此轮百度风云榜采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> saveDataList = new ArrayList<>();
if(Objects.nonNull(list) && !list.isEmpty()) {
list.forEach(baiduHotSearch ->{
......@@ -59,7 +59,7 @@ public class BaiduHotSearchRun extends Thread{
});
}
hotSearchDAO.addHotSearchList(saveDataList);
logger.info("百度风云榜采集结束........");
log.info("百度风云榜采集结束........");
}
}
\ No newline at end of file
......@@ -5,6 +5,7 @@ import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -15,10 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.DouyinHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class DouyinHotSearchRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(DouyinHotSearchRun.class);
@Override
public void run() {
boolean f = true;
......@@ -40,10 +40,10 @@ public class DouyinHotSearchRun extends Thread{
* @return void
*/
private void getHotList() {
logger.info("抖音热搜榜采集开始........");
log.info("抖音热搜榜采集开始........");
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
List<HotSearchList> list = DouyinHotSearchCrawler.getMobileDouyinHotList();
logger.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
log.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
for(HotSearchList douyinHotSearch : list){
int changeCount = hotSearchDAO.getChangeCount(douyinHotSearch);
......@@ -60,7 +60,7 @@ public class DouyinHotSearchRun extends Thread{
data.add(douyin);
hotSearchDAO.addHotSearch(douyin);
}
logger.info("抖音热搜榜采集结束........");
log.info("抖音热搜榜采集结束........");
}
}
......@@ -6,6 +6,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -20,17 +21,17 @@ import com.zhiwei.searchhotcrawler.util.WechatConstant;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class SendWeiboHotSearchRun extends Thread {
private HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
private static WechatUserDao wechatUserDao = new WechatUserDao();
private static Logger logger = LoggerFactory.getLogger(SendWeiboHotSearchRun.class);
@Override
public void run() {
while (true) {
try {
Calendar calendar = Calendar.getInstance();
int hour = calendar.get(Calendar.HOUR_OF_DAY);
logger.info("微博推送,当前系统时间为:" + hour);
log.info("微博推送,当前系统时间为:" + hour);
if (hour > 6 && hour < 23) {
List<DBObject> list = hotSearchDAO.getHotOneHour(HotSearchType.微博热搜.name());
if (list != null && !list.isEmpty()) {
......@@ -41,14 +42,14 @@ public class SendWeiboHotSearchRun extends Thread {
sendTemplateByUserIds(title, time, url);
}
} else {
logger.info("微博最近一小时无数据");
log.info("微博最近一小时无数据");
sendTemplateByUserIds("最近一小时无数据",
TimeParse.dateFormartString(new Date(), "yyyy-MM-dd HH:mm:ss"), null);
}
}
ZhiWeiTools.sleep(1 * 60 * 60 * 1000);
} catch (Exception e) {
logger.debug("微博热搜推送出现问题,问题为:::{}", e.fillInStackTrace());
log.debug("微博热搜推送出现问题,问题为:::{}", e.fillInStackTrace());
ZhiWeiTools.sleep(1 * 60 * 60 * 1000);
continue;
}
......@@ -100,7 +101,7 @@ public class SendWeiboHotSearchRun extends Thread {
WechatCodeUtil.sendDataJson(templateJson);
}
} else {
logger.info("拉取微博用户列表失败");
log.info("拉取微博用户列表失败");
}
}
......
......@@ -6,6 +6,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -20,10 +21,10 @@ import com.zhiwei.searchhotcrawler.util.WechatConstant;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class SendZhihuHotSearchRun extends Thread{
private HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
private static WechatUserDao wechatUserDao = new WechatUserDao();
private static Logger logger = LoggerFactory.getLogger(SendZhihuHotSearchRun.class);
@Override
public void run() {
......@@ -31,7 +32,7 @@ public class SendZhihuHotSearchRun extends Thread{
try {
Calendar calendar = Calendar.getInstance();
int hour = calendar.get(Calendar.HOUR_OF_DAY);
logger.info("知乎推送,当前系统时间为:"+hour);
log.info("知乎推送,当前系统时间为:"+hour);
if(hour > 6 && hour <23){
List<DBObject> list = hotSearchDAO.getHotOneHour(HotSearchType.知乎热搜.name());
if(list!=null && !list.isEmpty()){
......@@ -44,13 +45,13 @@ public class SendZhihuHotSearchRun extends Thread{
}
}
}else{
logger.info("知乎最近一小时无数据");
log.info("知乎最近一小时无数据");
sendTemplateByUserIds("最近一小时无数据", TimeParse.dateFormartString(new Date(), "yyyy-MM-dd HH:mm:ss"), null);
}
}
ZhiWeiTools.sleep(1*60*60*1000);
} catch (Exception e) {
logger.debug("知乎热搜推送出现问题,问题为:::{}",e.fillInStackTrace());
log.debug("知乎热搜推送出现问题,问题为:::{}",e.fillInStackTrace());
ZhiWeiTools.sleep(1*60*60*1000);
}
}
......@@ -101,7 +102,7 @@ public class SendZhihuHotSearchRun extends Thread{
WechatCodeUtil.sendDataJson(templateJson);
}
}else {
logger.info("知乎推送拉取用户列表失败");
log.info("知乎推送拉取用户列表失败");
}
}
......
......@@ -5,6 +5,7 @@ import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -15,8 +16,8 @@ import com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class SougoHotSearchRun extends Thread {
private static Logger logger = LoggerFactory.getLogger(SougoHotSearchRun.class);
@Override
public void run() {
......@@ -36,9 +37,9 @@ public class SougoHotSearchRun extends Thread {
private void getHotList() {
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
logger.info("搜狗微信采集开始........");
log.info("搜狗微信采集开始........");
List<HotSearchList> list = SougoHotSearchCrawler.sougoHotSearch();
logger.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
log.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
for(HotSearchList sougoHotSearch : list){
DBObject doc = new BasicDBObject();
......@@ -52,7 +53,7 @@ public class SougoHotSearchRun extends Thread {
data.add(doc);
}
hotSearchDAO.addHotSearchList(data);
logger.info("搜狗微信采集结束........");
log.info("搜狗微信采集结束........");
}
}
......@@ -5,6 +5,7 @@ import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -12,24 +13,24 @@ import com.zhiwei.searchhotcrawler.dao.WechatUserDao;
import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class UpdateWechatUserRun extends Thread{
private WechatUserDao wechatUserDao = new WechatUserDao();
private static Logger logger = LoggerFactory.getLogger(UpdateWechatUserRun.class);
@Override
public void run() {
logger.info("开始更新用户数据");
log.info("开始更新用户数据");
while(true) {
try {
Calendar calendar = Calendar.getInstance();
int hour = calendar.get(Calendar.HOUR_OF_DAY);
if(hour > 6 ){
Map<String,Integer> groupMap = WechatCodeUtil.getAllGroupIp();
logger.info("此公众号的分组数量为:::{}", groupMap.size());
log.info("此公众号的分组数量为:::{}", groupMap.size());
if(!groupMap.isEmpty() && groupMap!=null){
for(Entry<String,Integer> group : groupMap.entrySet()){
logger.info("此公众号的分组名称及IP为:::{},{}", group.getKey(), group.getValue());
log.info("此公众号的分组名称及IP为:::{},{}", group.getKey(), group.getValue());
List<String> userList = WechatCodeUtil.getUserListByGroupId(group.getValue());
logger.info("{},此分组下的用户数量为::{}", group.getKey(), userList.size());
log.info("{},此分组下的用户数量为::{}", group.getKey(), userList.size());
if(userList!=null && !userList.isEmpty()){
wechatUserDao.addWechatUser(userList, group.getKey(), group.getValue());
}
......@@ -38,7 +39,7 @@ public class UpdateWechatUserRun extends Thread{
}
ZhiWeiTools.sleep(1*60*60*1000);
} catch (Exception e) {
logger.debug("知乎热搜推送出现问题,问题为:::{}",e.fillInStackTrace());
log.debug("知乎热搜推送出现问题,问题为:::{}",e.fillInStackTrace());
ZhiWeiTools.sleep(1*60*60*1000);
continue;
}
......
......@@ -5,6 +5,7 @@ import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -15,10 +16,9 @@ import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class WeiboHotSearchRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(WeiboHotSearchRun.class);
@Override
public void run() {
boolean f = true;
......@@ -36,11 +36,11 @@ public class WeiboHotSearchRun extends Thread{
private void getHotList() {
logger.info("微博话题采集开始........");
log.info("微博话题采集开始........");
HotSearchListDAO weiboHotSearchDAO = new HotSearchListDAO();
// List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearch();
List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearchByPhone();
logger.info("{}, 微博此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
log.info("{}, 微博此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
for(HotSearchList weiboHotSearch : list){
int changeCount = weiboHotSearchDAO.getChangeCount(weiboHotSearch);
......@@ -49,7 +49,7 @@ public class WeiboHotSearchRun extends Thread{
doc.put("name", weiboHotSearch.getName());
doc.put("url", weiboHotSearch.getUrl());
doc.put("count", weiboHotSearch.getCount());
doc.put("hot", weiboHotSearch.isHot());
doc.put("hot", weiboHotSearch.getHot());
doc.put("day", weiboHotSearch.getDay());
doc.put("time", weiboHotSearch.getTime());
doc.put("changeCount", changeCount);
......@@ -59,7 +59,7 @@ public class WeiboHotSearchRun extends Thread{
data.add(doc);
}
weiboHotSearchDAO.addHotSearchList(data);
logger.info("微博话题采集结束........");
log.info("微博话题采集结束........");
}
}
package com.zhiwei.searchhotcrawler.timer;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
import com.zhiwei.searchhotcrawler.crawler.WeiboSuperTopicCrawler;
import com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class WeiboSuperTopicRun extends Thread{
@Override
public void run() {
boolean f = true;
while(f) {
try {
getTopicList();
TimeUnit.HOURS.sleep(3);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*60*1000);
}
ZhiWeiTools.sleep(50);
}
}
private void getTopicList() {
WeiboSuperTopicDAO weiboTopicDAO = new WeiboSuperTopicDAO();
log.info("微博超话采集开始........");
List<WeiboSuperTopic> list = WeiboSuperTopicCrawler.startCrawler();
log.info("{}, 微博超话此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
for(WeiboSuperTopic topic : list){
log.info("topic::::{}", topic);
DBObject doc = new BasicDBObject();
doc.put("_id", topic.getId());
doc.put("name", topic.getTopicName());
doc.put("rank", topic.getRank());
doc.put("score_num", topic.getScore());
doc.put("fensi_num", topic.getFensi());
doc.put("post_num", topic.getPostNum());
doc.put("type", topic.getType());
doc.put("day", topic.getDay());
doc.put("time", topic.getTime());
doc.put("url", topic.getUrl());
data.add(doc);
}
weiboTopicDAO.addTopicList(data);
log.info("微博话题采集结束........");
}
}
package com.zhiwei.searchhotcrawler.timer;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.WeiboTopicCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.WeiboTopic;
import com.zhiwei.searchhotcrawler.crawler.WeiboHuatiCrawler;
import com.zhiwei.searchhotcrawler.dao.WeiboTopicDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class WeiboTopicRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(WeiboTopicRun.class);
@Override
public void run() {
boolean f = true;
while(f) {
try {
getTopicList();
TimeUnit.HOURS.sleep(3);
TimeUnit.MINUTES.sleep(3);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*60*1000);
......@@ -36,28 +31,29 @@ public class WeiboTopicRun extends Thread{
private void getTopicList() {
WeiboTopicDAO weiboTopicDAO = new WeiboTopicDAO();
logger.info("微博超话采集开始........");
List<WeiboTopic> list = WeiboHuatiCrawler.startCrawler();
logger.info("{}, 微博超话此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
HotSearchListDAO weiboHotSearchDAO = new HotSearchListDAO();
log.info("微博话题采集开始........");
List<HotSearchList> list = WeiboTopicCrawler.startCrawlerByPhone();
log.info("{}, 微博话题此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
for(WeiboTopic topic : list){
logger.info("topic::::{}", topic);
for(HotSearchList topic : list){
log.info("topic::::{}", topic);
DBObject doc = new BasicDBObject();
doc.put("_id", topic.getId());
doc.put("name", topic.getTopicName());
doc.put("rank", topic.getRank());
doc.put("score_num", topic.getScore());
doc.put("fensi_num", topic.getFensi());
doc.put("post_num", topic.getPostNum());
doc.put("type", topic.getType());
doc.put("name", topic.getName());
doc.put("url", topic.getUrl());
doc.put("count", topic.getCount());
doc.put("hot", topic.getHot());
doc.put("day", topic.getDay());
doc.put("time", topic.getTime());
doc.put("url", topic.getUrl());
doc.put("rank", topic.getRank());
doc.put("type", topic.getType());
doc.put("topic_lead", topic.getTopicLead());
doc.put("comment_count", topic.getCommentCount());
data.add(doc);
}
weiboTopicDAO.addTopicList(data);
logger.info("微博话题采集结束........");
weiboHotSearchDAO.addHotSearchList(data);
log.info("微博话题采集结束........");
}
}
......@@ -4,6 +4,7 @@ import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import lombok.extern.log4j.Log4j2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -14,10 +15,9 @@ import com.zhiwei.searchhotcrawler.crawler.ZhihuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class ZhihuHotSearchRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(ZhihuHotSearchRun.class);
@Override
public void run() {
boolean f = true;
......@@ -34,22 +34,20 @@ public class ZhihuHotSearchRun extends Thread{
}
private void getHotList() {
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
logger.info("知乎话题采集开始...,当前线程名字:{}", Thread.currentThread().getName());
log.info("知乎话题采集开始...,当前线程名字:{}", Thread.currentThread().getName());
List<HotSearchList> list = ZhihuHotSearchCrawler.getZhihuHotList();
List<HotSearchList> mobilelist = ZhihuHotSearchCrawler.getMobileZhihuHotList();
list.addAll(mobilelist);
logger.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
log.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
for(HotSearchList zhihuHotSearch : list){
DBObject zhihu = new BasicDBObject();
zhihu.put("_id", zhihuHotSearch.getId());
zhihu.put("name", zhihuHotSearch.getName());
zhihu.put("url", zhihuHotSearch.getUrl());
zhihu.put("count", zhihuHotSearch.getCount());
zhihu.put("hot", zhihuHotSearch.isHot());
zhihu.put("hot", zhihuHotSearch.getHot());
zhihu.put("day", zhihuHotSearch.getDay());
zhihu.put("time", zhihuHotSearch.getTime());
zhihu.put("changeCount", 0);
......@@ -57,7 +55,7 @@ public class ZhihuHotSearchRun extends Thread{
zhihu.put("type", zhihuHotSearch.getType());
hotSearchDAO.addHotSearch(zhihu);
}
logger.info("知乎话题采集结束........");
log.info("知乎话题采集结束........");
}
}
......@@ -3,8 +3,8 @@ mongoIp=192.168.0.101
mongoPort=30000
#mongoIp=192.168.0.81
#mongoPort=27017
db.username=datapush
db.paasword=4d8ce5c42073c
db.username=searchhotcrawleruser
db.paasword=searchhotcrawler1q2w3e4r
db.certifiedDB=admin
dbName=hot_search_list
searchCollName=hot_search_list
......
registry=zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182
group=hangzhou
########################################################
#registry=zookeeper://192.168.0.36:2181
#registry=zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181
#group=local
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment