Commit 4fce8f43 by zhiwei

添加明星超话榜单采集

parent 21d9c6dc
package com.zhiwei.searchhotcrawler.bean;
import java.util.Date;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * One row of a Weibo topic ranking list.
 *
 * <p>Objects created through the six-argument constructor are stamped with the
 * creation time, a {@code yyyy-MM-dd} day string derived from that time, and an
 * id of the form {@code topicName + "_" + day}. Objects created through the
 * no-argument constructor leave all of these unset.
 *
 * @ClassName: WeiboTopic
 * @Description: Weibo topic
 * @author Bewilder ZW
 * @date 2019-09-27 15:33:08
 */
public class WeiboTopic {

    // Primary key, built as topicName + "_" + day.
    // NOTE(review): the ranking type is not part of the id, so the same topic
    // name captured under two ranking types on one day yields the same id —
    // confirm this is intended.
    private String id;
    public String url;        // topic link
    public String topicName;  // topic name
    public Integer rank;      // position in the ranking
    public String score;      // topic influence score
    public String fensi;      // topic follower count
    public String readNum;    // topic read count
    public String postNum;    // topic post count
    public String type;       // ranking list type
    private String day;       // capture day, formatted yyyy-MM-dd
    private Date time;        // capture time

    /** Creates an empty topic; every field is left at its default value. */
    public WeiboTopic() {}

    /**
     * Creates a topic and stamps it with the current time.
     *
     * @param url       topic link
     * @param topicName topic name; also used as the first part of the id
     * @param rank      position in the ranking
     * @param score     topic influence score
     * @param fensi     topic follower count
     * @param type      ranking list type
     */
    public WeiboTopic(String url, String topicName, Integer rank, String score,
            String fensi, String type) {
        this.url = url;
        this.topicName = topicName;
        this.rank = rank;
        this.score = score;
        this.fensi = fensi;
        this.type = type;
        // Take a single timestamp so that time, day and id always describe the
        // same instant, even when the object is created right at midnight.
        Date now = new Date();
        this.time = now;
        this.day = TimeParse.dateFormartString(now, "yyyy-MM-dd");
        this.id = topicName + "_" + day;
    }

    /** Returns a one-line, human-readable dump of the ranking fields. */
    @Override
    public String toString() {
        return "new WeiboTopic["
                + "topicName = " + topicName
                + ", rank = " + rank
                + ", score = " + score
                + ", fensi = " + fensi
                + ", type = " + type
                + ", readNum = " + readNum
                + ", postNum = " + postNum
                + ", url = " + url
                + "]";
    }

    public String getUrl() {
        return url;
    }

    public String getTopicName() {
        return topicName;
    }

    public Integer getRank() {
        return rank;
    }

    public String getScore() {
        return score;
    }

    public String getFensi() {
        return fensi;
    }

    public String getReadNum() {
        return readNum;
    }

    public String getPostNum() {
        return postNum;
    }

    public String getType() {
        return type;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public void setTopicName(String topicName) {
        this.topicName = topicName;
    }

    public void setRank(Integer rank) {
        this.rank = rank;
    }

    public void setScore(String score) {
        this.score = score;
    }

    public void setFensi(String fensi) {
        this.fensi = fensi;
    }

    public void setReadNum(String readNum) {
        this.readNum = readNum;
    }

    public void setPostNum(String postNum) {
        this.postNum = postNum;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getId() {
        return id;
    }

    public String getDay() {
        return day;
    }

    public Date getTime() {
        return time;
    }

    public void setId(String id) {
        this.id = id;
    }

    public void setDay(String day) {
        this.day = day;
    }

    public void setTime(Date time) {
        this.time = time;
    }
}
...@@ -18,7 +18,8 @@ public class Config { ...@@ -18,7 +18,8 @@ public class Config {
userPwd = conf.getProperty("db.paasword"); userPwd = conf.getProperty("db.paasword");
authDB = conf.getProperty("db.certifiedDB"); authDB = conf.getProperty("db.certifiedDB");
dbName = conf.getProperty("dbName"); dbName = conf.getProperty("dbName");
collName = conf.getProperty("collName"); searchCollName = conf.getProperty("searchCollName");
topicCollName = conf.getProperty("topicCollName");
collWechatUserName = conf.getProperty("collWechatUserName"); collWechatUserName = conf.getProperty("collWechatUserName");
} catch (Exception e) { } catch (Exception e) {
...@@ -33,6 +34,7 @@ public class Config { ...@@ -33,6 +34,7 @@ public class Config {
public static String userPwd; public static String userPwd;
public static String authDB; public static String authDB;
public static String dbName; public static String dbName;
public static String collName; public static String searchCollName;
public static String topicCollName;
public static String collWechatUserName; public static String collWechatUserName;
} }
package com.zhiwei.searchhotcrawler.crawler;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.WeiboTopic;
/**
 * Crawler for the Weibo celebrity topic ranking lists.
 *
 * <p>Three ranking lists are fetched page by page; each entry is then enriched
 * with the read and post counts taken from the topic's own detail endpoint.
 *
 * @ClassName: WeiboHuatiCrawler
 * @Description: Weibo topic ranking crawler (celebrities)
 * @author Bewilder ZW
 * @date 2019-09-27 15:01:34
 */
public class WeiboHuatiCrawler {

    private static final Logger logger = LoggerFactory.getLogger(WeiboHuatiCrawler.class);

    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /** Number of ranking pages requested per list. */
    private static final int MAX_PAGE = 5;

    /** Number of attempts made to load one topic's detail data. */
    private static final int DETAIL_ATTEMPTS = 3;

    /** Request headers sent with every ranking-list request. */
    private static final Map<String, String> headMap = new HashMap<>();

    static {
        headMap.put("X-Requested-With", "XMLHttpRequest");
        headMap.put("Referer", "https://huati.weibo.cn/discovery/super?extparam=ctg1_2%7Cscorll_1&luicode=10000011&lfid=100803_-_super&sourceType=weixin");
        headMap.put("Host", "huati.weibo.cn");
    }

    /**
     * Crawls every configured celebrity ranking list.
     *
     * <p>A failure on one page is logged and does not stop the remaining pages
     * or lists from being fetched.
     *
     * @return all topics that could be parsed; never {@code null}, possibly empty
     */
    public static List<WeiboTopic> startCrawler() {
        // Ranking type label -> list URL (the page number is appended per request).
        Map<String, String> urlMap = new HashMap<>();
        urlMap.put("明星", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=star&from=&wm=");
        urlMap.put("明星潜力", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=potential&from=&wm=");
        urlMap.put("明星上升", "https://huati.weibo.cn/aj/discovery/rank?cate_id=2&topic_to_page=&block_time=0&star_type=up&from=&wm=");

        List<WeiboTopic> topicList = new ArrayList<>();
        for (Entry<String, String> entry : urlMap.entrySet()) {
            String type = entry.getKey();
            String url = entry.getValue();
            for (int page = 1; page <= MAX_PAGE; page++) {
                String pageUrl = url + "&page=" + page;
                try {
                    String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
                    // Only bodies that mention "desc1" are treated as ranking data.
                    if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc1")) {
                        topicList.addAll(parseTopicRankHtml(htmlBody, type));
                    }
                } catch (Exception e) {
                    // The exception is passed as the trailing argument so SLF4J
                    // prints its stack trace; the placeholder carries the URL.
                    logger.error("获取榜单列表页面时出现错误,页面地址:{}", pageUrl, e);
                }
            }
        }
        return topicList;
    }

    /**
     * Parses one ranking-list response body.
     *
     * <p>Entries are parsed independently: a malformed entry is logged and
     * skipped so that the other entries of the page are still returned.
     *
     * @param htmlBody response body, expected to be JSON with a {@code data.list} array
     * @param type     ranking type label to store on every parsed topic
     * @return the parsed topics; an empty list when nothing could be parsed
     */
    private static List<WeiboTopic> parseTopicRankHtml(String htmlBody, String type) {
        JSONArray list;
        try {
            JSONObject root = JSONObject.parseObject(htmlBody);
            JSONObject data = Objects.nonNull(root) ? root.getJSONObject("data") : null;
            list = Objects.nonNull(data) ? data.getJSONArray("list") : null;
        } catch (Exception e) {
            logger.error("解析榜单列表页面时出现错误,榜单类型:{}", type, e);
            return Collections.emptyList();
        }
        if (Objects.isNull(list) || list.isEmpty()) {
            return Collections.emptyList();
        }

        List<WeiboTopic> topicList = new ArrayList<>();
        for (int i = 0; i < list.size(); i++) {
            try {
                JSONObject data = list.getJSONObject(i);
                if (Objects.isNull(data)) {
                    continue;
                }
                Integer toprank = data.getInteger("toprank");
                String topicName = data.getString("display_name");
                String id = data.getString("page_id");
                String score = data.getString("score");
                String desc1 = data.getString("desc1");
                // Keep only the text in front of "影响力"; an entry without
                // desc1 simply has no follower figure.
                String fensi = Objects.nonNull(desc1) ? desc1.replaceAll("影响力.*", "") : null;
                String url = data.getString("link");
                WeiboTopic topic = new WeiboTopic(url, topicName, toprank, score, fensi, type);
                topicList.add(getTopicInfo(id, topic));
            } catch (Exception e) {
                logger.error("解析榜单列表页面时出现错误,榜单类型:{}", type, e);
            }
        }
        return topicList;
    }

    /**
     * Loads the read count and post count of a single topic and stores them on
     * {@code topic}.
     *
     * <p>The detail endpoint is tried up to {@value #DETAIL_ATTEMPTS} times. When
     * no attempt yields usable data the topic is returned unchanged.
     *
     * @param id    page id of the topic, used as the {@code containerid} parameter
     * @param topic topic to enrich
     * @return the same {@code topic} instance
     */
    private static WeiboTopic getTopicInfo(String id, WeiboTopic topic) {
        if (StringUtils.isBlank(id)) {
            // Without a page id there is nothing to look up.
            return topic;
        }
        String url = "https://m.weibo.cn/api/container/getIndex?containerid=" + id;
        for (int attempt = 1; attempt <= DETAIL_ATTEMPTS; attempt++) {
            try {
                String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
                if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc_more")) {
                    String descMore = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("pageInfo").getJSONArray("desc_more").getString(0);
                    if (StringUtils.isNotBlank(descMore)) {
                        // readNum: drop the "阅读" label and everything from "帖子" on.
                        String readNum = descMore.replaceAll("阅读|帖子.*", "").trim();
                        // postNum: drop everything up to "帖子" and from "粉丝" on.
                        String postNum = descMore.replaceAll(".*帖子|粉丝.*", "").trim();
                        topic.setPostNum(postNum);
                        topic.setReadNum(readNum);
                        return topic;
                    }
                }
            } catch (Exception e) {
                logger.error("解析榜单详情页面时出现错误,页面地址:{}", url, e);
            }
        }
        return topic;
    }
}
...@@ -14,22 +14,19 @@ import com.mongodb.DBObject; ...@@ -14,22 +14,19 @@ import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.cache.CacheManager; import com.zhiwei.searchhotcrawler.cache.CacheManager;
import com.zhiwei.searchhotcrawler.config.Config; import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate; import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
public class HotSearchListDAO extends MongoDBTemplate{ public class HotSearchListDAO extends MongoDBTemplate{
private static Logger logger = LoggerFactory.getLogger(BaiDuHotSearchCrawler.class); private static Logger logger = LoggerFactory.getLogger(HotSearchListDAO.class);
@SuppressWarnings("unused")
public HotSearchListDAO() { public HotSearchListDAO() {
super(); super();
super.setDbName(Config.dbName); super.setDbName(Config.dbName);
String time = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd"); String time = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
String year = time.substring(0,4); String year = time.substring(0,4);
String month = time.substring(5,7); String month = time.substring(5,7);
String collName = Config.collName + year + "_" + month; String collName = Config.searchCollName + year + "_" + month;
super.setCollName(collName); super.setCollName(collName);
} }
......
package com.zhiwei.searchhotcrawler.dao;
import java.util.Date;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * Data access object that stores Weibo topic ranking documents in a
 * per-month collection named {@code Config.topicCollName + yyyy + "_" + MM}.
 */
public class WeiboTopicDAO extends MongoDBTemplate {

    private static final Logger logger = LoggerFactory.getLogger(WeiboTopicDAO.class);

    /** Creates a DAO bound to the configured database and the current month's collection. */
    public WeiboTopicDAO() {
        super();
        super.setDbName(Config.dbName);
        super.setCollName(currentCollName());
    }

    /** Builds the collection name for the current month. */
    private static String currentCollName() {
        String time = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
        String year = time.substring(0, 4);
        String month = time.substring(5, 7);
        return Config.topicCollName + year + "_" + month;
    }

    /**
     * Re-targets this DAO at the current month's collection, so that a
     * long-lived instance does not keep writing to the month it was created in.
     * NOTE(review): assumes setCollName only records the name that
     * getReadColl() resolves afterwards — confirm in MongoDBTemplate.
     */
    private void refreshCollName() {
        setCollName(currentCollName());
    }

    /**
     * Inserts a batch of documents.
     *
     * <p>A {@code null} or empty list is ignored. Storage errors are logged and
     * not propagated to the caller.
     *
     * @param list documents to insert
     */
    public void addTopicList(List<DBObject> list) {
        if (list == null || list.isEmpty()) {
            // Nothing to store; skip the driver call instead of logging a failure.
            return;
        }
        try {
            refreshCollName();
            this.getReadColl().insert(list);
        } catch (Exception e) {
            logger.error("存储数据时出错", e);
        }
    }

    /**
     * Inserts a single document.
     *
     * <p>A {@code null} document is ignored. Storage errors are logged and not
     * propagated to the caller.
     *
     * @param doc document to insert
     */
    public void addTopic(DBObject doc) {
        if (doc == null) {
            return;
        }
        try {
            refreshCollName();
            this.getReadColl().insert(doc);
        } catch (Exception e) {
            logger.error("存储数据时出错", e);
        }
    }
}
...@@ -11,25 +11,13 @@ import com.zhiwei.searchhotcrawler.timer.SendZhihuHotSearchRun; ...@@ -11,25 +11,13 @@ import com.zhiwei.searchhotcrawler.timer.SendZhihuHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.SougoHotSearchRun; import com.zhiwei.searchhotcrawler.timer.SougoHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.UpdateWechatUserRun; import com.zhiwei.searchhotcrawler.timer.UpdateWechatUserRun;
import com.zhiwei.searchhotcrawler.timer.WeiboHotSearchRun; import com.zhiwei.searchhotcrawler.timer.WeiboHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.WeiboTopicRun;
import com.zhiwei.searchhotcrawler.timer.ZhihuHotSearchRun; import com.zhiwei.searchhotcrawler.timer.ZhihuHotSearchRun;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
public class HotSearchRun { public class HotSearchRun {
// private ScheduledExecutorService scheduExec;
//
// public HotSearchRun() {
// this.scheduExec = Executors.newScheduledThreadPool(5);
// }
// public void showTimer() {
// scheduExec.scheduleAtFixedRate(new WeiboHotSearchRun(), 0, 1, TimeUnit.MINUTES);
// scheduExec.scheduleAtFixedRate(new ZhihuHotSearchRun(), 0, 10 , TimeUnit.MINUTES);
// scheduExec.scheduleAtFixedRate(new BaiduHotSearchRun(), 0, 5 , TimeUnit.MINUTES);
// scheduExec.scheduleAtFixedRate(new SougoHotSearchRun(), 0, 5 , TimeUnit.MINUTES);
// scheduExec.scheduleAtFixedRate(new DouyinHotSearchRun(), 0, 10 , TimeUnit.MINUTES);
// }
public static void main(String[] args) { public static void main(String[] args) {
ProxyFactory.init(ProxyConfig.registry, ProxyConfig.group, GroupType.PROVIDER); ProxyFactory.init(ProxyConfig.registry, ProxyConfig.group, GroupType.PROVIDER);
...@@ -43,6 +31,7 @@ public class HotSearchRun { ...@@ -43,6 +31,7 @@ public class HotSearchRun {
new SougoHotSearchRun().start(); new SougoHotSearchRun().start();
new DouyinHotSearchRun().start(); new DouyinHotSearchRun().start();
new ZhihuHotSearchRun().start(); new ZhihuHotSearchRun().start();
new WeiboTopicRun().start();
//推送程序启动 //推送程序启动
new SendWeiboHotSearchRun().start(); new SendWeiboHotSearchRun().start();
new SendZhihuHotSearchRun().start(); new SendZhihuHotSearchRun().start();
......
...@@ -12,7 +12,6 @@ import org.slf4j.LoggerFactory; ...@@ -12,7 +12,6 @@ import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject; import com.mongodb.BasicDBObject;
import com.mongodb.DBObject; import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler; import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
......
package com.zhiwei.searchhotcrawler.timer;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.WeiboTopic;
import com.zhiwei.searchhotcrawler.crawler.WeiboHuatiCrawler;
import com.zhiwei.searchhotcrawler.dao.WeiboTopicDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Background thread that crawls the Weibo topic rankings and stores the result,
 * then waits one day before the next round.
 */
public class WeiboTopicRun extends Thread {

    private static final Logger logger = LoggerFactory.getLogger(WeiboTopicRun.class);

    private WeiboTopicDAO weiboTopicDAO = new WeiboTopicDAO();

    /**
     * Runs crawl rounds until the thread is interrupted.
     *
     * <p>A failed round is logged and followed by the short pause below rather
     * than the one-day wait, so it is retried soon.
     */
    @Override
    public void run() {
        while (!Thread.currentThread().isInterrupted()) {
            try {
                getTopicList();
                TimeUnit.DAYS.sleep(1);
            } catch (InterruptedException e) {
                // Restore the flag and stop: an interrupt is a request to shut down,
                // not a reason to start another crawl round immediately.
                Thread.currentThread().interrupt();
                logger.warn("微博超话采集线程被中断,停止采集");
                return;
            } catch (Exception e) {
                logger.error("微博超话采集出现错误", e);
            }
            ZhiWeiTools.sleep(50);
        }
    }

    /** Crawls one round of rankings, converts the topics to documents and stores them. */
    private void getTopicList() {
        logger.info("微博超话采集开始........");
        List<WeiboTopic> list = WeiboHuatiCrawler.startCrawler();
        int size = list != null ? list.size() : 0;
        logger.info("{}, 微博此轮采集到的数据量为:{}", new Date(), size);

        List<DBObject> data = new ArrayList<>();
        if (list != null) {
            for (WeiboTopic topic : list) {
                DBObject doc = new BasicDBObject();
                doc.put("_id", topic.getId());
                doc.put("name", topic.getTopicName());
                doc.put("rank", topic.getRank());
                doc.put("score_num", topic.getScore());
                doc.put("fensi_num", topic.getFensi());
                // The read count is fetched by the crawler together with the
                // post count, so it is persisted alongside it.
                doc.put("read_num", topic.getReadNum());
                doc.put("post_num", topic.getPostNum());
                doc.put("type", topic.getType());
                doc.put("day", topic.getDay());
                doc.put("time", topic.getTime());
                doc.put("url", topic.getUrl());
                data.add(doc);
            }
        }
        if (!data.isEmpty()) {
            weiboTopicDAO.addTopicList(data);
        }
        logger.info("微博话题采集结束........");
    }
}
...@@ -10,7 +10,6 @@ import org.slf4j.LoggerFactory; ...@@ -10,7 +10,6 @@ import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject; import com.mongodb.BasicDBObject;
import com.mongodb.DBObject; import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.crawler.ZhihuHotSearchCrawler; import com.zhiwei.searchhotcrawler.crawler.ZhihuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
......
...@@ -7,5 +7,6 @@ db.username=datapush ...@@ -7,5 +7,6 @@ db.username=datapush
db.paasword=4d8ce5c42073c db.paasword=4d8ce5c42073c
db.certifiedDB=admin db.certifiedDB=admin
dbName=hot_search_list dbName=hot_search_list
collName=hot_search_list searchCollName=hot_search_list
topicCollName=topic_list
collWechatUserName=wechat_user collWechatUserName=wechat_user
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment