Commit c21d66b0 by win 10

提交百度 搜狗 抖音 新增热搜采集

parent a65b651d
package com.zhiwei.searchhotcrawler.bean;
import java.io.Serializable;
import java.util.Date;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * One entry of the Baidu hot-search ranking, captured at a single instant.
 *
 * <p>The four-argument constructor stamps the entry with its creation time;
 * the no-argument constructor leaves every field at its Java default.
 */
public class BaiDuHotSearch implements Serializable {
    private static final long serialVersionUID = 2076919584659821600L;

    private String id;          // primary key: kw + "_" + creation time in epoch millis
    private String url;         // main link (not populated by either constructor)
    private String everurl;     // related link
    private String kw;          // keyword
    private int count;          // search index
    private String day;         // creation day, formatted with pattern yyyy-MM-dd
    private Date time;          // creation time
    private int changeCount;    // change of the search index versus the previous capture
    private Integer rank;       // position in the ranking

    /** Creates an empty entry; all fields keep their default values. */
    public BaiDuHotSearch() {}

    /**
     * Creates an entry stamped with the current time.
     *
     * <p>A single timestamp is taken once and reused for {@code id}, {@code time}
     * and {@code day}, so the three values always describe the same instant.
     *
     * @param rank    position in the ranking
     * @param kw      keyword
     * @param everurl related link
     * @param count   search index
     */
    public BaiDuHotSearch(Integer rank, String kw, String everurl, int count) {
        Date now = new Date();
        this.id = kw + "_" + now.getTime();
        this.rank = rank;
        this.kw = kw;
        this.count = count;
        this.everurl = everurl;
        this.time = now;
        this.day = TimeParse.dateFormartString(now, "yyyy-MM-dd");
    }

    @Override
    public String toString() {
        return "new BaiDuHotSearch["
                + "id = " + id
                + ", url = " + url
                + ", everurl = " + everurl
                + ", kw = " + kw
                + ", count = " + count
                + ", day = " + day
                + ", time = " + time
                + ", rank = " + rank
                + ", changeCount = " + changeCount
                + "]";
    }

    public String getDay() {
        return day;
    }

    public void setDay(String day) {
        this.day = day;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getEverurl() {
        return everurl;
    }

    public void setEverurl(String everurl) {
        this.everurl = everurl;
    }

    public String getKw() {
        return kw;
    }

    public void setKw(String kw) {
        this.kw = kw;
    }

    public int getCount() {
        return count;
    }

    public void setCount(int count) {
        this.count = count;
    }

    public Date getTime() {
        return time;
    }

    public void setTime(Date time) {
        this.time = time;
    }

    public int getChangeCount() {
        return changeCount;
    }

    public void setChangeCount(int changeCount) {
        this.changeCount = changeCount;
    }

    /**
     * Returns the rank as a primitive.
     *
     * @throws NullPointerException if the rank has not been set, because the
     *         nullable {@code Integer} field is unboxed here
     */
    public int getRank() {
        return rank;
    }

    public void setRank(Integer rank) {
        this.rank = rank;
    }
}
package com.zhiwei.searchhotcrawler.bean;
import java.io.Serializable;
import java.util.Date;
/**
 * One entry of the Douyin hot-search ranking, captured at a single instant.
 *
 * <p>The three-argument constructor stamps the entry with its creation time;
 * the no-argument constructor leaves every field at its Java default.
 */
public class DouyinHotSearch implements Serializable {
    private static final long serialVersionUID = -7707110236217797510L;

    private String id;          // primary key: word + "_" + creation time in epoch millis
    private Integer position;   // position in the ranking
    private String word;        // hot-search keyword
    private int hot_value;      // heat value
    private Date time;          // creation time
    private int changeCount;    // change of the heat value versus the previous capture

    /** Creates an empty entry; all fields keep their default values. */
    public DouyinHotSearch() {}

    /**
     * Creates an entry stamped with the current time.
     *
     * <p>The timestamp is taken once and reused for both {@code id} and
     * {@code time}, so the millisecond value embedded in the id always equals
     * {@code getTime().getTime()}.
     *
     * @param position  position in the ranking
     * @param word      hot-search keyword
     * @param hot_value heat value; must not be {@code null} because it is
     *                  unboxed into a primitive field
     * @throws NullPointerException if {@code hot_value} is {@code null}
     */
    public DouyinHotSearch(Integer position, String word, Integer hot_value) {
        Date now = new Date();
        this.id = word + "_" + now.getTime();
        this.position = position;
        this.word = word;
        this.hot_value = hot_value;
        this.time = now;
    }

    @Override
    public String toString() {
        return "new DouyinHotSearch["
                + "id = " + id
                + ", position = " + position
                + ", word = " + word
                + ", hot_value = " + hot_value
                + ", time = " + time
                + ", changeCount = " + changeCount
                + "]";
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public Integer getPosition() {
        return position;
    }

    public void setPosition(Integer position) {
        this.position = position;
    }

    public String getWord() {
        return word;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public int getHot_value() {
        return hot_value;
    }

    public void setHot_value(int hot_value) {
        this.hot_value = hot_value;
    }

    public Date getTime() {
        return time;
    }

    public void setTime(Date time) {
        this.time = time;
    }

    public int getChangeCount() {
        return changeCount;
    }

    public void setChangeCount(int changeCount) {
        this.changeCount = changeCount;
    }
}
......@@ -20,6 +20,9 @@ public class Config {
dbName = conf.getProperty("dbName");
collWeiboName = conf.getProperty("collWeiboName");
collZhihuName = conf.getProperty("collZhihuName");
collBaiduName = conf.getProperty("collBaiduName");
collSougoName = conf.getProperty("collSougoName");
collDouyinName = conf.getProperty("collDouyinName");
collWechatUserName = conf.getProperty("collWechatUserName");
} catch (Exception e) {
......@@ -35,7 +38,9 @@ public class Config {
// Shared, mutable configuration values. The visible initialiser fills the
// db/collection names from same-named properties via conf.getProperty(...).
public static String authDB; // NOTE(review): where this is loaded is not visible here - presumably the authentication database; confirm
public static String dbName; // value of property "dbName"
public static String collWeiboName; // value of property "collWeiboName"
public static String collBaiduName; // value of property "collBaiduName"
public static String collZhihuName; // value of property "collZhihuName"
public static String collWechatUserName; // value of property "collWechatUserName"
public static String collSougoName; // value of property "collSougoName"
public static String collDouyinName; // value of property "collDouyinName"
}
package com.zhiwei.searchhotcrawler.crawler;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.DouyinHotSearch;
/**
* @className DouyinHotSearchCrawler
* @Description:抖音热搜榜采集程序
* @author win 10
* @date:2019年07月11日 上午10:26:21
*/
public class DouyinHotSearchCrawler {
    // Logger must be bound to this class so log lines are attributed correctly.
    private static Logger logger = LoggerFactory.getLogger(DouyinHotSearchCrawler.class);
    private static HttpBoot httpBoot = new HttpBoot();

    /** Endpoint that returns the hot-search list as JSON. */
    private static final String HOT_SEARCH_URL = "https://api.amemv.com/aweme/v1/hot/search/list/";

    /** Number of times the request is attempted before giving up. */
    private static final int MAX_ATTEMPTS = 3;

    /**
     * Fetches the Douyin hot-search ranking.
     *
     * <p>The request is attempted up to {@link #MAX_ATTEMPTS} times. An attempt
     * fails when the call throws, when the body does not contain
     * {@code word_list}, or when the payload cannot be parsed. Parsing errors are
     * caught here on purpose: this method is invoked from a periodically scheduled
     * task, and an escaping runtime exception would stop all later executions.
     *
     * @return the parsed entries in response order (possibly empty), or
     *         {@code null} if every attempt failed
     */
    public static List<DouyinHotSearch> getMobileDouyinHotList() {
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(HOT_SEARCH_URL)).body().string();
                if (htmlBody != null && htmlBody.contains("word_list")) {
                    // Only a completely parsed list is returned; a failure
                    // midway discards the partial result and retries.
                    return parseWordList(htmlBody);
                }
                logger.warn("抖音热搜榜响应中未包含word_list, 第{}次尝试", Integer.valueOf(attempt));
            } catch (IOException e) {
                logger.warn("获取抖音热搜榜时出现问题", e);
            } catch (RuntimeException e) {
                // Malformed JSON, missing nodes or non-numeric fields.
                logger.warn("解析抖音热搜榜时出现问题", e);
            }
        }
        return null;
    }

    /**
     * Converts the response body into hot-search entries.
     *
     * @param body JSON text expected to hold {@code data.word_list}, an array of
     *             objects with {@code position}, {@code word} and {@code hot_value}
     * @return one entry per array element, in array order
     * @throws RuntimeException if the JSON is malformed, a node is missing, or a
     *         numeric field cannot be parsed as an integer
     */
    private static List<DouyinHotSearch> parseWordList(String body) {
        JSONArray wordList = JSONObject.parseObject(body).getJSONObject("data").getJSONArray("word_list");
        List<DouyinHotSearch> result = new ArrayList<DouyinHotSearch>(wordList.size());
        for (int i = 0; i < wordList.size(); i++) {
            JSONObject entry = wordList.getJSONObject(i);
            Integer position = Integer.valueOf(entry.getString("position"));
            String word = entry.getString("word");
            Integer hotValue = Integer.valueOf(entry.getString("hot_value"));
            logger.info("热度为:::{}", hotValue);
            result.add(new DouyinHotSearch(position, word, hotValue));
        }
        return result;
    }
}
package com.zhiwei.searchhotcrawler.dao;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.BaiDuHotSearch;
import com.zhiwei.searchhotcrawler.cache.CacheManager;
import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Data-access object for the Baidu hot-search collection.
 */
public class BaiduHotSearchDAO extends MongoDBTemplate {

    /** Binds this DAO to the database and collection named in {@link Config}. */
    public BaiduHotSearchDAO() {
        super();
        super.setDbName(Config.dbName);
        super.setCollName(Config.collBaiduName);
    }

    /**
     * Inserts the given documents, attempting the insert up to three times.
     *
     * <p>A {@code null} or empty list is ignored, so the driver is never asked to
     * perform an empty bulk insert. Failures are printed and retried; this
     * method never throws.
     *
     * @param list documents to insert
     */
    public void addBaiduSearch(List<DBObject> list) {
        if (list == null || list.isEmpty()) {
            return;
        }
        for (int i = 0; i < 3; i++) {
            try {
                // NOTE(review): the insert goes through getReadColl(); confirm
                // whether MongoDBTemplate offers a dedicated write collection.
                this.getReadColl().insert(list);
                ZhiWeiTools.sleep(200);
                break;
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Computes how much the search index changed since the latest stored record
     * with the same keyword.
     *
     * @param baiduHotSearch freshly crawled entry
     * @return the entry's count minus the stored count of the most recent
     *         document with the same {@code kw} (sorted by {@code time}
     *         descending); {@code 0} if no such document exists or the lookup fails
     */
    public int getChangeCount(BaiDuHotSearch baiduHotSearch) {
        int result = 0;
        DBObject query = new BasicDBObject();
        query.put("kw", baiduHotSearch.getKw());
        DBObject sort = new BasicDBObject();
        sort.put("time", -1);
        DBCursor cur = null;
        try {
            cur = this.getReadColl().find(query).sort(sort).limit(1);
            if (cur.hasNext()) {
                DBObject doc = cur.next();
                result = baiduHotSearch.getCount() - Integer.parseInt(doc.get("count").toString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close on every path so a failed query does not leak the cursor.
            closeQuietly(cur);
        }
        return result;
    }

    /** Closes the cursor if present, printing rather than propagating failures. */
    private static void closeQuietly(DBCursor cursor) {
        if (cursor == null) {
            return;
        }
        try {
            cursor.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
......@@ -29,11 +29,13 @@ public class MongoDBTemplate {
ServerAddress address = new ServerAddress(Config.mongoIp, Config.mongoPort);
if(reader==null)
{
reader = new MongoClient(address, Arrays.asList(credential));
// reader = new MongoClient(address, Arrays.asList(credential));
reader = new MongoClient(address);
}
if(writer==null)
{
writer = new MongoClient(address, Arrays.asList(credential));
// writer = new MongoClient(address, Arrays.asList(credential));
writer = new MongoClient(address);
}
} catch (MongoException e) {
e.printStackTrace();
......
......@@ -5,8 +5,11 @@ import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import com.zhiwei.searchhotcrawler.cache.CacheListener;
import com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.DouyinHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.SendWeiboHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.SendZhihuHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.SougoHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.UpdateWechatUserRun;
import com.zhiwei.searchhotcrawler.timer.WeiboHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.ZhihuHotSearchRun;
......@@ -24,6 +27,9 @@ public class HotSearchRun {
/**
 * Registers the periodic crawl tasks on {@code scheduExec}.
 *
 * <p>Every task starts without delay. Weibo, Zhihu and Sogou repeat every
 * minute, Baidu every 5 minutes and Douyin every 10 minutes.
 */
public void showTimer() {
    final long startImmediately = 0L;
    final TimeUnit minutes = TimeUnit.MINUTES;

    scheduExec.scheduleAtFixedRate(new WeiboHotSearchRun(), startImmediately, 1L, minutes);
    scheduExec.scheduleAtFixedRate(new ZhihuHotSearchRun(), startImmediately, 1L, minutes);
    scheduExec.scheduleAtFixedRate(new BaiduHotSearchRun(), startImmediately, 5L, minutes);
    scheduExec.scheduleAtFixedRate(new SougoHotSearchRun(), startImmediately, 1L, minutes);
    scheduExec.scheduleAtFixedRate(new DouyinHotSearchRun(), startImmediately, 10L, minutes);
}
public static void main(String[] args) {
......
package com.zhiwei.searchhotcrawler.timer;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.BaiDuHotSearch;
import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.BaiduHotSearchDAO;
/**
 * Periodic task that crawls the Baidu hot-search ranking and stores it.
 */
public class BaiduHotSearchRun extends Thread {
    private static Logger logger = LoggerFactory.getLogger(BaiduHotSearchRun.class);
    private BaiduHotSearchDAO baiduHotSearchDAO = new BaiduHotSearchDAO();

    /**
     * Crawls once, converts every entry to a document and inserts the batch.
     *
     * <p>A {@code null} or empty crawl result is skipped. Runtime failures are
     * logged instead of propagated, because an exception escaping a task
     * scheduled at a fixed rate suppresses all of its later executions.
     */
    @Override
    public void run() {
        logger.info("百度风云榜采集开始........");
        try {
            List<BaiDuHotSearch> list = BaiDuHotSearchCrawler.baiduHotSearch();
            logger.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
            if (list != null && !list.isEmpty()) {
                List<DBObject> data = new ArrayList<DBObject>(list.size());
                for (BaiDuHotSearch baiduHotSearch : list) {
                    data.add(toDocument(baiduHotSearch));
                }
                baiduHotSearchDAO.addBaiduSearch(data);
            }
        } catch (RuntimeException e) {
            logger.error("百度风云榜采集失败", e);
        }
        logger.info("百度风云榜采集结束........");
    }

    /**
     * Builds the document stored for one entry, including the change of its
     * count relative to the latest stored record as reported by the DAO.
     */
    private DBObject toDocument(BaiDuHotSearch baiduHotSearch) {
        int changeCount = baiduHotSearchDAO.getChangeCount(baiduHotSearch);
        DBObject doc = new BasicDBObject();
        doc.put("_id", baiduHotSearch.getId());
        doc.put("kw", baiduHotSearch.getKw());
        doc.put("everurl", baiduHotSearch.getEverurl());
        doc.put("count", baiduHotSearch.getCount());
        doc.put("day", baiduHotSearch.getDay());
        doc.put("time", baiduHotSearch.getTime());
        doc.put("changeCount", changeCount);
        doc.put("rank", baiduHotSearch.getRank());
        return doc;
    }
}
\ No newline at end of file
......@@ -66,7 +66,7 @@ public class SendWeiboHotSearchRun extends Thread {
* @return void 返回类型
*/
public static void sendTemplateByUserIds(String title, String time, String url) {
Map<String, Object> dataMap = new HashMap<>();
Map<String, Object> dataMap = new HashMap<String, Object>();
JSONObject first = new JSONObject();
first.put("value", "您好,有一条来自微博热搜榜的预警通知。");
dataMap.put("first", first);
......
......@@ -66,7 +66,7 @@ public class SendZhihuHotSearchRun extends Thread{
*/
public static void sendTemplateByUserIds(String title,String time, String url) {
Map<String, Object> dataMap = new HashMap<>();
Map<String, Object> dataMap = new HashMap<String, Object>();
JSONObject first = new JSONObject();
first.put("value", "您好,有一条来自知乎热搜榜的预警通知。");
dataMap.put("first", first);
......
#mongoIp=202.107.192.94
mongoIp=192.168.0.101
mongoPort=30000
mongoIp=192.168.0.247
mongoPort=27017
db.username=zzwno
db.paasword=zzwno1q2w3e4r
db.certifiedDB=admin
db.certifiedDB=oneDB
dbName=NetWork
collWeiboName=weibo_hotsearch2018_10
collZhihuName=zhihu_hotsearch2018_10
collWechatUserName=wechat_user
collBaiduName=baidu_hotsearch2019_07
collSougoName=sougo_hotsearch2019_07
collDouyinName=douyin_hotsearch2019_07
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment