Commit 7d05be2a by leiliangliang

更新微博搜索框采集程序类名

parent 007cfcb4
package com.zhiwei.searchhotcrawler.bean;
/**
* @ClassName: WeiBoUser
* @Description: 微博用户
* @ClassName: WeiBoSearchBoxHotWords
* @Description: 微博搜索框关键词实体类
* @author ll
* @date 2021年5月27日 下午3:26:11
* @date 2021年11月12日 上午11:35:31
*/
import lombok.Data;
import lombok.ToString;
import java.io.Serializable;
import java.util.Date;
@Data
@ToString
public class WeiBoSearch {
public class WeiBoSearchBoxHotWords {
/**
......@@ -40,10 +39,10 @@ public class WeiBoSearch {
private Date time;
public WeiBoSearch() {
public WeiBoSearchBoxHotWords() {
}
public WeiBoSearch(String name, String ext, String word,String type,Date time) {
public WeiBoSearchBoxHotWords(String name, String ext, String word,String type,Date time) {
this.id = name+"_大家正在搜";
this.name = name;
......
......@@ -21,7 +21,7 @@ public class DBConfig {
collWechatUserName = conf.getProperty("collWechatUserName");
weiBoMassageCollName = conf.getProperty("weiBoMassageCollName");
weiBoUserCollName = conf.getProperty("weiBoUserCollName");
weiBoSearchCollName = conf.getProperty("weiBoSearchCollName");
weiBoSearchBoxHotWordsCollName = conf.getProperty("weiBoSearchBoxHotWordsCollName");
} catch (Exception e) {
e.printStackTrace();
......@@ -38,5 +38,5 @@ public class DBConfig {
public static String collWechatUserName;
public static String weiBoMassageCollName;
public static String weiBoUserCollName;
public static String weiBoSearchCollName;
public static String weiBoSearchBoxHotWordsCollName;
}
......@@ -5,8 +5,8 @@ import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.WeiBoSearch;
import com.zhiwei.searchhotcrawler.dao.WeiBoSearchDao;
import com.zhiwei.searchhotcrawler.bean.WeiBoSearchBoxHotWords;
import com.zhiwei.searchhotcrawler.dao.WeiBoSearchBoxHotWordsDao;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
......@@ -19,17 +19,17 @@ import java.util.Objects;
/**
* @author: ll
* @ClassName: weiBoSearchCrawlerTest
* @Description: 移动端微博搜索框数据采集
* @ClassName: WeiBoSearchBoxHotWordsCrawler
* @Description: 移动端微博搜索框热词采集
* @date: 2021年11月12日 上午11:35:31
* @Title: weiBoSearchCrawler
* @Title: WeiBoSearchBoxHotWordsCrawler
*/
@Log4j2
public class weiBoSearchCrawler {
public class WeiBoSearchBoxHotWordsCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
static WeiBoSearchDao weiBoSearchDao = new WeiBoSearchDao();
static WeiBoSearchBoxHotWordsDao weiBoSearchDao = new WeiBoSearchBoxHotWordsDao();
public static void weiBoSearch(Date date){
public static void weiBoSearchBoxHotWords(Date date){
String url = "https://api.weibo.cn/2/guest/cardlist?networktype=wifi&image_type=heif&launchid=10000365--x&uicode=10000512&ul_hid=dfa73128-2705-4483-bda9-063cd789e44e&ul_sid=cef2538c-9b16-486e-b49f-db9c387b8384&moduleID=708&checktoken=ea8044f2cc7f0a44a9ad159526fd7186&wb_version=5293&refresh_type=0&c=android&s=0b69e4f6&ft=0&ua=Xiaomi-Redmi%208__weibo__11.11.1__android__android9&wm=20005_0002&aid=01AxK-lU0KWwz9mhMfL6gTb37upA4rmFhPxq5T1hrsYVnDdks.&did=0f607264fc6318a92b9e13c65db7cd3cbce74dcd&fid=231278_plaza&uid=2004639399897&v_f=2&v_p=89&from=10BB195010&gsid=_2AkMW0UMLf8NhqwFRmPwTz2LhZYR_ww_EieKgjbLQJRM3HRl-wT_nqksFtRV6PfAyN6rPTMzBcJo_-h6X0zli7DSuUqw-&imsi=&lang=zh_CN&page=1&skin=default&count=20&oldwm=20005_0002&sflag=1&containerid=231289type%3D1&ignore_inturrpted_error=true&no_location_permission=1&android_id=0febc80e083662a7&client_key=c2f5393732c75e52b85b1da27a8e20ae&need_new_pop=1&ul_ctime=1636683060289&need_head_cards=0&cum=53EC532B";
String htmlBody = null;
......@@ -39,7 +39,7 @@ public class weiBoSearchCrawler {
htmlBody = response.body().string();
} catch (Exception e) {
log.error("解析微博搜索时出现解析错误,页面结构有问题", e);
log.error("解析微博搜索框热词时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("hotwords")) {
int num = ansysData(htmlBody, date);
......@@ -47,8 +47,7 @@ public class weiBoSearchCrawler {
break;
}
} else {
log.info("解析微博" +
"搜索时出现解析错误,页面结构有问题");
log.info("解析微博搜索框热词时出现解析错误,页面结构有问题");
continue;
}
}
......@@ -58,9 +57,9 @@ public class weiBoSearchCrawler {
private static int ansysData(String htmlBody, Date date) {
//使用静态WeiBoSearchDao,防止频繁连数据库
if (Objects.isNull(weiBoSearchDao)) {
weiBoSearchDao = new WeiBoSearchDao();
weiBoSearchDao = new WeiBoSearchBoxHotWordsDao();
}
List<WeiBoSearch> list = new ArrayList<>();
List<WeiBoSearchBoxHotWords> list = new ArrayList<>();
try {
//解析htmlBody
JSONObject object = JSONObject.parseObject(htmlBody);
......@@ -75,16 +74,16 @@ public class weiBoSearchCrawler {
String word = card.getString("word");
//获取标题
String name = card.getString("note");
WeiBoSearch weiBoSearch = new WeiBoSearch(name, ext, word, type, date);
WeiBoSearchBoxHotWords weiBoSearch = new WeiBoSearchBoxHotWords(name, ext, word, type, date);
list.add(weiBoSearch);
}
} catch (Exception e) {
log.error("解析微博搜索时出现解析错误,数据不是json结构",e);
log.error("解析微博搜索框热词时出现解析错误,数据不是json结构",e);
}
log.info("{}, 此轮微博搜索采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
log.info("{}, 此轮微博搜索框热词采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
//数据传给dao
weiBoSearchDao.addWeiBoUser(list);
weiBoSearchDao.addWeiBoSearchBoxHotWords(list);
return list.size();
}
......
......@@ -4,7 +4,7 @@ package com.zhiwei.searchhotcrawler.dao;
import com.mongodb.MongoWriteException;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.zhiwei.searchhotcrawler.bean.WeiBoSearch;
import com.zhiwei.searchhotcrawler.bean.WeiBoSearchBoxHotWords;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import lombok.extern.log4j.Log4j2;
......@@ -16,13 +16,13 @@ import java.util.List;
import static java.util.Objects.nonNull;
@Log4j2
public class WeiBoSearchDao {
public class WeiBoSearchBoxHotWordsDao {
public static MongoDatabase mongoDatabase = MongoDBTemplate.getDB(DBConfig.dbName);
public static MongoCollection mongoCollection;
public WeiBoSearchDao() {
String collName = DBConfig.weiBoSearchCollName;
public WeiBoSearchBoxHotWordsDao() {
String collName = DBConfig.weiBoSearchBoxHotWordsCollName;
mongoCollection = mongoDatabase.getCollection(collName);
//给数据表创建索引
MongoDBTemplate.createIndex(DBConfig.dbName, collName);
......@@ -32,9 +32,9 @@ public class WeiBoSearchDao {
* 添加数据入库
* @param weiBoSearch
*/
public void addWeiBoUser(List<WeiBoSearch> weiBoSearch){
public void addWeiBoSearchBoxHotWords(List<WeiBoSearchBoxHotWords> weiBoSearch){
for (WeiBoSearch search : weiBoSearch) {
for (WeiBoSearchBoxHotWords search : weiBoSearch) {
try {
//获取时间
Date time = search.getTime();
......
......@@ -35,19 +35,19 @@ public class ToutiaoHotSearchRun extends Thread{
/**
 * Runs one Toutiao hot-search collection round.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>creates the {@code HotSearchListDAO} and {@code HotSearchCacheDAO} used for this round;</li>
 *   <li>fetches the list via {@code ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone(new Date())};</li>
 *   <li>logs the number of entries returned (a {@code null} list is counted as 0);</li>
 *   <li>when nothing was returned, calls {@code TipsUtils.sendTips}; otherwise passes the list
 *       through {@code HotSearchCacheDAO.addData}, hands the resulting documents to
 *       {@code HotSearchListDAO.addHotSearchList}, and calls {@code TipsUtils.recoveryTips}.</li>
 * </ol>
 */
private void getHotList() {
    log.info("今日头条热搜采集开始........");
    HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
    HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
    List<HotSearchList> list = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone(new Date());

    // Resolve the size once so the log line and the branch below agree;
    // a null result is handled exactly like an empty list.
    int collected = (list == null) ? 0 : list.size();
    log.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), collected);

    if (collected == 0) {
        // Nothing collected this round.
        TipsUtils.sendTips("今日头条热搜", new Date());
    } else {
        List<Document> data = hotSearchCacheDAO.addData(list);
        hotSearchDAO.addHotSearchList(data);
        TipsUtils.recoveryTips("今日头条热搜", new Date());
    }
    log.info("今日头条热搜采集结束........");
}
}
......@@ -20,5 +20,5 @@ topicCollName=topic_list
collWechatUserName=wechat_user
weiBoMassageCollName=weibo_massage
weiBoUserCollName=weibo_user
weiBoSearchCollName=weibo_search
weiBoSearchBoxHotWordsCollName=weiBoSearchBoxHotWord
#
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment