Commit 7d05be2a by leiliangliang

更新微博搜索框采集程序类名

parent 007cfcb4
package com.zhiwei.searchhotcrawler.bean; package com.zhiwei.searchhotcrawler.bean;
/** /**
* @ClassName: WeiBoUser * @ClassName: WeiBoSearchBoxHotWords
* @Description: 微博用户 * @Description: 微博搜索框关键词实体类
* @author ll * @author ll
* @date 2021年5月27日 下午3:26:11 * @date 2021年11月12日 上午11:35:31
*/ */
import lombok.Data; import lombok.Data;
import lombok.ToString; import lombok.ToString;
import java.io.Serializable;
import java.util.Date; import java.util.Date;
@Data @Data
@ToString @ToString
public class WeiBoSearch { public class WeiBoSearchBoxHotWords {
/** /**
...@@ -40,10 +39,10 @@ public class WeiBoSearch { ...@@ -40,10 +39,10 @@ public class WeiBoSearch {
private Date time; private Date time;
public WeiBoSearch() { public WeiBoSearchBoxHotWords() {
} }
public WeiBoSearch(String name, String ext, String word,String type,Date time) { public WeiBoSearchBoxHotWords(String name, String ext, String word,String type,Date time) {
this.id = name+"_大家正在搜"; this.id = name+"_大家正在搜";
this.name = name; this.name = name;
......
...@@ -21,7 +21,7 @@ public class DBConfig { ...@@ -21,7 +21,7 @@ public class DBConfig {
collWechatUserName = conf.getProperty("collWechatUserName"); collWechatUserName = conf.getProperty("collWechatUserName");
weiBoMassageCollName = conf.getProperty("weiBoMassageCollName"); weiBoMassageCollName = conf.getProperty("weiBoMassageCollName");
weiBoUserCollName = conf.getProperty("weiBoUserCollName"); weiBoUserCollName = conf.getProperty("weiBoUserCollName");
weiBoSearchCollName = conf.getProperty("weiBoSearchCollName"); weiBoSearchBoxHotWordsCollName = conf.getProperty("weiBoSearchBoxHotWordsCollName");
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
...@@ -38,5 +38,5 @@ public class DBConfig { ...@@ -38,5 +38,5 @@ public class DBConfig {
public static String collWechatUserName; public static String collWechatUserName;
public static String weiBoMassageCollName; public static String weiBoMassageCollName;
public static String weiBoUserCollName; public static String weiBoUserCollName;
public static String weiBoSearchCollName; public static String weiBoSearchBoxHotWordsCollName;
} }
...@@ -5,8 +5,8 @@ import com.alibaba.fastjson.JSONObject; ...@@ -5,8 +5,8 @@ import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.WeiBoSearch; import com.zhiwei.searchhotcrawler.bean.WeiBoSearchBoxHotWords;
import com.zhiwei.searchhotcrawler.dao.WeiBoSearchDao; import com.zhiwei.searchhotcrawler.dao.WeiBoSearchBoxHotWordsDao;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response; import okhttp3.Response;
...@@ -19,17 +19,17 @@ import java.util.Objects; ...@@ -19,17 +19,17 @@ import java.util.Objects;
/** /**
* @author: ll * @author: ll
* @ClassName: weiBoSearchCrawlerTest * @ClassName: WeiBoSearchBoxHotWordsCrawler
* @Description: 移动端微博搜索框数据采集 * @Description: 移动端微博搜索框热词采集
* @date: 2021年11月12日 上午11:35:31 * @date: 2021年11月12日 上午11:35:31
* @Title: weiBoSearchCrawler * @Title: WeiBoSearchBoxHotWordsCrawler
*/ */
@Log4j2 @Log4j2
public class weiBoSearchCrawler { public class WeiBoSearchBoxHotWordsCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build(); private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
static WeiBoSearchDao weiBoSearchDao = new WeiBoSearchDao(); static WeiBoSearchBoxHotWordsDao weiBoSearchDao = new WeiBoSearchBoxHotWordsDao();
public static void weiBoSearch(Date date){ public static void weiBoSearchBoxHotWords(Date date){
String url = "https://api.weibo.cn/2/guest/cardlist?networktype=wifi&image_type=heif&launchid=10000365--x&uicode=10000512&ul_hid=dfa73128-2705-4483-bda9-063cd789e44e&ul_sid=cef2538c-9b16-486e-b49f-db9c387b8384&moduleID=708&checktoken=ea8044f2cc7f0a44a9ad159526fd7186&wb_version=5293&refresh_type=0&c=android&s=0b69e4f6&ft=0&ua=Xiaomi-Redmi%208__weibo__11.11.1__android__android9&wm=20005_0002&aid=01AxK-lU0KWwz9mhMfL6gTb37upA4rmFhPxq5T1hrsYVnDdks.&did=0f607264fc6318a92b9e13c65db7cd3cbce74dcd&fid=231278_plaza&uid=2004639399897&v_f=2&v_p=89&from=10BB195010&gsid=_2AkMW0UMLf8NhqwFRmPwTz2LhZYR_ww_EieKgjbLQJRM3HRl-wT_nqksFtRV6PfAyN6rPTMzBcJo_-h6X0zli7DSuUqw-&imsi=&lang=zh_CN&page=1&skin=default&count=20&oldwm=20005_0002&sflag=1&containerid=231289type%3D1&ignore_inturrpted_error=true&no_location_permission=1&android_id=0febc80e083662a7&client_key=c2f5393732c75e52b85b1da27a8e20ae&need_new_pop=1&ul_ctime=1636683060289&need_head_cards=0&cum=53EC532B"; String url = "https://api.weibo.cn/2/guest/cardlist?networktype=wifi&image_type=heif&launchid=10000365--x&uicode=10000512&ul_hid=dfa73128-2705-4483-bda9-063cd789e44e&ul_sid=cef2538c-9b16-486e-b49f-db9c387b8384&moduleID=708&checktoken=ea8044f2cc7f0a44a9ad159526fd7186&wb_version=5293&refresh_type=0&c=android&s=0b69e4f6&ft=0&ua=Xiaomi-Redmi%208__weibo__11.11.1__android__android9&wm=20005_0002&aid=01AxK-lU0KWwz9mhMfL6gTb37upA4rmFhPxq5T1hrsYVnDdks.&did=0f607264fc6318a92b9e13c65db7cd3cbce74dcd&fid=231278_plaza&uid=2004639399897&v_f=2&v_p=89&from=10BB195010&gsid=_2AkMW0UMLf8NhqwFRmPwTz2LhZYR_ww_EieKgjbLQJRM3HRl-wT_nqksFtRV6PfAyN6rPTMzBcJo_-h6X0zli7DSuUqw-&imsi=&lang=zh_CN&page=1&skin=default&count=20&oldwm=20005_0002&sflag=1&containerid=231289type%3D1&ignore_inturrpted_error=true&no_location_permission=1&android_id=0febc80e083662a7&client_key=c2f5393732c75e52b85b1da27a8e20ae&need_new_pop=1&ul_ctime=1636683060289&need_head_cards=0&cum=53EC532B";
String htmlBody = null; String htmlBody = null;
...@@ -39,7 +39,7 @@ public class weiBoSearchCrawler { ...@@ -39,7 +39,7 @@ public class weiBoSearchCrawler {
htmlBody = response.body().string(); htmlBody = response.body().string();
} catch (Exception e) { } catch (Exception e) {
log.error("解析微博搜索时出现解析错误,页面结构有问题", e); log.error("解析微博搜索框热词时出现解析错误,页面结构有问题", e);
} }
if (htmlBody != null && htmlBody.contains("hotwords")) { if (htmlBody != null && htmlBody.contains("hotwords")) {
int num = ansysData(htmlBody, date); int num = ansysData(htmlBody, date);
...@@ -47,8 +47,7 @@ public class weiBoSearchCrawler { ...@@ -47,8 +47,7 @@ public class weiBoSearchCrawler {
break; break;
} }
} else { } else {
log.info("解析微博" + log.info("解析微博搜索框热词时出现解析错误,页面结构有问题");
"搜索时出现解析错误,页面结构有问题");
continue; continue;
} }
} }
...@@ -58,9 +57,9 @@ public class weiBoSearchCrawler { ...@@ -58,9 +57,9 @@ public class weiBoSearchCrawler {
private static int ansysData(String htmlBody, Date date) { private static int ansysData(String htmlBody, Date date) {
//使用静态WeiBoSearchDao,防止频繁连数据库 //使用静态WeiBoSearchDao,防止频繁连数据库
if (Objects.isNull(weiBoSearchDao)) { if (Objects.isNull(weiBoSearchDao)) {
weiBoSearchDao = new WeiBoSearchDao(); weiBoSearchDao = new WeiBoSearchBoxHotWordsDao();
} }
List<WeiBoSearch> list = new ArrayList<>(); List<WeiBoSearchBoxHotWords> list = new ArrayList<>();
try { try {
//解析htmlBody //解析htmlBody
JSONObject object = JSONObject.parseObject(htmlBody); JSONObject object = JSONObject.parseObject(htmlBody);
...@@ -75,16 +74,16 @@ public class weiBoSearchCrawler { ...@@ -75,16 +74,16 @@ public class weiBoSearchCrawler {
String word = card.getString("word"); String word = card.getString("word");
//获取标题 //获取标题
String name = card.getString("note"); String name = card.getString("note");
WeiBoSearch weiBoSearch = new WeiBoSearch(name, ext, word, type, date); WeiBoSearchBoxHotWords weiBoSearch = new WeiBoSearchBoxHotWords(name, ext, word, type, date);
list.add(weiBoSearch); list.add(weiBoSearch);
} }
} catch (Exception e) { } catch (Exception e) {
log.error("解析微博搜索时出现解析错误,数据不是json结构",e); log.error("解析微博搜索框热词时出现解析错误,数据不是json结构",e);
} }
log.info("{}, 此轮微博搜索采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0)); log.info("{}, 此轮微博搜索框热词采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
//数据传给dao //数据传给dao
weiBoSearchDao.addWeiBoUser(list); weiBoSearchDao.addWeiBoSearchBoxHotWords(list);
return list.size(); return list.size();
} }
......
...@@ -4,7 +4,7 @@ package com.zhiwei.searchhotcrawler.dao; ...@@ -4,7 +4,7 @@ package com.zhiwei.searchhotcrawler.dao;
import com.mongodb.MongoWriteException; import com.mongodb.MongoWriteException;
import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase; import com.mongodb.client.MongoDatabase;
import com.zhiwei.searchhotcrawler.bean.WeiBoSearch; import com.zhiwei.searchhotcrawler.bean.WeiBoSearchBoxHotWords;
import com.zhiwei.searchhotcrawler.config.DBConfig; import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate; import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
...@@ -16,13 +16,13 @@ import java.util.List; ...@@ -16,13 +16,13 @@ import java.util.List;
import static java.util.Objects.nonNull; import static java.util.Objects.nonNull;
@Log4j2 @Log4j2
public class WeiBoSearchDao { public class WeiBoSearchBoxHotWordsDao {
public static MongoDatabase mongoDatabase = MongoDBTemplate.getDB(DBConfig.dbName); public static MongoDatabase mongoDatabase = MongoDBTemplate.getDB(DBConfig.dbName);
public static MongoCollection mongoCollection; public static MongoCollection mongoCollection;
public WeiBoSearchDao() { public WeiBoSearchBoxHotWordsDao() {
String collName = DBConfig.weiBoSearchCollName; String collName = DBConfig.weiBoSearchBoxHotWordsCollName;
mongoCollection = mongoDatabase.getCollection(collName); mongoCollection = mongoDatabase.getCollection(collName);
//给数据表创建索引 //给数据表创建索引
MongoDBTemplate.createIndex(DBConfig.dbName, collName); MongoDBTemplate.createIndex(DBConfig.dbName, collName);
...@@ -32,9 +32,9 @@ public class WeiBoSearchDao { ...@@ -32,9 +32,9 @@ public class WeiBoSearchDao {
* 添加数据入库 * 添加数据入库
* @param weiBoSearch * @param weiBoSearch
*/ */
public void addWeiBoUser(List<WeiBoSearch> weiBoSearch){ public void addWeiBoSearchBoxHotWords(List<WeiBoSearchBoxHotWords> weiBoSearch){
for (WeiBoSearch search : weiBoSearch) { for (WeiBoSearchBoxHotWords search : weiBoSearch) {
try { try {
//获取时间 //获取时间
Date time = search.getTime(); Date time = search.getTime();
......
...@@ -35,19 +35,19 @@ public class ToutiaoHotSearchRun extends Thread{ ...@@ -35,19 +35,19 @@ public class ToutiaoHotSearchRun extends Thread{
private void getHotList() { private void getHotList() {
// log.info("今日头条热搜采集开始........"); log.info("今日头条热搜采集开始........");
// HotSearchListDAO hotSearchDAO = new HotSearchListDAO(); HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO(); HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// List<HotSearchList> list = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone(); List<HotSearchList> list = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone(new Date());
// log.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0)); log.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
// if(list == null || list.size() == 0){ if(list == null || list.size() == 0){
// TipsUtils.sendTips("今日头条热搜",new Date()); TipsUtils.sendTips("今日头条热搜",new Date());
// }else { }else {
// List<Document> data = hotSearchCacheDAO.addData(list); List<Document> data = hotSearchCacheDAO.addData(list);
// hotSearchDAO.addHotSearchList(data); hotSearchDAO.addHotSearchList(data);
// TipsUtils.recoveryTips("今日头条热搜",new Date()); TipsUtils.recoveryTips("今日头条热搜",new Date());
// } }
// log.info("今日头条热搜采集结束........"); log.info("今日头条热搜采集结束........");
} }
} }
...@@ -20,5 +20,5 @@ topicCollName=topic_list ...@@ -20,5 +20,5 @@ topicCollName=topic_list
collWechatUserName=wechat_user collWechatUserName=wechat_user
weiBoMassageCollName=weibo_massage weiBoMassageCollName=weibo_massage
weiBoUserCollName=weibo_user weiBoUserCollName=weibo_user
weiBoSearchCollName=weibo_search weiBoSearchBoxHotWordsCollName=weiBoSearchBoxHotWord
# #
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment