Commit f5589b9f by zhiwei

添加今日头条热搜榜采集

parent b2d4bb96
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
<dependency> <dependency>
<groupId>org.mongodb</groupId> <groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId> <artifactId>mongo-java-driver</artifactId>
<version>3.6.3</version> <version>3.12.2</version>
</dependency> </dependency>
<dependency> <dependency>
......
...@@ -6,5 +6,6 @@ public enum HotSearchType { ...@@ -6,5 +6,6 @@ public enum HotSearchType {
知乎热搜, 知乎热搜,
抖音热搜, 抖音热搜,
搜狗微信热搜, 搜狗微信热搜,
微博话题 微博话题,
今日头条热搜
} }
...@@ -13,6 +13,7 @@ public class DBConfig { ...@@ -13,6 +13,7 @@ public class DBConfig {
conf.load(is); conf.load(is);
is.close(); is.close();
mongoUri = conf.getProperty("mongoUri"); mongoUri = conf.getProperty("mongoUri");
mongoLocalUri = conf.getProperty("mongoLocalUri");
dbName = conf.getProperty("dbName"); dbName = conf.getProperty("dbName");
searchCollName = conf.getProperty("searchCollName"); searchCollName = conf.getProperty("searchCollName");
searchCacheCollName = conf.getProperty("searchCacheCollName"); searchCacheCollName = conf.getProperty("searchCacheCollName");
...@@ -25,6 +26,7 @@ public class DBConfig { ...@@ -25,6 +26,7 @@ public class DBConfig {
public static String mongoUri; public static String mongoUri;
public static String mongoLocalUri;
public static String dbName; public static String dbName;
public static String searchCollName; public static String searchCollName;
public static String searchCacheCollName; public static String searchCacheCollName;
......
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.tools.URLCodeUtil;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.util.*;
/**
* @ProjectName: searchhotcrawler
* @ClassName: ToutiaoHotSearchCrawler
* @Author: hero
* @Description: 今日头条实时热搜榜单
* @Date: 2020/4/8 16:21
* @Version: 1.0
*/
@Log4j2
public class ToutiaoHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* @Title: weiboHotSearchByPhoneTest
* @author hero
* @Description: TODO(手机端Iphone 微博热搜采集)
* @return void 返回类型
*/
public static List<HotSearchList> toutiaoHotSearchByPhone(){
for(int count =0; count<=5; count++){
String url = "https://ib.snssdk.com/api/suggest_words/?business_id=10017";
Map<String,String> headerMap = new HashMap<>();
headerMap.put("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1");
headerMap.put("referer","https://ib.snssdk.com/rogue/aladdin_landingpage/template/aladdin_landingpage/hot_words.html?isBrowser=true&traffic_source=");
String htmlBody;
try {
List<HotSearchList> result = new ArrayList<HotSearchList>();
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("words")){
try {
JSONArray words = JSONObject.parseObject(htmlBody).getJSONArray("data").getJSONObject(0).getJSONArray("words");
int rank = 1;
for(int i=0;i<words.size();i++){
try {
JSONObject word = words.getJSONObject(i);
String name = word.getString("word");
String link = "https://ib.snssdk.com/search/?keyword="+ URLCodeUtil.getURLEncode(name, "utf-8") +"&pd=synthesis&source=trending_list&traffic_source=";
Integer hotCount = word.getJSONObject("params").getInteger("fake_click_cnt");
Integer wordsType = word.getInteger("words_type");
String icon = getIcon(wordsType);
HotSearchList hotSearch = new HotSearchList(link, name, hotCount, true, rank, HotSearchType.今日头条热搜.name(), icon);
result.add(hotSearch);
rank++;
} catch (Exception e) {
log.error("解析今日头条实时热搜时出现解析错误",e);
continue;
}
}
return result;
} catch (Exception e) {
log.error("解析今日头条实时热搜时出现解析错误,数据不是json结构",e);
}
}else{
log.info("解析今日头条实时热搜时出现解析错误,页面结构有问题");
}
} catch (IOException e1) {
log.error("解析今日头条实时热搜时出现连接失败",e1);
}
}
return Collections.emptyList();
}
/**
* 热搜类型
* @param wordsType
* @return
*/
private static String getIcon(Integer wordsType){
String icon = "无";
if(Objects.nonNull(wordsType)){
switch (wordsType){
case 1:
icon = "新";
break;
case 2:
icon = "热";
break;
case 3:
icon = "爆";
break;
}
}
return icon;
}
}
package com.zhiwei.searchhotcrawler.dao; package com.zhiwei.searchhotcrawler.dao;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoCollection;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.config.DBConfig; import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate; import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import org.bson.Document; import org.bson.Document;
import javax.print.Doc;
import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
...@@ -21,6 +21,37 @@ public class HotSearchCacheDAO { ...@@ -21,6 +21,37 @@ public class HotSearchCacheDAO {
private static MongoCollection collection = MongoDBTemplate.getCollection(DBConfig.dbName, DBConfig.searchCacheCollName); private static MongoCollection collection = MongoDBTemplate.getCollection(DBConfig.dbName, DBConfig.searchCacheCollName);
/**
 * Converts each hot-search entry into a {@link Document}, passes it to
 * {@code addAndUpdateData}, and collects the converted documents.
 *
 * <p>The document carries the fields {@code _id}, {@code name}, {@code url}, {@code count},
 * {@code hot}, {@code day}, {@code time}, {@code rank}, {@code type} and {@code icon}, read
 * from the corresponding getters of the entry.
 *
 * @param dataList entries to store; {@code null} or empty is treated as "nothing to store"
 * @return the documents that were built, in input order; an empty list when there was no
 *         input; never {@code null}
 */
public List<Document> addData(List<HotSearchList> dataList){
    List<Document> documents = new ArrayList<>();
    // Crawlers may hand over null when a fetch fails; treat that the same as an empty batch.
    if (Objects.isNull(dataList) || dataList.isEmpty()) {
        return documents;
    }
    for (HotSearchList hotSearch : dataList) {
        Document document = new Document();
        document.put("_id", hotSearch.getId());
        document.put("name", hotSearch.getName());
        document.put("url", hotSearch.getUrl());
        document.put("count", hotSearch.getCount());
        document.put("hot", hotSearch.getHot());
        document.put("day", hotSearch.getDay());
        document.put("time", hotSearch.getTime());
        document.put("rank", hotSearch.getRank());
        document.put("type", hotSearch.getType());
        document.put("icon", hotSearch.getIcon());
        addAndUpdateData(document);
        documents.add(document);
    }
    return documents;
}
/** /**
* 添加及更新相应数据表中的数据 * 添加及更新相应数据表中的数据
* @param document * @param document
...@@ -114,6 +145,9 @@ public class HotSearchCacheDAO { ...@@ -114,6 +145,9 @@ public class HotSearchCacheDAO {
case "微博话题" : case "微博话题" :
duration = duration + 3; duration = duration + 3;
break; break;
case "今日头条热搜" :
duration = duration + 1;
break;
default : default :
duration = duration + 1; duration = duration + 1;
} }
......
...@@ -50,6 +50,7 @@ public class HotSearchRun { ...@@ -50,6 +50,7 @@ public class HotSearchRun {
new ZhihuHotSearchRun().start(); new ZhihuHotSearchRun().start();
new WeiboSuperTopicRun().start(); new WeiboSuperTopicRun().start();
new WeiboTopicRun().start(); new WeiboTopicRun().start();
new ToutiaoHotSearchRun().start();
} }
} }
//package com.zhiwei.searchhotcrawler.test;
//
//import com.mongodb.client.MongoCollection;
//import com.mongodb.client.MongoCursor;
//import com.mongodb.client.MongoDatabase;
//import com.zhiwei.searchhotcrawler.config.DBConfig;
//import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
//import com.zhiwei.tools.timeparse.TimeParse;
//import lombok.extern.log4j.Log4j2;
//import org.bson.Document;
//
//import java.util.*;
//
//@Log4j2
//public class TopicTest {
//
// private static MongoDatabase mongoDB = MongoDBTemplate.getDB(DBConfig.dbName);
//
// public static void main(String[] args) {
//// repairTopic();
//
// updateHotSearchCache();
// }
//
// /**
// * 修复热搜话题类型错误问题
// */
// public static void repairTopic(){
// MongoCollection mongoCollection = mongoDB.getCollection("hot_search_list2020_04");
// Document query = new Document("comment_count", new Document("$ne", null));
// query.put("type", "微博热搜");
// Date time = TimeParse.stringFormartDate("2020-03-12 18:00:00");
//
// long count = mongoCollection.countDocuments(query);
// log.info("count is {}", count);
// for(int i=0;i<55;i++){
// MongoCursor<Document> cursor = mongoCollection.find(query).limit(1000).iterator();
// while(cursor.hasNext()){
// Document update = cursor.next();
// update.put("type", "微博话题");
// Document query2 = new Document();
// query2.put("_id", update.getString("_id"));
// mongoCollection.findOneAndReplace(query2, update);
// time = update.getDate("time");
// }
// log.info("i========{}", i);
// }
// }
//
//
//
//
// public static void updateHotSearchCache(){
// for(int month = 3; month<=3; month++){
// String collectionName = "hot_search_list2020_0" + month;
// if(month>=10){
// collectionName = "hot_search_list2020_" + month;
// }
// log.info("collectionName is {}", collectionName);
// MongoCollection mongoCollection = mongoDB.getCollection(collectionName);
// MongoCollection mongoCollectionLocal = mongoDBLocal.getCollection("hot_search_cache");
//
// long count = mongoCollection.countDocuments();
// int pageCount = 10000;
// int pages = (int)Math.ceil((double)count/(double)pageCount);
// log.info("count====={},pages====={}",count, pages);
// Date date = TimeParse.stringFormartDate("2020-03-12 18:00:00");
//
// Map<String,Document> resultMap = new HashMap<>();
//
// for(int page = 1; page<pages; page++){
// Document query = new Document();
// if(page>1) {
// query.put("time", new Document("$gt", date));
// }
// log.info("page is {} ,query is {},coutn is {}", page ,query ,mongoCollection.countDocuments(query));
// MongoCursor<Document> cursor = mongoCollection.find(query).limit(pageCount).sort(new Document("time",1)).iterator();
// while(cursor.hasNext()){
// Document document = cursor.next();
// String name = document.getString("name");
// String type = document.getString("type");
// int lastRank = document.getInteger("rank")!=null?document.getInteger("rank"): -1;
// int lastCount = document.getInteger("count")!=null?document.getInteger("count"): -1;
// Date startTime = document.getDate("time");
// Date endTime = new Date(startTime.getTime() + (60 * 1000));
// String topicLead = document.getString("topic_lead")!=null?document.getString("topic_lead"):null;
// boolean hot = document.getBoolean("hot")!=null?document.getBoolean("hot"):true;
// String url = document.getString("url")!=null?document.getString("url"):null;
// String id = name + "_" + type;
//
// Document nowDoc = resultMap.get(id);
// if (Objects.nonNull(nowDoc)) {
// int highestRank = nowDoc.getInteger("highestRank");
// int highestCount = nowDoc.getInteger("highestCount");
// //判断最大热度值
// if (lastCount>0 && lastCount > highestCount) {
// highestCount = lastCount;
// }
// //判断最高排名
// if (lastRank>0 && lastRank < highestRank) {
// highestRank = lastRank;
// }
// //计算热搜时长
// int duration = nowDoc.getInteger("duration");
// int durationNow = getDuration(type, duration);
//
// //更新相应信息
// nowDoc.put("endTime", endTime);
// nowDoc.put("lastRank", lastRank);
// nowDoc.put("lastCount", lastCount);
// nowDoc.put("highestRank", highestRank);
// nowDoc.put("highestCount", highestCount);
// nowDoc.put("duration", durationNow);
// } else {
// nowDoc = new Document();
// int durationNow = getDuration(type, 0);
// nowDoc.put("_id", id);
// nowDoc.put("url", url);
// nowDoc.put("name", name);
// nowDoc.put("hot", hot);
// nowDoc.put("topicLead", topicLead);
// nowDoc.put("type", type);
// nowDoc.put("lastRank", lastRank);
// nowDoc.put("highestRank", lastRank);
// nowDoc.put("lastCount", lastCount);
// nowDoc.put("highestCount", lastCount);
// nowDoc.put("startTime", startTime);
// nowDoc.put("endTime", endTime);
// nowDoc.put("duration", durationNow);
// }
// resultMap.put(id, nowDoc);
// date = startTime;
// }
// cursor.close();
// }
//
// log.info("list size is {}", resultMap.size());
// for (Map.Entry<String,Document> entry: resultMap.entrySet()){
// String id = entry.getKey();
// Document document = entry.getValue();
// String name = document.getString("name");
// String type = document.getString("type");
// int lastRank = document.getInteger("lastRank");
// int lastCount = document.getInteger("lastCount");
// int highestRank = document.getInteger("highestRank");
// int highestCount = document.getInteger("highestCount");
// int duration = document.getInteger("duration");
//
// Document query = new Document("_id", id);
// Document resultDoc = (Document) mongoCollectionLocal.find(query).first();
// if(Objects.isNull(resultDoc)){
// mongoCollectionLocal.insertOne(document);
// }else{
//
// int highestRankResult = resultDoc.getInteger("highestRank");
// int highestCountResult = resultDoc.getInteger("highestCount");
// int durationResult = document.getInteger("duration");
// //判断最大热度值
// if (highestCountResult > highestCount) {
// highestCount = highestCountResult;
// }
// //判断最高排名
// if (highestRankResult < highestRank) {
// highestRank = highestRankResult;
// }
// //计算热搜时长
// int durationNow = duration + durationResult;
// Date endTime = document.getDate("endTime");
// //更新相应信息
// resultDoc.put("endTime", endTime);
// resultDoc.put("lastRank", lastRank);
// resultDoc.put("lastCount", lastCount);
// resultDoc.put("highestRank", highestRank);
// resultDoc.put("highestCount", highestCount);
// resultDoc.put("duration", durationNow);
// mongoCollectionLocal.findOneAndReplace(query, resultDoc);
// }
// }
// }
// }
//
//
// /**
// * 计算热搜时长
// * @param type
// * @param duration
// * @return
// */
// private static int getDuration(String type, int duration){
// switch (type){
// case "微博热搜" :
// duration = duration + 1;
// break;
// case "百度热搜" :
// duration = duration + 5;
// break;
// case "知乎热搜" :
// duration = duration + 10;
// break;
// case "抖音热搜" :
// duration = duration + 10;
// break;
// case "搜狗微信热搜" :
// duration = duration + 5;
// break;
// case "微博话题" :
// duration = duration + 3;
// break;
// default :
// duration = duration + 1;
// }
// return duration;
// }
//
//}
...@@ -44,24 +44,10 @@ public class BaiduHotSearchRun extends Thread{ ...@@ -44,24 +44,10 @@ public class BaiduHotSearchRun extends Thread{
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO(); HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
List<HotSearchList> list = BaiDuHotSearchCrawler.baiduHotSearch(); List<HotSearchList> list = BaiDuHotSearchCrawler.baiduHotSearch();
log.info("{}, 此轮百度风云榜采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0)); log.info("{}, 此轮百度风云榜采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<Document> saveDataList = new ArrayList<>();
if(Objects.nonNull(list) && !list.isEmpty()) { if(Objects.nonNull(list) && !list.isEmpty()) {
list.forEach(baiduHotSearch ->{ List<Document> data = hotSearchCacheDAO.addData(list);
Document doc = new Document(); hotSearchDAO.addHotSearchList(data);
doc.put("_id", baiduHotSearch.getId());
doc.put("name", baiduHotSearch.getName());
doc.put("url", baiduHotSearch.getUrl());
doc.put("count", baiduHotSearch.getCount());
doc.put("day", baiduHotSearch.getDay());
doc.put("time", baiduHotSearch.getTime());
doc.put("rank", baiduHotSearch.getRank());
doc.put("type", baiduHotSearch.getType());
saveDataList.add(doc);
hotSearchCacheDAO.addAndUpdateData(doc);
});
} }
hotSearchDAO.addHotSearchList(saveDataList);
log.info("百度风云榜采集结束........"); log.info("百度风云榜采集结束........");
} }
......
...@@ -47,22 +47,8 @@ public class DouyinHotSearchRun extends Thread{ ...@@ -47,22 +47,8 @@ public class DouyinHotSearchRun extends Thread{
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO(); HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
List<HotSearchList> list = DouyinHotSearchCrawler.getMobileDouyinHotList(); List<HotSearchList> list = DouyinHotSearchCrawler.getMobileDouyinHotList();
log.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0)); log.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<Document> data = new ArrayList<>(); List<Document> data = hotSearchCacheDAO.addData(list);
for(HotSearchList douyinHotSearch : list){ hotSearchDAO.addHotSearchList(data);
Document douyin = new Document();
douyin.put("_id", douyinHotSearch.getId());
douyin.put("name", douyinHotSearch.getName());
douyin.put("rank", douyinHotSearch.getRank());
douyin.put("count", douyinHotSearch.getCount());
douyin.put("hot", douyinHotSearch.getHot());
douyin.put("day", douyinHotSearch.getDay());
douyin.put("time", douyinHotSearch.getTime());
douyin.put("url", null);
douyin.put("type", douyinHotSearch.getType());
data.add(douyin);
hotSearchDAO.addHotSearch(douyin);
hotSearchCacheDAO.addAndUpdateData(douyin);
}
log.info("抖音热搜榜采集结束........"); log.info("抖音热搜榜采集结束........");
} }
......
...@@ -6,6 +6,7 @@ import java.util.List; ...@@ -6,6 +6,7 @@ import java.util.List;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import org.bson.Document; import org.bson.Document;
import org.slf4j.Logger; import org.slf4j.Logger;
...@@ -15,7 +16,6 @@ import com.mongodb.BasicDBObject; ...@@ -15,7 +16,6 @@ import com.mongodb.BasicDBObject;
import com.mongodb.DBObject; import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler; import com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2 @Log4j2
...@@ -43,19 +43,7 @@ public class SougoHotSearchRun extends Thread { ...@@ -43,19 +43,7 @@ public class SougoHotSearchRun extends Thread {
log.info("搜狗微信采集开始........"); log.info("搜狗微信采集开始........");
List<HotSearchList> list = SougoHotSearchCrawler.sougoHotSearch(); List<HotSearchList> list = SougoHotSearchCrawler.sougoHotSearch();
log.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0)); log.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<Document> data = new ArrayList<>(); List<Document> data = hotSearchCacheDAO.addData(list);
for(HotSearchList sougoHotSearch : list){
Document doc = new Document();
doc.put("_id", sougoHotSearch.getId());
doc.put("name", sougoHotSearch.getName());
doc.put("url", sougoHotSearch.getUrl());
doc.put("day", sougoHotSearch.getDay());
doc.put("time", sougoHotSearch.getTime());
doc.put("rank", sougoHotSearch.getRank());
doc.put("type", sougoHotSearch.getType());
data.add(doc);
hotSearchCacheDAO.addAndUpdateData(doc);
}
hotSearchDAO.addHotSearchList(data); hotSearchDAO.addHotSearchList(data);
log.info("搜狗微信采集结束........"); log.info("搜狗微信采集结束........");
} }
......
package com.zhiwei.searchhotcrawler.timer;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.ToutiaoHotSearchCrawler;
import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
@Log4j2
public class ToutiaoHotSearchRun extends Thread{
@Override
public void run() {
boolean f = true;
while(f) {
try {
getHotList();
TimeUnit.MINUTES.sleep(1);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*1000);
}
ZhiWeiTools.sleep(50);
}
}
private void getHotList() {
log.info("今日头条热搜采集开始........");
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
List<HotSearchList> list = ToutiaoHotSearchCrawler.toutiaoHotSearchByPhone();
log.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
log.info("今日头条热搜采集结束........");
}
}
...@@ -33,30 +33,15 @@ public class WeiboHotSearchRun extends Thread{ ...@@ -33,30 +33,15 @@ public class WeiboHotSearchRun extends Thread{
private void getHotList() { private void getHotList() {
log.info("微博话题采集开始........"); log.info("微博热搜采集开始........");
HotSearchListDAO weiboHotSearchDAO = new HotSearchListDAO(); HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO(); HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearch(); // List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearch();
List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearchByPhone(); List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearchByPhone();
log.info("{}, 微博此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0)); log.info("{}, 微博此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<Document> data = new ArrayList<>(); List<Document> data = hotSearchCacheDAO.addData(list);
for(HotSearchList weiboHotSearch : list){ hotSearchDAO.addHotSearchList(data);
Document doc = new Document(); log.info("微博热搜采集结束........");
doc.put("_id", weiboHotSearch.getId());
doc.put("name", weiboHotSearch.getName());
doc.put("url", weiboHotSearch.getUrl());
doc.put("count", weiboHotSearch.getCount());
doc.put("hot", weiboHotSearch.getHot());
doc.put("day", weiboHotSearch.getDay());
doc.put("time", weiboHotSearch.getTime());
doc.put("rank", weiboHotSearch.getRank());
doc.put("type", weiboHotSearch.getType());
doc.put("icon", weiboHotSearch.getIcon());
data.add(doc);
hotSearchCacheDAO.addAndUpdateData(doc);
}
weiboHotSearchDAO.addHotSearchList(data);
log.info("微博话题采集结束........");
} }
} }
...@@ -45,22 +45,8 @@ public class ZhihuHotSearchRun extends Thread{ ...@@ -45,22 +45,8 @@ public class ZhihuHotSearchRun extends Thread{
List<HotSearchList> mobilelist = ZhihuHotSearchCrawler.getMobileZhihuHotList(); List<HotSearchList> mobilelist = ZhihuHotSearchCrawler.getMobileZhihuHotList();
list.addAll(mobilelist); list.addAll(mobilelist);
log.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0)); log.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<Document> dataList = new ArrayList<>(); List<Document> data = hotSearchCacheDAO.addData(list);
for(HotSearchList zhihuHotSearch : list){ hotSearchDAO.addHotSearchList(data);
Document zhihu = new Document();
zhihu.put("_id", zhihuHotSearch.getId());
zhihu.put("name", zhihuHotSearch.getName());
zhihu.put("url", zhihuHotSearch.getUrl());
zhihu.put("count", zhihuHotSearch.getCount());
zhihu.put("hot", zhihuHotSearch.getHot());
zhihu.put("day", zhihuHotSearch.getDay());
zhihu.put("time", zhihuHotSearch.getTime());
zhihu.put("rank", zhihuHotSearch.getRank());
zhihu.put("type", zhihuHotSearch.getType());
dataList.add(zhihu);
hotSearchCacheDAO.addAndUpdateData(zhihu);
}
hotSearchDAO.addHotSearchList(dataList);
log.info("知乎话题采集结束........"); log.info("知乎话题采集结束........");
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment