Commit 6f72ce80 by zhiwei

修复按月分库失败bug及添加索引

parent db96247a
......@@ -108,7 +108,7 @@ public class WeiboHuatiCrawler {
id = data.getString("page_id");
score = data.getString("score");
desc1 = data.getString("desc1");
fensi = desc1.replaceAll("影响力.*", "");
fensi = desc1.replaceAll(".*影响力|粉丝", "").trim();
url = data.getString("link");
WeiboTopic topic = new WeiboTopic(url, topicName, toprank, score, fensi, type);
......
......@@ -28,6 +28,25 @@ public class HotSearchListDAO extends MongoDBTemplate{
String month = time.substring(5,7);
String collName = Config.searchCollName + year + "_" + month;
super.setCollName(collName);
// Create one single-field index (direction -1) per entry on the collection
// returned by getReadColl(). Each entry is {indexed field, index name}.
// All calls share one try block, so the first failure is printed and the
// remaining indexes are not attempted.
String[][] indexSpecs = {
    {"count", "count_desc"},
    {"time", "time_desc"},
    {"rank", "rank_desc"},
    {"name", "name_desc"},
    {"type", "type_desc"}
};
try {
    for (String[] indexSpec : indexSpecs) {
        DBObject indexKeys = new BasicDBObject(indexSpec[0], -1);
        DBObject indexOptions = new BasicDBObject("name", indexSpec[1]);
        super.getReadColl().createIndex(indexKeys, indexOptions);
    }
} catch (Exception e) {
    e.printStackTrace();
}
}
/**
......
......@@ -7,6 +7,7 @@ import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
......@@ -23,6 +24,26 @@ public class WeiboTopicDAO extends MongoDBTemplate{
String month = time.substring(5,7);
String collName = Config.topicCollName + year + "_" + month;
super.setCollName(collName);
// Create one single-field index (direction -1) per entry on the collection
// returned by getReadColl(). Each entry is {indexed field, index name}.
// All calls share one try block, so the first failure is printed and the
// remaining indexes are not attempted.
String[][] indexSpecs = {
    {"score_num", "score_desc"},
    {"time", "time_desc"},
    {"rank", "rank_desc"},
    {"name", "name_desc"},
    {"type", "type_desc"}
};
try {
    for (String[] indexSpec : indexSpecs) {
        DBObject indexKeys = new BasicDBObject(indexSpec[0], -1);
        DBObject indexOptions = new BasicDBObject("name", indexSpec[1]);
        super.getReadColl().createIndex(indexKeys, indexOptions);
    }
} catch (Exception e) {
    e.printStackTrace();
}
}
/**
......
......@@ -28,15 +28,15 @@ public class HotSearchListTest{
ServerAddress address = new ServerAddress(Config.mongoIp, Config.mongoPort);
Mongo mongo = new MongoClient(address, Arrays.asList(credential));
DB db = mongo.getDB("NetWork");
DBCollection coll = db.getCollection("weibo_hotsearch2018_10");
DB db = mongo.getDB("hot_search_list");
DBCollection coll = db.getCollection("hot_search_list2019_09");
MongoCredential credentialNew = MongoCredential.createCredential("datapush", "admin", "4d8ce5c42073c".toCharArray());
ServerAddress addressNew = new ServerAddress(Config.mongoIp, Config.mongoPort);
Mongo mongoNew = new MongoClient(address, Arrays.asList(credentialNew));
DB dbNew = mongoNew.getDB("hot_search_list");
Map<String,String> timLine = TimeParse.getTimeMap("2018-02-01 00:00:00", "2019-04-30 23:59:59", "MM", 1);
Map<String,String> timLine = TimeParse.getTimeMap("2019-10-02 00:00:00", "2019-10-09 23:59:59", "dd", 1);
timLine.forEach((start, end) ->{
......@@ -70,37 +70,34 @@ public class HotSearchListTest{
e.printStackTrace();
}
// DBObject query = new BasicDBObject(new BasicDBObject("time",
// new BasicDBObject("$gte",startDate).append("$lte", endDate)));
// DBCursor cur = coll.find(query);
// System.out.println(query +"======="+ cur.count());
// List<DBObject> dataList = new ArrayList<>();
// int i = 0;
// while(cur.hasNext()) {
// DBObject doc = cur.next();
// DBObject zhihu = new BasicDBObject();
// zhihu.put("_id", doc.get("_id"));
// zhihu.put("name", doc.get("name"));
// zhihu.put("url", doc.get("url"));
// zhihu.put("count", doc.get("count"));
// zhihu.put("hot", doc.get("hot"));
// zhihu.put("day", doc.get("day"));
// zhihu.put("time", doc.get("time"));
// zhihu.put("changeCount", doc.get("changeCount"));
// zhihu.put("rank", doc.get("rank"));
// zhihu.put("type", HotSearchType.微博热搜.name());
//
// collNew.save(zhihu);
// dataList.add(zhihu);
// }
// System.out.println(collName +"数据量大小" +dataList.size());
// cur.close();
// Copy every document of `coll` whose "time" lies in [startDate, endDate]
// (both bounds inclusive via $gte/$lte) into `collNew`.
// `coll`, `collNew`, `collName`, `startDate` and `endDate` are declared outside this view.
DBObject query = new BasicDBObject(new BasicDBObject("time",
new BasicDBObject("$gte",startDate).append("$lte", endDate)));
System.out.println(query);
// Running count of successfully saved documents; it is still 0 when passed to skip() below.
int i = 0;
DBCursor cur = coll.find(query).skip(i);
System.out.println(query +"======="+ cur.count());
// Holds every document read from the cursor; only its size is used, in the summary line after the loop.
List<DBObject> dataList = new ArrayList<>();
while(cur.hasNext()) {
DBObject doc = cur.next();
try {
System.out.println(i+"====");
collNew.save(doc);
i++;
// coll.remove(doc);
} catch (Exception e2) {
// A failed save is printed and the loop continues; `i` is not advanced for that document.
e2.printStackTrace();
}
// Added whether or not the save succeeded, so dataList.size() counts documents read, not documents saved.
dataList.add(doc);
}
System.out.println(collName +"数据量大小" +dataList.size());
cur.close();
// if(!dataList.isEmpty()) {
// collNew.insert(dataList);
// try {
// collNew.insert(dataList);
// } catch (Exception e) {
// e.printStackTrace();
// }
// }
});
mongo.close();
}
......
......@@ -19,7 +19,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public class BaiduHotSearchRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(BaiduHotSearchRun.class);
private HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
@Override
public void run() {
......@@ -38,6 +37,7 @@ public class BaiduHotSearchRun extends Thread{
private void getHotList() {
logger.info("百度风云榜采集开始........");
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
List<HotSearchList> list = BaiDuHotSearchCrawler.baiduHotSearch();
logger.info("{}, 此轮百度风云榜采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> saveDataList = new ArrayList<>();
......
......@@ -18,8 +18,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public class DouyinHotSearchRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(DouyinHotSearchRun.class);
private HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
@Override
public void run() {
......@@ -42,6 +40,7 @@ public class DouyinHotSearchRun extends Thread{
*/
private void getHotList() {
logger.info("抖音热搜榜采集开始........");
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
List<HotSearchList> list = DouyinHotSearchCrawler.getMobileDouyinHotList();
logger.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
......
......@@ -17,8 +17,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public class SougoHotSearchRun extends Thread {
private static Logger logger = LoggerFactory.getLogger(SougoHotSearchRun.class);
private HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
@Override
public void run() {
......@@ -36,6 +34,7 @@ public class SougoHotSearchRun extends Thread {
private void getHotList() {
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
logger.info("搜狗微信采集开始........");
List<HotSearchList> list = SougoHotSearchCrawler.sougoHotSearch();
logger.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
......
......@@ -18,7 +18,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public class WeiboHotSearchRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(WeiboHotSearchRun.class);
private HotSearchListDAO weiboHotSearchDAO = new HotSearchListDAO();
@Override
public void run() {
......@@ -37,6 +36,7 @@ public class WeiboHotSearchRun extends Thread{
private void getHotList() {
logger.info("微博话题采集开始........");
HotSearchListDAO weiboHotSearchDAO = new HotSearchListDAO();
// List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearch();
List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearchByPhone();
logger.info("{}, 微博此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
......
......@@ -18,7 +18,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public class WeiboTopicRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(WeiboTopicRun.class);
private WeiboTopicDAO weiboTopicDAO = new WeiboTopicDAO();
@Override
public void run() {
......@@ -36,11 +35,13 @@ public class WeiboTopicRun extends Thread{
private void getTopicList() {
WeiboTopicDAO weiboTopicDAO = new WeiboTopicDAO();
logger.info("微博超话采集开始........");
List<WeiboTopic> list = WeiboHuatiCrawler.startCrawler();
logger.info("{}, 微博超话此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<DBObject> data = new ArrayList<>();
for(WeiboTopic topic : list){
System.out.println("topic::::"+topic);
DBObject doc = new BasicDBObject();
doc.put("_id", topic.getId());
doc.put("name", topic.getTopicName());
......
......@@ -17,7 +17,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public class ZhihuHotSearchRun extends Thread{
private static Logger logger = LoggerFactory.getLogger(ZhihuHotSearchRun.class);
private HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
@Override
public void run() {
......@@ -36,6 +35,8 @@ public class ZhihuHotSearchRun extends Thread{
private void getHotList() {
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
logger.info("知乎话题采集开始...,当前线程名字:{}", Thread.currentThread().getName());
List<HotSearchList> list = ZhihuHotSearchCrawler.getZhihuHotList();
List<HotSearchList> mobilelist = ZhihuHotSearchCrawler.getMobileZhihuHotList();
......
registry=zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182
group=hangzhou
#registry=zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182
#group=hangzhou
########################################################
#registry=zookeeper://192.168.0.36:2181
#group=local
\ No newline at end of file
registry=zookeeper://192.168.0.36:2181
group=local
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment