Commit 2ea6b9c5 by zhiwei

处理知乎热搜排名有两个0的情况及添加热度值

parent 9e168863
......@@ -21,7 +21,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
/**
* @ClassName: ZhihuHotCrawler
* @Description: TODO(知乎热搜采集程序)
* @Description: 知乎热搜采集程序
* @author hero
* @date 2017年9月15日 上午10:54:31
*/
......@@ -94,14 +94,33 @@ public class ZhihuHotSearchCrawler {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(htmlBody != null && htmlBody.contains("author")){
JSONObject topSearch = JSONObject.parseObject(htmlBody);
JSONArray words = topSearch.getJSONArray("data");
JSONArray dataJson = topSearch.getJSONArray("data");
String link = null;
String displayQuery = null;
for (int i = 0; i < words.size(); i++) {
JSONObject word = words.getJSONObject(i).getJSONObject("target");
displayQuery = word.getString("title");
link = "https://www.zhihu.com/question/"+word.getLongValue("id");
HotSearchList zhihu = new HotSearchList(link, displayQuery, null, i, HotSearchType.知乎热搜.name());
Integer hotCount = null;
String hotText = null;
for (int i = 0; i < dataJson.size(); i++) {
JSONObject data = dataJson.getJSONObject(i).getJSONObject("target");
displayQuery = data.getString("title");
link = "https://www.zhihu.com/question/" + data.getLongValue("id");
hotText = dataJson.getJSONObject(i).getString("detail_text");
//Compute the numeric heat value from detail_text.
//NOTE(review): the text is presumably shaped like "1234 万热度" — confirm against the API response.
hotCount = null; //reset so a failed parse never reuses the previous entry's heat value
try {
    if(hotText != null){
        if(hotText.contains("万")){
            //万 = 10^4: keep only the number in front of the unit
            hotCount = (int)(Double.parseDouble(hotText.replaceAll("万.*", "").trim())*10000);
        }else if(hotText.contains("亿")){
            //亿 = 10^8 (the previous multiplier was 10^7)
            hotCount = (int)(Double.parseDouble(hotText.replaceAll("亿.*", "").trim())*100000000);
        }else{
            //Integer.getInteger reads a JVM system property; parseInt is what actually parses the text
            hotCount = Integer.parseInt(hotText.trim());
        }
    }
}catch (NumberFormatException e){
    //best effort: keep the entry without a heat value, but record what could not be parsed
    log.warn("解析知乎热度值失败,原始文本为:{}", hotText, e);
}
HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i+1, HotSearchType.知乎热搜.name());
log.info(zhihu);
list.add(zhihu);
}
}
......
package com.zhiwei.searchhotcrawler.dao;
import java.util.Collections;
import java.util.List;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import lombok.extern.log4j.Log4j2;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import org.bson.Document;
@Log4j2
public class WechatUserDao{
public static MongoCollection mongoCollection = MongoDBTemplate.getCollection(DBConfig.dbName, DBConfig.collWechatUserName);
/**
* 添加分组用户
* @param userlist
* @param groupName
* @param groupId
*/
public void addWechatUser(List<String> userlist, String groupName, Integer groupId){
for(int i=0; i<3; i++){
try {
Document query = new Document();
query.put("_id", groupId+"-"+groupName);
Document doc = new Document();
doc.put("_id", groupId+"-"+groupName);
doc.put("groupId", groupId);
doc.put("groupName", groupName);
doc.put("user", userlist);
mongoCollection.findOneAndReplace(query, doc);
break;
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
}
}
/**
* 根据分组名称查询分组用户
* @param group
* @return
*/
@SuppressWarnings("unchecked")
public List<String> getWechatUserByGroup(String group){
try {
Document query = new Document();
query.put("groupName", group);
Document doc = (Document) mongoCollection.find(query).first();
if(doc != null){
return (List<String>)doc.get("user");
}
} catch (Exception e) {
log.error("存储数据时出错,错误为:{}", e);
}
return Collections.emptyList();
}
}
......@@ -20,8 +20,8 @@ public class HotSearchRun {
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
ProxyFactory.init(simpleConfig);
// new UpdateWechatUserRun().start();
// ZhiWeiTools.sleep(10000);
new UpdateWechatUserRun().start();
ZhiWeiTools.sleep(10000);
// new CacheListener().startListen();
//推送程序启动
// new SendWeiboHotSearchRun().start();
......
//package com.zhiwei.searchhotcrawler.timer;
//
//import java.util.Calendar;
//import java.util.List;
//import java.util.Map;
//import java.util.Map.Entry;
//
//import lombok.extern.log4j.Log4j2;
//
//import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//@Log4j2
//public class UpdateWechatUserRun extends Thread{
// private WechatUserDao wechatUserDao = new WechatUserDao();
// @Override
// public void run() {
// log.info("开始更新用户数据");
// while(true) {
// try {
// Calendar calendar = Calendar.getInstance();
// int hour = calendar.get(Calendar.HOUR_OF_DAY);
// if(hour > 6 ){
// Map<String,Integer> groupMap = WechatCodeUtil.getAllGroupIp();
// log.info("此公众号的分组数量为:::{}", groupMap.size());
// if(!groupMap.isEmpty() && groupMap!=null){
// for(Entry<String,Integer> group : groupMap.entrySet()){
// log.info("此公众号的分组名称及IP为:::{},{}", group.getKey(), group.getValue());
// List<String> userList = WechatCodeUtil.getUserListByGroupId(group.getValue());
// log.info("{},此分组下的用户数量为::{}", group.getKey(), userList.size());
// if(userList!=null && !userList.isEmpty()){
// wechatUserDao.addWechatUser(userList, group.getKey(), group.getValue());
// }
// }
// }
// }
// ZhiWeiTools.sleep(1*60*60*1000);
// } catch (Exception e) {
// log.debug("知乎热搜推送出现问题,问题为:::{}",e.fillInStackTrace());
// ZhiWeiTools.sleep(1*60*60*1000);
// continue;
// }
// }
// }
//
//
//}
package com.zhiwei.searchhotcrawler.timer;
import java.util.Calendar;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.searchhotcrawler.dao.WechatUserDao;
import lombok.extern.log4j.Log4j2;
import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
@Log4j2
public class UpdateWechatUserRun extends Thread{
private WechatUserDao wechatUserDao = new WechatUserDao();
@Override
public void run() {
log.info("开始更新用户数据");
while(true) {
try {
Calendar calendar = Calendar.getInstance();
int hour = calendar.get(Calendar.HOUR_OF_DAY);
if(hour > 6 ){
Map<String,Integer> groupMap = WechatCodeUtil.getAllGroupIp();
log.info("此公众号的分组数量为:::{}", groupMap.size());
if(!groupMap.isEmpty() && groupMap!=null){
for(Entry<String,Integer> group : groupMap.entrySet()){
log.info("此公众号的分组名称及IP为:::{},{}", group.getKey(), group.getValue());
List<String> userList = WechatCodeUtil.getUserListByGroupId(group.getValue());
log.info("{},此分组下的用户数量为::{}", group.getKey(), userList.size());
if(userList!=null && !userList.isEmpty()){
wechatUserDao.addWechatUser(userList, group.getKey(), group.getValue());
}
}
}
}
ZhiWeiTools.sleep(1*60*60*1000);
} catch (Exception e) {
log.debug("知乎热搜推送出现问题,问题为:::{}",e.fillInStackTrace());
ZhiWeiTools.sleep(1*60*60*1000);
continue;
}
}
}
}
......@@ -33,15 +33,12 @@ public class WeiboHotSearchRun extends Thread{
/**
 * Runs one Weibo hot-search round: fetches the list via
 * WeiboHotSearchCrawler.weiboHotSearchByPhone(), logs how many entries came back,
 * hands the list to HotSearchCacheDAO.addData and passes that result on to
 * HotSearchListDAO.addHotSearchList.
 */
private void getHotList() {
    log.info("微博热搜采集开始........");
    final HotSearchListDAO listDao = new HotSearchListDAO();
    final HotSearchCacheDAO cacheDao = new HotSearchCacheDAO();

    // Alternative crawler entry point kept for reference: WeiboHotSearchCrawler.weiboHotSearch()
    final List<HotSearchList> crawled = WeiboHotSearchCrawler.weiboHotSearchByPhone();

    // A null result is reported as zero entries in the log line.
    int crawledCount = 0;
    if (crawled != null) {
        crawledCount = crawled.size();
    }
    log.info("{}, 微博此轮采集到的数据量为:{}", new Date(), Integer.valueOf(crawledCount));

    final List<Document> cached = cacheDao.addData(crawled);
    listDao.addHotSearchList(cached);
    log.info("微博热搜采集结束........");
}
}
......@@ -41,13 +41,12 @@ public class ZhihuHotSearchRun extends Thread{
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
log.info("知乎话题采集开始...,当前线程名字:{}", Thread.currentThread().getName());
List<HotSearchList> list = ZhihuHotSearchCrawler.getZhihuHotList();
List<HotSearchList> mobilelist = ZhihuHotSearchCrawler.getMobileZhihuHotList();
list.addAll(mobilelist);
// List<HotSearchList> list = ZhihuHotSearchCrawler.getZhihuHotList();
List<HotSearchList> list = ZhihuHotSearchCrawler.getMobileZhihuHotList();
log.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
log.info("知乎话题采集结束........");
// List<Document> data = hotSearchCacheDAO.addData(list);
// hotSearchDAO.addHotSearchList(data);
// log.info("知乎话题采集结束........");
}
}
#local service
#mongoUri=mongodb://searchhotcrawleruser:searchhotcrawler1q2w3e4r@202.107.192.94:30000/hot_search_list?authSource=admin&authMechanism=SCRAM-SHA-1
#mongoUri=mongodb://searchhotcrawleruser:searchhotcrawler1q2w3e4r@115.236.59.88:30000/hot_search_list?authSource=admin&authMechanism=SCRAM-SHA-1
#local
#mongoUri=mongodb://192.168.0.81:27017/istarshine_data
#mongoLocalUri=mongodb://192.168.0.81:27017/istarshine_data
#service
mongoUri=mongodb://searchhotcrawleruser:searchhotcrawler1q2w3e4r@192.168.0.101:30000,192.168.0.106:30000,192.168.0.108:30000/hot_search_list?authSource=admin&authMechanism=SCRAM-SHA-1
dbName=hot_search_list
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment