Commit d4cb16b0 by zhiwei

添加索引

parent 41dee457
......@@ -22,17 +22,17 @@ public class HotSearchList implements Serializable{
private String name; //热搜关键词
private int count; //时时热搜量
private Integer count; //时时热搜量
private boolean hot; //状态(true 为热搜; false为时时上升)
private Boolean hot; //状态(true 为热搜; false为时时上升)
private String day; //天
private Date time; //时间
private int changeCount; //据上分钟变化量
private Integer changeCount; //据上分钟变化量
private int rank; //排名
private Integer rank; //排名
private String type; //分类
......@@ -40,7 +40,7 @@ public class HotSearchList implements Serializable{
public HotSearchList(){}
public HotSearchList(String url, String name, int count,boolean hot,int rank,String type){
public HotSearchList(String url, String name, Integer count,Boolean hot,Integer rank,String type){
this.id = name + "_" + new Date().getTime();
this.url = url;
this.name = name;
......@@ -53,7 +53,7 @@ public class HotSearchList implements Serializable{
}
public HotSearchList(String url, String name, Integer count,int rank,String type){
public HotSearchList(String url, String name, Integer count,Integer rank,String type){
this.id = name + "_" + new Date().getTime();
this.url = url;
this.name = name;
......@@ -107,11 +107,11 @@ public class HotSearchList implements Serializable{
this.name = name;
}
public int getCount() {
public Integer getCount() {
return count;
}
public void setCount(int count) {
public void setCount(Integer count) {
this.count = count;
}
......@@ -123,11 +123,11 @@ public class HotSearchList implements Serializable{
this.time = time;
}
public int getChangeCount() {
public Integer getChangeCount() {
return changeCount;
}
public void setChangeCount(int changeCount) {
public void setChangeCount(Integer changeCount) {
this.changeCount = changeCount;
}
......@@ -135,11 +135,11 @@ public class HotSearchList implements Serializable{
return serialVersionUID;
}
public boolean isHot() {
public Boolean isHot() {
return hot;
}
public void setHot(boolean hot) {
public void setHot(Boolean hot) {
this.hot = hot;
}
......@@ -151,11 +151,11 @@ public class HotSearchList implements Serializable{
this.day = day;
}
public int getRank() {
public Integer getRank() {
return rank;
}
public void setRank(int rank) {
public void setRank(Integer rank) {
this.rank = rank;
}
......
package com.zhiwei.searchhotcrawler.config;
import java.io.InputStream;
import java.util.Properties;
/**
 * Holds the settings read from the {@code proxyip.properties} classpath resource.
 *
 * <p>The values are loaded once, when this class is initialized. If the resource is
 * missing or cannot be read, the problem is reported on standard error and both
 * fields stay {@code null}; class initialization itself never fails.
 */
public class ProxyConfig {

    /** Name of the classpath resource the settings are read from. */
    private static final String RESOURCE_NAME = "proxyip.properties";

    /** Value of the {@code registry} property, or {@code null} if it could not be loaded. */
    public static String registry;

    /** Value of the {@code group} property, or {@code null} if it could not be loaded. */
    public static String group;

    static {
        // try-with-resources closes the stream even when load() throws; a null
        // resource (file not on the classpath) is simply skipped by the construct.
        try (InputStream is = Thread.currentThread().getContextClassLoader()
                .getResourceAsStream(RESOURCE_NAME)) {
            if (is == null) {
                // Report the real cause instead of the NullPointerException that
                // Properties.load(null) would otherwise raise.
                System.err.println("ProxyConfig: resource '" + RESOURCE_NAME
                        + "' not found on the classpath; registry/group left unset");
            } else {
                Properties conf = new Properties();
                conf.load(is);
                registry = conf.getProperty("registry");
                group = conf.getProperty("group");
            }
        } catch (Exception e) {
            // Deliberately broad: a failure here must not abort class initialization,
            // which would make every later reference to ProxyConfig throw.
            System.err.println("ProxyConfig: failed to load '" + RESOURCE_NAME + "'");
            e.printStackTrace();
        }
    }
}
......@@ -13,6 +13,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
......@@ -38,7 +39,7 @@ public class BaiDuHotSearchCrawler {
public static List<HotSearchList> baiduHotSearch() {
String url = "http://top.baidu.com/buzz?b=1&fr=topindex";
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string();
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if (htmlBody != null && htmlBody.contains("mainBody")) {
return ansysData(htmlBody);
} else {
......@@ -79,7 +80,7 @@ public class BaiDuHotSearchCrawler {
}
// 获取关键词(String)
String kw = element.select("td.keyword").select("a.list-title").text();
logger.info("关键词:{}", kw);
// logger.info("关键词:{}", kw);
// 获取关键词相关链接everurl(String)
String everurl = element.select("td.keyword").select("a.list-title").attr("href");
// 获取搜索指数count(int)
......@@ -95,8 +96,8 @@ public class BaiDuHotSearchCrawler {
if (StringUtils.isNotBlank(hot)) {
count = Integer.valueOf(hot);
}
HotSearchList hotSearch = new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name());
if (Objects.nonNull(rank)) {
HotSearchList hotSearch = new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name());
list.add(hotSearch);
}
} catch (Exception e) {
......
......@@ -11,6 +11,7 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
......@@ -39,7 +40,7 @@ public class DouyinHotSearchCrawler {
List<HotSearchList> list = null;
String url = "https://api.amemv.com/aweme/v1/hot/search/list/";
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string();
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("word_list")){
list = new ArrayList<>();
JSONObject data = JSONObject.parseObject(htmlBody);
......
......@@ -3,6 +3,7 @@ package com.zhiwei.searchhotcrawler.crawler;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.commons.lang3.StringUtils;
......@@ -14,9 +15,11 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.httpclient.HeaderTool;
/**
* @ClassName:SougoHotSearch
......@@ -43,7 +46,8 @@ public class SougoHotSearchCrawler {
for (int i = 0; i < 3; i++) {
String htmlBody = null;
try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string();
Map<String,String> headMap = HeaderTool.getCommonHead();
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if (htmlBody != null && htmlBody.contains("topwords")) {
try {
Document document = Jsoup.parse(htmlBody);
......@@ -62,9 +66,8 @@ public class SougoHotSearchCrawler {
// 获取关键词(String)
String kw = element.select("li").select("a").text();
logger.info("关键词:{}", kw);
// logger.info("关键词:{}", kw);
// 获取关键词相关链接everurl(String)
String everurl = element.select("li").select("a").attr("href");
HotSearchList hotSearch = new HotSearchList(everurl, kw, null, rank, HotSearchType.搜狗微信热搜.name());
......
......@@ -16,6 +16,7 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
......@@ -46,7 +47,7 @@ public class WeiboHotSearchCrawler {
for(int i =0; i<3; i++){
String htmlBody = null;
try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string();
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(htmlBody!=null && htmlBody.contains("pl_top_realtimehot")){
try {
// String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0];
......
......@@ -11,6 +11,7 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
......@@ -46,7 +47,7 @@ public class ZhihuHotSearchCrawler {
headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
headerMap.put("Referer", rerferer);
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string();
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(htmlBody != null && htmlBody.contains("words")){
list = new ArrayList<>();
JSONObject topSearch = JSONObject.parseObject(htmlBody);
......@@ -90,7 +91,7 @@ public class ZhihuHotSearchCrawler {
for(int j=0;j<3;j++){
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string();
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(htmlBody != null && htmlBody.contains("author")){
list = new ArrayList<>();
JSONObject topSearch = JSONObject.parseObject(htmlBody);
......
......@@ -81,8 +81,10 @@ public class HotSearchListDAO extends MongoDBTemplate{
DBCursor cur = this.getReadColl().find(query).sort(sort).limit(1);
while(cur.hasNext()){
DBObject doc = cur.next();
result = weiboHotSearch.getCount() - Integer.valueOf(doc.get("count").toString());
break;
if(doc.get("count")!=null) {
result = weiboHotSearch.getCount() - Integer.valueOf(doc.get("count").toString());
break;
}
}
cur.close();
} catch (Exception e) {
......
......@@ -4,7 +4,10 @@ import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.searchhotcrawler.cache.CacheListener;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.DouyinHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.SendWeiboHotSearchRun;
......@@ -33,6 +36,9 @@ public class HotSearchRun {
}
public static void main(String[] args) {
ProxyFactory.init(ProxyConfig.registry, ProxyConfig.group, GroupType.PROVIDER);
new UpdateWechatUserRun().start();
ZhiWeiTools.sleep(10000);
new HotSearchRun().showTimer();
......
......@@ -38,7 +38,7 @@ public class BaiduHotSearchRun extends Thread{
doc.put("time", baiduHotSearch.getTime());
doc.put("changeCount", changeCount);
doc.put("rank", baiduHotSearch.getRank());
doc.put("type", HotSearchType.百度热搜.name());
doc.put("type", baiduHotSearch.getType());
saveDataList.add(doc);
});
}
......
......@@ -36,7 +36,7 @@ public class DouyinHotSearchRun extends Thread{
douyin.put("time", douyinHotSearch.getTime());
douyin.put("changeCount", changeCount);
douyin.put("url", null);
douyin.put("type", HotSearchType.抖音热搜.name());
douyin.put("type", douyinHotSearch.getType());
data.add(douyin);
hotSearchDAO.addHotSearch(douyin);
}
......
......@@ -33,7 +33,7 @@ public class SougoHotSearchRun extends Thread {
doc.put("day", sougoHotSearch.getDay());
doc.put("time", sougoHotSearch.getTime());
doc.put("rank", sougoHotSearch.getRank());
doc.put("type", HotSearchType.搜狗微信热搜.name());
doc.put("type", sougoHotSearch.getType());
data.add(doc);
}
hotSearchDAO.addHotSearchList(data);
......
......@@ -37,7 +37,7 @@ public class WeiboHotSearchRun extends Thread{
doc.put("time", weiboHotSearch.getTime());
doc.put("changeCount", changeCount);
doc.put("rank", weiboHotSearch.getRank());
doc.put("type", HotSearchType.微博热搜.name());
doc.put("type", weiboHotSearch.getType());
data.add(doc);
}
weiboHotSearchDAO.addHotSearchList(data);
......
......@@ -36,7 +36,7 @@ public class ZhihuHotSearchRun extends Thread{
zhihu.put("time", zhihuHotSearch.getTime());
zhihu.put("changeCount", 0);
zhihu.put("rank", zhihuHotSearch.getRank());
zhihu.put("type", HotSearchType.知乎热搜.name());
zhihu.put("type", zhihuHotSearch.getType());
hotSearchDAO.addHotSearch(zhihu);
}
logger.info("知乎话题采集结束........");
......
......@@ -3,8 +3,9 @@ mongoIp=192.168.0.101
mongoPort=30000
#mongoIp=192.168.0.81
#mongoPort=27017
db.username=zzwno
db.paasword=zzwno1q2w3e4r
db.username=datapush
db.paasword=4d8ce5c42073c
db.certifiedDB=admin
dbName=hot_search_list
collName=hot_search_list
\ No newline at end of file
collName=hot_search_list
collWechatUserName=wechat_user
\ No newline at end of file
registry=zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181
group=hangzhou
########################################################
#registry=zookeeper://192.168.0.36:2181
#group=local
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment