Commit d4cb16b0 by zhiwei

添加索引

parent 41dee457
...@@ -22,17 +22,17 @@ public class HotSearchList implements Serializable{ ...@@ -22,17 +22,17 @@ public class HotSearchList implements Serializable{
private String name; //热搜关键词 private String name; //热搜关键词
private int count; //时时热搜量 private Integer count; //时时热搜量
private boolean hot; //状态(true 为热搜; false为时时上升) private Boolean hot; //状态(true 为热搜; false为时时上升)
private String day; //天 private String day; //天
private Date time; //时间 private Date time; //时间
private int changeCount; //据上分钟变化量 private Integer changeCount; //据上分钟变化量
private int rank; //排名 private Integer rank; //排名
private String type; //分类 private String type; //分类
...@@ -40,7 +40,7 @@ public class HotSearchList implements Serializable{ ...@@ -40,7 +40,7 @@ public class HotSearchList implements Serializable{
public HotSearchList(){} public HotSearchList(){}
public HotSearchList(String url, String name, int count,boolean hot,int rank,String type){ public HotSearchList(String url, String name, Integer count,Boolean hot,Integer rank,String type){
this.id = name + "_" + new Date().getTime(); this.id = name + "_" + new Date().getTime();
this.url = url; this.url = url;
this.name = name; this.name = name;
...@@ -53,7 +53,7 @@ public class HotSearchList implements Serializable{ ...@@ -53,7 +53,7 @@ public class HotSearchList implements Serializable{
} }
public HotSearchList(String url, String name, Integer count,int rank,String type){ public HotSearchList(String url, String name, Integer count,Integer rank,String type){
this.id = name + "_" + new Date().getTime(); this.id = name + "_" + new Date().getTime();
this.url = url; this.url = url;
this.name = name; this.name = name;
...@@ -107,11 +107,11 @@ public class HotSearchList implements Serializable{ ...@@ -107,11 +107,11 @@ public class HotSearchList implements Serializable{
this.name = name; this.name = name;
} }
public int getCount() { public Integer getCount() {
return count; return count;
} }
public void setCount(int count) { public void setCount(Integer count) {
this.count = count; this.count = count;
} }
...@@ -123,11 +123,11 @@ public class HotSearchList implements Serializable{ ...@@ -123,11 +123,11 @@ public class HotSearchList implements Serializable{
this.time = time; this.time = time;
} }
public int getChangeCount() { public Integer getChangeCount() {
return changeCount; return changeCount;
} }
public void setChangeCount(int changeCount) { public void setChangeCount(Integer changeCount) {
this.changeCount = changeCount; this.changeCount = changeCount;
} }
...@@ -135,11 +135,11 @@ public class HotSearchList implements Serializable{ ...@@ -135,11 +135,11 @@ public class HotSearchList implements Serializable{
return serialVersionUID; return serialVersionUID;
} }
public boolean isHot() { public Boolean isHot() {
return hot; return hot;
} }
public void setHot(boolean hot) { public void setHot(Boolean hot) {
this.hot = hot; this.hot = hot;
} }
...@@ -151,11 +151,11 @@ public class HotSearchList implements Serializable{ ...@@ -151,11 +151,11 @@ public class HotSearchList implements Serializable{
this.day = day; this.day = day;
} }
public int getRank() { public Integer getRank() {
return rank; return rank;
} }
public void setRank(int rank) { public void setRank(Integer rank) {
this.rank = rank; this.rank = rank;
} }
......
package com.zhiwei.searchhotcrawler.config;
import java.io.InputStream;
import java.util.Properties;
/**
 * Proxy-service settings read from the {@code proxyip.properties} classpath resource.
 *
 * <p>The resource is looked up through the current thread's context class loader and read
 * once, when this class is initialized. If the resource is missing or cannot be read, the
 * problem is reported on standard error and {@link #registry} and {@link #group} are left
 * {@code null}.
 */
public class ProxyConfig {

    /** Name of the classpath resource that holds the proxy settings. */
    private static final String RESOURCE_NAME = "proxyip.properties";

    /** Value of the {@code registry} property, or {@code null} if it could not be loaded. */
    public static String registry;

    /** Value of the {@code group} property, or {@code null} if it could not be loaded. */
    public static String group;

    static {
        // try-with-resources closes the stream even when load() throws; a null resource is
        // permitted by the language and is simply not closed.
        try (InputStream is = Thread.currentThread().getContextClassLoader()
                .getResourceAsStream(RESOURCE_NAME)) {
            if (is == null) {
                // getResourceAsStream returns null when the resource is absent; say so
                // explicitly instead of failing later with a NullPointerException.
                System.err.println("ProxyConfig: classpath resource '" + RESOURCE_NAME
                        + "' not found; registry and group are left null");
            } else {
                Properties conf = new Properties();
                conf.load(is);
                registry = conf.getProperty("registry");
                group = conf.getProperty("group");
            }
        } catch (Exception e) {
            // Deliberately broad: an exception escaping a static initializer would make the
            // whole class unusable, so loading stays best-effort as before.
            e.printStackTrace();
        }
    }
}
...@@ -13,6 +13,7 @@ import org.slf4j.Logger; ...@@ -13,6 +13,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
...@@ -38,7 +39,7 @@ public class BaiDuHotSearchCrawler { ...@@ -38,7 +39,7 @@ public class BaiDuHotSearchCrawler {
public static List<HotSearchList> baiduHotSearch() { public static List<HotSearchList> baiduHotSearch() {
String url = "http://top.baidu.com/buzz?b=1&fr=topindex"; String url = "http://top.baidu.com/buzz?b=1&fr=topindex";
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if (htmlBody != null && htmlBody.contains("mainBody")) { if (htmlBody != null && htmlBody.contains("mainBody")) {
return ansysData(htmlBody); return ansysData(htmlBody);
} else { } else {
...@@ -79,7 +80,7 @@ public class BaiDuHotSearchCrawler { ...@@ -79,7 +80,7 @@ public class BaiDuHotSearchCrawler {
} }
// 获取关键词(String) // 获取关键词(String)
String kw = element.select("td.keyword").select("a.list-title").text(); String kw = element.select("td.keyword").select("a.list-title").text();
logger.info("关键词:{}", kw); // logger.info("关键词:{}", kw);
// 获取关键词相关链接everurl(String) // 获取关键词相关链接everurl(String)
String everurl = element.select("td.keyword").select("a.list-title").attr("href"); String everurl = element.select("td.keyword").select("a.list-title").attr("href");
// 获取搜索指数count(int) // 获取搜索指数count(int)
...@@ -95,8 +96,8 @@ public class BaiDuHotSearchCrawler { ...@@ -95,8 +96,8 @@ public class BaiDuHotSearchCrawler {
if (StringUtils.isNotBlank(hot)) { if (StringUtils.isNotBlank(hot)) {
count = Integer.valueOf(hot); count = Integer.valueOf(hot);
} }
HotSearchList hotSearch = new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name());
if (Objects.nonNull(rank)) { if (Objects.nonNull(rank)) {
HotSearchList hotSearch = new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name());
list.add(hotSearch); list.add(hotSearch);
} }
} catch (Exception e) { } catch (Exception e) {
......
...@@ -11,6 +11,7 @@ import org.slf4j.LoggerFactory; ...@@ -11,6 +11,7 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
...@@ -39,7 +40,7 @@ public class DouyinHotSearchCrawler { ...@@ -39,7 +40,7 @@ public class DouyinHotSearchCrawler {
List<HotSearchList> list = null; List<HotSearchList> list = null;
String url = "https://api.amemv.com/aweme/v1/hot/search/list/"; String url = "https://api.amemv.com/aweme/v1/hot/search/list/";
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("word_list")){ if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("word_list")){
list = new ArrayList<>(); list = new ArrayList<>();
JSONObject data = JSONObject.parseObject(htmlBody); JSONObject data = JSONObject.parseObject(htmlBody);
......
...@@ -3,6 +3,7 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -3,6 +3,7 @@ package com.zhiwei.searchhotcrawler.crawler;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Objects; import java.util.Objects;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
...@@ -14,9 +15,11 @@ import org.slf4j.Logger; ...@@ -14,9 +15,11 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.httpclient.HeaderTool;
/** /**
* @ClassName:SougoHotSearch * @ClassName:SougoHotSearch
...@@ -43,7 +46,8 @@ public class SougoHotSearchCrawler { ...@@ -43,7 +46,8 @@ public class SougoHotSearchCrawler {
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
String htmlBody = null; String htmlBody = null;
try { try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string(); Map<String,String> headMap = HeaderTool.getCommonHead();
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if (htmlBody != null && htmlBody.contains("topwords")) { if (htmlBody != null && htmlBody.contains("topwords")) {
try { try {
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
...@@ -62,9 +66,8 @@ public class SougoHotSearchCrawler { ...@@ -62,9 +66,8 @@ public class SougoHotSearchCrawler {
// 获取关键词(String) // 获取关键词(String)
String kw = element.select("li").select("a").text(); String kw = element.select("li").select("a").text();
logger.info("关键词:{}", kw); // logger.info("关键词:{}", kw);
// 获取关键词相关链接everurl(String)
String everurl = element.select("li").select("a").attr("href"); String everurl = element.select("li").select("a").attr("href");
HotSearchList hotSearch = new HotSearchList(everurl, kw, null, rank, HotSearchType.搜狗微信热搜.name()); HotSearchList hotSearch = new HotSearchList(everurl, kw, null, rank, HotSearchType.搜狗微信热搜.name());
......
...@@ -16,6 +16,7 @@ import org.slf4j.LoggerFactory; ...@@ -16,6 +16,7 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
...@@ -46,7 +47,7 @@ public class WeiboHotSearchCrawler { ...@@ -46,7 +47,7 @@ public class WeiboHotSearchCrawler {
for(int i =0; i<3; i++){ for(int i =0; i<3; i++){
String htmlBody = null; String htmlBody = null;
try { try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string(); htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(htmlBody!=null && htmlBody.contains("pl_top_realtimehot")){ if(htmlBody!=null && htmlBody.contains("pl_top_realtimehot")){
try { try {
// String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0]; // String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0];
......
...@@ -11,6 +11,7 @@ import org.slf4j.LoggerFactory; ...@@ -11,6 +11,7 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
...@@ -46,7 +47,7 @@ public class ZhihuHotSearchCrawler { ...@@ -46,7 +47,7 @@ public class ZhihuHotSearchCrawler {
headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20"); headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
headerMap.put("Referer", rerferer); headerMap.put("Referer", rerferer);
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(htmlBody != null && htmlBody.contains("words")){ if(htmlBody != null && htmlBody.contains("words")){
list = new ArrayList<>(); list = new ArrayList<>();
JSONObject topSearch = JSONObject.parseObject(htmlBody); JSONObject topSearch = JSONObject.parseObject(htmlBody);
...@@ -90,7 +91,7 @@ public class ZhihuHotSearchCrawler { ...@@ -90,7 +91,7 @@ public class ZhihuHotSearchCrawler {
for(int j=0;j<3;j++){ for(int j=0;j<3;j++){
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(htmlBody != null && htmlBody.contains("author")){ if(htmlBody != null && htmlBody.contains("author")){
list = new ArrayList<>(); list = new ArrayList<>();
JSONObject topSearch = JSONObject.parseObject(htmlBody); JSONObject topSearch = JSONObject.parseObject(htmlBody);
......
...@@ -81,9 +81,11 @@ public class HotSearchListDAO extends MongoDBTemplate{ ...@@ -81,9 +81,11 @@ public class HotSearchListDAO extends MongoDBTemplate{
DBCursor cur = this.getReadColl().find(query).sort(sort).limit(1); DBCursor cur = this.getReadColl().find(query).sort(sort).limit(1);
while(cur.hasNext()){ while(cur.hasNext()){
DBObject doc = cur.next(); DBObject doc = cur.next();
if(doc.get("count")!=null) {
result = weiboHotSearch.getCount() - Integer.valueOf(doc.get("count").toString()); result = weiboHotSearch.getCount() - Integer.valueOf(doc.get("count").toString());
break; break;
} }
}
cur.close(); cur.close();
} catch (Exception e) { } catch (Exception e) {
logger.error("存储数据时出错,错误为:{}", e); logger.error("存储数据时出错,错误为:{}", e);
......
...@@ -4,7 +4,10 @@ import java.util.concurrent.Executors; ...@@ -4,7 +4,10 @@ import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.searchhotcrawler.cache.CacheListener; import com.zhiwei.searchhotcrawler.cache.CacheListener;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun; import com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.DouyinHotSearchRun; import com.zhiwei.searchhotcrawler.timer.DouyinHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.SendWeiboHotSearchRun; import com.zhiwei.searchhotcrawler.timer.SendWeiboHotSearchRun;
...@@ -33,6 +36,9 @@ public class HotSearchRun { ...@@ -33,6 +36,9 @@ public class HotSearchRun {
} }
public static void main(String[] args) { public static void main(String[] args) {
ProxyFactory.init(ProxyConfig.registry, ProxyConfig.group, GroupType.PROVIDER);
new UpdateWechatUserRun().start(); new UpdateWechatUserRun().start();
ZhiWeiTools.sleep(10000); ZhiWeiTools.sleep(10000);
new HotSearchRun().showTimer(); new HotSearchRun().showTimer();
......
...@@ -38,7 +38,7 @@ public class BaiduHotSearchRun extends Thread{ ...@@ -38,7 +38,7 @@ public class BaiduHotSearchRun extends Thread{
doc.put("time", baiduHotSearch.getTime()); doc.put("time", baiduHotSearch.getTime());
doc.put("changeCount", changeCount); doc.put("changeCount", changeCount);
doc.put("rank", baiduHotSearch.getRank()); doc.put("rank", baiduHotSearch.getRank());
doc.put("type", HotSearchType.百度热搜.name()); doc.put("type", baiduHotSearch.getType());
saveDataList.add(doc); saveDataList.add(doc);
}); });
} }
......
...@@ -36,7 +36,7 @@ public class DouyinHotSearchRun extends Thread{ ...@@ -36,7 +36,7 @@ public class DouyinHotSearchRun extends Thread{
douyin.put("time", douyinHotSearch.getTime()); douyin.put("time", douyinHotSearch.getTime());
douyin.put("changeCount", changeCount); douyin.put("changeCount", changeCount);
douyin.put("url", null); douyin.put("url", null);
douyin.put("type", HotSearchType.抖音热搜.name()); douyin.put("type", douyinHotSearch.getType());
data.add(douyin); data.add(douyin);
hotSearchDAO.addHotSearch(douyin); hotSearchDAO.addHotSearch(douyin);
} }
......
...@@ -33,7 +33,7 @@ public class SougoHotSearchRun extends Thread { ...@@ -33,7 +33,7 @@ public class SougoHotSearchRun extends Thread {
doc.put("day", sougoHotSearch.getDay()); doc.put("day", sougoHotSearch.getDay());
doc.put("time", sougoHotSearch.getTime()); doc.put("time", sougoHotSearch.getTime());
doc.put("rank", sougoHotSearch.getRank()); doc.put("rank", sougoHotSearch.getRank());
doc.put("type", HotSearchType.搜狗微信热搜.name()); doc.put("type", sougoHotSearch.getType());
data.add(doc); data.add(doc);
} }
hotSearchDAO.addHotSearchList(data); hotSearchDAO.addHotSearchList(data);
......
...@@ -37,7 +37,7 @@ public class WeiboHotSearchRun extends Thread{ ...@@ -37,7 +37,7 @@ public class WeiboHotSearchRun extends Thread{
doc.put("time", weiboHotSearch.getTime()); doc.put("time", weiboHotSearch.getTime());
doc.put("changeCount", changeCount); doc.put("changeCount", changeCount);
doc.put("rank", weiboHotSearch.getRank()); doc.put("rank", weiboHotSearch.getRank());
doc.put("type", HotSearchType.微博热搜.name()); doc.put("type", weiboHotSearch.getType());
data.add(doc); data.add(doc);
} }
weiboHotSearchDAO.addHotSearchList(data); weiboHotSearchDAO.addHotSearchList(data);
......
...@@ -36,7 +36,7 @@ public class ZhihuHotSearchRun extends Thread{ ...@@ -36,7 +36,7 @@ public class ZhihuHotSearchRun extends Thread{
zhihu.put("time", zhihuHotSearch.getTime()); zhihu.put("time", zhihuHotSearch.getTime());
zhihu.put("changeCount", 0); zhihu.put("changeCount", 0);
zhihu.put("rank", zhihuHotSearch.getRank()); zhihu.put("rank", zhihuHotSearch.getRank());
zhihu.put("type", HotSearchType.知乎热搜.name()); zhihu.put("type", zhihuHotSearch.getType());
hotSearchDAO.addHotSearch(zhihu); hotSearchDAO.addHotSearch(zhihu);
} }
logger.info("知乎话题采集结束........"); logger.info("知乎话题采集结束........");
......
...@@ -3,8 +3,9 @@ mongoIp=192.168.0.101 ...@@ -3,8 +3,9 @@ mongoIp=192.168.0.101
mongoPort=30000 mongoPort=30000
#mongoIp=192.168.0.81 #mongoIp=192.168.0.81
#mongoPort=27017 #mongoPort=27017
db.username=zzwno db.username=datapush
db.paasword=zzwno1q2w3e4r db.paasword=4d8ce5c42073c
db.certifiedDB=admin db.certifiedDB=admin
dbName=hot_search_list dbName=hot_search_list
collName=hot_search_list collName=hot_search_list
collWechatUserName=wechat_user
\ No newline at end of file
registry=zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181
group=hangzhou
########################################################
#registry=zookeeper://192.168.0.36:2181
#group=local
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment