Commit 572289d9 by win 10

将百度 搜狗 抖音 热搜采集遗漏的提交

parent c21d66b0
package com.zhiwei.searchhotcrawler.bean;
import java.io.Serializable;
import java.util.Date;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * Bean describing one entry of the Sogou WeChat hot-search list.
 *
 * <p>Instances are plain mutable value holders; every field has a getter and
 * a setter. The three-argument constructor stamps the entry with the moment
 * of creation.
 */
public class SougoHotSearch implements Serializable {
    private static final long serialVersionUID = 2076919584659821600L;

    private String id;       // primary key: keyword + "_" + creation time in epoch millis
    private String url;      // main link
    private String everurl;  // related link
    private String kw;       // keyword
    private String day;      // creation day, formatted with the pattern "yyyy-MM-dd"
    private Date time;       // creation time
    private Integer rank;    // rank in the list

    /** Creates an empty entry; all fields stay {@code null} until set. */
    public SougoHotSearch() {}

    /**
     * Creates an entry stamped with the current time.
     *
     * <p>A single {@link Date} is captured once and used for the id suffix,
     * {@code time} and {@code day}, so the three values always describe the
     * same instant (separate clock reads could straddle a millisecond or
     * even a day boundary).
     *
     * @param rank    rank in the list
     * @param kw      keyword
     * @param everurl related link
     */
    public SougoHotSearch(Integer rank, String kw, String everurl) {
        Date now = new Date();
        this.id = kw + "_" + now.getTime();
        this.rank = rank;
        this.kw = kw;
        this.everurl = everurl;
        this.time = now;
        this.day = TimeParse.dateFormartString(now, "yyyy-MM-dd");
    }

    @Override
    public String toString() {
        return "new SougoHotSearch["
                + "id = " + id
                + ", url = " + url
                + ", everurl = " + everurl
                + ", kw = " + kw
                + ", day = " + day
                + ", time = " + time
                + ", rank = " + rank
                + "]";
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getEverurl() {
        return everurl;
    }

    public void setEverurl(String everurl) {
        this.everurl = everurl;
    }

    public String getKw() {
        return kw;
    }

    public void setKw(String kw) {
        this.kw = kw;
    }

    public String getDay() {
        return day;
    }

    public void setDay(String day) {
        this.day = day;
    }

    public Date getTime() {
        return time;
    }

    public void setTime(Date time) {
        this.time = time;
    }

    public Integer getRank() {
        return rank;
    }

    public void setRank(Integer rank) {
        this.rank = rank;
    }
}
package com.zhiwei.searchhotcrawler.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.SougoHotSearch;
/**
 * Crawler for the hot keywords listed on the Sogou WeChat search page.
 *
 * @author hero
 */
public class SougoHotSearchCrawler {
    private static final Logger logger = LoggerFactory.getLogger(SougoHotSearchCrawler.class);
    private static final HttpBoot httpBoot = new HttpBoot();

    /** Page whose {@code ol#topwords} list is scraped. */
    private static final String SOGOU_WEIXIN_URL = "https://weixin.sogou.com";

    /** Maximum number of fetch attempts per crawl round. */
    private static final int MAX_ATTEMPTS = 3;

    /**
     * Fetches the Sogou WeChat page and extracts its hot keywords.
     *
     * <p>The fetch is attempted up to {@value #MAX_ATTEMPTS} times; only an
     * exception while fetching triggers another attempt. A page that does
     * not contain the text {@code topwords} ends the round with no entries.
     *
     * @return the entries that carry a rank, in page order; never
     *         {@code null}. The list is empty when nothing could be fetched
     *         or parsed (a parse failure used to yield {@code null}, which
     *         callers iterating the result could not handle).
     */
    public static List<SougoHotSearch> sougoHotSearch() {
        List<SougoHotSearch> list = new ArrayList<SougoHotSearch>();
        for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
            try {
                String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(SOGOU_WEIXIN_URL)).body().string();
                if (htmlBody != null && htmlBody.contains("topwords")) {
                    collectTopWords(htmlBody, list);
                } else {
                    logger.info("解析搜狗微信时出现解析错误,页面结构有问题");
                }
                // A response was received: do not fetch again, whatever the parse outcome.
                break;
            } catch (Exception e) {
                logger.error("解析搜狗微信时出现解析错误,页面结构有问题", e);
            }
        }
        // The "{}" placeholder was missing, so the size was silently dropped from the log.
        logger.info("此轮采集的数据量为:{}", Integer.valueOf(list.size()));
        return list;
    }

    /**
     * Parses every {@code li} below {@code ol#topwords} and appends the ranked entries to {@code sink}.
     *
     * <p>Failures are logged and contained here so that they never reach the
     * fetch loop: a retry after a partial parse would append duplicates.
     */
    private static void collectTopWords(String htmlBody, List<SougoHotSearch> sink) {
        try {
            Document document = Jsoup.parse(htmlBody);
            Elements elements = document.select("ol#topwords").select("li");
            for (Element element : elements) {
                try {
                    SougoHotSearch hotSearch = parseEntry(element);
                    if (hotSearch != null) {
                        sink.add(hotSearch);
                    }
                } catch (Exception e) {
                    // One malformed item must not discard the rest of the list.
                    logger.error("解析搜狗微信时出现解析错误", e);
                }
            }
        } catch (Exception e) {
            // Log the exception itself; fillInStackTrace() would overwrite where it was thrown.
            logger.error("解析搜狗微信时出现解析错误,数据不是json结构", e);
        }
    }

    /**
     * Builds one entry from a list item.
     *
     * @return the entry, or {@code null} when the item has no rank text
     * @throws NumberFormatException when the rank text is not an integer
     */
    private static SougoHotSearch parseEntry(Element element) {
        Elements item = element.select("li");

        // Rank: text of the <i> children; text() yields "" when there are none.
        String rankStr = item.select("i").text();

        // Keyword and related link both come from the <a> children.
        Elements anchors = item.select("a");
        String kw = anchors.text();
        logger.info("关键词:{}", kw);

        if (StringUtils.isBlank(rankStr)) {
            return null;
        }
        Integer rank = Integer.valueOf(rankStr);
        String everurl = anchors.attr("href");
        return new SougoHotSearch(rank, kw, everurl);
    }
}
package com.zhiwei.searchhotcrawler.dao;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.WriteConcern;
import com.zhiwei.searchhotcrawler.bean.DouyinHotSearch;
import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Data access object for the Douyin hot-search collection
 * ({@code Config.dbName} / {@code Config.collDouyinName}).
 */
public class DouyinHotSearchDAO extends MongoDBTemplate {

    public DouyinHotSearchDAO() {
        super();
        super.setDbName(Config.dbName);
        super.setCollName(Config.collDouyinName);
    }

    /**
     * Inserts one document, trying up to three times.
     *
     * <p>Failures do not propagate to the caller, but each one is now
     * printed instead of being silently dropped, so a document that could
     * never be stored leaves a trace.
     *
     * @param douyin document to insert
     */
    @SuppressWarnings("deprecation")
    public void addDouyinHotSearch(DBObject douyin) {
        for (int i = 0; i < 3; i++) {
            try {
                this.getReadColl().insert(douyin, WriteConcern.SAFE);
                ZhiWeiTools.sleep(200);
                break;
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Computes how much the hot value changed since the last stored record
     * of the same word.
     *
     * <p>The most recent stored document (sorted by {@code time} descending)
     * whose {@code word} equals the given entry's word is looked up, and its
     * {@code hot_value} is subtracted from the entry's hot value.
     *
     * @author hero
     * @param douyinHotSearch freshly crawled entry
     * @return the difference, or {@code 0} when no earlier record exists or
     *         the lookup fails
     */
    public int getChangeCount(DouyinHotSearch douyinHotSearch) {
        int result = 0;
        DBObject query = new BasicDBObject();
        query.put("word", douyinHotSearch.getWord());
        DBObject sort = new BasicDBObject();
        sort.put("time", -1);

        DBCursor cur = null;
        try {
            cur = this.getReadColl().find(query).sort(sort).limit(1);
            if (cur.hasNext()) {
                DBObject doc = cur.next();
                result = douyinHotSearch.getHot_value() - Integer.valueOf(doc.get("hot_value").toString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close on every path; previously an exception above left the cursor open.
            if (cur != null) {
                try {
                    cur.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        return result;
    }

    // Disabled: getDouyinHotSearch, which returned the documents of the last hour.
    // (The original header for it was an unfilled template placeholder.)
    // public List<DBObject> getDouyinHotSearch(){
    // List<DBObject> list = null;
    // try {
    // Date date = new Date((new Date().getTime()-60*60*1000));
    // DBObject query = new BasicDBObject();
    // query.put("time", new BasicDBObject("$gte", date));
    //
    // long count = this.getReadColl().count(query);
    // if(count>0){
    // list = new ArrayList<DBObject>();
    // DBCursor cur = this.getReadColl().find(query);
    // while(cur.hasNext()){
    // DBObject doc = cur.next();
    // list.add(doc);
    // }
    // cur.close();
    // }
    // return list;
    // } catch (Exception e) {
    // e.printStackTrace();
    // return list;
    // }
    // }
}
package com.zhiwei.searchhotcrawler.dao;
import java.util.List;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.config.Config;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Data access object for the Sogou hot-search collection
 * ({@code Config.dbName} / {@code Config.collSougoName}).
 */
public class SougoHotSearchDAO extends MongoDBTemplate {

    public SougoHotSearchDAO() {
        super();
        super.setDbName(Config.dbName);
        // Date date = new Date();
        // String time = TimeParse.dateFormartString(date, "yyyy");
        // if(Calendar.MONTH<6){
        // collWeiboName = Config.collWeiboName + time+"_01";
        // }else{
        // collWeiboName = Config.collWeiboName + time+"_06";
        // }
        // System.out.println("collWeiboName========="+collWeiboName);
        super.setCollName(Config.collSougoName);
    }

    /**
     * Inserts a batch of documents, trying up to three times.
     *
     * <p>A {@code null} or empty batch is ignored: there is nothing to
     * store, and handing it to the driver would only produce up to three
     * failed attempts. NOTE(review): the Mongo Java driver is expected to
     * reject an empty document list; confirm against the driver version in use.
     *
     * <p>Failures do not propagate to the caller; each one is printed.
     *
     * @author hero
     * @param list documents to insert
     */
    public void addSougoSearch(List<DBObject> list) {
        if (list == null || list.isEmpty()) {
            return;
        }
        for (int i = 0; i < 3; i++) {
            try {
                this.getReadColl().insert(list);
                ZhiWeiTools.sleep(200);
                break;
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    // Disabled: getChangeCount, which looked up the change since the last stored record.
    // public int getChangeCount(SougoHotSearch sougoHotSearch){
    // int result = 0;
    // DBObject query = new BasicDBObject();
    // query.put("kw", sougoHotSearch.getKw());
    // DBObject sort = new BasicDBObject();
    // sort.put("time", -1);
    // try {
    // DBCursor cur = this.getReadColl().find(query).sort(sort).limit(1);
    // while(cur.hasNext()){
    // DBObject doc = cur.next();
    // result = sougoHotSearch.getCount() - Integer.valueOf(doc.get("count").toString());
    // break;
    // }
    // cur.close();
    // } catch (Exception e) {
    // e.printStackTrace();
    // return result;
    // }
    // return result;
    // }
}
package com.zhiwei.searchhotcrawler.timer;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.DouyinHotSearch;
import com.zhiwei.searchhotcrawler.crawler.DouyinHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.DouyinHotSearchDAO;
/**
 * Thread that runs one Douyin hot-search crawl round and stores every
 * collected entry, together with its change since the previous record.
 */
public class DouyinHotSearchRun extends Thread {
    private static final Logger logger = LoggerFactory.getLogger(DouyinHotSearchRun.class);

    private final DouyinHotSearchDAO douyinHotSearchDAO = new DouyinHotSearchDAO();

    /**
     * Crawls the list, then inserts one document per entry.
     *
     * <p>A {@code null} crawl result is treated as an empty round; the
     * loop previously dereferenced it and killed the thread with a
     * {@link NullPointerException} before the closing log line.
     */
    @Override
    public void run() {
        logger.info("抖音热搜榜采集开始........");
        List<DouyinHotSearch> list = DouyinHotSearchCrawler.getMobileDouyinHotList();
        logger.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
        if (list != null) {
            for (DouyinHotSearch douyinHotSearch : list) {
                // Must be read before the insert below, otherwise the new record would be compared with itself.
                int changeCount = douyinHotSearchDAO.getChangeCount(douyinHotSearch);
                DBObject douyin = new BasicDBObject();
                douyin.put("_id", douyinHotSearch.getId());
                douyin.put("word", douyinHotSearch.getWord());
                douyin.put("position", douyinHotSearch.getPosition());
                douyin.put("hot_value", douyinHotSearch.getHot_value());
                // douyin.put("url", douyinHotSearch.getUrl());
                douyin.put("time", douyinHotSearch.getTime());
                douyin.put("changeCount", changeCount);
                douyinHotSearchDAO.addDouyinHotSearch(douyin);
            }
        }
        logger.info("抖音热搜榜采集结束........");
    }
}
package com.zhiwei.searchhotcrawler.timer;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.searchhotcrawler.bean.SougoHotSearch;
import com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.SougoHotSearchDAO;
/**
 * Thread that runs one Sogou WeChat hot-search crawl round and stores the
 * collected entries as a single batch.
 */
public class SougoHotSearchRun extends Thread {
    private static final Logger logger = LoggerFactory.getLogger(SougoHotSearchRun.class);

    private final SougoHotSearchDAO sougoHotSearchDAO = new SougoHotSearchDAO();

    /**
     * Crawls the list, converts every entry to a document and inserts the batch.
     *
     * <p>A {@code null} crawl result is treated as an empty round (the loop
     * previously dereferenced it), and an empty batch is not sent to the
     * DAO at all.
     */
    @Override
    public void run() {
        logger.info("搜狗微信采集开始........");
        List<SougoHotSearch> list = SougoHotSearchCrawler.sougoHotSearch();
        logger.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
        List<DBObject> data = new ArrayList<DBObject>();
        if (list != null) {
            for (SougoHotSearch sougoHotSearch : list) {
                // int changeCount = baiduHotSearchDAO.getChangeCount(sougoHotSearch);
                DBObject doc = new BasicDBObject();
                doc.put("_id", sougoHotSearch.getId());
                doc.put("kw", sougoHotSearch.getKw());
                doc.put("everurl", sougoHotSearch.getEverurl());
                // doc.put("count", baiduHotSearch.getCount());
                doc.put("day", sougoHotSearch.getDay());
                doc.put("time", sougoHotSearch.getTime());
                // doc.put("changeCount", changeCount);
                doc.put("rank", sougoHotSearch.getRank());
                data.add(doc);
            }
        }
        if (!data.isEmpty()) {
            sougoHotSearchDAO.addSougoSearch(data);
        }
        logger.info("搜狗微信采集结束........");
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment