Commit 241bc05a by chenweitao

Merge remote-tracking branch 'origin/working' into working

parents eb71665b d544547c
......@@ -25,4 +25,5 @@ public enum HotSearchType {
B站热搜,
人气榜36,
虎嗅热文推荐,
快手热榜,
}
......@@ -88,7 +88,10 @@ public class WeiBoMassage implements Serializable {
* 话题
*/
private String topic;
/**
* 头像地址
*/
private String profileImageUrl;
//是否转发
private Integer forward;
//转发 源微博mid
......@@ -110,7 +113,7 @@ public class WeiBoMassage implements Serializable {
public WeiBoMassage(String userId, String text, String userName, String mid,
Date creatTime, Date editTime, Integer cardType, Integer showType, Long repostCount,
Long commentCount, Long attitudeCount, String source, String type, String topic) {
Long commentCount, Long attitudeCount, String source, String type, String topic, String profileImageUrl) {
this.id =mid+"_"+HotSearchType.微博热搜.name()+"_"+topic;
this.userId = userId;
this.text = text;
......@@ -126,6 +129,8 @@ public class WeiBoMassage implements Serializable {
this.source = source;
this.type = type;
this.topic = topic;
this.profileImageUrl = profileImageUrl;
}
}
......@@ -48,10 +48,15 @@ public class WeiBoUser implements Serializable {
* 粉丝数
*/
private Long followerCount;
/**
* 头像地址
*/
private String profileImageUrl;
public WeiBoUser() {
}
public WeiBoUser(String userId, String attestationMassage, String userName,String topic,Date time,Long followerCount) {
public WeiBoUser(String userId, String attestationMassage, String userName,String topic,Date time,Long followerCount,String profileImageUrl) {
this.id = userId+"_"+HotSearchType.微博热搜.name()+"_"+topic;
this.userId = userId;
......@@ -60,6 +65,7 @@ public class WeiBoUser implements Serializable {
this.topic=topic;
this.time=time;
this.followerCount=followerCount;
this.profileImageUrl = profileImageUrl;
}
}
......@@ -12,6 +12,7 @@ import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -21,115 +22,158 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
/**
* @author hero
* @ClassName:BaiDuHotSearch
* @Description: TODO(百度风云榜热搜采集)
* @author hero
* @date 2019年7月10日 上午10:54:31
*/
@Log4j2
public class BaiDuHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/**
 * Downloads the Baidu hot-search board page and turns it into hot-search entries.
 *
 * <p>The page is only handed to {@code ansysNewData} when it contains the
 * {@code container-bg_lQ801} marker; otherwise an informational message is logged.
 *
 * @param date timestamp forwarded to the parser for the created entries
 * @return the parsed entries, or an empty list when the request failed or the marker is missing
 */
public static List<HotSearchList> baiduHotSearch(Date date) {
    final Request request = RequestUtils.wrapGet("http://top.baidu.com/buzz?b=1&fr=topindex");
    String htmlBody = null;
    try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
        htmlBody = response.body().string();
    } catch (Exception e) {
        log.error("解析百度风云榜时出现解析错误,页面结构有问题", e);
    }
    // Guard clause: bail out when there is no body or the expected page marker is absent.
    if (htmlBody == null || !htmlBody.contains("container-bg_lQ801")) {
        log.info("解析百度风云榜时出现解析错误,页面结构有问题");
        return Collections.emptyList();
    }
    return ansysNewData(htmlBody, date);
}
/**
* 更新解析
*
* @param htmlBody
* @param date
* @return
*/
/**
 * Requests the PC version of the Baidu hot-search board and parses the table layout.
 *
 * <p>Parsing is delegated to {@code ansysData} only when the response contains the
 * {@code mainBody} marker.
 *
 * @param date timestamp forwarded to the parser for the created entries
 * @return the parsed entries, or an empty list when the request failed or the marker is missing
 */
public static List<HotSearchList> baiduHotSearch(Date date) {
    String pageUrl = "http://top.baidu.com/buzz?b=1&fr=topindex";
    Request pageRequest = RequestUtils.wrapGet(pageUrl);
    String htmlBody = null;
    try (Response pageResponse = httpBoot.syncCall(pageRequest, ProxyHolder.NAT_HEAVY_PROXY)) {
        htmlBody = pageResponse.body().string();
    } catch (Exception e) {
        log.error("解析百度风云榜时出现解析错误,页面结构有问题", e);
    }
    boolean looksLikeBoardPage = htmlBody != null && htmlBody.contains("mainBody");
    if (looksLikeBoardPage) {
        return ansysData(htmlBody, date);
    }
    log.info("解析百度风云榜时出现解析错误,页面结构有问题");
    return Collections.emptyList();
}
/**
 * Parses the card-based layout of the Baidu hot-search board.
 *
 * <p>Every {@code div.category-wrap_iQLoo} element is treated as one entry. An entry whose
 * rank or hot index cannot be parsed is logged and skipped; the remaining entries are kept.
 *
 * @param htmlBody raw HTML of the board page
 * @param date     timestamp stored on every created entry
 * @return the entries that could be parsed, possibly empty
 */
private static List<HotSearchList> ansysNewData(String htmlBody, Date date) {
    List<HotSearchList> result = new ArrayList<>();
    try {
        Elements cards = Jsoup.parse(htmlBody).select("div.category-wrap_iQLoo");
        if (Objects.isNull(cards) || cards.isEmpty()) {
            return result;
        }
        for (Element card : cards) {
            try {
                // Rank shown on the thumbnail.
                Integer rank = Integer.valueOf(
                        card.select("a.img-wrapper_29V76").select("div.index_1Ew5p").text());
                // Title: only the text before the first space is kept.
                // NOTE(review): presumably strips a trailing tag label; a title that itself
                // contains a space would be truncated - confirm against the page markup.
                String title = card.select("a.title_dIF3B").text().split(" ")[0];
                // Link of the entry.
                String link = card.select("div.content_1YWBm").select("a.title_dIF3B").attr("href");
                // Short description of the entry.
                String lead = card.select("div.small_Uvkd3").text();
                // Search index.
                Long count = Long.valueOf(card.select("div.hot-index_1Bl1a").text());

                HotSearchList entry =
                        new HotSearchList(link, title, count, rank, HotSearchType.百度热搜.name(), date);
                entry.setTopicLead(lead);
                result.add(entry);
            } catch (Exception e) {
                log.error("解析百度风云榜时出现解析错误", e);
            }
        }
    } catch (Exception e) {
        log.error("解析百度风云榜时出现解析错误,数据不是json结构", e);
    }
    return result;
}
/**
* 解析数据
* @param htmlBody
* @return
*/
private static List<HotSearchList> ansysData(String htmlBody,Date date){
List<HotSearchList> list = new ArrayList<>();
try {
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("table.list-table").select("tr");
if (Objects.nonNull(elements) && !elements.isEmpty()) {
elements.forEach(element -> {
try {
// 获取排名rank
String rankStr = null;
// 根据网页标签,给rankStr做判断
if (!element.select("td.first").select("span.num-top").isEmpty()) {
rankStr = element.select("td.first").select("span.num-top").text();
} else if (!element.select("td.first").select("span.num-normal").isEmpty()) {
rankStr = element.select("td.first").select("span.num-normal").text();
}
Integer rank = null;
// 判断rankStr是否为空
if (StringUtils.isNoneBlank(rankStr)) {
rank = Integer.valueOf(rankStr);
}
/**
* 解析数据
*
* @param htmlBody
* @return
*/
private static List<HotSearchList> ansysData(String htmlBody, Date date) {
List<HotSearchList> list = new ArrayList<>();
try {
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("table.list-table").select("tr");
if (Objects.nonNull(elements) && !elements.isEmpty()) {
elements.forEach(element -> {
try {
// 获取排名rank
String rankStr = null;
// 根据网页标签,给rankStr做判断
if (!element.select("td.first").select("span.num-top").isEmpty()) {
rankStr = element.select("td.first").select("span.num-top").text();
} else if (!element.select("td.first").select("span.num-normal").isEmpty()) {
rankStr = element.select("td.first").select("span.num-normal").text();
}
Integer rank = null;
// 判断rankStr是否为空
if (StringUtils.isNoneBlank(rankStr)) {
rank = Integer.valueOf(rankStr);
}
// 获取关键词相关链接everurl(String)
String everurl = element.select("td.keyword").select("a.list-title").attr("href");
// 获取关键词(String)
String kw = element.select("td.keyword").select("a.list-title").text();
// 获取关键词相关链接everurl(String)
String everurl = element.select("td.keyword").select("a.list-title").attr("href");
// 获取关键词(String)
String kw = element.select("td.keyword").select("a.list-title").text();
// logger.info("关键词:{}", kw);
//从连接中获取正确编码关键词
try{
if (!everurl.isEmpty()){
kw = URLDecoder.decode(everurl.substring(everurl.indexOf("&wd=")+4).split("&")[0], "GB2312" );
}
}catch (Exception e1){
log.error("解析百度风云榜,地址",e1);
}
// 获取搜索指数count(int)
String hot = null;
// 判断热度值所在的规则是否为null
if (!element.select("td.last").select("span.icon-fall").isEmpty()) {
hot = element.select("td.last").select("span.icon-fall").text();
} else if (!element.select("td.last").select("span.icon-rise").isEmpty()) {
hot = element.select("td.last").select("span.icon-rise").text();
}
else if (!element.select("td.last").select("span.icon-fair").isEmpty()) {
hot = element.select("td.last").select("span.icon-fair").text();
}
long count = 0;
// 判断hot是否为空
if (StringUtils.isNotBlank(hot)) {
count = Integer.valueOf(hot);
}
if (Objects.nonNull(rank)) {
if(count == 0){
log.info(htmlBody);
log.info(hot);
log.info(element);
} else {
HotSearchList hotSearch = new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name(),date);
list.add(hotSearch);
}
}
} catch (Exception e) {
log.error("解析百度风云榜时出现解析错误", e);
}
});
}
} catch (Exception e) {
log.error("解析百度风云榜时出现解析错误,数据不是json结构", e);
}
return list;
}
//从连接中获取正确编码关键词
try {
if (!everurl.isEmpty()) {
kw = URLDecoder.decode(everurl.substring(everurl.indexOf("&wd=") + 4).split("&")[0], "GB2312");
}
} catch (Exception e1) {
log.error("解析百度风云榜,地址", e1);
}
// 获取搜索指数count(int)
String hot = null;
// 判断热度值所在的规则是否为null
if (!element.select("td.last").select("span.icon-fall").isEmpty()) {
hot = element.select("td.last").select("span.icon-fall").text();
} else if (!element.select("td.last").select("span.icon-rise").isEmpty()) {
hot = element.select("td.last").select("span.icon-rise").text();
} else if (!element.select("td.last").select("span.icon-fair").isEmpty()) {
hot = element.select("td.last").select("span.icon-fair").text();
}
long count = 0;
// 判断hot是否为空
if (StringUtils.isNotBlank(hot)) {
count = Integer.valueOf(hot);
}
if (Objects.nonNull(rank)) {
if (count == 0) {
log.info(htmlBody);
log.info(hot);
log.info(element);
} else {
HotSearchList hotSearch = new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name(), date);
list.add(hotSearch);
}
}
} catch (Exception e) {
log.error("解析百度风云榜时出现解析错误", e);
}
});
}
} catch (Exception e) {
log.error("解析百度风云榜时出现解析错误,数据不是json结构", e);
}
return list;
}
}
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import java.time.Duration;
import java.util.*;
/**
* @author ll
* @ClassName:KuaiShouHotSearchCrawlerTeat
* @Description:快手采集
* @date 2021年6月10日 下午5:54:31
*/
@Log4j2
public class KuaiShouHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/**
* @return void 返回类型
* @Title: hotSearch36KrCrawler
* @author hero
* @Description: PC端36Kr人气榜采集
*/
public static List<HotSearchList> KuaiShouHotSearchCrawler(Date date) {
String url = "https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
log.error("解析快手热榜时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("APOLLO_STATE")) {
return ansysData(htmlBody,date);
} else {
log.info("解析快手热榜时出现解析错误,页面结构有问题");
}
return Collections.emptyList();
}
private static List<HotSearchList> ansysData(String htmlBody, Date date) {
List<HotSearchList> list= new ArrayList<>();
JSONObject jsonObject = null;
try {
String substring = htmlBody.substring(htmlBody.indexOf("homexxunknown")+15, htmlBody.indexOf("homexxfilmcomlist")+18);
String sub = "{"+substring.substring(substring.indexOf("VisionHotRankResult") + 22, substring.indexOf("llsid") - 2)+"}}";
String substring1 = sub.substring(0,sub.indexOf("$ROOT_QUERY.visionMovieRank") - 2)+"}";
jsonObject = JSONObject.parseObject(substring1);
//获取每个jsonObject对象的值
Collection<Object> values = jsonObject.values();
for (Object value : values) {
try {
JSONObject object = (JSONObject)JSONObject.toJSON(value);
//获取话题名
String name = object.getString("name");
//排名
Integer rank = object.getInteger("rank");
String hotValue = object.getString("hotValue");
String[] ws = hotValue.split("w");
//热度
Double d = Double.valueOf(ws[0])*10000;
long hot = d.longValue();
//话题链接
String url = object.getString("poster");
//标签类型
String tagType =null;
if (object.containsKey("tagType")){
tagType = object.getString("tagType");
}
HotSearchList hotSearchList = new HotSearchList(url,name,hot,true,rank, HotSearchType.快手热榜.name(),tagType,date);
list.add(hotSearchList);
} catch (NumberFormatException e) {
log.error("解析快手热榜时出现解析错误",e);
}
}
} catch (NumberFormatException e) {
log.error("解析快手热榜时出现解析错误,数据不是json结构",e);
}
return list;
}
}
......@@ -373,8 +373,9 @@ public class WeiboHotSearchCrawler {
String[] split = followers_count.split("万");
followerCount = Long.valueOf(split[0])*10000;
}
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount);
//用户头像地址
String profileImageUrl = users.getJSONObject(i1).getString("profile_image_url");
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount,profileImageUrl);
weiBoUserList.add(weiBoUser);
}
}
......@@ -397,8 +398,9 @@ public class WeiboHotSearchCrawler {
}else {
followerCount = Long.valueOf(followers_count);
}
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount);
//用户头像地址
String profileImageUrl = user.getString("profile_image_url");
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount,profileImageUrl);
weiBoUserList.add(weiBoUser);
}
return weiBoUserList;
......@@ -476,6 +478,8 @@ public class WeiboHotSearchCrawler {
String userName = mblog.getJSONObject("user").getString("screen_name");
//来源
String source = mblog.getString("source");
//用户头像地址
String profileImageUrl = mblog.getJSONObject("user").getString("profile_image_url");
//内容
String content = null;
if (mblog.getString("text").contains("<")) {
......@@ -488,7 +492,7 @@ public class WeiboHotSearchCrawler {
}
WeiBoMassage weiBoMassage = new WeiBoMassage(userId, content, userName, mid, createTime, editTime, cardType, showType,
repostCount, commentCount, attitudeCount, source, type, topic);
repostCount, commentCount, attitudeCount, source, type, topic,profileImageUrl);
//默认不转发为0
weiBoMassage.setForward(0);
......
......@@ -54,6 +54,9 @@ public class HotSearchCacheDAO {
if("虎嗅热文推荐".equals(hotSearch.getType())){
document.put("comment_count", hotSearch.getCommentCount());
}
if("百度热搜".equals(hotSearch.getType())){
document.put("topic_lead", hotSearch.getTopicLead());
}
if("腾讯较真榜".equals(hotSearch.getType())){
document.put("topic_result",hotSearch.getTopicResult());
......@@ -65,6 +68,9 @@ public class HotSearchCacheDAO {
document.put("pictureUrl",hotSearch.getPictureUrl());
}
addAndUpdateData(document);
if("百度热搜".equals(hotSearch.getType())){
document.remove("topic_lead");
}
dataes.add(document);
});
return dataes;
......
......@@ -49,6 +49,7 @@ public class WeiBoMassageDao {
document.put("repostCount",weiBoMassage.getRepostCount());
document.put("commentCount",weiBoMassage.getCommentCount());
document.put("attitudeCount",weiBoMassage.getAttitudeCount());
document.put("profileImageUrl",weiBoMassage.getProfileImageUrl());
if (Objects.nonNull(weiBoMassage.getPlayCount())){
document.put("playCount",weiBoMassage.getPlayCount());
}
......
......@@ -43,6 +43,7 @@ public class WeiBoUserDao {
document.put("topic",weiBoUser.getTopic());
document.put("time",weiBoUser.getTime());
document.put("followerCount",weiBoUser.getFollowerCount());
document.put("profileImageUrl",weiBoUser.getProfileImageUrl());
try {
mongoCollection.insertOne(document);
} catch (Exception e) {
......
package com.zhiwei.searchhotcrawler.test;
import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import java.text.ParseException;
/**
 * Manual entry point: initialises the proxy factory so that one of the crawler threads
 * listed below can be started by un-commenting it.
 */
public class HotSearchRunTest {

    public static void main(String[] args) throws ParseException {
        ProxyFactory.init(
                SimpleConfig.builder()
                        .registry(ProxyConfig.registry)
                        .group(ProxyConfig.group)
                        .appId(10000013)
                        .appName("hotsearch")
                        .build());
        // Start the Weibo hot-search collection:
        // new WeiboHotSearchRun().start();
        // Start the Kuaishou hot-ranking collection:
        // new KuaiShouHotSearchRun().start();
    }
}
package com.zhiwei.searchhotcrawler.test;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import java.time.Duration;
import java.util.*;
/**
* @author ll
* @ClassName:KuaiShouHotSearchCrawlerTeat
* @Description:
* @date 2021年6月10日 下午5:54:31
*/
@Log4j2
public class KuaiShouHotSearchCrawlerTest {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/**
* @return void 返回类型
* @Title: hotSearch36KrCrawler
* @author hero
* @Description: PC端36Kr人气榜采集
*/
public static List<HotSearchList> KuaiShouHotSearchCrawler(Date date) {
String url = "https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
log.error("解析快手热榜时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("APOLLO_STATE")) {
return ansysData(htmlBody,date);
} else {
log.info("解析快手热榜时出现解析错误,页面结构有问题");
}
return Collections.emptyList();
}
private static List<HotSearchList> ansysData(String htmlBody, Date date) {
List<HotSearchList> list= new ArrayList<>();
JSONObject jsonObject = null;
try {
String substring = htmlBody.substring(htmlBody.indexOf("homexxunknown")+15, htmlBody.indexOf("homexxfilmcomlist")+18);
String sub = "{"+substring.substring(substring.indexOf("VisionHotRankResult") + 22, substring.indexOf("llsid") - 2)+"}}";
String substring1 = sub.substring(0,sub.indexOf("$ROOT_QUERY.visionMovieRank") - 2)+"}";
jsonObject = JSONObject.parseObject(substring1);
//获取每个jsonObject对象的值
Collection<Object> values = jsonObject.values();
for (Object value : values) {
try {
JSONObject object = (JSONObject)JSONObject.toJSON(value);
//获取话题名
String name = object.getString("name");
//排名
Integer rank = object.getInteger("rank");
String hotValue = object.getString("hotValue");
String[] ws = hotValue.split("w");
//热度
Double d = Double.valueOf(ws[0])*10000;
long hot = d.longValue();
//话题链接
String url = object.getString("poster");
//标签类型
String tagType =null;
if (object.containsKey("tagType")){
tagType = object.getString("tagType");
}
HotSearchList hotSearchList = new HotSearchList(url,name,hot,true,rank, HotSearchType.快手热榜.name(),tagType,date);
list.add(hotSearchList);
} catch (NumberFormatException e) {
log.error("解析快手热榜时出现解析错误",e);
}
}
} catch (NumberFormatException e) {
log.error("解析快手热榜时出现解析错误,数据不是json结构",e);
}
return list;
}
}
package com.zhiwei.searchhotcrawler.test;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler;
import com.zhiwei.searchhotcrawler.crawler.ZhihuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
@Log4j2
public class KuaiShouHotSearchRun extends Thread{
@Override
public void run() {
boolean f = true;
while(f) {
try {
getHotList();
TimeUnit.MINUTES.sleep(1);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*60*1000);
}
ZhiWeiTools.sleep(50);
}
}
private void getHotList() {
log.info("快手热榜采集开始........");
List<HotSearchList> kuaiShouList = KuaiShouHotSearchCrawlerTest.KuaiShouHotSearchCrawler(new Date());
log.info("{}, 此轮快手热榜采集到的数据量为:{}", new Date(), Integer.valueOf(kuaiShouList != null ? kuaiShouList.size() : 0));
TipsUtils.addHotList("快手热榜",kuaiShouList);
log.info("快手热榜采集结束........");
}
}
\ No newline at end of file
......@@ -507,5 +507,18 @@ public class GatherTimer {
}
return name;
}
/**
 * Scheduled collection of the Kuaishou hot-ranking list.
 *
 * <p>Triggered by the cron expression {@code 0 * * * * ?} and executed asynchronously on the
 * executor named {@code myScheduler}. The crawl result is passed to
 * {@code TipsUtils.addHotList} under the Kuaishou hot-ranking type name.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerKuaiShou() {
    logger.info("快手热榜开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> kuaiShouList = KuaiShouHotSearchCrawler.KuaiShouHotSearchCrawler(crawlTime);
    // Null-safe size for the log line only; the list itself is forwarded unchanged.
    int collected = 0;
    if (kuaiShouList != null) {
        collected = kuaiShouList.size();
    }
    logger.info("{}, 快手此轮采集到的数据量为:{}", new Date(), collected);
    TipsUtils.addHotList(HotSearchType.快手热榜.name(), kuaiShouList);
    logger.info("快手热榜采集结束...");
}
}
package hotSaerchTest;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.client.MongoCollection;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.searchhotcrawler.test.KuaiShouHotSearchCrawlerTest;
import com.zhiwei.searchhotcrawler.test.TaoBaoHotSearchCrawlerTest;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.bson.Document;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import java.io.IOException;
import java.util.Date;
import java.util.List;
import static com.ibm.icu.util.LocalePriorityList.add;
import static java.util.Objects.nonNull;
/**
* @author ll
* @date 2021/6/10 6:30
*/
@Log4j2
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations =
        {"classpath:applicationContext.xml"})
public class HotSearchTest {

    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /** Builds the proxy configuration used by every test and initialises the proxy factory with it. */
    private static void initProxy() {
        SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
                .group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
        ProxyFactory.init(simpleConfig);
    }

    /**
     * Exercises the Kuaishou hot-ranking crawler and prints the result and its size.
     */
    @Test
    public void kuaiShouTestCrawler() {
        initProxy();
        List<HotSearchList> hotSearchLists = KuaiShouHotSearchCrawlerTest.KuaiShouHotSearchCrawler(new Date());
        System.out.println(hotSearchLists);
        System.out.println(hotSearchLists.size());
    }

    /**
     * Requests one Weibo topic page, copies selected fields of its {@code cardlistInfo} into a
     * document, stores the document through {@link #ad(Document)} and prints it.
     */
    @Test
    public void WeiBoUpdate() {
        initProxy();
        Document document = new Document();
        // Alternative URL kept for manual runs:
        // "https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26t%3D10%26q%3D%23我国新冠疫苗接种剂次超9亿%23"
        String url = "https://m.weibo.cn/api/container/getIndex?containerid=231522type%3D1%26q%3D%23可口可乐回应C罗拒绝与可乐同框%23";
        String htmlBody = null;
        Request request = RequestUtils.wrapGet(url);
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博热搜详情页面时出现连接失败", e);
        }
        if (htmlBody != null && htmlBody.contains("data")) {
            fillTopicDocument(htmlBody, document);
        }
        ad(document);
        System.out.println(document);
    }

    /**
     * Copies the topic lead, read/discussion counts, picture URL and host text from the
     * response body into {@code document}; fields missing from the response are left out.
     */
    private static void fillTopicDocument(String htmlBody, Document document) {
        JSONObject cardListInfo = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("cardlistInfo");
        if (cardListInfo.containsKey("desc")) {
            String topicLead = cardListInfo.getString("desc");
            if (!"".equals(topicLead)) {
                document.put("topicLead", topicLead);
            }
        }
        if (!cardListInfo.containsKey("cardlist_head_cards")) {
            return;
        }
        JSONObject headCard = cardListInfo.getJSONArray("cardlist_head_cards").getJSONObject(0);
        if (!headCard.containsKey("head_data")) {
            return;
        }
        JSONObject headData = headCard.getJSONObject("head_data");
        // "midtext" carries both counters; each one is isolated by stripping the other's label.
        String midText = headData.getString("midtext");
        String read = midText.replaceAll("阅读", "").replaceAll("讨论.*", "").trim();
        String discussCount = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim();
        String pictureUrl = headData.getString("portrait_url");
        document.put("readCount", TipsUtils.getHotCount(read));
        document.put("discussCount", TipsUtils.getHotCount(discussCount));
        document.put("pictureUrl", pictureUrl);
        if (headData.containsKey("downtext")) {
            String downtext = headData.getString("downtext");
            if (!"".equals(downtext)) {
                document.put("downtext", downtext.replaceAll("主持人:", ""));
            }
        }
    }

    /**
     * Re-writes the known fields of {@code nowDoc} (counts are converted to {@code Long}) and
     * inserts the document into the search-cache collection.
     */
    private void ad(Document nowDoc) {
        MongoCollection cacheCollection = MongoDBTemplate.getCollection(DBConfig.dbName, DBConfig.searchCacheCollName);
        if (nowDoc.containsKey("topicLead")) {
            nowDoc.put("topicLead", nowDoc.getString("topicLead"));
        }
        // Counts are only normalised when both of them are present.
        if (nowDoc.containsKey("readCount") && nowDoc.containsKey("discussCount")) {
            nowDoc.put("readCount", toLong(nowDoc.get("readCount")));
            nowDoc.put("discussCount", toLong(nowDoc.get("discussCount")));
        }
        if (nowDoc.containsKey("pictureUrl")) {
            nowDoc.put("pictureUrl", nowDoc.getString("pictureUrl"));
        }
        if (nowDoc.containsKey("downtext")) {
            nowDoc.put("downtext", nowDoc.getString("downtext"));
        }
        cacheCollection.insertOne(nowDoc);
    }

    /** Converts a non-null value to {@code Long} via its string form; {@code null} stays {@code null}. */
    private static Long toLong(Object value) {
        return nonNull(value) ? Long.valueOf(value.toString()) : null;
    }

    /**
     * Exercises the Taobao hot-search crawler and prints the result and its size.
     */
    @Test
    public void taoBaoTestCrawler() {
        initProxy();
        List<HotSearchList> hotSearchLists = TaoBaoHotSearchCrawlerTest.taoBaoHotSearch(new Date());
        System.out.println(hotSearchLists);
        System.out.println(hotSearchLists.size());
    }

    /**
     * Exercises the Baidu hot-search crawler and prints the result and its size.
     */
    @Test
    public void baiDuTestCrawler() {
        initProxy();
        List<HotSearchList> hotSearchLists = BaiDuHotSearchCrawler.baiduHotSearch(new Date());
        System.out.println(hotSearchLists);
        System.out.println(hotSearchLists.size());
    }
}
......@@ -333,7 +333,9 @@ public class WeiboHotSearchTest {
String[] split = followers_count.split("万");
followerCount = Long.valueOf(split[0])*10000;
}
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount);
//用户头像地址
String profileImageUrl = users.getJSONObject(i1).getString("profile_image_url");
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount,profileImageUrl);
weiBoUserList.add(weiBoUser);
}
}
......@@ -356,8 +358,9 @@ public class WeiboHotSearchTest {
}else {
followerCount = Long.valueOf(followers_count);
}
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount);
//用户头像地址
String profileImageUrl = user.getString("profile_image_url");
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount,profileImageUrl);
weiBoUserList.add( weiBoUser);
}
......@@ -436,6 +439,8 @@ public class WeiboHotSearchTest {
String userName = mblog.getJSONObject("user").getString("screen_name");
//来源
String source = mblog.getString("source");
//用户头像地址
String profileImageUrl = mblog.getJSONObject("user").getString("profile_image_url");
//内容
String content = null;
if (mblog.getString("text").contains("<")) {
......@@ -448,7 +453,7 @@ public class WeiboHotSearchTest {
}
WeiBoMassage weiBoMassage = new WeiBoMassage(userId, content, userName, mid, createTime, editTime, cardType, showType,
repostCount, commentCount, attitudeCount, source, type, topic);
repostCount, commentCount, attitudeCount, source, type, topic,profileImageUrl);
//默认不转发为0
weiBoMassage.setForward(0);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment