Commit 241bc05a by chenweitao

Merge remote-tracking branch 'origin/working' into working

parents eb71665b d544547c
...@@ -25,4 +25,5 @@ public enum HotSearchType { ...@@ -25,4 +25,5 @@ public enum HotSearchType {
B站热搜, B站热搜,
人气榜36, 人气榜36,
虎嗅热文推荐, 虎嗅热文推荐,
快手热榜,
} }
...@@ -88,7 +88,10 @@ public class WeiBoMassage implements Serializable { ...@@ -88,7 +88,10 @@ public class WeiBoMassage implements Serializable {
* 话题 * 话题
*/ */
private String topic; private String topic;
/**
* 头像地址
*/
private String profileImageUrl;
//是否转发 //是否转发
private Integer forward; private Integer forward;
//转发 源微博mid //转发 源微博mid
...@@ -110,7 +113,7 @@ public class WeiBoMassage implements Serializable { ...@@ -110,7 +113,7 @@ public class WeiBoMassage implements Serializable {
public WeiBoMassage(String userId, String text, String userName, String mid, public WeiBoMassage(String userId, String text, String userName, String mid,
Date creatTime, Date editTime, Integer cardType, Integer showType, Long repostCount, Date creatTime, Date editTime, Integer cardType, Integer showType, Long repostCount,
Long commentCount, Long attitudeCount, String source, String type, String topic) { Long commentCount, Long attitudeCount, String source, String type, String topic, String profileImageUrl) {
this.id =mid+"_"+HotSearchType.微博热搜.name()+"_"+topic; this.id =mid+"_"+HotSearchType.微博热搜.name()+"_"+topic;
this.userId = userId; this.userId = userId;
this.text = text; this.text = text;
...@@ -126,6 +129,8 @@ public class WeiBoMassage implements Serializable { ...@@ -126,6 +129,8 @@ public class WeiBoMassage implements Serializable {
this.source = source; this.source = source;
this.type = type; this.type = type;
this.topic = topic; this.topic = topic;
this.profileImageUrl = profileImageUrl;
} }
} }
...@@ -48,10 +48,15 @@ public class WeiBoUser implements Serializable { ...@@ -48,10 +48,15 @@ public class WeiBoUser implements Serializable {
* 粉丝数 * 粉丝数
*/ */
private Long followerCount; private Long followerCount;
/**
* 头像地址
*/
private String profileImageUrl;
public WeiBoUser() { public WeiBoUser() {
} }
public WeiBoUser(String userId, String attestationMassage, String userName,String topic,Date time,Long followerCount) { public WeiBoUser(String userId, String attestationMassage, String userName,String topic,Date time,Long followerCount,String profileImageUrl) {
this.id = userId+"_"+HotSearchType.微博热搜.name()+"_"+topic; this.id = userId+"_"+HotSearchType.微博热搜.name()+"_"+topic;
this.userId = userId; this.userId = userId;
...@@ -60,6 +65,7 @@ public class WeiBoUser implements Serializable { ...@@ -60,6 +65,7 @@ public class WeiBoUser implements Serializable {
this.topic=topic; this.topic=topic;
this.time=time; this.time=time;
this.followerCount=followerCount; this.followerCount=followerCount;
this.profileImageUrl = profileImageUrl;
} }
} }
...@@ -12,6 +12,7 @@ import okhttp3.Response; ...@@ -12,6 +12,7 @@ import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
...@@ -21,9 +22,9 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList; ...@@ -21,9 +22,9 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
/** /**
* @author hero
* @ClassName:BaiDuHotSearch * @ClassName:BaiDuHotSearch
* @Description: TODO(百度风云榜热搜采集) * @Description: TODO(百度风云榜热搜采集)
* @author hero
* @date 2019年7月10日 上午10:54:31 * @date 2019年7月10日 上午10:54:31
*/ */
@Log4j2 @Log4j2
...@@ -32,35 +33,79 @@ public class BaiDuHotSearchCrawler { ...@@ -32,35 +33,79 @@ public class BaiDuHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build(); private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/** /**
* @return void 返回类型
* @Title: BaiDuHotSearchTest * @Title: BaiDuHotSearchTest
* @author hero * @author hero
* @Description: PC端百度风云榜采集 * @Description: PC端百度风云榜采集
* @return void 返回类型
*/ */
public static List<HotSearchList> baiduHotSearch(Date date) { public static List<HotSearchList> baiduHotSearch(Date date) {
String url = "http://top.baidu.com/buzz?b=1&fr=topindex"; String url = "http://top.baidu.com/buzz?b=1&fr=topindex";
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string(); htmlBody = response.body().string();
} catch (Exception e) { } catch (Exception e) {
log.error("解析百度风云榜时出现解析错误,页面结构有问题", e); log.error("解析百度风云榜时出现解析错误,页面结构有问题", e);
} }
if (htmlBody != null && htmlBody.contains("mainBody")) { if (htmlBody != null && htmlBody.contains("container-bg_lQ801")) {
return ansysData(htmlBody,date); return ansysNewData(htmlBody, date);
} else { } else {
log.info("解析百度风云榜时出现解析错误,页面结构有问题"); log.info("解析百度风云榜时出现解析错误,页面结构有问题");
} }
return Collections.emptyList(); return Collections.emptyList();
} }
/**
 * Parses the redesigned Baidu hot-search board page.
 *
 * <p>Each {@code div.category-wrap_iQLoo} element is treated as one board entry. An entry whose
 * rank or search index is not a plain number is logged and skipped, so one malformed entry does
 * not discard the rest of the board.
 *
 * @param htmlBody raw HTML of the board page
 * @param date     timestamp handed to every created {@link HotSearchList}
 * @return the entries that could be parsed, in page order; empty when nothing matched
 */
private static List<HotSearchList> ansysNewData(String htmlBody, Date date) {
    List<HotSearchList> list = new ArrayList<>();
    try {
        Document document = Jsoup.parse(htmlBody);
        // select() yields an empty Elements (never null) when nothing matches, so no null guard is needed.
        for (Element element : document.select("div.category-wrap_iQLoo")) {
            try {
                // Rank
                String strRank = element.select("a.img-wrapper_29V76").select("div.index_1Ew5p").text();
                Integer rank = Integer.valueOf(strRank);
                // Title: only the part before the first space is kept
                String strTitle = element.select("a.title_dIF3B").text();
                String title = strTitle.split(" ")[0];
                // Link
                String url = element.select("div.content_1YWBm").select("a.title_dIF3B").attr("href");
                // Summary text
                String content = element.select("div.small_Uvkd3").text();
                // Search index
                String strCount = element.select("div.hot-index_1Bl1a").text();
                Long count = Long.valueOf(strCount);
                HotSearchList hotSearch = new HotSearchList(url, title, count, rank, HotSearchType.百度热搜.name(), date);
                hotSearch.setTopicLead(content);
                list.add(hotSearch);
            } catch (Exception e) {
                log.error("解析百度风云榜时出现解析错误", e);
            }
        }
    } catch (Exception e) {
        // The input is HTML, not JSON: report a page-structure problem like the caller does.
        log.error("解析百度风云榜时出现解析错误,页面结构有问题", e);
    }
    return list;
}
/** /**
* 解析数据 * 解析数据
*
* @param htmlBody * @param htmlBody
* @return * @return
*/ */
private static List<HotSearchList> ansysData(String htmlBody,Date date){ private static List<HotSearchList> ansysData(String htmlBody, Date date) {
List<HotSearchList> list = new ArrayList<>(); List<HotSearchList> list = new ArrayList<>();
try { try {
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
...@@ -88,12 +133,12 @@ public class BaiDuHotSearchCrawler { ...@@ -88,12 +133,12 @@ public class BaiDuHotSearchCrawler {
String kw = element.select("td.keyword").select("a.list-title").text(); String kw = element.select("td.keyword").select("a.list-title").text();
// logger.info("关键词:{}", kw); // logger.info("关键词:{}", kw);
//从连接中获取正确编码关键词 //从连接中获取正确编码关键词
try{ try {
if (!everurl.isEmpty()){ if (!everurl.isEmpty()) {
kw = URLDecoder.decode(everurl.substring(everurl.indexOf("&wd=")+4).split("&")[0], "GB2312" ); kw = URLDecoder.decode(everurl.substring(everurl.indexOf("&wd=") + 4).split("&")[0], "GB2312");
} }
}catch (Exception e1){ } catch (Exception e1) {
log.error("解析百度风云榜,地址",e1); log.error("解析百度风云榜,地址", e1);
} }
// 获取搜索指数count(int) // 获取搜索指数count(int)
String hot = null; String hot = null;
...@@ -102,8 +147,7 @@ public class BaiDuHotSearchCrawler { ...@@ -102,8 +147,7 @@ public class BaiDuHotSearchCrawler {
hot = element.select("td.last").select("span.icon-fall").text(); hot = element.select("td.last").select("span.icon-fall").text();
} else if (!element.select("td.last").select("span.icon-rise").isEmpty()) { } else if (!element.select("td.last").select("span.icon-rise").isEmpty()) {
hot = element.select("td.last").select("span.icon-rise").text(); hot = element.select("td.last").select("span.icon-rise").text();
} } else if (!element.select("td.last").select("span.icon-fair").isEmpty()) {
else if (!element.select("td.last").select("span.icon-fair").isEmpty()) {
hot = element.select("td.last").select("span.icon-fair").text(); hot = element.select("td.last").select("span.icon-fair").text();
} }
long count = 0; long count = 0;
...@@ -112,12 +156,12 @@ public class BaiDuHotSearchCrawler { ...@@ -112,12 +156,12 @@ public class BaiDuHotSearchCrawler {
count = Integer.valueOf(hot); count = Integer.valueOf(hot);
} }
if (Objects.nonNull(rank)) { if (Objects.nonNull(rank)) {
if(count == 0){ if (count == 0) {
log.info(htmlBody); log.info(htmlBody);
log.info(hot); log.info(hot);
log.info(element); log.info(element);
} else { } else {
HotSearchList hotSearch = new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name(),date); HotSearchList hotSearch = new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name(), date);
list.add(hotSearch); list.add(hotSearch);
} }
} }
......
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import java.time.Duration;
import java.util.*;
/**
* @author ll
* @ClassName:KuaiShouHotSearchCrawlerTeat
* @Description:快手采集
* @date 2021年6月10日 下午5:54:31
*/
@Log4j2
public class KuaiShouHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/**
* @return void 返回类型
* @Title: hotSearch36KrCrawler
* @author hero
* @Description: PC端36Kr人气榜采集
*/
public static List<HotSearchList> KuaiShouHotSearchCrawler(Date date) {
String url = "https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
log.error("解析快手热榜时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("APOLLO_STATE")) {
return ansysData(htmlBody,date);
} else {
log.info("解析快手热榜时出现解析错误,页面结构有问题");
}
return Collections.emptyList();
}
private static List<HotSearchList> ansysData(String htmlBody, Date date) {
List<HotSearchList> list= new ArrayList<>();
JSONObject jsonObject = null;
try {
String substring = htmlBody.substring(htmlBody.indexOf("homexxunknown")+15, htmlBody.indexOf("homexxfilmcomlist")+18);
String sub = "{"+substring.substring(substring.indexOf("VisionHotRankResult") + 22, substring.indexOf("llsid") - 2)+"}}";
String substring1 = sub.substring(0,sub.indexOf("$ROOT_QUERY.visionMovieRank") - 2)+"}";
jsonObject = JSONObject.parseObject(substring1);
//获取每个jsonObject对象的值
Collection<Object> values = jsonObject.values();
for (Object value : values) {
try {
JSONObject object = (JSONObject)JSONObject.toJSON(value);
//获取话题名
String name = object.getString("name");
//排名
Integer rank = object.getInteger("rank");
String hotValue = object.getString("hotValue");
String[] ws = hotValue.split("w");
//热度
Double d = Double.valueOf(ws[0])*10000;
long hot = d.longValue();
//话题链接
String url = object.getString("poster");
//标签类型
String tagType =null;
if (object.containsKey("tagType")){
tagType = object.getString("tagType");
}
HotSearchList hotSearchList = new HotSearchList(url,name,hot,true,rank, HotSearchType.快手热榜.name(),tagType,date);
list.add(hotSearchList);
} catch (NumberFormatException e) {
log.error("解析快手热榜时出现解析错误",e);
}
}
} catch (NumberFormatException e) {
log.error("解析快手热榜时出现解析错误,数据不是json结构",e);
}
return list;
}
}
...@@ -373,8 +373,9 @@ public class WeiboHotSearchCrawler { ...@@ -373,8 +373,9 @@ public class WeiboHotSearchCrawler {
String[] split = followers_count.split("万"); String[] split = followers_count.split("万");
followerCount = Long.valueOf(split[0])*10000; followerCount = Long.valueOf(split[0])*10000;
} }
//用户头像地址
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount); String profileImageUrl = users.getJSONObject(i1).getString("profile_image_url");
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount,profileImageUrl);
weiBoUserList.add(weiBoUser); weiBoUserList.add(weiBoUser);
} }
} }
...@@ -397,8 +398,9 @@ public class WeiboHotSearchCrawler { ...@@ -397,8 +398,9 @@ public class WeiboHotSearchCrawler {
}else { }else {
followerCount = Long.valueOf(followers_count); followerCount = Long.valueOf(followers_count);
} }
//用户头像地址
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount); String profileImageUrl = user.getString("profile_image_url");
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount,profileImageUrl);
weiBoUserList.add(weiBoUser); weiBoUserList.add(weiBoUser);
} }
return weiBoUserList; return weiBoUserList;
...@@ -476,6 +478,8 @@ public class WeiboHotSearchCrawler { ...@@ -476,6 +478,8 @@ public class WeiboHotSearchCrawler {
String userName = mblog.getJSONObject("user").getString("screen_name"); String userName = mblog.getJSONObject("user").getString("screen_name");
//来源 //来源
String source = mblog.getString("source"); String source = mblog.getString("source");
//用户头像地址
String profileImageUrl = mblog.getJSONObject("user").getString("profile_image_url");
//内容 //内容
String content = null; String content = null;
if (mblog.getString("text").contains("<")) { if (mblog.getString("text").contains("<")) {
...@@ -488,7 +492,7 @@ public class WeiboHotSearchCrawler { ...@@ -488,7 +492,7 @@ public class WeiboHotSearchCrawler {
} }
WeiBoMassage weiBoMassage = new WeiBoMassage(userId, content, userName, mid, createTime, editTime, cardType, showType, WeiBoMassage weiBoMassage = new WeiBoMassage(userId, content, userName, mid, createTime, editTime, cardType, showType,
repostCount, commentCount, attitudeCount, source, type, topic); repostCount, commentCount, attitudeCount, source, type, topic,profileImageUrl);
//默认不转发为0 //默认不转发为0
weiBoMassage.setForward(0); weiBoMassage.setForward(0);
......
...@@ -54,6 +54,9 @@ public class HotSearchCacheDAO { ...@@ -54,6 +54,9 @@ public class HotSearchCacheDAO {
if("虎嗅热文推荐".equals(hotSearch.getType())){ if("虎嗅热文推荐".equals(hotSearch.getType())){
document.put("comment_count", hotSearch.getCommentCount()); document.put("comment_count", hotSearch.getCommentCount());
} }
if("百度热搜".equals(hotSearch.getType())){
document.put("topic_lead", hotSearch.getTopicLead());
}
if("腾讯较真榜".equals(hotSearch.getType())){ if("腾讯较真榜".equals(hotSearch.getType())){
document.put("topic_result",hotSearch.getTopicResult()); document.put("topic_result",hotSearch.getTopicResult());
...@@ -65,6 +68,9 @@ public class HotSearchCacheDAO { ...@@ -65,6 +68,9 @@ public class HotSearchCacheDAO {
document.put("pictureUrl",hotSearch.getPictureUrl()); document.put("pictureUrl",hotSearch.getPictureUrl());
} }
addAndUpdateData(document); addAndUpdateData(document);
if("百度热搜".equals(hotSearch.getType())){
document.remove("topic_lead");
}
dataes.add(document); dataes.add(document);
}); });
return dataes; return dataes;
......
...@@ -49,6 +49,7 @@ public class WeiBoMassageDao { ...@@ -49,6 +49,7 @@ public class WeiBoMassageDao {
document.put("repostCount",weiBoMassage.getRepostCount()); document.put("repostCount",weiBoMassage.getRepostCount());
document.put("commentCount",weiBoMassage.getCommentCount()); document.put("commentCount",weiBoMassage.getCommentCount());
document.put("attitudeCount",weiBoMassage.getAttitudeCount()); document.put("attitudeCount",weiBoMassage.getAttitudeCount());
document.put("profileImageUrl",weiBoMassage.getProfileImageUrl());
if (Objects.nonNull(weiBoMassage.getPlayCount())){ if (Objects.nonNull(weiBoMassage.getPlayCount())){
document.put("playCount",weiBoMassage.getPlayCount()); document.put("playCount",weiBoMassage.getPlayCount());
} }
......
...@@ -43,6 +43,7 @@ public class WeiBoUserDao { ...@@ -43,6 +43,7 @@ public class WeiBoUserDao {
document.put("topic",weiBoUser.getTopic()); document.put("topic",weiBoUser.getTopic());
document.put("time",weiBoUser.getTime()); document.put("time",weiBoUser.getTime());
document.put("followerCount",weiBoUser.getFollowerCount()); document.put("followerCount",weiBoUser.getFollowerCount());
document.put("profileImageUrl",weiBoUser.getProfileImageUrl());
try { try {
mongoCollection.insertOne(document); mongoCollection.insertOne(document);
} catch (Exception e) { } catch (Exception e) {
......
package com.zhiwei.searchhotcrawler.test;
import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import java.text.ParseException;
public class HotSearchRunTest {

    /**
     * Manual entry point: initialises the proxy factory. The crawler threads that would be
     * started afterwards are currently commented out.
     *
     * @param args unused
     */
    public static void main(String[] args) throws ParseException {
        ProxyFactory.init(
                SimpleConfig.builder()
                        .registry(ProxyConfig.registry)
                        .group(ProxyConfig.group)
                        .appId(10000013)
                        .appName("hotsearch")
                        .build());
        // Start the Weibo hot-search crawl
        // new WeiboHotSearchRun().start();
        // Start the Kuaishou hot-rank crawl
        // new KuaiShouHotSearchRun().start();
    }
}
package com.zhiwei.searchhotcrawler.test;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import java.time.Duration;
import java.util.*;
/**
* @author ll
* @ClassName:KuaiShouHotSearchCrawlerTeat
* @Description:
* @date 2021年6月10日 下午5:54:31
*/
@Log4j2
public class KuaiShouHotSearchCrawlerTest {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/**
* @return void 返回类型
* @Title: hotSearch36KrCrawler
* @author hero
* @Description: PC端36Kr人气榜采集
*/
public static List<HotSearchList> KuaiShouHotSearchCrawler(Date date) {
String url = "https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
log.error("解析快手热榜时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("APOLLO_STATE")) {
return ansysData(htmlBody,date);
} else {
log.info("解析快手热榜时出现解析错误,页面结构有问题");
}
return Collections.emptyList();
}
private static List<HotSearchList> ansysData(String htmlBody, Date date) {
List<HotSearchList> list= new ArrayList<>();
JSONObject jsonObject = null;
try {
String substring = htmlBody.substring(htmlBody.indexOf("homexxunknown")+15, htmlBody.indexOf("homexxfilmcomlist")+18);
String sub = "{"+substring.substring(substring.indexOf("VisionHotRankResult") + 22, substring.indexOf("llsid") - 2)+"}}";
String substring1 = sub.substring(0,sub.indexOf("$ROOT_QUERY.visionMovieRank") - 2)+"}";
jsonObject = JSONObject.parseObject(substring1);
//获取每个jsonObject对象的值
Collection<Object> values = jsonObject.values();
for (Object value : values) {
try {
JSONObject object = (JSONObject)JSONObject.toJSON(value);
//获取话题名
String name = object.getString("name");
//排名
Integer rank = object.getInteger("rank");
String hotValue = object.getString("hotValue");
String[] ws = hotValue.split("w");
//热度
Double d = Double.valueOf(ws[0])*10000;
long hot = d.longValue();
//话题链接
String url = object.getString("poster");
//标签类型
String tagType =null;
if (object.containsKey("tagType")){
tagType = object.getString("tagType");
}
HotSearchList hotSearchList = new HotSearchList(url,name,hot,true,rank, HotSearchType.快手热榜.name(),tagType,date);
list.add(hotSearchList);
} catch (NumberFormatException e) {
log.error("解析快手热榜时出现解析错误",e);
}
}
} catch (NumberFormatException e) {
log.error("解析快手热榜时出现解析错误,数据不是json结构",e);
}
return list;
}
}
package com.zhiwei.searchhotcrawler.test;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler;
import com.zhiwei.searchhotcrawler.crawler.ZhihuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
@Log4j2
public class KuaiShouHotSearchRun extends Thread{
@Override
public void run() {
boolean f = true;
while(f) {
try {
getHotList();
TimeUnit.MINUTES.sleep(1);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*60*1000);
}
ZhiWeiTools.sleep(50);
}
}
private void getHotList() {
log.info("快手热榜采集开始........");
List<HotSearchList> kuaiShouList = KuaiShouHotSearchCrawlerTest.KuaiShouHotSearchCrawler(new Date());
log.info("{}, 此轮快手热榜采集到的数据量为:{}", new Date(), Integer.valueOf(kuaiShouList != null ? kuaiShouList.size() : 0));
TipsUtils.addHotList("快手热榜",kuaiShouList);
log.info("快手热榜采集结束........");
}
}
\ No newline at end of file
...@@ -507,5 +507,18 @@ public class GatherTimer { ...@@ -507,5 +507,18 @@ public class GatherTimer {
} }
return name; return name;
} }
/**
 * Scheduled Kuaishou hot-rank crawl; the cron expression fires at second 0 of every minute.
 *
 * <p>The crawl result is handed to {@code TipsUtils.addHotList} under the 快手热榜 type name.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerKuaiShou() {
    logger.info("快手热榜开始采集...");
    // NOTE(review): DateUtils.getMillSecondTime presumably normalises the timestamp; its exact behavior is not visible here.
    Date crawlTime = DateUtils.getMillSecondTime(new Date());
    List<HotSearchList> hotList = KuaiShouHotSearchCrawler.KuaiShouHotSearchCrawler(crawlTime);
    int collected = 0;
    if (hotList != null) {
        collected = hotList.size();
    }
    logger.info("{}, 快手此轮采集到的数据量为:{}", new Date(), collected);
    TipsUtils.addHotList(HotSearchType.快手热榜.name(), hotList);
    logger.info("快手热榜采集结束...");
}
} }
package hotSaerchTest;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.client.MongoCollection;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.searchhotcrawler.test.KuaiShouHotSearchCrawlerTest;
import com.zhiwei.searchhotcrawler.test.TaoBaoHotSearchCrawlerTest;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.bson.Document;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import java.io.IOException;
import java.util.Date;
import java.util.List;
import static com.ibm.icu.util.LocalePriorityList.add;
import static java.util.Objects.nonNull;
/**
* @author ll
* @date 2021/6/10 6:30
*/
@Log4j2
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations =
{"classpath:applicationContext.xml"})
public class HotSearchTest {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* 测试快手热榜采集
*/
@Test
public void kuaiShouTestCrawler() {
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
ProxyFactory.init(simpleConfig);
List<HotSearchList> hotSearchLists = KuaiShouHotSearchCrawlerTest.KuaiShouHotSearchCrawler(new Date());
System.out.println(hotSearchLists);
System.out.println(hotSearchLists.size());
}
@Test
public void WeiBoUpdate() {
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
ProxyFactory.init(simpleConfig);
Document document = new Document();
//String url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26t%3D10%26q%3D%23我国新冠疫苗接种剂次超9亿%23";
String url = "https://m.weibo.cn/api/container/getIndex?containerid=231522type%3D1%26q%3D%23可口可乐回应C罗拒绝与可乐同框%23";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error("解析微博热搜详情页面时出现连接失败", e);
}
if (htmlBody != null && htmlBody.contains("data")) {
JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("cardlistInfo");
if (json.containsKey("desc")) {
String topicLead = json.getString("desc");
if (!"".equals(topicLead)) {
document.put("topicLead", topicLead);
}
}
if (json.containsKey("cardlist_head_cards")) {
JSONObject readJson = json.getJSONArray("cardlist_head_cards").getJSONObject(0);
if (readJson.containsKey("head_data")) {
String midText = readJson.getJSONObject("head_data").getString("midtext");
String read = midText.replaceAll("阅读", "").replaceAll("讨论.*", "").trim();
String discussCount = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim();
String pictureUrl = readJson.getJSONObject("head_data").getString("portrait_url");
document.put("readCount", TipsUtils.getHotCount(read));
document.put("discussCount", TipsUtils.getHotCount(discussCount));
document.put("pictureUrl", pictureUrl);
if (readJson.getJSONObject("head_data").containsKey("downtext")) {
String downtext = readJson.getJSONObject("head_data").getString("downtext");
if (!"".equals(downtext)) {
document.put("downtext", downtext.replaceAll("主持人:", ""));
}
}
}
}
}
ad(document);
System.out.println(document);
}
private void ad(Document nowDoc) {
MongoCollection collection = MongoDBTemplate.getCollection(DBConfig.dbName, DBConfig.searchCacheCollName);
if(nowDoc.containsKey("topicLead")){
nowDoc.put("topicLead", nowDoc.getString("topicLead"));
}
if(nowDoc.containsKey("readCount") && nowDoc.containsKey("discussCount")) {
nowDoc.put("readCount", nonNull(nowDoc.get("readCount"))?Long.valueOf(nowDoc.get("readCount").toString()):null);
nowDoc.put("discussCount", nonNull(nowDoc.get("discussCount"))?Long.valueOf(nowDoc.get("discussCount").toString()):null);
}
if (nowDoc.containsKey("pictureUrl")) {
nowDoc.put("pictureUrl",nowDoc.getString("pictureUrl"));
}
if (nowDoc.containsKey("downtext")) {
nowDoc.put("downtext",nowDoc.getString("downtext"));
}
collection.insertOne(nowDoc);
}
/**
* 测试淘宝热搜采集
*/
@Test
public void taoBaoTestCrawler() {
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
ProxyFactory.init(simpleConfig);
List<HotSearchList> hotSearchLists = TaoBaoHotSearchCrawlerTest.taoBaoHotSearch(new Date());
System.out.println(hotSearchLists);
System.out.println(hotSearchLists.size());
}
/**
* 测试百度热搜采集
*/
@Test
public void baiDuTestCrawler() {
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
ProxyFactory.init(simpleConfig);
List<HotSearchList> hotSearchLists = BaiDuHotSearchCrawler.baiduHotSearch(new Date());
System.out.println(hotSearchLists);
System.out.println(hotSearchLists.size());
}
}
...@@ -333,7 +333,9 @@ public class WeiboHotSearchTest { ...@@ -333,7 +333,9 @@ public class WeiboHotSearchTest {
String[] split = followers_count.split("万"); String[] split = followers_count.split("万");
followerCount = Long.valueOf(split[0])*10000; followerCount = Long.valueOf(split[0])*10000;
} }
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount); //用户头像地址
String profileImageUrl = users.getJSONObject(i1).getString("profile_image_url");
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount,profileImageUrl);
weiBoUserList.add(weiBoUser); weiBoUserList.add(weiBoUser);
} }
} }
...@@ -356,8 +358,9 @@ public class WeiboHotSearchTest { ...@@ -356,8 +358,9 @@ public class WeiboHotSearchTest {
}else { }else {
followerCount = Long.valueOf(followers_count); followerCount = Long.valueOf(followers_count);
} }
//用户头像地址
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount); String profileImageUrl = user.getString("profile_image_url");
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount,profileImageUrl);
weiBoUserList.add( weiBoUser); weiBoUserList.add( weiBoUser);
} }
...@@ -436,6 +439,8 @@ public class WeiboHotSearchTest { ...@@ -436,6 +439,8 @@ public class WeiboHotSearchTest {
String userName = mblog.getJSONObject("user").getString("screen_name"); String userName = mblog.getJSONObject("user").getString("screen_name");
//来源 //来源
String source = mblog.getString("source"); String source = mblog.getString("source");
//用户头像地址
String profileImageUrl = mblog.getJSONObject("user").getString("profile_image_url");
//内容 //内容
String content = null; String content = null;
if (mblog.getString("text").contains("<")) { if (mblog.getString("text").contains("<")) {
...@@ -448,7 +453,7 @@ public class WeiboHotSearchTest { ...@@ -448,7 +453,7 @@ public class WeiboHotSearchTest {
} }
WeiBoMassage weiBoMassage = new WeiBoMassage(userId, content, userName, mid, createTime, editTime, cardType, showType, WeiBoMassage weiBoMassage = new WeiBoMassage(userId, content, userName, mid, createTime, editTime, cardType, showType,
repostCount, commentCount, attitudeCount, source, type, topic); repostCount, commentCount, attitudeCount, source, type, topic,profileImageUrl);
//默认不转发为0 //默认不转发为0
weiBoMassage.setForward(0); weiBoMassage.setForward(0);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment