Commit b59879ef by leiliangliang

新增微博话题采集话题贡献者,关于功能

parent 96ffc323
......@@ -53,9 +53,13 @@ public class WeiBoUser implements Serializable {
* 头像地址
*/
private String profileImageUrl;
/**
 * Category label of this user record. The crawler's writeUser sets it to a role
 * label such as "话题贡献者" (topic contributor) or "当事人" (party involved).
 */
private String type;
/** Creates an instance with every field left unset. */
public WeiBoUser() {
}
public WeiBoUser(String userId, String attestationMassage, String userName,String topic,Date time,Long followerCount,String profileImageUrl) {
this.id = userId+"_"+HotSearchType.微博热搜.name()+"_"+topic;
......@@ -66,6 +70,11 @@ public class WeiBoUser implements Serializable {
this.time=time;
this.followerCount=followerCount;
this.profileImageUrl = profileImageUrl;
}
/**
 * Creates a lightweight user record carrying only identity, topic and collection time.
 * Unlike the full constructor, this one does not derive {@code id}; the caller is
 * expected to assign it afterwards.
 *
 * @param userId   Weibo user id
 * @param userName display name of the user
 * @param topic    hot-search topic the user was collected for
 * @param time     collection time
 */
public WeiBoUser(String userId, String userName, String topic, Date time) {
    this.time = time;
    this.topic = topic;
    this.userName = userName;
    this.userId = userId;
}
}
package com.zhiwei.searchhotcrawler.crawler;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.stream.Collectors;
import com.alibaba.fastjson.JSON;
import com.mongodb.client.result.UpdateResult;
import com.zhiwei.searchhotcrawler.bean.*;
import com.zhiwei.searchhotcrawler.config.RedisConfig;
import com.zhiwei.searchhotcrawler.dao.RedisDao;
......@@ -17,6 +20,12 @@ import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.bson.Document;
import org.checkerframework.checker.units.qual.C;
import org.jsoup.Jsoup;
......@@ -37,26 +46,26 @@ import org.springframework.beans.factory.annotation.Autowired;
import static java.util.Objects.nonNull;
/**
* @author hero
* @ClassName: WeiboHotSearch
* @Description: 微博实时热搜采集
* @author hero
* @date 2017年9月15日 上午10:54:31
*/
@Log4j2
public class WeiboHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static RedisDao redisDao = new RedisDao();
private static RedisDao redisDao = new RedisDao();
static WeiBoUserDao weiBoUserDao = new WeiBoUserDao();
static WeiBoMassageDao weiBoMassageDao = new WeiBoMassageDao();
/**
* @Title: weiboHotSearchTest
* @author hero
* @Description: TODO(PC端微博热搜采集)
* @return void 返回类型
*/
static WeiBoUserDao weiBoUserDao = new WeiBoUserDao();
static WeiBoMassageDao weiBoMassageDao = new WeiBoMassageDao();
/**
* @Title: weiboHotSearchTest
* @author hero
* @Description: TODO(PC端微博热搜采集)
* @return void 返回类型
*/
// public static List<HotSearchList> weiboHotSearch(){
// String url = "https://s.weibo.com/top/summary?cate=realtimehot";
//
......@@ -113,444 +122,553 @@ public class WeiboHotSearchCrawler {
// }
/**
* @Title: weiboHotSearchByPhoneTest
* @author hero
* @Description: TODO(手机端Iphone 微博热搜采集)
* @return void 返回类型
*/
public static List<HotSearchList> weiboHotSearchByPhone(Date date){
String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583";
Map<String,String> headerMap = new HashMap<>();
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
String htmlBody = null;
Request request = RequestUtils.wrapGet(url, headerMap);
for(int count =0; count<=5; count++){
try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error("解析微博时热搜时出现连接失败",e);
}
List<HotSearchList> result = new ArrayList<HotSearchList>();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
try {
JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data");
JSONArray cards = json.getJSONArray("cards");
int rank = 0;
/**
* @return void 返回类型
* @Title: weiboHotSearchByPhoneTest
* @author hero
* @Description: TODO(手机端Iphone 微博热搜采集)
*/
public static List<HotSearchList> weiboHotSearchByPhone(Date date) {
String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583";
Map<String, String> headerMap = new HashMap<>();
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
String htmlBody = null;
Request request = RequestUtils.wrapGet(url, headerMap);
for (int count = 0; count <= 5; count++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error("解析微博时热搜时出现连接失败", e);
}
List<HotSearchList> result = new ArrayList<HotSearchList>();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
try {
JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data");
JSONArray cards = json.getJSONArray("cards");
int rank = 0;
// for (int i = 0; i < cards.size(); i++) {
try {
JSONObject card = cards.getJSONObject(0);
JSONArray cardGroup = card.getJSONArray("card_group");
JSONObject topCard =cardGroup.getJSONObject(0);
if(!topCard.containsKey("pic")){
rank = 1;
}
if (Objects.nonNull(cardGroup) && !cardGroup.isEmpty()) {
try {
JSONObject card = cards.getJSONObject(0);
JSONArray cardGroup = card.getJSONArray("card_group");
JSONObject topCard = cardGroup.getJSONObject(0);
if (!topCard.containsKey("pic")) {
rank = 1;
}
if (Objects.nonNull(cardGroup) && !cardGroup.isEmpty()) {
// String title = card.getString("title");
boolean hot = true;
boolean hot = true;
// if (Objects.nonNull(title) && title.contains("实时上升热点")) {
// hot = false;
// rank = 51;
// }
for (int j = 0; j < cardGroup.size(); j++) {
JSONObject cardInfo = cardGroup.getJSONObject(j);
String name = cardInfo.getString("desc");
long hotCount = cardInfo.getLongValue("desc_extr");
String icon = cardInfo.getString("icon");
if (StringUtils.isNotBlank(icon)) {
icon = icon.split("_")[1].split(".png")[0];
}
String rankPic = cardInfo.getString("pic");
for (int j = 0; j < cardGroup.size(); j++) {
JSONObject cardInfo = cardGroup.getJSONObject(j);
String name = cardInfo.getString("desc");
long hotCount = cardInfo.getLongValue("desc_extr");
String icon = cardInfo.getString("icon");
if (StringUtils.isNotBlank(icon)) {
icon = icon.split("_")[1].split(".png")[0];
}
// String id = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
String urlScheme = cardInfo.getString("scheme");
HotSearchList hotSearch = new HotSearchList(urlScheme, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
hotSearch.setRankPic(rankPic);
result.add(hotSearch);
rank++;
redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS,name+"_微博热搜");
}
} else {
log.info("card 数据结构为:{}", card);
}
} catch (Exception e) {
log.error("解析微博时热搜时出现解析错误", e);
continue;
}
String id = cardInfo.getString("scheme");
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
result.add(hotSearch);
rank++;
redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS, name + "_微博热搜");
}
} else {
log.info("card 数据结构为:{}", card);
}
} catch (Exception e) {
log.error("解析微博时热搜时出现解析错误", e);
continue;
}
// }
return result;
} catch (Exception e) {
log.error("解析微博时热搜时出现解析错误,数据不是json结构", e);
}
} else {
log.info("解析微博时热搜时出现解析错误,页面结构有问题");
}
}
return Collections.emptyList();
}
/**
* 微博预热榜(实时上升热点采集)
* @param date
* @return
*/
public static List<HotSearchList> weiboPreheatSearch(Date date){
String url = "https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error("解析微博热搜时出现连接失败",e);
}
List<HotSearchList> result = new ArrayList<>();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")){
JSONArray cardArray = JSON.parseObject(htmlBody).getJSONArray("cards");
if(cardArray.size() > 1) {
JSONObject jsonObject = cardArray.getJSONObject(1);
if ("实时上升热点".equals(jsonObject.getString("title")) &&
jsonObject.containsKey("card_group")) {
JSONArray jsonArray = jsonObject.getJSONArray("card_group");
for(int i=0; i<jsonArray.size(); i++){
JSONObject cardInfo = jsonArray.getJSONObject(i);
String name = cardInfo.getString("desc");
long hotCount = cardInfo.getIntValue("desc_extr");
String weiboUrl = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
HotSearchList hotSearchList = new HotSearchList(weiboUrl,name,hotCount,null,HotSearchType.微博预热榜.name(),date);
result.add(hotSearchList);
}
//根据热度排序,赋值排名
result = result.stream().sorted(Comparator.comparing(HotSearchList::getCount).reversed()).collect(Collectors.toList());
int rank =1;
for(HotSearchList hotSearchList : result){
hotSearchList.setRank(rank);
rank++;
}
}
}
}
return result;
}
/**
* 微博热搜数据更新导语,阅读量,讨论量
* @param document
* @return
*/
public static Document weiboUpdate(Document document) {
log.info("更新微博热搜{}导语阅读量和讨论量",document.getString("name"));
String url = "https://m.weibo.cn/api/container/getIndex?"+ document.getString("url").substring(
document.getString("url").indexOf("?")+1,document.getString("url").indexOf("&"));
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
for(int count =0; count<=5; count++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error("解析微博热搜详情页面时出现连接失败", e);
}
if (htmlBody != null && htmlBody.contains("data")) {
JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("cardlistInfo");
List<JSONObject> cardsJsons = (List<JSONObject>)JSONObject.parseObject(htmlBody).getJSONObject("data").get("cards");
if(json.containsKey("desc")){
String topicLead = json.getString("desc");
if(!"".equals(topicLead)) {
document.put("topicLead", topicLead);
}
}
if(json.containsKey("cardlist_head_cards")){
JSONObject readJson = json.getJSONArray("cardlist_head_cards").getJSONObject(0);
if (readJson.containsKey("head_data")) {
String midText = readJson.getJSONObject("head_data").getString("midtext");
String read = midText.replaceAll("阅读", "").replaceAll("讨论.*", "").trim();
String discussCount = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim();
String pictureUrl = readJson.getJSONObject("head_data").getString("portrait_url");
document.put("readCount", TipsUtils.getHotCount(read));
document.put("discussCount", TipsUtils.getHotCount(discussCount));
document.put("pictureUrl",pictureUrl);
if (readJson.getJSONObject("head_data").containsKey("downtext")){
String downtext = readJson.getJSONObject("head_data").getString("downtext");
if(!"".equals(downtext)) {
document.put("downtext",downtext.replaceAll("主持人:",""));
}
}
}
}
try {
//解析cards,获取热门微博、人物
if (Objects.isNull(weiBoMassageDao)){
weiBoMassageDao = new WeiBoMassageDao();
}
if (Objects.isNull(weiBoUserDao)){
weiBoUserDao = new WeiBoUserDao();
}
for (JSONObject jsonObject : cardsJsons) {
if (nonNull(jsonObject) && !jsonObject.isEmpty()) {
if (jsonObject.containsKey("mblog")) {
if (jsonObject.getJSONObject("mblog").containsKey("title")) {
WeiBoMassage weiBoMassage = analysisWeiboMBlog(jsonObject, document.getString("name"));
if (Objects.nonNull(weiBoMassage)) {
weiBoMassageDao.addWeiBoMassage(weiBoMassage);
}
}
} else if (jsonObject.containsKey("card_group")) {
JSONArray cardGroup = jsonObject.getJSONArray("card_group");
WeiBoMassage weiBoMassage = analysisWeiboMassage(cardGroup, document.getString("name"));
if (Objects.nonNull(weiBoMassage)) {
weiBoMassageDao.addWeiBoMassage(weiBoMassage);
}
List<WeiBoUser> weiBoUserList = analysisWeiBoUsers(cardGroup, document.getString("name"));
if (!weiBoUserList.isEmpty()){
for (int i = 0; i < weiBoUserList.size(); i++) {
weiBoUserDao.addWeiBoUser(weiBoUserList.get(i));
}
}
}
}
}
} catch (Exception e) {
log.error("解析cards失败,未获得热门微博,人物信息",e);
}
return document;
}
}
return null;
}
/**
* 解析微博信息
*
* @param cardGroup
* @param topic
* @return
*/
public static WeiBoMassage analysisWeiboMassage(JSONArray cardGroup, String topic) {
for (int i = 0; i < cardGroup.size(); i++) {
if (cardGroup.getJSONObject(i).containsKey("mblog")) {
if (cardGroup.getJSONObject(i).getJSONObject("mblog").containsKey("title")) {
WeiBoMassage weiBoMassage = analysisWeiboMBlog(cardGroup.getJSONObject(i), topic);
return weiBoMassage;
}
}
}
return null;
}
/**
* 解析用户信息
*
* @param cardGroup
* @param topic
* @return
*/
public static List<WeiBoUser> analysisWeiBoUsers(JSONArray cardGroup, String topic) {
List<WeiBoUser> weiBoUserList = new ArrayList();
//解析weibo人物信息
Date date = new Date();
for (int i = 0; i < cardGroup.size(); i++) {
if (3==Integer.valueOf(cardGroup.getJSONObject(i).getString("card_type"))) {
if (cardGroup.getJSONObject(i).containsKey("users")){
JSONArray users = cardGroup.getJSONObject(i).getJSONArray("users");
for (int i1 = 0; i1 < users.size(); i1++) {
//获取用户id
String userId = users.getJSONObject(i1).getString("id");
//获取用户名
String userName = users.getJSONObject(i1).getString("screen_name");
//获取认证信息
String attestationMassage = users.getJSONObject(i1).getString("verified_reason");
//获取粉丝数量
String followers_count = users.getJSONObject(i1).getString("followers_count");
Long followerCount =null;
if (!followers_count.contains("万")){
followerCount = Long.valueOf(followers_count);
}else {
String[] split = followers_count.split("万");
followerCount = Long.valueOf(split[0])*10000;
}
//用户头像地址
String profileImageUrl = users.getJSONObject(i1).getString("profile_image_url");
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount,profileImageUrl);
weiBoUserList.add(weiBoUser);
}
}
return weiBoUserList;
} else if (10==Integer.valueOf(cardGroup.getJSONObject(i).getString("card_type"))) {
if (cardGroup.getJSONObject(i).containsKey("user")){
JSONObject user = cardGroup.getJSONObject(i).getJSONObject("user");
//获取用户id
String userId = user.getString("id");
//获取用户名
String userName = user.getString("screen_name");
//获取认证信息
String attestationMassage = user.getString("verified_reason");
//获取粉丝数
String followers_count = user.getString("followers_count");
Long followerCount =null;
if (followers_count.contains("万")){
String[] split = followers_count.split("万");
followerCount = Long.valueOf(split[0])*10000;
}else {
followerCount = Long.valueOf(followers_count);
}
//用户头像地址
String profileImageUrl = user.getString("profile_image_url");
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount,profileImageUrl);
weiBoUserList.add(weiBoUser);
}
return weiBoUserList;
}
}
return Collections.emptyList();
}
/**
 * Converts one card that carries an {@code mblog} object into a {@link WeiBoMassage}.
 *
 * <p>Counts are parsed leniently: plain digits, values with a "万" (x10,000) or
 * "亿" (x100,000,000) unit and decimal mantissas such as "12.3万" are accepted;
 * anything unparseable yields {@code null} for that count instead of aborting.
 * Missing optional parts (text, dates, pictures, the author of a deleted
 * retweeted post) are tolerated and left {@code null}/empty.
 *
 * @param jsonObject card containing {@code mblog}, {@code card_type} and {@code show_type}
 * @param topic      hot-search topic the post belongs to
 * @return the populated message; forward flag is 1 when the post is a repost, else 0
 * @throws NumberFormatException if {@code card_type} or {@code show_type} is missing or not an integer
 */
public static WeiBoMassage analysisWeiboMBlog(JSONObject jsonObject, String topic) {
    JSONObject mblog = jsonObject.getJSONObject("mblog");
    JSONObject title = mblog.getJSONObject("title");
    String type = title == null ? null : title.getString("text");
    Integer cardType = Integer.valueOf(jsonObject.getString("card_type"));
    Integer showType = Integer.valueOf(jsonObject.getString("show_type"));

    // Likes, comments and reposts share the same textual number format.
    Long attitudeCount = parseMblogCount(mblog.getString("attitudes_count"));
    Long commentCount = parseMblogCount(mblog.getString("comments_count"));
    Long repostCount = parseMblogCount(mblog.getString("reposts_count"));

    // A fresh formatter per call: SimpleDateFormat is not thread-safe.
    SimpleDateFormat weiboTime = new SimpleDateFormat("EEE MMM dd HH:mm:ss z yyyy", java.util.Locale.US);
    Date createTime = parseMblogDate(weiboTime, mblog.getString("created_at"));
    Date editTime = parseMblogDate(weiboTime, mblog.getString("edit_at"));

    String mid = mblog.getString("mid");
    JSONObject author = mblog.getJSONObject("user");
    String userId = author == null ? null : author.getString("id");
    String userName = author == null ? null : author.getString("screen_name");
    String profileImageUrl = author == null ? null : author.getString("profile_image_url");
    String source = mblog.getString("source");

    // Strip markup only when the text actually contains some.
    String rawText = mblog.getString("text");
    String content = (rawText != null && rawText.contains("<")) ? Jsoup.parse(rawText).text() : rawText;

    WeiBoMassage weiBoMassage = new WeiBoMassage(userId, content, userName, mid, createTime, editTime, cardType, showType,
            repostCount, commentCount, attitudeCount, source, type, topic, profileImageUrl);
    // 0 = original post, 1 = repost.
    weiBoMassage.setForward(0);

    // Media (pictures / play count) is read from the original post when this is a repost.
    JSONObject weiboJson = mblog;
    JSONObject retweeted = mblog.getJSONObject("retweeted_status");
    if (mblog.containsKey("retweeted_status") && retweeted != null) {
        weiboJson = retweeted;
        String rootRawText = retweeted.getString("text");
        // The user object can be null here (e.g. the source post is no longer available).
        JSONObject rootUser = retweeted.getJSONObject("user");
        weiBoMassage.setRoot_mid(retweeted.getString("mid"));
        weiBoMassage.setRoot_id(rootUser == null ? null : rootUser.getString("id"));
        weiBoMassage.setRoot_source(retweeted.getString("source"));
        weiBoMassage.setRoot_text(rootRawText == null ? null : Jsoup.parse(rootRawText).text());
        weiBoMassage.setRoot_name(rootUser == null ? null : rootUser.getString("screen_name"));
        weiBoMassage.setForward(1);
    }

    List<String> pictureUrlList = new ArrayList<>();
    Long playCount = null;
    JSONArray picIds = weiboJson.getJSONArray("pic_ids");
    if (picIds != null && !picIds.isEmpty()) {
        JSONArray pics = weiboJson.getJSONArray("pics");
        if (pics != null) {
            for (int i = 0; i < pics.size(); i++) {
                pictureUrlList.add(pics.getJSONObject(i).getString("url"));
            }
        }
    } else if (weiboJson.containsKey("page_info")) {
        JSONObject pageInfo = weiboJson.getJSONObject("page_info");
        String play = pageInfo == null ? null : pageInfo.getString("play_count");
        // As before, only values carrying a "万" or "次" unit are treated as a play count.
        if (play != null && (play.contains("万") || play.contains("次"))) {
            int unitAt = play.indexOf("次");
            playCount = parseMblogCount(unitAt >= 0 ? play.substring(0, unitAt) : play);
        }
    }
    weiBoMassage.setPlayCount(playCount);
    weiBoMassage.setPictureUrlList(pictureUrlList);
    return weiBoMassage;
}

/** Parses a textual count ("123", "12.3万", "1.2亿"); returns null when absent or unparseable. */
private static Long parseMblogCount(String raw) {
    if (raw == null || raw.trim().isEmpty()) {
        return null;
    }
    String text = raw.trim();
    try {
        int wanAt = text.indexOf("万");
        if (wanAt >= 0) {
            return Math.round(Double.parseDouble(text.substring(0, wanAt).trim()) * 10000);
        }
        int yiAt = text.indexOf("亿");
        if (yiAt >= 0) {
            return Math.round(Double.parseDouble(text.substring(0, yiAt).trim()) * 100000000L);
        }
        return Long.valueOf(text);
    } catch (NumberFormatException e) {
        log.warn("微博计数格式无法解析:{}", raw);
        return null;
    }
}

/** Parses one timestamp with the given formatter; returns null when absent or malformed. */
private static Date parseMblogDate(SimpleDateFormat format, String raw) {
    if (raw == null) {
        return null;
    }
    try {
        return format.parse(raw);
    } catch (ParseException e) {
        log.error("创建时间和编辑时间解析异常", e);
        return null;
    }
}
return result;
} catch (Exception e) {
log.error("解析微博时热搜时出现解析错误,数据不是json结构", e);
}
} else {
log.info("解析微博时热搜时出现解析错误,页面结构有问题");
}
}
return Collections.emptyList();
}
/**
 * Collects the Weibo "pre-heat" list (the card titled "实时上升热点", i.e. topics
 * currently rising) and ranks the entries by hot count, highest first.
 *
 * <p>Only the second card of the response is inspected. Any connection or parse
 * problem is logged and results in an empty list, matching the error handling of
 * the hot-search collector in this class.
 *
 * @param date collection time stamped on every entry
 * @return ranked entries (rank starts at 1); empty when nothing could be collected
 */
public static List<HotSearchList> weiboPreheatSearch(Date date) {
    // NOTE(review): the gsid query parameter looks like a hard-coded guest session token — confirm it should live in source.
    String url = "https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot";
    String htmlBody = null;
    Request request = RequestUtils.wrapGet(url);
    try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
        htmlBody = response.body().string();
    } catch (IOException e) {
        log.error("解析微博热搜时出现连接失败", e);
    }
    if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("cards")) {
        return new ArrayList<>();
    }
    List<HotSearchList> result = new ArrayList<>();
    try {
        JSONArray cardArray = JSON.parseObject(htmlBody).getJSONArray("cards");
        if (cardArray == null || cardArray.size() <= 1) {
            return result;
        }
        JSONObject risingCard = cardArray.getJSONObject(1);
        if (risingCard == null || !"实时上升热点".equals(risingCard.getString("title"))) {
            return result;
        }
        JSONArray cardGroup = risingCard.getJSONArray("card_group");
        if (cardGroup == null) {
            return result;
        }
        for (int i = 0; i < cardGroup.size(); i++) {
            JSONObject cardInfo = cardGroup.getJSONObject(i);
            String name = cardInfo.getString("desc");
            // Read as long: the target is a long and the hot-search collector reads the same field with getLongValue.
            long hotCount = cardInfo.getLongValue("desc_extr");
            String weiboUrl = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
            result.add(new HotSearchList(weiboUrl, name, hotCount, null, HotSearchType.微博预热榜.name(), date));
        }
        // Rank by hot count, highest first.
        result = result.stream().sorted(Comparator.comparing(HotSearchList::getCount).reversed()).collect(Collectors.toList());
        int rank = 1;
        for (HotSearchList hotSearchList : result) {
            hotSearchList.setRank(rank);
            rank++;
        }
        return result;
    } catch (Exception e) {
        // Do not hand back a half-built, unranked list.
        log.error("解析微博预热榜时出现解析错误", e);
        return new ArrayList<>();
    }
}
/**
 * Refreshes a hot-search record from the topic's mobile detail page.
 *
 * <p>Writes {@code topicLead}, {@code readCount}, {@code discussCount},
 * {@code pictureUrl} and {@code downtext} into {@code document} when the page
 * provides them, and stores the hot posts and related users found in the page's
 * cards through the message and user DAOs. The request is attempted up to six
 * times; a response that is not usable JSON counts as a failed attempt.
 *
 * @param document record holding at least {@code name} and {@code url}
 * @return the same {@code document} after a successful update, or {@code null}
 *         when the url is missing or no attempt produced usable data
 */
public static Document weiboUpdate(Document document) {
    String topicName = document.getString("name");
    log.info("更新微博热搜{}导语阅读量和讨论量", topicName);
    String sourceUrl = document.getString("url");
    if (sourceUrl == null) {
        log.warn("微博热搜{}没有url,无法更新详情", topicName);
        return null;
    }
    // Only the first query parameter of the stored url is forwarded; a url without '&' is used up to its end.
    int queryStart = sourceUrl.indexOf("?") + 1;
    int queryEnd = sourceUrl.indexOf("&", queryStart);
    if (queryEnd < 0) {
        queryEnd = sourceUrl.length();
    }
    String url = "https://m.weibo.cn/api/container/getIndex?" + sourceUrl.substring(queryStart, queryEnd);
    String htmlBody = null;
    Request request = RequestUtils.wrapGet(url);
    for (int count = 0; count <= 5; count++) {
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博热搜详情页面时出现连接失败", e);
        }
        if (htmlBody == null || !htmlBody.contains("data")) {
            continue;
        }
        JSONObject data = null;
        JSONObject listInfo = null;
        try {
            JSONObject root = JSONObject.parseObject(htmlBody);
            data = root == null ? null : root.getJSONObject("data");
            listInfo = data == null ? null : data.getJSONObject("cardlistInfo");
        } catch (Exception e) {
            log.error("解析微博热搜详情页面时返回内容不是json结构", e);
        }
        if (listInfo == null) {
            // Not the expected payload; try again.
            continue;
        }
        applyCardListInfo(document, listInfo);
        persistTopicCards(data, topicName);
        return document;
    }
    return null;
}

/** Copies lead text, read/discussion counts, portrait and host text from cardlistInfo into the record. */
private static void applyCardListInfo(Document document, JSONObject listInfo) {
    if (listInfo.containsKey("desc")) {
        String topicLead = listInfo.getString("desc");
        if (!"".equals(topicLead)) {
            document.put("topicLead", topicLead);
        }
    }
    JSONArray headCards = listInfo.getJSONArray("cardlist_head_cards");
    if (headCards == null || headCards.isEmpty()) {
        return;
    }
    JSONObject headData = headCards.getJSONObject(0) == null ? null : headCards.getJSONObject(0).getJSONObject("head_data");
    if (headData == null) {
        return;
    }
    String midText = headData.getString("midtext");
    if (midText != null) {
        // The read count is what remains after removing "阅读" and everything from "讨论";
        // the discussion count is what sits between "讨论" and "详情".
        String read = midText.replaceAll("阅读", "").replaceAll("讨论.*", "").trim();
        String discussCount = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim();
        document.put("readCount", TipsUtils.getHotCount(read));
        document.put("discussCount", TipsUtils.getHotCount(discussCount));
    }
    document.put("pictureUrl", headData.getString("portrait_url"));
    String downtext = headData.getString("downtext");
    if (downtext != null && !"".equals(downtext)) {
        document.put("downtext", downtext.replaceAll("主持人:", ""));
    }
}

/** Stores the hot posts and users found in the page's cards; failures are logged and never propagate. */
private static void persistTopicCards(JSONObject data, String topicName) {
    try {
        if (Objects.isNull(weiBoMassageDao)) {
            weiBoMassageDao = new WeiBoMassageDao();
        }
        if (Objects.isNull(weiBoUserDao)) {
            weiBoUserDao = new WeiBoUserDao();
        }
        JSONArray cards = data.getJSONArray("cards");
        if (cards == null) {
            log.info("微博热搜{}详情页面没有cards", topicName);
            return;
        }
        for (int i = 0; i < cards.size(); i++) {
            JSONObject card = cards.getJSONObject(i);
            if (card == null || card.isEmpty()) {
                continue;
            }
            if (card.containsKey("mblog")) {
                JSONObject mblog = card.getJSONObject("mblog");
                if (mblog != null && mblog.containsKey("title")) {
                    WeiBoMassage weiBoMassage = analysisWeiboMBlog(card, topicName);
                    if (Objects.nonNull(weiBoMassage)) {
                        weiBoMassageDao.addWeiBoMassage(weiBoMassage);
                    }
                }
            } else if (card.containsKey("card_group")) {
                JSONArray cardGroup = card.getJSONArray("card_group");
                if (cardGroup == null) {
                    continue;
                }
                WeiBoMassage weiBoMassage = analysisWeiboMassage(cardGroup, topicName);
                if (Objects.nonNull(weiBoMassage)) {
                    weiBoMassageDao.addWeiBoMassage(weiBoMassage);
                }
                for (WeiBoUser weiBoUser : analysisWeiBoUsers(cardGroup, topicName)) {
                    weiBoUserDao.addWeiBoUser(weiBoUser);
                }
            }
        }
    } catch (Exception e) {
        log.error("解析cards失败,未获得热门微博,人物信息", e);
    }
}
/**
 * Collects the topic-contributor ranking and the "about" panel of a hot-search
 * topic from the PC search page ({@code s.weibo.com}).
 *
 * <p>Every user listed in a {@code div.card-user} block is stored through
 * {@code writeUser}: blocks with a non-empty {@code div.card-head} are labelled
 * "话题贡献者", the others "当事人". The "about" panel is returned as a new
 * document that maps each {@code dt} label to the link texts of the {@code dd}
 * at the same position ({@code null} when there is no matching {@code dd}).
 *
 * @param document hot-search record; only {@code name} is read
 * @return a new document with the "about" entries when the page was fetched and
 *         parsed; otherwise the unchanged input {@code document}
 */
public static Document weiboUpdatePC(Document document) {
    String topic = document.getString("name");
    String encode;
    try {
        encode = URLEncoder.encode("#" + topic + "#", "utf-8");
    } catch (UnsupportedEncodingException e) {
        log.error("字符解析成URl模式异常", e);
        // Without an encoded keyword the request would search for the literal "null".
        return document;
    }
    String url = "https://s.weibo.com/weibo?q=" + encode;
    String htmlBody = null;
    Request request = RequestUtils.wrapGet(url);
    try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
        htmlBody = response.body().string();
    } catch (IOException e) {
        log.error("解析微博时热搜时出现连接失败", e);
    }
    if (htmlBody == null || !htmlBody.contains("m-main")) {
        return document;
    }
    try {
        org.jsoup.nodes.Document page = Jsoup.parse(htmlBody);
        // A failure while storing users must not prevent the "about" panel from being collected.
        try {
            for (Element userCard : page.select("div.card-user")) {
                String type = userCard.select("div.card-head").text().isEmpty() ? "当事人" : "话题贡献者";
                // select() yields an empty collection when nothing matches, so no null check is needed.
                for (Element item : userCard.select("ul.card-user-list-a").select("li")) {
                    writeUser(item, type, topic);
                }
            }
        } catch (Exception e) {
            log.error("话题贡献者排行采集异常", e);
        }
        Document about = new Document();
        Elements labels = page.select("div.card-about").select("dt");
        Elements details = page.select("div.card-about").select("dd");
        for (int i = 0; i < labels.size(); i++) {
            // NOTE(review): only the ASCII colon is stripped — confirm whether the page uses the full-width ":".
            String label = labels.get(i).text().replaceAll(":", "").trim();
            List<String> linkTexts = null;
            if (i < details.size()) {
                linkTexts = new ArrayList<>();
                for (Element link : details.get(i).select("a")) {
                    linkTexts.add(link.text());
                }
            }
            about.put(label, linkTexts);
        }
        return about;
    } catch (Exception e) {
        log.error("解析微博话题时出现解析错误", e);
    }
    return document;
}
/**
 * Builds a {@link WeiBoUser} from one {@code li} element of a user card on the
 * PC search page and stores it through {@code weiBoUserDao}.
 *
 * <p>The record id is {@code userId_type_topic}. Entries without a usable avatar
 * link are skipped with a warning instead of aborting the surrounding collection.
 *
 * @param eleme list item holding {@code a.name} and {@code span.avator a}
 * @param type  role label stored on the user (e.g. "话题贡献者" or "当事人")
 * @param topic hot-search topic the user belongs to
 */
private static void writeUser(Element eleme, String type, String topic) {
    if (Objects.isNull(weiBoUserDao)) {
        weiBoUserDao = new WeiBoUserDao();
    }
    String userName = eleme.select("a.name").text();
    Element avatarLink = eleme.select("span.avator").select("a").first();
    if (avatarLink == null) {
        log.warn("话题{}的{}缺少头像链接,跳过用户{}", topic, type, userName);
        return;
    }
    String href = avatarLink.attr("href");
    // The id is taken after a fixed 14-character prefix — presumably "//weibo.com/u/"; TODO confirm against the live page.
    final int idOffset = 14;
    if (href.length() <= idOffset) {
        log.warn("话题{}的{}头像链接格式异常:{}", topic, type, href);
        return;
    }
    String userId = href.substring(idOffset);
    // Argument order follows WeiBoUser(userId, userName, topic, time); the two were previously passed swapped.
    WeiBoUser weiBoUser = new WeiBoUser(userId, userName, topic, new Date());
    weiBoUser.setType(type);
    weiBoUser.setId(userId + "_" + type + "_" + topic);
    weiBoUserDao.addWeiBoUser(weiBoUser);
}
/**
 * Finds the first card in the group that carries a titled {@code mblog} and
 * converts it with {@code analysisWeiboMBlog}.
 *
 * @param cardGroup cards of one card group
 * @param topic     hot-search topic the cards belong to
 * @return the converted message, or {@code null} when no card has a titled {@code mblog}
 */
public static WeiBoMassage analysisWeiboMassage(JSONArray cardGroup, String topic) {
    int position = 0;
    while (position < cardGroup.size()) {
        JSONObject candidate = cardGroup.getJSONObject(position);
        position++;
        if (!candidate.containsKey("mblog")) {
            continue;
        }
        boolean titled = candidate.getJSONObject("mblog").containsKey("title");
        if (!titled) {
            continue;
        }
        return analysisWeiboMBlog(candidate, topic);
    }
    return null;
}
/**
 * Extracts the users attached to a card group of a hot-search topic.
 *
 * <p>The group is scanned for the first card whose {@code card_type} is 3 (a
 * {@code users} array) or 10 (a single {@code user}); the users of that card are
 * returned and the scan stops. Follower counts are parsed leniently: plain
 * digits, "万" (x10,000) and "亿" (x100,000,000) units and decimal mantissas such
 * as "12.3万" are accepted; an absent or unparseable count becomes {@code null}.
 *
 * @param cardGroup cards of one card group
 * @param topic     hot-search topic the users are collected for
 * @return users of the first matching card (possibly empty); an immutable empty
 *         list when the group has no card of type 3 or 10
 */
public static List<WeiBoUser> analysisWeiBoUsers(JSONArray cardGroup, String topic) {
    List<WeiBoUser> weiBoUserList = new ArrayList<>();
    Date date = new Date();
    for (int i = 0; i < cardGroup.size(); i++) {
        JSONObject card = cardGroup.getJSONObject(i);
        if (card == null) {
            continue;
        }
        // getIntValue yields 0 for a missing card_type instead of throwing.
        int cardType = card.getIntValue("card_type");
        // NOTE(review): returning at the first matching card means later user cards in the
        // same group are ignored — kept as-is; confirm whether that is intended.
        if (cardType == 3) {
            JSONArray users = card.getJSONArray("users");
            if (users != null) {
                for (int j = 0; j < users.size(); j++) {
                    JSONObject user = users.getJSONObject(j);
                    if (user != null) {
                        weiBoUserList.add(toWeiBoUser(user, topic, date));
                    }
                }
            }
            return weiBoUserList;
        }
        if (cardType == 10) {
            JSONObject user = card.getJSONObject("user");
            if (user != null) {
                weiBoUserList.add(toWeiBoUser(user, topic, date));
            }
            return weiBoUserList;
        }
    }
    return Collections.emptyList();
}

/** Maps one user JSON object (id, screen name, verification text, followers, avatar) to a WeiBoUser. */
private static WeiBoUser toWeiBoUser(JSONObject user, String topic, Date date) {
    String userId = user.getString("id");
    String userName = user.getString("screen_name");
    String attestationMassage = user.getString("verified_reason");
    Long followerCount = parseFollowerCount(user.getString("followers_count"));
    String profileImageUrl = user.getString("profile_image_url");
    return new WeiBoUser(userId, attestationMassage, userName, topic, date, followerCount, profileImageUrl);
}

/** Parses a textual follower count ("123", "12.3万", "1.2亿"); returns null when absent or unparseable. */
private static Long parseFollowerCount(String raw) {
    if (raw == null || raw.trim().isEmpty()) {
        return null;
    }
    String text = raw.trim();
    try {
        int wanAt = text.indexOf("万");
        if (wanAt >= 0) {
            return Math.round(Double.parseDouble(text.substring(0, wanAt).trim()) * 10000);
        }
        int yiAt = text.indexOf("亿");
        if (yiAt >= 0) {
            return Math.round(Double.parseDouble(text.substring(0, yiAt).trim()) * 100000000L);
        }
        return Long.valueOf(text);
    } catch (NumberFormatException e) {
        log.warn("粉丝数格式无法解析:{}", raw);
        return null;
    }
}
/**
* 解析微博类型
*
* @param jsonObject
* @param topic
* @return
*/
public static WeiBoMassage analysisWeiboMBlog(JSONObject jsonObject, String topic) {
    // Builds a WeiBoMassage from the card's "mblog" object. Optional parts of the payload
    // (title, text, retweeted user, pictures, page_info, timestamps) are null-checked so a
    // missing field leaves the corresponding value null instead of aborting the whole card.
    JSONObject mblog = jsonObject.getJSONObject("mblog");

    // NOTE(review): "title" looks optional in the payload; a missing title yields a null type.
    JSONObject title = mblog.getJSONObject("title");
    String type = title == null ? null : title.getString("text");

    // Kept strict on purpose: a missing or non-numeric card_type/show_type still throws
    // NumberFormatException exactly as before.
    Integer cardType = Integer.valueOf(jsonObject.getString("card_type"));
    Integer showType = Integer.valueOf(jsonObject.getString("show_type"));

    // Like / comment / repost counters may arrive abbreviated (e.g. "1.5万").
    Long attitudeCount = parseWeiboCount(mblog.getString("attitudes_count"));
    Long commentCount = parseWeiboCount(mblog.getString("comments_count"));
    Long repostCount = parseWeiboCount(mblog.getString("reposts_count"));

    Date createTime = null;
    Date editTime = null;
    try {
        // A fresh formatter per call: SimpleDateFormat is not thread-safe and this method is static.
        SimpleDateFormat weiboDateFormat =
                new SimpleDateFormat("EEE MMM dd HH:mm:ss z yyyy", java.util.Locale.US);
        String createdAt = mblog.getString("created_at");
        if (createdAt != null) {
            createTime = weiboDateFormat.parse(createdAt);
        }
        String editAt = mblog.getString("edit_at");
        if (editAt != null) {
            editTime = weiboDateFormat.parse(editAt);
        }
    } catch (ParseException e) {
        log.error("创建时间和编辑时间解析异常", e);
    }

    String mid = mblog.getString("mid");
    JSONObject author = mblog.getJSONObject("user");
    String userId = author.getString("id");
    String userName = author.getString("screen_name");
    String source = mblog.getString("source");
    String profileImageUrl = author.getString("profile_image_url");

    // Only run the HTML parser when the text actually contains markup; plain text is kept verbatim.
    String rawText = mblog.getString("text");
    String content = rawText != null && rawText.contains("<") ? Jsoup.parse(rawText).text() : rawText;

    WeiBoMassage weiBoMassage = new WeiBoMassage(userId, content, userName, mid, createTime, editTime,
            cardType, showType, repostCount, commentCount, attitudeCount, source, type, topic, profileImageUrl);
    // forward flag: 0 = original post, 1 = retweet
    weiBoMassage.setForward(0);

    // Media (pictures / play count) is read from the retweeted post when there is one,
    // otherwise from the post itself.
    JSONObject weiboJson = mblog.getJSONObject("retweeted_status");
    if (weiboJson != null) {
        String rootText = weiboJson.getString("text");
        // NOTE(review): the retweeted post's "user" is believed to be null when the original
        // was deleted; the root user fields are then left null rather than throwing.
        JSONObject rootUser = weiboJson.getJSONObject("user");
        weiBoMassage.setRoot_mid(weiboJson.getString("mid"));
        weiBoMassage.setRoot_id(rootUser == null ? null : rootUser.getString("id"));
        weiBoMassage.setRoot_source(weiboJson.getString("source"));
        weiBoMassage.setRoot_text(rootText == null ? null : Jsoup.parse(rootText).text());
        weiBoMassage.setRoot_name(rootUser == null ? null : rootUser.getString("screen_name"));
        weiBoMassage.setForward(1);
    } else {
        weiboJson = mblog;
    }

    List<String> pictureUrlList = new ArrayList<>();
    Long playCount = null;
    JSONArray picIds = weiboJson.getJSONArray("pic_ids");
    if (picIds != null && !picIds.isEmpty()) {
        JSONArray pics = weiboJson.getJSONArray("pics");
        if (pics != null) {
            for (int i = 0; i < pics.size(); i++) {
                pictureUrlList.add(pics.getJSONObject(i).getString("url"));
            }
        }
    } else {
        JSONObject pageInfo = weiboJson.getJSONObject("page_info");
        if (pageInfo != null && pageInfo.containsKey("play_count")) {
            playCount = parseWeiboPlayCount(pageInfo.getString("play_count"));
        }
    }
    weiBoMassage.setPlayCount(playCount);
    weiBoMassage.setPictureUrlList(pictureUrlList);
    return weiBoMassage;
}

/**
 * Converts a Weibo counter string to a number, expanding the "万" (x10,000) and
 * "亿" (x100,000,000) units and accepting fractional values such as "1.5万".
 * Anything after the unit character (for example a trailing "+") is ignored.
 *
 * @param raw counter text; may be null
 * @return the numeric value truncated to a long, or null when {@code raw} is null or has no numeric part
 * @throws NumberFormatException if the numeric part is not a valid decimal number
 */
private static Long parseWeiboCount(String raw) {
    if (raw == null) {
        return null;
    }
    String numberPart = raw.trim();
    long multiplier = 1L;
    int unitAt = numberPart.indexOf("亿");
    if (unitAt >= 0) {
        multiplier = 100000000L;
    } else {
        unitAt = numberPart.indexOf("万");
        if (unitAt >= 0) {
            multiplier = 10000L;
        }
    }
    if (unitAt >= 0) {
        numberPart = numberPart.substring(0, unitAt).trim();
    }
    if (numberPart.isEmpty()) {
        return null;
    }
    // BigDecimal instead of Long.valueOf so that "1.5" (from "1.5万") does not throw.
    return new java.math.BigDecimal(numberPart)
            .multiply(java.math.BigDecimal.valueOf(multiplier))
            .longValue();
}

/**
 * Converts a play-count label such as "123次播放" or "1.2万次播放" to a number.
 * Labels carrying neither a "次" suffix nor a "万"/"亿" unit are not recognised and
 * yield null, matching the previous behaviour for such input.
 *
 * @param play play-count text; may be null
 * @return the play count, or null when the label is null or not in a recognised format
 * @throws NumberFormatException if the numeric part is not a valid decimal number
 */
private static Long parseWeiboPlayCount(String play) {
    if (play == null) {
        return null;
    }
    int timesAt = play.indexOf("次");
    boolean hasUnit = play.contains("万") || play.contains("亿");
    if (timesAt < 0 && !hasUnit) {
        return null;
    }
    // Drop the "次…" suffix first so only the number (and optional unit) is left to parse.
    String numeric = timesAt >= 0 ? play.substring(0, timesAt) : play;
    return parseWeiboCount(numeric);
}
// /**
// * 微博更新历史数据
......
......@@ -208,6 +208,17 @@ public class HotSearchCacheDAO {
}
if("微博热搜".equals(type)){
nowDoc = WeiboHotSearchCrawler.weiboUpdate(nowDoc);
//更新微博话题贡献者,关于功能
Document documentPC = WeiboHotSearchCrawler.weiboUpdatePC(nowDoc);
if (documentPC.containsKey("分类")) {
nowDoc.put("classify",documentPC.get("分类"));
}
if (documentPC.containsKey("地区")) {
nowDoc.put("region", documentPC.get("地区"));
}
if (documentPC.containsKey("标签")) {
nowDoc.put("label", documentPC.get("标签"));
}
if(nowDoc.containsKey("topicLead")){
nowDoc.put("topicLead", nowDoc.getString("topicLead"));
}
......
......@@ -32,6 +32,7 @@ public class WeiBoUserDao {
* @param weiBoUser
*/
public void addWeiBoUser(WeiBoUser weiBoUser){
try {
Document document = new Document();
document.put("_id",weiBoUser.getId());
......@@ -42,8 +43,15 @@ public class WeiBoUserDao {
document.put("userName",weiBoUser.getUserName());
document.put("topic",weiBoUser.getTopic());
document.put("time",weiBoUser.getTime());
document.put("followerCount",weiBoUser.getFollowerCount());
document.put("profileImageUrl",weiBoUser.getProfileImageUrl());
if (Objects.nonNull(weiBoUser.getType())){
document.put("type",weiBoUser.getType());
}
if (Objects.nonNull(weiBoUser.getFollowerCount())){
document.put("followerCount",weiBoUser.getFollowerCount());
}
if (Objects.nonNull(weiBoUser.getProfileImageUrl())){
document.put("profileImageUrl",weiBoUser.getProfileImageUrl());
}
try {
mongoCollection.insertOne(document);
} catch (Exception e) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment