Commit 7546fbb8 by chenweitao

Merge branch 'working' into 'master'

Working

See merge request !99
parents e3740064 3d271f76
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4"> <module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" version="4">
<component name="FacetManager"> <component name="FacetManager">
<facet type="Spring" name="Spring"> <facet type="Spring" name="Spring">
<configuration /> <configuration />
...@@ -10,8 +10,8 @@ ...@@ -10,8 +10,8 @@
<output-test url="file://$MODULE_DIR$/target/test-classes" /> <output-test url="file://$MODULE_DIR$/target/test-classes" />
<content url="file://$MODULE_DIR$"> <content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" /> <sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" /> <sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
<excludeFolder url="file://$MODULE_DIR$/target" /> <excludeFolder url="file://$MODULE_DIR$/target" />
</content> </content>
<orderEntry type="inheritedJdk" /> <orderEntry type="inheritedJdk" />
...@@ -77,5 +77,8 @@ ...@@ -77,5 +77,8 @@
<orderEntry type="library" name="Maven: org.apache.commons:commons-pool2:2.4.2" level="project" /> <orderEntry type="library" name="Maven: org.apache.commons:commons-pool2:2.4.2" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: junit:junit:4.12" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: junit:junit:4.12" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest-core:1.3" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest-core:1.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpclient:4.5.6" level="project" />
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpcore:4.4.10" level="project" />
<orderEntry type="library" name="Maven: commons-codec:commons-codec:1.10" level="project" />
</component> </component>
</module> </module>
\ No newline at end of file
package com.zhiwei.searchhotcrawler.bean; package com.zhiwei.searchhotcrawler.bean;
public enum HotSearchType { public enum HotSearchType {
百度热搜, 百度热搜,
微博热搜, 微博热搜,
知乎热搜, 知乎热搜,
抖音热搜, 抖音热搜,
搜狗微信热搜, 搜狗微信热搜,
搜狗微信客户端热搜, 搜狗微信客户端热搜,
微博话题, 微博话题,
今日头条热搜, 今日头条热搜,
知乎热搜榜单, 知乎热搜榜单,
腾讯新闻, 腾讯新闻,
新浪热榜, 新浪热榜,
新浪热点, 新浪热点,
搜狐话题, 搜狐话题,
凤凰新闻热榜, 凤凰新闻热榜,
凤凰新闻热搜, 凤凰新闻热搜,
网易热榜, 网易热榜,
网易跟帖热议, 网易跟帖热议,
微博预热榜, 微博预热榜,
腾讯较真榜, 腾讯较真榜,
脉脉热榜, 脉脉热榜,
B站排行榜, B站排行榜,
B站热搜, B站热搜,
人气榜36, 人气榜36,
虎嗅热文推荐, 虎嗅热文推荐,
快手热榜, 快手热榜,
淘宝热搜,
} }
...@@ -53,6 +53,11 @@ public class WeiBoUser implements Serializable { ...@@ -53,6 +53,11 @@ public class WeiBoUser implements Serializable {
* 头像地址 * 头像地址
*/ */
private String profileImageUrl; private String profileImageUrl;
/**
* 类型
*/
private String type;
public WeiBoUser() { public WeiBoUser() {
} }
...@@ -66,6 +71,11 @@ public class WeiBoUser implements Serializable { ...@@ -66,6 +71,11 @@ public class WeiBoUser implements Serializable {
this.time=time; this.time=time;
this.followerCount=followerCount; this.followerCount=followerCount;
this.profileImageUrl = profileImageUrl; this.profileImageUrl = profileImageUrl;
}
public WeiBoUser(String userId, String userName,String topic,Date time) {
this.userId = userId;
this.userName = userName;
this.topic=topic;
this.time=time;
} }
} }
...@@ -14,8 +14,6 @@ import org.jsoup.Jsoup; ...@@ -14,8 +14,6 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
......
...@@ -45,7 +45,10 @@ public class MaiMaiHotSearchCrawler { ...@@ -45,7 +45,10 @@ public class MaiMaiHotSearchCrawler {
JSONObject jsonObject = jsonArray.getJSONObject(i).getJSONObject("style35"); JSONObject jsonObject = jsonArray.getJSONObject(i).getJSONObject("style35");
if(jsonObject != null) { if(jsonObject != null) {
String name = jsonObject.getString("text"); String name = jsonObject.getString("text");
log.info(name); if (name.length()>750){
name = name.substring(0,750);
}
// log.info(name);
String maimaiUrl = jsonObject.getString("share_url"); String maimaiUrl = jsonObject.getString("share_url");
String icon = null; String icon = null;
if (jsonObject.containsKey("hot_type_card")) { if (jsonObject.containsKey("hot_type_card")) {
......
package com.zhiwei.searchhotcrawler.crawler; package com.zhiwei.searchhotcrawler.crawler;
import java.io.IOException; import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.text.ParseException; import java.text.ParseException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSON;
import com.mongodb.client.result.UpdateResult;
import com.zhiwei.searchhotcrawler.bean.*; import com.zhiwei.searchhotcrawler.bean.*;
import com.zhiwei.searchhotcrawler.config.RedisConfig; import com.zhiwei.searchhotcrawler.config.RedisConfig;
import com.zhiwei.searchhotcrawler.dao.RedisDao; import com.zhiwei.searchhotcrawler.dao.RedisDao;
...@@ -17,6 +20,12 @@ import lombok.extern.log4j.Log4j2; ...@@ -17,6 +20,12 @@ import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response; import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.bson.Document; import org.bson.Document;
import org.checkerframework.checker.units.qual.C; import org.checkerframework.checker.units.qual.C;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
...@@ -37,6 +46,7 @@ import org.springframework.beans.factory.annotation.Autowired; ...@@ -37,6 +46,7 @@ import org.springframework.beans.factory.annotation.Autowired;
import static java.util.Objects.nonNull; import static java.util.Objects.nonNull;
/** /**
* @author hero
* @ClassName: WeiboHotSearch * @ClassName: WeiboHotSearch
* @Description: 微博实时热搜采集 * @Description: 微博实时热搜采集
* @author hero * @author hero
...@@ -113,444 +123,553 @@ public class WeiboHotSearchCrawler { ...@@ -113,444 +123,553 @@ public class WeiboHotSearchCrawler {
// } // }
/**
* @return void 返回类型
/** * @Title: weiboHotSearchByPhoneTest
* @Title: weiboHotSearchByPhoneTest * @author hero
* @author hero * @Description: TODO(手机端Iphone 微博热搜采集)
* @Description: TODO(手机端Iphone 微博热搜采集) */
* @return void 返回类型 public static List<HotSearchList> weiboHotSearchByPhone(Date date) {
*/ String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583";
public static List<HotSearchList> weiboHotSearchByPhone(Date date){ Map<String, String> headerMap = new HashMap<>();
String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583"; headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
Map<String,String> headerMap = new HashMap<>(); String htmlBody = null;
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"); Request request = RequestUtils.wrapGet(url, headerMap);
String htmlBody = null; for (int count = 0; count <= 5; count++) {
Request request = RequestUtils.wrapGet(url, headerMap); try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
for(int count =0; count<=5; count++){ htmlBody = response.body().string();
try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) { } catch (IOException e) {
htmlBody = response.body().string(); log.error("解析微博时热搜时出现连接失败", e);
} catch (IOException e) { }
log.error("解析微博时热搜时出现连接失败",e); List<HotSearchList> result = new ArrayList<HotSearchList>();
} if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
List<HotSearchList> result = new ArrayList<HotSearchList>(); try {
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) { JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data");
try { JSONArray cards = json.getJSONArray("cards");
JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data"); int rank = 0;
JSONArray cards = json.getJSONArray("cards");
int rank = 0;
// for (int i = 0; i < cards.size(); i++) { // for (int i = 0; i < cards.size(); i++) {
try { try {
JSONObject card = cards.getJSONObject(0); JSONObject card = cards.getJSONObject(0);
JSONArray cardGroup = card.getJSONArray("card_group"); JSONArray cardGroup = card.getJSONArray("card_group");
JSONObject topCard =cardGroup.getJSONObject(0); JSONObject topCard = cardGroup.getJSONObject(0);
if(!topCard.containsKey("pic")){ if (!topCard.containsKey("pic")) {
rank = 1; rank = 1;
} }
if (Objects.nonNull(cardGroup) && !cardGroup.isEmpty()) { if (Objects.nonNull(cardGroup) && !cardGroup.isEmpty()) {
// String title = card.getString("title"); // String title = card.getString("title");
boolean hot = true; boolean hot = true;
// if (Objects.nonNull(title) && title.contains("实时上升热点")) { // if (Objects.nonNull(title) && title.contains("实时上升热点")) {
// hot = false; // hot = false;
// rank = 51; // rank = 51;
// } // }
for (int j = 0; j < cardGroup.size(); j++) { for (int j = 0; j < cardGroup.size(); j++) {
JSONObject cardInfo = cardGroup.getJSONObject(j); JSONObject cardInfo = cardGroup.getJSONObject(j);
String name = cardInfo.getString("desc"); String name = cardInfo.getString("desc");
long hotCount = cardInfo.getLongValue("desc_extr"); long hotCount = cardInfo.getLongValue("desc_extr");
String icon = cardInfo.getString("icon"); String icon = cardInfo.getString("icon");
if (StringUtils.isNotBlank(icon)) { if (StringUtils.isNotBlank(icon)) {
icon = icon.split("_")[1].split(".png")[0]; icon = icon.split("_")[1].split(".png")[0];
} }
String rankPic = cardInfo.getString("pic");
// String id = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top"; // String id = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
String urlScheme = cardInfo.getString("scheme"); String id = cardInfo.getString("scheme");
HotSearchList hotSearch = new HotSearchList(urlScheme, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date); HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
hotSearch.setRankPic(rankPic); result.add(hotSearch);
result.add(hotSearch); rank++;
rank++; redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS, name + "_微博热搜");
redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS,name+"_微博热搜"); }
} } else {
} else { log.info("card 数据结构为:{}", card);
log.info("card 数据结构为:{}", card); }
} } catch (Exception e) {
} catch (Exception e) { log.error("解析微博时热搜时出现解析错误", e);
log.error("解析微博时热搜时出现解析错误", e); continue;
continue; }
}
// } // }
return result; return result;
} catch (Exception e) { } catch (Exception e) {
log.error("解析微博时热搜时出现解析错误,数据不是json结构", e); log.error("解析微博时热搜时出现解析错误,数据不是json结构", e);
} }
} else { } else {
log.info("解析微博时热搜时出现解析错误,页面结构有问题"); log.info("解析微博时热搜时出现解析错误,页面结构有问题");
} }
} }
return Collections.emptyList(); return Collections.emptyList();
} }
/** /**
* 微博预热榜(实时上升热点采集) * 微博预热榜(实时上升热点采集)
* @param date *
* @return * @param date
*/ * @return
public static List<HotSearchList> weiboPreheatSearch(Date date){ */
String url = "https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot"; public static List<HotSearchList> weiboPreheatSearch(Date date) {
String htmlBody = null; String url = "https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot";
Request request = RequestUtils.wrapGet(url); String htmlBody = null;
try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) { Request request = RequestUtils.wrapGet(url);
htmlBody = response.body().string(); try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
} catch (IOException e) { htmlBody = response.body().string();
log.error("解析微博热搜时出现连接失败",e); } catch (IOException e) {
} log.error("解析微博热搜时出现连接失败", e);
List<HotSearchList> result = new ArrayList<>(); }
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")){ List<HotSearchList> result = new ArrayList<>();
JSONArray cardArray = JSON.parseObject(htmlBody).getJSONArray("cards"); if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
if(cardArray.size() > 1) { JSONArray cardArray = JSON.parseObject(htmlBody).getJSONArray("cards");
JSONObject jsonObject = cardArray.getJSONObject(1); if (cardArray.size() > 1) {
if ("实时上升热点".equals(jsonObject.getString("title")) && JSONObject jsonObject = cardArray.getJSONObject(1);
jsonObject.containsKey("card_group")) { if ("实时上升热点".equals(jsonObject.getString("title")) &&
JSONArray jsonArray = jsonObject.getJSONArray("card_group"); jsonObject.containsKey("card_group")) {
for(int i=0; i<jsonArray.size(); i++){ JSONArray jsonArray = jsonObject.getJSONArray("card_group");
JSONObject cardInfo = jsonArray.getJSONObject(i); for (int i = 0; i < jsonArray.size(); i++) {
String name = cardInfo.getString("desc"); JSONObject cardInfo = jsonArray.getJSONObject(i);
long hotCount = cardInfo.getIntValue("desc_extr"); String name = cardInfo.getString("desc");
String weiboUrl = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top"; long hotCount = cardInfo.getIntValue("desc_extr");
HotSearchList hotSearchList = new HotSearchList(weiboUrl,name,hotCount,null,HotSearchType.微博预热榜.name(),date); String weiboUrl = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
result.add(hotSearchList); HotSearchList hotSearchList = new HotSearchList(weiboUrl, name, hotCount, null, HotSearchType.微博预热榜.name(), date);
} result.add(hotSearchList);
//根据热度排序,赋值排名 }
result = result.stream().sorted(Comparator.comparing(HotSearchList::getCount).reversed()).collect(Collectors.toList()); //根据热度排序,赋值排名
int rank =1; result = result.stream().sorted(Comparator.comparing(HotSearchList::getCount).reversed()).collect(Collectors.toList());
for(HotSearchList hotSearchList : result){ int rank = 1;
hotSearchList.setRank(rank); for (HotSearchList hotSearchList : result) {
rank++; hotSearchList.setRank(rank);
} rank++;
} }
} }
} }
return result; }
} return result;
}
/**
* 微博热搜数据更新导语,阅读量,讨论量 /**
* @param document * 微博热搜数据更新导语,阅读量,讨论量
* @return *
*/ * @param document
public static Document weiboUpdate(Document document) { * @return
log.info("更新微博热搜{}导语阅读量和讨论量",document.getString("name")); */
String url = "https://m.weibo.cn/api/container/getIndex?"+ document.getString("url").substring( public static Document weiboUpdate(Document document) {
document.getString("url").indexOf("?")+1,document.getString("url").indexOf("&")); log.info("更新微博热搜{}导语阅读量和讨论量", document.getString("name"));
String htmlBody = null; String url = "https://m.weibo.cn/api/container/getIndex?" + document.getString("url").substring(
Request request = RequestUtils.wrapGet(url); document.getString("url").indexOf("?") + 1, document.getString("url").indexOf("&"));
for(int count =0; count<=5; count++) { String htmlBody = null;
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Request request = RequestUtils.wrapGet(url);
htmlBody = response.body().string(); for (int count = 0; count <= 5; count++) {
} catch (IOException e) { try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
log.error("解析微博热搜详情页面时出现连接失败", e); htmlBody = response.body().string();
} } catch (IOException e) {
if (htmlBody != null && htmlBody.contains("data")) { log.error("解析微博热搜详情页面时出现连接失败", e);
JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("cardlistInfo"); }
List<JSONObject> cardsJsons = (List<JSONObject>)JSONObject.parseObject(htmlBody).getJSONObject("data").get("cards"); if (htmlBody != null && htmlBody.contains("data")) {
if(json.containsKey("desc")){ JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("cardlistInfo");
String topicLead = json.getString("desc"); List<JSONObject> cardsJsons = (List<JSONObject>) JSONObject.parseObject(htmlBody).getJSONObject("data").get("cards");
if(!"".equals(topicLead)) { if (json.containsKey("desc")) {
document.put("topicLead", topicLead); String topicLead = json.getString("desc");
} if (!"".equals(topicLead)) {
} document.put("topicLead", topicLead);
if(json.containsKey("cardlist_head_cards")){ }
JSONObject readJson = json.getJSONArray("cardlist_head_cards").getJSONObject(0); }
if (readJson.containsKey("head_data")) { if (json.containsKey("cardlist_head_cards")) {
String midText = readJson.getJSONObject("head_data").getString("midtext"); JSONObject readJson = json.getJSONArray("cardlist_head_cards").getJSONObject(0);
String read = midText.replaceAll("阅读", "").replaceAll("讨论.*", "").trim(); if (readJson.containsKey("head_data")) {
String discussCount = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim(); String midText = readJson.getJSONObject("head_data").getString("midtext");
String pictureUrl = readJson.getJSONObject("head_data").getString("portrait_url"); String read = midText.replaceAll("阅读", "").replaceAll("讨论.*", "").trim();
document.put("readCount", TipsUtils.getHotCount(read)); String discussCount = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim();
document.put("discussCount", TipsUtils.getHotCount(discussCount)); String pictureUrl = readJson.getJSONObject("head_data").getString("portrait_url");
document.put("pictureUrl",pictureUrl); document.put("readCount", TipsUtils.getHotCount(read));
if (readJson.getJSONObject("head_data").containsKey("downtext")){ document.put("discussCount", TipsUtils.getHotCount(discussCount));
String downtext = readJson.getJSONObject("head_data").getString("downtext"); document.put("pictureUrl", pictureUrl);
if(!"".equals(downtext)) { if (readJson.getJSONObject("head_data").containsKey("downtext")) {
document.put("downtext",downtext.replaceAll("主持人:","")); String downtext = readJson.getJSONObject("head_data").getString("downtext");
} if (!"".equals(downtext)) {
} document.put("downtext", downtext.replaceAll("主持人:", ""));
} }
} }
}
try { }
//解析cards,获取热门微博、人物
if (Objects.isNull(weiBoMassageDao)){ try {
weiBoMassageDao = new WeiBoMassageDao(); //解析cards,获取热门微博、人物
} if (Objects.isNull(weiBoMassageDao)) {
if (Objects.isNull(weiBoUserDao)){ weiBoMassageDao = new WeiBoMassageDao();
weiBoUserDao = new WeiBoUserDao(); }
} if (Objects.isNull(weiBoUserDao)) {
for (JSONObject jsonObject : cardsJsons) { weiBoUserDao = new WeiBoUserDao();
if (nonNull(jsonObject) && !jsonObject.isEmpty()) { }
if (jsonObject.containsKey("mblog")) { for (JSONObject jsonObject : cardsJsons) {
if (jsonObject.getJSONObject("mblog").containsKey("title")) { if (nonNull(jsonObject) && !jsonObject.isEmpty()) {
WeiBoMassage weiBoMassage = analysisWeiboMBlog(jsonObject, document.getString("name")); if (jsonObject.containsKey("mblog")) {
if (Objects.nonNull(weiBoMassage)) { if (jsonObject.getJSONObject("mblog").containsKey("title")) {
weiBoMassageDao.addWeiBoMassage(weiBoMassage); WeiBoMassage weiBoMassage = analysisWeiboMBlog(jsonObject, document.getString("name"));
} if (Objects.nonNull(weiBoMassage)) {
} weiBoMassageDao.addWeiBoMassage(weiBoMassage);
} else if (jsonObject.containsKey("card_group")) { }
JSONArray cardGroup = jsonObject.getJSONArray("card_group"); }
WeiBoMassage weiBoMassage = analysisWeiboMassage(cardGroup, document.getString("name")); } else if (jsonObject.containsKey("card_group")) {
if (Objects.nonNull(weiBoMassage)) { JSONArray cardGroup = jsonObject.getJSONArray("card_group");
weiBoMassageDao.addWeiBoMassage(weiBoMassage); WeiBoMassage weiBoMassage = analysisWeiboMassage(cardGroup, document.getString("name"));
} if (Objects.nonNull(weiBoMassage)) {
List<WeiBoUser> weiBoUserList = analysisWeiBoUsers(cardGroup, document.getString("name")); weiBoMassageDao.addWeiBoMassage(weiBoMassage);
if (!weiBoUserList.isEmpty()){ }
for (int i = 0; i < weiBoUserList.size(); i++) { List<WeiBoUser> weiBoUserList = analysisWeiBoUsers(cardGroup, document.getString("name"));
weiBoUserDao.addWeiBoUser(weiBoUserList.get(i)); if (!weiBoUserList.isEmpty()) {
} for (int i = 0; i < weiBoUserList.size(); i++) {
} weiBoUserDao.addWeiBoUser(weiBoUserList.get(i));
} }
} }
} }
} catch (Exception e) { }
log.error("解析cards失败,未获得热门微博,人物信息",e); }
} } catch (Exception e) {
return document; log.error("解析cards失败,未获得热门微博,人物信息", e);
} }
} return document;
return null; }
} }
return null;
/** }
* 解析微博信息
* /**
* @param cardGroup * 微博热搜数据更新话题贡献者排行,关于
* @param topic *
* @return * @param document
*/ * @return
*/
public static WeiBoMassage analysisWeiboMassage(JSONArray cardGroup, String topic) { public static Document weiboUpdatePC(Document document) {
for (int i = 0; i < cardGroup.size(); i++) { document.getString("name");
if (cardGroup.getJSONObject(i).containsKey("mblog")) { String topic = document.getString("name");
if (cardGroup.getJSONObject(i).getJSONObject("mblog").containsKey("title")) { String gb = "#" + topic + "#";
WeiBoMassage weiBoMassage = analysisWeiboMBlog(cardGroup.getJSONObject(i), topic); String encode = null;
return weiBoMassage; try {
} encode = URLEncoder.encode(gb, "utf-8");
} } catch (UnsupportedEncodingException e) {
} log.error("字符解析成URl模式异常", e);
return null; }
} String url = "https://s.weibo.com/weibo?q=" + encode;
String htmlBody = null;
/** Request request = RequestUtils.wrapGet(url);
* 解析用户信息 try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
* htmlBody = response.body().string();
* @param cardGroup } catch (IOException e) {
* @param topic log.error("解析微博时热搜时出现连接失败", e);
* @return }
*/ if (htmlBody != null && htmlBody.contains("m-main")) {
public static List<WeiBoUser> analysisWeiBoUsers(JSONArray cardGroup, String topic) { Document docm = new Document();
List<WeiBoUser> weiBoUserList = new ArrayList(); try {
//解析weibo人物信息 org.jsoup.nodes.Document documen = Jsoup.parse(htmlBody);
Date date = new Date(); //获取贡献者信息
for (int i = 0; i < cardGroup.size(); i++) { try {
if (3==Integer.valueOf(cardGroup.getJSONObject(i).getString("card_type"))) {
if (cardGroup.getJSONObject(i).containsKey("users")){ Elements cardUser = documen.select("div.card-user");
JSONArray users = cardGroup.getJSONObject(i).getJSONArray("users"); for (Element element : cardUser) {
for (int i1 = 0; i1 < users.size(); i1++) { if (!element.select("div.card-head").text().isEmpty()) {
//获取用户id Elements li = element.select("ul.card-user-list-a").select("li");
String userId = users.getJSONObject(i1).getString("id"); if (Objects.nonNull(li)) {
//获取用户名 //循环获取话题贡献者相关信息
String userName = users.getJSONObject(i1).getString("screen_name"); for (Element eleme : li) {
//获取认证信息 String type = "话题贡献者";
String attestationMassage = users.getJSONObject(i1).getString("verified_reason"); writeUser(eleme, type, topic);
}
//获取粉丝数量 }
String followers_count = users.getJSONObject(i1).getString("followers_count"); } else {
Long followerCount =null; Elements li = element.select("ul.card-user-list-a").select("li");
if (!followers_count.contains("万")){ if (Objects.nonNull(li)) {
followerCount = Long.valueOf(followers_count);
}else { //循环获取话题贡献者相关信息
String[] split = followers_count.split("万"); for (Element eleme : li) {
followerCount = Long.valueOf(split[0])*10000; String type = "当事人";
} writeUser(eleme, type, topic);
//用户头像地址 }
String profileImageUrl = users.getJSONObject(i1).getString("profile_image_url"); }
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount,profileImageUrl); }
weiBoUserList.add(weiBoUser); }
} } catch (Exception e) {
} log.error("话题贡献者排行采集异常", e);
return weiBoUserList; }
} else if (10==Integer.valueOf(cardGroup.getJSONObject(i).getString("card_type"))) { Elements dt = documen.select("div.card-about").select("dt");
if (cardGroup.getJSONObject(i).containsKey("user")){ if (Objects.nonNull(dt)) {
JSONObject user = cardGroup.getJSONObject(i).getJSONObject("user"); //获取微博关于的相关信息
//获取用户id Elements dd = documen.select("div.card-about").select("dd");
String userId = user.getString("id"); Document dtDocument = new Document();
//获取用户名 Document ddDocument = new Document();
String userName = user.getString("screen_name"); for (int i = 0; i < dt.size(); i++) {
//获取认证信息 String dtText = dt.get(i).text().replaceAll(":", "").trim();
String attestationMassage = user.getString("verified_reason"); dtDocument.put(String.valueOf(i), dtText);
//获取粉丝数 }
String followers_count = user.getString("followers_count"); for (int i1 = 0; i1 < dd.size(); i1++) {
Long followerCount =null; Elements a = dd.get(i1).select("a");
if (followers_count.contains("万")){ List<String> str = new ArrayList<>();
String[] split = followers_count.split("万"); for (int b = 0; b < a.size(); b++) {
followerCount = Long.valueOf(split[0])*10000; String text1 = a.get(b).text();
}else { str.add(text1);
followerCount = Long.valueOf(followers_count); }
} ddDocument.put(String.valueOf(i1), str);
//用户头像地址 }
String profileImageUrl = user.getString("profile_image_url"); for (int a = 0; a < dt.size(); a++) {
WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic,date,followerCount,profileImageUrl); docm.put(dtDocument.getString(String.valueOf(a)), ddDocument.get(String.valueOf(a)));
weiBoUserList.add(weiBoUser); }
} }
return weiBoUserList; return docm;
} } catch (Exception e) {
} log.error("解析微博话题时出现解析错误", e);
return Collections.emptyList(); }
} }
return document;
}
/**
* 解析微博类型 /**
* * 写入user数据
* @param jsonObject *
* @param topic * @param eleme
* @return * @param type
*/ */
public static WeiBoMassage analysisWeiboMBlog(JSONObject jsonObject, String topic) { private static void writeUser(Element eleme, String type, String topic) {
JSONObject mblog = jsonObject.getJSONObject("mblog"); Date date = new Date();
String type = mblog.getJSONObject("title").getString("text"); if (Objects.isNull(weiBoUserDao)) {
String card_type = jsonObject.getString("card_type"); weiBoUserDao = new WeiBoUserDao();
Integer cardType = Integer.valueOf(card_type); }
String show_type = jsonObject.getString("show_type"); //获取用户名
Integer showType = Integer.valueOf(show_type); String userName = eleme.select("a.name").text();
//点赞数 String attr = eleme.select("span.avator").select("a").first().attr("href");
String attitudes_count = mblog.getString("attitudes_count"); //获取用户id
Long attitudeCount = null; String userId = attr.substring(14);
if (attitudes_count.contains("万")) { String id = userId + "_" + type + "_" + topic;
String[] split = attitudes_count.split("万"); WeiBoUser weiBoUser = new WeiBoUser(userName, userId, topic, date);
attitudeCount = Long.valueOf(split[0]) * 10000; weiBoUser.setType(type);
} else { weiBoUser.setId(id);
attitudeCount = Long.valueOf(attitudes_count); weiBoUserDao.addWeiBoUser(weiBoUser);
} }
//评论数
String comments_count = mblog.getString("comments_count"); /**
Long commentCount = null; * 解析微博信息
if (comments_count.contains("万")) { *
String[] split = comments_count.split("万"); * @param cardGroup
commentCount = Long.valueOf(split[0]) * 10000; * @param topic
} else { * @return
commentCount = Long.valueOf(comments_count); */
}
public static WeiBoMassage analysisWeiboMassage(JSONArray cardGroup, String topic) {
//转发数 for (int i = 0; i < cardGroup.size(); i++) {
String reposts_count = mblog.getString("reposts_count"); if (cardGroup.getJSONObject(i).containsKey("mblog")) {
Long repostCount =null; if (cardGroup.getJSONObject(i).getJSONObject("mblog").containsKey("title")) {
if (reposts_count.contains("万")){ WeiBoMassage weiBoMassage = analysisWeiboMBlog(cardGroup.getJSONObject(i), topic);
String[] split = reposts_count.split("万"); return weiBoMassage;
repostCount = Long.valueOf(split[0]) * 10000; }
}else { }
repostCount = Long.valueOf(reposts_count); }
} return null;
Date createTime = null; }
Date editTime = null;
/**
try { * 解析用户信息
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("EEE MMM dd HH:mm:ss z yyyy", java.util.Locale.US); *
//创建时间 * @param cardGroup
String created_at = mblog.getString("created_at"); * @param topic
* @return
createTime = simpleDateFormat.parse(created_at); */
//编辑时间 public static List<WeiBoUser> analysisWeiBoUsers(JSONArray cardGroup, String topic) {
if (mblog.containsKey("edit_at")){ List<WeiBoUser> weiBoUserList = new ArrayList();
String edit_at = mblog.getString("edit_at"); //解析weibo人物信息
editTime = simpleDateFormat.parse(edit_at); Date date = new Date();
} for (int i = 0; i < cardGroup.size(); i++) {
} catch (ParseException e) { if (3 == Integer.valueOf(cardGroup.getJSONObject(i).getString("card_type"))) {
log.error("创建时间和编辑时间解析异常",e); if (cardGroup.getJSONObject(i).containsKey("users")) {
} JSONArray users = cardGroup.getJSONObject(i).getJSONArray("users");
for (int i1 = 0; i1 < users.size(); i1++) {
String mid = mblog.getString("mid"); //获取用户id
//用户id String userId = users.getJSONObject(i1).getString("id");
String userId = mblog.getJSONObject("user").getString("id"); //获取用户名
//用户名 String userName = users.getJSONObject(i1).getString("screen_name");
String userName = mblog.getJSONObject("user").getString("screen_name"); //获取认证信息
//来源 String attestationMassage = users.getJSONObject(i1).getString("verified_reason");
String source = mblog.getString("source");
//用户头像地址 //获取粉丝数量
String profileImageUrl = mblog.getJSONObject("user").getString("profile_image_url"); String followers_count = users.getJSONObject(i1).getString("followers_count");
//内容 Long followerCount = null;
String content = null; if (!followers_count.contains("万")) {
if (mblog.getString("text").contains("<")) { followerCount = Long.valueOf(followers_count);
String text = mblog.getString("text"); } else {
org.jsoup.nodes.Document parse = Jsoup.parse(text); String[] split = followers_count.split("万");
content = parse.text(); followerCount = Long.valueOf(split[0]) * 10000;
}
} else { //用户头像地址
content = mblog.getString("text"); String profileImageUrl = users.getJSONObject(i1).getString("profile_image_url");
} WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic, date, followerCount, profileImageUrl);
weiBoUserList.add(weiBoUser);
WeiBoMassage weiBoMassage = new WeiBoMassage(userId, content, userName, mid, createTime, editTime, cardType, showType, }
repostCount, commentCount, attitudeCount, source, type, topic,profileImageUrl); }
//默认不转发为0 return weiBoUserList;
weiBoMassage.setForward(0); } else if (10 == Integer.valueOf(cardGroup.getJSONObject(i).getString("card_type"))) {
if (cardGroup.getJSONObject(i).containsKey("user")) {
JSONObject weiboJson = null; JSONObject user = cardGroup.getJSONObject(i).getJSONObject("user");
//微博实体 是否转发 //获取用户id
if (mblog.containsKey("retweeted_status")) { String userId = user.getString("id");
weiboJson = mblog.getJSONObject("retweeted_status"); //获取用户名
//处理转发特有的 String userName = user.getString("screen_name");
//weiBoMassage.set //获取认证信息
//源mid String attestationMassage = user.getString("verified_reason");
String rootMid = weiboJson.getString("mid"); //获取粉丝数
//源来源 String followers_count = user.getString("followers_count");
String rootSource = weiboJson.getString("source"); Long followerCount = null;
//源text if (followers_count.contains("万")) {
String text = weiboJson.getString("text"); String[] split = followers_count.split("万");
//解析 followerCount = Long.valueOf(split[0]) * 10000;
org.jsoup.nodes.Document parse = Jsoup.parse(text); } else {
String rootText = parse.text(); followerCount = Long.valueOf(followers_count);
//源用户id }
String rootId = weiboJson.getJSONObject("user").getString("id"); //用户头像地址
//源用户名 String profileImageUrl = user.getString("profile_image_url");
String rootName = weiboJson.getJSONObject("user").getString("screen_name"); WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic, date, followerCount, profileImageUrl);
//数据保存到对象中 weiBoUserList.add(weiBoUser);
weiBoMassage.setRoot_mid(rootMid); }
weiBoMassage.setRoot_id(rootId); return weiBoUserList;
weiBoMassage.setRoot_source(rootSource); }
weiBoMassage.setRoot_text(rootText); }
weiBoMassage.setRoot_name(rootName); return Collections.emptyList();
//转发为1 }
weiBoMassage.setForward(1);
} else {
weiboJson = mblog; /**
} * 解析微博类型
List<String> pictureUrlList = new ArrayList(); *
Long playCount = null; * @param jsonObject
//获取播放量和图片链接 * @param topic
if (weiboJson.getJSONArray("pic_ids").size() > 0) { * @return
JSONArray jsonArray = weiboJson.getJSONArray("pics"); */
for (int i = 0; i < jsonArray.size(); i++) { public static WeiBoMassage analysisWeiboMBlog(JSONObject jsonObject, String topic) {
String picUrl = jsonArray.getJSONObject(i).getString("url"); JSONObject mblog = jsonObject.getJSONObject("mblog");
pictureUrlList.add(picUrl); String type = mblog.getJSONObject("title").getString("text");
} String card_type = jsonObject.getString("card_type");
} else if (weiboJson.containsKey("page_info")) { Integer cardType = Integer.valueOf(card_type);
if (weiboJson.getJSONObject("page_info").containsKey("play_count")){ String show_type = jsonObject.getString("show_type");
String play = weiboJson.getJSONObject("page_info").getString("play_count"); Integer showType = Integer.valueOf(show_type);
if (play.contains("万")) { //点赞数
String[] split = play.split("万"); String attitudes_count = mblog.getString("attitudes_count");
playCount = Long.valueOf(split[0]) * 10000; Long attitudeCount = null;
}else if(play.contains("次")){ if (attitudes_count.contains("万")) {
String[] split = play.split("次"); String[] split = attitudes_count.split("万");
playCount = Long.valueOf(split[0]); attitudeCount = Long.valueOf(split[0]) * 10000;
} } else {
} attitudeCount = Long.valueOf(attitudes_count);
} }
weiBoMassage.setPlayCount(playCount);
weiBoMassage.setPictureUrlList(pictureUrlList); //评论数
return weiBoMassage; String comments_count = mblog.getString("comments_count");
} Long commentCount = null;
if (comments_count.contains("万")) {
String[] split = comments_count.split("万");
commentCount = Long.valueOf(split[0]) * 10000;
} else {
commentCount = Long.valueOf(comments_count);
}
//转发数
String reposts_count = mblog.getString("reposts_count");
Long repostCount = null;
if (reposts_count.contains("万")) {
String[] split = reposts_count.split("万");
repostCount = Long.valueOf(split[0]) * 10000;
} else {
repostCount = Long.valueOf(reposts_count);
}
Date createTime = null;
Date editTime = null;
try {
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("EEE MMM dd HH:mm:ss z yyyy", java.util.Locale.US);
//创建时间
String created_at = mblog.getString("created_at");
createTime = simpleDateFormat.parse(created_at);
//编辑时间
if (mblog.containsKey("edit_at")) {
String edit_at = mblog.getString("edit_at");
editTime = simpleDateFormat.parse(edit_at);
}
} catch (ParseException e) {
log.error("创建时间和编辑时间解析异常", e);
}
String mid = mblog.getString("mid");
//用户id
String userId = mblog.getJSONObject("user").getString("id");
//用户名
String userName = mblog.getJSONObject("user").getString("screen_name");
//来源
String source = mblog.getString("source");
//用户头像地址
String profileImageUrl = mblog.getJSONObject("user").getString("profile_image_url");
//内容
String content = null;
if (mblog.getString("text").contains("<")) {
String text = mblog.getString("text");
org.jsoup.nodes.Document parse = Jsoup.parse(text);
content = parse.text();
} else {
content = mblog.getString("text");
}
WeiBoMassage weiBoMassage = new WeiBoMassage(userId, content, userName, mid, createTime, editTime, cardType, showType,
repostCount, commentCount, attitudeCount, source, type, topic, profileImageUrl);
//默认不转发为0
weiBoMassage.setForward(0);
JSONObject weiboJson = null;
//微博实体 是否转发
if (mblog.containsKey("retweeted_status")) {
weiboJson = mblog.getJSONObject("retweeted_status");
//处理转发特有的
//weiBoMassage.set
//源mid
String rootMid = weiboJson.getString("mid");
//源来源
String rootSource = weiboJson.getString("source");
//源text
String text = weiboJson.getString("text");
//解析
org.jsoup.nodes.Document parse = Jsoup.parse(text);
String rootText = parse.text();
//源用户id
String rootId = weiboJson.getJSONObject("user").getString("id");
//源用户名
String rootName = weiboJson.getJSONObject("user").getString("screen_name");
//数据保存到对象中
weiBoMassage.setRoot_mid(rootMid);
weiBoMassage.setRoot_id(rootId);
weiBoMassage.setRoot_source(rootSource);
weiBoMassage.setRoot_text(rootText);
weiBoMassage.setRoot_name(rootName);
//转发为1
weiBoMassage.setForward(1);
} else {
weiboJson = mblog;
}
List<String> pictureUrlList = new ArrayList();
Long playCount = null;
//获取播放量和图片链接
if (weiboJson.getJSONArray("pic_ids").size() > 0) {
JSONArray jsonArray = weiboJson.getJSONArray("pics");
for (int i = 0; i < jsonArray.size(); i++) {
String picUrl = jsonArray.getJSONObject(i).getString("url");
pictureUrlList.add(picUrl);
}
} else if (weiboJson.containsKey("page_info")) {
if (weiboJson.getJSONObject("page_info").containsKey("play_count")) {
String play = weiboJson.getJSONObject("page_info").getString("play_count");
if (play.contains("万")) {
String[] split = play.split("万");
playCount = Long.valueOf(split[0]) * 10000;
} else if (play.contains("次")) {
String[] split = play.split("次");
playCount = Long.valueOf(split[0]);
}
}
}
weiBoMassage.setPlayCount(playCount);
weiBoMassage.setPictureUrlList(pictureUrlList);
return weiBoMassage;
}
// /** // /**
// * 微博更新历史数据 // * 微博更新历史数据
......
...@@ -6,12 +6,14 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList; ...@@ -6,12 +6,14 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.config.DBConfig; import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler; import com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate; import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.searchhotcrawler.util.MD5Util;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import org.apache.logging.log4j.util.Strings; import org.apache.logging.log4j.util.Strings;
import org.bson.Document; import org.bson.Document;
import java.util.*; import java.util.*;
import static java.util.Objects.isNull;
import static java.util.Objects.nonNull; import static java.util.Objects.nonNull;
/** /**
...@@ -97,7 +99,12 @@ public class HotSearchCacheDAO { ...@@ -97,7 +99,12 @@ public class HotSearchCacheDAO {
String url = document.getString("url")!=null?document.getString("url"):null; String url = document.getString("url")!=null?document.getString("url"):null;
String topicResult = document.getString("topic_result")!=null?document.getString("topic_result"):null; String topicResult = document.getString("topic_result")!=null?document.getString("topic_result"):null;
String pictureUrl = document.getString("pictureUrl")!=null?document.getString("pictureUrl"):null; String pictureUrl = document.getString("pictureUrl")!=null?document.getString("pictureUrl"):null;
String id = name + "_" + type;
String id = document.getString("_id");
if (isNull(id)){
id = name + "_" + type;
}
boolean recommend = false; boolean recommend = false;
// Integer readCount = document.getInteger("comment_count"); // Integer readCount = document.getInteger("comment_count");
if("微博热搜".equals(type)){ if("微博热搜".equals(type)){
...@@ -208,6 +215,17 @@ public class HotSearchCacheDAO { ...@@ -208,6 +215,17 @@ public class HotSearchCacheDAO {
} }
if("微博热搜".equals(type)){ if("微博热搜".equals(type)){
nowDoc = WeiboHotSearchCrawler.weiboUpdate(nowDoc); nowDoc = WeiboHotSearchCrawler.weiboUpdate(nowDoc);
//更新微博话题贡献者,关于功能
Document documentPC = WeiboHotSearchCrawler.weiboUpdatePC(nowDoc);
if (documentPC.containsKey("分类")) {
nowDoc.put("classify",documentPC.get("分类"));
}
if (documentPC.containsKey("地区")) {
nowDoc.put("region", documentPC.get("地区"));
}
if (documentPC.containsKey("标签")) {
nowDoc.put("label", documentPC.get("标签"));
}
if(nowDoc.containsKey("topicLead")){ if(nowDoc.containsKey("topicLead")){
nowDoc.put("topicLead", nowDoc.getString("topicLead")); nowDoc.put("topicLead", nowDoc.getString("topicLead"));
} }
......
...@@ -42,8 +42,17 @@ public class WeiBoUserDao { ...@@ -42,8 +42,17 @@ public class WeiBoUserDao {
document.put("userName",weiBoUser.getUserName()); document.put("userName",weiBoUser.getUserName());
document.put("topic",weiBoUser.getTopic()); document.put("topic",weiBoUser.getTopic());
document.put("time",weiBoUser.getTime()); document.put("time",weiBoUser.getTime());
document.put("followerCount",weiBoUser.getFollowerCount()); if (Objects.nonNull(weiBoUser.getType())){
document.put("profileImageUrl",weiBoUser.getProfileImageUrl()); document.put("type",weiBoUser.getType());
}
if (Objects.nonNull(weiBoUser.getFollowerCount())){
document.put("followerCount",weiBoUser.getFollowerCount());
}
if (Objects.nonNull(weiBoUser.getProfileImageUrl())){
document.put("profileImageUrl",weiBoUser.getProfileImageUrl());
}
try { try {
mongoCollection.insertOne(document); mongoCollection.insertOne(document);
} catch (Exception e) { } catch (Exception e) {
......
...@@ -9,6 +9,7 @@ import com.zhiwei.tools.tools.ZhiWeiTools; ...@@ -9,6 +9,7 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
import org.springframework.context.ApplicationContext; import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext; import org.springframework.context.support.ClassPathXmlApplicationContext;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
......
...@@ -9,12 +9,6 @@ import lombok.extern.log4j.Log4j2; ...@@ -9,12 +9,6 @@ import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response; import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
......
...@@ -4,6 +4,9 @@ package com.zhiwei.searchhotcrawler.test; ...@@ -4,6 +4,9 @@ package com.zhiwei.searchhotcrawler.test;
import com.zhiwei.crawler.core.proxy.ProxyFactory; import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig; import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.config.ProxyConfig; import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.WeiboHotSearchRun;
import java.text.ParseException; import java.text.ParseException;
public class HotSearchRunTest { public class HotSearchRunTest {
...@@ -17,5 +20,7 @@ public class HotSearchRunTest { ...@@ -17,5 +20,7 @@ public class HotSearchRunTest {
// new WeiboHotSearchRun().start(); // new WeiboHotSearchRun().start();
//快手热榜开始采集 //快手热榜开始采集
// new KuaiShouHotSearchRun().start(); // new KuaiShouHotSearchRun().start();
//百度热搜
new BaiduHotSearchRun().run();
} }
} }
package com.zhiwei.searchhotcrawler.test;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.TaoBaoUtils;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import java.io.IOException;
import java.time.Duration;
import java.util.*;
/**
 * Experimental crawler for the Taobao hot-search list.
 *
 * <p>Fetches the mtop "recommend" JSONP endpoint and converts the entries found in the
 * response into {@link HotSearchList} objects.
 *
 * @author ll
 * @date 2021-06-18 16:33:31
 */
@Log4j2
public class TaoBaoHotSearchCrawlerTest {

    private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();

    /**
     * Downloads the Taobao hot-search payload and parses it.
     *
     * @param date timestamp passed through to every {@link HotSearchList} that is created
     * @return the parsed entries, or an empty list when the body could not be fetched or
     *         does not contain the text "data"
     */
    public static List<HotSearchList> taoBaoHotSearch(Date date) {
        long nowMillis = new Date().getTime();
        String signPayload = "undefined&" + nowMillis + "&12574478&{\"appId\":\"10211\",\"params\":\"{\\\"multi_hintq_show\\\":\\\"on\\\",\\\"src\\\":\\\"c2c\\\",\\\"area\\\":\\\"active_page\\\",\\\"sversion\\\":\\\"7.5\\\",\\\"bangdan_src\\\":\\\"list\\\"}\"}";
        // NOTE(review): the signature is computed but not used yet; the request below still
        // carries a fixed "t" and "sign" pair. The dynamic variant of the URL would substitute
        // nowMillis for "t" and this value for "sign".
        String sign = TaoBaoUtils.parsJSFunction(signPayload);

        String url = "https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t=1624929605260&sign=ada01d783dc9772d2f84124d293bac26&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D";

        // Fixed session cookie that accompanies the fixed signature above.
        Map<String, String> headers = new HashMap<>();
        headers.put("cookie", "_m_h5_tk=975fb07b671f12a689d4ec36cf2e9047_1624937028814; _m_h5_tk_enc=ffb83d60b283eee5992d5e32429c2597;");
        Request request = RequestUtils.wrapGet(url, headers);

        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
            System.out.println(htmlBody);
        } catch (Exception e) {
            log.error("解析淘宝热搜时出现解析错误,页面结构有问题", e);
        }

        // Guard: nothing usable came back.
        if (htmlBody == null || !htmlBody.contains("data")) {
            log.info("解析淘宝热搜时出现解析错误,页面结构有问题");
            return Collections.emptyList();
        }
        return ansysData(htmlBody, date);
    }

    /**
     * Cuts the hot-search JSON array out of the raw response text and maps each element to a
     * {@link HotSearchList}. Elements that fail to convert are logged and skipped; a failure
     * while locating or parsing the array is logged and yields whatever was collected so far.
     *
     * @param htmlBody raw response text
     * @param date     timestamp stored on every created entry
     * @return the entries that could be parsed (possibly empty)
     */
    private static List<HotSearchList> ansysData(String htmlBody, Date date) {
        List<HotSearchList> results = new ArrayList<>();
        try {
            // Narrow to the region between the "searchdoor" and "searchdoorFrom" markers.
            int regionStart = htmlBody.indexOf("searchdoor");
            int regionEnd = htmlBody.indexOf("searchdoorFrom");
            String region = htmlBody.substring(regionStart, regionEnd);

            // The array text sits at fixed offsets relative to two further markers.
            int arrayStart = region.indexOf("showReminder") + 27;
            int arrayEnd = region.indexOf("multi_bangdan_flag") - 2;
            String arrayText = region.substring(arrayStart, arrayEnd).trim();

            JSONArray parsed = JSONObject.parseArray(arrayText);
            JSONArray entries = parsed.getJSONObject(0).getJSONObject("result").getJSONArray("text");
            for (int idx = 0; idx < entries.size(); idx++) {
                try {
                    results.add(toHotSearch(entries.getJSONObject(idx), date));
                } catch (Exception e) {
                    log.error("解析淘宝热搜时出现解析错误",e);
                }
            }
            System.out.println(entries.size());
        } catch (Exception e) {
            log.error("解析淘宝热搜时出现解析错误,数据不是json结构", e);
        }
        return results;
    }

    /**
     * Builds one {@link HotSearchList} from a single element of the "text" array.
     * The rank is read from "showmark"; the count is always 0.
     */
    private static HotSearchList toHotSearch(JSONObject entry, Date date) {
        String name = entry.getString("showtext");
        Integer rank = Integer.valueOf(entry.getString("showmark"));
        String url = "https://s.m.taobao.com/h5?q=" + entry.getString("searchtext");
        String tagText = entry.getString("tagText");
        Long count = 0L;
        return new HotSearchList(url, name, count, true, rank, HotSearchType.淘宝热搜.name(), tagText, date);
    }
}
...@@ -56,18 +56,18 @@ public class BaiduHotSearchRun extends Thread{ ...@@ -56,18 +56,18 @@ public class BaiduHotSearchRun extends Thread{
// } // }
TipsUtils.addHotList("百度热搜",baiduList); TipsUtils.addHotList("百度热搜",baiduList);
log.info("百度风云榜采集结束........"); log.info("百度风云榜采集结束........");
ZhiWeiTools.sleep(2000L); // ZhiWeiTools.sleep(2000L);
log.info("搜狗微信采集开始........"); // log.info("搜狗微信采集开始........");
List<HotSearchList> sougouList = SougoHotSearchCrawler.sougoHotSearch(new Date()); // List<HotSearchList> sougouList = SougoHotSearchCrawler.sougoHotSearch(new Date());
log.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(sougouList != null ? sougouList.size() : 0)); // log.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(sougouList != null ? sougouList.size() : 0));
TipsUtils.addHotList("搜狗微信热搜",sougouList); // TipsUtils.addHotList("搜狗微信热搜",sougouList);
log.info("搜狗微信采集结束........"); // log.info("搜狗微信采集结束........");
ZhiWeiTools.sleep(2000L); // ZhiWeiTools.sleep(2000L);
log.info("知乎话题采集开始........"); // log.info("知乎话题采集开始........");
List<HotSearchList> zhihuList = ZhihuHotSearchCrawler.getMobileZhihuHotList(new Date()); // List<HotSearchList> zhihuList = ZhihuHotSearchCrawler.getMobileZhihuHotList(new Date());
log.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(zhihuList != null ? zhihuList.size() : 0)); // log.info("{}, 知乎此轮采集到的数据量为:{}", new Date(), Integer.valueOf(zhihuList != null ? zhihuList.size() : 0));
TipsUtils.addHotList("知乎热搜",zhihuList); // TipsUtils.addHotList("知乎热搜",zhihuList);
log.info("知乎话题采集结束........"); // log.info("知乎话题采集结束........");
} }
} }
\ No newline at end of file
package com.zhiwei.searchhotcrawler.timer; package com.zhiwei.searchhotcrawler.timer;
import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.util.TipsUtils; import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
......
package com.zhiwei.searchhotcrawler.util;
import java.security.MessageDigest;
public class MD5Util {

    /** Upper-case hexadecimal digits, indexed by nibble value. */
    private static final char[] HEX_DIGITS = {
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
    };

    private MD5Util() {
        throw new IllegalStateException("Utility class");
    }

    /**
     * Computes the MD5 digest of a string and returns it as upper-case hexadecimal text.
     *
     * <p>The input is always encoded as UTF-8 before hashing, so the result for non-ASCII
     * text no longer depends on the platform's default charset.
     *
     * @param pwd the text to hash; may be {@code null}
     * @return the 32-character upper-case hex digest, or {@code null} when {@code pwd} is {@code null}
     */
    public static String getMD5(String pwd) {
        return getString(pwd);
    }

    /**
     * Hashes {@code pwd} with MD5 and hex-encodes the 16 digest bytes.
     *
     * @throws IllegalStateException if the runtime provides no MD5 implementation
     */
    private static String getString(String pwd) {
        // A null input has always produced a null result; keep that contract explicit.
        if (pwd == null) {
            return null;
        }
        final MessageDigest md5;
        try {
            md5 = MessageDigest.getInstance("MD5");
        } catch (java.security.NoSuchAlgorithmException e) {
            // MD5 is one of the algorithms every Java platform is required to provide.
            throw new IllegalStateException("MD5 MessageDigest is not available", e);
        }
        byte[] digest = md5.digest(pwd.getBytes(java.nio.charset.StandardCharsets.UTF_8));

        // Two hex characters per digest byte: high nibble first, then low nibble.
        char[] hex = new char[digest.length * 2];
        int k = 0;
        for (byte b : digest) {
            hex[k++] = HEX_DIGITS[(b >>> 4) & 0xf];
            hex[k++] = HEX_DIGITS[b & 0xf];
        }
        return new String(hex);
    }
}
package com.zhiwei.searchhotcrawler.util;
import lombok.extern.log4j.Log4j2;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import javax.script.Invocable;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import java.io.FileReader;
@Log4j2
public class TaoBaoUtils {
public static String parsJSFunction(String sign) {
String scriptResult ="";//脚本的执行结果
ScriptEngine engine = new ScriptEngineManager().getEngineByName("JavaScript");//1.得到脚本引擎
//ScriptEngine engine = new ScriptEngineManager().getEngineByName("nashorn");//1.得到脚本引擎
try {
//2.引擎读取 脚本字符串
//engine.eval(new StringReader(routeScript));
//如果js存在文件里
Resource aesJs = new ClassPathResource("taobao.js");
engine.eval(new FileReader(aesJs.getFile()));
//3.将引擎转换为Invocable,这样才可以掉用js的方法
Invocable invocable = (Invocable) engine;
//4.使用 invocable.invokeFunction掉用js脚本里的方法,第一個参数为方法名,后面的参数为被调用的js方法的入参
scriptResult = (String) invocable.invokeFunction("h", sign);
}catch(Exception e){
log.error("Error executing script: ",e.getMessage());
}
return scriptResult;
}
}
#redis.host=115.236.59.91 #redis.host=115.236.59.91
#redis.port=7382 #redis.port=7382
#redis.host=127.0.0.1
#redis.port=6379
#redis.password= #redis.password=
#新线上redis #新线上redis
#redis.host = 192.168.0.39 #redis.host = 192.168.0.39
......
...@@ -21,6 +21,6 @@ import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; ...@@ -21,6 +21,6 @@ import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
@RunWith(SpringJUnit4ClassRunner.class) @RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations = @ContextConfiguration(locations =
{ "classpath:applicationContext.xml" }) { "classpath:applicationContext.xml" })
public abstract class ObjectTest extends AbstractJUnit4SpringContextTests public abstract class ObjectTest extends AbstractJUnit4SpringContextTests {
{
} }
...@@ -14,6 +14,7 @@ import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler; ...@@ -14,6 +14,7 @@ import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate; import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.searchhotcrawler.test.KuaiShouHotSearchCrawlerTest; import com.zhiwei.searchhotcrawler.test.KuaiShouHotSearchCrawlerTest;
import com.zhiwei.searchhotcrawler.test.TaoBaoHotSearchCrawlerTest; import com.zhiwei.searchhotcrawler.test.TaoBaoHotSearchCrawlerTest;
import com.zhiwei.searchhotcrawler.util.TaoBaoUtils;
import com.zhiwei.searchhotcrawler.util.TipsUtils; import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
...@@ -103,27 +104,28 @@ public class HotSearchTest { ...@@ -103,27 +104,28 @@ public class HotSearchTest {
} }
} }
ad(document); ad(document);
System.out.println(document); System.out.println(document);
} }
private void ad(Document nowDoc) { private void ad(Document nowDoc) {
MongoCollection collection = MongoDBTemplate.getCollection(DBConfig.dbName, DBConfig.searchCacheCollName); MongoCollection collection = MongoDBTemplate.getCollection(DBConfig.dbName, DBConfig.searchCacheCollName);
if(nowDoc.containsKey("topicLead")){ if (nowDoc.containsKey("topicLead")) {
nowDoc.put("topicLead", nowDoc.getString("topicLead")); nowDoc.put("topicLead", nowDoc.getString("topicLead"));
} }
if(nowDoc.containsKey("readCount") && nowDoc.containsKey("discussCount")) { if (nowDoc.containsKey("readCount") && nowDoc.containsKey("discussCount")) {
nowDoc.put("readCount", nonNull(nowDoc.get("readCount"))?Long.valueOf(nowDoc.get("readCount").toString()):null); nowDoc.put("readCount", nonNull(nowDoc.get("readCount")) ? Long.valueOf(nowDoc.get("readCount").toString()) : null);
nowDoc.put("discussCount", nonNull(nowDoc.get("discussCount"))?Long.valueOf(nowDoc.get("discussCount").toString()):null); nowDoc.put("discussCount", nonNull(nowDoc.get("discussCount")) ? Long.valueOf(nowDoc.get("discussCount").toString()) : null);
} }
if (nowDoc.containsKey("pictureUrl")) { if (nowDoc.containsKey("pictureUrl")) {
nowDoc.put("pictureUrl",nowDoc.getString("pictureUrl")); nowDoc.put("pictureUrl", nowDoc.getString("pictureUrl"));
} }
if (nowDoc.containsKey("downtext")) { if (nowDoc.containsKey("downtext")) {
nowDoc.put("downtext",nowDoc.getString("downtext")); nowDoc.put("downtext", nowDoc.getString("downtext"));
} }
collection.insertOne(nowDoc); collection.insertOne(nowDoc);
} }
/** /**
* 测试淘宝热搜采集 * 测试淘宝热搜采集
*/ */
...@@ -154,9 +156,20 @@ public class HotSearchTest { ...@@ -154,9 +156,20 @@ public class HotSearchTest {
List<HotSearchList> hotSearchLists = BaiDuHotSearchCrawler.baiduHotSearch(new Date()); List<HotSearchList> hotSearchLists = BaiDuHotSearchCrawler.baiduHotSearch(new Date());
System.out.println(hotSearchLists); System.out.println(hotSearchLists);
System.out.println(hotSearchLists.size()); System.out.println(hotSearchLists.size());
} }
/**
* 测试解析淘宝js文件
*/
@Test
public void taoBaoJSTest() throws IOException {
long time = new Date().getTime();
String signs="undefined&1625624820156&12574478&{\"appId\":\"10211\",\"params\":\"{\\\"multi_hintq_show\\\":\\\"on\\\",\\\"src\\\":\\\"c2c\\\",\\\"area\\\":\\\"active_page\\\",\\\"sversion\\\":\\\"7.5\\\",\\\"bangdan_src\\\":\\\"list\\\"}\"}";
// https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t=1624930984092&sign=acf994dbcee6c0c1d7a8a566a6b8ff0a&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D
String s = TaoBaoUtils.parsJSFunction(signs);
System.out.println(s);
}
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment