Commit 34004178 by 马黎滨

Merge branch 'mlbWork' into 'master'

Mlb work

See merge request !3
parents 355bdd18 65b7efe4
...@@ -100,6 +100,10 @@ public class BaiDuHotSearchCrawler { ...@@ -100,6 +100,10 @@ public class BaiDuHotSearchCrawler {
count = Integer.valueOf(hot); count = Integer.valueOf(hot);
} }
if (Objects.nonNull(rank)) { if (Objects.nonNull(rank)) {
if(count == 0){
log.info(hot);
log.info(element);
}
HotSearchList hotSearch = new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name()); HotSearchList hotSearch = new HotSearchList(everurl, kw, count, rank, HotSearchType.百度热搜.name());
list.add(hotSearch); list.add(hotSearch);
} }
......
...@@ -129,14 +129,14 @@ public class WeiboTopicCrawler { ...@@ -129,14 +129,14 @@ public class WeiboTopicCrawler {
*/ */
public static List<HotSearchList> startCrawlerByPhone(){ public static List<HotSearchList> startCrawlerByPhone(){
List<HotSearchList> topicList = new ArrayList<>(); List<HotSearchList> topicList = new ArrayList<>();
for(int page=1; page<=7; page++){ for(int page=1; page<=6; page++){
String pageUrl = "https://api.weibo.cn/2/page?gsid=_2A25zJX_EDeRxGedH71YS8CzKzzmIHXVuc_QMrDV6PUJbkdANLXPbkWpNUK3OyitGCJsX8exvua-vfubUqCiaA4lb&from=10A1193010&c=iphone&s=2827eebe&count=20&containerid=106003type%253D25%2526t%253D3%2526disable_hot%253D1%2526filter_type%253Dtopicscene&page=" + page; String pageUrl = "https://api.weibo.cn/2/page?gsid=_2A25zJX_EDeRxGedH71YS8CzKzzmIHXVuc_QMrDV6PUJbkdANLXPbkWpNUK3OyitGCJsX8exvua-vfubUqCiaA4lb&from=10A1193010&c=iphone&s=2827eebe&count=20&containerid=106003type%253D25%2526t%253D3%2526disable_hot%253D1%2526filter_type%253Dtopicscene&page=" + page;
//重试三次 //重试三次
for(int retryTimes = 1; retryTimes<=5; retryTimes++) { for(int retryTimes = 1; retryTimes<=5; retryTimes++) {
try { try {
// log.info("pageUrl::{}", pageUrl); // log.info("pageUrl::{}", pageUrl);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl), ProxyHolder.NAT_HEAVY_PROXY).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("top_mark_text")) { if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
topicList.addAll(parseTopicHtml(htmlBody)); topicList.addAll(parseTopicHtml(htmlBody));
break; break;
}else { }else {
...@@ -170,8 +170,11 @@ public class WeiboTopicCrawler { ...@@ -170,8 +170,11 @@ public class WeiboTopicCrawler {
rank = cardGroup.getInteger("top_mark_text"); rank = cardGroup.getInteger("top_mark_text");
topicName = cardGroup.getString("title_sub"); topicName = cardGroup.getString("title_sub");
url = "https://s.weibo.com/weibo?q="+ URLCodeUtil.getURLEncode(topicName, "utf-8"); url = "https://s.weibo.com/weibo?q="+ URLCodeUtil.getURLEncode(topicName, "utf-8");
description = cardGroup.getString("desc1"); description = null;
desc2 = cardGroup.getString("desc2"); if(cardGroup.containsKey("card_expand")){
description = cardGroup.getJSONObject("card_expand").getString("content");
}
desc2 = cardGroup.getString("desc");
String commentNumStr = desc2.replaceAll("讨论.*", "").trim(); String commentNumStr = desc2.replaceAll("讨论.*", "").trim();
String readNumStr = desc2.replaceAll(".*讨论|阅读", "").trim(); String readNumStr = desc2.replaceAll(".*讨论|阅读", "").trim();
try { try {
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment