Commit 2cff3725 by leiliangliang

更新微博PC端采集程序

parent 982502f7
...@@ -6,12 +6,9 @@ import java.net.URLEncoder; ...@@ -6,12 +6,9 @@ import java.net.URLEncoder;
import java.text.ParseException; import java.text.ParseException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.*; import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSON;
import com.mongodb.client.result.UpdateResult;
import com.zhiwei.searchhotcrawler.bean.*; import com.zhiwei.searchhotcrawler.bean.*;
import com.zhiwei.searchhotcrawler.config.RedisConfig; import com.zhiwei.searchhotcrawler.config.RedisConfig;
import com.zhiwei.searchhotcrawler.dao.RedisDao; import com.zhiwei.searchhotcrawler.dao.RedisDao;
...@@ -22,19 +19,10 @@ import lombok.extern.log4j.Log4j2; ...@@ -22,19 +19,10 @@ import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response; import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.bson.Document; import org.bson.Document;
import org.checkerframework.checker.units.qual.C;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
...@@ -43,86 +31,130 @@ import com.zhiwei.crawler.core.proxy.ProxyHolder; ...@@ -43,86 +31,130 @@ import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.mail.SendMailWeibo; import com.zhiwei.searchhotcrawler.mail.SendMailWeibo;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
import org.springframework.beans.factory.annotation.Autowired;
import static java.util.Objects.isNull;
import static java.util.Objects.nonNull; import static java.util.Objects.nonNull;
/** /**
* @author hero * @author hero
* @author hero
* @ClassName: WeiboHotSearch * @ClassName: WeiboHotSearch
* @Description: 微博实时热搜采集 * @Description: 微博实时热搜采集
* @author hero
* @date 2017年9月15日 上午10:54:31 * @date 2017年9月15日 上午10:54:31
*/ */
@Log4j2 @Log4j2
public class WeiboHotSearchCrawler { public class WeiboHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static RedisDao redisDao = new RedisDao(); private static RedisDao redisDao = new RedisDao();
static WeiBoUserDao weiBoUserDao = new WeiBoUserDao(); static WeiBoUserDao weiBoUserDao = new WeiBoUserDao();
static WeiBoMassageDao weiBoMassageDao = new WeiBoMassageDao(); static WeiBoMassageDao weiBoMassageDao = new WeiBoMassageDao();
/**
* @Title: weiboHotSearchTest // /**
* @author hero // * @return void 返回类型
* @Description: TODO(PC端微博热搜采集) // * @Title: weiboHotSearchTest
* @return void 返回类型 // * @author hero
*/ // * @Description: TODO(PC端微博热搜采集)
// public static List<HotSearchList> weiboHotSearch(){ // */
// String url = "https://s.weibo.com/top/summary?cate=realtimehot"; // public static List<HotSearchList> weiboHotSearch() {
// // String url = "https://s.weibo.com/top/summary?cate=realtimehot";
// List<HotSearchList> list = new ArrayList<HotSearchList>(); // Map<String, String> headerMap = new HashMap<>();
// for(int i =0; i<3; i++){ // headerMap.put("Cookie", "SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN");
// String htmlBody = null; // List<HotSearchList> list = new ArrayList<HotSearchList>();
// Request request = RequestUtils.wrapGet(url); // for (int i = 0; i < 3; i++) {
// try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) { // String htmlBody = null;
// htmlBody = response.body().string(); // Request request = RequestUtils.wrapGet(url, headerMap);
// } catch (Exception e) { // try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
// if(i==2){ // htmlBody = response.body().string();
// return list; // } catch (Exception e) {
// }else{ // if (i == 2) {
// continue; // return list;
// } // } else {
// } // continue;
// if (htmlBody != null && htmlBody.contains("pl_top_realtimehot")) { // }
// try { // }
// if (htmlBody != null && htmlBody.contains("pl_top_realtimehot")) {
// try {
//// String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0]; //// String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0];
//// script = script.replace("(", "").replace(")", ""); //// script = script.replace("(", "").replace(")", "");
//// JSONObject json = JSONObject.parseObject(script); //// JSONObject json = JSONObject.parseObject(script);
//// String html = json.getString("html"); //// String html =
// Document document = Jsoup.parse(htmlBody); // Date date = new Date();
// Elements elements = document.select("div#pl_top_realtimehot").select("tbody").select("tr"); // org.jsoup.nodes.Document document = Jsoup.parse(htmlBody);
// for (Element element : elements) { // Elements elements = document.select("div#pl_top_realtimehot").select("tbody").select("tr");
// try { // for (Element element : elements) {
// String id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href"); // try {
// String name = element.select("td.td-02").select("a").text(); // String id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href");
// String num = !element.select("td.td-02").select("span").text().equals("") ? element.select("td.td-02").select("span").text() : "0"; // String name = element.select("td.td-02").select("a").text();
// String rank = !element.select("td[class=\"td-01 ranktop\"]").text().equals("") ? element.select("td[class=\"td-01 ranktop\"]").text() : "-1"; // //String num = !element.select("td.td-02").select("span").text().equals("") ? element.select("td.td-02").select("span").text() : "0";
// // String num = element.select("td.td-02").select("span").text();
// int hotCount = Integer.valueOf(num); // //String rank = !element.select("td[class=\"td-01 ranktop\"]").text().equals("") ? element.select("td[class=\"td-01 ranktop\"]").text() : "-1";
// int rankCount = Integer.valueOf(rank); // //获取排名
// HotSearchList hotSearch = new HotSearchList(id, name, hotCount, true, rankCount, HotSearchType.微博热搜.name(), null); // String rank = element.select("td.td-01").text();
// list.add(hotSearch); // Integer rankCount = null;
// } catch (Exception e) { // //默认推荐位排名为0 置顶为-1
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com"); // if ("•".equals(rank)) {
// log.error("解析微博时时热搜时出现解析错误", e); // rankCount = 0;
// continue; // id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href_to");
// } // } else if (StringUtils.isEmpty(rank)) {
// } // rankCount = -1;
// } catch (Exception e) { // } else {
// log.error("解析微博时时热搜时出现解析错误,数据不是json结构", e.fillInStackTrace()); // rankCount = Integer.valueOf(rank);
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com"); // }
// return null; // //获取icon
// } // String text = element.select("td.td-03").text();
// } else { // String icon = null;
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com"); // if (StringUtils.isNotEmpty(text) && nonNull(text)) {
// log.info("解析微博时时热搜时出现解析错误,页面结构有问题"); // if ("商".equals(text)) {
// } // icon = "jian";
// break; // } else if ("新".equals(text)) {
// } // icon = "new";
// return list; // } else if ("热".equals(text)) {
// } // icon = "hot";
// } else if ("沸".equals(text)) {
// icon = "fei";
// } else if ("爆".equals(text)) {
// icon = "boom";
// }
// }
// //获取热度标签
// String heatLabel = null;
// //获取热度值 置顶 推荐位 默认值为0
// Long hotCount =0L;
// if (StringUtils.isNotEmpty(num) && Objects.nonNull(num)) {
// String[] split = num.split(" ");
// if (split.length > 1) {
// heatLabel = split[0].trim();
// hotCount = Long.valueOf(split[1].trim());
// }else {
// hotCount = Long.valueOf(num);
// }
// }
// // Long hotCount = Long.valueOf(num);
// HotSearchList hotSearch = new HotSearchList(id, name, hotCount, true, rankCount, HotSearchType.微博热搜.name(), icon, date);
// hotSearch.setHeatLabel(heatLabel);
// list.add(hotSearch);
// } catch (Exception e) {
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
// log.error("解析微博时时热搜时出现解析错误", e);
// continue;
// }
// }
// } catch (Exception e) {
// log.error("解析微博时时热搜时出现解析错误,数据不是json结构", e.fillInStackTrace());
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
// return null;
// }
// } else {
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
// log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
// }
// break;
// }
// return list;
// }
/** /**
...@@ -263,20 +295,20 @@ public class WeiboHotSearchCrawler { ...@@ -263,20 +295,20 @@ public class WeiboHotSearchCrawler {
JSONObject cardInfo = cardGroup.getJSONObject(j); JSONObject cardInfo = cardGroup.getJSONObject(j);
String name = cardInfo.getString("desc"); String name = cardInfo.getString("desc");
String desc_extr = cardInfo.getString("desc_extr"); String desc_extr = cardInfo.getString("desc_extr");
String heatLabel=null; String heatLabel = null;
Long hotCount =null; Long hotCount = null;
if (Objects.nonNull(desc_extr)){ if (Objects.nonNull(desc_extr)) {
String[] split = desc_extr.split(" "); String[] split = desc_extr.split(" ");
if (split.length>1){ if (split.length > 1) {
heatLabel= split[0].trim(); heatLabel = split[0].trim();
hotCount= Long.valueOf(split[1].trim()); hotCount = Long.valueOf(split[1].trim());
}else { } else {
hotCount = cardInfo.getLongValue("desc_extr"); hotCount = cardInfo.getLongValue("desc_extr");
} }
} }
String iconUrl = cardInfo.getString("icon"); String iconUrl = cardInfo.getString("icon");
String icon=null; String icon = null;
if (StringUtils.isNotBlank(iconUrl)) { if (StringUtils.isNotBlank(iconUrl)) {
icon = iconUrl.split("_")[1].split(".png")[0]; icon = iconUrl.split("_")[1].split(".png")[0];
} }
...@@ -284,7 +316,9 @@ public class WeiboHotSearchCrawler { ...@@ -284,7 +316,9 @@ public class WeiboHotSearchCrawler {
String id = cardInfo.getString("scheme"); String id = cardInfo.getString("scheme");
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date); HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
hotSearch.setHeatLabel(heatLabel); hotSearch.setHeatLabel(heatLabel);
if (Objects.nonNull(iconUrl)){hotSearch.setIconUrl(iconUrl);} if (Objects.nonNull(iconUrl)) {
hotSearch.setIconUrl(iconUrl);
}
result.add(hotSearch); result.add(hotSearch);
rank++; rank++;
redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS, name + "_微博热搜"); redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS, name + "_微博热搜");
...@@ -371,7 +405,7 @@ public class WeiboHotSearchCrawler { ...@@ -371,7 +405,7 @@ public class WeiboHotSearchCrawler {
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {
log.error("更新导语时字符解析成URl模式异常", e); log.error("更新导语时字符解析成URl模式异常", e);
} }
String url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type"+encode; String url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type" + encode;
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
for (int count = 0; count <= 5; count++) { for (int count = 0; count <= 5; count++) {
...@@ -389,7 +423,7 @@ public class WeiboHotSearchCrawler { ...@@ -389,7 +423,7 @@ public class WeiboHotSearchCrawler {
document.put("topicLead", topicLead); document.put("topicLead", topicLead);
} }
} }
if (json.containsKey("cardlist_head_cards")&&!json.getJSONArray("cardlist_head_cards").isEmpty()) { if (json.containsKey("cardlist_head_cards") && !json.getJSONArray("cardlist_head_cards").isEmpty()) {
JSONObject readJson = json.getJSONArray("cardlist_head_cards").getJSONObject(0); JSONObject readJson = json.getJSONArray("cardlist_head_cards").getJSONObject(0);
if (readJson.containsKey("head_data")) { if (readJson.containsKey("head_data")) {
String midText = readJson.getJSONObject("head_data").getString("midtext"); String midText = readJson.getJSONObject("head_data").getString("midtext");
...@@ -465,9 +499,11 @@ public class WeiboHotSearchCrawler { ...@@ -465,9 +499,11 @@ public class WeiboHotSearchCrawler {
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {
log.error("字符解析成URl模式异常", e); log.error("字符解析成URl模式异常", e);
} }
String url = "https://s.weibo.com/weibo?q="+encode+"&Refer=top"; String url = "https://s.weibo.com/weibo?q=" + encode + "&Refer=top";
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Map<String, String> headerMap = new HashMap<>();
headerMap.put("Cookie", "SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN");
Request request = RequestUtils.wrapGet(url,headerMap);
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string(); htmlBody = response.body().string();
} catch (IOException e) { } catch (IOException e) {
...@@ -634,13 +670,13 @@ public class WeiboHotSearchCrawler { ...@@ -634,13 +670,13 @@ public class WeiboHotSearchCrawler {
Long followerCount = null; Long followerCount = null;
if (followers_count.contains("万")) { if (followers_count.contains("万")) {
String[] split = followers_count.split("万"); String[] split = followers_count.split("万");
Double aDouble = Double.valueOf(split[0])*10000; Double aDouble = Double.valueOf(split[0]) * 10000;
followerCount = new Double(aDouble).longValue(); followerCount = new Double(aDouble).longValue();
} else if (followers_count.contains("亿")){ } else if (followers_count.contains("亿")) {
String[] split = followers_count.split("亿"); String[] split = followers_count.split("亿");
Double aDouble = Double.valueOf(split[0])*100000000; Double aDouble = Double.valueOf(split[0]) * 100000000;
followerCount = new Double(aDouble).longValue(); followerCount = new Double(aDouble).longValue();
}else { } else {
followerCount = Long.valueOf(followers_count); followerCount = Long.valueOf(followers_count);
} }
//用户头像地址 //用户头像地址
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment