Commit 2cff3725 by leiliangliang

更新微博PC端采集程序

parent 982502f7
......@@ -6,12 +6,9 @@ import java.net.URLEncoder;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.alibaba.fastjson.JSON;
import com.mongodb.client.result.UpdateResult;
import com.zhiwei.searchhotcrawler.bean.*;
import com.zhiwei.searchhotcrawler.config.RedisConfig;
import com.zhiwei.searchhotcrawler.dao.RedisDao;
......@@ -22,19 +19,10 @@ import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.bson.Document;
import org.checkerframework.checker.units.qual.C;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
......@@ -43,15 +31,15 @@ import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.mail.SendMailWeibo;
import com.zhiwei.tools.tools.URLCodeUtil;
import org.springframework.beans.factory.annotation.Autowired;
import static java.util.Objects.isNull;
import static java.util.Objects.nonNull;
/**
* @author hero
* @author hero
* @ClassName: WeiboHotSearchCrawler
* @Description: 微博实时热搜采集
* @author hero
* @date 2017年9月15日 上午10:54:31
*/
@Log4j2
......@@ -63,25 +51,27 @@ public class WeiboHotSearchCrawler {
static WeiBoUserDao weiBoUserDao = new WeiBoUserDao();
static WeiBoMassageDao weiBoMassageDao = new WeiBoMassageDao();
/**
* @Title: weiboHotSearchTest
* @author hero
* @Description: TODO(PC端微博热搜采集)
* @return void 返回类型
*/
// public static List<HotSearchList> weiboHotSearch(){
// /**
// * @return void 返回类型
// * @Title: weiboHotSearchTest
// * @author hero
// * @Description: TODO(PC端微博热搜采集)
// */
// public static List<HotSearchList> weiboHotSearch() {
// String url = "https://s.weibo.com/top/summary?cate=realtimehot";
//
// Map<String, String> headerMap = new HashMap<>();
// headerMap.put("Cookie", "SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN");
// List<HotSearchList> list = new ArrayList<HotSearchList>();
// for(int i =0; i<3; i++){
// for (int i = 0; i < 3; i++) {
// String htmlBody = null;
// Request request = RequestUtils.wrapGet(url);
// try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
// Request request = RequestUtils.wrapGet(url, headerMap);
// try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
// htmlBody = response.body().string();
// } catch (Exception e) {
// if(i==2){
// if (i == 2) {
// return list;
// }else{
// } else {
// continue;
// }
// }
......@@ -90,19 +80,61 @@ public class WeiboHotSearchCrawler {
//// String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0];
//// script = script.replace("(", "").replace(")", "");
//// JSONObject json = JSONObject.parseObject(script);
//// String html = json.getString("html");
// Document document = Jsoup.parse(htmlBody);
//// String html =
// Date date = new Date();
// org.jsoup.nodes.Document document = Jsoup.parse(htmlBody);
// Elements elements = document.select("div#pl_top_realtimehot").select("tbody").select("tr");
// for (Element element : elements) {
// try {
// String id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href");
// String name = element.select("td.td-02").select("a").text();
// String num = !element.select("td.td-02").select("span").text().equals("") ? element.select("td.td-02").select("span").text() : "0";
// String rank = !element.select("td[class=\"td-01 ranktop\"]").text().equals("") ? element.select("td[class=\"td-01 ranktop\"]").text() : "-1";
//
// int hotCount = Integer.valueOf(num);
// int rankCount = Integer.valueOf(rank);
// HotSearchList hotSearch = new HotSearchList(id, name, hotCount, true, rankCount, HotSearchType.微博热搜.name(), null);
// //String num = !element.select("td.td-02").select("span").text().equals("") ? element.select("td.td-02").select("span").text() : "0";
// String num = element.select("td.td-02").select("span").text();
// //String rank = !element.select("td[class=\"td-01 ranktop\"]").text().equals("") ? element.select("td[class=\"td-01 ranktop\"]").text() : "-1";
// //获取排名
// String rank = element.select("td.td-01").text();
// Integer rankCount = null;
// //默认推荐位排名为0 置顶为-1
// if ("•".equals(rank)) {
// rankCount = 0;
// id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href_to");
// } else if (StringUtils.isEmpty(rank)) {
// rankCount = -1;
// } else {
// rankCount = Integer.valueOf(rank);
// }
// //获取icon
// String text = element.select("td.td-03").text();
// String icon = null;
// if (StringUtils.isNotEmpty(text) && nonNull(text)) {
// if ("商".equals(text)) {
// icon = "jian";
// } else if ("新".equals(text)) {
// icon = "new";
// } else if ("热".equals(text)) {
// icon = "hot";
// } else if ("沸".equals(text)) {
// icon = "fei";
// } else if ("爆".equals(text)) {
// icon = "boom";
// }
// }
// //获取热度标签
// String heatLabel = null;
// //获取热度值 置顶 推荐位 默认值为0
// Long hotCount =0L;
// if (StringUtils.isNotEmpty(num) && Objects.nonNull(num)) {
// String[] split = num.split(" ");
// if (split.length > 1) {
// heatLabel = split[0].trim();
// hotCount = Long.valueOf(split[1].trim());
// }else {
// hotCount = Long.valueOf(num);
// }
// }
// // Long hotCount = Long.valueOf(num);
// HotSearchList hotSearch = new HotSearchList(id, name, hotCount, true, rankCount, HotSearchType.微博热搜.name(), icon, date);
// hotSearch.setHeatLabel(heatLabel);
// list.add(hotSearch);
// } catch (Exception e) {
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
......@@ -263,20 +295,20 @@ public class WeiboHotSearchCrawler {
JSONObject cardInfo = cardGroup.getJSONObject(j);
String name = cardInfo.getString("desc");
String desc_extr = cardInfo.getString("desc_extr");
String heatLabel=null;
Long hotCount =null;
if (Objects.nonNull(desc_extr)){
String heatLabel = null;
Long hotCount = null;
if (Objects.nonNull(desc_extr)) {
String[] split = desc_extr.split(" ");
if (split.length>1){
heatLabel= split[0].trim();
hotCount= Long.valueOf(split[1].trim());
if (split.length > 1) {
heatLabel = split[0].trim();
hotCount = Long.valueOf(split[1].trim());
}else {
} else {
hotCount = cardInfo.getLongValue("desc_extr");
}
}
String iconUrl = cardInfo.getString("icon");
String icon=null;
String icon = null;
if (StringUtils.isNotBlank(iconUrl)) {
icon = iconUrl.split("_")[1].split(".png")[0];
}
......@@ -284,7 +316,9 @@ public class WeiboHotSearchCrawler {
String id = cardInfo.getString("scheme");
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
hotSearch.setHeatLabel(heatLabel);
if (Objects.nonNull(iconUrl)){hotSearch.setIconUrl(iconUrl);}
if (Objects.nonNull(iconUrl)) {
hotSearch.setIconUrl(iconUrl);
}
result.add(hotSearch);
rank++;
redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS, name + "_微博热搜");
......@@ -371,7 +405,7 @@ public class WeiboHotSearchCrawler {
} catch (UnsupportedEncodingException e) {
log.error("更新导语时字符解析成URl模式异常", e);
}
String url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type"+encode;
String url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type" + encode;
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
for (int count = 0; count <= 5; count++) {
......@@ -389,7 +423,7 @@ public class WeiboHotSearchCrawler {
document.put("topicLead", topicLead);
}
}
if (json.containsKey("cardlist_head_cards")&&!json.getJSONArray("cardlist_head_cards").isEmpty()) {
if (json.containsKey("cardlist_head_cards") && !json.getJSONArray("cardlist_head_cards").isEmpty()) {
JSONObject readJson = json.getJSONArray("cardlist_head_cards").getJSONObject(0);
if (readJson.containsKey("head_data")) {
String midText = readJson.getJSONObject("head_data").getString("midtext");
......@@ -465,9 +499,11 @@ public class WeiboHotSearchCrawler {
} catch (UnsupportedEncodingException e) {
log.error("字符解析成URl模式异常", e);
}
String url = "https://s.weibo.com/weibo?q="+encode+"&Refer=top";
String url = "https://s.weibo.com/weibo?q=" + encode + "&Refer=top";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
Map<String, String> headerMap = new HashMap<>();
headerMap.put("Cookie", "SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN");
Request request = RequestUtils.wrapGet(url,headerMap);
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
......@@ -634,13 +670,13 @@ public class WeiboHotSearchCrawler {
Long followerCount = null;
if (followers_count.contains("万")) {
String[] split = followers_count.split("万");
Double aDouble = Double.valueOf(split[0])*10000;
Double aDouble = Double.valueOf(split[0]) * 10000;
followerCount = new Double(aDouble).longValue();
} else if (followers_count.contains("亿")){
} else if (followers_count.contains("亿")) {
String[] split = followers_count.split("亿");
Double aDouble = Double.valueOf(split[0])*100000000;
Double aDouble = Double.valueOf(split[0]) * 100000000;
followerCount = new Double(aDouble).longValue();
}else {
} else {
followerCount = Long.valueOf(followers_count);
}
//用户头像地址
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment