Commit 172e5b3c by chenweitao

Merge branch 'working' into 'master'

Working

See merge request !135
parents 139ff5af 8ec17aa9
......@@ -6,12 +6,9 @@ import java.net.URLEncoder;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.alibaba.fastjson.JSON;
import com.mongodb.client.result.UpdateResult;
import com.zhiwei.searchhotcrawler.bean.*;
import com.zhiwei.searchhotcrawler.config.RedisConfig;
import com.zhiwei.searchhotcrawler.dao.RedisDao;
......@@ -22,19 +19,10 @@ import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.bson.Document;
import org.checkerframework.checker.units.qual.C;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
......@@ -43,86 +31,126 @@ import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.mail.SendMailWeibo;
import com.zhiwei.tools.tools.URLCodeUtil;
import org.springframework.beans.factory.annotation.Autowired;
import static java.util.Objects.isNull;
import static java.util.Objects.nonNull;
/**
* @author hero
* @author hero
* @ClassName: WeiboHotSearch
* @Description: 微博实时热搜采集
* @author hero
* @date 2017年9月15日 上午10:54:31
*/
@Log4j2
public class WeiboHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static RedisDao redisDao = new RedisDao();
private static RedisDao redisDao = new RedisDao();
static WeiBoUserDao weiBoUserDao = new WeiBoUserDao();
static WeiBoMassageDao weiBoMassageDao = new WeiBoMassageDao();
/**
* @Title: weiboHotSearchTest
* @author hero
* @Description: TODO(PC端微博热搜采集)
* @return void 返回类型
*/
// public static List<HotSearchList> weiboHotSearch(){
// String url = "https://s.weibo.com/top/summary?cate=realtimehot";
//
// List<HotSearchList> list = new ArrayList<HotSearchList>();
// for(int i =0; i<3; i++){
// String htmlBody = null;
// Request request = RequestUtils.wrapGet(url);
// try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
// htmlBody = response.body().string();
// } catch (Exception e) {
// if(i==2){
// return list;
// }else{
// continue;
// }
// }
// if (htmlBody != null && htmlBody.contains("pl_top_realtimehot")) {
// try {
//// String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0];
//// script = script.replace("(", "").replace(")", "");
//// JSONObject json = JSONObject.parseObject(script);
//// String html = json.getString("html");
// Document document = Jsoup.parse(htmlBody);
// Elements elements = document.select("div#pl_top_realtimehot").select("tbody").select("tr");
// for (Element element : elements) {
// try {
// String id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href");
// String name = element.select("td.td-02").select("a").text();
// String num = !element.select("td.td-02").select("span").text().equals("") ? element.select("td.td-02").select("span").text() : "0";
// String rank = !element.select("td[class=\"td-01 ranktop\"]").text().equals("") ? element.select("td[class=\"td-01 ranktop\"]").text() : "-1";
//
// int hotCount = Integer.valueOf(num);
// int rankCount = Integer.valueOf(rank);
// HotSearchList hotSearch = new HotSearchList(id, name, hotCount, true, rankCount, HotSearchType.微博热搜.name(), null);
// list.add(hotSearch);
// } catch (Exception e) {
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
// log.error("解析微博时时热搜时出现解析错误", e);
// continue;
// }
// }
// } catch (Exception e) {
// log.error("解析微博时时热搜时出现解析错误,数据不是json结构", e.fillInStackTrace());
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
// return null;
// }
// } else {
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
// log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
// }
// break;
// }
// return list;
// }
static WeiBoUserDao weiBoUserDao = new WeiBoUserDao();
static WeiBoMassageDao weiBoMassageDao = new WeiBoMassageDao();
/**
 * Crawls the PC-side Weibo real-time hot-search page and parses its ranking table
 * into {@link HotSearchList} entries.
 *
 * <p>The page is requested at most three times; only a failed request is retried.
 * Once a response body has been obtained it is parsed exactly once. Rows that fail
 * to parse are skipped, and a single alert mail is sent for the whole crawl instead
 * of one mail per broken row.
 *
 * @return the parsed entries in page order; an empty list when every request failed
 *         or the page did not contain the expected {@code pl_top_realtimehot}
 *         container; {@code null} when parsing the page as a whole threw (kept for
 *         backward compatibility with existing callers)
 * @author hero
 */
public static List<HotSearchList> weiboHotSearch() {
    String url = "https://s.weibo.com/top/summary?cate=realtimehot";
    Map<String, String> headerMap = new HashMap<>();
    // NOTE(review): hard-coded cookie, duplicated elsewhere in this class; presumably it
    // expires at some point — consider moving it to configuration.
    headerMap.put("Cookie", "SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN");
    List<HotSearchList> list = new ArrayList<>();
    for (int i = 0; i < 3; i++) {
        String htmlBody;
        Request request = RequestUtils.wrapGet(url, headerMap);
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (Exception e) {
            // Previously swallowed silently; log it so repeated request failures are visible.
            // After the last attempt the loop simply ends and the (empty) list is returned.
            log.warn("请求微博实时热搜页面失败,第{}次尝试", i + 1, e);
            continue;
        }
        if (htmlBody == null || !htmlBody.contains("pl_top_realtimehot")) {
            SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
            log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
            break;
        }
        try {
            // One timestamp shared by every entry of this crawl.
            Date date = new Date();
            Elements rows = Jsoup.parse(htmlBody)
                    .select("div#pl_top_realtimehot").select("tbody").select("tr");
            // Guards against flooding the mailbox when every row fails the same way.
            boolean rowAlertSent = false;
            for (Element row : rows) {
                try {
                    Elements titleCell = row.select("td.td-02");
                    Elements link = titleCell.select("a");
                    // Link of the entry, used as its id.
                    String id = "http://s.weibo.com" + link.attr("href");
                    // Title.
                    String name = link.text();
                    // Raw heat text: either "<number>" or "<label> <number>".
                    String num = titleCell.select("span").text();
                    // Rank column text.
                    String rank = row.select("td.td-01").text();

                    // Recommended slots ("•") get rank 0, the pinned row (empty rank) gets -1.
                    Integer rankCount;
                    if ("•".equals(rank)) {
                        rankCount = 0;
                        // Recommended rows carry their target in "href_to" instead of "href".
                        id = "http://s.weibo.com" + link.attr("href_to");
                    } else if (StringUtils.isEmpty(rank)) {
                        rankCount = -1;
                    } else {
                        rankCount = Integer.valueOf(rank);
                    }

                    // Map the badge character in the third column to an icon code.
                    String text = row.select("td.td-03").text();
                    String icon = null;
                    if (StringUtils.isNotEmpty(text)) {
                        switch (text) {
                            case "商":
                                icon = "jian";
                                break;
                            case "新":
                                icon = "new";
                                break;
                            case "热":
                                icon = "hot";
                                break;
                            case "沸":
                                icon = "fei";
                                break;
                            case "爆":
                                icon = "boom";
                                break;
                            default:
                                break;
                        }
                    }

                    // Heat label and heat value; pinned and recommended rows default to 0.
                    String heatLabel = null;
                    Long hotCount = 0L;
                    if (StringUtils.isNotEmpty(num)) {
                        String[] split = num.split(" ");
                        if (split.length > 1) {
                            heatLabel = split[0].trim();
                            hotCount = Long.valueOf(split[1].trim());
                        } else {
                            hotCount = Long.valueOf(num);
                        }
                    }

                    HotSearchList hotSearch = new HotSearchList(id, name, hotCount, true, rankCount, HotSearchType.微博热搜.name(), icon, date);
                    hotSearch.setHeatLabel(heatLabel);
                    list.add(hotSearch);
                } catch (Exception e) {
                    // Skip the broken row but keep the rest of the table.
                    log.error("解析微博时时热搜时出现解析错误", e);
                    if (!rowAlertSent) {
                        SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
                        rowAlertSent = true;
                    }
                }
            }
        } catch (Exception e) {
            // Pass the exception itself: fillInStackTrace() would replace the original
            // stack trace with this catch site and hide where the failure happened.
            log.error("解析微博时时热搜时出现解析错误,数据不是json结构", e);
            SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
            return null;
        }
        break;
    }
    return list;
}
/**
......@@ -263,20 +291,20 @@ public class WeiboHotSearchCrawler {
JSONObject cardInfo = cardGroup.getJSONObject(j);
String name = cardInfo.getString("desc");
String desc_extr = cardInfo.getString("desc_extr");
String heatLabel=null;
Long hotCount =null;
if (Objects.nonNull(desc_extr)){
String heatLabel = null;
Long hotCount = null;
if (Objects.nonNull(desc_extr)) {
String[] split = desc_extr.split(" ");
if (split.length>1){
heatLabel= split[0].trim();
hotCount= Long.valueOf(split[1].trim());
if (split.length > 1) {
heatLabel = split[0].trim();
hotCount = Long.valueOf(split[1].trim());
}else {
} else {
hotCount = cardInfo.getLongValue("desc_extr");
}
}
String iconUrl = cardInfo.getString("icon");
String icon=null;
String icon = null;
if (StringUtils.isNotBlank(iconUrl)) {
icon = iconUrl.split("_")[1].split(".png")[0];
}
......@@ -284,7 +312,9 @@ public class WeiboHotSearchCrawler {
String id = cardInfo.getString("scheme");
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
hotSearch.setHeatLabel(heatLabel);
if (Objects.nonNull(iconUrl)){hotSearch.setIconUrl(iconUrl);}
if (Objects.nonNull(iconUrl)) {
hotSearch.setIconUrl(iconUrl);
}
result.add(hotSearch);
rank++;
redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS, name + "_微博热搜");
......@@ -371,7 +401,7 @@ public class WeiboHotSearchCrawler {
} catch (UnsupportedEncodingException e) {
log.error("更新导语时字符解析成URl模式异常", e);
}
String url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type"+encode;
String url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type" + encode;
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
for (int count = 0; count <= 5; count++) {
......@@ -389,7 +419,7 @@ public class WeiboHotSearchCrawler {
document.put("topicLead", topicLead);
}
}
if (json.containsKey("cardlist_head_cards")&&!json.getJSONArray("cardlist_head_cards").isEmpty()) {
if (json.containsKey("cardlist_head_cards") && !json.getJSONArray("cardlist_head_cards").isEmpty()) {
JSONObject readJson = json.getJSONArray("cardlist_head_cards").getJSONObject(0);
if (readJson.containsKey("head_data")) {
String midText = readJson.getJSONObject("head_data").getString("midtext");
......@@ -465,9 +495,11 @@ public class WeiboHotSearchCrawler {
} catch (UnsupportedEncodingException e) {
log.error("字符解析成URl模式异常", e);
}
String url = "https://s.weibo.com/weibo?q="+encode+"&Refer=top";
String url = "https://s.weibo.com/weibo?q=" + encode + "&Refer=top";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
Map<String, String> headerMap = new HashMap<>();
headerMap.put("Cookie", "SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN");
Request request = RequestUtils.wrapGet(url,headerMap);
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
......@@ -634,13 +666,13 @@ public class WeiboHotSearchCrawler {
Long followerCount = null;
if (followers_count.contains("万")) {
String[] split = followers_count.split("万");
Double aDouble = Double.valueOf(split[0])*10000;
Double aDouble = Double.valueOf(split[0]) * 10000;
followerCount = new Double(aDouble).longValue();
} else if (followers_count.contains("亿")){
} else if (followers_count.contains("亿")) {
String[] split = followers_count.split("亿");
Double aDouble = Double.valueOf(split[0])*100000000;
Double aDouble = Double.valueOf(split[0]) * 100000000;
followerCount = new Double(aDouble).longValue();
}else {
} else {
followerCount = Long.valueOf(followers_count);
}
//用户头像地址
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment