Commit 172e5b3c by chenweitao

Merge branch 'working' into 'master'

Working

See merge request !135
parents 139ff5af 8ec17aa9
...@@ -6,12 +6,9 @@ import java.net.URLEncoder; ...@@ -6,12 +6,9 @@ import java.net.URLEncoder;
import java.text.ParseException; import java.text.ParseException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.*; import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSON;
import com.mongodb.client.result.UpdateResult;
import com.zhiwei.searchhotcrawler.bean.*; import com.zhiwei.searchhotcrawler.bean.*;
import com.zhiwei.searchhotcrawler.config.RedisConfig; import com.zhiwei.searchhotcrawler.config.RedisConfig;
import com.zhiwei.searchhotcrawler.dao.RedisDao; import com.zhiwei.searchhotcrawler.dao.RedisDao;
...@@ -22,19 +19,10 @@ import lombok.extern.log4j.Log4j2; ...@@ -22,19 +19,10 @@ import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response; import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.bson.Document; import org.bson.Document;
import org.checkerframework.checker.units.qual.C;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
...@@ -43,86 +31,126 @@ import com.zhiwei.crawler.core.proxy.ProxyHolder; ...@@ -43,86 +31,126 @@ import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.mail.SendMailWeibo; import com.zhiwei.searchhotcrawler.mail.SendMailWeibo;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
import org.springframework.beans.factory.annotation.Autowired;
import static java.util.Objects.isNull;
import static java.util.Objects.nonNull; import static java.util.Objects.nonNull;
/** /**
* @author hero * @author hero
* @author hero
* @ClassName: WeiboHotSearch * @ClassName: WeiboHotSearch
* @Description: 微博实时热搜采集 * @Description: 微博实时热搜采集
* @author hero
* @date 2017年9月15日 上午10:54:31 * @date 2017年9月15日 上午10:54:31
*/ */
@Log4j2 @Log4j2
public class WeiboHotSearchCrawler { public class WeiboHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static RedisDao redisDao = new RedisDao(); private static RedisDao redisDao = new RedisDao();
static WeiBoUserDao weiBoUserDao = new WeiBoUserDao(); static WeiBoUserDao weiBoUserDao = new WeiBoUserDao();
static WeiBoMassageDao weiBoMassageDao = new WeiBoMassageDao(); static WeiBoMassageDao weiBoMassageDao = new WeiBoMassageDao();
/**
* @Title: weiboHotSearchTest /**
* @author hero * @return void 返回类型
* @Description: TODO(PC端微博热搜采集) * @Title: weiboHotSearchTest
* @return void 返回类型 * @author hero
*/ * @Description: TODO(PC端微博热搜采集)
// public static List<HotSearchList> weiboHotSearch(){ */
// String url = "https://s.weibo.com/top/summary?cate=realtimehot"; public static List<HotSearchList> weiboHotSearch() {
// String url = "https://s.weibo.com/top/summary?cate=realtimehot";
// List<HotSearchList> list = new ArrayList<HotSearchList>(); Map<String, String> headerMap = new HashMap<>();
// for(int i =0; i<3; i++){ headerMap.put("Cookie", "SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN");
// String htmlBody = null; List<HotSearchList> list = new ArrayList<HotSearchList>();
// Request request = RequestUtils.wrapGet(url); for (int i = 0; i < 3; i++) {
// try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) { String htmlBody = null;
// htmlBody = response.body().string(); Request request = RequestUtils.wrapGet(url, headerMap);
// } catch (Exception e) { try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
// if(i==2){ htmlBody = response.body().string();
// return list; } catch (Exception e) {
// }else{ if (i == 2) {
// continue; return list;
// } } else {
// } continue;
// if (htmlBody != null && htmlBody.contains("pl_top_realtimehot")) { }
// try { }
//// String script = htmlBody.split("<script>STK && STK.pageletM && STK.pageletM.view")[5].split("<\\/script>")[0]; if (htmlBody != null && htmlBody.contains("pl_top_realtimehot")) {
//// script = script.replace("(", "").replace(")", ""); try {
//// JSONObject json = JSONObject.parseObject(script); Date date = new Date();
//// String html = json.getString("html"); org.jsoup.nodes.Document document = Jsoup.parse(htmlBody);
// Document document = Jsoup.parse(htmlBody); Elements elements = document.select("div#pl_top_realtimehot").select("tbody").select("tr");
// Elements elements = document.select("div#pl_top_realtimehot").select("tbody").select("tr"); for (Element element : elements) {
// for (Element element : elements) { try {
// try { //获取链接
// String id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href"); String id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href");
// String name = element.select("td.td-02").select("a").text(); //获取标题
// String num = !element.select("td.td-02").select("span").text().equals("") ? element.select("td.td-02").select("span").text() : "0"; String name = element.select("td.td-02").select("a").text();
// String rank = !element.select("td[class=\"td-01 ranktop\"]").text().equals("") ? element.select("td[class=\"td-01 ranktop\"]").text() : "-1"; //获取热度值
// String num = element.select("td.td-02").select("span").text();
// int hotCount = Integer.valueOf(num); //获取排名
// int rankCount = Integer.valueOf(rank); String rank = element.select("td.td-01").text();
// HotSearchList hotSearch = new HotSearchList(id, name, hotCount, true, rankCount, HotSearchType.微博热搜.name(), null); Integer rankCount = null;
// list.add(hotSearch); //默认推荐位排名为0 置顶为-1
// } catch (Exception e) { if ("•".equals(rank)) {
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com"); rankCount = 0;
// log.error("解析微博时时热搜时出现解析错误", e); id = "http://s.weibo.com" + element.select("td.td-02").select("a").attr("href_to");
// continue; } else if (StringUtils.isEmpty(rank)) {
// } rankCount = -1;
// } } else {
// } catch (Exception e) { rankCount = Integer.valueOf(rank);
// log.error("解析微博时时热搜时出现解析错误,数据不是json结构", e.fillInStackTrace()); }
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com"); //获取icon
// return null; String text = element.select("td.td-03").text();
// } String icon = null;
// } else { if (StringUtils.isNotEmpty(text) && nonNull(text)) {
// SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com"); if ("商".equals(text)) {
// log.info("解析微博时时热搜时出现解析错误,页面结构有问题"); icon = "jian";
// } } else if ("新".equals(text)) {
// break; icon = "new";
// } } else if ("热".equals(text)) {
// return list; icon = "hot";
// } } else if ("沸".equals(text)) {
icon = "fei";
} else if ("爆".equals(text)) {
icon = "boom";
}
}
//获取热度标签
String heatLabel = null;
//获取热度值 置顶 推荐位 默认值为0
Long hotCount =0L;
if (StringUtils.isNotEmpty(num) && Objects.nonNull(num)) {
String[] split = num.split(" ");
if (split.length > 1) {
heatLabel = split[0].trim();
hotCount = Long.valueOf(split[1].trim());
}else {
hotCount = Long.valueOf(num);
}
}
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, true, rankCount, HotSearchType.微博热搜.name(), icon, date);
hotSearch.setHeatLabel(heatLabel);
list.add(hotSearch);
} catch (Exception e) {
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
log.error("解析微博时时热搜时出现解析错误", e);
continue;
}
}
} catch (Exception e) {
log.error("解析微博时时热搜时出现解析错误,数据不是json结构", e.fillInStackTrace());
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
return null;
}
} else {
SendMailWeibo.sendMail("微博热搜采集出现问题", "859548429@qq.com");
log.info("解析微博时时热搜时出现解析错误,页面结构有问题");
}
break;
}
return list;
}
/** /**
...@@ -263,20 +291,20 @@ public class WeiboHotSearchCrawler { ...@@ -263,20 +291,20 @@ public class WeiboHotSearchCrawler {
JSONObject cardInfo = cardGroup.getJSONObject(j); JSONObject cardInfo = cardGroup.getJSONObject(j);
String name = cardInfo.getString("desc"); String name = cardInfo.getString("desc");
String desc_extr = cardInfo.getString("desc_extr"); String desc_extr = cardInfo.getString("desc_extr");
String heatLabel=null; String heatLabel = null;
Long hotCount =null; Long hotCount = null;
if (Objects.nonNull(desc_extr)){ if (Objects.nonNull(desc_extr)) {
String[] split = desc_extr.split(" "); String[] split = desc_extr.split(" ");
if (split.length>1){ if (split.length > 1) {
heatLabel= split[0].trim(); heatLabel = split[0].trim();
hotCount= Long.valueOf(split[1].trim()); hotCount = Long.valueOf(split[1].trim());
}else { } else {
hotCount = cardInfo.getLongValue("desc_extr"); hotCount = cardInfo.getLongValue("desc_extr");
} }
} }
String iconUrl = cardInfo.getString("icon"); String iconUrl = cardInfo.getString("icon");
String icon=null; String icon = null;
if (StringUtils.isNotBlank(iconUrl)) { if (StringUtils.isNotBlank(iconUrl)) {
icon = iconUrl.split("_")[1].split(".png")[0]; icon = iconUrl.split("_")[1].split(".png")[0];
} }
...@@ -284,7 +312,9 @@ public class WeiboHotSearchCrawler { ...@@ -284,7 +312,9 @@ public class WeiboHotSearchCrawler {
String id = cardInfo.getString("scheme"); String id = cardInfo.getString("scheme");
HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date); HotSearchList hotSearch = new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date);
hotSearch.setHeatLabel(heatLabel); hotSearch.setHeatLabel(heatLabel);
if (Objects.nonNull(iconUrl)){hotSearch.setIconUrl(iconUrl);} if (Objects.nonNull(iconUrl)) {
hotSearch.setIconUrl(iconUrl);
}
result.add(hotSearch); result.add(hotSearch);
rank++; rank++;
redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS, name + "_微博热搜"); redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS, name + "_微博热搜");
...@@ -371,7 +401,7 @@ public class WeiboHotSearchCrawler { ...@@ -371,7 +401,7 @@ public class WeiboHotSearchCrawler {
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {
log.error("更新导语时字符解析成URl模式异常", e); log.error("更新导语时字符解析成URl模式异常", e);
} }
String url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type"+encode; String url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type" + encode;
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
for (int count = 0; count <= 5; count++) { for (int count = 0; count <= 5; count++) {
...@@ -389,7 +419,7 @@ public class WeiboHotSearchCrawler { ...@@ -389,7 +419,7 @@ public class WeiboHotSearchCrawler {
document.put("topicLead", topicLead); document.put("topicLead", topicLead);
} }
} }
if (json.containsKey("cardlist_head_cards")&&!json.getJSONArray("cardlist_head_cards").isEmpty()) { if (json.containsKey("cardlist_head_cards") && !json.getJSONArray("cardlist_head_cards").isEmpty()) {
JSONObject readJson = json.getJSONArray("cardlist_head_cards").getJSONObject(0); JSONObject readJson = json.getJSONArray("cardlist_head_cards").getJSONObject(0);
if (readJson.containsKey("head_data")) { if (readJson.containsKey("head_data")) {
String midText = readJson.getJSONObject("head_data").getString("midtext"); String midText = readJson.getJSONObject("head_data").getString("midtext");
...@@ -465,9 +495,11 @@ public class WeiboHotSearchCrawler { ...@@ -465,9 +495,11 @@ public class WeiboHotSearchCrawler {
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {
log.error("字符解析成URl模式异常", e); log.error("字符解析成URl模式异常", e);
} }
String url = "https://s.weibo.com/weibo?q="+encode+"&Refer=top"; String url = "https://s.weibo.com/weibo?q=" + encode + "&Refer=top";
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Map<String, String> headerMap = new HashMap<>();
headerMap.put("Cookie", "SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN");
Request request = RequestUtils.wrapGet(url,headerMap);
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string(); htmlBody = response.body().string();
} catch (IOException e) { } catch (IOException e) {
...@@ -634,13 +666,13 @@ public class WeiboHotSearchCrawler { ...@@ -634,13 +666,13 @@ public class WeiboHotSearchCrawler {
Long followerCount = null; Long followerCount = null;
if (followers_count.contains("万")) { if (followers_count.contains("万")) {
String[] split = followers_count.split("万"); String[] split = followers_count.split("万");
Double aDouble = Double.valueOf(split[0])*10000; Double aDouble = Double.valueOf(split[0]) * 10000;
followerCount = new Double(aDouble).longValue(); followerCount = new Double(aDouble).longValue();
} else if (followers_count.contains("亿")){ } else if (followers_count.contains("亿")) {
String[] split = followers_count.split("亿"); String[] split = followers_count.split("亿");
Double aDouble = Double.valueOf(split[0])*100000000; Double aDouble = Double.valueOf(split[0]) * 100000000;
followerCount = new Double(aDouble).longValue(); followerCount = new Double(aDouble).longValue();
}else { } else {
followerCount = Long.valueOf(followers_count); followerCount = Long.valueOf(followers_count);
} }
//用户头像地址 //用户头像地址
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment