Commit 833c31d7 by win 10

补全遗漏程序

parent 572289d9
package com.zhiwei.searchhotcrawler.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.BaiDuHotSearch;
/**
 * Crawler for the Baidu top-search ranking page (百度风云榜).
 *
 * <p>Requests the PC ranking page through {@link HttpBoot}, parses the ranking table with
 * Jsoup and turns every ranked table row into a {@link BaiDuHotSearch}.
 *
 * <p>Created 2019-07-10 10:54:31.
 *
 * @author hero
 */
public class BaiDuHotSearchCrawler {

    private static final Logger logger = LoggerFactory.getLogger(BaiDuHotSearchCrawler.class);
    private static final HttpBoot httpBoot = new HttpBoot();

    /** Address of the PC ranking page that is scraped. */
    private static final String RANKING_URL = "http://top.baidu.com/buzz?b=1&fr=topindex";

    /** Maximum number of times the page is requested before giving up. */
    private static final int MAX_ATTEMPTS = 3;

    /**
     * Collects the current entries of the PC ranking page.
     *
     * <p>The page is requested up to {@value #MAX_ATTEMPTS} times; another attempt is made
     * only when the request itself throws. A response that lacks the {@code mainBody} marker,
     * or a failure while parsing the document, ends the collection without a retry.
     *
     * @return the ranked rows found on the page; empty (never {@code null}) when the page
     *         could not be fetched or parsed
     */
    public static List<BaiDuHotSearch> baiduHotSearch() {
        List<BaiDuHotSearch> list = new ArrayList<>();
        for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
            try {
                String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(RANKING_URL)).body().string();
                if (htmlBody != null && htmlBody.contains("mainBody")) {
                    try {
                        collectRows(htmlBody, list);
                    } catch (Exception e) {
                        // Log the exception itself: fillInStackTrace() would replace the
                        // original trace with this logging site.
                        logger.error("解析百度风云榜时出现解析错误,数据不是json结构", e);
                    }
                } else {
                    logger.info("解析百度风云榜时出现解析错误,页面结构有问题");
                }
                // A response was received, so do not request the page again.
                break;
            } catch (Exception e) {
                // The request (or reading its body) failed: fall through to the next attempt.
                logger.error("解析百度风云榜时出现解析错误,页面结构有问题", e);
            }
        }
        logger.info("次轮采集的数据量为:{}", list.size());
        return list;
    }

    /**
     * Parses the ranking table contained in {@code htmlBody} and appends every ranked row
     * to {@code sink}. A row that fails to parse is logged and skipped.
     */
    private static void collectRows(String htmlBody, List<BaiDuHotSearch> sink) {
        Document document = Jsoup.parse(htmlBody);
        Elements rows = document.select("table.list-table").select("tr");
        for (Element row : rows) {
            try {
                BaiDuHotSearch hotSearch = parseRow(row);
                if (Objects.nonNull(hotSearch)) {
                    sink.add(hotSearch);
                }
            } catch (Exception e) {
                logger.error("解析百度风云榜时出现解析错误", e);
            }
        }
    }

    /**
     * Converts one table row into a {@link BaiDuHotSearch}.
     *
     * @param row a {@code tr} element of the ranking table
     * @return the parsed entry, or {@code null} when the row carries no rank
     * @throws NumberFormatException when the rank or the search index is not an integer
     */
    private static BaiDuHotSearch parseRow(Element row) {
        // The rank is rendered in either a "num-top" or a "num-normal" span.
        String rankStr = firstMatchText(row.select("td.first"), "span.num-top", "span.num-normal");
        if (StringUtils.isBlank(rankStr)) {
            // Rows without a rank are never collected, so skip them early.
            return null;
        }
        Integer rank = Integer.valueOf(rankStr);

        Elements titleLink = row.select("td.keyword").select("a.list-title");
        String kw = titleLink.text();
        logger.info("关键词:{}", kw);
        String everurl = titleLink.attr("href");

        // The search index is read from a falling or rising trend span; otherwise it stays 0.
        // NOTE(review): rows using any other trend marker end up with count 0 - confirm
        // against the live page whether further span classes need to be handled.
        String hot = firstMatchText(row.select("td.last"), "span.icon-fall", "span.icon-rise");
        int count = 0;
        if (StringUtils.isNotBlank(hot)) {
            count = Integer.parseInt(hot);
        }
        return new BaiDuHotSearch(rank, kw, everurl, count);
    }

    /**
     * Returns the text of the first selector that matches at least one element inside
     * {@code scope}, or {@code null} when none of the selectors matches.
     */
    private static String firstMatchText(Elements scope, String... selectors) {
        for (String selector : selectors) {
            Elements matched = scope.select(selector);
            if (!matched.isEmpty()) {
                return matched.text();
            }
        }
        return null;
    }
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment