Commit 8da69ed3 by chenweitao

Merge branch 'working' into 'master'

微博热词采集上线

See merge request !192
parents e800df88 97dcf959
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.WeiBoSearchBoxHotWords;
import com.zhiwei.searchhotcrawler.dao.WeiBoSearchBoxHotWordsDao;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Objects;
/**
* @author: ll
* @ClassName: WeiBoSearchHotWordsCrawler
* @Description: 移动端微博热词采集
* @date: 2022年05月12日 下午05:35:31
* @Title: WeiBoSearchHotWordsCrawler
*/
@Log4j2
public class WeiBoSearchHotWordsCrawler {
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
static WeiBoSearchBoxHotWordsDao weiBoSearchDao = new WeiBoSearchBoxHotWordsDao();
public static void weiBoSearchHotWords(Date date){
String url = "https://m.s.weibo.com/ajax_weibo/recomband?recom_type=user_words_realhot&lcode=&uicode=&refer=&lfid=";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
for (int count = 0; count <= 5; count++) {
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
if (response.hasCause()){
Throwable cause = response.cause();
log.error("解析微博热词时出现解析错误,页面结构有问题",cause);
}else {
htmlBody = response.bodyString();
}
if (htmlBody != null && htmlBody.contains("data")) {
int num = ansysData(htmlBody, date);
if(num>0){
break;
}
} else {
log.info("解析微博热词时出现解析错误,页面结构有问题");
continue;
}
}
}
//解析页面数据
private static int ansysData(String htmlBody, Date date) {
//使用静态WeiBoSearchDao,防止频繁连数据库
if (Objects.isNull(weiBoSearchDao)) {
weiBoSearchDao = new WeiBoSearchBoxHotWordsDao();
}
List<WeiBoSearchBoxHotWords> list = new ArrayList<>();
try {
//解析htmlBody
JSONObject object = JSONObject.parseObject(htmlBody);
//类型
String type="微博热词";
JSONObject data = object.getJSONObject("data");
//获取json数组
JSONArray cards = data.getJSONArray("topics");
for (int i = 0; i < cards.size(); i++) {
//获取单条数据
JSONObject card = cards.getJSONObject(i);
String ext = null;
String word = null;
//获取标题
String name = card.getString("name");
WeiBoSearchBoxHotWords weiBoSearch = new WeiBoSearchBoxHotWords(name, ext, word, type, date);
weiBoSearch.setId(name+"_"+type);
list.add(weiBoSearch);
}
} catch (Exception e) {
log.error("解析微博热词时出现解析错误,数据不是json结构",e);
}
log.info("{}, 此轮微博热词采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
//数据传给dao
weiBoSearchDao.addWeiBoSearchBoxHotWords(list);
return list.size();
}
}
......@@ -670,4 +670,16 @@ public class GatherTimer {
}
log.info(" 头条财经,科技,汽车,数码榜采集结束........");
}
/**
 * Scheduled job that triggers the Weibo hot-word collection.
 *
 * <p>Fires according to the cron expression on the method and is dispatched
 * through the "myScheduler" executor. Start and end of each run are logged.
 */
@Async("myScheduler")
@Scheduled(cron = "0 0 0/1 * * ? ")
public void WeiBoSearchHotWordsCrawler() {
    log.info("微博热词采集开始........");
    // The collection timestamp is taken once per run and passed to the crawler.
    WeiBoSearchHotWordsCrawler.weiBoSearchHotWords(DateUtils.getMillSecondTime(new Date()));
    log.info("微博热词采集结束........");
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment