Commit 14db8c10 by 马黎滨

搜狗微信热搜采集

parent f23cfb22
......@@ -6,6 +6,7 @@ public enum HotSearchType {
知乎热搜,
抖音热搜,
搜狗微信热搜,
搜狗微信客户端热搜,
微博话题,
今日头条热搜,
知乎热搜榜单,
......
......@@ -2,6 +2,9 @@ package com.zhiwei.searchhotcrawler.crawler;
import java.util.*;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.tools.URLCodeUtil;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
......@@ -93,4 +96,39 @@ public class SougoHotSearchCrawler {
}
return list;
}
/**
 * Crawls the Sogou WeChat hot-search list from the app-side discovery endpoint.
 *
 * <p>The request is attempted up to three times. An attempt counts as failed when the
 * HTTP call throws, the body is blank, the body cannot be parsed as JSON, or the
 * {@code data} field is missing. The first attempt that yields a {@code data} array
 * ends the loop, even if that array is empty.
 *
 * @param date collection timestamp stamped onto every returned entry
 * @return the parsed hot-search entries ranked from 1 in response order; an empty list
 *         when all attempts fail (never {@code null})
 */
public static List<HotSearchList> sougouHotDataCrawler(Date date) {
    String url = "https://sa.sogou.com/sgsfe/aw/api/sgs/discovery?appinfo=SgL8QvtGkoc7xfT9%2Bcv6KR%2B4GzPVAe173rfQAid28G5Lewv%2Bs6%2FdCirHWDxkEg99530mEV4gZe0b0y8MQW9kyvJt2PR6sNL9tpWvrB6Lth24GK4NsBjfjZzsJwY5C7YyMjDKAQotP5NXXOifSq2b06G6A38QAMn6Su0a0z%2B1QrzKzU5Q7wTQFjMu0%2Ffm8QsYWJSJcsFK3ElzRy3gqjG5v%2F%2BcKesSlNVeVSy%2B8OU8df9likSHPejsFYCsf%2Bs%2B7aPRFn5a2vZeBoa70qFJwm%2BiesGrVe31SPPbrrhXQ7ZRWW1CdvL3k%2B8Rp1U5XWwWOGlfk1OtxjMdLujBukSbbFhGOEVofeWG%2Bh3vJv9cP%2FCfKvPKI1qALBLVzvhCT0VZN9eB&from=app&mid=a0e71605170951972%7C68c976&product_id=fr9H6hncTEmpn8AJjYairg";
    Map<String, String> headMap = HeaderTool.getCommonHead();
    Request request = RequestUtils.wrapGet(url, headMap);
    for (int i = 0; i < 3; i++) {
        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (Exception e) {
            log.error("解析搜狗微信时出现解析错误,页面结构有问题", e);
        }
        // Cheap pre-filter only: a substring match does not guarantee a usable "data" array.
        if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("data")) {
            continue;
        }
        try {
            JSONObject root = JSONObject.parseObject(htmlBody);
            JSONArray jsonArray = root == null ? null : root.getJSONArray("data");
            if (jsonArray == null) {
                // "data" appeared somewhere in the body but not as a top-level array; retry.
                continue;
            }
            return toHotSearchList(jsonArray, date);
        } catch (Exception e) {
            // Previously a malformed body escaped the method and skipped the remaining retries.
            log.error("解析搜狗微信客户端热搜响应失败,第{}次尝试", i + 1, e);
        }
    }
    return new ArrayList<>();
}

/**
 * Converts the {@code data} array of the discovery response into hot-search entries.
 * A fresh list is built on every call so a failure part-way through never leaks
 * partially converted rows into a later retry.
 */
private static List<HotSearchList> toHotSearchList(JSONArray jsonArray, Date date) {
    List<HotSearchList> result = new ArrayList<>(jsonArray.size());
    for (int j = 0; j < jsonArray.size(); j++) {
        JSONObject object = jsonArray.getJSONObject(j);
        int rank = j + 1;
        String name = object.getString("name");
        Integer count = object.getIntValue("num");
        String sougouUrl = "https://m.sogou.com/web/searchList.jsp?s_from=pcsearch&keyword=" + URLCodeUtil.getURLEncode(name, "utf-8");
        // tag == 1 is rendered as the "热" (hot) badge; any other value carries no icon.
        String icon = object.getIntValue("tag") == 1 ? "热" : null;
        result.add(new HotSearchList(sougouUrl, name, count, false, rank, HotSearchType.搜狗微信客户端热搜.name(), icon, date));
    }
    return result;
}
}
......@@ -163,16 +163,30 @@ public class GatherTimer {
}
/**
 * Scheduled collection of the Sogou WeChat hot list.
 *
 * Triggered by the cron expression below (second 0 of every minute) and dispatched
 * through the "myScheduler" async executor. Fetches the list via
 * SougoHotSearchCrawler.sougoHotSearch and hands it to TipsUtils.addHotList under the
 * 搜狗微信热搜 type name.
 *
 * NOTE(review): the start log and the size log each appear twice in this span, which
 * looks like the removed and added sides of a diff rendered together - confirm which
 * statements are actually present in the real file.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeChat(){
logger.info("搜狗微信热开始采集...");
logger.info("搜狗微信热开始采集...");
// Collection timestamp passed to the crawler; DateUtils.getMillSecondTime is defined elsewhere.
Date date = DateUtils.getMillSecondTime(new Date());
List<HotSearchList> list = SougoHotSearchCrawler.sougoHotSearch(date);
// A null list is reported as size 0.
logger.info("{}, 搜狗微信热搜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
logger.info("{}, 搜狗微信热采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
TipsUtils.addHotList(HotSearchType.搜狗微信热搜.name(),list);
logger.info("搜狗微信热词采集结束...");
}
/**
 * Scheduled collection of the Sogou WeChat hot-search list through the app-side endpoint.
 *
 * Triggered at second 20 of every minute on the "myScheduler" async executor. The
 * crawled entries are handed to TipsUtils.addHotList under the 搜狗微信客户端热搜 type name.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "20 * * * * ? ")
public void ceawlerSougouHotData(){
    logger.info("搜狗微信热搜开始采集...");

    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = SougoHotSearchCrawler.sougouHotDataCrawler(crawlTime);

    // Report how many entries this round produced; a null result counts as zero.
    final Date loggedAt = new Date();
    int collected = 0;
    if (hotItems != null) {
        collected = hotItems.size();
    }
    logger.info("{}, 搜狗微信热搜此轮采集到的数据量为:{}", loggedAt, collected);

    TipsUtils.addHotList(HotSearchType.搜狗微信客户端热搜.name(), hotItems);
    logger.info("搜狗微信热搜采集结束...");
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment