Commit 977f9678 by chenweitao

Merge branch 'mlb-template-local' into 'mlbWork'

Mlb template local

See merge request !60
parents 27a25051 14db8c10
......@@ -6,6 +6,7 @@ public enum HotSearchType {
知乎热搜,
抖音热搜,
搜狗微信热搜,
搜狗微信客户端热搜,
微博话题,
今日头条热搜,
知乎热搜榜单,
......@@ -18,5 +19,6 @@ public enum HotSearchType {
网易热榜,
网易跟帖热议,
微博预热榜,
腾讯较真榜
腾讯较真榜,
脉脉热榜
}
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
 * Crawler for the Maimai (脉脉) hot-post ranking.
 *
 * <p>Fetches the hot-posts feed over HTTP, reads the {@code feeds} array of the JSON
 * answer and converts every entry that carries a {@code style35} object into a
 * {@link HotSearchList} row of type {@link HotSearchType#脉脉热榜}.
 */
@Log4j2
public class MaiMaiHotSearchCrawler {

    /** Shared HTTP client, built with {@code throwException(false)} and {@code retryTimes(3)}. */
    private static final HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).build();

    /**
     * Hot-posts endpoint queried by this crawler.
     * NOTE(review): the URL embeds a user id and an access token; consider moving
     * them to configuration so the credential is not kept in source control.
     */
    private static final String HOT_POSTS_URL = "https://open.taou.com/maimai/feed/v6/hot_posts_list?tab=profession&count=15&version=5.3.34&u=232258287&access_token=1.4c82e8ad6d6b4e03262a48f334dea336";

    /**
     * Collects the current Maimai hot list.
     *
     * @param date collection timestamp stored on every returned row
     * @return the parsed rows; never {@code null}, empty when the request fails or the
     *         answer does not contain a usable {@code feeds} array
     */
    public static List<HotSearchList> getMaiMaiHotData(Date date){
        log.info("脉脉热榜开始采集");
        List<HotSearchList> list = new ArrayList<>();
        String htmlBody = fetchBody();
        if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("feeds")) {
            try {
                collectFeeds(htmlBody, date, list);
            } catch (RuntimeException e) {
                // fastjson signals malformed or unexpectedly shaped JSON with unchecked
                // exceptions; rows parsed before the failure are kept.
                log.error("脉脉热榜数据解析异常...", e);
            }
        }
        log.info("{}, 此轮脉脉热榜采集到的数据量为:{}", new Date(), list.size());
        log.info("脉脉热榜采集结束");
        return list;
    }

    /** Downloads the feed and returns the raw body, or {@code null} when nothing could be read. */
    private static String fetchBody() {
        Request request = RequestUtils.wrapGet(HOT_POSTS_URL);
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            // Guard against a missing response/body instead of failing with a NullPointerException.
            if (response != null && response.body() != null) {
                return response.body().string();
            }
        } catch (IOException e) {
            log.error("脉脉热榜页面连接异常...", e);
        }
        return null;
    }

    /** Appends one row to {@code target} for every feed entry that has a {@code style35} object. */
    private static void collectFeeds(String body, Date date, List<HotSearchList> target) {
        JSONObject root = JSONObject.parseObject(body);
        JSONArray jsonArray = root != null ? root.getJSONArray("feeds") : null;
        if (jsonArray == null) {
            return;
        }
        for (int i = 0; i < jsonArray.size(); i++) {
            // The rank follows the position in the feed, also when earlier entries are skipped.
            Integer rank = i + 1;
            JSONObject feed = jsonArray.getJSONObject(i);
            JSONObject jsonObject = feed != null ? feed.getJSONObject("style35") : null;
            if (jsonObject == null) {
                continue;
            }
            String name = jsonObject.getString("text");
            log.info(name);
            String maimaiUrl = jsonObject.getString("share_url");
            // The card may be absent or explicitly null; both cases leave the icon unset.
            JSONObject hotTypeCard = jsonObject.getJSONObject("hot_type_card");
            String icon = hotTypeCard != null ? hotTypeCard.getString("text") : null;
            // "common" and "hot_info" are optional; a missing value counts as 0.
            JSONObject common = feed.getJSONObject("common");
            String hotValue = common != null ? common.getString("hot_info") : null;
            Integer count = StringUtils.isNotEmpty(hotValue) ? TipsUtils.getHotCount(hotValue) : 0;
            target.add(new HotSearchList(maimaiUrl, name, count, null, rank, HotSearchType.脉脉热榜.name(), icon, date));
        }
    }
}
......@@ -2,6 +2,9 @@ package com.zhiwei.searchhotcrawler.crawler;
import java.util.*;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.tools.URLCodeUtil;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
......@@ -93,4 +96,39 @@ public class SougoHotSearchCrawler {
}
return list;
}
/**
 * Collects the Sogou WeChat hot-search list from the app-side discovery endpoint.
 *
 * <p>The request is attempted up to three times. The first attempt whose answer
 * contains a {@code data} array is converted into rows of type
 * {@link HotSearchType#搜狗微信客户端热搜} and returned immediately; an attempt that
 * cannot be read or parsed is logged and the next attempt is made.
 *
 * @param date collection timestamp stored on every returned row
 * @return the parsed rows; never {@code null}, empty when no attempt produced data
 */
public static List<HotSearchList> sougouHotDataCrawler(Date date) {
    String url = "https://sa.sogou.com/sgsfe/aw/api/sgs/discovery?appinfo=SgL8QvtGkoc7xfT9%2Bcv6KR%2B4GzPVAe173rfQAid28G5Lewv%2Bs6%2FdCirHWDxkEg99530mEV4gZe0b0y8MQW9kyvJt2PR6sNL9tpWvrB6Lth24GK4NsBjfjZzsJwY5C7YyMjDKAQotP5NXXOifSq2b06G6A38QAMn6Su0a0z%2B1QrzKzU5Q7wTQFjMu0%2Ffm8QsYWJSJcsFK3ElzRy3gqjG5v%2F%2BcKesSlNVeVSy%2B8OU8df9likSHPejsFYCsf%2Bs%2B7aPRFn5a2vZeBoa70qFJwm%2BiesGrVe31SPPbrrhXQ7ZRWW1CdvL3k%2B8Rp1U5XWwWOGlfk1OtxjMdLujBukSbbFhGOEVofeWG%2Bh3vJv9cP%2FCfKvPKI1qALBLVzvhCT0VZN9eB&from=app&mid=a0e71605170951972%7C68c976&product_id=fr9H6hncTEmpn8AJjYairg";
    Map<String,String> headMap = HeaderTool.getCommonHead();
    Request request = RequestUtils.wrapGet(url, headMap);
    for (int i = 0; i < 3; i++) {
        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            // Guard against a missing response/body instead of failing with a NullPointerException.
            if (response != null && response.body() != null) {
                htmlBody = response.body().string();
            }
        } catch (Exception e) {
            log.error("解析搜狗微信时出现解析错误,页面结构有问题", e);
        }
        if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("data")) {
            continue;
        }
        try {
            JSONObject root = JSONObject.parseObject(htmlBody);
            JSONArray jsonArray = root != null ? root.getJSONArray("data") : null;
            if (jsonArray == null) {
                // The text contained "data" but not as a usable array; try again.
                continue;
            }
            // Build the rows per attempt so a failed attempt cannot leave partial rows behind.
            List<HotSearchList> list = new ArrayList<>(jsonArray.size());
            for (int j = 0; j < jsonArray.size(); j++) {
                JSONObject object = jsonArray.getJSONObject(j);
                if (object == null) {
                    continue;
                }
                int rank = j + 1;
                String name = object.getString("name");
                Integer count = object.getIntValue("num");
                String sougouUrl = "https://m.sogou.com/web/searchList.jsp?s_from=pcsearch&keyword=" + URLCodeUtil.getURLEncode(name, "utf-8");
                // A tag value of 1 is shown with the "热" icon; every other value has no icon.
                String icon = object.getIntValue("tag") == 1 ? "热" : null;
                list.add(new HotSearchList(sougouUrl, name, count, false, rank, HotSearchType.搜狗微信客户端热搜.name(), icon, date));
            }
            return list;
        } catch (RuntimeException e) {
            // fastjson signals malformed or unexpectedly shaped JSON with unchecked
            // exceptions; log it and use the remaining attempts.
            log.error("解析搜狗微信时出现解析错误,页面结构有问题", e);
        }
    }
    return new ArrayList<>();
}
}
......@@ -239,7 +239,11 @@ public class HotSearchCacheDAO {
// default :
// duration = duration + 1;
// }
if("脉脉热榜".equals(type)){
duration = duration + 30;
}else {
duration = duration + 1;
}
return duration;
}
......
......@@ -163,16 +163,30 @@ public class GatherTimer {
}
/**
 * Scheduled collection of the Sogou WeChat hot list.
 *
 * <p>Triggered by the cron expression {@code 0 * * * * ?}. Each run calls
 * {@code SougoHotSearchCrawler.sougoHotSearch}, logs how many rows came back and
 * hands them to {@code TipsUtils.addHotList} under the type
 * {@link HotSearchType#搜狗微信热搜}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeChat(){
    // The start message and the row count are each logged once per run.
    logger.info("搜狗微信热开始采集...");
    Date date = DateUtils.getMillSecondTime(new Date());
    List<HotSearchList> list = SougoHotSearchCrawler.sougoHotSearch(date);
    logger.info("{}, 搜狗微信热采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
    TipsUtils.addHotList(HotSearchType.搜狗微信热搜.name(),list);
    logger.info("搜狗微信热词采集结束...");
}
/**
 * Scheduled collection of the Sogou WeChat hot-search list via the app-side endpoint.
 *
 * <p>Triggered by the cron expression {@code 20 * * * * ?}. Each run calls
 * {@code SougoHotSearchCrawler.sougouHotDataCrawler}, logs the number of rows and
 * passes them to {@code TipsUtils.addHotList} under the type
 * {@link HotSearchType#搜狗微信客户端热搜}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "20 * * * * ? ")
public void ceawlerSougouHotData(){
    logger.info("搜狗微信热搜开始采集...");

    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = SougoHotSearchCrawler.sougouHotDataCrawler(crawlTime);

    // A null result is reported as zero rows.
    final Date loggedAt = new Date();
    int collected = 0;
    if (hotItems != null) {
        collected = hotItems.size();
    }
    logger.info("{}, 搜狗微信热搜此轮采集到的数据量为:{}", loggedAt, collected);

    final String typeName = HotSearchType.搜狗微信客户端热搜.name();
    TipsUtils.addHotList(typeName, hotItems);

    logger.info("搜狗微信热搜采集结束...");
}
......@@ -348,6 +362,17 @@ public class GatherTimer {
}
/**
 * Scheduled collection of the Maimai hot list.
 *
 * <p>Triggered by the cron expression {@code 30 0/30 * * * ?}. Each run fetches the
 * rows through {@code MaiMaiHotSearchCrawler.getMaiMaiHotData} and passes them to
 * {@code TipsUtils.addHotList} under the type {@link HotSearchType#脉脉热榜}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "30 0/30 * * * ? ")
public void crawlerMaiMaiHotSearch(){
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotItems = MaiMaiHotSearchCrawler.getMaiMaiHotData(crawlTime);
    final String typeName = HotSearchType.脉脉热榜.name();
    TipsUtils.addHotList(typeName, hotItems);
}
/**
* 微博超话的采集
*/
@Async(value = "myScheduler")
......
test
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment