Commit 5ed24a98 by leiliangliang

新增快手热榜的采集

parent d2e5b1cc
......@@ -25,4 +25,5 @@ public enum HotSearchType {
B站热搜,
人气榜36,
虎嗅热文推荐,
快手热榜,
}
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import java.time.Duration;
import java.util.*;
/**
* @author ll
* @ClassName:KuaiShouHotSearchCrawlerTeat
* @Description:
* @date 2021年6月10日 下午5:54:31
*/
@Log4j2
public class KuaiShouHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/**
* @return void 返回类型
* @Title: hotSearch36KrCrawler
* @author hero
* @Description: PC端36Kr人气榜采集
*/
public static List<HotSearchList> KuaiShouHotSearchCrawler(Date date) {
String url = "https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
log.error("解析快手热榜时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("APOLLO_STATE")) {
return ansysData(htmlBody,date);
} else {
log.info("解析快手热榜时出现解析错误,页面结构有问题");
}
return Collections.emptyList();
}
private static List<HotSearchList> ansysData(String htmlBody, Date date) {
List<HotSearchList> list= new ArrayList<>();
JSONObject jsonObject = null;
try {
String substring = htmlBody.substring(htmlBody.indexOf("homexxunknown")+15, htmlBody.indexOf("homexxfilmcomlist")+18);
String sub = "{"+substring.substring(substring.indexOf("VisionHotRankResult") + 22, substring.indexOf("llsid") - 2)+"}}";
String substring1 = sub.substring(0,sub.indexOf("$ROOT_QUERY.visionMovieRank") - 2)+"}";
jsonObject = JSONObject.parseObject(substring1);
//获取每个jsonObject对象的值
Collection<Object> values = jsonObject.values();
for (Object value : values) {
try {
JSONObject object = (JSONObject)JSONObject.toJSON(value);
//获取话题名
String name = object.getString("name");
//排名
Integer rank = object.getInteger("rank");
String hotValue = object.getString("hotValue");
String[] ws = hotValue.split("w");
//热度
Double d = Double.valueOf(ws[0])*10000;
long hot = d.longValue();
//话题链接
String url = object.getString("poster");
//标签类型
String tagType =null;
if (object.containsKey("tagType")){
tagType = object.getString("tagType");
}
HotSearchList hotSearchList = new HotSearchList(url,name,hot,true,rank, HotSearchType.快手热榜.name(),tagType,date);
list.add(hotSearchList);
} catch (NumberFormatException e) {
log.error("解析快手热榜时出现解析错误",e);
}
}
} catch (NumberFormatException e) {
log.error("解析快手热榜时出现解析错误,数据不是json结构",e);
}
return list;
}
}
package com.zhiwei.searchhotcrawler.test;
import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import java.text.ParseException;
/**
 * Manual launcher for the hot-search collector threads.
 *
 * <p>Initialises the proxy factory; the individual collectors are started by
 * un-commenting the matching line in {@link #main(String[])}.
 */
public class HotSearchRunTest {

    /**
     * Entry point: sets up the proxy factory for the "hotsearch" application.
     *
     * @param args unused
     * @throws ParseException declared for callers; nothing in the visible body throws it
     */
    public static void main(String[] args) throws ParseException {
        ProxyFactory.init(buildProxyConfig());
        // Start collecting the Weibo hot search
        // new WeiboHotSearchRun().start();
        // Start collecting the Kuaishou hot list
        // new KuaiShouHotSearchRun().start();
    }

    /** Builds the proxy configuration from the registry and group held in {@code ProxyConfig}. */
    private static SimpleConfig buildProxyConfig() {
        return SimpleConfig.builder()
                .registry(ProxyConfig.registry)
                .group(ProxyConfig.group)
                .appId(10000013)
                .appName("hotsearch")
                .build();
    }
}
package com.zhiwei.searchhotcrawler.test;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import java.time.Duration;
import java.util.*;
/**
* @author ll
* @ClassName:KuaiShouHotSearchCrawlerTeat
* @Description:
* @date 2021年6月10日 下午5:54:31
*/
@Log4j2
public class KuaiShouHotSearchCrawlerTest {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/**
* @return void 返回类型
* @Title: hotSearch36KrCrawler
* @author hero
* @Description: PC端36Kr人气榜采集
*/
public static List<HotSearchList> KuaiShouHotSearchCrawler(Date date) {
String url = "https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
log.error("解析快手热榜时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("APOLLO_STATE")) {
return ansysData(htmlBody,date);
} else {
log.info("解析快手热榜时出现解析错误,页面结构有问题");
}
return Collections.emptyList();
}
private static List<HotSearchList> ansysData(String htmlBody, Date date) {
List<HotSearchList> list= new ArrayList<>();
JSONObject jsonObject = null;
try {
String substring = htmlBody.substring(htmlBody.indexOf("homexxunknown")+15, htmlBody.indexOf("homexxfilmcomlist")+18);
String sub = "{"+substring.substring(substring.indexOf("VisionHotRankResult") + 22, substring.indexOf("llsid") - 2)+"}}";
String substring1 = sub.substring(0,sub.indexOf("$ROOT_QUERY.visionMovieRank") - 2)+"}";
jsonObject = JSONObject.parseObject(substring1);
//获取每个jsonObject对象的值
Collection<Object> values = jsonObject.values();
for (Object value : values) {
try {
JSONObject object = (JSONObject)JSONObject.toJSON(value);
//获取话题名
String name = object.getString("name");
//排名
Integer rank = object.getInteger("rank");
String hotValue = object.getString("hotValue");
String[] ws = hotValue.split("w");
//热度
Double d = Double.valueOf(ws[0])*10000;
long hot = d.longValue();
//话题链接
String url = object.getString("poster");
//标签类型
String tagType =null;
if (object.containsKey("tagType")){
tagType = object.getString("tagType");
}
HotSearchList hotSearchList = new HotSearchList(url,name,hot,true,rank, HotSearchType.快手热榜.name(),tagType,date);
list.add(hotSearchList);
} catch (NumberFormatException e) {
log.error("解析快手热榜时出现解析错误",e);
}
}
} catch (NumberFormatException e) {
log.error("解析快手热榜时出现解析错误,数据不是json结构",e);
}
return list;
}
}
package com.zhiwei.searchhotcrawler.test;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler;
import com.zhiwei.searchhotcrawler.crawler.ZhihuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
@Log4j2
public class KuaiShouHotSearchRun extends Thread{
@Override
public void run() {
boolean f = true;
while(f) {
try {
getHotList();
TimeUnit.MINUTES.sleep(1);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*60*1000);
}
ZhiWeiTools.sleep(50);
}
}
private void getHotList() {
log.info("快手热榜采集开始........");
List<HotSearchList> kuaiShouList = KuaiShouHotSearchCrawlerTest.KuaiShouHotSearchCrawler(new Date());
log.info("{}, 此轮快手热榜采集到的数据量为:{}", new Date(), Integer.valueOf(kuaiShouList != null ? kuaiShouList.size() : 0));
TipsUtils.addHotList("快手热榜",kuaiShouList);
log.info("快手热榜采集结束........");
}
}
\ No newline at end of file
......@@ -507,5 +507,18 @@ public class GatherTimer {
}
return name;
}
/**
 * Scheduled collection of the Kuaishou hot list.
 *
 * <p>Triggered by the cron expression at second 0 of every minute and executed
 * asynchronously on the {@code myScheduler} executor. Crawls the list, logs the number
 * of entries found and passes them to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerKuaiShou() {
    logger.info("快手热榜开始采集...");
    final Date crawlTime = DateUtils.getMillSecondTime(new Date());
    final List<HotSearchList> hotList = KuaiShouHotSearchCrawler.KuaiShouHotSearchCrawler(crawlTime);
    int collected = 0;
    if (hotList != null) {
        collected = hotList.size();
    }
    logger.info("{}, 快手此轮采集到的数据量为:{}", new Date(), collected);
    TipsUtils.addHotList(HotSearchType.快手热榜.name(), hotList);
    logger.info("快手热榜采集结束...");
}
}
package hotSaerchTest;
import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.test.KuaiShouHotSearchCrawlerTest;
import lombok.extern.log4j.Log4j2;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import java.util.Date;
import java.util.List;
/**
 * Spring-backed smoke tests for the hot-search crawlers.
 *
 * @author ll
 * @date 2021/6/10 6:30
 */
@Log4j2
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations = {"classpath:applicationContext.xml"})
public class HotSearchTest {

    /**
     * Runs the Kuaishou hot list crawler once and prints the result and its size.
     */
    @Test
    public void kuaiShouTestCrawler() {
        initProxy();
        final Date now = new Date();
        final List<HotSearchList> crawled = KuaiShouHotSearchCrawlerTest.KuaiShouHotSearchCrawler(now);
        System.out.println(crawled);
        System.out.println(crawled.size());
    }

    /** Initialises the proxy factory with the "hotsearch" application settings. */
    private static void initProxy() {
        SimpleConfig proxySettings = SimpleConfig.builder()
                .registry(ProxyConfig.registry)
                .group(ProxyConfig.group)
                .appId(10000013)
                .appName("hotsearch")
                .build();
        ProxyFactory.init(proxySettings);
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment