Commit 1d1a7503 by leiliangliang

暂停知乎数码,微博超话,微博热词定时任务及头条阅读量更新

parent 2450a48a
......@@ -166,34 +166,29 @@ public class ToutiaoHotSearchCrawler {
String htmlBody = null;
String url = hotSearchList.getUrl();
Request request = RequestUtils.wrapGet(url);
for (int i = 0; i <= 5; i++) {
Response response = httpBoot.syncCall(request, ProxyServerSupplier.NAT_HEAVY_PROXY);
if (response.hasCause()) {
Throwable cause = response.cause();
log.error("解析今日头条热搜详情页面出现连接失败", cause);
} else {
htmlBody = response.bodyString();
}
if (StringUtils.isNotBlank(htmlBody)) {
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select(".result-content .cs-view .cs-topone-tail .cs-view .margin-bottom-m .margin-left-m");
if (Objects.nonNull(elements) && !elements.isEmpty()) {
Element element = elements.first();
String readCount = element.text().replaceAll("阅读", "");
Long count = TipsUtils.getHotCount(readCount);
log.info("{},阅读量:{}", hotSearchList.getName(), count);
hotSearchList.setCommentCount(count);
hotSearchListDAO.updateTouTiaoReadCount(hotSearchList);
return hotSearchList;
}
Response response = httpBoot.syncCall(request, ProxyServerSupplier.NAT_HEAVY_PROXY);
if (response.hasCause()) {
Throwable cause = response.cause();
log.error("解析今日头条热搜详情页面出现连接失败", cause);
} else {
htmlBody = response.bodyString();
}
if (StringUtils.isNotBlank(htmlBody)&&htmlBody.contains("data")) {
try {
String substring = htmlBody.substring(htmlBody.indexOf("read_count")+12, htmlBody.indexOf("search_bar_controll"));
String s = substring.split(",")[0];
Long commentCount = Long.valueOf(s);
hotSearchList.setCommentCount(commentCount);
hotSearchListDAO.updateTouTiaoReadCount(hotSearchList);
return hotSearchList;
} catch (Exception e) {
e.printStackTrace();
}
ZhiWeiTools.sleep(1000L);
}
}
return hotSearchList;
}
/**
* 热搜类型
*
......
......@@ -4,6 +4,7 @@ package com.zhiwei.searchhotcrawler.run;
import com.zhiwei.http.proxy.CynomysFactory;
import com.zhiwei.network.cynomys.consumer.CynomysConsumer;
import com.zhiwei.network.cynomys.consumer.CynomysConsumerFactory;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.timer.*;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.dubbo.config.ApplicationConfig;
......@@ -18,21 +19,28 @@ public class HotSearchRun {
public static void main(String[] args) {
ApplicationContext context = new ClassPathXmlApplicationContext("applicationContext.xml");
// SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
// .group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
// ProxyFactory.init(simpleConfig);
ApplicationConfig applicationConfig = new ApplicationConfig();
applicationConfig.setName("xxx-project");
applicationConfig.setName("hot_search-project");
RegistryConfig registryConfig = new RegistryConfig();
registryConfig.setAddress("zookeeper://192.168.0.30:2181?timeout=30000");
ConsumerConfig consumerConfig = new ConsumerConfig();
// 设置分组
consumerConfig.setGroup("test");
String username = "your cool username";
String password = "your cool password";
String username = null;
String password = null;
if (ProxyConfig.isLocal) {
registryConfig.setAddress(ProxyConfig.localRegistry);
// 设置分组
consumerConfig.setGroup(ProxyConfig.localGroup);
username = ProxyConfig.localUsername;
password = ProxyConfig.localPassword;
} else {
registryConfig.setAddress(ProxyConfig.hangzhouRegistry);
// 设置分组
consumerConfig.setGroup(ProxyConfig.hangzhouGroup);
username = ProxyConfig.hangzhouUsername;
password = ProxyConfig.hangzhouPassword;
}
// 创建 consumer,applicationConfig 非必需参数
CynomysConsumer consumer = CynomysConsumerFactory.create(applicationConfig, registryConfig, consumerConfig, username, password);
......
......@@ -113,8 +113,13 @@ public class GatherTimer {
log.info("{}, 今日头条此轮采集到的数据量为:{}", new Date(), toutiaoList != null ? toutiaoList.size() : 0);
TipsUtils.addHotList(HotSearchType.今日头条热搜.name(),toutiaoList);
log.info("今日头条热搜采集结束...");
log.info("今日头条热搜详情趋势阅读量更新...");
TouTiaoExecutor.countTouTiaoReadCount(toutiaoList);
//暂停今日头条阅读量更新
// log.info("今日头条热搜详情趋势阅读量更新开始...");
// //TouTiaoExecutor.countTouTiaoReadCount(toutiaoList);
// for (HotSearchList hotSearchList : toutiaoList) {
// ToutiaoHotSearchCrawler.toutiaoReadCount(hotSearchList);
// }
// log.info("今日头条热搜详情趋势阅读量更新结束...");
}
/**
......@@ -362,7 +367,7 @@ public class GatherTimer {
* 知乎热搜数码分类采集
*/
@Async(value = "myScheduler")
@Scheduled(cron = "20 * * * * ? ")
//@Scheduled(cron = "20 * * * * ? ")
public void crawlerZhiHuDigital() {
    // Zhihu hot-search collection for the digital category: hand off to the
    // child-category crawler with the DIGITAL selector.
    crawlerZhiHuChild(DIGITAL);
}
......@@ -428,7 +433,7 @@ public class GatherTimer {
* 微博超话的采集
*/
@Async(value = "myScheduler")
@Scheduled(cron = "0 0 0/3 * * ? ")
//@Scheduled(cron = "0 0 0/3 * * ? ")
public void crawlerWeiBoSuperTopic(){
log.info("微博超话采集开始........");
Date date = DateUtils.getMillSecondTime(new Date());
......@@ -675,7 +680,7 @@ public class GatherTimer {
*微博热词采集
*/
@Async(value = "myScheduler")
@Scheduled(cron = "0 0 0/1 * * ? ")
//@Scheduled(cron = "0 0 0/1 * * ? ")
public void WeiBoSearchHotWordsCrawler(){
log.info("微博热词采集开始........");
Date date = DateUtils.getMillSecondTime(new Date());
......
isLocal = true
isLocal = false
hangzhou.registry=zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182&timeout=60000
hangzhou.group=hangzhou
......@@ -7,5 +7,7 @@ hangzhou.password=gRG9QJ6QghuLcCC9
########################################################
local.registry=zookeeper://192.168.0.35:2181?backup=192.168.0.30:2181,192.168.0.11:2181&timeout=60000
local.group=local
#local.username=15139460980
#local.password=lllq2w3e4r
local.username=15757871020
local.password=Cwt1q2w3e4r@
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment