Commit 3a0f95c0 by leiliangliang

升级核心包Http-boot

parent 672054e6
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
<name>各平台热搜榜单采集程序</name> <name>各平台热搜榜单采集程序</name>
<version>0.0.6-SNAPSHOT</version> <version>0.0.6-SNAPSHOT</version>
<description>各平台热搜榜单采集程序 <description>各平台热搜榜单采集程序
目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序</description> 目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序</description>
<developers> <developers>
<developer> <developer>
<id>Bewilder</id> <id>Bewilder</id>
...@@ -38,6 +38,15 @@ ...@@ -38,6 +38,15 @@
</filters> </filters>
<transformers> <transformers>
<transformer> <transformer>
<resource>META-INF/spring.handlers</resource>
</transformer>
<transformer>
<resource>META-INF/spring.schemas</resource>
</transformer>
<transformer>
<resource>META-INF/spring.tooling</resource>
</transformer>
<transformer>
<mainClass>com.zhiwei.searchhotcrawler.run.HotSearchRun</mainClass> <mainClass>com.zhiwei.searchhotcrawler.run.HotSearchRun</mainClass>
</transformer> </transformer>
</transformers> </transformers>
...@@ -73,32 +82,22 @@ ...@@ -73,32 +82,22 @@
</build> </build>
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.6.7.2-RELEASE</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>junit</groupId> <groupId>junit</groupId>
<artifactId>junit</artifactId> <artifactId>junit</artifactId>
<version>4.13</version> <version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.20</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
<version>5.3.6</version>
<scope>test</scope> <scope>test</scope>
<exclusions>
<exclusion>
<artifactId>hamcrest-core</artifactId>
<groupId>org.hamcrest</groupId>
</exclusion>
</exclusions>
</dependency> </dependency>
</dependencies> </dependencies>
<properties> <properties>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<spring.version>4.2.2.RELEASE</spring.version>
<log4j.version>2.15.0</log4j.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties> </properties>
</project> </project>
......
...@@ -43,10 +43,16 @@ ...@@ -43,10 +43,16 @@
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.1.6-SNAPSHOT</version> <version>0.1.6-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <!--<dependency>-->
<groupId>com.zhiwei.crawler</groupId> <!--<groupId>com.zhiwei.crawler</groupId>-->
<artifactId>crawler-core</artifactId> <!--<artifactId>crawler-core</artifactId>-->
<version>0.6.7.4-SNAPSHOT</version> <!--<version>0.6.7.4-SNAPSHOT</version>-->
<!--</dependency>-->
<!-- http知微核心包 -->
<dependency>
<groupId>com.zhiwei.http</groupId>
<artifactId>http-boot</artifactId>
<version>0.0.5.9-SNAPSHOT</version>
</dependency> </dependency>
<!-- https://mvnrepository.com/artifact/org.conscrypt/conscrypt-openjdk-uber --> <!-- https://mvnrepository.com/artifact/org.conscrypt/conscrypt-openjdk-uber -->
<dependency> <dependency>
......
...@@ -73,18 +73,17 @@ public class WeiboSuperTopic { ...@@ -73,18 +73,17 @@ public class WeiboSuperTopic {
public WeiboSuperTopic() {} public WeiboSuperTopic() {}
public WeiboSuperTopic(String url, String topicName, Integer rank, String score, public WeiboSuperTopic(String url, String topicName, Integer rank, String postNum,
String fensi, String type) { String fensi, String type) {
this.url = url; this.url = url;
this.topicName = topicName; this.topicName = topicName;
this.rank = rank; this.rank = rank;
this.score = score; this.postNum = postNum;
this.fensi = fensi; this.fensi = fensi;
this.type = type; this.type = type;
this.time = new Date(); this.time = new Date();
this.day = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd"); this.day = TimeParse.dateFormartString(new Date(), "yyyy-MM-dd");
this.id = topicName + "_" + type + "_" + day; this.id = topicName + "_" + type + "_" + time.getTime();
} }
} }
...@@ -4,18 +4,17 @@ import java.net.URLDecoder; ...@@ -4,18 +4,17 @@ import java.net.URLDecoder;
import java.time.Duration; import java.time.Duration;
import java.util.*; import java.util.*;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
...@@ -28,7 +27,8 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType; ...@@ -28,7 +27,8 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
@Log4j2 @Log4j2
public class BaiDuHotSearchCrawler { public class BaiDuHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build(); //private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/** /**
* @return void 返回类型 * @return void 返回类型
...@@ -40,10 +40,12 @@ public class BaiDuHotSearchCrawler { ...@@ -40,10 +40,12 @@ public class BaiDuHotSearchCrawler {
String url = "http://top.baidu.com/buzz?b=1&fr=topindex"; String url = "http://top.baidu.com/buzz?b=1&fr=topindex";
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (Exception e) { Throwable cause = response.cause();
log.error("解析百度风云榜时出现解析错误,页面结构有问题", e); log.error("解析百度风云榜时出现解析错误,页面结构有问题", cause);
}else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("container-bg_lQ801")) { if (htmlBody != null && htmlBody.contains("container-bg_lQ801")) {
return ansysNewData(htmlBody, date); return ansysNewData(htmlBody, date);
......
...@@ -2,17 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -2,17 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.Response;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.text.DateFormat; import java.text.DateFormat;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.*; import java.util.*;
...@@ -27,7 +26,8 @@ import java.util.*; ...@@ -27,7 +26,8 @@ import java.util.*;
@Log4j2 @Log4j2
public class BiliComprehensiveHotCrawler { public class BiliComprehensiveHotCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); //private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/** /**
...@@ -53,10 +53,12 @@ public class BiliComprehensiveHotCrawler { ...@@ -53,10 +53,12 @@ public class BiliComprehensiveHotCrawler {
for (int i = 0; i < urlList.size(); i++) { for (int i = 0; i < urlList.size(); i++) {
Request request = RequestUtils.wrapGet(urlList.get(i)); Request request = RequestUtils.wrapGet(urlList.get(i));
//发送请求每次获取20条数据 //发送请求每次获取20条数据
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error(fmt.format(date)+":第"+i+1+"次请求解析B站综合热门时出现连接失败", e); log.error(fmt.format(date)+":第"+i+1+"次请求解析B站综合热门时出现连接失败", cause);
}else {
htmlBody = response.bodyString();
} }
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) { if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) {
try { try {
......
...@@ -2,16 +2,17 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -2,16 +2,17 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
...@@ -20,12 +21,13 @@ import java.io.IOException; ...@@ -20,12 +21,13 @@ import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Objects;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
@Log4j2 @Log4j2
public class BililiCrawler { public class BililiCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/** /**
* B站排行榜的采集 * B站排行榜的采集
...@@ -41,10 +43,12 @@ public class BililiCrawler { ...@@ -41,10 +43,12 @@ public class BililiCrawler {
String url = "https://api.bilibili.com/x/web-interface/ranking/v2?rid=0&type=all"; String url = "https://api.bilibili.com/x/web-interface/ranking/v2?rid=0&type=all";
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
for(int t=0; t<3 && dataJson==null; t++){ for(int t=0; t<3 && dataJson==null; t++){
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("B站排行榜页面连接失败",e.fillInStackTrace()); log.error("B站排行榜页面连接失败",cause.fillInStackTrace());
}else {
htmlBody = response.bodyString();
} }
try { try {
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){ if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){
...@@ -129,8 +133,8 @@ public class BililiCrawler { ...@@ -129,8 +133,8 @@ public class BililiCrawler {
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
try { try {
System.setProperty("https.protocols", "TLSv1,TLSv1.1,TLSv1.2,SSLv3"); System.setProperty("https.protocols", "TLSv1,TLSv1.1,TLSv1.2,SSLv3");
Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY); Response response = httpBoot.syncCall(request, ProxySupplier.FOREIGN_INNER_PROXY);
String htmlBody = response.body().string(); String htmlBody = response.bodyString();
if (htmlBody != null && htmlBody.contains("v-wrap")) { if (htmlBody != null && htmlBody.contains("v-wrap")) {
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
//获取标签 //获取标签
...@@ -139,17 +143,21 @@ public class BililiCrawler { ...@@ -139,17 +143,21 @@ public class BililiCrawler {
hotSearchList.setTag(tag); hotSearchList.setTag(tag);
//获取粉丝数 //获取粉丝数
if (htmlBody.contains("v_upinfo")) { if (htmlBody.contains("v_upinfo")) {
String text = document.select("div.follow-btn").select("span").text(); String text = document.select("div.follow-btn").select("span").last().text();
String fan = text.split(" ")[2]; if (StringUtils.isNotEmpty(text)&& Objects.nonNull(text)) {
Long fanCount =null; Long fanCount = null;
if (fan.contains("万")){ if (text.contains("关注")){
double dou = Double.parseDouble(fan.replaceAll("万", " ")); text =text.replaceAll("关注"," ").trim();
fanCount =new Double(dou*10000).longValue(); }
}else { if (text.contains("万")) {
fanCount =Long.valueOf(fan); double dou = Double.parseDouble(text.replaceAll("万", " ").trim());
fanCount = new Double(dou * 10000).longValue();
} else {
fanCount = Long.valueOf(text);
} }
hotSearchList.setFans(fanCount); hotSearchList.setFans(fanCount);
} }
}
return hotSearchList; return hotSearchList;
} else { } else {
return hotSearchList; return hotSearchList;
...@@ -173,10 +181,12 @@ public class BililiCrawler { ...@@ -173,10 +181,12 @@ public class BililiCrawler {
String url = "https://app.biliapi.com/x/v2/search/square?build=616050&limit=10"; String url = "https://app.biliapi.com/x/v2/search/square?build=616050&limit=10";
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
for(int t=0; t<3 && dataJson==null; t++){ for(int t=0; t<3 && dataJson==null; t++){
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("B站热搜页面连接失败",e.fillInStackTrace()); log.error("B站热搜页面连接失败",cause.fillInStackTrace());
}else {
htmlBody = response.bodyString();
} }
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){ if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){
dataJson = JSONObject.parseObject(htmlBody).getJSONArray("data").getJSONObject(0).getJSONObject("data").getJSONArray("list"); dataJson = JSONObject.parseObject(htmlBody).getJSONArray("data").getJSONObject(0).getJSONObject("data").getJSONArray("list");
......
...@@ -5,17 +5,17 @@ import java.util.ArrayList; ...@@ -5,17 +5,17 @@ import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
...@@ -30,7 +30,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType; ...@@ -30,7 +30,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
@Log4j2 @Log4j2
public class DouyinHotSearchCrawler { public class DouyinHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
public static List<HotSearchList> list = new ArrayList<>(); public static List<HotSearchList> list = new ArrayList<>();
...@@ -46,10 +46,12 @@ public class DouyinHotSearchCrawler { ...@@ -46,10 +46,12 @@ public class DouyinHotSearchCrawler {
String url = "https://api.amemv.com/aweme/v1/hot/search/list/"; String url = "https://api.amemv.com/aweme/v1/hot/search/list/";
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
}catch (IOException e) { Throwable cause = response.cause();
log.debug("获取抖音热搜榜时出现问题:{}", e); log.debug("获取抖音热搜榜时出现问题:{}", cause);
}else {
htmlBody = response.bodyString();
} }
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("word_list")) { if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("word_list")) {
list = new ArrayList<>(); list = new ArrayList<>();
...@@ -87,10 +89,12 @@ public class DouyinHotSearchCrawler { ...@@ -87,10 +89,12 @@ public class DouyinHotSearchCrawler {
String resultUrl = null; String resultUrl = null;
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
}catch (IOException e) { Throwable cause = response.cause();
log.debug("获取抖音热搜榜链接时出现问题:{}", e); log.debug("获取抖音热搜榜链接时出现问题:{}", cause);
}else {
htmlBody = response.bodyString();
} }
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("aweme_list")){ if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("aweme_list")){
JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("aweme_list"); JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("aweme_list");
......
...@@ -2,15 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -2,15 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.TipsUtils; import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import java.io.IOException; import java.io.IOException;
...@@ -20,7 +21,7 @@ import java.util.List; ...@@ -20,7 +21,7 @@ import java.util.List;
@Log4j2 @Log4j2
public class FengHuangSearchCrawler { public class FengHuangSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/** /**
* 获取凤凰新闻热榜 * 获取凤凰新闻热榜
...@@ -33,10 +34,12 @@ public class FengHuangSearchCrawler { ...@@ -33,10 +34,12 @@ public class FengHuangSearchCrawler {
String url = "https://nine.ifeng.com/hotspotlist?gv=7.9.1&page="+page; String url = "https://nine.ifeng.com/hotspotlist?gv=7.9.1&page="+page;
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("凤凰新闻热榜页面连接异常...", e); log.error("凤凰新闻热榜页面连接异常...", cause);
}else {
htmlBody = response.bodyString();
} }
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){ if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){
JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("list"); JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("list");
...@@ -71,10 +74,12 @@ public class FengHuangSearchCrawler { ...@@ -71,10 +74,12 @@ public class FengHuangSearchCrawler {
String url = "https://shankapi.ifeng.com/autumn/sogouSearchHotword"; String url = "https://shankapi.ifeng.com/autumn/sogouSearchHotword";
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("凤凰新闻热搜页面连接异常...", e); log.error("凤凰新闻热搜页面连接异常...", cause);
}else {
htmlBody = response.bodyString();
} }
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) { if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) {
JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("data").getJSONObject(0).getJSONArray("item"); JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("data").getJSONObject(0).getJSONArray("item");
......
package com.zhiwei.searchhotcrawler.crawler; package com.zhiwei.searchhotcrawler.crawler;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.Response;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
...@@ -26,7 +27,7 @@ import java.util.*; ...@@ -26,7 +27,7 @@ import java.util.*;
@Log4j2 @Log4j2
public class HotSearch36KrCrawler { public class HotSearch36KrCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/** /**
* @return void 返回类型 * @return void 返回类型
...@@ -38,13 +39,13 @@ public class HotSearch36KrCrawler { ...@@ -38,13 +39,13 @@ public class HotSearch36KrCrawler {
String url = "https://www.36kr.com/hot-list/catalog"; String url = "https://www.36kr.com/hot-list/catalog";
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { if (response.hasCause()){
htmlBody = response.body().string(); Throwable cause = response.cause();
} catch (Exception e) { log.error("解析36Kr人气榜时出现解析错误,页面结构有问题", cause);
log.error("解析36Kr人气榜时出现解析错误,页面结构有问题", e); }else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("article-list")) { if (htmlBody != null && htmlBody.contains("article-list")) {
return ansysData(htmlBody,date); return ansysData(htmlBody,date);
} else { } else {
......
...@@ -2,14 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -2,14 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
...@@ -27,17 +29,19 @@ import java.util.*; ...@@ -27,17 +29,19 @@ import java.util.*;
*/ */
@Log4j2 @Log4j2
public class HuXiuHotSearchCrawler { public class HuXiuHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
public static List<HotSearchList> HuXiuHotArticleRecommended(Date date){ public static List<HotSearchList> HuXiuHotArticleRecommended(Date date){
String url = "https://www.huxiu.com/"; String url = "https://www.huxiu.com/";
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (Exception e) { Throwable cause = response.cause();
log.error("解析虎嗅热文推荐时出现解析错误,页面结构有问题", e); log.error("解析虎嗅热文推荐时出现解析错误,页面结构有问题", cause);
}else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("hot__list")) { if (htmlBody != null && htmlBody.contains("hot__list")) {
return ansysData(htmlBody,date); return ansysData(htmlBody,date);
......
package com.zhiwei.searchhotcrawler.crawler; package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import java.time.Duration; import java.time.Duration;
import java.util.*; import java.util.*;
...@@ -22,7 +24,7 @@ import java.util.*; ...@@ -22,7 +24,7 @@ import java.util.*;
*/ */
@Log4j2 @Log4j2
public class KuaiShouHotSearchCrawler { public class KuaiShouHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/** /**
* @return void 返回类型 * @return void 返回类型
...@@ -34,11 +36,12 @@ public class KuaiShouHotSearchCrawler { ...@@ -34,11 +36,12 @@ public class KuaiShouHotSearchCrawler {
String url = "https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791"; String url = "https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791";
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { if (response.hasCause()){
htmlBody = response.body().string(); Throwable cause = response.cause();
} catch (Exception e) { log.error("解析快手热榜时出现解析错误,页面结构有问题", cause);
log.error("解析快手热榜时出现解析错误,页面结构有问题", e); }else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("APOLLO_STATE")) { if (htmlBody != null && htmlBody.contains("APOLLO_STATE")) {
return ansysData(htmlBody,date); return ansysData(htmlBody,date);
......
...@@ -2,15 +2,17 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -2,15 +2,17 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.TipsUtils; import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import java.io.IOException; import java.io.IOException;
...@@ -21,7 +23,7 @@ import java.util.List; ...@@ -21,7 +23,7 @@ import java.util.List;
@Log4j2 @Log4j2
public class MaiMaiHotSearchCrawler { public class MaiMaiHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/** /**
* 获取maimai热榜 * 获取maimai热榜
...@@ -33,10 +35,12 @@ public class MaiMaiHotSearchCrawler { ...@@ -33,10 +35,12 @@ public class MaiMaiHotSearchCrawler {
String url = "https://open.taou.com/maimai/feed/v6/hot_posts_list?tab=profession&count=15&version=5.3.34&u=232258287&access_token=1.4c82e8ad6d6b4e03262a48f334dea336"; String url = "https://open.taou.com/maimai/feed/v6/hot_posts_list?tab=profession&count=15&version=5.3.34&u=232258287&access_token=1.4c82e8ad6d6b4e03262a48f334dea336";
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("脉脉热榜页面连接异常...", e); log.error("脉脉热榜页面连接异常...", cause);
}else {
htmlBody = response.bodyString();
} }
//1024 - 26(时间戳+type) = 998 -> name.getBytes(StandardCharsets.UTF_8).length<998 -> 998/3 = 332 //1024 - 26(时间戳+type) = 998 -> name.getBytes(StandardCharsets.UTF_8).length<998 -> 998/3 = 332
int nameLengthMax = 300; int nameLengthMax = 300;
......
...@@ -4,10 +4,13 @@ import java.util.*; ...@@ -4,10 +4,13 @@ import java.util.*;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
...@@ -15,9 +18,6 @@ import org.jsoup.nodes.Element; ...@@ -15,9 +18,6 @@ import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.httpclient.HeaderTool; import com.zhiwei.tools.httpclient.HeaderTool;
...@@ -31,7 +31,7 @@ import com.zhiwei.tools.httpclient.HeaderTool; ...@@ -31,7 +31,7 @@ import com.zhiwei.tools.httpclient.HeaderTool;
@Log4j2 @Log4j2
public class SougoHotSearchCrawler { public class SougoHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/** /**
* @Title: SougoHotSearchTest * @Title: SougoHotSearchTest
...@@ -46,10 +46,12 @@ public class SougoHotSearchCrawler { ...@@ -46,10 +46,12 @@ public class SougoHotSearchCrawler {
Request request = RequestUtils.wrapGet(url, headMap); Request request = RequestUtils.wrapGet(url, headMap);
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
String htmlBody = null; String htmlBody = null;
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
}catch (Exception e) { Throwable cause = response.cause();
log.error("解析搜狗微信时出现解析错误,页面结构有问题", e); log.error("解析搜狗微信时出现解析错误,页面结构有问题", cause);
}else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("topwords")) { if (htmlBody != null && htmlBody.contains("topwords")) {
try { try {
...@@ -108,10 +110,12 @@ public class SougoHotSearchCrawler { ...@@ -108,10 +110,12 @@ public class SougoHotSearchCrawler {
Request request = RequestUtils.wrapGet(url, headMap); Request request = RequestUtils.wrapGet(url, headMap);
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
String htmlBody = null; String htmlBody = null;
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (Exception e) { Throwable cause = response.cause();
log.error("解析搜狗微信时出现解析错误,页面结构有问题", e); log.error("解析搜狗微信时出现解析错误,页面结构有问题", cause);
}else {
htmlBody = response.bodyString();
} }
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){ if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){
JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("data"); JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("data");
......
...@@ -2,19 +2,18 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -2,19 +2,18 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.Response;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.TipsUtils; import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.Date; import java.util.Date;
...@@ -22,7 +21,7 @@ import java.util.List; ...@@ -22,7 +21,7 @@ import java.util.List;
@Log4j2 @Log4j2
public class SouhuTopicCrawler { public class SouhuTopicCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
public static List<HotSearchList> getSouhuTopic(Date date){ public static List<HotSearchList> getSouhuTopic(Date date){
List<HotSearchList> hotSearchLists = new ArrayList<>(); List<HotSearchList> hotSearchLists = new ArrayList<>();
...@@ -32,10 +31,12 @@ public class SouhuTopicCrawler { ...@@ -32,10 +31,12 @@ public class SouhuTopicCrawler {
String url = "https://api.k.sohu.com/api/news/moment/v2/list.go?pageSize=50&v=6.4.4"; String url = "https://api.k.sohu.com/api/news/moment/v2/list.go?pageSize=50&v=6.4.4";
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
for(int t=0; t<3 && dataJson==null; t++){ for(int t=0; t<3 && dataJson==null; t++){
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("搜狐话题页面连接失败",e.fillInStackTrace()); log.error("搜狐话题页面连接失败",cause.fillInStackTrace());
}else {
htmlBody = response.bodyString();
} }
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){ if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){
JSONObject jsonObject = JSONObject.parseObject(htmlBody).getJSONObject("data"); JSONObject jsonObject = JSONObject.parseObject(htmlBody).getJSONObject("data");
......
...@@ -2,15 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -2,15 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.Response;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.MD5Util; import com.zhiwei.searchhotcrawler.util.MD5Util;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import java.time.Duration; import java.time.Duration;
import java.util.*; import java.util.*;
...@@ -23,7 +24,7 @@ import java.util.*; ...@@ -23,7 +24,7 @@ import java.util.*;
*/ */
@Log4j2 @Log4j2
public class TaoBaoHotSearchCrawler { public class TaoBaoHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
public static List<HotSearchList> taoBaoHotSearch(Date date) { public static List<HotSearchList> taoBaoHotSearch(Date date) {
Map<String, String> headerMap = new HashMap<>(); Map<String, String> headerMap = new HashMap<>();
...@@ -34,8 +35,12 @@ public class TaoBaoHotSearchCrawler { ...@@ -34,8 +35,12 @@ public class TaoBaoHotSearchCrawler {
String urls = "https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t=" + time + "&sign=&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"; String urls = "https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t=" + time + "&sign=&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D";
Request request1 = RequestUtils.wrapGet(urls); Request request1 = RequestUtils.wrapGet(urls);
String token = null; String token = null;
try (Response response = httpBoot.syncCall(request1, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request1, ProxySupplier.NAT_HEAVY_PROXY);
List<String> values = response.networkResponse().headers().values("Set-Cookie"); if (response.hasCause()){
Throwable cause = response.cause();
log.error("解析淘宝热搜时出现解析错误,页面结构有问题", cause);
}else {
List<String> values = response.headers().values("Set-Cookie");
String tk = values.get(1); String tk = values.get(1);
String[] splitTk = tk.split(";"); String[] splitTk = tk.split(";");
String _m_h5_tk = splitTk[0]; String _m_h5_tk = splitTk[0];
...@@ -44,18 +49,18 @@ public class TaoBaoHotSearchCrawler { ...@@ -44,18 +49,18 @@ public class TaoBaoHotSearchCrawler {
String[] splitEnc = enc.split(";"); String[] splitEnc = enc.split(";");
String _m_h5_tk_enc = splitEnc[0]; String _m_h5_tk_enc = splitEnc[0];
headerMap.put("cookie", _m_h5_tk + ";" + _m_h5_tk_enc); headerMap.put("cookie", _m_h5_tk + ";" + _m_h5_tk_enc);
} catch (Exception e) {
log.error("解析淘宝热搜时出现解析错误,页面结构有问题", e);
} }
String signs = token + "&" + time + "&12574478&{\"appId\":\"10211\",\"params\":\"{\\\"multi_hintq_show\\\":\\\"on\\\",\\\"src\\\":\\\"c2c\\\",\\\"area\\\":\\\"active_page\\\",\\\"sversion\\\":\\\"7.5\\\",\\\"bangdan_src\\\":\\\"list\\\"}\"}"; String signs = token + "&" + time + "&12574478&{\"appId\":\"10211\",\"params\":\"{\\\"multi_hintq_show\\\":\\\"on\\\",\\\"src\\\":\\\"c2c\\\",\\\"area\\\":\\\"active_page\\\",\\\"sversion\\\":\\\"7.5\\\",\\\"bangdan_src\\\":\\\"list\\\"}\"}";
String sign = MD5Util.getMD5(signs).toLowerCase(); String sign = MD5Util.getMD5(signs).toLowerCase();
String url = "https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t=" + time + "&sign=" + sign + "&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"; String url = "https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t=" + time + "&sign=" + sign + "&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D";
Request request = RequestUtils.wrapGet(url, headerMap); Request request = RequestUtils.wrapGet(url, headerMap);
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response1 = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response1.hasCause()){
Throwable cause = response1.cause();
log.error("解析淘宝热搜时出现解析错误,页面结构有问题", cause);
}else {
htmlBody = response1.bodyString();
ht = !htmlBody.contains("非法请求"); ht = !htmlBody.contains("非法请求");
} catch (Exception e) {
log.error("解析淘宝热搜时出现解析错误,页面结构有问题", e);
} }
if (htmlBody != null && htmlBody.contains("data")) { if (htmlBody != null && htmlBody.contains("data")) {
return ansysData(htmlBody, date); return ansysData(htmlBody, date);
......
...@@ -2,15 +2,15 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -2,15 +2,15 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.Response;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
...@@ -20,7 +20,7 @@ import java.util.List; ...@@ -20,7 +20,7 @@ import java.util.List;
@Log4j2 @Log4j2
public class TengXunCrawler { public class TengXunCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/** /**
* 腾讯热榜数据采集 * 腾讯热榜数据采集
...@@ -35,10 +35,12 @@ public class TengXunCrawler { ...@@ -35,10 +35,12 @@ public class TengXunCrawler {
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
//采集为空最多重试3次 //采集为空最多重试3次
for (int t = 0; t < 3 && dataJson == null; t++) { for (int t = 0; t < 3 && dataJson == null; t++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { response.cause().printStackTrace();
e.printStackTrace();
}else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("idlist")) { if (htmlBody != null && htmlBody.contains("idlist")) {
JSONObject topSearch = JSONObject.parseObject(htmlBody); JSONObject topSearch = JSONObject.parseObject(htmlBody);
...@@ -96,10 +98,12 @@ public class TengXunCrawler { ...@@ -96,10 +98,12 @@ public class TengXunCrawler {
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
//采集为空最多重试3次 //采集为空最多重试3次
for (int t = 0; t < 3; t++) { for (int t = 0; t < 3; t++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { response.cause().printStackTrace();
e.printStackTrace();
}else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("data")){ if (htmlBody != null && htmlBody.contains("data")){
JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("data"); JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("data");
......
...@@ -3,9 +3,11 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -3,9 +3,11 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO; import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
...@@ -14,7 +16,7 @@ import com.zhiwei.tools.tools.URLCodeUtil; ...@@ -14,7 +16,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
...@@ -34,7 +36,7 @@ import java.util.*; ...@@ -34,7 +36,7 @@ import java.util.*;
*/ */
@Log4j2 @Log4j2
public class ToutiaoHotSearchCrawler { public class ToutiaoHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/** /**
* @Title: weiboHotSearchByPhoneTest * @Title: weiboHotSearchByPhoneTest
...@@ -47,10 +49,12 @@ public class ToutiaoHotSearchCrawler { ...@@ -47,10 +49,12 @@ public class ToutiaoHotSearchCrawler {
String jsUrl = "https://s3.pstatp.com/toutiao/feoffline/hot_list/resource/hot_list/js/index.45f50250.chunk.js"; String jsUrl = "https://s3.pstatp.com/toutiao/feoffline/hot_list/resource/hot_list/js/index.45f50250.chunk.js";
Request jsRequest = RequestUtils.wrapGet(jsUrl); Request jsRequest = RequestUtils.wrapGet(jsUrl);
String jsBody = null; String jsBody = null;
try(Response response = httpBoot.syncCall(jsRequest,ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(jsRequest, ProxySupplier.NAT_HEAVY_PROXY);
jsBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("获取今日头条实时热搜头部信息标识失败",e); log.error("获取今日头条实时热搜头部信息标识失败",cause);
}else {
jsBody = response.bodyString();
} }
if(jsBody != null && jsBody.contains("origin")){ if(jsBody != null && jsBody.contains("origin")){
String s = jsBody.substring(jsBody.indexOf("origin:")+"origin:".length()); String s = jsBody.substring(jsBody.indexOf("origin:")+"origin:".length());
...@@ -61,10 +65,12 @@ public class ToutiaoHotSearchCrawler { ...@@ -61,10 +65,12 @@ public class ToutiaoHotSearchCrawler {
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
for(int count =0; count<=5; count++){ for(int count =0; count<=5; count++){
try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) { Response response1 = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response1.hasCause()){
} catch (IOException e1) { Throwable cause = response1.cause();
log.error("解析今日头条实时热搜时出现连接失败",e1); log.error("解析今日头条实时热搜时出现连接失败",cause);
}else {
htmlBody = response1.bodyString();
} }
List<HotSearchList> result = new ArrayList<HotSearchList>(); List<HotSearchList> result = new ArrayList<HotSearchList>();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) { if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) {
...@@ -159,10 +165,12 @@ public class ToutiaoHotSearchCrawler { ...@@ -159,10 +165,12 @@ public class ToutiaoHotSearchCrawler {
String url = hotSearchList.getUrl(); String url = hotSearchList.getUrl();
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
for (int i = 0; i <= 5; i++) { for (int i = 0; i <= 5; i++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e1) { Throwable cause = response.cause();
log.error("解析今日头条热搜详情页面出现连接失败", e1); log.error("解析今日头条热搜详情页面出现连接失败", cause);
}else {
htmlBody = response.bodyString();
} }
if (StringUtils.isNotBlank(htmlBody)) { if (StringUtils.isNotBlank(htmlBody)) {
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
......
...@@ -2,16 +2,18 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -2,16 +2,18 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.TipsUtils; import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
...@@ -26,7 +28,7 @@ import java.util.List; ...@@ -26,7 +28,7 @@ import java.util.List;
*/ */
@Log4j2 @Log4j2
public class WangYiHotSearchCrawler { public class WangYiHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/** /**
* 网易新闻实时热榜的采集 * 网易新闻实时热榜的采集
...@@ -39,10 +41,12 @@ public class WangYiHotSearchCrawler { ...@@ -39,10 +41,12 @@ public class WangYiHotSearchCrawler {
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
String htmlBody = null; String htmlBody = null;
for(int t=0 ;t<3; t++) { for(int t=0 ;t<3; t++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("网易新闻实时热榜页面连接异常...", e); log.error("网易新闻实时热榜页面连接异常...",cause);
}else {
htmlBody = response.bodyString();
} }
if(htmlBody!=null && htmlBody.contains("data")) { if(htmlBody!=null && htmlBody.contains("data")) {
JSONObject bodyObject = JSONObject.parseObject(htmlBody).getJSONObject("data"); JSONObject bodyObject = JSONObject.parseObject(htmlBody).getJSONObject("data");
...@@ -80,10 +84,12 @@ public class WangYiHotSearchCrawler { ...@@ -80,10 +84,12 @@ public class WangYiHotSearchCrawler {
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
String htmlBody = null; String htmlBody = null;
for(int t=0 ;t<3; t++) { for(int t=0 ;t<3; t++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("网易新闻跟贴热议页面连接异常...", e); log.error("网易新闻跟贴热议页面连接异常...",cause);
}else {
htmlBody = response.bodyString();
} }
if(htmlBody!=null && htmlBody.contains("data")) { if(htmlBody!=null && htmlBody.contains("data")) {
JSONObject bodyObject = JSONObject.parseObject(htmlBody).getJSONObject("data"); JSONObject bodyObject = JSONObject.parseObject(htmlBody).getJSONObject("data");
......
...@@ -2,14 +2,15 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -2,14 +2,15 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.Response;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.WeiBoSearchBoxHotWords; import com.zhiwei.searchhotcrawler.bean.WeiBoSearchBoxHotWords;
import com.zhiwei.searchhotcrawler.dao.WeiBoSearchBoxHotWordsDao; import com.zhiwei.searchhotcrawler.dao.WeiBoSearchBoxHotWordsDao;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import java.time.Duration; import java.time.Duration;
import java.util.ArrayList; import java.util.ArrayList;
...@@ -26,7 +27,7 @@ import java.util.Objects; ...@@ -26,7 +27,7 @@ import java.util.Objects;
*/ */
@Log4j2 @Log4j2
public class WeiBoSearchBoxHotWordsCrawler { public class WeiBoSearchBoxHotWordsCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
static WeiBoSearchBoxHotWordsDao weiBoSearchDao = new WeiBoSearchBoxHotWordsDao(); static WeiBoSearchBoxHotWordsDao weiBoSearchDao = new WeiBoSearchBoxHotWordsDao();
public static void weiBoSearchBoxHotWords(Date date){ public static void weiBoSearchBoxHotWords(Date date){
...@@ -35,11 +36,12 @@ public class WeiBoSearchBoxHotWordsCrawler { ...@@ -35,11 +36,12 @@ public class WeiBoSearchBoxHotWordsCrawler {
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
for (int count = 0; count <= 5; count++) { for (int count = 0; count <= 5; count++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
Throwable cause = response.cause();
} catch (Exception e) { log.error("解析微博搜索框热词时出现解析错误,页面结构有问题",cause);
log.error("解析微博搜索框热词时出现解析错误,页面结构有问题", e); }else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("hotwords")) { if (htmlBody != null && htmlBody.contains("hotwords")) {
int num = ansysData(htmlBody, date); int num = ansysData(htmlBody, date);
......
...@@ -2,17 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -2,17 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.Response;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.util.*; import java.util.*;
/** /**
...@@ -24,7 +23,7 @@ import java.util.*; ...@@ -24,7 +23,7 @@ import java.util.*;
@Log4j2 @Log4j2
public class WeiboEntertainmentCrawler { public class WeiboEntertainmentCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/** /**
* @return void 返回类型 * @return void 返回类型
...@@ -38,10 +37,12 @@ public class WeiboEntertainmentCrawler { ...@@ -38,10 +37,12 @@ public class WeiboEntertainmentCrawler {
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
for (int count = 0; count <= 5; count++) { for (int count = 0; count <= 5; count++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("解析微博娱乐榜时出现连接失败", e); log.error("解析微博娱乐榜时出现连接失败",cause);
}else {
htmlBody = response.bodyString();
} }
List<HotSearchList> result = new ArrayList(); List<HotSearchList> result = new ArrayList();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) { if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
......
...@@ -9,6 +9,10 @@ import java.util.*; ...@@ -9,6 +9,10 @@ import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSON;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.*; import com.zhiwei.searchhotcrawler.bean.*;
import com.zhiwei.searchhotcrawler.config.RedisConfig; import com.zhiwei.searchhotcrawler.config.RedisConfig;
import com.zhiwei.searchhotcrawler.dao.RedisDao; import com.zhiwei.searchhotcrawler.dao.RedisDao;
...@@ -17,7 +21,6 @@ import com.zhiwei.searchhotcrawler.dao.WeiBoUserDao; ...@@ -17,7 +21,6 @@ import com.zhiwei.searchhotcrawler.dao.WeiBoUserDao;
import com.zhiwei.searchhotcrawler.util.TipsUtils; import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.bson.Document; import org.bson.Document;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
...@@ -26,9 +29,6 @@ import org.jsoup.select.Elements; ...@@ -26,9 +29,6 @@ import org.jsoup.select.Elements;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.mail.SendMailWeibo; import com.zhiwei.searchhotcrawler.mail.SendMailWeibo;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
...@@ -45,7 +45,7 @@ import static java.util.Objects.nonNull; ...@@ -45,7 +45,7 @@ import static java.util.Objects.nonNull;
@Log4j2 @Log4j2
public class WeiboHotSearchCrawler { public class WeiboHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
private static RedisDao redisDao = new RedisDao(); private static RedisDao redisDao = new RedisDao();
...@@ -66,14 +66,15 @@ public class WeiboHotSearchCrawler { ...@@ -66,14 +66,15 @@ public class WeiboHotSearchCrawler {
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url, headerMap); Request request = RequestUtils.wrapGet(url, headerMap);
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (Exception e) {
if (i == 2) { if (i == 2) {
return list; return list;
} else { } else {
continue; continue;
} }
}else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("pl_top_realtimehot")) { if (htmlBody != null && htmlBody.contains("pl_top_realtimehot")) {
try { try {
...@@ -261,10 +262,12 @@ public class WeiboHotSearchCrawler { ...@@ -261,10 +262,12 @@ public class WeiboHotSearchCrawler {
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url, headerMap); Request request = RequestUtils.wrapGet(url, headerMap);
for (int count = 0; count <= 5; count++) { for (int count = 0; count <= 5; count++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("解析微博时热搜时出现连接失败", e); log.error("解析微博时热搜时出现连接失败",cause);
}else {
htmlBody = response.bodyString();
} }
List<HotSearchList> result = new ArrayList<HotSearchList>(); List<HotSearchList> result = new ArrayList<HotSearchList>();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) { if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
...@@ -349,10 +352,12 @@ public class WeiboHotSearchCrawler { ...@@ -349,10 +352,12 @@ public class WeiboHotSearchCrawler {
String url = "https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot"; String url = "https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot";
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("解析微博热搜时出现连接失败", e); log.error("解析微博热搜时出现连接失败",cause);
}else {
htmlBody = response.bodyString();
} }
List<HotSearchList> result = new ArrayList<>(); List<HotSearchList> result = new ArrayList<>();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) { if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
...@@ -405,10 +410,12 @@ public class WeiboHotSearchCrawler { ...@@ -405,10 +410,12 @@ public class WeiboHotSearchCrawler {
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
for (int count = 0; count <= 5; count++) { for (int count = 0; count <= 5; count++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("解析微博热搜详情页面时出现连接失败", e); log.error("解析微博热搜详情页面时出现连接失败",cause);
}else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("data")) { if (htmlBody != null && htmlBody.contains("data")) {
JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("cardlistInfo"); JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("cardlistInfo");
...@@ -500,10 +507,12 @@ public class WeiboHotSearchCrawler { ...@@ -500,10 +507,12 @@ public class WeiboHotSearchCrawler {
Map<String, String> headerMap = new HashMap<>(); Map<String, String> headerMap = new HashMap<>();
headerMap.put("Cookie", "SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN"); headerMap.put("Cookie", "SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN");
Request request = RequestUtils.wrapGet(url,headerMap); Request request = RequestUtils.wrapGet(url,headerMap);
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("解析微博时热搜时出现连接失败", e); log.error("解析微博时热搜时出现连接失败",cause);
}else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("m-main")) { if (htmlBody != null && htmlBody.contains("m-main")) {
Document docm = new Document(); Document docm = new Document();
......
...@@ -2,14 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -2,14 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import java.io.IOException; import java.io.IOException;
...@@ -25,7 +27,7 @@ import java.util.*; ...@@ -25,7 +27,7 @@ import java.util.*;
@Log4j2 @Log4j2
public class WeiboNewsCrawler { public class WeiboNewsCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/** /**
...@@ -46,11 +48,13 @@ public class WeiboNewsCrawler { ...@@ -46,11 +48,13 @@ public class WeiboNewsCrawler {
for (int count = 0; count <= 5; count++) { for (int count = 0; count <= 5; count++) {
List<HotSearchList> result = new ArrayList(); List<HotSearchList> result = new ArrayList();
//发送第一次请求获取前20条数据 //发送第一次请求获取前20条数据
try (Response response = httpBoot.syncCall(request1, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request1, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("第一次请求解析微博要闻榜时出现连接失败", e); log.error("第一次请求解析微博要闻榜时出现连接失败", cause);
continue; continue;
}else {
htmlBody = response.bodyString();
} }
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) { if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
try { try {
...@@ -66,11 +70,13 @@ public class WeiboNewsCrawler { ...@@ -66,11 +70,13 @@ public class WeiboNewsCrawler {
continue; continue;
} }
//发送第二次请求获取中间20条数据 //发送第二次请求获取中间20条数据
try (Response response = httpBoot.syncCall(request2, ProxyHolder.NAT_HEAVY_PROXY)) { Response response1 = httpBoot.syncCall(request2, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response1.hasCause()){
} catch (IOException e) { Throwable cause = response1.cause();
log.error("第二次请求解析微博要闻榜时出现连接失败", e); log.error("第二次请求解析微博要闻榜时出现连接失败",cause);
continue; continue;
}else {
htmlBody = response1.bodyString();
} }
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) { if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
try { try {
...@@ -87,11 +93,13 @@ public class WeiboNewsCrawler { ...@@ -87,11 +93,13 @@ public class WeiboNewsCrawler {
continue; continue;
} }
//发送第三次请求获取最后10条数据 //发送第三次请求获取最后10条数据
try (Response response = httpBoot.syncCall(request3, ProxyHolder.NAT_HEAVY_PROXY)) { Response response2 = httpBoot.syncCall(request3, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response2.hasCause()){
} catch (IOException e) { Throwable cause = response2.cause();
log.error("第三次请求解析微博要闻榜时出现连接失败", e); log.error("第三次请求解析微博要闻榜时出现连接失败",cause);
continue; continue;
}else {
htmlBody = response2.bodyString();
} }
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) { if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
try { try {
......
...@@ -8,18 +8,19 @@ import java.util.Map; ...@@ -8,18 +8,19 @@ import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.Objects; import java.util.Objects;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic; import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
/** /**
* *
...@@ -31,7 +32,7 @@ import com.zhiwei.crawler.core.utils.RequestUtils; ...@@ -31,7 +32,7 @@ import com.zhiwei.crawler.core.utils.RequestUtils;
@Log4j2 @Log4j2
public class WeiboSuperTopicCrawler { public class WeiboSuperTopicCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
private static Map<String,String> headMap = new HashMap<>(); private static Map<String,String> headMap = new HashMap<>();
static { static {
...@@ -63,13 +64,15 @@ public class WeiboSuperTopicCrawler { ...@@ -63,13 +64,15 @@ public class WeiboSuperTopicCrawler {
String htmlBody = null; String htmlBody = null;
//重试三次 //重试三次
for(int retryTimes = 1; retryTimes<=3; retryTimes++) { for(int retryTimes = 1; retryTimes<=3; retryTimes++) {
try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
}catch (Exception e) { Throwable cause = response.cause();
log.error("获取榜单列表页面时出现错误,错误为:{}", e); log.error("获取榜单列表页面时出现错误,错误为:{}",cause);
continue; continue;
}else {
htmlBody = response.bodyString();
} }
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc1")) { if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) {
topicList.addAll(parseTopicRankHtml(page, htmlBody, type)); topicList.addAll(parseTopicRankHtml(page, htmlBody, type));
break; break;
} else { } else {
...@@ -99,21 +102,18 @@ public class WeiboSuperTopicCrawler { ...@@ -99,21 +102,18 @@ public class WeiboSuperTopicCrawler {
Integer toprank = null; Integer toprank = null;
String topicName = null; String topicName = null;
String id = null; String id = null;
String score = null; String postNum = null;
String desc1 = null;
String fensi = null; String fensi = null;
String url = null; String url = null;
for(int i=0;i<list.size();i++) { for(int i=0;i<list.size();i++) {
JSONObject data = list.getJSONObject(i); JSONObject data = list.getJSONObject(i);
toprank = page + data.getInteger("toprank"); toprank = ++page;
topicName = data.getString("display_name"); topicName = data.getString("display_name");
id = data.getString("page_id"); id = data.getString("page_id");
score = data.getString("score"); postNum = data.getString("status_count");
desc1 = data.getString("desc1"); fensi = data.getString("fans_count");
fensi = desc1.replaceAll(".*影响力|粉丝", "").trim();
url = data.getString("link"); url = data.getString("link");
WeiboSuperTopic topic = new WeiboSuperTopic(url, topicName, toprank, postNum, fensi, type);
WeiboSuperTopic topic = new WeiboSuperTopic(url, topicName, toprank, score, fensi, type);
topic = getTopicInfo(id, topic); topic = getTopicInfo(id, topic);
topicList.add(topic); topicList.add(topic);
} }
...@@ -140,17 +140,19 @@ public class WeiboSuperTopicCrawler { ...@@ -140,17 +140,19 @@ public class WeiboSuperTopicCrawler {
String url = "https://m.weibo.cn/api/container/getIndex?containerid="+ id; String url = "https://m.weibo.cn/api/container/getIndex?containerid="+ id;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
String htmlBody = null; String htmlBody = null;
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (Exception e) { Throwable cause = response.cause();
log.error("解析榜单详情页面时出现错误,错误为:{}", e); log.error("解析榜单详情页面时出现错误,错误为:{}",cause);
}else {
htmlBody = response.bodyString();
} }
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc_more")) { if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("desc_more")) {
String descMore = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("pageInfo").getJSONArray("desc_more").getString(0); String descMore = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("pageInfo").getJSONArray("desc_more").getString(0);
if (StringUtils.isNotBlank(descMore)) { if (StringUtils.isNotBlank(descMore)) {
String readNum = descMore.replaceAll("阅读|帖子.*", "").trim(); String readNum = descMore.replaceAll("阅读|帖子.*", "").trim();
String postNum = descMore.replaceAll(".*帖子|粉丝.*", "").trim(); //String postNum = descMore.replaceAll(".*帖子|粉丝.*", "").trim();
topic.setPostNum(postNum); //topic.setPostNum(postNum);
topic.setReadNum(readNum); topic.setReadNum(readNum);
return topic; return topic;
} }
......
...@@ -3,9 +3,10 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -3,9 +3,10 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.Response;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic; import com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic;
...@@ -13,7 +14,6 @@ import com.zhiwei.searchhotcrawler.util.TipsUtils; ...@@ -13,7 +14,6 @@ import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
...@@ -31,7 +31,7 @@ import java.util.regex.Pattern; ...@@ -31,7 +31,7 @@ import java.util.regex.Pattern;
*/ */
@Log4j2 @Log4j2
public class WeiboTopicCrawler { public class WeiboTopicCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
private static Map<String,String> headMap = new HashMap<>(); private static Map<String,String> headMap = new HashMap<>();
static { static {
...@@ -137,11 +137,13 @@ public class WeiboTopicCrawler { ...@@ -137,11 +137,13 @@ public class WeiboTopicCrawler {
String htmlBody = null; String htmlBody = null;
//重试三次 //重试三次
for(int retryTimes = 1; retryTimes<=5; retryTimes++) { for(int retryTimes = 1; retryTimes<=5; retryTimes++) {
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (Exception e) { Throwable cause = response.cause();
log.error("下载榜单列表页面时出现错误,错误为:{}", e); log.error("下载榜单列表页面时出现错误,错误为:{}", cause);
continue; continue;
}else {
htmlBody = response.bodyString();
} }
if (StringUtils.isNotBlank(htmlBody)) { if (StringUtils.isNotBlank(htmlBody)) {
topicList.addAll(parseTopicHtml(htmlBody,date)); topicList.addAll(parseTopicHtml(htmlBody,date));
......
...@@ -2,9 +2,10 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -2,9 +2,10 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.Response;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.TipsUtils; import com.zhiwei.searchhotcrawler.util.TipsUtils;
...@@ -12,7 +13,7 @@ import com.zhiwei.tools.tools.URLCodeUtil; ...@@ -12,7 +13,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
...@@ -24,7 +25,7 @@ import java.util.*; ...@@ -24,7 +25,7 @@ import java.util.*;
@Log4j2 @Log4j2
public class XinLangHotSearchCrawler { public class XinLangHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/** /**
* 新浪热榜的采集 * 新浪热榜的采集
...@@ -38,10 +39,12 @@ public class XinLangHotSearchCrawler { ...@@ -38,10 +39,12 @@ public class XinLangHotSearchCrawler {
String htmlBody = null; String htmlBody = null;
JSONObject jsonObject = null; JSONObject jsonObject = null;
for(int t=0 ;t<3&&jsonObject==null; t++) { for(int t=0 ;t<3&&jsonObject==null; t++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("新浪热榜页面连接异常...", e); log.error("新浪热榜页面连接异常...",cause);
}else {
htmlBody = response.bodyString();
} }
if(htmlBody!=null) { if(htmlBody!=null) {
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
...@@ -111,10 +114,12 @@ public class XinLangHotSearchCrawler { ...@@ -111,10 +114,12 @@ public class XinLangHotSearchCrawler {
String htmlBody = null; String htmlBody = null;
JSONArray dataJson = null; JSONArray dataJson = null;
for(int t=0 ;t<3&&dataJson==null; t++) { for(int t=0 ;t<3&&dataJson==null; t++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("新浪热点页面连接异常...", e); log.error("新浪热点页面连接异常...",cause);
}else {
htmlBody = response.bodyString();
} }
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) { if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) {
JSONObject jsonObject = JSONObject.parseObject(htmlBody).getJSONObject("data"); JSONObject jsonObject = JSONObject.parseObject(htmlBody).getJSONObject("data");
......
...@@ -4,16 +4,18 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -4,16 +4,18 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.TipsUtils; import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.*;
...@@ -21,7 +23,7 @@ import java.util.*; ...@@ -21,7 +23,7 @@ import java.util.*;
@Log4j2 @Log4j2
public class ZhihuChildHotSearchCrawler { public class ZhihuChildHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/** /**
* 知乎子级分类数据采集 * 知乎子级分类数据采集
...@@ -39,10 +41,11 @@ public class ZhihuChildHotSearchCrawler { ...@@ -39,10 +41,11 @@ public class ZhihuChildHotSearchCrawler {
Request request = RequestUtils.wrapGet(url, headerMap); Request request = RequestUtils.wrapGet(url, headerMap);
//采集为空最多重试3次 //采集为空最多重试3次
for (int t = 0; t < 3 && dataJson == null; t++) { for (int t = 0; t < 3 && dataJson == null; t++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { response.cause().printStackTrace();
e.printStackTrace(); }else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("data")) { if (htmlBody != null && htmlBody.contains("data")) {
JSONObject topSearch = JSONObject.parseObject(htmlBody); JSONObject topSearch = JSONObject.parseObject(htmlBody);
......
...@@ -3,17 +3,18 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -3,17 +3,18 @@ package com.zhiwei.searchhotcrawler.crawler;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.*;
import com.zhiwei.crawler.core.config.SslProvider; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import io.netty.handler.ssl.SslProvider;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.httpclient.HeaderTool; import com.zhiwei.tools.httpclient.HeaderTool;
...@@ -34,7 +35,8 @@ import static java.util.Objects.nonNull; ...@@ -34,7 +35,8 @@ import static java.util.Objects.nonNull;
@Log4j2 @Log4j2
public class ZhihuHotSearchCrawler { public class ZhihuHotSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().sslProvider(SslProvider.CONSCRYPT).retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
//private static HttpBoot httpBoot = HttpBoot.newBuilder().sslProvider(SslProvider.CONSCRYPT).retryTimes(3).build();
/** /**
* @Title: getZhihuHotList * @Title: getZhihuHotList
* @author hero * @author hero
...@@ -98,11 +100,13 @@ public class ZhihuHotSearchCrawler { ...@@ -98,11 +100,13 @@ public class ZhihuHotSearchCrawler {
headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20"); headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url, headerMap); Request request = RequestUtils.wrapGet(url, headerMap);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.debug("获取知乎热搜时出现问题:{}", e); log.debug("获取知乎热搜时出现问题:{}",cause);
return list; return list;
}else {
htmlBody = response.bodyString();
} }
try { try {
if (htmlBody != null && htmlBody.contains("author")) { if (htmlBody != null && htmlBody.contains("author")) {
...@@ -160,8 +164,13 @@ public class ZhihuHotSearchCrawler { ...@@ -160,8 +164,13 @@ public class ZhihuHotSearchCrawler {
Map<String,String> Map = HeaderTool.getCommonHead(); Map<String,String> Map = HeaderTool.getCommonHead();
Map.put("cookie", "_xsrf=7NFWM5qBcOutfs8MaW7bhQQH65t3Xia4"); Map.put("cookie", "_xsrf=7NFWM5qBcOutfs8MaW7bhQQH65t3Xia4");
Request request = RequestUtils.wrapGet(url,Map); Request request = RequestUtils.wrapGet(url,Map);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
String htmlBody = response.body().string(); if (response.hasCause()){
Throwable cause = response.cause();
log.error("单条知乎热搜数据页面连接失败",cause);
return doc;
}else {
String htmlBody = response.bodyString();
if (htmlBody != null && htmlBody.contains("QuestionHeader")) { if (htmlBody != null && htmlBody.contains("QuestionHeader")) {
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
//获取标签 //获取标签
...@@ -182,9 +191,6 @@ public class ZhihuHotSearchCrawler { ...@@ -182,9 +191,6 @@ public class ZhihuHotSearchCrawler {
}else { }else {
return doc; return doc;
} }
} catch (Exception e) {
log.error("单条知乎热搜数据页面连接失败",e);
return doc;
} }
} }
......
...@@ -2,29 +2,21 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -2,29 +2,21 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.Response;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.Data;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.w3c.dom.Element;
import java.io.IOException;
import java.util.*; import java.util.*;
@Log4j2 @Log4j2
public class ZhihuTopicSearchCrawler { public class ZhihuTopicSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
public static List<HotSearchList> getZhihuTopicSearch(Date date){ public static List<HotSearchList> getZhihuTopicSearch(Date date){
List<HotSearchList> list = new ArrayList<>(); List<HotSearchList> list = new ArrayList<>();
...@@ -33,10 +25,12 @@ public class ZhihuTopicSearchCrawler { ...@@ -33,10 +25,12 @@ public class ZhihuTopicSearchCrawler {
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
for (int t = 0; t < 3 && jsonObject == null; t++) { for (int t = 0; t < 3 && jsonObject == null; t++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("知乎热搜页面连接异常", e); log.error("知乎热搜页面连接异常",cause);
}else {
htmlBody = response.bodyString();
} }
if (htmlBody != null) { if (htmlBody != null) {
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
......
package com.zhiwei.searchhotcrawler.run; package com.zhiwei.searchhotcrawler.run;
import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.http.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig; import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.config.ProxyConfig; import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.timer.*; import com.zhiwei.searchhotcrawler.timer.*;
......
package com.zhiwei.searchhotcrawler.test; package com.zhiwei.searchhotcrawler.test;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
...@@ -26,7 +27,8 @@ import java.util.*; ...@@ -26,7 +27,8 @@ import java.util.*;
@Log4j2 @Log4j2
public class HotSearch36KrCrawlerTest { public class HotSearch36KrCrawlerTest {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build(); //private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/** /**
* @return void 返回类型 * @return void 返回类型
...@@ -38,11 +40,12 @@ public class HotSearch36KrCrawlerTest { ...@@ -38,11 +40,12 @@ public class HotSearch36KrCrawlerTest {
String url = "https://www.36kr.com/hot-list/catalog"; String url = "https://www.36kr.com/hot-list/catalog";
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { if (response.hasCause()){
htmlBody = response.body().string(); Throwable cause = response.cause();
} catch (Exception e) { log.error("解析36Kr人气榜时出现解析错误,页面结构有问题", cause);
log.error("解析36Kr人气榜时出现解析错误,页面结构有问题", e); }else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("article-list")) { if (htmlBody != null && htmlBody.contains("article-list")) {
......
package com.zhiwei.searchhotcrawler.test; package com.zhiwei.searchhotcrawler.test;
import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.http.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig; import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.config.ProxyConfig; import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun; import com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun;
......
package com.zhiwei.searchhotcrawler.test; package com.zhiwei.searchhotcrawler.test;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.Response;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.http.HttpEntity; import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpGet;
...@@ -31,17 +31,20 @@ import java.util.*; ...@@ -31,17 +31,20 @@ import java.util.*;
*/ */
@Log4j2 @Log4j2
public class HuXiuHotSearchCrawlerTest { public class HuXiuHotSearchCrawlerTest {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build(); //private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
public static List<HotSearchList> HuXiuHotArticleRecommended(Date date){ public static List<HotSearchList> HuXiuHotArticleRecommended(Date date){
String url = "https://www.huxiu.com/"; String url = "https://www.huxiu.com/";
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (Exception e) { Throwable cause = response.cause();
log.error("解析虎嗅热文推荐时出现解析错误,页面结构有问题", e); log.error("解析虎嗅热文推荐时出现解析错误,页面结构有问题", cause);
}else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("hot__list")) { if (htmlBody != null && htmlBody.contains("hot__list")) {
return ansysData(htmlBody,date); return ansysData(htmlBody,date);
......
...@@ -4,10 +4,12 @@ package com.zhiwei.searchhotcrawler.test; ...@@ -4,10 +4,12 @@ package com.zhiwei.searchhotcrawler.test;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.mongodb.client.MongoDatabase; import com.mongodb.client.MongoDatabase;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyFactory; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.Response;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.proxy.ProxyFactory;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.proxy.config.SimpleConfig; import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
...@@ -17,7 +19,7 @@ import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBLocalTemplate; ...@@ -17,7 +19,7 @@ import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBLocalTemplate;
import com.zhiwei.searchhotcrawler.util.HttpClientUtils; import com.zhiwei.searchhotcrawler.util.HttpClientUtils;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.springframework.context.ApplicationContext; import org.springframework.context.ApplicationContext;
...@@ -38,7 +40,7 @@ public class Job51Test { ...@@ -38,7 +40,7 @@ public class Job51Test {
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build(); .group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
ProxyFactory.init(simpleConfig); ProxyFactory.init(simpleConfig);
HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
// MongoDatabase mongoDBLocal = MongoDBLocalTemplate.getDB(DBConfig.dbName); // MongoDatabase mongoDBLocal = MongoDBLocalTemplate.getDB(DBConfig.dbName);
List<HotSearchList> list = new ArrayList<>(); List<HotSearchList> list = new ArrayList<>();
...@@ -62,11 +64,12 @@ public class Job51Test { ...@@ -62,11 +64,12 @@ public class Job51Test {
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url,header); Request request = RequestUtils.wrapGet(url,header);
for (int t = 0; t < 1 && jsonObject == null; t++) { for (int t = 0; t < 1 && jsonObject == null; t++) {
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { if (response.hasCause()){
htmlBody = response.body().string(); Throwable cause = response.cause();
} catch (IOException e) { log.error("知乎热搜页面连接异常", cause);
log.error("知乎热搜页面连接异常", e); }else {
htmlBody = response.bodyString();
} }
if (htmlBody != null) { if (htmlBody != null) {
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
......
...@@ -3,14 +3,15 @@ package com.zhiwei.searchhotcrawler.test; ...@@ -3,14 +3,15 @@ package com.zhiwei.searchhotcrawler.test;
import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import java.time.Duration; import java.time.Duration;
import java.util.*; import java.util.*;
...@@ -23,7 +24,8 @@ import java.util.*; ...@@ -23,7 +24,8 @@ import java.util.*;
*/ */
@Log4j2 @Log4j2
public class KuaiShouHotSearchCrawlerTest { public class KuaiShouHotSearchCrawlerTest {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build(); //private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
/** /**
* @return void 返回类型 * @return void 返回类型
...@@ -35,11 +37,12 @@ public class KuaiShouHotSearchCrawlerTest { ...@@ -35,11 +37,12 @@ public class KuaiShouHotSearchCrawlerTest {
String url = "https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791"; String url = "https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791";
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
try(Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { if (response.hasCause()){
htmlBody = response.body().string(); Throwable cause = response.cause();
} catch (Exception e) { log.error("解析快手热榜时出现解析错误,页面结构有问题", cause);
log.error("解析快手热榜时出现解析错误,页面结构有问题", e); }else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("APOLLO_STATE")) { if (htmlBody != null && htmlBody.contains("APOLLO_STATE")) {
return ansysData(htmlBody,date); return ansysData(htmlBody,date);
......
...@@ -2,15 +2,17 @@ package com.zhiwei.searchhotcrawler.test; ...@@ -2,15 +2,17 @@ package com.zhiwei.searchhotcrawler.test;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.util.MD5Util; import com.zhiwei.searchhotcrawler.util.MD5Util;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response; ;
import java.time.Duration; import java.time.Duration;
import java.util.*; import java.util.*;
...@@ -23,7 +25,8 @@ import java.util.*; ...@@ -23,7 +25,8 @@ import java.util.*;
*/ */
@Log4j2 @Log4j2
public class TaoBaoHotSearchCrawlerTest { public class TaoBaoHotSearchCrawlerTest {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build(); //private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
public static List<HotSearchList> taoBaoHotSearch(Date date) { public static List<HotSearchList> taoBaoHotSearch(Date date) {
Map<String, String> headerMap = new HashMap<>(); Map<String, String> headerMap = new HashMap<>();
...@@ -34,8 +37,12 @@ public class TaoBaoHotSearchCrawlerTest { ...@@ -34,8 +37,12 @@ public class TaoBaoHotSearchCrawlerTest {
String urls = "https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t=" + time + "&sign=&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"; String urls = "https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t=" + time + "&sign=&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D";
Request request1 = RequestUtils.wrapGet(urls); Request request1 = RequestUtils.wrapGet(urls);
String token = null; String token = null;
try (Response response = httpBoot.syncCall(request1, ProxyHolder.NAT_HEAVY_PROXY)) { Response response1 = httpBoot.syncCall(request1, ProxySupplier.NAT_HEAVY_PROXY);
List<String> values = response.networkResponse().headers().values("Set-Cookie"); if (response1.hasCause()){
Throwable cause = response1.cause();
log.error("解析淘宝热搜时出现解析错误,页面结构有问题", cause);
}else {
List<String> values = response1.headers().values("Set-Cookie");
String tk = values.get(1); String tk = values.get(1);
String[] splitTk = tk.split(";"); String[] splitTk = tk.split(";");
String _m_h5_tk = splitTk[0]; String _m_h5_tk = splitTk[0];
...@@ -44,18 +51,18 @@ public class TaoBaoHotSearchCrawlerTest { ...@@ -44,18 +51,18 @@ public class TaoBaoHotSearchCrawlerTest {
String[] splitEnc = enc.split(";"); String[] splitEnc = enc.split(";");
String _m_h5_tk_enc = splitEnc[0]; String _m_h5_tk_enc = splitEnc[0];
headerMap.put("cookie", _m_h5_tk + ";" + _m_h5_tk_enc); headerMap.put("cookie", _m_h5_tk + ";" + _m_h5_tk_enc);
} catch (Exception e) {
log.error("解析淘宝热搜时出现解析错误,页面结构有问题", e);
} }
String signs = token + "&" + time + "&12574478&{\"appId\":\"10211\",\"params\":\"{\\\"multi_hintq_show\\\":\\\"on\\\",\\\"src\\\":\\\"c2c\\\",\\\"area\\\":\\\"active_page\\\",\\\"sversion\\\":\\\"7.5\\\",\\\"bangdan_src\\\":\\\"list\\\"}\"}"; String signs = token + "&" + time + "&12574478&{\"appId\":\"10211\",\"params\":\"{\\\"multi_hintq_show\\\":\\\"on\\\",\\\"src\\\":\\\"c2c\\\",\\\"area\\\":\\\"active_page\\\",\\\"sversion\\\":\\\"7.5\\\",\\\"bangdan_src\\\":\\\"list\\\"}\"}";
String sign = MD5Util.getMD5(signs).toLowerCase(); String sign = MD5Util.getMD5(signs).toLowerCase();
String url = "https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t=" + time + "&sign=" + sign + "&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"; String url = "https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t=" + time + "&sign=" + sign + "&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D";
Request request = RequestUtils.wrapGet(url, headerMap); Request request = RequestUtils.wrapGet(url, headerMap);
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
Throwable cause = response.cause();
log.error("解析淘宝热搜时出现解析错误,页面结构有问题", cause);
}else {
htmlBody = response.bodyString();
ht = !htmlBody.contains("非法请求"); ht = !htmlBody.contains("非法请求");
} catch (Exception e) {
log.error("解析淘宝热搜时出现解析错误,页面结构有问题", e);
} }
if (htmlBody != null && htmlBody.contains("data")) { if (htmlBody != null && htmlBody.contains("data")) {
return ansysData(htmlBody, date); return ansysData(htmlBody, date);
......
package com.zhiwei.searchhotcrawler.test; package com.zhiwei.searchhotcrawler.test;
import com.zhiwei.crawler.core.proxy.ProxyFactory; import com.zhiwei.http.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig; import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.config.ProxyConfig; import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun; import com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun;
import com.zhiwei.searchhotcrawler.timer.WeiboSuperTopicRun;
import com.zhiwei.searchhotcrawler.timer.WeiboTopicRun;
import java.text.ParseException; import java.text.ParseException;
...@@ -18,8 +20,10 @@ public class TaoBaoRunTest { ...@@ -18,8 +20,10 @@ public class TaoBaoRunTest {
//微博热搜开始采集 //微博热搜开始采集
// new WeiboHotSearchRun().start(); // new WeiboHotSearchRun().start();
//快手热榜开始采集 //快手热榜开始采集
// new KuaiShouHotSearchRun().start(); //new KuaiShouHotSearchRun().start();
//百度热搜 //百度热搜
// new TaoBaoHotSearchRun().run(); //new TaoBaoHotSearchRun().run();
//超话测试
//new WeiboSuperTopicRun().run();
} }
} }
...@@ -2,14 +2,16 @@ package com.zhiwei.searchhotcrawler.test; ...@@ -2,14 +2,16 @@ package com.zhiwei.searchhotcrawler.test;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.*;
...@@ -24,7 +26,8 @@ import java.util.*; ...@@ -24,7 +26,8 @@ import java.util.*;
@Log4j2 @Log4j2
public class WeiboEntertainmentCrawlerTest { public class WeiboEntertainmentCrawlerTest {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); //private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/** /**
...@@ -38,10 +41,12 @@ public class WeiboEntertainmentCrawlerTest { ...@@ -38,10 +41,12 @@ public class WeiboEntertainmentCrawlerTest {
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
for (int count = 0; count <= 5; count++) { for (int count = 0; count <= 5; count++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("解析微博娱乐榜时出现连接失败", e); log.error("解析微博娱乐榜时出现连接失败", cause);
}else {
htmlBody = response.bodyString();
} }
List<HotSearchList> result = new ArrayList(); List<HotSearchList> result = new ArrayList();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) { if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
......
...@@ -43,7 +43,7 @@ public class WeiboSuperTopicRun extends Thread{ ...@@ -43,7 +43,7 @@ public class WeiboSuperTopicRun extends Thread{
doc.put("_id", topic.getId()); doc.put("_id", topic.getId());
doc.put("name", topic.getTopicName()); doc.put("name", topic.getTopicName());
doc.put("rank", topic.getRank()); doc.put("rank", topic.getRank());
doc.put("score_num", topic.getScore()); doc.put("read_Num", topic.getReadNum());
doc.put("fensi_num", topic.getFensi()); doc.put("fensi_num", topic.getFensi());
doc.put("post_num", topic.getPostNum()); doc.put("post_num", topic.getPostNum());
doc.put("type", topic.getType()); doc.put("type", topic.getType());
...@@ -53,7 +53,7 @@ public class WeiboSuperTopicRun extends Thread{ ...@@ -53,7 +53,7 @@ public class WeiboSuperTopicRun extends Thread{
data.add(doc); data.add(doc);
} }
weiboTopicDAO.addTopicList(data); weiboTopicDAO.addTopicList(data);
log.info("微博话题采集结束........"); log.info("微博超话采集结束........");
} }
} }
...@@ -309,7 +309,7 @@ public class GatherTimer { ...@@ -309,7 +309,7 @@ public class GatherTimer {
* 腾讯较真辟谣榜采集 * 腾讯较真辟谣榜采集
*/ */
@Async(value = "myScheduler") @Async(value = "myScheduler")
@Scheduled(cron = "10 * * * * ? ") //@Scheduled(cron = "10 * * * * ? ")
public void crawlerTengXunVerificationHotSearch(){ public void crawlerTengXunVerificationHotSearch(){
log.info("{},腾讯较真辟谣榜开始采集", new Date()); log.info("{},腾讯较真辟谣榜开始采集", new Date());
Date date = DateUtils.getMillSecondTime(new Date()); Date date = DateUtils.getMillSecondTime(new Date());
...@@ -371,7 +371,7 @@ public class GatherTimer { ...@@ -371,7 +371,7 @@ public class GatherTimer {
* 知乎热搜国际分类采集 * 知乎热搜国际分类采集
*/ */
@Async(value = "myScheduler") @Async(value = "myScheduler")
@Scheduled(cron = "20 * * * * ? ") //@Scheduled(cron = "20 * * * * ? ")
public void crawlerZhiHuFocus(){ public void crawlerZhiHuFocus(){
this.crawlerZhiHuChild(FOCUS); this.crawlerZhiHuChild(FOCUS);
} }
...@@ -380,7 +380,7 @@ public class GatherTimer { ...@@ -380,7 +380,7 @@ public class GatherTimer {
* 知乎热搜时事分类采集 * 知乎热搜时事分类采集
*/ */
@Async(value = "myScheduler") @Async(value = "myScheduler")
@Scheduled(cron = "20 * * * * ? ") //@Scheduled(cron = "20 * * * * ? ")
public void crawlerZhiHuDepth(){ public void crawlerZhiHuDepth(){
this.crawlerZhiHuChild(DEPTH); this.crawlerZhiHuChild(DEPTH);
} }
...@@ -442,7 +442,7 @@ public class GatherTimer { ...@@ -442,7 +442,7 @@ public class GatherTimer {
doc.put("_id", topic.getId()); doc.put("_id", topic.getId());
doc.put("name", topic.getTopicName()); doc.put("name", topic.getTopicName());
doc.put("rank", topic.getRank()); doc.put("rank", topic.getRank());
doc.put("score_num", topic.getScore()); doc.put("read_Num", topic.getReadNum());
doc.put("fensi_num", topic.getFensi()); doc.put("fensi_num", topic.getFensi());
doc.put("post_num", topic.getPostNum()); doc.put("post_num", topic.getPostNum());
doc.put("type", topic.getType()); doc.put("type", topic.getType());
...@@ -452,7 +452,7 @@ public class GatherTimer { ...@@ -452,7 +452,7 @@ public class GatherTimer {
data.add(doc); data.add(doc);
} }
weiboTopicDAO.addTopicList(data); weiboTopicDAO.addTopicList(data);
log.info("微博话题采集结束........"); log.info("微博超话采集结束........");
} }
......
package com.zhiwei.searchhotcrawler.util; package com.zhiwei.searchhotcrawler.util;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.boot.Response;
import com.zhiwei.http.util.RequestUtils;
import okhttp3.MediaType; import okhttp3.MediaType;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.RequestBody; import okhttp3.RequestBody;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
...@@ -24,7 +24,8 @@ public final class HttpClientUtils { ...@@ -24,7 +24,8 @@ public final class HttpClientUtils {
private static final String QUERY_PARAM_SEP = "&"; private static final String QUERY_PARAM_SEP = "&";
private static final String URL_QUERY_PARAM_SEPARATOR = "?"; private static final String URL_QUERY_PARAM_SEPARATOR = "?";
private static final HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(2).build(); //private static final HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(2).build();
private static final HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
public static String sendPost(String url, String jsonParam){ public static String sendPost(String url, String jsonParam){
return sendPost(url, jsonParam, null, Charset.forName("UTF-8")); return sendPost(url, jsonParam, null, Charset.forName("UTF-8"));
} }
...@@ -39,11 +40,12 @@ public final class HttpClientUtils { ...@@ -39,11 +40,12 @@ public final class HttpClientUtils {
String result = null; String result = null;
Request request= RequestUtils.wrapPost(url, headers, RequestBody.create(MediaType.get("application/json"), Request request= RequestUtils.wrapPost(url, headers, RequestBody.create(MediaType.get("application/json"),
jsonParam)); jsonParam));
Response response = httpBoot.syncCall(request);
try(Response response = httpBoot.syncCall(request)) { if (response.hasCause()){
result = response.body().string(); Throwable cause = response.cause();
}catch (IOException e) { LOGGER.error("http connection error :" + cause.getMessage(), cause);
LOGGER.error("http connection error :" + e.getMessage(), e); }else {
result = response.bodyString();
} }
return result; return result;
} }
......
...@@ -4,18 +4,15 @@ import java.io.IOException; ...@@ -4,18 +4,15 @@ import java.io.IOException;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.http.boot.Response;
import com.zhiwei.http.util.RequestUtils;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.tools.httpclient.HeaderTool; import com.zhiwei.tools.httpclient.HeaderTool;
import okhttp3.MediaType; import okhttp3.MediaType;
...@@ -23,7 +20,8 @@ import okhttp3.RequestBody; ...@@ -23,7 +20,8 @@ import okhttp3.RequestBody;
public class WechatCodeUtil { public class WechatCodeUtil {
private static Logger log = LogManager.getLogger(WechatCodeUtil.class); private static Logger log = LogManager.getLogger(WechatCodeUtil.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); //private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/** /**
* @Title: getToken * @Title: getToken
* @author hero * @author hero
...@@ -40,12 +38,13 @@ public class WechatCodeUtil { ...@@ -40,12 +38,13 @@ public class WechatCodeUtil {
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
Request request = RequestUtils.wrapGet(url, headerMap); Request request = RequestUtils.wrapGet(url, headerMap);
String result = null; String result = null;
try(Response response = httpBoot.syncCall(request)) { Response response = httpBoot.syncCall(request);
result = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
e.printStackTrace(); log.error("获取微信公众号推送token失败,问题为:::{}", cause.fillInStackTrace());
log.error("获取微信公众号推送token失败,问题为:::{}", e.fillInStackTrace());
return null; return null;
}else {
result = response.bodyString();
} }
if (result != null) { if (result != null) {
JSONObject jsonObject = JSONObject.parseObject(result); JSONObject jsonObject = JSONObject.parseObject(result);
...@@ -73,11 +72,13 @@ public class WechatCodeUtil { ...@@ -73,11 +72,13 @@ public class WechatCodeUtil {
RequestBody requestBody = RequestBody.create(MediaType.get("application/json"), templateJson.toJSONString()); RequestBody requestBody = RequestBody.create(MediaType.get("application/json"), templateJson.toJSONString());
Request request = RequestUtils.wrapPost(url,requestBody); Request request = RequestUtils.wrapPost(url,requestBody);
String htmlBody = null; String htmlBody = null;
try(Response response = httpBoot.syncCall(request)) { Response response = httpBoot.syncCall(request);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (Exception e) { Throwable cause = response.cause();
log.error("消息推送失败,错误为::{}",e.fillInStackTrace()); log.error("消息推送失败,错误为::{}",cause.fillInStackTrace());
msgid = 0; msgid = 0;
}else {
htmlBody = response.bodyString();
} }
if (StringUtils.isNotBlank(htmlBody)) { if (StringUtils.isNotBlank(htmlBody)) {
JSONObject jsonObject = JSONObject.parseObject(htmlBody); JSONObject jsonObject = JSONObject.parseObject(htmlBody);
...@@ -115,11 +116,13 @@ public class WechatCodeUtil { ...@@ -115,11 +116,13 @@ public class WechatCodeUtil {
RequestBody requestBody = RequestBody.create(MediaType.get("application/json"), postData.toJSONString()); RequestBody requestBody = RequestBody.create(MediaType.get("application/json"), postData.toJSONString());
Request request = RequestUtils.wrapPost(url,requestBody); Request request = RequestUtils.wrapPost(url,requestBody);
String htmlBody = null; String htmlBody = null;
try(Response response = httpBoot.syncCall(request)) { Response response = httpBoot.syncCall(request);
htmlBody = response.body().string(); if (response.hasCause()){
}catch (IOException e){ Throwable cause = response.cause();
log.error("页面连接获取失败",e); log.error("页面连接获取失败",cause);
return null; return null;
}else {
htmlBody = response.bodyString();
} }
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) { if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) {
JSONObject jsonObject = JSONObject.parseObject(htmlBody); JSONObject jsonObject = JSONObject.parseObject(htmlBody);
...@@ -154,11 +157,13 @@ public class WechatCodeUtil { ...@@ -154,11 +157,13 @@ public class WechatCodeUtil {
RequestBody requestBody = RequestBody.create(MediaType.get("application/json"), postData.toJSONString()); RequestBody requestBody = RequestBody.create(MediaType.get("application/json"), postData.toJSONString());
Request request = RequestUtils.wrapPost(url,requestBody); Request request = RequestUtils.wrapPost(url,requestBody);
String htmlBody = null; String htmlBody = null;
try(Response response = httpBoot.syncCall(request)){ Response response = httpBoot.syncCall(request);
htmlBody = response.body().string(); if (response.hasCause()){
}catch (IOException e){ Throwable cause = response.cause();
log.error("页面链接获取失败",e); log.error("页面链接获取失败",cause);
return null; return null;
}else {
htmlBody = response.bodyString();
} }
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) { if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) {
JSONObject jsonObject = JSONObject.parseObject(htmlBody); JSONObject jsonObject = JSONObject.parseObject(htmlBody);
...@@ -197,11 +202,13 @@ public class WechatCodeUtil { ...@@ -197,11 +202,13 @@ public class WechatCodeUtil {
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
Request request = RequestUtils.wrapGet(url, headerMap); Request request = RequestUtils.wrapGet(url, headerMap);
String htmlBody = null; String htmlBody = null;
try(Response response = httpBoot.syncCall(request)) { Response response = httpBoot.syncCall(request);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("获取分组id时出现错误",e.fillInStackTrace()); log.error("获取分组id时出现错误",cause.fillInStackTrace());
return null; return null;
}else {
htmlBody = response.bodyString();
} }
if (htmlBody != null) { if (htmlBody != null) {
if (htmlBody.contains("tags")) { if (htmlBody.contains("tags")) {
...@@ -230,11 +237,13 @@ public class WechatCodeUtil { ...@@ -230,11 +237,13 @@ public class WechatCodeUtil {
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
Request request = RequestUtils.wrapGet(url, headerMap); Request request = RequestUtils.wrapGet(url, headerMap);
String htmlBody = null; String htmlBody = null;
try(Response response = httpBoot.syncCall(request)) { Response response = httpBoot.syncCall(request);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("获取分组id时出现错误",e.fillInStackTrace()); log.error("获取分组id时出现错误",cause.fillInStackTrace());
return null; return null;
}else {
htmlBody = response.bodyString();
} }
if (htmlBody != null) { if (htmlBody != null) {
if (htmlBody.contains("tags")) { if (htmlBody.contains("tags")) {
......
...@@ -2,10 +2,12 @@ package hotSaerchTest; ...@@ -2,10 +2,12 @@ package hotSaerchTest;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoCollection;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyFactory; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.Response;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.proxy.ProxyFactory;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.proxy.config.SimpleConfig; import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.config.DBConfig; import com.zhiwei.searchhotcrawler.config.DBConfig;
...@@ -14,11 +16,11 @@ import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler; ...@@ -14,11 +16,11 @@ import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate; import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.searchhotcrawler.test.KuaiShouHotSearchCrawlerTest; import com.zhiwei.searchhotcrawler.test.KuaiShouHotSearchCrawlerTest;
import com.zhiwei.searchhotcrawler.test.TaoBaoHotSearchCrawlerTest; import com.zhiwei.searchhotcrawler.test.TaoBaoHotSearchCrawlerTest;
import com.zhiwei.searchhotcrawler.util.TaoBaoUtils; import com.zhiwei.searchhotcrawler.util.QYWechatUtil;
import com.zhiwei.searchhotcrawler.util.TipsUtils; import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.searchhotcrawler.util.WechatCodeUtil;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.bson.Document; import org.bson.Document;
import org.junit.Test; import org.junit.Test;
import org.junit.runner.RunWith; import org.junit.runner.RunWith;
...@@ -29,7 +31,6 @@ import java.io.IOException; ...@@ -29,7 +31,6 @@ import java.io.IOException;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import static com.ibm.icu.util.LocalePriorityList.add;
import static java.util.Objects.nonNull; import static java.util.Objects.nonNull;
/** /**
...@@ -42,7 +43,7 @@ import static java.util.Objects.nonNull; ...@@ -42,7 +43,7 @@ import static java.util.Objects.nonNull;
{"classpath:applicationContext.xml"}) {"classpath:applicationContext.xml"})
public class HotSearchTest { public class HotSearchTest {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/** /**
* 测试快手热榜采集 * 测试快手热榜采集
...@@ -71,10 +72,12 @@ public class HotSearchTest { ...@@ -71,10 +72,12 @@ public class HotSearchTest {
String url = "https://m.weibo.cn/api/container/getIndex?containerid=231522type%3D1%26q%3D%23可口可乐回应C罗拒绝与可乐同框%23"; String url = "https://m.weibo.cn/api/container/getIndex?containerid=231522type%3D1%26q%3D%23可口可乐回应C罗拒绝与可乐同框%23";
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("解析微博热搜详情页面时出现连接失败", e); log.error("解析微博热搜详情页面时出现连接失败", cause);
}else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("data")) { if (htmlBody != null && htmlBody.contains("data")) {
JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("cardlistInfo"); JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONObject("cardlistInfo");
...@@ -167,8 +170,18 @@ public class HotSearchTest { ...@@ -167,8 +170,18 @@ public class HotSearchTest {
long time = new Date().getTime(); long time = new Date().getTime();
String signs="undefined&1625624820156&12574478&{\"appId\":\"10211\",\"params\":\"{\\\"multi_hintq_show\\\":\\\"on\\\",\\\"src\\\":\\\"c2c\\\",\\\"area\\\":\\\"active_page\\\",\\\"sversion\\\":\\\"7.5\\\",\\\"bangdan_src\\\":\\\"list\\\"}\"}"; String signs="undefined&1625624820156&12574478&{\"appId\":\"10211\",\"params\":\"{\\\"multi_hintq_show\\\":\\\"on\\\",\\\"src\\\":\\\"c2c\\\",\\\"area\\\":\\\"active_page\\\",\\\"sversion\\\":\\\"7.5\\\",\\\"bangdan_src\\\":\\\"list\\\"}\"}";
// https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t=1624930984092&sign=acf994dbcee6c0c1d7a8a566a6b8ff0a&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D // https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t=1624930984092&sign=acf994dbcee6c0c1d7a8a566a6b8ff0a&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D
String s = TaoBaoUtils.parsJSFunction(signs); // String s = TaoBaoUtils.parsJSFunction(signs);
System.out.println(s); // System.out.println(s);
}
private static String key = "a8e26ce3-8aaa-4d3e-bcf6-30b81526050b";
/**
* 测试预警发送
*/
@Test
public void testWarn(){
QYWechatUtil.send(key, QYWechatUtil.MSGTYPE_TEXT, "你好",
null, null);
} }
} }
......
...@@ -3,10 +3,12 @@ package weiboTest; ...@@ -3,10 +3,12 @@ package weiboTest;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyFactory; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.Response;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.proxy.ProxyFactory;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.proxy.config.SimpleConfig; import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
...@@ -19,7 +21,6 @@ import com.zhiwei.searchhotcrawler.util.TipsUtils; ...@@ -19,7 +21,6 @@ import com.zhiwei.searchhotcrawler.util.TipsUtils;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
...@@ -50,7 +51,7 @@ public class WeiboHotSearchTest { ...@@ -50,7 +51,7 @@ public class WeiboHotSearchTest {
static WeiBoMassageDao weiBoMassageDao = new WeiBoMassageDao(); static WeiBoMassageDao weiBoMassageDao = new WeiBoMassageDao();
//调用weiBoUserDao添加数据 //调用weiBoUserDao添加数据
static WeiBoUserDao weiBoUserDao = new WeiBoUserDao(); static WeiBoUserDao weiBoUserDao = new WeiBoUserDao();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
@Test @Test
public void test() { public void test() {
...@@ -122,10 +123,12 @@ public class WeiboHotSearchTest { ...@@ -122,10 +123,12 @@ public class WeiboHotSearchTest {
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
for (int count = 0; count <= 2; count++) { for (int count = 0; count <= 2; count++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("解析微博热搜详情页面时出现连接失败", e); log.error("解析微博热搜详情页面时出现连接失败", cause);
}else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("data")) { if (htmlBody != null && htmlBody.contains("data")) {
JSONObject dataJson = JSONObject.parseObject(htmlBody).getJSONObject("data"); JSONObject dataJson = JSONObject.parseObject(htmlBody).getJSONObject("data");
...@@ -213,10 +216,12 @@ public class WeiboHotSearchTest { ...@@ -213,10 +216,12 @@ public class WeiboHotSearchTest {
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url); Request request = RequestUtils.wrapGet(url);
for (int count = 0; count <= 2; count++) { for (int count = 0; count <= 2; count++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("解析微博热搜详情页面时出现连接失败", e); log.error("解析微博热搜详情页面时出现连接失败", cause);
}else {
htmlBody = response.bodyString();
} }
if (htmlBody != null && htmlBody.contains("data")) { if (htmlBody != null && htmlBody.contains("data")) {
JSONObject dataJson = JSONObject.parseObject(htmlBody).getJSONObject("data"); JSONObject dataJson = JSONObject.parseObject(htmlBody).getJSONObject("data");
...@@ -527,10 +532,12 @@ public class WeiboHotSearchTest { ...@@ -527,10 +532,12 @@ public class WeiboHotSearchTest {
String htmlBody = null; String htmlBody = null;
Request request = RequestUtils.wrapGet(url, headerMap); Request request = RequestUtils.wrapGet(url, headerMap);
for (int count = 0; count <= 5; count++) { for (int count = 0; count <= 5; count++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
htmlBody = response.body().string(); if (response.hasCause()){
} catch (IOException e) { Throwable cause = response.cause();
log.error("解析微博时热搜时出现连接失败", e); log.error("解析微博时热搜时出现连接失败", cause);
}else {
htmlBody = response.bodyString();
} }
List<HotSearchList> result = new ArrayList<HotSearchList>(); List<HotSearchList> result = new ArrayList<HotSearchList>();
if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) { if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("cards")) {
......
package weiboTest; package weiboTest;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.http.boot.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.http.boot.Response;
import com.zhiwei.http.proxy.ProxySupplier;
import com.zhiwei.http.util.RequestUtils;
import com.zhiwei.searchhotcrawler.util.AESUtils; import com.zhiwei.searchhotcrawler.util.AESUtils;
import com.zhiwei.searchhotcrawler.util.HttpClientUtils; import com.zhiwei.searchhotcrawler.util.HttpClientUtils;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.codec.DecoderException; import org.apache.commons.codec.DecoderException;
import org.apache.commons.codec.binary.Hex; import org.apache.commons.codec.binary.Hex;
import org.junit.Test; import org.junit.Test;
...@@ -19,6 +21,7 @@ import javax.crypto.spec.SecretKeySpec; ...@@ -19,6 +21,7 @@ import javax.crypto.spec.SecretKeySpec;
import java.beans.Encoder; import java.beans.Encoder;
import java.io.IOException; import java.io.IOException;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLDecoder; import java.net.URLDecoder;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.nio.charset.Charset; import java.nio.charset.Charset;
...@@ -33,7 +36,7 @@ import java.util.Map; ...@@ -33,7 +36,7 @@ import java.util.Map;
*/ */
public class WeiboTopInfoTest { public class WeiboTopInfoTest {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
/** /**
* 加密测试 * 加密测试
...@@ -106,10 +109,11 @@ public class WeiboTopInfoTest { ...@@ -106,10 +109,11 @@ public class WeiboTopInfoTest {
System.out.println(url); System.out.println(url);
Request request = RequestUtils.wrapGet(url,getHeaderMap()); Request request = RequestUtils.wrapGet(url,getHeaderMap());
//测试使用空代理 //测试使用空代理
try (Response response = httpBoot.syncCall(request, ProxyHolder.NONE_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NONE_PROXY);
System.out.println(response.body().string()); if (response.hasCause()){
} catch (IOException e) { response.cause().printStackTrace();
e.printStackTrace(); }else {
System.out.println(response.bodyString());
} }
} }
...@@ -135,8 +139,11 @@ public class WeiboTopInfoTest { ...@@ -135,8 +139,11 @@ public class WeiboTopInfoTest {
System.out.println(url); System.out.println(url);
Request request = RequestUtils.wrapGet(url,getHeaderMap()); Request request = RequestUtils.wrapGet(url,getHeaderMap());
//测试使用空代理 //测试使用空代理
try (Response response = httpBoot.syncCall(request, ProxyHolder.NONE_PROXY)) { Response response = httpBoot.syncCall(request, ProxySupplier.NONE_PROXY);
String result = response.body().string(); if (response.hasCause()){
response.cause().printStackTrace();
}else {
String result = response.bodyString();
//结果解密 //结果解密
String decodeResult = decodeStr(key,result); String decodeResult = decodeStr(key,result);
System.out.println(decodeResult); System.out.println(decodeResult);
...@@ -144,11 +151,7 @@ public class WeiboTopInfoTest { ...@@ -144,11 +151,7 @@ public class WeiboTopInfoTest {
JSONArray jsonArray = JSONArray.parseArray(decodeResult); JSONArray jsonArray = JSONArray.parseArray(decodeResult);
for (Object o : jsonArray) { for (Object o : jsonArray) {
System.out.println(o); System.out.println(o);
} }
} catch (IOException e) {
e.printStackTrace();
} }
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment