Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
3a0f95c0
Commit
3a0f95c0
authored
Jan 18, 2022
by
leiliangliang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
升级核心包Http-boot
parent
672054e6
Hide whitespace changes
Inline
Side-by-side
Showing
44 changed files
with
700 additions
and
537 deletions
+700
-537
dependency-reduced-pom.xml
+21
-22
pom.xml
+10
-4
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboSuperTopic.java
+3
-4
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
+12
-10
src/main/java/com/zhiwei/searchhotcrawler/crawler/BiliComprehensiveHotCrawler.java
+12
-10
src/main/java/com/zhiwei/searchhotcrawler/crawler/BililiCrawler.java
+34
-24
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
+17
-13
src/main/java/com/zhiwei/searchhotcrawler/crawler/FengHuangSearchCrawler.java
+18
-13
src/main/java/com/zhiwei/searchhotcrawler/crawler/HotSearch36KrCrawler.java
+13
-12
src/main/java/com/zhiwei/searchhotcrawler/crawler/HuXiuHotSearchCrawler.java
+13
-9
src/main/java/com/zhiwei/searchhotcrawler/crawler/KuaiShouHotSearchCrawler.java
+13
-10
src/main/java/com/zhiwei/searchhotcrawler/crawler/MaiMaiHotSearchCrawler.java
+13
-9
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
+17
-13
src/main/java/com/zhiwei/searchhotcrawler/crawler/SouhuTopicCrawler.java
+11
-10
src/main/java/com/zhiwei/searchhotcrawler/crawler/TaoBaoHotSearchCrawler.java
+18
-13
src/main/java/com/zhiwei/searchhotcrawler/crawler/TengXunCrawler.java
+17
-13
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
+25
-17
src/main/java/com/zhiwei/searchhotcrawler/crawler/WangYiHotSearchCrawler.java
+19
-13
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiBoSearchBoxHotWordsCrawler.java
+12
-10
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboEntertainmentCrawler.java
+11
-10
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+33
-24
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboNewsCrawler.java
+25
-17
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboSuperTopicCrawler.java
+26
-24
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
+11
-9
src/main/java/com/zhiwei/searchhotcrawler/crawler/XinLangHotSearchCrawler.java
+18
-13
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuChildHotSearchCrawler.java
+12
-9
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
+27
-21
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuTopicSearchCrawler.java
+11
-17
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearch36KrCrawlerTest.java
+14
-11
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchRunTest.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/test/HuXiuHotSearchCrawlerTest.java
+12
-9
src/main/java/com/zhiwei/searchhotcrawler/test/Job51Test.java
+14
-11
src/main/java/com/zhiwei/searchhotcrawler/test/KuaiShouHotSearchCrawlerTest.java
+13
-10
src/main/java/com/zhiwei/searchhotcrawler/test/TaoBaoHotSearchCrawlerTest.java
+20
-13
src/main/java/com/zhiwei/searchhotcrawler/test/TaoBaoRunTest.java
+7
-3
src/main/java/com/zhiwei/searchhotcrawler/test/WeiboEntertainmentCrawlerTest.java
+14
-9
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboSuperTopicRun.java
+2
-2
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+5
-5
src/main/java/com/zhiwei/searchhotcrawler/util/HttpClientUtils.java
+12
-10
src/main/java/com/zhiwei/searchhotcrawler/util/WechatCodeUtil.java
+41
-32
src/test/java/hotSaerchTest/HotSearchTest.java
+27
-14
src/test/java/weiboTest/WeiboHotSearchTest.java
+25
-18
src/test/java/weiboTest/WeiboTopInfoTest.java
+18
-15
No files found.
dependency-reduced-pom.xml
View file @
3a0f95c0
...
...
@@ -6,7 +6,7 @@
<name>
各平台热搜榜单采集程序
</name>
<version>
0.0.6-SNAPSHOT
</version>
<description>
各平台热搜榜单采集程序
目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序
</description>
目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序
</description>
<developers>
<developer>
<id>
Bewilder
</id>
...
...
@@ -38,6 +38,15 @@
</filters>
<transformers>
<transformer>
<resource>
META-INF/spring.handlers
</resource>
</transformer>
<transformer>
<resource>
META-INF/spring.schemas
</resource>
</transformer>
<transformer>
<resource>
META-INF/spring.tooling
</resource>
</transformer>
<transformer>
<mainClass>
com.zhiwei.searchhotcrawler.run.HotSearchRun
</mainClass>
</transformer>
</transformers>
...
...
@@ -73,32 +82,22 @@
</build>
<dependencies>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.6.7.2-RELEASE
</version>
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
<version>
4.12
</version>
<scope>
test
</scope>
<exclusions>
<exclusion>
<artifactId>
hamcrest-core
</artifactId>
<groupId>
org.hamcrest
</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
<version>
4.13
</version>
<scope>
test
</scope>
</dependency>
<dependency>
<groupId>
org.projectlombok
</groupId>
<artifactId>
lombok
</artifactId>
<version>
1.18.20
</version>
<scope>
test
</scope>
</dependency>
<dependency>
<groupId>
org.springframework
</groupId>
<artifactId>
spring-test
</artifactId>
<version>
5.3.6
</version>
<scope>
test
</scope>
</dependency>
</dependencies>
<properties>
<project.reporting.outputEncoding>
UTF-8
</project.reporting.outputEncoding>
<spring.version>
4.2.2.RELEASE
</spring.version>
<log4j.version>
2.15.0
</log4j.version>
<project.build.sourceEncoding>
UTF-8
</project.build.sourceEncoding>
</properties>
</project>
...
...
pom.xml
View file @
3a0f95c0
...
...
@@ -43,10 +43,16 @@
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.1.6-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.6.7.4-SNAPSHOT
</version>
<!--<dependency>-->
<!--<groupId>com.zhiwei.crawler</groupId>-->
<!--<artifactId>crawler-core</artifactId>-->
<!--<version>0.6.7.4-SNAPSHOT</version>-->
<!--</dependency>-->
<!-- http知微核心包 -->
<dependency>
<groupId>
com.zhiwei.http
</groupId>
<artifactId>
http-boot
</artifactId>
<version>
0.0.5.9-SNAPSHOT
</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.conscrypt/conscrypt-openjdk-uber -->
<dependency>
...
...
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboSuperTopic.java
View file @
3a0f95c0
...
...
@@ -73,18 +73,17 @@ public class WeiboSuperTopic {
public
WeiboSuperTopic
()
{}
public
WeiboSuperTopic
(
String
url
,
String
topicName
,
Integer
rank
,
String
score
,
public
WeiboSuperTopic
(
String
url
,
String
topicName
,
Integer
rank
,
String
postNum
,
String
fensi
,
String
type
)
{
this
.
url
=
url
;
this
.
topicName
=
topicName
;
this
.
rank
=
rank
;
this
.
score
=
score
;
this
.
postNum
=
postNum
;
this
.
fensi
=
fensi
;
this
.
type
=
type
;
this
.
time
=
new
Date
();
this
.
day
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
this
.
id
=
topicName
+
"_"
+
type
+
"_"
+
day
;
this
.
id
=
topicName
+
"_"
+
type
+
"_"
+
time
.
getTime
()
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
View file @
3a0f95c0
...
...
@@ -4,18 +4,17 @@ import java.net.URLDecoder;
import
java.time.Duration
;
import
java.util.*
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
...
...
@@ -28,7 +27,8 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
@Log4j2
public
class
BaiDuHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
//private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
* @return void 返回类型
...
...
@@ -40,10 +40,12 @@ public class BaiDuHotSearchCrawler {
String
url
=
"http://top.baidu.com/buzz?b=1&fr=topindex"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"container-bg_lQ801"
))
{
return
ansysNewData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/BiliComprehensiveHotCrawler.java
View file @
3a0f95c0
...
...
@@ -2,17 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
import
java.text.DateFormat
;
import
java.text.SimpleDateFormat
;
import
java.util.*
;
...
...
@@ -27,7 +26,8 @@ import java.util.*;
@Log4j2
public
class
BiliComprehensiveHotCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
//private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
build
();
/**
...
...
@@ -53,10 +53,12 @@ public class BiliComprehensiveHotCrawler {
for
(
int
i
=
0
;
i
<
urlList
.
size
();
i
++)
{
Request
request
=
RequestUtils
.
wrapGet
(
urlList
.
get
(
i
));
//发送请求每次获取20条数据
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
fmt
.
format
(
date
)+
":第"
+
i
+
1
+
"次请求解析B站综合热门时出现连接失败"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
fmt
.
format
(
date
)+
":第"
+
i
+
1
+
"次请求解析B站综合热门时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
try
{
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/BililiCrawler.java
View file @
3a0f95c0
...
...
@@ -2,16 +2,17 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -20,12 +21,13 @@ import java.io.IOException;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Objects
;
import
java.util.concurrent.ExecutorService
;
import
java.util.concurrent.Executors
;
@Log4j2
public
class
BililiCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
* B站排行榜的采集
...
...
@@ -41,10 +43,12 @@ public class BililiCrawler {
String
url
=
"https://api.bilibili.com/x/web-interface/ranking/v2?rid=0&type=all"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"B站排行榜页面连接失败"
,
e
.
fillInStackTrace
());
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"B站排行榜页面连接失败"
,
cause
.
fillInStackTrace
());
}
else
{
htmlBody
=
response
.
bodyString
();
}
try
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
...
...
@@ -129,8 +133,8 @@ public class BililiCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
{
System
.
setProperty
(
"https.protocols"
,
"TLSv1,TLSv1.1,TLSv1.2,SSLv3"
);
Response
response
=
httpBoot
.
syncCall
(
request
,
Proxy
Holder
.
NAT_HEAVY
_PROXY
);
String
htmlBody
=
response
.
body
().
s
tring
();
Response
response
=
httpBoot
.
syncCall
(
request
,
Proxy
Supplier
.
FOREIGN_INNER
_PROXY
);
String
htmlBody
=
response
.
body
S
tring
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"v-wrap"
))
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
//获取标签
...
...
@@ -139,16 +143,20 @@ public class BililiCrawler {
hotSearchList
.
setTag
(
tag
);
//获取粉丝数
if
(
htmlBody
.
contains
(
"v_upinfo"
))
{
String
text
=
document
.
select
(
"div.follow-btn"
).
select
(
"span"
).
text
();
String
fan
=
text
.
split
(
" "
)[
2
];
Long
fanCount
=
null
;
if
(
fan
.
contains
(
"万"
)){
double
dou
=
Double
.
parseDouble
(
fan
.
replaceAll
(
"万"
,
" "
));
fanCount
=
new
Double
(
dou
*
10000
).
longValue
();
}
else
{
fanCount
=
Long
.
valueOf
(
fan
);
String
text
=
document
.
select
(
"div.follow-btn"
).
select
(
"span"
).
last
().
text
();
if
(
StringUtils
.
isNotEmpty
(
text
)&&
Objects
.
nonNull
(
text
))
{
Long
fanCount
=
null
;
if
(
text
.
contains
(
"关注"
)){
text
=
text
.
replaceAll
(
"关注"
,
" "
).
trim
();
}
if
(
text
.
contains
(
"万"
))
{
double
dou
=
Double
.
parseDouble
(
text
.
replaceAll
(
"万"
,
" "
).
trim
());
fanCount
=
new
Double
(
dou
*
10000
).
longValue
();
}
else
{
fanCount
=
Long
.
valueOf
(
text
);
}
hotSearchList
.
setFans
(
fanCount
);
}
hotSearchList
.
setFans
(
fanCount
);
}
return
hotSearchList
;
}
else
{
...
...
@@ -173,10 +181,12 @@ public class BililiCrawler {
String
url
=
"https://app.biliapi.com/x/v2/search/square?build=616050&limit=10"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"B站热搜页面连接失败"
,
e
.
fillInStackTrace
());
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"B站热搜页面连接失败"
,
cause
.
fillInStackTrace
());
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
dataJson
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"data"
).
getJSONObject
(
0
).
getJSONObject
(
"data"
).
getJSONArray
(
"list"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
View file @
3a0f95c0
...
...
@@ -5,17 +5,17 @@ import java.util.ArrayList;
import
java.util.Date
;
import
java.util.List
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
...
...
@@ -30,7 +30,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
@Log4j2
public
class
DouyinHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
build
();
public
static
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
...
...
@@ -46,10 +46,12 @@ public class DouyinHotSearchCrawler {
String
url
=
"https://api.amemv.com/aweme/v1/hot/search/list/"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
debug
(
"获取抖音热搜榜时出现问题:{}"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
debug
(
"获取抖音热搜榜时出现问题:{}"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"word_list"
))
{
list
=
new
ArrayList
<>();
...
...
@@ -87,10 +89,12 @@ public class DouyinHotSearchCrawler {
String
resultUrl
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
debug
(
"获取抖音热搜榜链接时出现问题:{}"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
debug
(
"获取抖音热搜榜链接时出现问题:{}"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"aweme_list"
)){
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"aweme_list"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/FengHuangSearchCrawler.java
View file @
3a0f95c0
...
...
@@ -2,15 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
...
...
@@ -20,7 +21,7 @@ import java.util.List;
@Log4j2
public
class
FengHuangSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
build
();
/**
* 获取凤凰新闻热榜
...
...
@@ -33,10 +34,12 @@ public class FengHuangSearchCrawler {
String
url
=
"https://nine.ifeng.com/hotspotlist?gv=7.9.1&page="
+
page
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"凤凰新闻热榜页面连接异常..."
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"凤凰新闻热榜页面连接异常..."
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONArray
(
"list"
);
...
...
@@ -71,10 +74,12 @@ public class FengHuangSearchCrawler {
String
url
=
"https://shankapi.ifeng.com/autumn/sogouSearchHotword"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"凤凰新闻热搜页面连接异常..."
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"凤凰新闻热搜页面连接异常..."
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"data"
).
getJSONObject
(
0
).
getJSONArray
(
"item"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/HotSearch36KrCrawler.java
View file @
3a0f95c0
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -26,7 +27,7 @@ import java.util.*;
@Log4j2
public
class
HotSearch36KrCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
* @return void 返回类型
...
...
@@ -38,13 +39,13 @@ public class HotSearch36KrCrawler {
String
url
=
"https://www.36kr.com/hot-list/catalog"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析36Kr人气榜时出现解析错误,页面结构有问题"
,
e
);
}
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析36Kr人气榜时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"article-list"
))
{
return
ansysData
(
htmlBody
,
date
);
}
else
{
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/HuXiuHotSearchCrawler.java
View file @
3a0f95c0
...
...
@@ -2,14 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
...
...
@@ -27,17 +29,19 @@ import java.util.*;
*/
@Log4j2
public
class
HuXiuHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
public
static
List
<
HotSearchList
>
HuXiuHotArticleRecommended
(
Date
date
){
String
url
=
"https://www.huxiu.com/"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析虎嗅热文推荐时出现解析错误,页面结构有问题"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析虎嗅热文推荐时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"hot__list"
))
{
return
ansysData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/KuaiShouHotSearchCrawler.java
View file @
3a0f95c0
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.time.Duration
;
import
java.util.*
;
...
...
@@ -22,7 +24,7 @@ import java.util.*;
*/
@Log4j2
public
class
KuaiShouHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
* @return void 返回类型
...
...
@@ -34,11 +36,12 @@ public class KuaiShouHotSearchCrawler {
String
url
=
"https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析快手热榜时出现解析错误,页面结构有问题"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析快手热榜时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"APOLLO_STATE"
))
{
return
ansysData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/MaiMaiHotSearchCrawler.java
View file @
3a0f95c0
...
...
@@ -2,15 +2,17 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
...
...
@@ -21,7 +23,7 @@ import java.util.List;
@Log4j2
public
class
MaiMaiHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
build
();
/**
* 获取maimai热榜
...
...
@@ -33,10 +35,12 @@ public class MaiMaiHotSearchCrawler {
String
url
=
"https://open.taou.com/maimai/feed/v6/hot_posts_list?tab=profession&count=15&version=5.3.34&u=232258287&access_token=1.4c82e8ad6d6b4e03262a48f334dea336"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"脉脉热榜页面连接异常..."
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"脉脉热榜页面连接异常..."
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
//1024 - 26(时间戳+type) = 998 -> name.getBytes(StandardCharsets.UTF_8).length<998 -> 998/3 = 332
int
nameLengthMax
=
300
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
View file @
3a0f95c0
...
...
@@ -4,10 +4,13 @@ import java.util.*;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -15,9 +18,6 @@ import org.jsoup.nodes.Element;
import
org.jsoup.select.Elements
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
...
...
@@ -31,7 +31,7 @@ import com.zhiwei.tools.httpclient.HeaderTool;
@Log4j2
public
class
SougoHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
build
();
/**
* @Title: SougoHotSearchTest
...
...
@@ -46,10 +46,12 @@ public class SougoHotSearchCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headMap
);
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"topwords"
))
{
try
{
...
...
@@ -108,10 +110,12 @@ public class SougoHotSearchCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headMap
);
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"data"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/SouhuTopicCrawler.java
View file @
3a0f95c0
...
...
@@ -2,19 +2,18 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Date
;
...
...
@@ -22,7 +21,7 @@ import java.util.List;
@Log4j2
public
class
SouhuTopicCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
public
static
List
<
HotSearchList
>
getSouhuTopic
(
Date
date
){
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
...
...
@@ -32,10 +31,12 @@ public class SouhuTopicCrawler {
String
url
=
"https://api.k.sohu.com/api/news/moment/v2/list.go?pageSize=50&v=6.4.4"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"搜狐话题页面连接失败"
,
e
.
fillInStackTrace
());
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"搜狐话题页面连接失败"
,
cause
.
fillInStackTrace
());
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/TaoBaoHotSearchCrawler.java
View file @
3a0f95c0
...
...
@@ -2,15 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.MD5Util
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.time.Duration
;
import
java.util.*
;
...
...
@@ -23,7 +24,7 @@ import java.util.*;
*/
@Log4j2
public
class
TaoBaoHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
public
static
List
<
HotSearchList
>
taoBaoHotSearch
(
Date
date
)
{
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
...
...
@@ -34,8 +35,12 @@ public class TaoBaoHotSearchCrawler {
String
urls
=
"https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t="
+
time
+
"&sign=&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"
;
Request
request1
=
RequestUtils
.
wrapGet
(
urls
);
String
token
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request1
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
List
<
String
>
values
=
response
.
networkResponse
().
headers
().
values
(
"Set-Cookie"
);
Response
response
=
httpBoot
.
syncCall
(
request1
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
List
<
String
>
values
=
response
.
headers
().
values
(
"Set-Cookie"
);
String
tk
=
values
.
get
(
1
);
String
[]
splitTk
=
tk
.
split
(
";"
);
String
_m_h5_tk
=
splitTk
[
0
];
...
...
@@ -44,18 +49,18 @@ public class TaoBaoHotSearchCrawler {
String
[]
splitEnc
=
enc
.
split
(
";"
);
String
_m_h5_tk_enc
=
splitEnc
[
0
];
headerMap
.
put
(
"cookie"
,
_m_h5_tk
+
";"
+
_m_h5_tk_enc
);
}
catch
(
Exception
e
)
{
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
e
);
}
String
signs
=
token
+
"&"
+
time
+
"&12574478&{\"appId\":\"10211\",\"params\":\"{\\\"multi_hintq_show\\\":\\\"on\\\",\\\"src\\\":\\\"c2c\\\",\\\"area\\\":\\\"active_page\\\",\\\"sversion\\\":\\\"7.5\\\",\\\"bangdan_src\\\":\\\"list\\\"}\"}"
;
String
sign
=
MD5Util
.
getMD5
(
signs
).
toLowerCase
();
String
url
=
"https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t="
+
time
+
"&sign="
+
sign
+
"&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
Response
response1
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response1
.
hasCause
()){
Throwable
cause
=
response1
.
cause
();
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response1
.
bodyString
();
ht
=
!
htmlBody
.
contains
(
"非法请求"
);
}
catch
(
Exception
e
)
{
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
return
ansysData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/TengXunCrawler.java
View file @
3a0f95c0
...
...
@@ -2,15 +2,15 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.io.IOException
;
import
java.util.ArrayList
;
...
...
@@ -20,7 +20,7 @@ import java.util.List;
@Log4j2
public
class
TengXunCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
* 腾讯热榜数据采集
...
...
@@ -35,10 +35,12 @@ public class TengXunCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//采集为空最多重试3次
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
response
.
cause
().
printStackTrace
();
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"idlist"
))
{
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
...
...
@@ -96,10 +98,12 @@ public class TengXunCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//采集为空最多重试3次
for
(
int
t
=
0
;
t
<
3
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
response
.
cause
().
printStackTrace
();
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
)){
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"data"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
View file @
3a0f95c0
...
...
@@ -3,9 +3,11 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
...
...
@@ -14,7 +16,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -34,7 +36,7 @@ import java.util.*;
*/
@Log4j2
public
class
ToutiaoHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: weiboHotSearchByPhoneTest
...
...
@@ -47,10 +49,12 @@ public class ToutiaoHotSearchCrawler {
String
jsUrl
=
"https://s3.pstatp.com/toutiao/feoffline/hot_list/resource/hot_list/js/index.45f50250.chunk.js"
;
Request
jsRequest
=
RequestUtils
.
wrapGet
(
jsUrl
);
String
jsBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
jsRequest
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
jsBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"获取今日头条实时热搜头部信息标识失败"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
jsRequest
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"获取今日头条实时热搜头部信息标识失败"
,
cause
);
}
else
{
jsBody
=
response
.
bodyString
();
}
if
(
jsBody
!=
null
&&
jsBody
.
contains
(
"origin"
)){
String
s
=
jsBody
.
substring
(
jsBody
.
indexOf
(
"origin:"
)+
"origin:"
.
length
());
...
...
@@ -61,10 +65,12 @@ public class ToutiaoHotSearchCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e1
)
{
log
.
error
(
"解析今日头条实时热搜时出现连接失败"
,
e1
);
Response
response1
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response1
.
hasCause
()){
Throwable
cause
=
response1
.
cause
();
log
.
error
(
"解析今日头条实时热搜时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response1
.
bodyString
();
}
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
...
...
@@ -159,10 +165,12 @@ public class ToutiaoHotSearchCrawler {
String
url
=
hotSearchList
.
getUrl
();
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
i
=
0
;
i
<=
5
;
i
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e1
)
{
log
.
error
(
"解析今日头条热搜详情页面出现连接失败"
,
e1
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析今日头条热搜详情页面出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WangYiHotSearchCrawler.java
View file @
3a0f95c0
...
...
@@ -2,16 +2,18 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -26,7 +28,7 @@ import java.util.List;
*/
@Log4j2
public
class
WangYiHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
build
();
/**
* 网易新闻实时热榜的采集
...
...
@@ -39,10 +41,12 @@ public class WangYiHotSearchCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
String
htmlBody
=
null
;
for
(
int
t
=
0
;
t
<
3
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"网易新闻实时热榜页面连接异常..."
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"网易新闻实时热榜页面连接异常..."
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
bodyObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
...
...
@@ -80,10 +84,12 @@ public class WangYiHotSearchCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
String
htmlBody
=
null
;
for
(
int
t
=
0
;
t
<
3
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"网易新闻跟贴热议页面连接异常..."
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"网易新闻跟贴热议页面连接异常..."
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
bodyObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiBoSearchBoxHotWordsCrawler.java
View file @
3a0f95c0
...
...
@@ -2,14 +2,15 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.WeiBoSearchBoxHotWords
;
import
com.zhiwei.searchhotcrawler.dao.WeiBoSearchBoxHotWordsDao
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.time.Duration
;
import
java.util.ArrayList
;
...
...
@@ -26,7 +27,7 @@ import java.util.Objects;
*/
@Log4j2
public
class
WeiBoSearchBoxHotWordsCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
static
WeiBoSearchBoxHotWordsDao
weiBoSearchDao
=
new
WeiBoSearchBoxHotWordsDao
();
public
static
void
weiBoSearchBoxHotWords
(
Date
date
){
...
...
@@ -35,11 +36,12 @@ public class WeiBoSearchBoxHotWordsCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博搜索框热词时出现解析错误,页面结构有问题"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博搜索框热词时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"hotwords"
))
{
int
num
=
ansysData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboEntertainmentCrawler.java
View file @
3a0f95c0
...
...
@@ -2,17 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
import
java.util.*
;
/**
...
...
@@ -24,7 +23,7 @@ import java.util.*;
@Log4j2
public
class
WeiboEntertainmentCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
* @return void 返回类型
...
...
@@ -38,10 +37,12 @@ public class WeiboEntertainmentCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博娱乐榜时出现连接失败"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博娱乐榜时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
List
<
HotSearchList
>
result
=
new
ArrayList
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
3a0f95c0
...
...
@@ -9,6 +9,10 @@ import java.util.*;
import
java.util.stream.Collectors
;
import
com.alibaba.fastjson.JSON
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.*
;
import
com.zhiwei.searchhotcrawler.config.RedisConfig
;
import
com.zhiwei.searchhotcrawler.dao.RedisDao
;
...
...
@@ -17,7 +21,6 @@ import com.zhiwei.searchhotcrawler.dao.WeiBoUserDao;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.bson.Document
;
import
org.jsoup.Jsoup
;
...
...
@@ -26,9 +29,6 @@ import org.jsoup.select.Elements;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.mail.SendMailWeibo
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
...
...
@@ -45,7 +45,7 @@ import static java.util.Objects.nonNull;
@Log4j2
public
class
WeiboHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
private
static
RedisDao
redisDao
=
new
RedisDao
();
...
...
@@ -66,14 +66,15 @@ public class WeiboHotSearchCrawler {
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
if
(
i
==
2
)
{
return
list
;
}
else
{
continue
;
}
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"pl_top_realtimehot"
))
{
try
{
...
...
@@ -261,10 +262,12 @@ public class WeiboHotSearchCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博时热搜时出现连接失败"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博时热搜时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
...
...
@@ -349,10 +352,12 @@ public class WeiboHotSearchCrawler {
String
url
=
"https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博热搜时出现连接失败"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博热搜时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
List
<
HotSearchList
>
result
=
new
ArrayList
<>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
...
...
@@ -405,10 +410,12 @@ public class WeiboHotSearchCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONObject
(
"cardlistInfo"
);
...
...
@@ -500,10 +507,12 @@ public class WeiboHotSearchCrawler {
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"Cookie"
,
"SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN"
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博时热搜时出现连接失败"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博时热搜时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"m-main"
))
{
Document
docm
=
new
Document
();
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboNewsCrawler.java
View file @
3a0f95c0
...
...
@@ -2,14 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
...
...
@@ -25,7 +27,7 @@ import java.util.*;
@Log4j2
public
class
WeiboNewsCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
...
...
@@ -46,11 +48,13 @@ public class WeiboNewsCrawler {
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
List
<
HotSearchList
>
result
=
new
ArrayList
();
//发送第一次请求获取前20条数据
try
(
Response
response
=
httpBoot
.
syncCall
(
request1
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"第一次请求解析微博要闻榜时出现连接失败"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request1
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"第一次请求解析微博要闻榜时出现连接失败"
,
caus
e
);
continue
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
try
{
...
...
@@ -66,11 +70,13 @@ public class WeiboNewsCrawler {
continue
;
}
//发送第二次请求获取中间20条数据
try
(
Response
response
=
httpBoot
.
syncCall
(
request2
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"第二次请求解析微博要闻榜时出现连接失败"
,
e
);
Response
response1
=
httpBoot
.
syncCall
(
request2
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response1
.
hasCause
()){
Throwable
cause
=
response1
.
cause
();
log
.
error
(
"第二次请求解析微博要闻榜时出现连接失败"
,
caus
e
);
continue
;
}
else
{
htmlBody
=
response1
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
try
{
...
...
@@ -87,11 +93,13 @@ public class WeiboNewsCrawler {
continue
;
}
//发送第三次请求获取最后10条数据
try
(
Response
response
=
httpBoot
.
syncCall
(
request3
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"第三次请求解析微博要闻榜时出现连接失败"
,
e
);
Response
response2
=
httpBoot
.
syncCall
(
request3
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response2
.
hasCause
()){
Throwable
cause
=
response2
.
cause
();
log
.
error
(
"第三次请求解析微博要闻榜时出现连接失败"
,
caus
e
);
continue
;
}
else
{
htmlBody
=
response2
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
try
{
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboSuperTopicCrawler.java
View file @
3a0f95c0
...
...
@@ -8,18 +8,19 @@ import java.util.Map;
import
java.util.Map.Entry
;
import
java.util.Objects
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
/**
*
...
...
@@ -31,7 +32,7 @@ import com.zhiwei.crawler.core.utils.RequestUtils;
@Log4j2
public
class
WeiboSuperTopicCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
private
static
Map
<
String
,
String
>
headMap
=
new
HashMap
<>();
static
{
...
...
@@ -63,13 +64,15 @@ public class WeiboSuperTopicCrawler {
String
htmlBody
=
null
;
//重试三次
for
(
int
retryTimes
=
1
;
retryTimes
<=
3
;
retryTimes
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"获取榜单列表页面时出现错误,错误为:{}"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"获取榜单列表页面时出现错误,错误为:{}"
,
caus
e
);
continue
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"d
esc1
"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"d
ata
"
))
{
topicList
.
addAll
(
parseTopicRankHtml
(
page
,
htmlBody
,
type
));
break
;
}
else
{
...
...
@@ -99,21 +102,18 @@ public class WeiboSuperTopicCrawler {
Integer
toprank
=
null
;
String
topicName
=
null
;
String
id
=
null
;
String
score
=
null
;
String
desc1
=
null
;
String
postNum
=
null
;
String
fensi
=
null
;
String
url
=
null
;
for
(
int
i
=
0
;
i
<
list
.
size
();
i
++)
{
JSONObject
data
=
list
.
getJSONObject
(
i
);
toprank
=
page
+
data
.
getInteger
(
"toprank"
)
;
toprank
=
++
page
;
topicName
=
data
.
getString
(
"display_name"
);
id
=
data
.
getString
(
"page_id"
);
score
=
data
.
getString
(
"score"
);
desc1
=
data
.
getString
(
"desc1"
);
fensi
=
desc1
.
replaceAll
(
".*影响力|粉丝"
,
""
).
trim
();
postNum
=
data
.
getString
(
"status_count"
);
fensi
=
data
.
getString
(
"fans_count"
);
url
=
data
.
getString
(
"link"
);
WeiboSuperTopic
topic
=
new
WeiboSuperTopic
(
url
,
topicName
,
toprank
,
score
,
fensi
,
type
);
WeiboSuperTopic
topic
=
new
WeiboSuperTopic
(
url
,
topicName
,
toprank
,
postNum
,
fensi
,
type
);
topic
=
getTopicInfo
(
id
,
topic
);
topicList
.
add
(
topic
);
}
...
...
@@ -140,17 +140,19 @@ public class WeiboSuperTopicCrawler {
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid="
+
id
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析榜单详情页面时出现错误,错误为:{}"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析榜单详情页面时出现错误,错误为:{}"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"desc_more"
))
{
String
descMore
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONObject
(
"pageInfo"
).
getJSONArray
(
"desc_more"
).
getString
(
0
);
if
(
StringUtils
.
isNotBlank
(
descMore
))
{
String
readNum
=
descMore
.
replaceAll
(
"阅读|帖子.*"
,
""
).
trim
();
String
postNum
=
descMore
.
replaceAll
(
".*帖子|粉丝.*"
,
""
).
trim
();
topic
.
setPostNum
(
postNum
);
//
String postNum = descMore.replaceAll(".*帖子|粉丝.*", "").trim();
//
topic.setPostNum(postNum);
topic
.
setReadNum
(
readNum
);
return
topic
;
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
View file @
3a0f95c0
...
...
@@ -3,9 +3,10 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
...
...
@@ -13,7 +14,6 @@ import com.zhiwei.searchhotcrawler.util.TipsUtils;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Element
;
...
...
@@ -31,7 +31,7 @@ import java.util.regex.Pattern;
*/
@Log4j2
public
class
WeiboTopicCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
private
static
Map
<
String
,
String
>
headMap
=
new
HashMap
<>();
static
{
...
...
@@ -137,11 +137,13 @@ public class WeiboTopicCrawler {
String
htmlBody
=
null
;
//重试三次
for
(
int
retryTimes
=
1
;
retryTimes
<=
5
;
retryTimes
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"下载榜单列表页面时出现错误,错误为:{}"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"下载榜单列表页面时出现错误,错误为:{}"
,
caus
e
);
continue
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
topicList
.
addAll
(
parseTopicHtml
(
htmlBody
,
date
));
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/XinLangHotSearchCrawler.java
View file @
3a0f95c0
...
...
@@ -2,9 +2,10 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
...
...
@@ -12,7 +13,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -24,7 +25,7 @@ import java.util.*;
@Log4j2
public
class
XinLangHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
* 新浪热榜的采集
...
...
@@ -38,10 +39,12 @@ public class XinLangHotSearchCrawler {
String
htmlBody
=
null
;
JSONObject
jsonObject
=
null
;
for
(
int
t
=
0
;
t
<
3
&&
jsonObject
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"新浪热榜页面连接异常..."
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"新浪热榜页面连接异常..."
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
)
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
...
...
@@ -111,10 +114,12 @@ public class XinLangHotSearchCrawler {
String
htmlBody
=
null
;
JSONArray
dataJson
=
null
;
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"新浪热点页面连接异常..."
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"新浪热点页面连接异常..."
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuChildHotSearchCrawler.java
View file @
3a0f95c0
...
...
@@ -4,16 +4,18 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.io.IOException
;
import
java.util.*
;
...
...
@@ -21,7 +23,7 @@ import java.util.*;
@Log4j2
public
class
ZhihuChildHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
* 知乎子级分类数据采集
...
...
@@ -39,10 +41,11 @@ public class ZhihuChildHotSearchCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
//采集为空最多重试3次
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
response
.
cause
().
printStackTrace
();
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
View file @
3a0f95c0
...
...
@@ -3,17 +3,18 @@ package com.zhiwei.searchhotcrawler.crawler;
import
java.io.IOException
;
import
java.util.*
;
import
com.zhiwei.crawler.core.config.SslProvider
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
io.netty.handler.ssl.SslProvider
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
...
...
@@ -34,7 +35,8 @@ import static java.util.Objects.nonNull;
@Log4j2
public
class
ZhihuHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
sslProvider
(
SslProvider
.
CONSCRYPT
).
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
build
();
//private static HttpBoot httpBoot = HttpBoot.newBuilder().sslProvider(SslProvider.CONSCRYPT).retryTimes(3).build();
/**
* @Title: getZhihuHotList
* @author hero
...
...
@@ -98,11 +100,13 @@ public class ZhihuHotSearchCrawler {
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
caus
e
);
return
list
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
try
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"author"
))
{
...
...
@@ -160,17 +164,22 @@ public class ZhihuHotSearchCrawler {
Map
<
String
,
String
>
Map
=
HeaderTool
.
getCommonHead
();
Map
.
put
(
"cookie"
,
"_xsrf=7NFWM5qBcOutfs8MaW7bhQQH65t3Xia4"
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
Map
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
String
htmlBody
=
response
.
body
().
string
();
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"单条知乎热搜数据页面连接失败"
,
cause
);
return
doc
;
}
else
{
String
htmlBody
=
response
.
bodyString
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"QuestionHeader"
))
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
//获取标签
String
label
=
""
;
Elements
select
=
document
.
select
(
"div.Tag"
);
for
(
Element
element
:
select
)
{
String
text
=
"`"
+
element
.
select
(
"div.Popover"
).
text
()+
";"
;
label
=
label
+
text
;
}
String
label
=
""
;
Elements
select
=
document
.
select
(
"div.Tag"
);
for
(
Element
element
:
select
)
{
String
text
=
"`"
+
element
.
select
(
"div.Popover"
).
text
()+
";"
;
label
=
label
+
text
;
}
doc
.
put
(
"tag"
,
label
.
trim
());
String
strong
=
document
.
select
(
"div.NumberBoard-itemInner"
).
select
(
"strong"
).
text
();
String
[]
count
=
strong
.
split
(
" "
);
...
...
@@ -182,9 +191,6 @@ public class ZhihuHotSearchCrawler {
}
else
{
return
doc
;
}
}
catch
(
Exception
e
)
{
log
.
error
(
"单条知乎热搜数据页面连接失败"
,
e
);
return
doc
;
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuTopicSearchCrawler.java
View file @
3a0f95c0
...
...
@@ -2,29 +2,21 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.Data
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.select.Elements
;
import
org.w3c.dom.Element
;
import
java.io.IOException
;
import
java.util.*
;
@Log4j2
public
class
ZhihuTopicSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
public
static
List
<
HotSearchList
>
getZhihuTopicSearch
(
Date
date
){
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
...
...
@@ -33,10 +25,12 @@ public class ZhihuTopicSearchCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
t
=
0
;
t
<
3
&&
jsonObject
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"知乎热搜页面连接异常"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"知乎热搜页面连接异常"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
)
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
View file @
3a0f95c0
package
com
.
zhiwei
.
searchhotcrawler
.
run
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.http.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.timer.*
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearch36KrCrawlerTest.java
View file @
3a0f95c0
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -26,7 +27,8 @@ import java.util.*;
@Log4j2
public
class
HotSearch36KrCrawlerTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
//private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
* @return void 返回类型
...
...
@@ -38,12 +40,13 @@ public class HotSearch36KrCrawlerTest {
String
url
=
"https://www.36kr.com/hot-list/catalog"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析36Kr人气榜时出现解析错误,页面结构有问题"
,
e
);
}
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析36Kr人气榜时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"article-list"
))
{
return
ansysData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchRunTest.java
View file @
3a0f95c0
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.http.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/HuXiuHotSearchCrawlerTest.java
View file @
3a0f95c0
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpGet
;
...
...
@@ -31,17 +31,20 @@ import java.util.*;
*/
@Log4j2
public
class
HuXiuHotSearchCrawlerTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
//private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
public
static
List
<
HotSearchList
>
HuXiuHotArticleRecommended
(
Date
date
){
String
url
=
"https://www.huxiu.com/"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析虎嗅热文推荐时出现解析错误,页面结构有问题"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析虎嗅热文推荐时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"hot__list"
))
{
return
ansysData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/Job51Test.java
View file @
3a0f95c0
...
...
@@ -4,10 +4,12 @@ package com.zhiwei.searchhotcrawler.test;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.mongodb.client.MongoDatabase
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyFactory
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
...
...
@@ -17,7 +19,7 @@ import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBLocalTemplate;
import
com.zhiwei.searchhotcrawler.util.HttpClientUtils
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.springframework.context.ApplicationContext
;
...
...
@@ -38,7 +40,7 @@ public class Job51Test {
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
// MongoDatabase mongoDBLocal = MongoDBLocalTemplate.getDB(DBConfig.dbName);
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
...
...
@@ -62,11 +64,12 @@ public class Job51Test {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
header
);
for
(
int
t
=
0
;
t
<
1
&&
jsonObject
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"知乎热搜页面连接异常"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"知乎热搜页面连接异常"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
)
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/KuaiShouHotSearchCrawlerTest.java
View file @
3a0f95c0
...
...
@@ -3,14 +3,15 @@ package com.zhiwei.searchhotcrawler.test;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.time.Duration
;
import
java.util.*
;
...
...
@@ -23,7 +24,8 @@ import java.util.*;
*/
@Log4j2
public
class
KuaiShouHotSearchCrawlerTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
//private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
* @return void 返回类型
...
...
@@ -35,11 +37,12 @@ public class KuaiShouHotSearchCrawlerTest {
String
url
=
"https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析快手热榜时出现解析错误,页面结构有问题"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析快手热榜时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"APOLLO_STATE"
))
{
return
ansysData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/TaoBaoHotSearchCrawlerTest.java
View file @
3a0f95c0
...
...
@@ -2,15 +2,17 @@ package com.zhiwei.searchhotcrawler.test;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.MD5Util
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
;
import
java.time.Duration
;
import
java.util.*
;
...
...
@@ -23,7 +25,8 @@ import java.util.*;
*/
@Log4j2
public
class
TaoBaoHotSearchCrawlerTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
//private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
public
static
List
<
HotSearchList
>
taoBaoHotSearch
(
Date
date
)
{
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
...
...
@@ -34,8 +37,12 @@ public class TaoBaoHotSearchCrawlerTest {
String
urls
=
"https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t="
+
time
+
"&sign=&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"
;
Request
request1
=
RequestUtils
.
wrapGet
(
urls
);
String
token
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request1
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
List
<
String
>
values
=
response
.
networkResponse
().
headers
().
values
(
"Set-Cookie"
);
Response
response1
=
httpBoot
.
syncCall
(
request1
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response1
.
hasCause
()){
Throwable
cause
=
response1
.
cause
();
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
List
<
String
>
values
=
response1
.
headers
().
values
(
"Set-Cookie"
);
String
tk
=
values
.
get
(
1
);
String
[]
splitTk
=
tk
.
split
(
";"
);
String
_m_h5_tk
=
splitTk
[
0
];
...
...
@@ -44,18 +51,18 @@ public class TaoBaoHotSearchCrawlerTest {
String
[]
splitEnc
=
enc
.
split
(
";"
);
String
_m_h5_tk_enc
=
splitEnc
[
0
];
headerMap
.
put
(
"cookie"
,
_m_h5_tk
+
";"
+
_m_h5_tk_enc
);
}
catch
(
Exception
e
)
{
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
e
);
}
String
signs
=
token
+
"&"
+
time
+
"&12574478&{\"appId\":\"10211\",\"params\":\"{\\\"multi_hintq_show\\\":\\\"on\\\",\\\"src\\\":\\\"c2c\\\",\\\"area\\\":\\\"active_page\\\",\\\"sversion\\\":\\\"7.5\\\",\\\"bangdan_src\\\":\\\"list\\\"}\"}"
;
String
sign
=
MD5Util
.
getMD5
(
signs
).
toLowerCase
();
String
url
=
"https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t="
+
time
+
"&sign="
+
sign
+
"&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
ht
=
!
htmlBody
.
contains
(
"非法请求"
);
}
catch
(
Exception
e
)
{
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
return
ansysData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/TaoBaoRunTest.java
View file @
3a0f95c0
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.zhiwei.
crawler.core
.proxy.ProxyFactory
;
import
com.zhiwei.
http
.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.WeiboSuperTopicRun
;
import
com.zhiwei.searchhotcrawler.timer.WeiboTopicRun
;
import
java.text.ParseException
;
...
...
@@ -18,8 +20,10 @@ public class TaoBaoRunTest {
//微博热搜开始采集
// new WeiboHotSearchRun().start();
//快手热榜开始采集
//
new KuaiShouHotSearchRun().start();
//
new KuaiShouHotSearchRun().start();
//百度热搜
// new TaoBaoHotSearchRun().run();
//new TaoBaoHotSearchRun().run();
//超话测试
//new WeiboSuperTopicRun().run();
}
}
src/main/java/com/zhiwei/searchhotcrawler/test/WeiboEntertainmentCrawlerTest.java
View file @
3a0f95c0
...
...
@@ -2,14 +2,16 @@ package com.zhiwei.searchhotcrawler.test;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
import
java.util.*
;
...
...
@@ -24,7 +26,8 @@ import java.util.*;
@Log4j2
public
class
WeiboEntertainmentCrawlerTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
//private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
build
();
/**
...
...
@@ -38,10 +41,12 @@ public class WeiboEntertainmentCrawlerTest {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博娱乐榜时出现连接失败"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博娱乐榜时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
List
<
HotSearchList
>
result
=
new
ArrayList
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboSuperTopicRun.java
View file @
3a0f95c0
...
...
@@ -43,7 +43,7 @@ public class WeiboSuperTopicRun extends Thread{
doc
.
put
(
"_id"
,
topic
.
getId
());
doc
.
put
(
"name"
,
topic
.
getTopicName
());
doc
.
put
(
"rank"
,
topic
.
getRank
());
doc
.
put
(
"
score_num"
,
topic
.
getScore
());
doc
.
put
(
"
read_Num"
,
topic
.
getReadNum
());
doc
.
put
(
"fensi_num"
,
topic
.
getFensi
());
doc
.
put
(
"post_num"
,
topic
.
getPostNum
());
doc
.
put
(
"type"
,
topic
.
getType
());
...
...
@@ -53,7 +53,7 @@ public class WeiboSuperTopicRun extends Thread{
data
.
add
(
doc
);
}
weiboTopicDAO
.
addTopicList
(
data
);
log
.
info
(
"微博
话题
采集结束........"
);
log
.
info
(
"微博
超话
采集结束........"
);
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
View file @
3a0f95c0
...
...
@@ -309,7 +309,7 @@ public class GatherTimer {
* 腾讯较真辟谣榜采集
*/
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"10 * * * * ? "
)
//
@Scheduled(cron = "10 * * * * ? ")
public
void
crawlerTengXunVerificationHotSearch
(){
log
.
info
(
"{},腾讯较真辟谣榜开始采集"
,
new
Date
());
Date
date
=
DateUtils
.
getMillSecondTime
(
new
Date
());
...
...
@@ -371,7 +371,7 @@ public class GatherTimer {
* 知乎热搜国际分类采集
*/
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"20 * * * * ? "
)
//
@Scheduled(cron = "20 * * * * ? ")
public
void
crawlerZhiHuFocus
(){
this
.
crawlerZhiHuChild
(
FOCUS
);
}
...
...
@@ -380,7 +380,7 @@ public class GatherTimer {
* 知乎热搜时事分类采集
*/
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"20 * * * * ? "
)
//
@Scheduled(cron = "20 * * * * ? ")
public
void
crawlerZhiHuDepth
(){
this
.
crawlerZhiHuChild
(
DEPTH
);
}
...
...
@@ -442,7 +442,7 @@ public class GatherTimer {
doc
.
put
(
"_id"
,
topic
.
getId
());
doc
.
put
(
"name"
,
topic
.
getTopicName
());
doc
.
put
(
"rank"
,
topic
.
getRank
());
doc
.
put
(
"
score_num"
,
topic
.
getScore
());
doc
.
put
(
"
read_Num"
,
topic
.
getReadNum
());
doc
.
put
(
"fensi_num"
,
topic
.
getFensi
());
doc
.
put
(
"post_num"
,
topic
.
getPostNum
());
doc
.
put
(
"type"
,
topic
.
getType
());
...
...
@@ -452,7 +452,7 @@ public class GatherTimer {
data
.
add
(
doc
);
}
weiboTopicDAO
.
addTopicList
(
data
);
log
.
info
(
"微博
话题
采集结束........"
);
log
.
info
(
"微博
超话
采集结束........"
);
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/util/HttpClientUtils.java
View file @
3a0f95c0
package
com
.
zhiwei
.
searchhotcrawler
.
util
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.util.RequestUtils
;
import
okhttp3.MediaType
;
import
okhttp3.Request
;
import
okhttp3.RequestBody
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
...
...
@@ -24,7 +24,8 @@ public final class HttpClientUtils {
private
static
final
String
QUERY_PARAM_SEP
=
"&"
;
private
static
final
String
URL_QUERY_PARAM_SEPARATOR
=
"?"
;
private
static
final
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
2
).
build
();
//private static final HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(2).build();
private
static
final
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
2
).
build
();
public
static
String
sendPost
(
String
url
,
String
jsonParam
){
return
sendPost
(
url
,
jsonParam
,
null
,
Charset
.
forName
(
"UTF-8"
));
}
...
...
@@ -39,12 +40,13 @@ public final class HttpClientUtils {
String
result
=
null
;
Request
request
=
RequestUtils
.
wrapPost
(
url
,
headers
,
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
jsonParam
));
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
result
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
LOGGER
.
error
(
"http connection error :"
+
e
.
getMessage
(),
e
);
}
Response
response
=
httpBoot
.
syncCall
(
request
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
LOGGER
.
error
(
"http connection error :"
+
cause
.
getMessage
(),
cause
);
}
else
{
result
=
response
.
bodyString
();
}
return
result
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/util/WechatCodeUtil.java
View file @
3a0f95c0
...
...
@@ -4,18 +4,15 @@ import java.io.IOException;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.util.RequestUtils
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
okhttp3.MediaType
;
...
...
@@ -23,7 +20,8 @@ import okhttp3.RequestBody;
public
class
WechatCodeUtil
{
private
static
Logger
log
=
LogManager
.
getLogger
(
WechatCodeUtil
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
//private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
build
();
/**
* @Title: getToken
* @author hero
...
...
@@ -40,12 +38,13 @@ public class WechatCodeUtil {
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
result
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
result
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
log
.
error
(
"获取微信公众号推送token失败,问题为:::{}"
,
e
.
fillInStackTrace
());
Response
response
=
httpBoot
.
syncCall
(
request
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"获取微信公众号推送token失败,问题为:::{}"
,
cause
.
fillInStackTrace
());
return
null
;
}
else
{
result
=
response
.
bodyString
();
}
if
(
result
!=
null
)
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
result
);
...
...
@@ -73,11 +72,13 @@ public class WechatCodeUtil {
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
templateJson
.
toJSONString
());
Request
request
=
RequestUtils
.
wrapPost
(
url
,
requestBody
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"消息推送失败,错误为::{}"
,
e
.
fillInStackTrace
());
Response
response
=
httpBoot
.
syncCall
(
request
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"消息推送失败,错误为::{}"
,
caus
e
.
fillInStackTrace
());
msgid
=
0
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
...
...
@@ -115,11 +116,13 @@ public class WechatCodeUtil {
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
postData
.
toJSONString
());
Request
request
=
RequestUtils
.
wrapPost
(
url
,
requestBody
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
){
log
.
error
(
"页面连接获取失败"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"页面连接获取失败"
,
caus
e
);
return
null
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
...
...
@@ -154,11 +157,13 @@ public class WechatCodeUtil {
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
postData
.
toJSONString
());
Request
request
=
RequestUtils
.
wrapPost
(
url
,
requestBody
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
)){
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
){
log
.
error
(
"页面链接获取失败"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"页面链接获取失败"
,
caus
e
);
return
null
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
...
...
@@ -197,11 +202,13 @@ public class WechatCodeUtil {
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"获取分组id时出现错误"
,
e
.
fillInStackTrace
());
Response
response
=
httpBoot
.
syncCall
(
request
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"获取分组id时出现错误"
,
caus
e
.
fillInStackTrace
());
return
null
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
.
contains
(
"tags"
))
{
...
...
@@ -230,11 +237,13 @@ public class WechatCodeUtil {
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"获取分组id时出现错误"
,
e
.
fillInStackTrace
());
Response
response
=
httpBoot
.
syncCall
(
request
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"获取分组id时出现错误"
,
caus
e
.
fillInStackTrace
());
return
null
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
.
contains
(
"tags"
))
{
...
...
src/test/java/hotSaerchTest/HotSearchTest.java
View file @
3a0f95c0
...
...
@@ -2,10 +2,12 @@ package hotSaerchTest;
import
com.alibaba.fastjson.JSONObject
;
import
com.mongodb.client.MongoCollection
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyFactory
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.config.DBConfig
;
...
...
@@ -14,11 +16,11 @@ import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.searchhotcrawler.test.KuaiShouHotSearchCrawlerTest
;
import
com.zhiwei.searchhotcrawler.test.TaoBaoHotSearchCrawlerTest
;
import
com.zhiwei.searchhotcrawler.util.
TaoBaoUtils
;
import
com.zhiwei.searchhotcrawler.util.
QYWechatUtil
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.searchhotcrawler.util.WechatCodeUtil
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.bson.Document
;
import
org.junit.Test
;
import
org.junit.runner.RunWith
;
...
...
@@ -29,7 +31,6 @@ import java.io.IOException;
import
java.util.Date
;
import
java.util.List
;
import
static
com
.
ibm
.
icu
.
util
.
LocalePriorityList
.
add
;
import
static
java
.
util
.
Objects
.
nonNull
;
/**
...
...
@@ -42,7 +43,7 @@ import static java.util.Objects.nonNull;
{
"classpath:applicationContext.xml"
})
public
class
HotSearchTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
* 测试快手热榜采集
...
...
@@ -71,10 +72,12 @@ public class HotSearchTest {
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid=231522type%3D1%26q%3D%23可口可乐回应C罗拒绝与可乐同框%23"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONObject
(
"cardlistInfo"
);
...
...
@@ -167,8 +170,18 @@ public class HotSearchTest {
long
time
=
new
Date
().
getTime
();
String
signs
=
"undefined&1625624820156&12574478&{\"appId\":\"10211\",\"params\":\"{\\\"multi_hintq_show\\\":\\\"on\\\",\\\"src\\\":\\\"c2c\\\",\\\"area\\\":\\\"active_page\\\",\\\"sversion\\\":\\\"7.5\\\",\\\"bangdan_src\\\":\\\"list\\\"}\"}"
;
// https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t=1624930984092&sign=acf994dbcee6c0c1d7a8a566a6b8ff0a&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D
String
s
=
TaoBaoUtils
.
parsJSFunction
(
signs
);
System
.
out
.
println
(
s
);
// String s = TaoBaoUtils.parsJSFunction(signs);
// System.out.println(s);
}
private
static
String
key
=
"a8e26ce3-8aaa-4d3e-bcf6-30b81526050b"
;
/**
* 测试预警发送
*/
@Test
public
void
testWarn
(){
QYWechatUtil
.
send
(
key
,
QYWechatUtil
.
MSGTYPE_TEXT
,
"你好"
,
null
,
null
);
}
}
...
...
src/test/java/weiboTest/WeiboHotSearchTest.java
View file @
3a0f95c0
...
...
@@ -3,10 +3,12 @@ package weiboTest;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyFactory
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
...
...
@@ -19,7 +21,6 @@ import com.zhiwei.searchhotcrawler.util.TipsUtils;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -50,7 +51,7 @@ public class WeiboHotSearchTest {
static
WeiBoMassageDao
weiBoMassageDao
=
new
WeiBoMassageDao
();
//调用weiBoUserDao添加数据
static
WeiBoUserDao
weiBoUserDao
=
new
WeiBoUserDao
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
@Test
public
void
test
()
{
...
...
@@ -122,10 +123,12 @@ public class WeiboHotSearchTest {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
2
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
dataJson
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
...
...
@@ -213,10 +216,12 @@ public class WeiboHotSearchTest {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
2
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
dataJson
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
...
...
@@ -527,10 +532,12 @@ public class WeiboHotSearchTest {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博时热搜时出现连接失败"
,
e
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博时热搜时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
...
...
src/test/java/weiboTest/WeiboTopInfoTest.java
View file @
3a0f95c0
package
weiboTest
;
import
com.alibaba.fastjson.JSONArray
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.util.AESUtils
;
import
com.zhiwei.searchhotcrawler.util.HttpClientUtils
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.codec.DecoderException
;
import
org.apache.commons.codec.binary.Hex
;
import
org.junit.Test
;
...
...
@@ -19,6 +21,7 @@ import javax.crypto.spec.SecretKeySpec;
import
java.beans.Encoder
;
import
java.io.IOException
;
import
java.io.UnsupportedEncodingException
;
import
java.net.Proxy
;
import
java.net.URLDecoder
;
import
java.net.URLEncoder
;
import
java.nio.charset.Charset
;
...
...
@@ -33,7 +36,7 @@ import java.util.Map;
*/
public
class
WeiboTopInfoTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
* 加密测试
...
...
@@ -106,10 +109,11 @@ public class WeiboTopInfoTest {
System
.
out
.
println
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
getHeaderMap
());
//测试使用空代理
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NONE_PROXY
))
{
System
.
out
.
println
(
response
.
body
().
string
());
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NONE_PROXY
);
if
(
response
.
hasCause
()){
response
.
cause
().
printStackTrace
();
}
else
{
System
.
out
.
println
(
response
.
bodyString
());
}
}
...
...
@@ -135,8 +139,11 @@ public class WeiboTopInfoTest {
System
.
out
.
println
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
getHeaderMap
());
//测试使用空代理
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NONE_PROXY
))
{
String
result
=
response
.
body
().
string
();
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NONE_PROXY
);
if
(
response
.
hasCause
()){
response
.
cause
().
printStackTrace
();
}
else
{
String
result
=
response
.
bodyString
();
//结果解密
String
decodeResult
=
decodeStr
(
key
,
result
);
System
.
out
.
println
(
decodeResult
);
...
...
@@ -144,11 +151,7 @@ public class WeiboTopInfoTest {
JSONArray
jsonArray
=
JSONArray
.
parseArray
(
decodeResult
);
for
(
Object
o
:
jsonArray
)
{
System
.
out
.
println
(
o
);
}
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment