Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
5448a5f4
Commit
5448a5f4
authored
Jan 18, 2022
by
chenweitao
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'working' into 'master'
Working See merge request
!174
parents
7c395174
24298940
Hide whitespace changes
Inline
Side-by-side
Showing
44 changed files
with
699 additions
and
536 deletions
+699
-536
dependency-reduced-pom.xml
+21
-22
pom.xml
+10
-4
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboSuperTopic.java
+3
-4
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
+12
-10
src/main/java/com/zhiwei/searchhotcrawler/crawler/BiliComprehensiveHotCrawler.java
+12
-10
src/main/java/com/zhiwei/searchhotcrawler/crawler/BililiCrawler.java
+34
-24
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
+17
-13
src/main/java/com/zhiwei/searchhotcrawler/crawler/FengHuangSearchCrawler.java
+18
-13
src/main/java/com/zhiwei/searchhotcrawler/crawler/HotSearch36KrCrawler.java
+13
-12
src/main/java/com/zhiwei/searchhotcrawler/crawler/HuXiuHotSearchCrawler.java
+13
-9
src/main/java/com/zhiwei/searchhotcrawler/crawler/KuaiShouHotSearchCrawler.java
+13
-10
src/main/java/com/zhiwei/searchhotcrawler/crawler/MaiMaiHotSearchCrawler.java
+13
-9
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
+17
-13
src/main/java/com/zhiwei/searchhotcrawler/crawler/SouhuTopicCrawler.java
+11
-10
src/main/java/com/zhiwei/searchhotcrawler/crawler/TaoBaoHotSearchCrawler.java
+18
-13
src/main/java/com/zhiwei/searchhotcrawler/crawler/TengXunCrawler.java
+17
-13
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
+25
-17
src/main/java/com/zhiwei/searchhotcrawler/crawler/WangYiHotSearchCrawler.java
+19
-13
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiBoSearchBoxHotWordsCrawler.java
+12
-10
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboEntertainmentCrawler.java
+11
-10
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+33
-24
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboNewsCrawler.java
+25
-17
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboSuperTopicCrawler.java
+26
-24
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
+11
-9
src/main/java/com/zhiwei/searchhotcrawler/crawler/XinLangHotSearchCrawler.java
+18
-13
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuChildHotSearchCrawler.java
+12
-9
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
+27
-21
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuTopicSearchCrawler.java
+11
-17
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearch36KrCrawlerTest.java
+14
-11
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchRunTest.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/test/HuXiuHotSearchCrawlerTest.java
+12
-9
src/main/java/com/zhiwei/searchhotcrawler/test/Job51Test.java
+14
-11
src/main/java/com/zhiwei/searchhotcrawler/test/KuaiShouHotSearchCrawlerTest.java
+13
-10
src/main/java/com/zhiwei/searchhotcrawler/test/TaoBaoHotSearchCrawlerTest.java
+20
-13
src/main/java/com/zhiwei/searchhotcrawler/test/TaoBaoRunTest.java
+7
-3
src/main/java/com/zhiwei/searchhotcrawler/test/WeiboEntertainmentCrawlerTest.java
+14
-9
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboSuperTopicRun.java
+2
-2
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+4
-4
src/main/java/com/zhiwei/searchhotcrawler/util/HttpClientUtils.java
+12
-10
src/main/java/com/zhiwei/searchhotcrawler/util/WechatCodeUtil.java
+41
-32
src/test/java/hotSaerchTest/HotSearchTest.java
+27
-14
src/test/java/weiboTest/WeiboHotSearchTest.java
+25
-18
src/test/java/weiboTest/WeiboTopInfoTest.java
+18
-15
No files found.
dependency-reduced-pom.xml
View file @
5448a5f4
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
<name>
各平台热搜榜单采集程序
</name>
<name>
各平台热搜榜单采集程序
</name>
<version>
0.0.6-SNAPSHOT
</version>
<version>
0.0.6-SNAPSHOT
</version>
<description>
各平台热搜榜单采集程序
<description>
各平台热搜榜单采集程序
目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序
</description>
目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序
</description>
<developers>
<developers>
<developer>
<developer>
<id>
Bewilder
</id>
<id>
Bewilder
</id>
...
@@ -38,6 +38,15 @@
...
@@ -38,6 +38,15 @@
</filters>
</filters>
<transformers>
<transformers>
<transformer>
<transformer>
<resource>
META-INF/spring.handlers
</resource>
</transformer>
<transformer>
<resource>
META-INF/spring.schemas
</resource>
</transformer>
<transformer>
<resource>
META-INF/spring.tooling
</resource>
</transformer>
<transformer>
<mainClass>
com.zhiwei.searchhotcrawler.run.HotSearchRun
</mainClass>
<mainClass>
com.zhiwei.searchhotcrawler.run.HotSearchRun
</mainClass>
</transformer>
</transformer>
</transformers>
</transformers>
...
@@ -73,32 +82,22 @@
...
@@ -73,32 +82,22 @@
</build>
</build>
<dependencies>
<dependencies>
<dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<groupId>
junit
</groupId>
<artifactId>
crawler-core
</artifactId>
<artifactId>
junit
</artifactId>
<version>
0.6.7.2-RELEASE
</version>
<version>
4.12
</version>
<scope>
test
</scope>
<scope>
test
</scope>
<exclusions>
<exclusion>
<artifactId>
hamcrest-core
</artifactId>
<groupId>
org.hamcrest
</groupId>
</exclusion>
</exclusions>
</dependency>
</dependency>
<dependency>
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
<version>
4.13
</version>
<scope>
test
</scope>
</dependency>
<dependency>
<groupId>
org.projectlombok
</groupId>
<artifactId>
lombok
</artifactId>
<version>
1.18.20
</version>
<scope>
test
</scope>
</dependency>
<dependency>
<groupId>
org.springframework
</groupId>
<artifactId>
spring-test
</artifactId>
<version>
5.3.6
</version>
<scope>
test
</scope>
</dependency>
</dependencies>
</dependencies>
<properties>
<properties>
<project.reporting.outputEncoding>
UTF-8
</project.reporting.outputEncoding>
<project.reporting.outputEncoding>
UTF-8
</project.reporting.outputEncoding>
<spring.version>
4.2.2.RELEASE
</spring.version>
<log4j.version>
2.15.0
</log4j.version>
<project.build.sourceEncoding>
UTF-8
</project.build.sourceEncoding>
<project.build.sourceEncoding>
UTF-8
</project.build.sourceEncoding>
</properties>
</properties>
</project>
</project>
...
...
pom.xml
View file @
5448a5f4
...
@@ -43,10 +43,16 @@
...
@@ -43,10 +43,16 @@
<artifactId>
zhiwei-tools
</artifactId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.1.6-SNAPSHOT
</version>
<version>
0.1.6-SNAPSHOT
</version>
</dependency>
</dependency>
<dependency>
<!--<dependency>-->
<groupId>
com.zhiwei.crawler
</groupId>
<!--<groupId>com.zhiwei.crawler</groupId>-->
<artifactId>
crawler-core
</artifactId>
<!--<artifactId>crawler-core</artifactId>-->
<version>
0.6.7.4-SNAPSHOT
</version>
<!--<version>0.6.7.4-SNAPSHOT</version>-->
<!--</dependency>-->
<!-- http知微核心包 -->
<dependency>
<groupId>
com.zhiwei.http
</groupId>
<artifactId>
http-boot
</artifactId>
<version>
0.0.5.9-SNAPSHOT
</version>
</dependency>
</dependency>
<!-- https://mvnrepository.com/artifact/org.conscrypt/conscrypt-openjdk-uber -->
<!-- https://mvnrepository.com/artifact/org.conscrypt/conscrypt-openjdk-uber -->
<dependency>
<dependency>
...
...
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiboSuperTopic.java
View file @
5448a5f4
...
@@ -73,18 +73,17 @@ public class WeiboSuperTopic {
...
@@ -73,18 +73,17 @@ public class WeiboSuperTopic {
public
WeiboSuperTopic
()
{}
public
WeiboSuperTopic
()
{}
public
WeiboSuperTopic
(
String
url
,
String
topicName
,
Integer
rank
,
String
score
,
public
WeiboSuperTopic
(
String
url
,
String
topicName
,
Integer
rank
,
String
postNum
,
String
fensi
,
String
type
)
{
String
fensi
,
String
type
)
{
this
.
url
=
url
;
this
.
url
=
url
;
this
.
topicName
=
topicName
;
this
.
topicName
=
topicName
;
this
.
rank
=
rank
;
this
.
rank
=
rank
;
this
.
score
=
score
;
this
.
postNum
=
postNum
;
this
.
fensi
=
fensi
;
this
.
fensi
=
fensi
;
this
.
type
=
type
;
this
.
type
=
type
;
this
.
time
=
new
Date
();
this
.
time
=
new
Date
();
this
.
day
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
this
.
day
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
this
.
id
=
topicName
+
"_"
+
type
+
"_"
+
day
;
this
.
id
=
topicName
+
"_"
+
type
+
"_"
+
time
.
getTime
()
;
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
View file @
5448a5f4
...
@@ -4,18 +4,17 @@ import java.net.URLDecoder;
...
@@ -4,18 +4,17 @@ import java.net.URLDecoder;
import
java.time.Duration
;
import
java.time.Duration
;
import
java.util.*
;
import
java.util.*
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.jsoup.select.Elements
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
...
@@ -28,7 +27,8 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
...
@@ -28,7 +27,8 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
@Log4j2
@Log4j2
public
class
BaiDuHotSearchCrawler
{
public
class
BaiDuHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
//private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
/**
* @return void 返回类型
* @return void 返回类型
...
@@ -40,10 +40,12 @@ public class BaiDuHotSearchCrawler {
...
@@ -40,10 +40,12 @@ public class BaiDuHotSearchCrawler {
String
url
=
"http://top.baidu.com/buzz?b=1&fr=topindex"
;
String
url
=
"http://top.baidu.com/buzz?b=1&fr=topindex"
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
Exception
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
e
);
log
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"container-bg_lQ801"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"container-bg_lQ801"
))
{
return
ansysNewData
(
htmlBody
,
date
);
return
ansysNewData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/BiliComprehensiveHotCrawler.java
View file @
5448a5f4
...
@@ -2,17 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -2,17 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
import
java.text.DateFormat
;
import
java.text.DateFormat
;
import
java.text.SimpleDateFormat
;
import
java.text.SimpleDateFormat
;
import
java.util.*
;
import
java.util.*
;
...
@@ -27,7 +26,8 @@ import java.util.*;
...
@@ -27,7 +26,8 @@ import java.util.*;
@Log4j2
@Log4j2
public
class
BiliComprehensiveHotCrawler
{
public
class
BiliComprehensiveHotCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
//private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
build
();
/**
/**
...
@@ -53,10 +53,12 @@ public class BiliComprehensiveHotCrawler {
...
@@ -53,10 +53,12 @@ public class BiliComprehensiveHotCrawler {
for
(
int
i
=
0
;
i
<
urlList
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
urlList
.
size
();
i
++)
{
Request
request
=
RequestUtils
.
wrapGet
(
urlList
.
get
(
i
));
Request
request
=
RequestUtils
.
wrapGet
(
urlList
.
get
(
i
));
//发送请求每次获取20条数据
//发送请求每次获取20条数据
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
fmt
.
format
(
date
)+
":第"
+
i
+
1
+
"次请求解析B站综合热门时出现连接失败"
,
e
);
log
.
error
(
fmt
.
format
(
date
)+
":第"
+
i
+
1
+
"次请求解析B站综合热门时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
try
{
try
{
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/BililiCrawler.java
View file @
5448a5f4
...
@@ -2,16 +2,17 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -2,16 +2,17 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
...
@@ -20,12 +21,13 @@ import java.io.IOException;
...
@@ -20,12 +21,13 @@ import java.io.IOException;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Objects
;
import
java.util.concurrent.ExecutorService
;
import
java.util.concurrent.ExecutorService
;
import
java.util.concurrent.Executors
;
import
java.util.concurrent.Executors
;
@Log4j2
@Log4j2
public
class
BililiCrawler
{
public
class
BililiCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
/**
* B站排行榜的采集
* B站排行榜的采集
...
@@ -41,10 +43,12 @@ public class BililiCrawler {
...
@@ -41,10 +43,12 @@ public class BililiCrawler {
String
url
=
"https://api.bilibili.com/x/web-interface/ranking/v2?rid=0&type=all"
;
String
url
=
"https://api.bilibili.com/x/web-interface/ranking/v2?rid=0&type=all"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++){
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"B站排行榜页面连接失败"
,
e
.
fillInStackTrace
());
log
.
error
(
"B站排行榜页面连接失败"
,
cause
.
fillInStackTrace
());
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
try
{
try
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
...
@@ -129,8 +133,8 @@ public class BililiCrawler {
...
@@ -129,8 +133,8 @@ public class BililiCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
{
try
{
System
.
setProperty
(
"https.protocols"
,
"TLSv1,TLSv1.1,TLSv1.2,SSLv3"
);
System
.
setProperty
(
"https.protocols"
,
"TLSv1,TLSv1.1,TLSv1.2,SSLv3"
);
Response
response
=
httpBoot
.
syncCall
(
request
,
Proxy
Holder
.
NAT_HEAVY
_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
Proxy
Supplier
.
FOREIGN_INNER
_PROXY
);
String
htmlBody
=
response
.
body
().
s
tring
();
String
htmlBody
=
response
.
body
S
tring
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"v-wrap"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"v-wrap"
))
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
//获取标签
//获取标签
...
@@ -139,16 +143,20 @@ public class BililiCrawler {
...
@@ -139,16 +143,20 @@ public class BililiCrawler {
hotSearchList
.
setTag
(
tag
);
hotSearchList
.
setTag
(
tag
);
//获取粉丝数
//获取粉丝数
if
(
htmlBody
.
contains
(
"v_upinfo"
))
{
if
(
htmlBody
.
contains
(
"v_upinfo"
))
{
String
text
=
document
.
select
(
"div.follow-btn"
).
select
(
"span"
).
text
();
String
text
=
document
.
select
(
"div.follow-btn"
).
select
(
"span"
).
last
().
text
();
String
fan
=
text
.
split
(
" "
)[
2
];
if
(
StringUtils
.
isNotEmpty
(
text
)&&
Objects
.
nonNull
(
text
))
{
Long
fanCount
=
null
;
Long
fanCount
=
null
;
if
(
fan
.
contains
(
"万"
)){
if
(
text
.
contains
(
"关注"
)){
double
dou
=
Double
.
parseDouble
(
fan
.
replaceAll
(
"万"
,
" "
));
text
=
text
.
replaceAll
(
"关注"
,
" "
).
trim
();
fanCount
=
new
Double
(
dou
*
10000
).
longValue
();
}
}
else
{
if
(
text
.
contains
(
"万"
))
{
fanCount
=
Long
.
valueOf
(
fan
);
double
dou
=
Double
.
parseDouble
(
text
.
replaceAll
(
"万"
,
" "
).
trim
());
fanCount
=
new
Double
(
dou
*
10000
).
longValue
();
}
else
{
fanCount
=
Long
.
valueOf
(
text
);
}
hotSearchList
.
setFans
(
fanCount
);
}
}
hotSearchList
.
setFans
(
fanCount
);
}
}
return
hotSearchList
;
return
hotSearchList
;
}
else
{
}
else
{
...
@@ -173,10 +181,12 @@ public class BililiCrawler {
...
@@ -173,10 +181,12 @@ public class BililiCrawler {
String
url
=
"https://app.biliapi.com/x/v2/search/square?build=616050&limit=10"
;
String
url
=
"https://app.biliapi.com/x/v2/search/square?build=616050&limit=10"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++){
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"B站热搜页面连接失败"
,
e
.
fillInStackTrace
());
log
.
error
(
"B站热搜页面连接失败"
,
cause
.
fillInStackTrace
());
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
dataJson
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"data"
).
getJSONObject
(
0
).
getJSONObject
(
"data"
).
getJSONArray
(
"list"
);
dataJson
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"data"
).
getJSONObject
(
0
).
getJSONObject
(
"data"
).
getJSONArray
(
"list"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
View file @
5448a5f4
...
@@ -5,17 +5,17 @@ import java.util.ArrayList;
...
@@ -5,17 +5,17 @@ import java.util.ArrayList;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
...
@@ -30,7 +30,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
...
@@ -30,7 +30,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchType;
@Log4j2
@Log4j2
public
class
DouyinHotSearchCrawler
{
public
class
DouyinHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
build
();
public
static
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
public
static
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
...
@@ -46,10 +46,12 @@ public class DouyinHotSearchCrawler {
...
@@ -46,10 +46,12 @@ public class DouyinHotSearchCrawler {
String
url
=
"https://api.amemv.com/aweme/v1/hot/search/list/"
;
String
url
=
"https://api.amemv.com/aweme/v1/hot/search/list/"
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
debug
(
"获取抖音热搜榜时出现问题:{}"
,
e
);
log
.
debug
(
"获取抖音热搜榜时出现问题:{}"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"word_list"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"word_list"
))
{
list
=
new
ArrayList
<>();
list
=
new
ArrayList
<>();
...
@@ -87,10 +89,12 @@ public class DouyinHotSearchCrawler {
...
@@ -87,10 +89,12 @@ public class DouyinHotSearchCrawler {
String
resultUrl
=
null
;
String
resultUrl
=
null
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
debug
(
"获取抖音热搜榜链接时出现问题:{}"
,
e
);
log
.
debug
(
"获取抖音热搜榜链接时出现问题:{}"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"aweme_list"
)){
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"aweme_list"
)){
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"aweme_list"
);
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"aweme_list"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/FengHuangSearchCrawler.java
View file @
5448a5f4
...
@@ -2,15 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -2,15 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
import
java.io.IOException
;
...
@@ -20,7 +21,7 @@ import java.util.List;
...
@@ -20,7 +21,7 @@ import java.util.List;
@Log4j2
@Log4j2
public
class
FengHuangSearchCrawler
{
public
class
FengHuangSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
build
();
/**
/**
* 获取凤凰新闻热榜
* 获取凤凰新闻热榜
...
@@ -33,10 +34,12 @@ public class FengHuangSearchCrawler {
...
@@ -33,10 +34,12 @@ public class FengHuangSearchCrawler {
String
url
=
"https://nine.ifeng.com/hotspotlist?gv=7.9.1&page="
+
page
;
String
url
=
"https://nine.ifeng.com/hotspotlist?gv=7.9.1&page="
+
page
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"凤凰新闻热榜页面连接异常..."
,
e
);
log
.
error
(
"凤凰新闻热榜页面连接异常..."
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONArray
(
"list"
);
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONArray
(
"list"
);
...
@@ -71,10 +74,12 @@ public class FengHuangSearchCrawler {
...
@@ -71,10 +74,12 @@ public class FengHuangSearchCrawler {
String
url
=
"https://shankapi.ifeng.com/autumn/sogouSearchHotword"
;
String
url
=
"https://shankapi.ifeng.com/autumn/sogouSearchHotword"
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"凤凰新闻热搜页面连接异常..."
,
e
);
log
.
error
(
"凤凰新闻热搜页面连接异常..."
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"data"
).
getJSONObject
(
0
).
getJSONArray
(
"item"
);
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"data"
).
getJSONObject
(
0
).
getJSONArray
(
"item"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/HotSearch36KrCrawler.java
View file @
5448a5f4
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
...
@@ -26,7 +27,7 @@ import java.util.*;
...
@@ -26,7 +27,7 @@ import java.util.*;
@Log4j2
@Log4j2
public
class
HotSearch36KrCrawler
{
public
class
HotSearch36KrCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
/**
* @return void 返回类型
* @return void 返回类型
...
@@ -38,13 +39,13 @@ public class HotSearch36KrCrawler {
...
@@ -38,13 +39,13 @@ public class HotSearch36KrCrawler {
String
url
=
"https://www.36kr.com/hot-list/catalog"
;
String
url
=
"https://www.36kr.com/hot-list/catalog"
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
if
(
response
.
hasCause
())
{
htmlBody
=
response
.
body
().
string
();
Throwable
cause
=
response
.
cause
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析36Kr人气榜时出现解析错误,页面结构有问题"
,
cause
);
log
.
error
(
"解析36Kr人气榜时出现解析错误,页面结构有问题"
,
e
);
}
else
{
}
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"article-list"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"article-list"
))
{
return
ansysData
(
htmlBody
,
date
);
return
ansysData
(
htmlBody
,
date
);
}
else
{
}
else
{
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/HuXiuHotSearchCrawler.java
View file @
5448a5f4
...
@@ -2,14 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -2,14 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.nodes.Element
;
...
@@ -27,17 +29,19 @@ import java.util.*;
...
@@ -27,17 +29,19 @@ import java.util.*;
*/
*/
@Log4j2
@Log4j2
public
class
HuXiuHotSearchCrawler
{
public
class
HuXiuHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
public
static
List
<
HotSearchList
>
HuXiuHotArticleRecommended
(
Date
date
){
public
static
List
<
HotSearchList
>
HuXiuHotArticleRecommended
(
Date
date
){
String
url
=
"https://www.huxiu.com/"
;
String
url
=
"https://www.huxiu.com/"
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
Exception
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析虎嗅热文推荐时出现解析错误,页面结构有问题"
,
e
);
log
.
error
(
"解析虎嗅热文推荐时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"hot__list"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"hot__list"
))
{
return
ansysData
(
htmlBody
,
date
);
return
ansysData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/KuaiShouHotSearchCrawler.java
View file @
5448a5f4
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.time.Duration
;
import
java.time.Duration
;
import
java.util.*
;
import
java.util.*
;
...
@@ -22,7 +24,7 @@ import java.util.*;
...
@@ -22,7 +24,7 @@ import java.util.*;
*/
*/
@Log4j2
@Log4j2
public
class
KuaiShouHotSearchCrawler
{
public
class
KuaiShouHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
/**
* @return void 返回类型
* @return void 返回类型
...
@@ -34,11 +36,12 @@ public class KuaiShouHotSearchCrawler {
...
@@ -34,11 +36,12 @@ public class KuaiShouHotSearchCrawler {
String
url
=
"https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791"
;
String
url
=
"https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791"
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
if
(
response
.
hasCause
()){
htmlBody
=
response
.
body
().
string
();
Throwable
cause
=
response
.
cause
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析快手热榜时出现解析错误,页面结构有问题"
,
cause
);
log
.
error
(
"解析快手热榜时出现解析错误,页面结构有问题"
,
e
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"APOLLO_STATE"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"APOLLO_STATE"
))
{
return
ansysData
(
htmlBody
,
date
);
return
ansysData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/MaiMaiHotSearchCrawler.java
View file @
5448a5f4
...
@@ -2,15 +2,17 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -2,15 +2,17 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
import
java.io.IOException
;
...
@@ -21,7 +23,7 @@ import java.util.List;
...
@@ -21,7 +23,7 @@ import java.util.List;
@Log4j2
@Log4j2
public
class
MaiMaiHotSearchCrawler
{
public
class
MaiMaiHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
build
();
/**
/**
* 获取maimai热榜
* 获取maimai热榜
...
@@ -33,10 +35,12 @@ public class MaiMaiHotSearchCrawler {
...
@@ -33,10 +35,12 @@ public class MaiMaiHotSearchCrawler {
String
url
=
"https://open.taou.com/maimai/feed/v6/hot_posts_list?tab=profession&count=15&version=5.3.34&u=232258287&access_token=1.4c82e8ad6d6b4e03262a48f334dea336"
;
String
url
=
"https://open.taou.com/maimai/feed/v6/hot_posts_list?tab=profession&count=15&version=5.3.34&u=232258287&access_token=1.4c82e8ad6d6b4e03262a48f334dea336"
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"脉脉热榜页面连接异常..."
,
e
);
log
.
error
(
"脉脉热榜页面连接异常..."
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
//1024 - 26(时间戳+type) = 998 -> name.getBytes(StandardCharsets.UTF_8).length<998 -> 998/3 = 332
//1024 - 26(时间戳+type) = 998 -> name.getBytes(StandardCharsets.UTF_8).length<998 -> 998/3 = 332
int
nameLengthMax
=
300
;
int
nameLengthMax
=
300
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
View file @
5448a5f4
...
@@ -4,10 +4,13 @@ import java.util.*;
...
@@ -4,10 +4,13 @@ import java.util.*;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
...
@@ -15,9 +18,6 @@ import org.jsoup.nodes.Element;
...
@@ -15,9 +18,6 @@ import org.jsoup.nodes.Element;
import
org.jsoup.select.Elements
;
import
org.jsoup.select.Elements
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
...
@@ -31,7 +31,7 @@ import com.zhiwei.tools.httpclient.HeaderTool;
...
@@ -31,7 +31,7 @@ import com.zhiwei.tools.httpclient.HeaderTool;
@Log4j2
@Log4j2
public
class
SougoHotSearchCrawler
{
public
class
SougoHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
build
();
/**
/**
* @Title: SougoHotSearchTest
* @Title: SougoHotSearchTest
...
@@ -46,10 +46,12 @@ public class SougoHotSearchCrawler {
...
@@ -46,10 +46,12 @@ public class SougoHotSearchCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headMap
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headMap
);
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
String
htmlBody
=
null
;
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
Exception
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
e
);
log
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"topwords"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"topwords"
))
{
try
{
try
{
...
@@ -108,10 +110,12 @@ public class SougoHotSearchCrawler {
...
@@ -108,10 +110,12 @@ public class SougoHotSearchCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headMap
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headMap
);
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
String
htmlBody
=
null
;
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
Exception
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
e
);
log
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"data"
);
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"data"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/SouhuTopicCrawler.java
View file @
5448a5f4
...
@@ -2,19 +2,18 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -2,19 +2,18 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Collections
;
import
java.util.Date
;
import
java.util.Date
;
...
@@ -22,7 +21,7 @@ import java.util.List;
...
@@ -22,7 +21,7 @@ import java.util.List;
@Log4j2
@Log4j2
public
class
SouhuTopicCrawler
{
public
class
SouhuTopicCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
public
static
List
<
HotSearchList
>
getSouhuTopic
(
Date
date
){
public
static
List
<
HotSearchList
>
getSouhuTopic
(
Date
date
){
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
List
<
HotSearchList
>
hotSearchLists
=
new
ArrayList
<>();
...
@@ -32,10 +31,12 @@ public class SouhuTopicCrawler {
...
@@ -32,10 +31,12 @@ public class SouhuTopicCrawler {
String
url
=
"https://api.k.sohu.com/api/news/moment/v2/list.go?pageSize=50&v=6.4.4"
;
String
url
=
"https://api.k.sohu.com/api/news/moment/v2/list.go?pageSize=50&v=6.4.4"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++){
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"搜狐话题页面连接失败"
,
e
.
fillInStackTrace
());
log
.
error
(
"搜狐话题页面连接失败"
,
cause
.
fillInStackTrace
());
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
)){
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/TaoBaoHotSearchCrawler.java
View file @
5448a5f4
...
@@ -2,15 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -2,15 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.MD5Util
;
import
com.zhiwei.searchhotcrawler.util.MD5Util
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.time.Duration
;
import
java.time.Duration
;
import
java.util.*
;
import
java.util.*
;
...
@@ -23,7 +24,7 @@ import java.util.*;
...
@@ -23,7 +24,7 @@ import java.util.*;
*/
*/
@Log4j2
@Log4j2
public
class
TaoBaoHotSearchCrawler
{
public
class
TaoBaoHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
public
static
List
<
HotSearchList
>
taoBaoHotSearch
(
Date
date
)
{
public
static
List
<
HotSearchList
>
taoBaoHotSearch
(
Date
date
)
{
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
...
@@ -34,8 +35,12 @@ public class TaoBaoHotSearchCrawler {
...
@@ -34,8 +35,12 @@ public class TaoBaoHotSearchCrawler {
String
urls
=
"https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t="
+
time
+
"&sign=&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"
;
String
urls
=
"https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t="
+
time
+
"&sign=&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"
;
Request
request1
=
RequestUtils
.
wrapGet
(
urls
);
Request
request1
=
RequestUtils
.
wrapGet
(
urls
);
String
token
=
null
;
String
token
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request1
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request1
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
List
<
String
>
values
=
response
.
networkResponse
().
headers
().
values
(
"Set-Cookie"
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
List
<
String
>
values
=
response
.
headers
().
values
(
"Set-Cookie"
);
String
tk
=
values
.
get
(
1
);
String
tk
=
values
.
get
(
1
);
String
[]
splitTk
=
tk
.
split
(
";"
);
String
[]
splitTk
=
tk
.
split
(
";"
);
String
_m_h5_tk
=
splitTk
[
0
];
String
_m_h5_tk
=
splitTk
[
0
];
...
@@ -44,18 +49,18 @@ public class TaoBaoHotSearchCrawler {
...
@@ -44,18 +49,18 @@ public class TaoBaoHotSearchCrawler {
String
[]
splitEnc
=
enc
.
split
(
";"
);
String
[]
splitEnc
=
enc
.
split
(
";"
);
String
_m_h5_tk_enc
=
splitEnc
[
0
];
String
_m_h5_tk_enc
=
splitEnc
[
0
];
headerMap
.
put
(
"cookie"
,
_m_h5_tk
+
";"
+
_m_h5_tk_enc
);
headerMap
.
put
(
"cookie"
,
_m_h5_tk
+
";"
+
_m_h5_tk_enc
);
}
catch
(
Exception
e
)
{
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
e
);
}
}
String
signs
=
token
+
"&"
+
time
+
"&12574478&{\"appId\":\"10211\",\"params\":\"{\\\"multi_hintq_show\\\":\\\"on\\\",\\\"src\\\":\\\"c2c\\\",\\\"area\\\":\\\"active_page\\\",\\\"sversion\\\":\\\"7.5\\\",\\\"bangdan_src\\\":\\\"list\\\"}\"}"
;
String
signs
=
token
+
"&"
+
time
+
"&12574478&{\"appId\":\"10211\",\"params\":\"{\\\"multi_hintq_show\\\":\\\"on\\\",\\\"src\\\":\\\"c2c\\\",\\\"area\\\":\\\"active_page\\\",\\\"sversion\\\":\\\"7.5\\\",\\\"bangdan_src\\\":\\\"list\\\"}\"}"
;
String
sign
=
MD5Util
.
getMD5
(
signs
).
toLowerCase
();
String
sign
=
MD5Util
.
getMD5
(
signs
).
toLowerCase
();
String
url
=
"https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t="
+
time
+
"&sign="
+
sign
+
"&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"
;
String
url
=
"https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t="
+
time
+
"&sign="
+
sign
+
"&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response1
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response1
.
hasCause
()){
Throwable
cause
=
response1
.
cause
();
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response1
.
bodyString
();
ht
=
!
htmlBody
.
contains
(
"非法请求"
);
ht
=
!
htmlBody
.
contains
(
"非法请求"
);
}
catch
(
Exception
e
)
{
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
e
);
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
return
ansysData
(
htmlBody
,
date
);
return
ansysData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/TengXunCrawler.java
View file @
5448a5f4
...
@@ -2,15 +2,15 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -2,15 +2,15 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
...
@@ -20,7 +20,7 @@ import java.util.List;
...
@@ -20,7 +20,7 @@ import java.util.List;
@Log4j2
@Log4j2
public
class
TengXunCrawler
{
public
class
TengXunCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
/**
* 腾讯热榜数据采集
* 腾讯热榜数据采集
...
@@ -35,10 +35,12 @@ public class TengXunCrawler {
...
@@ -35,10 +35,12 @@ public class TengXunCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//采集为空最多重试3次
//采集为空最多重试3次
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++)
{
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
response
.
cause
().
printStackTrace
();
e
.
printStackTrace
();
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"idlist"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"idlist"
))
{
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
...
@@ -96,10 +98,12 @@ public class TengXunCrawler {
...
@@ -96,10 +98,12 @@ public class TengXunCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//采集为空最多重试3次
//采集为空最多重试3次
for
(
int
t
=
0
;
t
<
3
;
t
++)
{
for
(
int
t
=
0
;
t
<
3
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
response
.
cause
().
printStackTrace
();
e
.
printStackTrace
();
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
)){
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
)){
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"data"
);
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"data"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
View file @
5448a5f4
...
@@ -3,9 +3,11 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -3,9 +3,11 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
...
@@ -14,7 +16,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
...
@@ -14,7 +16,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
...
@@ -34,7 +36,7 @@ import java.util.*;
...
@@ -34,7 +36,7 @@ import java.util.*;
*/
*/
@Log4j2
@Log4j2
public
class
ToutiaoHotSearchCrawler
{
public
class
ToutiaoHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
/**
* @Title: weiboHotSearchByPhoneTest
* @Title: weiboHotSearchByPhoneTest
...
@@ -47,10 +49,12 @@ public class ToutiaoHotSearchCrawler {
...
@@ -47,10 +49,12 @@ public class ToutiaoHotSearchCrawler {
String
jsUrl
=
"https://s3.pstatp.com/toutiao/feoffline/hot_list/resource/hot_list/js/index.45f50250.chunk.js"
;
String
jsUrl
=
"https://s3.pstatp.com/toutiao/feoffline/hot_list/resource/hot_list/js/index.45f50250.chunk.js"
;
Request
jsRequest
=
RequestUtils
.
wrapGet
(
jsUrl
);
Request
jsRequest
=
RequestUtils
.
wrapGet
(
jsUrl
);
String
jsBody
=
null
;
String
jsBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
jsRequest
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
jsRequest
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
jsBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"获取今日头条实时热搜头部信息标识失败"
,
e
);
log
.
error
(
"获取今日头条实时热搜头部信息标识失败"
,
cause
);
}
else
{
jsBody
=
response
.
bodyString
();
}
}
if
(
jsBody
!=
null
&&
jsBody
.
contains
(
"origin"
)){
if
(
jsBody
!=
null
&&
jsBody
.
contains
(
"origin"
)){
String
s
=
jsBody
.
substring
(
jsBody
.
indexOf
(
"origin:"
)+
"origin:"
.
length
());
String
s
=
jsBody
.
substring
(
jsBody
.
indexOf
(
"origin:"
)+
"origin:"
.
length
());
...
@@ -61,10 +65,12 @@ public class ToutiaoHotSearchCrawler {
...
@@ -61,10 +65,12 @@ public class ToutiaoHotSearchCrawler {
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++){
for
(
int
count
=
0
;
count
<=
5
;
count
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response1
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response1
.
hasCause
()){
}
catch
(
IOException
e1
)
{
Throwable
cause
=
response1
.
cause
();
log
.
error
(
"解析今日头条实时热搜时出现连接失败"
,
e1
);
log
.
error
(
"解析今日头条实时热搜时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response1
.
bodyString
();
}
}
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
...
@@ -159,10 +165,12 @@ public class ToutiaoHotSearchCrawler {
...
@@ -159,10 +165,12 @@ public class ToutiaoHotSearchCrawler {
String
url
=
hotSearchList
.
getUrl
();
String
url
=
hotSearchList
.
getUrl
();
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
i
=
0
;
i
<=
5
;
i
++)
{
for
(
int
i
=
0
;
i
<=
5
;
i
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e1
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析今日头条热搜详情页面出现连接失败"
,
e1
);
log
.
error
(
"解析今日头条热搜详情页面出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WangYiHotSearchCrawler.java
View file @
5448a5f4
...
@@ -2,16 +2,18 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -2,16 +2,18 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
...
@@ -26,7 +28,7 @@ import java.util.List;
...
@@ -26,7 +28,7 @@ import java.util.List;
*/
*/
@Log4j2
@Log4j2
public
class
WangYiHotSearchCrawler
{
public
class
WangYiHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
build
();
/**
/**
* 网易新闻实时热榜的采集
* 网易新闻实时热榜的采集
...
@@ -39,10 +41,12 @@ public class WangYiHotSearchCrawler {
...
@@ -39,10 +41,12 @@ public class WangYiHotSearchCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
for
(
int
t
=
0
;
t
<
3
;
t
++)
{
for
(
int
t
=
0
;
t
<
3
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"网易新闻实时热榜页面连接异常..."
,
e
);
log
.
error
(
"网易新闻实时热榜页面连接异常..."
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
bodyObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
JSONObject
bodyObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
...
@@ -80,10 +84,12 @@ public class WangYiHotSearchCrawler {
...
@@ -80,10 +84,12 @@ public class WangYiHotSearchCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
for
(
int
t
=
0
;
t
<
3
;
t
++)
{
for
(
int
t
=
0
;
t
<
3
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"网易新闻跟贴热议页面连接异常..."
,
e
);
log
.
error
(
"网易新闻跟贴热议页面连接异常..."
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
bodyObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
JSONObject
bodyObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiBoSearchBoxHotWordsCrawler.java
View file @
5448a5f4
...
@@ -2,14 +2,15 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -2,14 +2,15 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.WeiBoSearchBoxHotWords
;
import
com.zhiwei.searchhotcrawler.bean.WeiBoSearchBoxHotWords
;
import
com.zhiwei.searchhotcrawler.dao.WeiBoSearchBoxHotWordsDao
;
import
com.zhiwei.searchhotcrawler.dao.WeiBoSearchBoxHotWordsDao
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.time.Duration
;
import
java.time.Duration
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
...
@@ -26,7 +27,7 @@ import java.util.Objects;
...
@@ -26,7 +27,7 @@ import java.util.Objects;
*/
*/
@Log4j2
@Log4j2
public
class
WeiBoSearchBoxHotWordsCrawler
{
public
class
WeiBoSearchBoxHotWordsCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
(
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
static
WeiBoSearchBoxHotWordsDao
weiBoSearchDao
=
new
WeiBoSearchBoxHotWordsDao
();
static
WeiBoSearchBoxHotWordsDao
weiBoSearchDao
=
new
WeiBoSearchBoxHotWordsDao
();
public
static
void
weiBoSearchBoxHotWords
(
Date
date
){
public
static
void
weiBoSearchBoxHotWords
(
Date
date
){
...
@@ -35,11 +36,12 @@ public class WeiBoSearchBoxHotWordsCrawler {
...
@@ -35,11 +36,12 @@ public class WeiBoSearchBoxHotWordsCrawler {
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博搜索框热词时出现解析错误,页面结构有问题"
,
cause
);
log
.
error
(
"解析微博搜索框热词时出现解析错误,页面结构有问题"
,
e
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"hotwords"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"hotwords"
))
{
int
num
=
ansysData
(
htmlBody
,
date
);
int
num
=
ansysData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboEntertainmentCrawler.java
View file @
5448a5f4
...
@@ -2,17 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -2,17 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
import
java.util.*
;
import
java.util.*
;
/**
/**
...
@@ -24,7 +23,7 @@ import java.util.*;
...
@@ -24,7 +23,7 @@ import java.util.*;
@Log4j2
@Log4j2
public
class
WeiboEntertainmentCrawler
{
public
class
WeiboEntertainmentCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
/**
* @return void 返回类型
* @return void 返回类型
...
@@ -38,10 +37,12 @@ public class WeiboEntertainmentCrawler {
...
@@ -38,10 +37,12 @@ public class WeiboEntertainmentCrawler {
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博娱乐榜时出现连接失败"
,
e
);
log
.
error
(
"解析微博娱乐榜时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
List
<
HotSearchList
>
result
=
new
ArrayList
();
List
<
HotSearchList
>
result
=
new
ArrayList
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
5448a5f4
...
@@ -9,6 +9,10 @@ import java.util.*;
...
@@ -9,6 +9,10 @@ import java.util.*;
import
java.util.stream.Collectors
;
import
java.util.stream.Collectors
;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSON
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.*
;
import
com.zhiwei.searchhotcrawler.bean.*
;
import
com.zhiwei.searchhotcrawler.config.RedisConfig
;
import
com.zhiwei.searchhotcrawler.config.RedisConfig
;
import
com.zhiwei.searchhotcrawler.dao.RedisDao
;
import
com.zhiwei.searchhotcrawler.dao.RedisDao
;
...
@@ -17,7 +21,6 @@ import com.zhiwei.searchhotcrawler.dao.WeiBoUserDao;
...
@@ -17,7 +21,6 @@ import com.zhiwei.searchhotcrawler.dao.WeiBoUserDao;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.bson.Document
;
import
org.bson.Document
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
...
@@ -26,9 +29,6 @@ import org.jsoup.select.Elements;
...
@@ -26,9 +29,6 @@ import org.jsoup.select.Elements;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.mail.SendMailWeibo
;
import
com.zhiwei.searchhotcrawler.mail.SendMailWeibo
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
...
@@ -45,7 +45,7 @@ import static java.util.Objects.nonNull;
...
@@ -45,7 +45,7 @@ import static java.util.Objects.nonNull;
@Log4j2
@Log4j2
public
class
WeiboHotSearchCrawler
{
public
class
WeiboHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
private
static
RedisDao
redisDao
=
new
RedisDao
();
private
static
RedisDao
redisDao
=
new
RedisDao
();
...
@@ -66,14 +66,15 @@ public class WeiboHotSearchCrawler {
...
@@ -66,14 +66,15 @@ public class WeiboHotSearchCrawler {
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
Exception
e
)
{
if
(
i
==
2
)
{
if
(
i
==
2
)
{
return
list
;
return
list
;
}
else
{
}
else
{
continue
;
continue
;
}
}
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"pl_top_realtimehot"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"pl_top_realtimehot"
))
{
try
{
try
{
...
@@ -261,10 +262,12 @@ public class WeiboHotSearchCrawler {
...
@@ -261,10 +262,12 @@ public class WeiboHotSearchCrawler {
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博时热搜时出现连接失败"
,
e
);
log
.
error
(
"解析微博时热搜时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
...
@@ -349,10 +352,12 @@ public class WeiboHotSearchCrawler {
...
@@ -349,10 +352,12 @@ public class WeiboHotSearchCrawler {
String
url
=
"https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot"
;
String
url
=
"https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot"
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博热搜时出现连接失败"
,
e
);
log
.
error
(
"解析微博热搜时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
List
<
HotSearchList
>
result
=
new
ArrayList
<>();
List
<
HotSearchList
>
result
=
new
ArrayList
<>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
...
@@ -405,10 +410,12 @@ public class WeiboHotSearchCrawler {
...
@@ -405,10 +410,12 @@ public class WeiboHotSearchCrawler {
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
e
);
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONObject
(
"cardlistInfo"
);
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONObject
(
"cardlistInfo"
);
...
@@ -500,10 +507,12 @@ public class WeiboHotSearchCrawler {
...
@@ -500,10 +507,12 @@ public class WeiboHotSearchCrawler {
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"Cookie"
,
"SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN"
);
headerMap
.
put
(
"Cookie"
,
"SUB=_2AkMWEQNHf8NxqwFRmPwdzmrnaYl_zgzEieKgTfKcJRMxHRl-yT9jqmkjtRB6PZEtqE0muNq5OZJPytvesIwD-Kh1dwIz; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFKfwIoPvvYaew277IR3CUN"
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博时热搜时出现连接失败"
,
e
);
log
.
error
(
"解析微博时热搜时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"m-main"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"m-main"
))
{
Document
docm
=
new
Document
();
Document
docm
=
new
Document
();
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboNewsCrawler.java
View file @
5448a5f4
...
@@ -2,14 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -2,14 +2,16 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
import
java.io.IOException
;
...
@@ -25,7 +27,7 @@ import java.util.*;
...
@@ -25,7 +27,7 @@ import java.util.*;
@Log4j2
@Log4j2
public
class
WeiboNewsCrawler
{
public
class
WeiboNewsCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
/**
...
@@ -46,11 +48,13 @@ public class WeiboNewsCrawler {
...
@@ -46,11 +48,13 @@ public class WeiboNewsCrawler {
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
List
<
HotSearchList
>
result
=
new
ArrayList
();
List
<
HotSearchList
>
result
=
new
ArrayList
();
//发送第一次请求获取前20条数据
//发送第一次请求获取前20条数据
try
(
Response
response
=
httpBoot
.
syncCall
(
request1
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request1
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"第一次请求解析微博要闻榜时出现连接失败"
,
e
);
log
.
error
(
"第一次请求解析微博要闻榜时出现连接失败"
,
caus
e
);
continue
;
continue
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
try
{
try
{
...
@@ -66,11 +70,13 @@ public class WeiboNewsCrawler {
...
@@ -66,11 +70,13 @@ public class WeiboNewsCrawler {
continue
;
continue
;
}
}
//发送第二次请求获取中间20条数据
//发送第二次请求获取中间20条数据
try
(
Response
response
=
httpBoot
.
syncCall
(
request2
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response1
=
httpBoot
.
syncCall
(
request2
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response1
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response1
.
cause
();
log
.
error
(
"第二次请求解析微博要闻榜时出现连接失败"
,
e
);
log
.
error
(
"第二次请求解析微博要闻榜时出现连接失败"
,
caus
e
);
continue
;
continue
;
}
else
{
htmlBody
=
response1
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
try
{
try
{
...
@@ -87,11 +93,13 @@ public class WeiboNewsCrawler {
...
@@ -87,11 +93,13 @@ public class WeiboNewsCrawler {
continue
;
continue
;
}
}
//发送第三次请求获取最后10条数据
//发送第三次请求获取最后10条数据
try
(
Response
response
=
httpBoot
.
syncCall
(
request3
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response2
=
httpBoot
.
syncCall
(
request3
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response2
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response2
.
cause
();
log
.
error
(
"第三次请求解析微博要闻榜时出现连接失败"
,
e
);
log
.
error
(
"第三次请求解析微博要闻榜时出现连接失败"
,
caus
e
);
continue
;
continue
;
}
else
{
htmlBody
=
response2
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
try
{
try
{
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboSuperTopicCrawler.java
View file @
5448a5f4
...
@@ -8,18 +8,19 @@ import java.util.Map;
...
@@ -8,18 +8,19 @@ import java.util.Map;
import
java.util.Map.Entry
;
import
java.util.Map.Entry
;
import
java.util.Objects
;
import
java.util.Objects
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
/**
/**
*
*
...
@@ -31,7 +32,7 @@ import com.zhiwei.crawler.core.utils.RequestUtils;
...
@@ -31,7 +32,7 @@ import com.zhiwei.crawler.core.utils.RequestUtils;
@Log4j2
@Log4j2
public
class
WeiboSuperTopicCrawler
{
public
class
WeiboSuperTopicCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
private
static
Map
<
String
,
String
>
headMap
=
new
HashMap
<>();
private
static
Map
<
String
,
String
>
headMap
=
new
HashMap
<>();
static
{
static
{
...
@@ -63,13 +64,15 @@ public class WeiboSuperTopicCrawler {
...
@@ -63,13 +64,15 @@ public class WeiboSuperTopicCrawler {
String
htmlBody
=
null
;
String
htmlBody
=
null
;
//重试三次
//重试三次
for
(
int
retryTimes
=
1
;
retryTimes
<=
3
;
retryTimes
++)
{
for
(
int
retryTimes
=
1
;
retryTimes
<=
3
;
retryTimes
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
Exception
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"获取榜单列表页面时出现错误,错误为:{}"
,
e
);
log
.
error
(
"获取榜单列表页面时出现错误,错误为:{}"
,
caus
e
);
continue
;
continue
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"d
esc1
"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"d
ata
"
))
{
topicList
.
addAll
(
parseTopicRankHtml
(
page
,
htmlBody
,
type
));
topicList
.
addAll
(
parseTopicRankHtml
(
page
,
htmlBody
,
type
));
break
;
break
;
}
else
{
}
else
{
...
@@ -99,21 +102,18 @@ public class WeiboSuperTopicCrawler {
...
@@ -99,21 +102,18 @@ public class WeiboSuperTopicCrawler {
Integer
toprank
=
null
;
Integer
toprank
=
null
;
String
topicName
=
null
;
String
topicName
=
null
;
String
id
=
null
;
String
id
=
null
;
String
score
=
null
;
String
postNum
=
null
;
String
desc1
=
null
;
String
fensi
=
null
;
String
fensi
=
null
;
String
url
=
null
;
String
url
=
null
;
for
(
int
i
=
0
;
i
<
list
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
list
.
size
();
i
++)
{
JSONObject
data
=
list
.
getJSONObject
(
i
);
JSONObject
data
=
list
.
getJSONObject
(
i
);
toprank
=
page
+
data
.
getInteger
(
"toprank"
)
;
toprank
=
++
page
;
topicName
=
data
.
getString
(
"display_name"
);
topicName
=
data
.
getString
(
"display_name"
);
id
=
data
.
getString
(
"page_id"
);
id
=
data
.
getString
(
"page_id"
);
score
=
data
.
getString
(
"score"
);
postNum
=
data
.
getString
(
"status_count"
);
desc1
=
data
.
getString
(
"desc1"
);
fensi
=
data
.
getString
(
"fans_count"
);
fensi
=
desc1
.
replaceAll
(
".*影响力|粉丝"
,
""
).
trim
();
url
=
data
.
getString
(
"link"
);
url
=
data
.
getString
(
"link"
);
WeiboSuperTopic
topic
=
new
WeiboSuperTopic
(
url
,
topicName
,
toprank
,
postNum
,
fensi
,
type
);
WeiboSuperTopic
topic
=
new
WeiboSuperTopic
(
url
,
topicName
,
toprank
,
score
,
fensi
,
type
);
topic
=
getTopicInfo
(
id
,
topic
);
topic
=
getTopicInfo
(
id
,
topic
);
topicList
.
add
(
topic
);
topicList
.
add
(
topic
);
}
}
...
@@ -140,17 +140,19 @@ public class WeiboSuperTopicCrawler {
...
@@ -140,17 +140,19 @@ public class WeiboSuperTopicCrawler {
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid="
+
id
;
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid="
+
id
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
Exception
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析榜单详情页面时出现错误,错误为:{}"
,
e
);
log
.
error
(
"解析榜单详情页面时出现错误,错误为:{}"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"desc_more"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"desc_more"
))
{
String
descMore
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONObject
(
"pageInfo"
).
getJSONArray
(
"desc_more"
).
getString
(
0
);
String
descMore
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONObject
(
"pageInfo"
).
getJSONArray
(
"desc_more"
).
getString
(
0
);
if
(
StringUtils
.
isNotBlank
(
descMore
))
{
if
(
StringUtils
.
isNotBlank
(
descMore
))
{
String
readNum
=
descMore
.
replaceAll
(
"阅读|帖子.*"
,
""
).
trim
();
String
readNum
=
descMore
.
replaceAll
(
"阅读|帖子.*"
,
""
).
trim
();
String
postNum
=
descMore
.
replaceAll
(
".*帖子|粉丝.*"
,
""
).
trim
();
//
String postNum = descMore.replaceAll(".*帖子|粉丝.*", "").trim();
topic
.
setPostNum
(
postNum
);
//
topic.setPostNum(postNum);
topic
.
setReadNum
(
readNum
);
topic
.
setReadNum
(
readNum
);
return
topic
;
return
topic
;
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
View file @
5448a5f4
...
@@ -3,9 +3,10 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -3,9 +3,10 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
...
@@ -13,7 +14,6 @@ import com.zhiwei.searchhotcrawler.util.TipsUtils;
...
@@ -13,7 +14,6 @@ import com.zhiwei.searchhotcrawler.util.TipsUtils;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.nodes.Element
;
...
@@ -31,7 +31,7 @@ import java.util.regex.Pattern;
...
@@ -31,7 +31,7 @@ import java.util.regex.Pattern;
*/
*/
@Log4j2
@Log4j2
public
class
WeiboTopicCrawler
{
public
class
WeiboTopicCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
private
static
Map
<
String
,
String
>
headMap
=
new
HashMap
<>();
private
static
Map
<
String
,
String
>
headMap
=
new
HashMap
<>();
static
{
static
{
...
@@ -137,11 +137,13 @@ public class WeiboTopicCrawler {
...
@@ -137,11 +137,13 @@ public class WeiboTopicCrawler {
String
htmlBody
=
null
;
String
htmlBody
=
null
;
//重试三次
//重试三次
for
(
int
retryTimes
=
1
;
retryTimes
<=
5
;
retryTimes
++)
{
for
(
int
retryTimes
=
1
;
retryTimes
<=
5
;
retryTimes
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
Exception
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"下载榜单列表页面时出现错误,错误为:{}"
,
e
);
log
.
error
(
"下载榜单列表页面时出现错误,错误为:{}"
,
caus
e
);
continue
;
continue
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
topicList
.
addAll
(
parseTopicHtml
(
htmlBody
,
date
));
topicList
.
addAll
(
parseTopicHtml
(
htmlBody
,
date
));
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/XinLangHotSearchCrawler.java
View file @
5448a5f4
...
@@ -2,9 +2,10 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -2,9 +2,10 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
...
@@ -12,7 +13,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
...
@@ -12,7 +13,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
...
@@ -24,7 +25,7 @@ import java.util.*;
...
@@ -24,7 +25,7 @@ import java.util.*;
@Log4j2
@Log4j2
public
class
XinLangHotSearchCrawler
{
public
class
XinLangHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
/**
* 新浪热榜的采集
* 新浪热榜的采集
...
@@ -38,10 +39,12 @@ public class XinLangHotSearchCrawler {
...
@@ -38,10 +39,12 @@ public class XinLangHotSearchCrawler {
String
htmlBody
=
null
;
String
htmlBody
=
null
;
JSONObject
jsonObject
=
null
;
JSONObject
jsonObject
=
null
;
for
(
int
t
=
0
;
t
<
3
&&
jsonObject
==
null
;
t
++)
{
for
(
int
t
=
0
;
t
<
3
&&
jsonObject
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"新浪热榜页面连接异常..."
,
e
);
log
.
error
(
"新浪热榜页面连接异常..."
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
!=
null
)
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
...
@@ -111,10 +114,12 @@ public class XinLangHotSearchCrawler {
...
@@ -111,10 +114,12 @@ public class XinLangHotSearchCrawler {
String
htmlBody
=
null
;
String
htmlBody
=
null
;
JSONArray
dataJson
=
null
;
JSONArray
dataJson
=
null
;
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++)
{
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"新浪热点页面连接异常..."
,
e
);
log
.
error
(
"新浪热点页面连接异常..."
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuChildHotSearchCrawler.java
View file @
5448a5f4
...
@@ -4,16 +4,18 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -4,16 +4,18 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.util.*
;
import
java.util.*
;
...
@@ -21,7 +23,7 @@ import java.util.*;
...
@@ -21,7 +23,7 @@ import java.util.*;
@Log4j2
@Log4j2
public
class
ZhihuChildHotSearchCrawler
{
public
class
ZhihuChildHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
/**
* 知乎子级分类数据采集
* 知乎子级分类数据采集
...
@@ -39,10 +41,11 @@ public class ZhihuChildHotSearchCrawler {
...
@@ -39,10 +41,11 @@ public class ZhihuChildHotSearchCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
//采集为空最多重试3次
//采集为空最多重试3次
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++)
{
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
response
.
cause
().
printStackTrace
();
e
.
printStackTrace
();
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
View file @
5448a5f4
...
@@ -3,17 +3,18 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -3,17 +3,18 @@ package com.zhiwei.searchhotcrawler.crawler;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.util.*
;
import
java.util.*
;
import
com.zhiwei.crawler.core.config.SslProvider
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
io.netty.handler.ssl.SslProvider
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
...
@@ -34,7 +35,8 @@ import static java.util.Objects.nonNull;
...
@@ -34,7 +35,8 @@ import static java.util.Objects.nonNull;
@Log4j2
@Log4j2
public
class
ZhihuHotSearchCrawler
{
public
class
ZhihuHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
sslProvider
(
SslProvider
.
CONSCRYPT
).
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
build
();
//private static HttpBoot httpBoot = HttpBoot.newBuilder().sslProvider(SslProvider.CONSCRYPT).retryTimes(3).build();
/**
/**
* @Title: getZhihuHotList
* @Title: getZhihuHotList
* @author hero
* @author hero
...
@@ -98,11 +100,13 @@ public class ZhihuHotSearchCrawler {
...
@@ -98,11 +100,13 @@ public class ZhihuHotSearchCrawler {
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
);
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
caus
e
);
return
list
;
return
list
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
try
{
try
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"author"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"author"
))
{
...
@@ -160,17 +164,22 @@ public class ZhihuHotSearchCrawler {
...
@@ -160,17 +164,22 @@ public class ZhihuHotSearchCrawler {
Map
<
String
,
String
>
Map
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
Map
=
HeaderTool
.
getCommonHead
();
Map
.
put
(
"cookie"
,
"_xsrf=7NFWM5qBcOutfs8MaW7bhQQH65t3Xia4"
);
Map
.
put
(
"cookie"
,
"_xsrf=7NFWM5qBcOutfs8MaW7bhQQH65t3Xia4"
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
Map
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
Map
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
String
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"单条知乎热搜数据页面连接失败"
,
cause
);
return
doc
;
}
else
{
String
htmlBody
=
response
.
bodyString
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"QuestionHeader"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"QuestionHeader"
))
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
//获取标签
//获取标签
String
label
=
""
;
String
label
=
""
;
Elements
select
=
document
.
select
(
"div.Tag"
);
Elements
select
=
document
.
select
(
"div.Tag"
);
for
(
Element
element
:
select
)
{
for
(
Element
element
:
select
)
{
String
text
=
"`"
+
element
.
select
(
"div.Popover"
).
text
()+
";"
;
String
text
=
"`"
+
element
.
select
(
"div.Popover"
).
text
()+
";"
;
label
=
label
+
text
;
label
=
label
+
text
;
}
}
doc
.
put
(
"tag"
,
label
.
trim
());
doc
.
put
(
"tag"
,
label
.
trim
());
String
strong
=
document
.
select
(
"div.NumberBoard-itemInner"
).
select
(
"strong"
).
text
();
String
strong
=
document
.
select
(
"div.NumberBoard-itemInner"
).
select
(
"strong"
).
text
();
String
[]
count
=
strong
.
split
(
" "
);
String
[]
count
=
strong
.
split
(
" "
);
...
@@ -182,9 +191,6 @@ public class ZhihuHotSearchCrawler {
...
@@ -182,9 +191,6 @@ public class ZhihuHotSearchCrawler {
}
else
{
}
else
{
return
doc
;
return
doc
;
}
}
}
catch
(
Exception
e
)
{
log
.
error
(
"单条知乎热搜数据页面连接失败"
,
e
);
return
doc
;
}
}
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuTopicSearchCrawler.java
View file @
5448a5f4
...
@@ -2,29 +2,21 @@ package com.zhiwei.searchhotcrawler.crawler;
...
@@ -2,29 +2,21 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.Data
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.select.Elements
;
import
org.w3c.dom.Element
;
import
java.io.IOException
;
import
java.util.*
;
import
java.util.*
;
@Log4j2
@Log4j2
public
class
ZhihuTopicSearchCrawler
{
public
class
ZhihuTopicSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
public
static
List
<
HotSearchList
>
getZhihuTopicSearch
(
Date
date
){
public
static
List
<
HotSearchList
>
getZhihuTopicSearch
(
Date
date
){
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
...
@@ -33,10 +25,12 @@ public class ZhihuTopicSearchCrawler {
...
@@ -33,10 +25,12 @@ public class ZhihuTopicSearchCrawler {
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
t
=
0
;
t
<
3
&&
jsonObject
==
null
;
t
++)
{
for
(
int
t
=
0
;
t
<
3
&&
jsonObject
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"知乎热搜页面连接异常"
,
e
);
log
.
error
(
"知乎热搜页面连接异常"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
!=
null
)
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
View file @
5448a5f4
package
com
.
zhiwei
.
searchhotcrawler
.
run
;
package
com
.
zhiwei
.
searchhotcrawler
.
run
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.http.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.timer.*
;
import
com.zhiwei.searchhotcrawler.timer.*
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearch36KrCrawlerTest.java
View file @
5448a5f4
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
...
@@ -26,7 +27,8 @@ import java.util.*;
...
@@ -26,7 +27,8 @@ import java.util.*;
@Log4j2
@Log4j2
public
class
HotSearch36KrCrawlerTest
{
public
class
HotSearch36KrCrawlerTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
//private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
/**
* @return void 返回类型
* @return void 返回类型
...
@@ -38,12 +40,13 @@ public class HotSearch36KrCrawlerTest {
...
@@ -38,12 +40,13 @@ public class HotSearch36KrCrawlerTest {
String
url
=
"https://www.36kr.com/hot-list/catalog"
;
String
url
=
"https://www.36kr.com/hot-list/catalog"
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
if
(
response
.
hasCause
()){
htmlBody
=
response
.
body
().
string
();
Throwable
cause
=
response
.
cause
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析36Kr人气榜时出现解析错误,页面结构有问题"
,
cause
);
log
.
error
(
"解析36Kr人气榜时出现解析错误,页面结构有问题"
,
e
);
}
else
{
}
htmlBody
=
response
.
bodyString
();
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"article-list"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"article-list"
))
{
return
ansysData
(
htmlBody
,
date
);
return
ansysData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchRunTest.java
View file @
5448a5f4
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.http.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/HuXiuHotSearchCrawlerTest.java
View file @
5448a5f4
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpGet
;
import
org.apache.http.client.methods.HttpGet
;
...
@@ -31,17 +31,20 @@ import java.util.*;
...
@@ -31,17 +31,20 @@ import java.util.*;
*/
*/
@Log4j2
@Log4j2
public
class
HuXiuHotSearchCrawlerTest
{
public
class
HuXiuHotSearchCrawlerTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
//private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
public
static
List
<
HotSearchList
>
HuXiuHotArticleRecommended
(
Date
date
){
public
static
List
<
HotSearchList
>
HuXiuHotArticleRecommended
(
Date
date
){
String
url
=
"https://www.huxiu.com/"
;
String
url
=
"https://www.huxiu.com/"
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
Exception
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析虎嗅热文推荐时出现解析错误,页面结构有问题"
,
e
);
log
.
error
(
"解析虎嗅热文推荐时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"hot__list"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"hot__list"
))
{
return
ansysData
(
htmlBody
,
date
);
return
ansysData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/Job51Test.java
View file @
5448a5f4
...
@@ -4,10 +4,12 @@ package com.zhiwei.searchhotcrawler.test;
...
@@ -4,10 +4,12 @@ package com.zhiwei.searchhotcrawler.test;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.mongodb.client.MongoDatabase
;
import
com.mongodb.client.MongoDatabase
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.proxy.ProxyFactory
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
...
@@ -17,7 +19,7 @@ import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBLocalTemplate;
...
@@ -17,7 +19,7 @@ import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBLocalTemplate;
import
com.zhiwei.searchhotcrawler.util.HttpClientUtils
;
import
com.zhiwei.searchhotcrawler.util.HttpClientUtils
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
org.springframework.context.ApplicationContext
;
import
org.springframework.context.ApplicationContext
;
...
@@ -38,7 +40,7 @@ public class Job51Test {
...
@@ -38,7 +40,7 @@ public class Job51Test {
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
ProxyFactory
.
init
(
simpleConfig
);
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
// MongoDatabase mongoDBLocal = MongoDBLocalTemplate.getDB(DBConfig.dbName);
// MongoDatabase mongoDBLocal = MongoDBLocalTemplate.getDB(DBConfig.dbName);
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
...
@@ -62,11 +64,12 @@ public class Job51Test {
...
@@ -62,11 +64,12 @@ public class Job51Test {
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
header
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
header
);
for
(
int
t
=
0
;
t
<
1
&&
jsonObject
==
null
;
t
++)
{
for
(
int
t
=
0
;
t
<
1
&&
jsonObject
==
null
;
t
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
if
(
response
.
hasCause
()){
htmlBody
=
response
.
body
().
string
();
Throwable
cause
=
response
.
cause
();
}
catch
(
IOException
e
)
{
log
.
error
(
"知乎热搜页面连接异常"
,
cause
);
log
.
error
(
"知乎热搜页面连接异常"
,
e
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
!=
null
)
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/KuaiShouHotSearchCrawlerTest.java
View file @
5448a5f4
...
@@ -3,14 +3,15 @@ package com.zhiwei.searchhotcrawler.test;
...
@@ -3,14 +3,15 @@ package com.zhiwei.searchhotcrawler.test;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
java.time.Duration
;
import
java.time.Duration
;
import
java.util.*
;
import
java.util.*
;
...
@@ -23,7 +24,8 @@ import java.util.*;
...
@@ -23,7 +24,8 @@ import java.util.*;
*/
*/
@Log4j2
@Log4j2
public
class
KuaiShouHotSearchCrawlerTest
{
public
class
KuaiShouHotSearchCrawlerTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
//private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
/**
* @return void 返回类型
* @return void 返回类型
...
@@ -35,11 +37,12 @@ public class KuaiShouHotSearchCrawlerTest {
...
@@ -35,11 +37,12 @@ public class KuaiShouHotSearchCrawlerTest {
String
url
=
"https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791"
;
String
url
=
"https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791"
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
if
(
response
.
hasCause
()){
htmlBody
=
response
.
body
().
string
();
Throwable
cause
=
response
.
cause
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析快手热榜时出现解析错误,页面结构有问题"
,
cause
);
log
.
error
(
"解析快手热榜时出现解析错误,页面结构有问题"
,
e
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"APOLLO_STATE"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"APOLLO_STATE"
))
{
return
ansysData
(
htmlBody
,
date
);
return
ansysData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/TaoBaoHotSearchCrawlerTest.java
View file @
5448a5f4
...
@@ -2,15 +2,17 @@ package com.zhiwei.searchhotcrawler.test;
...
@@ -2,15 +2,17 @@ package com.zhiwei.searchhotcrawler.test;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.MD5Util
;
import
com.zhiwei.searchhotcrawler.util.MD5Util
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
;
import
java.time.Duration
;
import
java.time.Duration
;
import
java.util.*
;
import
java.util.*
;
...
@@ -23,7 +25,8 @@ import java.util.*;
...
@@ -23,7 +25,8 @@ import java.util.*;
*/
*/
@Log4j2
@Log4j2
public
class
TaoBaoHotSearchCrawlerTest
{
public
class
TaoBaoHotSearchCrawlerTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
//private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
public
static
List
<
HotSearchList
>
taoBaoHotSearch
(
Date
date
)
{
public
static
List
<
HotSearchList
>
taoBaoHotSearch
(
Date
date
)
{
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
...
@@ -34,8 +37,12 @@ public class TaoBaoHotSearchCrawlerTest {
...
@@ -34,8 +37,12 @@ public class TaoBaoHotSearchCrawlerTest {
String
urls
=
"https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t="
+
time
+
"&sign=&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"
;
String
urls
=
"https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t="
+
time
+
"&sign=&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"
;
Request
request1
=
RequestUtils
.
wrapGet
(
urls
);
Request
request1
=
RequestUtils
.
wrapGet
(
urls
);
String
token
=
null
;
String
token
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request1
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response1
=
httpBoot
.
syncCall
(
request1
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
List
<
String
>
values
=
response
.
networkResponse
().
headers
().
values
(
"Set-Cookie"
);
if
(
response1
.
hasCause
()){
Throwable
cause
=
response1
.
cause
();
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
List
<
String
>
values
=
response1
.
headers
().
values
(
"Set-Cookie"
);
String
tk
=
values
.
get
(
1
);
String
tk
=
values
.
get
(
1
);
String
[]
splitTk
=
tk
.
split
(
";"
);
String
[]
splitTk
=
tk
.
split
(
";"
);
String
_m_h5_tk
=
splitTk
[
0
];
String
_m_h5_tk
=
splitTk
[
0
];
...
@@ -44,18 +51,18 @@ public class TaoBaoHotSearchCrawlerTest {
...
@@ -44,18 +51,18 @@ public class TaoBaoHotSearchCrawlerTest {
String
[]
splitEnc
=
enc
.
split
(
";"
);
String
[]
splitEnc
=
enc
.
split
(
";"
);
String
_m_h5_tk_enc
=
splitEnc
[
0
];
String
_m_h5_tk_enc
=
splitEnc
[
0
];
headerMap
.
put
(
"cookie"
,
_m_h5_tk
+
";"
+
_m_h5_tk_enc
);
headerMap
.
put
(
"cookie"
,
_m_h5_tk
+
";"
+
_m_h5_tk_enc
);
}
catch
(
Exception
e
)
{
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
e
);
}
}
String
signs
=
token
+
"&"
+
time
+
"&12574478&{\"appId\":\"10211\",\"params\":\"{\\\"multi_hintq_show\\\":\\\"on\\\",\\\"src\\\":\\\"c2c\\\",\\\"area\\\":\\\"active_page\\\",\\\"sversion\\\":\\\"7.5\\\",\\\"bangdan_src\\\":\\\"list\\\"}\"}"
;
String
signs
=
token
+
"&"
+
time
+
"&12574478&{\"appId\":\"10211\",\"params\":\"{\\\"multi_hintq_show\\\":\\\"on\\\",\\\"src\\\":\\\"c2c\\\",\\\"area\\\":\\\"active_page\\\",\\\"sversion\\\":\\\"7.5\\\",\\\"bangdan_src\\\":\\\"list\\\"}\"}"
;
String
sign
=
MD5Util
.
getMD5
(
signs
).
toLowerCase
();
String
sign
=
MD5Util
.
getMD5
(
signs
).
toLowerCase
();
String
url
=
"https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t="
+
time
+
"&sign="
+
sign
+
"&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"
;
String
url
=
"https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t="
+
time
+
"&sign="
+
sign
+
"&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
ht
=
!
htmlBody
.
contains
(
"非法请求"
);
ht
=
!
htmlBody
.
contains
(
"非法请求"
);
}
catch
(
Exception
e
)
{
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
e
);
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
return
ansysData
(
htmlBody
,
date
);
return
ansysData
(
htmlBody
,
date
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/TaoBaoRunTest.java
View file @
5448a5f4
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.zhiwei.
crawler.core
.proxy.ProxyFactory
;
import
com.zhiwei.
http
.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun
;
import
com.zhiwei.searchhotcrawler.timer.WeiboSuperTopicRun
;
import
com.zhiwei.searchhotcrawler.timer.WeiboTopicRun
;
import
java.text.ParseException
;
import
java.text.ParseException
;
...
@@ -18,8 +20,10 @@ public class TaoBaoRunTest {
...
@@ -18,8 +20,10 @@ public class TaoBaoRunTest {
//微博热搜开始采集
//微博热搜开始采集
// new WeiboHotSearchRun().start();
// new WeiboHotSearchRun().start();
//快手热榜开始采集
//快手热榜开始采集
//
new KuaiShouHotSearchRun().start();
//
new KuaiShouHotSearchRun().start();
//百度热搜
//百度热搜
// new TaoBaoHotSearchRun().run();
//new TaoBaoHotSearchRun().run();
//超话测试
//new WeiboSuperTopicRun().run();
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/test/WeiboEntertainmentCrawlerTest.java
View file @
5448a5f4
...
@@ -2,14 +2,16 @@ package com.zhiwei.searchhotcrawler.test;
...
@@ -2,14 +2,16 @@ package com.zhiwei.searchhotcrawler.test;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.util.*
;
import
java.util.*
;
...
@@ -24,7 +26,8 @@ import java.util.*;
...
@@ -24,7 +26,8 @@ import java.util.*;
@Log4j2
@Log4j2
public
class
WeiboEntertainmentCrawlerTest
{
public
class
WeiboEntertainmentCrawlerTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
//private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
build
();
/**
/**
...
@@ -38,10 +41,12 @@ public class WeiboEntertainmentCrawlerTest {
...
@@ -38,10 +41,12 @@ public class WeiboEntertainmentCrawlerTest {
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博娱乐榜时出现连接失败"
,
e
);
log
.
error
(
"解析微博娱乐榜时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
List
<
HotSearchList
>
result
=
new
ArrayList
();
List
<
HotSearchList
>
result
=
new
ArrayList
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboSuperTopicRun.java
View file @
5448a5f4
...
@@ -43,7 +43,7 @@ public class WeiboSuperTopicRun extends Thread{
...
@@ -43,7 +43,7 @@ public class WeiboSuperTopicRun extends Thread{
doc
.
put
(
"_id"
,
topic
.
getId
());
doc
.
put
(
"_id"
,
topic
.
getId
());
doc
.
put
(
"name"
,
topic
.
getTopicName
());
doc
.
put
(
"name"
,
topic
.
getTopicName
());
doc
.
put
(
"rank"
,
topic
.
getRank
());
doc
.
put
(
"rank"
,
topic
.
getRank
());
doc
.
put
(
"
score_num"
,
topic
.
getScore
());
doc
.
put
(
"
read_Num"
,
topic
.
getReadNum
());
doc
.
put
(
"fensi_num"
,
topic
.
getFensi
());
doc
.
put
(
"fensi_num"
,
topic
.
getFensi
());
doc
.
put
(
"post_num"
,
topic
.
getPostNum
());
doc
.
put
(
"post_num"
,
topic
.
getPostNum
());
doc
.
put
(
"type"
,
topic
.
getType
());
doc
.
put
(
"type"
,
topic
.
getType
());
...
@@ -53,7 +53,7 @@ public class WeiboSuperTopicRun extends Thread{
...
@@ -53,7 +53,7 @@ public class WeiboSuperTopicRun extends Thread{
data
.
add
(
doc
);
data
.
add
(
doc
);
}
}
weiboTopicDAO
.
addTopicList
(
data
);
weiboTopicDAO
.
addTopicList
(
data
);
log
.
info
(
"微博
话题
采集结束........"
);
log
.
info
(
"微博
超话
采集结束........"
);
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
View file @
5448a5f4
...
@@ -371,7 +371,7 @@ public class GatherTimer {
...
@@ -371,7 +371,7 @@ public class GatherTimer {
* 知乎热搜国际分类采集
* 知乎热搜国际分类采集
*/
*/
@Async
(
value
=
"myScheduler"
)
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"20 * * * * ? "
)
//
@Scheduled(cron = "20 * * * * ? ")
public
void
crawlerZhiHuFocus
(){
public
void
crawlerZhiHuFocus
(){
this
.
crawlerZhiHuChild
(
FOCUS
);
this
.
crawlerZhiHuChild
(
FOCUS
);
}
}
...
@@ -380,7 +380,7 @@ public class GatherTimer {
...
@@ -380,7 +380,7 @@ public class GatherTimer {
* 知乎热搜时事分类采集
* 知乎热搜时事分类采集
*/
*/
@Async
(
value
=
"myScheduler"
)
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"20 * * * * ? "
)
//
@Scheduled(cron = "20 * * * * ? ")
public
void
crawlerZhiHuDepth
(){
public
void
crawlerZhiHuDepth
(){
this
.
crawlerZhiHuChild
(
DEPTH
);
this
.
crawlerZhiHuChild
(
DEPTH
);
}
}
...
@@ -442,7 +442,7 @@ public class GatherTimer {
...
@@ -442,7 +442,7 @@ public class GatherTimer {
doc
.
put
(
"_id"
,
topic
.
getId
());
doc
.
put
(
"_id"
,
topic
.
getId
());
doc
.
put
(
"name"
,
topic
.
getTopicName
());
doc
.
put
(
"name"
,
topic
.
getTopicName
());
doc
.
put
(
"rank"
,
topic
.
getRank
());
doc
.
put
(
"rank"
,
topic
.
getRank
());
doc
.
put
(
"
score_num"
,
topic
.
getScore
());
doc
.
put
(
"
read_Num"
,
topic
.
getReadNum
());
doc
.
put
(
"fensi_num"
,
topic
.
getFensi
());
doc
.
put
(
"fensi_num"
,
topic
.
getFensi
());
doc
.
put
(
"post_num"
,
topic
.
getPostNum
());
doc
.
put
(
"post_num"
,
topic
.
getPostNum
());
doc
.
put
(
"type"
,
topic
.
getType
());
doc
.
put
(
"type"
,
topic
.
getType
());
...
@@ -452,7 +452,7 @@ public class GatherTimer {
...
@@ -452,7 +452,7 @@ public class GatherTimer {
data
.
add
(
doc
);
data
.
add
(
doc
);
}
}
weiboTopicDAO
.
addTopicList
(
data
);
weiboTopicDAO
.
addTopicList
(
data
);
log
.
info
(
"微博
话题
采集结束........"
);
log
.
info
(
"微博
超话
采集结束........"
);
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/util/HttpClientUtils.java
View file @
5448a5f4
package
com
.
zhiwei
.
searchhotcrawler
.
util
;
package
com
.
zhiwei
.
searchhotcrawler
.
util
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.util.RequestUtils
;
import
okhttp3.MediaType
;
import
okhttp3.MediaType
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.RequestBody
;
import
okhttp3.RequestBody
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
...
@@ -24,7 +24,8 @@ public final class HttpClientUtils {
...
@@ -24,7 +24,8 @@ public final class HttpClientUtils {
private
static
final
String
QUERY_PARAM_SEP
=
"&"
;
private
static
final
String
QUERY_PARAM_SEP
=
"&"
;
private
static
final
String
URL_QUERY_PARAM_SEPARATOR
=
"?"
;
private
static
final
String
URL_QUERY_PARAM_SEPARATOR
=
"?"
;
private
static
final
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
2
).
build
();
//private static final HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(2).build();
private
static
final
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
2
).
build
();
public
static
String
sendPost
(
String
url
,
String
jsonParam
){
public
static
String
sendPost
(
String
url
,
String
jsonParam
){
return
sendPost
(
url
,
jsonParam
,
null
,
Charset
.
forName
(
"UTF-8"
));
return
sendPost
(
url
,
jsonParam
,
null
,
Charset
.
forName
(
"UTF-8"
));
}
}
...
@@ -39,12 +40,13 @@ public final class HttpClientUtils {
...
@@ -39,12 +40,13 @@ public final class HttpClientUtils {
String
result
=
null
;
String
result
=
null
;
Request
request
=
RequestUtils
.
wrapPost
(
url
,
headers
,
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
Request
request
=
RequestUtils
.
wrapPost
(
url
,
headers
,
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
jsonParam
));
jsonParam
));
Response
response
=
httpBoot
.
syncCall
(
request
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
if
(
response
.
hasCause
()){
result
=
response
.
body
().
string
();
Throwable
cause
=
response
.
cause
();
}
catch
(
IOException
e
)
{
LOGGER
.
error
(
"http connection error :"
+
cause
.
getMessage
(),
cause
);
LOGGER
.
error
(
"http connection error :"
+
e
.
getMessage
(),
e
);
}
else
{
}
result
=
response
.
bodyString
();
}
return
result
;
return
result
;
}
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/util/WechatCodeUtil.java
View file @
5448a5f4
...
@@ -4,18 +4,15 @@ import java.io.IOException;
...
@@ -4,18 +4,15 @@ import java.io.IOException;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.util.RequestUtils
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
okhttp3.MediaType
;
import
okhttp3.MediaType
;
...
@@ -23,7 +20,8 @@ import okhttp3.RequestBody;
...
@@ -23,7 +20,8 @@ import okhttp3.RequestBody;
public
class
WechatCodeUtil
{
public
class
WechatCodeUtil
{
private
static
Logger
log
=
LogManager
.
getLogger
(
WechatCodeUtil
.
class
);
private
static
Logger
log
=
LogManager
.
getLogger
(
WechatCodeUtil
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
//private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
build
();
/**
/**
* @Title: getToken
* @Title: getToken
* @author hero
* @author hero
...
@@ -40,12 +38,13 @@ public class WechatCodeUtil {
...
@@ -40,12 +38,13 @@ public class WechatCodeUtil {
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
result
=
null
;
String
result
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
Response
response
=
httpBoot
.
syncCall
(
request
);
result
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
e
.
printStackTrace
();
log
.
error
(
"获取微信公众号推送token失败,问题为:::{}"
,
cause
.
fillInStackTrace
());
log
.
error
(
"获取微信公众号推送token失败,问题为:::{}"
,
e
.
fillInStackTrace
());
return
null
;
return
null
;
}
else
{
result
=
response
.
bodyString
();
}
}
if
(
result
!=
null
)
{
if
(
result
!=
null
)
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
result
);
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
result
);
...
@@ -73,11 +72,13 @@ public class WechatCodeUtil {
...
@@ -73,11 +72,13 @@ public class WechatCodeUtil {
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
templateJson
.
toJSONString
());
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
templateJson
.
toJSONString
());
Request
request
=
RequestUtils
.
wrapPost
(
url
,
requestBody
);
Request
request
=
RequestUtils
.
wrapPost
(
url
,
requestBody
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
Response
response
=
httpBoot
.
syncCall
(
request
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
Exception
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"消息推送失败,错误为::{}"
,
e
.
fillInStackTrace
());
log
.
error
(
"消息推送失败,错误为::{}"
,
caus
e
.
fillInStackTrace
());
msgid
=
0
;
msgid
=
0
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
...
@@ -115,11 +116,13 @@ public class WechatCodeUtil {
...
@@ -115,11 +116,13 @@ public class WechatCodeUtil {
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
postData
.
toJSONString
());
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
postData
.
toJSONString
());
Request
request
=
RequestUtils
.
wrapPost
(
url
,
requestBody
);
Request
request
=
RequestUtils
.
wrapPost
(
url
,
requestBody
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
Response
response
=
httpBoot
.
syncCall
(
request
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"页面连接获取失败"
,
e
);
log
.
error
(
"页面连接获取失败"
,
caus
e
);
return
null
;
return
null
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
...
@@ -154,11 +157,13 @@ public class WechatCodeUtil {
...
@@ -154,11 +157,13 @@ public class WechatCodeUtil {
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
postData
.
toJSONString
());
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
postData
.
toJSONString
());
Request
request
=
RequestUtils
.
wrapPost
(
url
,
requestBody
);
Request
request
=
RequestUtils
.
wrapPost
(
url
,
requestBody
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
)){
Response
response
=
httpBoot
.
syncCall
(
request
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"页面链接获取失败"
,
e
);
log
.
error
(
"页面链接获取失败"
,
caus
e
);
return
null
;
return
null
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
...
@@ -197,11 +202,13 @@ public class WechatCodeUtil {
...
@@ -197,11 +202,13 @@ public class WechatCodeUtil {
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
Response
response
=
httpBoot
.
syncCall
(
request
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"获取分组id时出现错误"
,
e
.
fillInStackTrace
());
log
.
error
(
"获取分组id时出现错误"
,
caus
e
.
fillInStackTrace
());
return
null
;
return
null
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
.
contains
(
"tags"
))
{
if
(
htmlBody
.
contains
(
"tags"
))
{
...
@@ -230,11 +237,13 @@ public class WechatCodeUtil {
...
@@ -230,11 +237,13 @@ public class WechatCodeUtil {
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
htmlBody
=
null
;
String
htmlBody
=
null
;
try
(
Response
response
=
httpBoot
.
syncCall
(
request
))
{
Response
response
=
httpBoot
.
syncCall
(
request
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"获取分组id时出现错误"
,
e
.
fillInStackTrace
());
log
.
error
(
"获取分组id时出现错误"
,
caus
e
.
fillInStackTrace
());
return
null
;
return
null
;
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
.
contains
(
"tags"
))
{
if
(
htmlBody
.
contains
(
"tags"
))
{
...
...
src/test/java/hotSaerchTest/HotSearchTest.java
View file @
5448a5f4
...
@@ -2,10 +2,12 @@ package hotSaerchTest;
...
@@ -2,10 +2,12 @@ package hotSaerchTest;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.mongodb.client.MongoCollection
;
import
com.mongodb.client.MongoCollection
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.proxy.ProxyFactory
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.config.DBConfig
;
import
com.zhiwei.searchhotcrawler.config.DBConfig
;
...
@@ -14,11 +16,11 @@ import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
...
@@ -14,11 +16,11 @@ import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.searchhotcrawler.test.KuaiShouHotSearchCrawlerTest
;
import
com.zhiwei.searchhotcrawler.test.KuaiShouHotSearchCrawlerTest
;
import
com.zhiwei.searchhotcrawler.test.TaoBaoHotSearchCrawlerTest
;
import
com.zhiwei.searchhotcrawler.test.TaoBaoHotSearchCrawlerTest
;
import
com.zhiwei.searchhotcrawler.util.
TaoBaoUtils
;
import
com.zhiwei.searchhotcrawler.util.
QYWechatUtil
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.searchhotcrawler.util.WechatCodeUtil
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.bson.Document
;
import
org.bson.Document
;
import
org.junit.Test
;
import
org.junit.Test
;
import
org.junit.runner.RunWith
;
import
org.junit.runner.RunWith
;
...
@@ -29,7 +31,6 @@ import java.io.IOException;
...
@@ -29,7 +31,6 @@ import java.io.IOException;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
import
static
com
.
ibm
.
icu
.
util
.
LocalePriorityList
.
add
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
static
java
.
util
.
Objects
.
nonNull
;
/**
/**
...
@@ -42,7 +43,7 @@ import static java.util.Objects.nonNull;
...
@@ -42,7 +43,7 @@ import static java.util.Objects.nonNull;
{
"classpath:applicationContext.xml"
})
{
"classpath:applicationContext.xml"
})
public
class
HotSearchTest
{
public
class
HotSearchTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
/**
* 测试快手热榜采集
* 测试快手热榜采集
...
@@ -71,10 +72,12 @@ public class HotSearchTest {
...
@@ -71,10 +72,12 @@ public class HotSearchTest {
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid=231522type%3D1%26q%3D%23可口可乐回应C罗拒绝与可乐同框%23"
;
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid=231522type%3D1%26q%3D%23可口可乐回应C罗拒绝与可乐同框%23"
;
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
e
);
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONObject
(
"cardlistInfo"
);
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONObject
(
"cardlistInfo"
);
...
@@ -167,8 +170,18 @@ public class HotSearchTest {
...
@@ -167,8 +170,18 @@ public class HotSearchTest {
long
time
=
new
Date
().
getTime
();
long
time
=
new
Date
().
getTime
();
String
signs
=
"undefined&1625624820156&12574478&{\"appId\":\"10211\",\"params\":\"{\\\"multi_hintq_show\\\":\\\"on\\\",\\\"src\\\":\\\"c2c\\\",\\\"area\\\":\\\"active_page\\\",\\\"sversion\\\":\\\"7.5\\\",\\\"bangdan_src\\\":\\\"list\\\"}\"}"
;
String
signs
=
"undefined&1625624820156&12574478&{\"appId\":\"10211\",\"params\":\"{\\\"multi_hintq_show\\\":\\\"on\\\",\\\"src\\\":\\\"c2c\\\",\\\"area\\\":\\\"active_page\\\",\\\"sversion\\\":\\\"7.5\\\",\\\"bangdan_src\\\":\\\"list\\\"}\"}"
;
// https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t=1624930984092&sign=acf994dbcee6c0c1d7a8a566a6b8ff0a&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D
// https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t=1624930984092&sign=acf994dbcee6c0c1d7a8a566a6b8ff0a&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D
String
s
=
TaoBaoUtils
.
parsJSFunction
(
signs
);
// String s = TaoBaoUtils.parsJSFunction(signs);
System
.
out
.
println
(
s
);
// System.out.println(s);
}
private
static
String
key
=
"a8e26ce3-8aaa-4d3e-bcf6-30b81526050b"
;
/**
* 测试预警发送
*/
@Test
public
void
testWarn
(){
QYWechatUtil
.
send
(
key
,
QYWechatUtil
.
MSGTYPE_TEXT
,
"你好"
,
null
,
null
);
}
}
}
}
...
...
src/test/java/weiboTest/WeiboHotSearchTest.java
View file @
5448a5f4
...
@@ -3,10 +3,12 @@ package weiboTest;
...
@@ -3,10 +3,12 @@ package weiboTest;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.proxy.ProxyFactory
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
...
@@ -19,7 +21,6 @@ import com.zhiwei.searchhotcrawler.util.TipsUtils;
...
@@ -19,7 +21,6 @@ import com.zhiwei.searchhotcrawler.util.TipsUtils;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
...
@@ -50,7 +51,7 @@ public class WeiboHotSearchTest {
...
@@ -50,7 +51,7 @@ public class WeiboHotSearchTest {
static
WeiBoMassageDao
weiBoMassageDao
=
new
WeiBoMassageDao
();
static
WeiBoMassageDao
weiBoMassageDao
=
new
WeiBoMassageDao
();
//调用weiBoUserDao添加数据
//调用weiBoUserDao添加数据
static
WeiBoUserDao
weiBoUserDao
=
new
WeiBoUserDao
();
static
WeiBoUserDao
weiBoUserDao
=
new
WeiBoUserDao
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
@Test
@Test
public
void
test
()
{
public
void
test
()
{
...
@@ -122,10 +123,12 @@ public class WeiboHotSearchTest {
...
@@ -122,10 +123,12 @@ public class WeiboHotSearchTest {
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
2
;
count
++)
{
for
(
int
count
=
0
;
count
<=
2
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
e
);
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
dataJson
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
JSONObject
dataJson
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
...
@@ -213,10 +216,12 @@ public class WeiboHotSearchTest {
...
@@ -213,10 +216,12 @@ public class WeiboHotSearchTest {
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
2
;
count
++)
{
for
(
int
count
=
0
;
count
<=
2
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
e
);
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
dataJson
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
JSONObject
dataJson
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
...
@@ -527,10 +532,12 @@ public class WeiboHotSearchTest {
...
@@ -527,10 +532,12 @@ public class WeiboHotSearchTest {
String
htmlBody
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
htmlBody
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博时热搜时出现连接失败"
,
e
);
log
.
error
(
"解析微博时热搜时出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
}
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
...
...
src/test/java/weiboTest/WeiboTopInfoTest.java
View file @
5448a5f4
package
weiboTest
;
package
weiboTest
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.util.AESUtils
;
import
com.zhiwei.searchhotcrawler.util.AESUtils
;
import
com.zhiwei.searchhotcrawler.util.HttpClientUtils
;
import
com.zhiwei.searchhotcrawler.util.HttpClientUtils
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.codec.DecoderException
;
import
org.apache.commons.codec.DecoderException
;
import
org.apache.commons.codec.binary.Hex
;
import
org.apache.commons.codec.binary.Hex
;
import
org.junit.Test
;
import
org.junit.Test
;
...
@@ -19,6 +21,7 @@ import javax.crypto.spec.SecretKeySpec;
...
@@ -19,6 +21,7 @@ import javax.crypto.spec.SecretKeySpec;
import
java.beans.Encoder
;
import
java.beans.Encoder
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.io.UnsupportedEncodingException
;
import
java.io.UnsupportedEncodingException
;
import
java.net.Proxy
;
import
java.net.URLDecoder
;
import
java.net.URLDecoder
;
import
java.net.URLEncoder
;
import
java.net.URLEncoder
;
import
java.nio.charset.Charset
;
import
java.nio.charset.Charset
;
...
@@ -33,7 +36,7 @@ import java.util.Map;
...
@@ -33,7 +36,7 @@ import java.util.Map;
*/
*/
public
class
WeiboTopInfoTest
{
public
class
WeiboTopInfoTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
3
).
build
();
/**
/**
* 加密测试
* 加密测试
...
@@ -106,10 +109,11 @@ public class WeiboTopInfoTest {
...
@@ -106,10 +109,11 @@ public class WeiboTopInfoTest {
System
.
out
.
println
(
url
);
System
.
out
.
println
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
getHeaderMap
());
Request
request
=
RequestUtils
.
wrapGet
(
url
,
getHeaderMap
());
//测试使用空代理
//测试使用空代理
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NONE_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NONE_PROXY
);
System
.
out
.
println
(
response
.
body
().
string
());
if
(
response
.
hasCause
()){
}
catch
(
IOException
e
)
{
response
.
cause
().
printStackTrace
();
e
.
printStackTrace
();
}
else
{
System
.
out
.
println
(
response
.
bodyString
());
}
}
}
}
...
@@ -135,8 +139,11 @@ public class WeiboTopInfoTest {
...
@@ -135,8 +139,11 @@ public class WeiboTopInfoTest {
System
.
out
.
println
(
url
);
System
.
out
.
println
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
getHeaderMap
());
Request
request
=
RequestUtils
.
wrapGet
(
url
,
getHeaderMap
());
//测试使用空代理
//测试使用空代理
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NONE_PROXY
))
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NONE_PROXY
);
String
result
=
response
.
body
().
string
();
if
(
response
.
hasCause
()){
response
.
cause
().
printStackTrace
();
}
else
{
String
result
=
response
.
bodyString
();
//结果解密
//结果解密
String
decodeResult
=
decodeStr
(
key
,
result
);
String
decodeResult
=
decodeStr
(
key
,
result
);
System
.
out
.
println
(
decodeResult
);
System
.
out
.
println
(
decodeResult
);
...
@@ -144,11 +151,7 @@ public class WeiboTopInfoTest {
...
@@ -144,11 +151,7 @@ public class WeiboTopInfoTest {
JSONArray
jsonArray
=
JSONArray
.
parseArray
(
decodeResult
);
JSONArray
jsonArray
=
JSONArray
.
parseArray
(
decodeResult
);
for
(
Object
o
:
jsonArray
)
{
for
(
Object
o
:
jsonArray
)
{
System
.
out
.
println
(
o
);
System
.
out
.
println
(
o
);
}
}
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment