Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
1d9b8289
Commit
1d9b8289
authored
Nov 08, 2022
by
chenweitao
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'working' into 'master'
Working See merge request
!209
parents
a8cc6e0b
1d1a7503
Show whitespace changes
Inline
Side-by-side
Showing
48 changed files
with
333 additions
and
145 deletions
+333
-145
pom.xml
+14
-2
src/main/java/com/zhiwei/searchhotcrawler/config/ProxyConfig.java
+23
-4
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/BiliComprehensiveHotCrawler.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/BililiCrawler.java
+4
-3
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
+4
-3
src/main/java/com/zhiwei/searchhotcrawler/crawler/FengHuangSearchCrawler.java
+3
-2
src/main/java/com/zhiwei/searchhotcrawler/crawler/HotSearch36KrCrawler.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/HuXiuHotSearchCrawler.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/KuaiShouHotSearchCrawler.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/MaiMaiHotSearchCrawler.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
+3
-2
src/main/java/com/zhiwei/searchhotcrawler/crawler/SouhuTopicCrawler.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/TaoBaoHotSearchCrawler.java
+3
-2
src/main/java/com/zhiwei/searchhotcrawler/crawler/TengXunCrawler.java
+3
-2
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
+13
-17
src/main/java/com/zhiwei/searchhotcrawler/crawler/WangYiHotSearchCrawler.java
+3
-2
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiBoBrandCrawler.java
+10
-9
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiBoSearchBoxHotWordsCrawler.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiBoSearchHotWordsCrawler.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiShiHotSearchCrawler.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboEntertainmentCrawler.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+6
-5
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboNewsCrawler.java
+4
-3
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboOutCircleCrawler.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboSuperTopicCrawler.java
+3
-2
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
+3
-2
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboVideoCrawler.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/XinLangHotSearchCrawler.java
+3
-2
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuChildHotSearchCrawler.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
+3
-2
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuTopicSearchCrawler.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
+34
-5
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearch36KrCrawlerTest.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchRunTest.java
+3
-4
src/main/java/com/zhiwei/searchhotcrawler/test/HuXiuHotSearchCrawlerTest.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/test/Job51Test.java
+5
-5
src/main/java/com/zhiwei/searchhotcrawler/test/KuaiShouHotSearchCrawlerTest.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/test/TaoBaoHotSearchCrawlerTest.java
+3
-2
src/main/java/com/zhiwei/searchhotcrawler/test/TaoBaoRunTest.java
+3
-4
src/main/java/com/zhiwei/searchhotcrawler/test/WeiboEntertainmentCrawlerTest.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+10
-5
src/main/resources/proxyip.properties
+12
-4
src/test/java/InfoTest/InfoTest.java
+4
-4
src/test/java/baiduTest/BaiduTest.java
+5
-5
src/test/java/hotSaerchTest/HotSearchTest.java
+14
-14
src/test/java/proxy/ProxyTest.java
+86
-0
src/test/java/weiboTest/WeiboHotSearchTest.java
+11
-10
No files found.
pom.xml
View file @
1d9b8289
...
...
@@ -15,6 +15,11 @@
<log4j.version>
2.15.0
</log4j.version>
<commons-lang3.version>
3.12.0
</commons-lang3.version>
<http-boot.version>
0.1.0.8-SNAPSHOT
</http-boot.version>
<cynomys-consumer.version>
0.0.5-SNAPSHOT
</cynomys-consumer.version>
<proxy-client.version>
2.0.1-SNAPSHOT
</proxy-client.version>
<maven.compiler.source>
8
</maven.compiler.source>
<maven.compiler.target>
8
</maven.compiler.target>
</properties>
...
...
@@ -56,7 +61,7 @@
<dependency>
<groupId>
com.zhiwei.http
</groupId>
<artifactId>
http-boot
</artifactId>
<version>
0.0.8.2-SNAPSHOT
</version>
<version>
${http-boot.version}
</version>
</dependency>
<dependency>
<groupId>
org.apache.commons
</groupId>
...
...
@@ -67,8 +72,15 @@
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
proxy-client
</artifactId>
<version>
1.1.5-SNAPSHOT
</version>
<version>
${proxy-client.version}
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.network
</groupId>
<artifactId>
cynomys-consumer
</artifactId>
<version>
${cynomys-consumer.version}
</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.conscrypt/conscrypt-openjdk-uber -->
<dependency>
<groupId>
org.conscrypt
</groupId>
...
...
src/main/java/com/zhiwei/searchhotcrawler/config/ProxyConfig.java
View file @
1d9b8289
...
...
@@ -12,15 +12,34 @@ public class ProxyConfig {
conf
=
new
Properties
();
conf
.
load
(
is
);
is
.
close
();
registry
=
conf
.
getProperty
(
"registry"
);
group
=
conf
.
getProperty
(
"group"
);
localRegistry
=
conf
.
getProperty
(
"local.registry"
);
localGroup
=
conf
.
getProperty
(
"local.group"
);
localUsername
=
conf
.
getProperty
(
"local.username"
);
localPassword
=
conf
.
getProperty
(
"local.password"
);
hangzhouRegistry
=
conf
.
getProperty
(
"hangzhou.registry"
);
hangzhouGroup
=
conf
.
getProperty
(
"hangzhou.group"
);
hangzhouUsername
=
conf
.
getProperty
(
"hangzhou.username"
);
hangzhouPassword
=
conf
.
getProperty
(
"hangzhou.password"
);
isLocal
=
Boolean
.
parseBoolean
(
conf
.
getProperty
(
"isLocal"
));
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
}
public
static
boolean
isLocal
;
public
static
String
localRegistry
;
public
static
String
localGroup
;
public
static
String
localUsername
;
public
static
String
localPassword
;
public
static
String
registry
;
public
static
String
group
;
public
static
String
hangzhouRegistry
;
public
static
String
hangzhouGroup
;
public
static
String
hangzhouUsername
;
public
static
String
hangzhouPassword
;
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
View file @
1d9b8289
...
...
@@ -6,6 +6,7 @@ import java.util.*;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
lombok.extern.log4j.Log4j2
;
...
...
@@ -40,7 +41,7 @@ public class BaiDuHotSearchCrawler {
String
url
=
"http://top.baidu.com/buzz?b=1&fr=topindex"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/BiliComprehensiveHotCrawler.java
View file @
1d9b8289
...
...
@@ -4,6 +4,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -53,7 +54,7 @@ public class BiliComprehensiveHotCrawler {
for
(
int
i
=
0
;
i
<
urlList
.
size
();
i
++)
{
Request
request
=
RequestUtils
.
wrapGet
(
urlList
.
get
(
i
));
//发送请求每次获取20条数据
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
fmt
.
format
(
date
)+
":第"
+
i
+
1
+
"次请求解析B站综合热门时出现连接失败"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/BililiCrawler.java
View file @
1d9b8289
...
...
@@ -5,6 +5,7 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -43,7 +44,7 @@ public class BililiCrawler {
String
url
=
"https://api.bilibili.com/x/web-interface/ranking/v2?rid=0&type=all"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++){
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"B站排行榜页面连接失败"
,
cause
.
fillInStackTrace
());
...
...
@@ -133,7 +134,7 @@ public class BililiCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
{
System
.
setProperty
(
"https.protocols"
,
"TLSv1,TLSv1.1,TLSv1.2,SSLv3"
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
String
htmlBody
=
response
.
bodyString
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"v-wrap"
))
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
...
...
@@ -181,7 +182,7 @@ public class BililiCrawler {
String
url
=
"https://app.biliapi.com/x/v2/search/square?build=616050&limit=10"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++){
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"B站热搜页面连接失败"
,
cause
.
fillInStackTrace
());
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
View file @
1d9b8289
...
...
@@ -5,6 +5,7 @@ import java.util.*;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
lombok.extern.log4j.Log4j2
;
...
...
@@ -46,7 +47,7 @@ public class DouyinHotSearchCrawler {
String
url
=
"https://api.amemv.com/aweme/v1/hot/search/list/"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
debug
(
"获取抖音热搜榜时出现问题:{}"
,
cause
);
...
...
@@ -90,7 +91,7 @@ public class DouyinHotSearchCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
x
=
0
;
x
<
3
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
debug
(
"获取抖音热搜榜链接时出现问题:{}"
,
cause
);
...
...
@@ -125,7 +126,7 @@ public class DouyinHotSearchCrawler {
String
url
=
"https://api5-normal-c-lq.amemv.com/aweme/v1/hot/search/list/?board_type=2&board_sub_type=2&version_code=140900"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
debug
(
"获取抖音娱乐榜榜时出现问题:{}"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/FengHuangSearchCrawler.java
View file @
1d9b8289
...
...
@@ -5,6 +5,7 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -34,7 +35,7 @@ public class FengHuangSearchCrawler {
String
url
=
"https://nine.ifeng.com/hotspotlist?gv=7.9.1&page="
+
page
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"凤凰新闻热榜页面连接异常..."
,
cause
);
...
...
@@ -74,7 +75,7 @@ public class FengHuangSearchCrawler {
String
url
=
"https://shankapi.ifeng.com/autumn/sogouSearchHotword"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"凤凰新闻热搜页面连接异常..."
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/HotSearch36KrCrawler.java
View file @
1d9b8289
...
...
@@ -2,6 +2,7 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -46,7 +47,7 @@ public class HotSearch36KrCrawler {
headerMap
.
put
(
"sec-fetch-dest"
,
"empty"
);
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析36Kr人气榜时出现解析错误,页面结构有问题"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/HuXiuHotSearchCrawler.java
View file @
1d9b8289
...
...
@@ -5,6 +5,7 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -42,7 +43,7 @@ public class HuXiuHotSearchCrawler {
headerMap
.
put
(
"sec-ch-ua"
,
" Not A;Brand\";v=\"99\", \"Chromium\";v=\"101\", \"Microsoft Edge\";v=\"101"
);
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析虎嗅热文推荐时出现解析错误,页面结构有问题"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/KuaiShouHotSearchCrawler.java
View file @
1d9b8289
...
...
@@ -4,6 +4,7 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -36,7 +37,7 @@ public class KuaiShouHotSearchCrawler {
String
url
=
"https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析快手热榜时出现解析错误,页面结构有问题"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/MaiMaiHotSearchCrawler.java
View file @
1d9b8289
...
...
@@ -5,6 +5,7 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -35,7 +36,7 @@ public class MaiMaiHotSearchCrawler {
String
url
=
"https://open.taou.com/maimai/feed/v6/hot_posts_list?tab=profession&count=15&version=5.3.34&u=232258287&access_token=1.4c82e8ad6d6b4e03262a48f334dea336"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"脉脉热榜页面连接异常..."
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
View file @
1d9b8289
...
...
@@ -6,6 +6,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.util.HeaderTool
;
...
...
@@ -46,7 +47,7 @@ public class SougoHotSearchCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
String
htmlBody
=
null
;
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
cause
);
...
...
@@ -90,7 +91,7 @@ public class SougoHotSearchCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headMap
);
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
String
htmlBody
=
null
;
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析搜狗微信时出现解析错误,页面结构有问题"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/SouhuTopicCrawler.java
View file @
1d9b8289
...
...
@@ -4,6 +4,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -31,7 +32,7 @@ public class SouhuTopicCrawler {
String
url
=
"https://api.k.sohu.com/api/news/moment/v2/list.go?pageSize=50&v=6.4.4"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++){
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"搜狐话题页面连接失败"
,
cause
.
fillInStackTrace
());
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/TaoBaoHotSearchCrawler.java
View file @
1d9b8289
...
...
@@ -4,6 +4,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -35,7 +36,7 @@ public class TaoBaoHotSearchCrawler {
String
urls
=
"https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t="
+
time
+
"&sign=&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"
;
Request
request1
=
RequestUtils
.
wrapGet
(
urls
);
String
token
=
null
;
Response
response
=
httpBoot
.
syncCall
(
request1
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request1
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
cause
);
...
...
@@ -54,7 +55,7 @@ public class TaoBaoHotSearchCrawler {
String
sign
=
MD5Util
.
getMD5
(
signs
).
toLowerCase
();
String
url
=
"https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t="
+
time
+
"&sign="
+
sign
+
"&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Response
response1
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response1
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response1
.
hasCause
()){
Throwable
cause
=
response1
.
cause
();
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/TengXunCrawler.java
View file @
1d9b8289
...
...
@@ -4,6 +4,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -35,7 +36,7 @@ public class TengXunCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//采集为空最多重试3次
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
response
.
cause
().
printStackTrace
();
...
...
@@ -99,7 +100,7 @@ public class TengXunCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//采集为空最多重试3次
for
(
int
t
=
0
;
t
<
3
;
t
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
response
.
cause
().
printStackTrace
();
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
View file @
1d9b8289
...
...
@@ -6,6 +6,7 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -49,7 +50,7 @@ public class ToutiaoHotSearchCrawler {
String
jsUrl
=
"https://s3.pstatp.com/toutiao/feoffline/hot_list/resource/hot_list/js/index.45f50250.chunk.js"
;
Request
jsRequest
=
RequestUtils
.
wrapGet
(
jsUrl
);
String
jsBody
=
null
;
Response
response
=
httpBoot
.
syncCall
(
jsRequest
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
jsRequest
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"获取今日头条实时热搜头部信息标识失败"
,
cause
);
...
...
@@ -65,7 +66,7 @@ public class ToutiaoHotSearchCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
Response
response1
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response1
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response1
.
hasCause
())
{
Throwable
cause
=
response1
.
cause
();
log
.
error
(
"解析今日头条实时热搜时出现连接失败"
,
cause
);
...
...
@@ -165,34 +166,29 @@ public class ToutiaoHotSearchCrawler {
String
htmlBody
=
null
;
String
url
=
hotSearchList
.
getUrl
();
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
i
=
0
;
i
<=
5
;
i
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyServerSupplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析今日头条热搜详情页面出现连接失败"
,
cause
);
}
else
{
htmlBody
=
response
.
bodyString
();
}
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Elements
elements
=
document
.
select
(
".result-content .cs-view .cs-topone-tail .cs-view .margin-bottom-m .margin-left-m"
);
if
(
Objects
.
nonNull
(
elements
)
&&
!
elements
.
isEmpty
())
{
Element
element
=
elements
.
first
();
String
readCount
=
element
.
text
().
replaceAll
(
"阅读"
,
""
);
Long
count
=
TipsUtils
.
getHotCount
(
readCount
);
log
.
info
(
"{},阅读量:{}"
,
hotSearchList
.
getName
(),
count
);
hotSearchList
.
setCommentCount
(
count
);
if
(
StringUtils
.
isNotBlank
(
htmlBody
)&&
htmlBody
.
contains
(
"data"
))
{
try
{
String
substring
=
htmlBody
.
substring
(
htmlBody
.
indexOf
(
"read_count"
)+
12
,
htmlBody
.
indexOf
(
"search_bar_controll"
));
String
s
=
substring
.
split
(
","
)[
0
];
Long
commentCount
=
Long
.
valueOf
(
s
);
hotSearchList
.
setCommentCount
(
commentCount
);
hotSearchListDAO
.
updateTouTiaoReadCount
(
hotSearchList
);
return
hotSearchList
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
}
ZhiWeiTools
.
sleep
(
1000L
);
}
}
return
hotSearchList
;
}
/**
* 热搜类型
*
...
...
@@ -231,7 +227,7 @@ public class ToutiaoHotSearchCrawler {
headerMap
.
put
(
"User-Agent"
,
"com.ss.android.article.news/8770 (Linux; U; Android 9; zh_CN; Redmi 8; Build/PKQ1.190319.001; Cronet/TTNetVersion:a867b489 2022-03-11 QuicVersion:b314d107 2021-11-24) Accept-Encoding: gzip, deflate, br"
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
htmlBody
=
null
;
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"获取今日头条榜单出错"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WangYiHotSearchCrawler.java
View file @
1d9b8289
...
...
@@ -5,6 +5,7 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -41,7 +42,7 @@ public class WangYiHotSearchCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
String
htmlBody
=
null
;
for
(
int
t
=
0
;
t
<
3
;
t
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"网易新闻实时热榜页面连接异常..."
,
cause
);
...
...
@@ -84,7 +85,7 @@ public class WangYiHotSearchCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
String
htmlBody
=
null
;
for
(
int
t
=
0
;
t
<
3
;
t
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"网易新闻跟贴热议页面连接异常..."
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiBoBrandCrawler.java
View file @
1d9b8289
...
...
@@ -4,6 +4,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -42,7 +43,7 @@ public class WeiBoBrandCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌总榜时出现连接失败"
,
cause
);
...
...
@@ -76,7 +77,7 @@ public class WeiBoBrandCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌汽车榜时出现连接失败"
,
cause
);
...
...
@@ -110,7 +111,7 @@ public class WeiBoBrandCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌手机榜时出现连接失败"
,
cause
);
...
...
@@ -144,7 +145,7 @@ public class WeiBoBrandCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌美妆榜时出现连接失败"
,
cause
);
...
...
@@ -178,7 +179,7 @@ public class WeiBoBrandCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌奢侈品榜时出现连接失败"
,
cause
);
...
...
@@ -212,7 +213,7 @@ public class WeiBoBrandCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌食品饮料榜时出现连接失败"
,
cause
);
...
...
@@ -246,7 +247,7 @@ public class WeiBoBrandCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌家电榜时出现连接失败"
,
cause
);
...
...
@@ -280,7 +281,7 @@ public class WeiBoBrandCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌服装鞋帽榜时出现连接失败"
,
cause
);
...
...
@@ -314,7 +315,7 @@ public class WeiBoBrandCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
//重试两次
for
(
int
x
=
0
;
x
<
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博品牌服装鞋帽榜时出现连接失败"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiBoSearchBoxHotWordsCrawler.java
View file @
1d9b8289
...
...
@@ -4,6 +4,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.WeiBoSearchBoxHotWords
;
...
...
@@ -36,7 +37,7 @@ public class WeiBoSearchBoxHotWordsCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博搜索框热词时出现解析错误,页面结构有问题"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiBoSearchHotWordsCrawler.java
View file @
1d9b8289
...
...
@@ -4,6 +4,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.WeiBoSearchBoxHotWords
;
...
...
@@ -35,7 +36,7 @@ public class WeiBoSearchHotWordsCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiShiHotSearchCrawler.java
View file @
1d9b8289
...
...
@@ -4,6 +4,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -44,7 +45,7 @@ public class WeiShiHotSearchCrawler {
headerMap
.
put
(
"Host"
,
"api.weishi.qq.com"
);
Request
request
=
RequestUtils
.
wrapPost
(
url
,
headerMap
,
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
"{\"req_body\":{\"hotRankID\":\"\",\"attachInfo\":\"\",\"hotRankType\":1,\"sourceID\":\"WSSearchH5\"}}"
));
for
(
int
count
=
0
;
count
<=
3
;
count
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微视热榜时出现连接失败"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboEntertainmentCrawler.java
View file @
1d9b8289
...
...
@@ -4,6 +4,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -37,7 +38,7 @@ public class WeiboEntertainmentCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博娱乐榜时出现连接失败"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
1d9b8289
...
...
@@ -12,6 +12,7 @@ import java.util.stream.Collectors;
import
com.alibaba.fastjson.JSON
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.*
;
...
...
@@ -67,7 +68,7 @@ public class WeiboHotSearchCrawler {
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
if
(
i
==
2
)
{
return
list
;
...
...
@@ -263,7 +264,7 @@ public class WeiboHotSearchCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博时热搜时出现连接失败"
,
cause
);
...
...
@@ -361,7 +362,7 @@ public class WeiboHotSearchCrawler {
String
url
=
"https://api.weibo.cn/2/guest/page?c=android&s=3d477777&from=10A8395010&gsid=_2AkMoFNQvf8NhqwJRm_gWy2rkbo1_yA7EieKeSCX0JRM3HRl-wT9kqkIltRV6A-gElEGNj31RgrfclQ31YPAf7UBZPBx2&containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博热搜时出现连接失败"
,
cause
);
...
...
@@ -419,7 +420,7 @@ public class WeiboHotSearchCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
cause
);
...
...
@@ -517,7 +518,7 @@ public class WeiboHotSearchCrawler {
//该cookie有效期一年,微博pc端获取游客cookie链接 https://s.weibo.com/top/summary?cate=realtimehot
headerMap
.
put
(
"Cookie"
,
"SUB=_2AkMUShJMf8NxqwJRmP0RyWvgb4RwwgnEieKiFuOXJRMxHRl-yT92qlQvtRB6P8o8oso9Ew-s6vf16fdCca-Xz6DwwAMH; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFdAobr6HdAbgQQ9vbUQKDx"
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博时热搜时出现连接失败"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboNewsCrawler.java
View file @
1d9b8289
...
...
@@ -5,6 +5,7 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -48,7 +49,7 @@ public class WeiboNewsCrawler {
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
List
<
HotSearchList
>
result
=
new
ArrayList
();
//发送第一次请求获取前20条数据
Response
response
=
httpBoot
.
syncCall
(
request1
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request1
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"第一次请求解析微博要闻榜时出现连接失败"
,
cause
);
...
...
@@ -70,7 +71,7 @@ public class WeiboNewsCrawler {
continue
;
}
//发送第二次请求获取中间20条数据
Response
response1
=
httpBoot
.
syncCall
(
request2
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response1
=
httpBoot
.
syncCall
(
request2
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response1
.
hasCause
()){
Throwable
cause
=
response1
.
cause
();
log
.
error
(
"第二次请求解析微博要闻榜时出现连接失败"
,
cause
);
...
...
@@ -93,7 +94,7 @@ public class WeiboNewsCrawler {
continue
;
}
//发送第三次请求获取最后10条数据
Response
response2
=
httpBoot
.
syncCall
(
request3
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response2
=
httpBoot
.
syncCall
(
request3
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response2
.
hasCause
()){
Throwable
cause
=
response2
.
cause
();
log
.
error
(
"第三次请求解析微博要闻榜时出现连接失败"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboOutCircleCrawler.java
View file @
1d9b8289
...
...
@@ -2,6 +2,7 @@ package com.zhiwei.searchhotcrawler.crawler;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -42,7 +43,7 @@ public class WeiboOutCircleCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
x
=
0
;
x
<=
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博出圈榜时出现连接失败"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboSuperTopicCrawler.java
View file @
1d9b8289
...
...
@@ -10,6 +10,7 @@ import java.util.Objects;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
...
...
@@ -64,7 +65,7 @@ public class WeiboSuperTopicCrawler {
String
htmlBody
=
null
;
//重试三次
for
(
int
retryTimes
=
1
;
retryTimes
<=
3
;
retryTimes
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"获取榜单列表页面时出现错误,错误为:{}"
,
cause
);
...
...
@@ -140,7 +141,7 @@ public class WeiboSuperTopicCrawler {
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid="
+
id
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
String
htmlBody
=
null
;
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析榜单详情页面时出现错误,错误为:{}"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboTopicCrawler.java
View file @
1d9b8289
...
...
@@ -5,6 +5,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -139,7 +140,7 @@ public class WeiboTopicCrawler {
String
htmlBody
=
null
;
//重试三次
for
(
int
retryTimes
=
1
;
retryTimes
<=
5
;
retryTimes
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"下载榜单列表页面时出现错误,错误为:{}"
,
cause
);
...
...
@@ -228,7 +229,7 @@ public class WeiboTopicCrawler {
String
htmlBody
=
null
;
//重试三次
for
(
int
retryTimes
=
1
;
retryTimes
<=
3
;
retryTimes
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"下载榜单列表页面时出现错误,错误为:{}"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboVideoCrawler.java
View file @
1d9b8289
...
...
@@ -4,6 +4,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -41,7 +42,7 @@ public class WeiboVideoCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
x
=
0
;
x
<=
2
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博视频榜时出现连接失败"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/XinLangHotSearchCrawler.java
View file @
1d9b8289
...
...
@@ -4,6 +4,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -39,7 +40,7 @@ public class XinLangHotSearchCrawler {
String
htmlBody
=
null
;
JSONObject
jsonObject
=
null
;
for
(
int
t
=
0
;
t
<
3
&&
jsonObject
==
null
;
t
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"新浪热榜页面连接异常..."
,
cause
);
...
...
@@ -115,7 +116,7 @@ public class XinLangHotSearchCrawler {
String
htmlBody
=
null
;
JSONArray
dataJson
=
null
;
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"新浪热点页面连接异常..."
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuChildHotSearchCrawler.java
View file @
1d9b8289
...
...
@@ -7,6 +7,7 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -41,7 +42,7 @@ public class ZhihuChildHotSearchCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
//采集为空最多重试3次
for
(
int
t
=
0
;
t
<
3
&&
dataJson
==
null
;
t
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
response
.
cause
().
printStackTrace
();
}
else
{
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
View file @
1d9b8289
...
...
@@ -5,6 +5,7 @@ import java.util.*;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
io.netty.handler.ssl.SslProvider
;
...
...
@@ -99,7 +100,7 @@ public class ZhihuHotSearchCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
x
=
0
;
x
<=
5
;
x
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
debug
(
"获取知乎热搜时出现问题:{}"
,
cause
);
...
...
@@ -170,7 +171,7 @@ public class ZhihuHotSearchCrawler {
Map
.
put
(
"cookie"
,
"_xsrf=7NFWM5qBcOutfs8MaW7bhQQH65t3Xia4"
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
Map
);
try
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
())
{
Throwable
cause
=
response
.
cause
();
log
.
error
(
"单条知乎热搜数据页面连接失败"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuTopicSearchCrawler.java
View file @
1d9b8289
...
...
@@ -4,6 +4,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -25,7 +26,7 @@ public class ZhihuTopicSearchCrawler {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
t
=
0
;
t
<
3
&&
jsonObject
==
null
;
t
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"知乎热搜页面连接异常"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
View file @
1d9b8289
package
com
.
zhiwei
.
searchhotcrawler
.
run
;
import
com.zhiwei.http.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.http.proxy.CynomysFactory
;
import
com.zhiwei.network.cynomys.consumer.CynomysConsumer
;
import
com.zhiwei.network.cynomys.consumer.CynomysConsumerFactory
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.timer.*
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.dubbo.config.ApplicationConfig
;
import
org.apache.dubbo.config.ConsumerConfig
;
import
org.apache.dubbo.config.RegistryConfig
;
import
org.springframework.context.ApplicationContext
;
import
org.springframework.context.support.ClassPathXmlApplicationContext
;
...
...
@@ -15,9 +19,34 @@ public class HotSearchRun {
public
static
void
main
(
String
[]
args
)
{
ApplicationContext
context
=
new
ClassPathXmlApplicationContext
(
"applicationContext.xml"
);
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
// SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
// .group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
// ProxyFactory.init(simpleConfig);
ApplicationConfig
applicationConfig
=
new
ApplicationConfig
();
applicationConfig
.
setName
(
"hot_search-project"
);
RegistryConfig
registryConfig
=
new
RegistryConfig
();
ConsumerConfig
consumerConfig
=
new
ConsumerConfig
();
String
username
=
null
;
String
password
=
null
;
if
(
ProxyConfig
.
isLocal
)
{
registryConfig
.
setAddress
(
ProxyConfig
.
localRegistry
);
// 设置分组
consumerConfig
.
setGroup
(
ProxyConfig
.
localGroup
);
username
=
ProxyConfig
.
localUsername
;
password
=
ProxyConfig
.
localPassword
;
}
else
{
registryConfig
.
setAddress
(
ProxyConfig
.
hangzhouRegistry
);
// 设置分组
consumerConfig
.
setGroup
(
ProxyConfig
.
hangzhouGroup
);
username
=
ProxyConfig
.
hangzhouUsername
;
password
=
ProxyConfig
.
hangzhouPassword
;
}
// 创建 consumer,applicationConfig 非必需参数
CynomysConsumer
consumer
=
CynomysConsumerFactory
.
create
(
applicationConfig
,
registryConfig
,
consumerConfig
,
username
,
password
);
// 初始化 http-boot 桥接
CynomysFactory
.
init
(
consumer
);
new
UpdateWechatUserRun
().
start
();
ZhiWeiTools
.
sleep
(
10000
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearch36KrCrawlerTest.java
View file @
1d9b8289
...
...
@@ -3,6 +3,7 @@ package com.zhiwei.searchhotcrawler.test;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -40,7 +41,7 @@ public class HotSearch36KrCrawlerTest {
String
url
=
"https://www.36kr.com/hot-list/catalog"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析36Kr人气榜时出现解析错误,页面结构有问题"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchRunTest.java
View file @
1d9b8289
...
...
@@ -2,7 +2,6 @@ package com.zhiwei.searchhotcrawler.test;
import
com.zhiwei.http.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun
;
...
...
@@ -13,9 +12,9 @@ import java.text.ParseException;
public
class
HotSearchRunTest
{
public
static
void
main
(
String
[]
args
)
throws
ParseException
{
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
//
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
//
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
//
ProxyFactory.init(simpleConfig);
//微博热搜开始采集
// new WeiboHotSearchRun().start();
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/HuXiuHotSearchCrawlerTest.java
View file @
1d9b8289
...
...
@@ -2,6 +2,7 @@ package com.zhiwei.searchhotcrawler.test;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -39,7 +40,7 @@ public class HuXiuHotSearchCrawlerTest {
String
url
=
"https://www.huxiu.com/"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析虎嗅热文推荐时出现解析错误,页面结构有问题"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/Job51Test.java
View file @
1d9b8289
...
...
@@ -7,7 +7,7 @@ import com.mongodb.client.MongoDatabase;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.Proxy
Factory
;
import
com.zhiwei.http.proxy.Proxy
ServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.proxy.config.SimpleConfig
;
...
...
@@ -36,9 +36,9 @@ public class Job51Test {
public
static
void
main
(
String
[]
args
)
{
// ApplicationContext context = new ClassPathXmlApplicationContext("applicationContext.xml");
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
//
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
//
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
//
ProxyFactory.init(simpleConfig);
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
3
).
build
();
// MongoDatabase mongoDBLocal = MongoDBLocalTemplate.getDB(DBConfig.dbName);
...
...
@@ -64,7 +64,7 @@ public class Job51Test {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
header
);
for
(
int
t
=
0
;
t
<
1
&&
jsonObject
==
null
;
t
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"知乎热搜页面连接异常"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/KuaiShouHotSearchCrawlerTest.java
View file @
1d9b8289
...
...
@@ -6,6 +6,7 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -37,7 +38,7 @@ public class KuaiShouHotSearchCrawlerTest {
String
url
=
"https://video.kuaishou.com/?utm_source=aa&utm_medium=05&utm_campaign=aa_05_pp_yr&plan_id=138090084&unit_id=5205658029&creative_id=43661481717&keyword_id=202928529242&keyword=202928529242&bd_vid=11937382025080724791"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析快手热榜时出现解析错误,页面结构有问题"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/TaoBaoHotSearchCrawlerTest.java
View file @
1d9b8289
...
...
@@ -5,6 +5,7 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -37,7 +38,7 @@ public class TaoBaoHotSearchCrawlerTest {
String
urls
=
"https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t="
+
time
+
"&sign=&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"
;
Request
request1
=
RequestUtils
.
wrapGet
(
urls
);
String
token
=
null
;
Response
response1
=
httpBoot
.
syncCall
(
request1
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response1
=
httpBoot
.
syncCall
(
request1
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response1
.
hasCause
()){
Throwable
cause
=
response1
.
cause
();
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
cause
);
...
...
@@ -56,7 +57,7 @@ public class TaoBaoHotSearchCrawlerTest {
String
sign
=
MD5Util
.
getMD5
(
signs
).
toLowerCase
();
String
url
=
"https://acs.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?appKey=12574478&t="
+
time
+
"&sign="
+
sign
+
"&api=mtop.relationrecommend.WirelessRecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22appId%22%3A%2210211%22%2C%22params%22%3A%22%7B%5C%22multi_hintq_show%5C%22%3A%5C%22on%5C%22%2C%5C%22src%5C%22%3A%5C%22c2c%5C%22%2C%5C%22area%5C%22%3A%5C%22active_page%5C%22%2C%5C%22sversion%5C%22%3A%5C%227.5%5C%22%2C%5C%22bangdan_src%5C%22%3A%5C%22list%5C%22%7D%22%7D"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析淘宝热搜时出现解析错误,页面结构有问题"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/TaoBaoRunTest.java
View file @
1d9b8289
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.zhiwei.http.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.timer.BaiduHotSearchRun
;
...
...
@@ -13,9 +12,9 @@ import java.text.ParseException;
public
class
TaoBaoRunTest
{
public
static
void
main
(
String
[]
args
)
throws
ParseException
{
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
//
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
//
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
//
ProxyFactory.init(simpleConfig);
//微博热搜开始采集
// new WeiboHotSearchRun().start();
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/WeiboEntertainmentCrawlerTest.java
View file @
1d9b8289
...
...
@@ -5,6 +5,7 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
...
...
@@ -41,7 +42,7 @@ public class WeiboEntertainmentCrawlerTest {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博娱乐榜时出现连接失败"
,
cause
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
View file @
1d9b8289
...
...
@@ -113,8 +113,13 @@ public class GatherTimer {
log
.
info
(
"{}, 今日头条此轮采集到的数据量为:{}"
,
new
Date
(),
toutiaoList
!=
null
?
toutiaoList
.
size
()
:
0
);
TipsUtils
.
addHotList
(
HotSearchType
.
今日头条热搜
.
name
(),
toutiaoList
);
log
.
info
(
"今日头条热搜采集结束..."
);
log
.
info
(
"今日头条热搜详情趋势阅读量更新..."
);
TouTiaoExecutor
.
countTouTiaoReadCount
(
toutiaoList
);
//暂停今日头条阅读量更新
// log.info("今日头条热搜详情趋势阅读量更新开始...");
// //TouTiaoExecutor.countTouTiaoReadCount(toutiaoList);
// for (HotSearchList hotSearchList : toutiaoList) {
// ToutiaoHotSearchCrawler.toutiaoReadCount(hotSearchList);
// }
// log.info("今日头条热搜详情趋势阅读量更新结束...");
}
/**
...
...
@@ -362,7 +367,7 @@ public class GatherTimer {
* 知乎热搜数码分类采集
*/
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"20 * * * * ? "
)
//
@Scheduled(cron = "20 * * * * ? ")
public
void
crawlerZhiHuDigital
(){
this
.
crawlerZhiHuChild
(
DIGITAL
);
}
...
...
@@ -428,7 +433,7 @@ public class GatherTimer {
* 微博超话的采集
*/
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"0 0 0/3 * * ? "
)
//
@Scheduled(cron = "0 0 0/3 * * ? ")
public
void
crawlerWeiBoSuperTopic
(){
log
.
info
(
"微博超话采集开始........"
);
Date
date
=
DateUtils
.
getMillSecondTime
(
new
Date
());
...
...
@@ -675,7 +680,7 @@ public class GatherTimer {
*微博热词采集
*/
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"0 0 0/1 * * ? "
)
//
@Scheduled(cron = "0 0 0/1 * * ? ")
public
void
WeiBoSearchHotWordsCrawler
(){
log
.
info
(
"微博热词采集开始........"
);
Date
date
=
DateUtils
.
getMillSecondTime
(
new
Date
());
...
...
src/main/resources/proxyip.properties
View file @
1d9b8289
registry
=
zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182&timeout=60000
group
=
hangzhou
isLocal
=
false
hangzhou.registry
=
zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182&timeout=60000
hangzhou.group
=
hangzhou
hangzhou.username
=
hot_search
hangzhou.password
=
gRG9QJ6QghuLcCC9
########################################################
#registry=zookeeper://192.168.0.35:2181?backup=192.168.0.30:2181,192.168.0.11:2181&timeout=60000
#group=local
local.registry
=
zookeeper://192.168.0.35:2181?backup=192.168.0.30:2181,192.168.0.11:2181&timeout=60000
local.group
=
local
#local.username=15139460980
#local.password=lllq2w3e4r
local.username
=
15757871020
local.password
=
Cwt1q2w3e4r@
src/test/java/InfoTest/InfoTest.java
View file @
1d9b8289
package
InfoTest
;
import
com.mongodb.client.MongoCollection
;
import
com.zhiwei.http.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
...
...
@@ -36,9 +36,9 @@ public class InfoTest {
@Test
public
void
testMaimai
(){
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
//
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
//
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
//
ProxyFactory.init(simpleConfig);
List
<
HotSearchList
>
hotSearchLists
=
MaiMaiHotSearchCrawler
.
getMaiMaiHotData
(
new
Date
());
...
...
src/test/java/baiduTest/BaiduTest.java
View file @
1d9b8289
package
baiduTest
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.proxy.ProxyFactory
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.proxy.config.SimpleConfig
;
...
...
@@ -62,9 +62,9 @@ public class BaiduTest {
@Test
public
void
test
(){
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
//
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
//
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
//
ProxyFactory.init(simpleConfig);
List
<
HotSearchList
>
hotSearchLists
=
baiduHotSearch
(
new
Date
());
...
...
@@ -122,7 +122,7 @@ public class BaiduTest {
// headers.put("Content-type","text/html; charset=gb2312");
// Request request = RequestUtils.wrapGet(url, HeadersUtils.convertRepeatably(headers, Charset.forName("gb2312")));
Request
request
=
RequestUtils
.
wrapGet
(
url
);
// try(Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY)) {
// try(Response response = httpBoot.syncCall(request, ProxyS
erverS
upplier.NAT_HEAVY_PROXY)) {
// htmlBody = response.body().string();
// } catch (Exception e) {
// log.error("解析百度风云榜时出现解析错误,页面结构有问题", e);
...
...
src/test/java/hotSaerchTest/HotSearchTest.java
View file @
1d9b8289
...
...
@@ -5,7 +5,7 @@ import com.mongodb.client.MongoCollection;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.Proxy
Factory
;
import
com.zhiwei.http.proxy.Proxy
ServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.proxy.config.SimpleConfig
;
...
...
@@ -51,9 +51,9 @@ public class HotSearchTest {
@Test
public
void
kuaiShouTestCrawler
()
{
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
//
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
//
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
//
ProxyFactory.init(simpleConfig);
List
<
HotSearchList
>
hotSearchLists
=
KuaiShouHotSearchCrawlerTest
.
KuaiShouHotSearchCrawler
(
new
Date
());
System
.
out
.
println
(
hotSearchLists
);
...
...
@@ -64,15 +64,15 @@ public class HotSearchTest {
@Test
public
void
WeiBoUpdate
()
{
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
//
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
//
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
//
ProxyFactory.init(simpleConfig);
Document
document
=
new
Document
();
//String url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26t%3D10%26q%3D%23我国新冠疫苗接种剂次超9亿%23";
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid=231522type%3D1%26q%3D%23可口可乐回应C罗拒绝与可乐同框%23"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
cause
);
...
...
@@ -135,9 +135,9 @@ public class HotSearchTest {
@Test
public
void
taoBaoTestCrawler
()
{
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
//
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
//
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
//
ProxyFactory.init(simpleConfig);
List
<
HotSearchList
>
hotSearchLists
=
TaoBaoHotSearchCrawlerTest
.
taoBaoHotSearch
(
new
Date
());
...
...
@@ -152,9 +152,9 @@ public class HotSearchTest {
@Test
public
void
baiDuTestCrawler
()
{
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
//
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
//
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
//
ProxyFactory.init(simpleConfig);
List
<
HotSearchList
>
hotSearchLists
=
BaiDuHotSearchCrawler
.
baiduHotSearch
(
new
Date
());
System
.
out
.
println
(
hotSearchLists
);
...
...
src/test/java/proxy/ProxyTest.java
0 → 100644
View file @
1d9b8289
package
proxy
;
import
com.zhiwei.http.proxy.CynomysFactory
;
import
com.zhiwei.network.cynomys.consumer.CynomysConsumer
;
import
com.zhiwei.network.cynomys.consumer.CynomysConsumerFactory
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.crawler.HotSearch36KrCrawler
;
import
lombok.extern.log4j.Log4j2
;
import
org.apache.dubbo.config.ApplicationConfig
;
import
org.apache.dubbo.config.ConsumerConfig
;
import
org.apache.dubbo.config.RegistryConfig
;
import
org.junit.Test
;
import
org.junit.runner.RunWith
;
import
org.springframework.test.context.ContextConfiguration
;
import
org.springframework.test.context.junit4.SpringJUnit4ClassRunner
;
import
java.util.Date
;
import
java.util.List
;
/**
* @author cwt
* @date 2022/11/8 10:22
*/
@Log4j2
@RunWith
(
SpringJUnit4ClassRunner
.
class
)
@ContextConfiguration
(
locations
=
{
"classpath:applicationContext.xml"
})
public
class
ProxyTest
{
@Test
public
void
initTest
()
{
ApplicationConfig
applicationConfig
=
new
ApplicationConfig
();
applicationConfig
.
setName
(
"hot_search-project"
);
RegistryConfig
registryConfig
=
new
RegistryConfig
();
ConsumerConfig
consumerConfig
=
new
ConsumerConfig
();
String
username
=
null
;
String
password
=
null
;
if
(
ProxyConfig
.
isLocal
)
{
registryConfig
.
setAddress
(
ProxyConfig
.
localRegistry
);
// 设置分组
consumerConfig
.
setGroup
(
ProxyConfig
.
localGroup
);
username
=
ProxyConfig
.
localUsername
;
password
=
ProxyConfig
.
localPassword
;
}
else
{
registryConfig
.
setAddress
(
ProxyConfig
.
hangzhouRegistry
);
// 设置分组
consumerConfig
.
setGroup
(
ProxyConfig
.
hangzhouGroup
);
username
=
ProxyConfig
.
hangzhouUsername
;
password
=
ProxyConfig
.
hangzhouPassword
;
}
// 创建 consumer,applicationConfig 非必需参数
CynomysConsumer
consumer
=
CynomysConsumerFactory
.
create
(
applicationConfig
,
registryConfig
,
consumerConfig
,
username
,
password
);
// 初始化 http-boot 桥接
CynomysFactory
.
init
(
consumer
);
log
.
info
(
"桥接初始化完成"
);
try
{
Thread
.
sleep
(
1000L
);
}
catch
(
InterruptedException
e
)
{
e
.
printStackTrace
();
}
List
<
HotSearchList
>
hotSearchLists
=
HotSearch36KrCrawler
.
hotSearch36Kr
(
new
Date
());
hotSearchLists
.
forEach
(
System
.
out
::
println
);
}
@Test
public
void
configTest
()
{
System
.
out
.
println
(
ProxyConfig
.
isLocal
);
System
.
out
.
println
(
ProxyConfig
.
hangzhouGroup
);
}
}
src/test/java/weiboTest/WeiboHotSearchTest.java
View file @
1d9b8289
...
...
@@ -6,7 +6,8 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.boot.Response
;
import
com.zhiwei.http.proxy.ProxyFactory
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.proxy.config.SimpleConfig
;
...
...
@@ -69,9 +70,9 @@ public class WeiboHotSearchTest {
@Test
public
void
testHotWeibo
()
{
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
//
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
//
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
//
ProxyFactory.init(simpleConfig);
while
(
true
)
{
try
{
...
...
@@ -110,9 +111,9 @@ public class WeiboHotSearchTest {
//org.bson.Document document
// @Test
public
void
test12
(
org
.
bson
.
Document
document
)
{
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
//
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
//
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
//
ProxyFactory.init(simpleConfig);
// org.bson.Document document = new org.bson.Document();
// document.put("name","新疆人讲述真实的新疆");
// document.put("url","https://m.weibo.cn/search?containerid=100103type%3D1%26t%3D10%26q%3D%23%E6%96%B0%E7%96%86%E4%BA%BA%E8%AE%B2%E8%BF%B0%E7%9C%9F%E5%AE%9E%E7%9A%84%E6%96%B0%E7%96%86%23&isnewpage=1&extparam=seat%3D1%26filter_type%3Drealtimehot%26dgr%3D0%26cate%3D0%26pos%3D1%26realpos%3D2%26flag%3D1%26c_type%3D31%26display_time%3D1622705918&luicode=10000011&lfid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot");
...
...
@@ -123,7 +124,7 @@ public class WeiboHotSearchTest {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
2
;
count
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
cause
);
...
...
@@ -216,7 +217,7 @@ public class WeiboHotSearchTest {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
2
;
count
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
cause
);
...
...
@@ -532,7 +533,7 @@ public class WeiboHotSearchTest {
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxySupplier
.
NAT_HEAVY_PROXY
);
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyS
erverS
upplier
.
NAT_HEAVY_PROXY
);
if
(
response
.
hasCause
()){
Throwable
cause
=
response
.
cause
();
log
.
error
(
"解析微博时热搜时出现连接失败"
,
cause
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment