Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
b3fce9ac
Commit
b3fce9ac
authored
Mar 22, 2022
by
chenweiyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
基础包升级版
parent
d705de1f
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
72 additions
and
56 deletions
+72
-56
pom.xml
+22
-5
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+10
-9
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+10
-10
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+8
-7
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+10
-9
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawlerNew.java
+11
-14
src/main/java/com/zhiwei/source_forward/util/ProxyInit.java
+1
-2
No files found.
pom.xml
View file @
b3fce9ac
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<version>
0.
3
.0-SNAPSHOT
</version>
<version>
0.
4
.0-SNAPSHOT
</version>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
...
...
@@ -30,13 +30,30 @@
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.
2.4
-SNAPSHOT
</version>
<version>
0.
3.2
-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.6.6.8
-SNAPSHOT
</version>
<groupId>
com.zhiwei.http
</groupId>
<artifactId>
http-boot
</artifactId>
<version>
0.0.4.3
-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.async
</groupId>
<artifactId>
task-boot
</artifactId>
<version>
0.20.1-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.kohlschutter.boilerpipe
</groupId>
<artifactId>
boilerpipe-extractor
</artifactId>
<version>
0.0.1-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
proxy-client
</artifactId>
<version>
1.1.5-SNAPSHOT
</version>
</dependency>
</dependencies>
<!-- 打包管理 -->
...
...
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
View file @
b3fce9ac
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.time.Duration
;
import
java.util.Objects
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
com.zhiwei.task.sync.GroupSync
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
public
class
ContentCrawler
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
ContentCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
connectTimeout
(
Duration
.
ofSeconds
(
10
)).
retryTimes
(
3
).
detectChineseCharset
(
true
).
build
();
/**
*
...
...
@@ -75,15 +76,15 @@ public class ContentCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
Proxy
Hold
er
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
Proxy
Suppli
er
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
parseHtml
(
rs
.
body
().
s
tring
(),
attr
,
callback
);
parseHtml
(
rs
.
body
S
tring
(),
attr
,
callback
);
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
}
}
catch
(
Exception
e
)
{
logger
.
info
(
"搜索结果访问失败:
{}
"
,
ex
);
logger
.
info
(
"搜索结果访问失败:
"
,
ex
);
}
finally
{
counter
.
done
();
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
b3fce9ac
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.time.Duration
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack
;
import
com.zhiwei.task.sync.GroupSync
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
...
...
@@ -34,7 +34,7 @@ import okhttp3.Request;
public
class
MediaSelfSourceCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
MediaSelfSourceCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
connectTimeout
(
Duration
.
ofSeconds
(
10
)).
retryTimes
(
3
).
detectChineseCharset
(
true
).
build
();
/**
*
...
...
@@ -91,7 +91,7 @@ public class MediaSelfSourceCrawler {
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
attr
.
get
());
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Proxy
Holder
ph
=
ProxyHold
er
.
NAT_HEAVY_PROXY
;
Proxy
Supplier
ph
=
ProxySuppli
er
.
NAT_HEAVY_PROXY
;
if
(
url
.
contains
(
"toutiao.com"
))
{
map
.
put
(
"referer"
,
url
);
}
...
...
@@ -104,7 +104,7 @@ public class MediaSelfSourceCrawler {
try
{
if
(
Objects
.
isNull
(
ex
))
{
try
{
parseHtml
(
rs
.
body
().
s
tring
(),
attr
,
callback
,
rs
.
request
().
url
().
uri
().
toString
());
parseHtml
(
rs
.
body
S
tring
(),
attr
,
callback
,
rs
.
request
().
url
().
uri
().
toString
());
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析出错"
,
e
);
}
...
...
@@ -169,7 +169,7 @@ public class MediaSelfSourceCrawler {
source
=
MatchSource
.
matchMediaSelfSource
(
url
+
eUrl
,
result
);
channel
=
MatchChannel
.
verifyChannel
(
url
);
if
(
channel
==
null
){
List
<
Node
>
nodeList
=
Jsoup
.
parse
(
result
).
head
().
childNodes
();
List
<
org
.
jsoup
.
nodes
.
Node
>
nodeList
=
Jsoup
.
parse
(
result
).
head
().
childNodes
();
channel
=
MatchChannel
.
matchChannel
(
nodeList
);
}
}
catch
(
Exception
e
)
{
...
...
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
View file @
b3fce9ac
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.time.Duration
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
...
...
@@ -11,16 +12,16 @@ import org.jsoup.Jsoup;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
com.zhiwei.task.sync.GroupSync
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
...
...
@@ -29,7 +30,7 @@ public class SourceForwardCrawler {
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
SourceForwardCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
connectTimeout
(
Duration
.
ofSeconds
(
10
)).
retryTimes
(
3
).
detectChineseCharset
(
true
).
build
();
private
static
List
<
String
>
sourceList
=
SourceData
.
getSourceList
();
public
GroupSync
submitTask
(
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
...
...
@@ -77,10 +78,10 @@ public class SourceForwardCrawler {
}
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
Proxy
Hold
er
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
Proxy
Suppli
er
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
parseHtml
(
rs
.
body
().
s
tring
(),
attr
,
callback
);
parseHtml
(
rs
.
body
S
tring
(),
attr
,
callback
);
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
b3fce9ac
...
...
@@ -2,6 +2,7 @@ package com.zhiwei.source_forward.crawler;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.time.Duration
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.List
;
...
...
@@ -17,13 +18,13 @@ import org.jsoup.nodes.Document;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONPath
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.proxy.ProxySupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.task.sync.GroupSync
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
...
...
@@ -39,7 +40,7 @@ import okhttp3.Request;
public
class
UrlLiveCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
throwException
(
true
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
connectTimeout
(
Duration
.
ofSeconds
(
10
)).
retryTimes
(
3
).
detectChineseCharset
(
true
).
build
();
public
GroupSync
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
GroupSync
counter
=
new
GroupSync
();
...
...
@@ -71,12 +72,12 @@ public class UrlLiveCrawler {
url
=
dealUrl
(
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
Proxy
Holder
ph
=
ProxyHold
er
.
NAT_HEAVY_PROXY
;
Proxy
Supplier
ph
=
ProxySuppli
er
.
NAT_HEAVY_PROXY
;
if
(
url
.
contains
(
"toutiao.com"
)){
ph
=
Proxy
Hold
er
.
NAT_HEAVY_PROXY
;
ph
=
Proxy
Suppli
er
.
NAT_HEAVY_PROXY
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
url
=
treatZhihuUrl
(
url
);
ph
=
Proxy
Hold
er
.
NAT_HEAVY_PROXY
;
ph
=
Proxy
Suppli
er
.
NAT_HEAVY_PROXY
;
}
try
{
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
...
...
@@ -87,7 +88,7 @@ public class UrlLiveCrawler {
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
rs
.
isSuccessful
())
{
parseHtml
(
rs
.
body
().
s
tring
(),
attr
,
callback
);
parseHtml
(
rs
.
body
S
tring
(),
attr
,
callback
);
}
else
if
(
rs
.
code
()
==
404
){
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
}
else
{
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawlerNew.java
View file @
b3fce9ac
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.time.Duration
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.HashMap
;
...
...
@@ -13,19 +14,18 @@ import org.jsoup.Jsoup;
import
org.jsoup.nodes.Document
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.async.TaskBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.task.async.TaskBoot
;
import
com.zhiwei.task.sync.GroupSync
;
import
okhttp3.Request
;
import
okhttp3.Response
;
public
class
UrlLiveCrawlerNew
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawlerNew
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
throwException
(
fals
e
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
connectTimeout
(
Duration
.
ofSeconds
(
10
)).
retryTimes
(
3
).
detectChineseCharset
(
tru
e
).
build
();
public
List
<
UrlLiveBean
>
judgeIsDelete
(
List
<
String
>
urlList
)
{
GroupSync
counter
=
new
GroupSync
();
...
...
@@ -74,14 +74,11 @@ public class UrlLiveCrawlerNew {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
int
code
=
404
;
for
(
int
i
=
0
;
i
<
2
;
i
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
)){
if
(
response
.
isSuccessful
())
{
return
matchDel
(
response
.
body
().
string
(),
url
);
}
else
{
code
=
response
.
code
();
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析"
,
e
);
com
.
zhiwei
.
http
.
boot
.
Response
response
=
httpBoot
.
syncCall
(
request
);
if
(
response
.
isSuccessful
())
{
return
matchDel
(
response
.
bodyString
(),
url
);
}
else
{
code
=
response
.
code
();
}
}
if
(
code
==
403
){
...
...
src/main/java/com/zhiwei/source_forward/util/ProxyInit.java
View file @
b3fce9ac
package
com
.
zhiwei
.
source_forward
.
util
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.source_forward.config.ProxyConfig
;
...
...
@@ -19,7 +18,7 @@ public class ProxyInit {
String
address
=
ProxyConfig
.
registry
;
String
appName
=
"xumiaoxin"
;
long
appId
=
ProxyConfig
.
proxyid
;
ProxyFactory
.
init
(
SimpleConfig
.
builder
().
registry
(
address
).
appName
(
appName
).
appId
(
appId
).
group
(
ProxyConfig
.
group
).
build
());
com
.
zhiwei
.
http
.
proxy
.
ProxyFactory
.
init
(
SimpleConfig
.
builder
().
registry
(
address
).
appName
(
appName
).
appId
(
appId
).
group
(
ProxyConfig
.
group
).
build
());
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment