Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
2c9d4fa2
Commit
2c9d4fa2
authored
Nov 02, 2022
by
chenweiyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
代理升级版本
parent
4860f41e
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
123 additions
and
63 deletions
+123
-63
pom.xml
+27
-5
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+14
-9
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+13
-8
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+12
-7
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+36
-25
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawlerNew.java
+0
-0
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
+1
-1
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+1
-1
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
+0
-0
src/main/java/com/zhiwei/source_forward/util/ProxyInit.java
+19
-7
No files found.
pom.xml
View file @
2c9d4fa2
...
@@ -3,13 +3,17 @@
...
@@ -3,13 +3,17 @@
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<artifactId>
source-forward
</artifactId>
<version>
0.3.
0
-SNAPSHOT
</version>
<version>
0.3.
1
-SNAPSHOT
</version>
<name>
source-forward
</name>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
<properties>
<properties>
<project.build.sourceEncoding>
UTF-8
</project.build.sourceEncoding>
<project.build.sourceEncoding>
UTF-8
</project.build.sourceEncoding>
<project.reporting.outputEncoding>
UTF-8
</project.reporting.outputEncoding>
<project.reporting.outputEncoding>
UTF-8
</project.reporting.outputEncoding>
<http-boot.version>
0.1.0.8-SNAPSHOT
</http-boot.version>
<task-boot.version>
1.1.2-SNAPSHOT
</task-boot.version>
<boilerpipe.version>
0.0.1-SNAPSHOT
</boilerpipe.version>
<conomys-consumer.version>
0.0.3-SNAPSHOT
</conomys-consumer.version>
</properties>
</properties>
<developers>
<developers>
...
@@ -30,12 +34,30 @@
...
@@ -30,12 +34,30 @@
<dependency>
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.
2.4
-SNAPSHOT
</version>
<version>
0.
4.5
-SNAPSHOT
</version>
</dependency>
</dependency>
<dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<groupId>
com.kohlschutter.boilerpipe
</groupId>
<artifactId>
crawler-core
</artifactId>
<artifactId>
boilerpipe-extractor
</artifactId>
<version>
0.6.6.8-SNAPSHOT
</version>
<version>
${boilerpipe.version}
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.http
</groupId>
<artifactId>
http-boot
</artifactId>
<version>
${http-boot.version}
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.async
</groupId>
<artifactId>
task-boot
</artifactId>
<version>
${task-boot.version}
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.network
</groupId>
<artifactId>
cynomys-consumer
</artifactId>
<version>
${conomys-consumer.version}
</version>
</dependency>
</dependency>
</dependencies>
</dependencies>
...
...
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
View file @
2c9d4fa2
package
com
.
zhiwei
.
source_forward
.
crawler
;
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.Objects
;
import
java.util.Objects
;
import
java.util.concurrent.Semaphore
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
com.zhiwei.task.sync.GroupSync
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
import
okhttp3.Request
;
public
class
ContentCrawler
{
public
class
ContentCrawler
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
ContentCrawler
.
class
);
private
static
Logger
logger
=
LogManager
.
getLogger
(
ContentCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
2
).
build
();
private
static
Semaphore
semaphore
=
new
Semaphore
(
5
);
/**
/**
*
*
...
@@ -51,9 +53,12 @@ public class ContentCrawler {
...
@@ -51,9 +53,12 @@ public class ContentCrawler {
ZhiWeiTools
.
sleep
(
100
);
ZhiWeiTools
.
sleep
(
100
);
if
(
url
!=
null
)
{
if
(
url
!=
null
)
{
try
{
try
{
semaphore
.
acquire
();
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
logger
.
error
(
"搜索创建出错"
,
e
);
}
finally
{
semaphore
.
release
();
}
}
}
}
}
}
...
@@ -75,15 +80,15 @@ public class ContentCrawler {
...
@@ -75,15 +80,15 @@ public class ContentCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
counter
.
add
();
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
Proxy
Hold
er
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
Proxy
ServerSuppli
er
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
try
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
Objects
.
isNull
(
ex
))
{
parseHtml
(
rs
.
body
().
s
tring
(),
attr
,
callback
);
parseHtml
(
rs
.
body
S
tring
(),
attr
,
callback
);
}
else
{
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
info
(
"搜索结果访问失败:
{}
"
,
ex
);
logger
.
info
(
"搜索结果访问失败: "
,
ex
);
}
finally
{
}
finally
{
counter
.
done
();
counter
.
done
();
}
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
2c9d4fa2
...
@@ -4,22 +4,23 @@ import java.util.HashMap;
...
@@ -4,22 +4,23 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.Objects
;
import
java.util.concurrent.Semaphore
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Node
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack
;
import
com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack
;
import
com.zhiwei.task.sync.GroupSync
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
import
okhttp3.Request
;
...
@@ -34,7 +35,8 @@ import okhttp3.Request;
...
@@ -34,7 +35,8 @@ import okhttp3.Request;
public
class
MediaSelfSourceCrawler
{
public
class
MediaSelfSourceCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
MediaSelfSourceCrawler
.
class
);
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
MediaSelfSourceCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
2
).
build
();
private
static
Semaphore
semaphore
=
new
Semaphore
(
5
);
/**
/**
*
*
...
@@ -69,9 +71,12 @@ public class MediaSelfSourceCrawler {
...
@@ -69,9 +71,12 @@ public class MediaSelfSourceCrawler {
counter
.
add
();
counter
.
add
();
if
(
url
!=
null
)
{
if
(
url
!=
null
)
{
try
{
try
{
semaphore
.
acquire
();
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
logger
.
error
(
"搜索创建出错"
,
e
);
}
finally
{
semaphore
.
release
();
}
}
}
}
counter
.
done
();
counter
.
done
();
...
@@ -91,7 +96,7 @@ public class MediaSelfSourceCrawler {
...
@@ -91,7 +96,7 @@ public class MediaSelfSourceCrawler {
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
attr
.
get
());
logger
.
info
(
"当前处理 URL: {}"
,
attr
.
get
());
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Proxy
Holder
ph
=
ProxyHold
er
.
NAT_HEAVY_PROXY
;
Proxy
ServerSupplier
ph
=
ProxyServerSuppli
er
.
NAT_HEAVY_PROXY
;
if
(
url
.
contains
(
"toutiao.com"
))
{
if
(
url
.
contains
(
"toutiao.com"
))
{
map
.
put
(
"referer"
,
url
);
map
.
put
(
"referer"
,
url
);
}
}
...
@@ -104,7 +109,7 @@ public class MediaSelfSourceCrawler {
...
@@ -104,7 +109,7 @@ public class MediaSelfSourceCrawler {
try
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
Objects
.
isNull
(
ex
))
{
try
{
try
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
,
rs
.
r
equest
().
url
().
uri
().
toString
());
parseHtml
(
rs
.
body
String
(),
attr
,
callback
,
rs
.
bootR
equest
().
url
().
uri
().
toString
());
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析出错"
,
e
);
logger
.
error
(
"解析出错"
,
e
);
}
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
View file @
2c9d4fa2
...
@@ -4,6 +4,7 @@ import java.util.HashMap;
...
@@ -4,6 +4,7 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.Objects
;
import
java.util.concurrent.Semaphore
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
...
@@ -11,16 +12,16 @@ import org.jsoup.Jsoup;
...
@@ -11,16 +12,16 @@ import org.jsoup.Jsoup;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Node
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
com.zhiwei.task.sync.GroupSync
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
import
okhttp3.Request
;
...
@@ -29,8 +30,9 @@ public class SourceForwardCrawler {
...
@@ -29,8 +30,9 @@ public class SourceForwardCrawler {
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
SourceForwardCrawler
.
class
);
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
SourceForwardCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
2
).
build
();
private
static
List
<
String
>
sourceList
=
SourceData
.
getSourceList
();
private
static
List
<
String
>
sourceList
=
SourceData
.
getSourceList
();
private
static
Semaphore
semaphore
=
new
Semaphore
(
5
);
public
GroupSync
submitTask
(
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
public
GroupSync
submitTask
(
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
try
{
try
{
...
@@ -50,9 +52,12 @@ public class SourceForwardCrawler {
...
@@ -50,9 +52,12 @@ public class SourceForwardCrawler {
ZhiWeiTools
.
sleep
(
100
);
ZhiWeiTools
.
sleep
(
100
);
if
(
url
!=
null
)
{
if
(
url
!=
null
)
{
try
{
try
{
semaphore
.
acquire
();
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
logger
.
error
(
"搜索创建出错"
,
e
);
}
finally
{
semaphore
.
release
();
}
}
}
}
counter
.
done
();
counter
.
done
();
...
@@ -77,10 +82,10 @@ public class SourceForwardCrawler {
...
@@ -77,10 +82,10 @@ public class SourceForwardCrawler {
}
}
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
counter
.
add
();
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
Proxy
Hold
er
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
Proxy
ServerSuppli
er
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
try
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
Objects
.
isNull
(
ex
))
{
parseHtml
(
rs
.
body
().
s
tring
(),
attr
,
callback
);
parseHtml
(
rs
.
body
S
tring
(),
attr
,
callback
);
}
else
{
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
}
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
2c9d4fa2
package
com
.
zhiwei
.
source_forward
.
crawler
;
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.concurrent.Semaphore
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONPath
;
import
com.alibaba.fastjson.JSONPath
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.task.sync.GroupSync
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
java.util.*
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
okhttp3.Request
;
/**
/**
* @author byte-zbs
* @author byte-zbs
...
@@ -32,7 +39,8 @@ import static java.util.Objects.nonNull;
...
@@ -32,7 +39,8 @@ import static java.util.Objects.nonNull;
public
class
UrlLiveCrawler
{
public
class
UrlLiveCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
throwException
(
true
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
2
).
build
();
private
static
Semaphore
semaphore
=
new
Semaphore
(
5
);
public
GroupSync
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
public
GroupSync
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
GroupSync
counter
=
new
GroupSync
();
GroupSync
counter
=
new
GroupSync
();
...
@@ -43,17 +51,20 @@ public class UrlLiveCrawler {
...
@@ -43,17 +51,20 @@ public class UrlLiveCrawler {
private
void
start
(
GroupSync
counter
,
UrlLiveDataCallback
callback
,
String
...
urls
)
{
private
void
start
(
GroupSync
counter
,
UrlLiveDataCallback
callback
,
String
...
urls
)
{
if
(
nonNull
(
urls
)
&&
urls
.
length
>
0
)
{
if
(
nonNull
(
urls
)
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
for
(
String
url
:
urls
)
{
try
{
counter
.
add
();
counter
.
add
();
ZhiWeiTools
.
sleep
(
100
);
semaphore
.
acquire
();
ZhiWeiTools
.
sleep
(
200
);
if
(
nonNull
(
url
))
{
if
(
nonNull
(
url
))
{
try
{
// ZhiWeiTools.sleep(3000);
// ZhiWeiTools.sleep(3000);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错:"
,
e
);
logger
.
error
(
"搜索创建出错:"
,
e
);
}
}
finally
{
}
counter
.
done
();
counter
.
done
();
semaphore
.
release
();
}
}
}
}
}
}
}
...
@@ -63,12 +74,12 @@ public class UrlLiveCrawler {
...
@@ -63,12 +74,12 @@ public class UrlLiveCrawler {
url
=
dealUrl
(
url
);
url
=
dealUrl
(
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
Proxy
Holder
ph
=
ProxyHold
er
.
NAT_HEAVY_PROXY
;
Proxy
ServerSupplier
ph
=
ProxyServerSuppli
er
.
NAT_HEAVY_PROXY
;
if
(
url
.
contains
(
"toutiao.com"
))
{
if
(
url
.
contains
(
"toutiao.com"
))
{
ph
=
Proxy
Hold
er
.
NAT_HEAVY_PROXY
;
ph
=
Proxy
ServerSuppli
er
.
NAT_HEAVY_PROXY
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
url
=
treatZhihuUrl
(
url
);
url
=
treatZhihuUrl
(
url
);
ph
=
Proxy
Hold
er
.
NAT_HEAVY_PROXY
;
ph
=
Proxy
ServerSuppli
er
.
NAT_HEAVY_PROXY
;
}
}
try
{
try
{
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
...
@@ -80,7 +91,7 @@ public class UrlLiveCrawler {
...
@@ -80,7 +91,7 @@ public class UrlLiveCrawler {
System
.
out
.
println
(
rs
.
code
());
System
.
out
.
println
(
rs
.
code
());
if
(
Objects
.
isNull
(
ex
))
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
rs
.
isSuccessful
())
{
if
(
rs
.
isSuccessful
())
{
parseHtml
(
rs
.
body
().
s
tring
(),
attr
,
callback
);
parseHtml
(
rs
.
body
S
tring
(),
attr
,
callback
);
}
else
if
(
rs
.
code
()
==
404
)
{
}
else
if
(
rs
.
code
()
==
404
)
{
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
}
else
{
}
else
{
...
@@ -91,7 +102,7 @@ public class UrlLiveCrawler {
...
@@ -91,7 +102,7 @@ public class UrlLiveCrawler {
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
" 数据是否删除 采集出错
{}
"
,
e
);
logger
.
error
(
" 数据是否删除 采集出错 "
,
e
);
}
finally
{
}
finally
{
counter
.
done
();
counter
.
done
();
}
}
...
@@ -99,7 +110,7 @@ public class UrlLiveCrawler {
...
@@ -99,7 +110,7 @@ public class UrlLiveCrawler {
return
counter
;
return
counter
;
}
}
}
catch
(
Exception
e2
)
{
}
catch
(
Exception
e2
)
{
logger
.
error
(
"数据出错
{}
"
,
e2
);
logger
.
error
(
"数据出错 "
,
e2
);
}
}
return
counter
;
return
counter
;
}
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawlerNew.java
View file @
2c9d4fa2
This diff is collapsed.
Click to expand it.
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
View file @
2c9d4fa2
...
@@ -80,7 +80,7 @@ public class SourceForward {
...
@@ -80,7 +80,7 @@ public class SourceForward {
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
ProxyInit
.
initProxy
();
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http
://gu.qq.com/resources/shy/news/detail-v2/index.html?#/index?id=SN202006091653447945411f
"
);
urlList
.
add
(
"http
s://ypstatic.cnnb.com.cn/yppage-share/news/share/news_detail?newsId=627223d9e4b042b45e211c5a
"
);
List
<
SourceForwardBean
>
da
=
SourceForward
.
getSourceForward
(
urlList
);
List
<
SourceForwardBean
>
da
=
SourceForward
.
getSourceForward
(
urlList
);
for
(
SourceForwardBean
sfb
:
da
)
{
for
(
SourceForwardBean
sfb
:
da
)
{
System
.
out
.
println
(
"=============="
+
sfb
.
toString
());
System
.
out
.
println
(
"=============="
+
sfb
.
toString
());
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
2c9d4fa2
...
@@ -98,7 +98,7 @@ public class URLLive {
...
@@ -98,7 +98,7 @@ public class URLLive {
};
};
crawler
.
submitTask
(
callback
,
urlList
.
toArray
(
new
String
[
urlList
.
size
()])).
await
();
crawler
.
submitTask
(
callback
,
urlList
.
toArray
(
new
String
[
urlList
.
size
()])).
await
();
}
catch
(
Exception
e
){
}
catch
(
Exception
e
){
logger
.
error
(
" 数据采集运行有问题
{}
"
,
e
);
logger
.
error
(
" 数据采集运行有问题 "
,
e
);
}
}
return
list
;
return
list
;
}
}
...
...
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
View file @
2c9d4fa2
src/main/java/com/zhiwei/source_forward/util/ProxyInit.java
View file @
2c9d4fa2
package
com
.
zhiwei
.
source_forward
.
util
;
package
com
.
zhiwei
.
source_forward
.
util
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
org.apache.dubbo.config.ApplicationConfig
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
org.apache.dubbo.config.ConsumerConfig
;
import
com.zhiwei.source_forward.config.ProxyConfig
;
import
org.apache.dubbo.config.RegistryConfig
;
import
com.zhiwei.http.proxy.CynomysFactory
;
import
com.zhiwei.network.cynomys.consumer.CynomysConsumer
;
import
com.zhiwei.network.cynomys.consumer.CynomysConsumerFactory
;
/**
/**
* 初始化代理
* 初始化代理
...
@@ -16,10 +20,18 @@ public class ProxyInit {
...
@@ -16,10 +20,18 @@ public class ProxyInit {
* void
* void
*/
*/
public
static
void
initProxy
()
{
public
static
void
initProxy
()
{
String
address
=
ProxyConfig
.
registry
;
ApplicationConfig
applicationConfig
=
new
ApplicationConfig
();
String
appName
=
"xumiaoxin"
;
applicationConfig
.
setName
(
"actool"
);
long
appId
=
ProxyConfig
.
proxyid
;
RegistryConfig
registryConfig
=
new
RegistryConfig
();
ProxyFactory
.
init
(
SimpleConfig
.
builder
().
registry
(
address
).
appName
(
appName
).
appId
(
appId
).
group
(
ProxyConfig
.
group
).
build
());
registryConfig
.
setAddress
(
"zookeeper://192.168.0.30:2181"
);
ConsumerConfig
consumerConfig
=
new
ConsumerConfig
();
// 设置分组
consumerConfig
.
setGroup
(
"local"
);
String
username
=
"18271694195"
;
String
password
=
"Zhiwei289"
;
// 创建 consumer,applicationConfig 非必需参数
CynomysConsumer
consumer
=
CynomysConsumerFactory
.
create
(
applicationConfig
,
registryConfig
,
consumerConfig
,
username
,
password
);
CynomysFactory
.
init
(
consumer
);
}
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment