Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
9fcfba2d
Commit
9fcfba2d
authored
Aug 13, 2020
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
各个采集验证添加休眠,避免数据过多导致程序阻塞
parent
aa059934
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
253 additions
and
248 deletions
+253
-248
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+121
-119
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+2
-0
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+130
-128
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+0
-0
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+0
-1
No files found.
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
View file @
9fcfba2d
package
com
.
zhiwei
.
source_forward
.
crawler
;
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.Objects
;
import
java.util.Objects
;
import
org.apache.logging.log4j.LogManager
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
okhttp3.Request
;
import
okhttp3.Request
;
public
class
ContentCrawler
{
public
class
ContentCrawler
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
ContentCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
Logger
logger
=
LogManager
.
getLogger
(
ContentCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
*
/**
* @Description 链接传入 并 返回采集完信号
*
* @param callback
* @Description 链接传入 并 返回采集完信号
* @param urls
* @param callback
* @return
* @param urls
* @throws Exception
* @return
*/
* @throws Exception
public
GroupSync
submitTask
(
ContentDataCallback
callback
,
*/
String
...
urls
)
{
public
GroupSync
submitTask
(
ContentDataCallback
callback
,
GroupSync
counter
=
new
GroupSync
();
String
...
urls
)
{
start
(
counter
,
callback
,
urls
);
GroupSync
counter
=
new
GroupSync
();
return
counter
;
start
(
counter
,
callback
,
urls
);
}
return
counter
;
}
/**
*
/**
* @Description 提交链接
*
* @param counter
* @Description 提交链接
* @param callback
* @param counter
* @param urls
* @param callback
*/
* @param urls
private
void
start
(
GroupSync
counter
,
*/
ContentDataCallback
callback
,
String
...
urls
)
{
private
void
start
(
GroupSync
counter
,
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
ContentDataCallback
callback
,
String
...
urls
)
{
for
(
String
url
:
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
if
(
url
!=
null
)
{
for
(
String
url
:
urls
)
{
try
{
ZhiWeiTools
.
sleep
(
100
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
if
(
url
!=
null
)
{
}
catch
(
Exception
e
)
{
try
{
logger
.
error
(
"搜索创建出错"
,
e
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"搜索创建出错"
,
e
);
}
}
}
}
}
}
}
/**
}
*
* @Description 链接获取文章信息
/**
* @param counter
*
* @param url
* @Description 链接获取文章信息
* @param attr
* @param counter
* @param callback
* @param url
* @return
* @param attr
*/
* @param callback
private
GroupSync
search
(
GroupSync
counter
,
* @return
String
url
,
Attribution
attr
,
ContentDataCallback
callback
)
{
*/
logger
.
info
(
"当前处理 URL: {}"
,
url
);
private
GroupSync
search
(
GroupSync
counter
,
Request
request
=
RequestUtils
.
wrapGet
(
url
);
String
url
,
Attribution
attr
,
ContentDataCallback
callback
)
{
counter
.
add
();
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
counter
.
add
();
try
{
if
(
Objects
.
isNull
(
ex
))
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
try
{
}
else
{
if
(
Objects
.
isNull
(
ex
))
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
}
}
else
{
}
catch
(
Exception
e
)
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
logger
.
info
(
"搜索结果访问失败: {}"
,
ex
);
}
}
finally
{
}
catch
(
Exception
e
)
{
counter
.
done
();
logger
.
info
(
"搜索结果访问失败: {}"
,
ex
);
}
}
finally
{
counter
.
done
();
});
}
return
counter
;
});
}
return
counter
;
/**
}
*
*
/**
* @Description 获取正文解析
*
* @param response
*
* @param attr
* @Description 获取正文解析
* @param callback
* @param response
*/
* @param attr
private
void
parseHtml
(
String
result
,
Attribution
attr
,
* @param callback
ContentDataCallback
callback
)
{
*/
try
{
private
void
parseHtml
(
String
result
,
Attribution
attr
,
String
content
=
MatchContent
.
matchContent
(
attr
.
get
().
toString
(),
ContentDataCallback
callback
)
{
result
);
try
{
ContentBean
cb
=
new
ContentBean
(
attr
.
get
().
toString
(),
content
);
String
content
=
MatchContent
.
matchContent
(
attr
.
get
().
toString
(),
if
(
callback
==
null
)
{
result
);
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
ContentBean
cb
=
new
ContentBean
(
attr
.
get
().
toString
(),
content
);
}
else
{
if
(
callback
==
null
)
{
callback
.
onData
(
cb
,
attr
);
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
}
else
{
}
catch
(
Exception
e
)
{
callback
.
onData
(
cb
,
attr
);
logger
.
error
(
"网页链接失效"
,
e
);
}
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"网页链接失效"
,
e
);
}
}
}
}
}
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
9fcfba2d
...
@@ -5,6 +5,7 @@ import java.util.List;
...
@@ -5,6 +5,7 @@ import java.util.List;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.Objects
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
...
@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler {
...
@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler {
private
void
start
(
GroupSync
counter
,
MediaSelfSourceDataCallBack
callback
,
String
...
urls
)
{
private
void
start
(
GroupSync
counter
,
MediaSelfSourceDataCallBack
callback
,
String
...
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
for
(
String
url
:
urls
)
{
ZhiWeiTools
.
sleep
(
100
);
counter
.
add
();
counter
.
add
();
if
(
url
!=
null
)
{
if
(
url
!=
null
)
{
try
{
try
{
...
...
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
View file @
9fcfba2d
This diff is collapsed.
Click to expand it.
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
9fcfba2d
This diff is collapsed.
Click to expand it.
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
9fcfba2d
...
@@ -325,7 +325,6 @@ public class MatchSource {
...
@@ -325,7 +325,6 @@ public class MatchSource {
source
=
source
.
replaceAll
(
".*来源:|)"
,
""
);
source
=
source
.
replaceAll
(
".*来源:|)"
,
""
);
}
}
}
}
if
(
Objects
.
nonNull
(
source
)
&&
source
.
length
()
!=
0
)
{
if
(
Objects
.
nonNull
(
source
)
&&
source
.
length
()
!=
0
)
{
return
source
;
return
source
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment