Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
9fcfba2d
Commit
9fcfba2d
authored
Aug 13, 2020
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
各个采集验证添加休眠,避免数据过多导致程序阻塞
parent
aa059934
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
253 additions
and
248 deletions
+253
-248
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+121
-119
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+2
-0
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+130
-128
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+0
-0
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+0
-1
No files found.
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
View file @
9fcfba2d
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.Objects
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
okhttp3.Request
;
/**
 * Fetches pages for a set of URLs asynchronously, extracts their content with
 * {@link MatchContent}, and hands each result to a {@link ContentDataCallback}.
 */
public class ContentCrawler {

    private static final Logger logger = LogManager.getLogger(ContentCrawler.class);

    /** Shared HTTP client, built with a retry count of 2. */
    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();

    /**
     * Submits the given URLs for crawling and returns the tracker used for this batch.
     *
     * @param callback receiver of each extracted {@link ContentBean}; when {@code null}
     *                 the extracted data is dropped and a warning is logged
     * @param urls     URLs to crawl; a {@code null} or empty array submits nothing and
     *                 {@code null} elements are skipped
     * @return the {@link GroupSync} created for this batch
     *         (presumably lets the caller wait for completion - confirm against GroupSync)
     */
    public GroupSync submitTask(ContentDataCallback callback, String... urls) {
        GroupSync counter = new GroupSync();
        start(counter, callback, urls);
        return counter;
    }

    /**
     * Issues one request per non-null URL.
     *
     * @param counter  tracker shared by every request of the batch
     * @param callback receiver of the extracted data
     * @param urls     URLs to crawl, may be {@code null} or empty
     */
    private void start(GroupSync counter, ContentDataCallback callback, String... urls) {
        if (urls == null || urls.length == 0) {
            return;
        }
        for (String url : urls) {
            if (url == null) {
                continue;
            }
            try {
                search(counter, url, Attribution.of(url), callback);
            } catch (Exception e) {
                // A failure while creating one request must not abort the remaining URLs.
                logger.error("搜索创建出错", e);
            }
        }
    }

    /**
     * Requests {@code url} asynchronously and passes the response body to
     * {@link #parseHtml(String, Attribution, ContentDataCallback)}.
     *
     * @param counter  tracker; {@code add()} is called before the request is issued and
     *                 {@code done()} once its completion callback has run
     * @param url      address to fetch
     * @param attr     attribution built from the URL, forwarded to the callback
     * @param callback receiver of the extracted data
     * @return the same {@code counter} instance that was passed in
     */
    private GroupSync search(GroupSync counter, String url, Attribution attr,
            ContentDataCallback callback) {
        logger.info("当前处理 URL: {}", url);
        Request request = RequestUtils.wrapGet(url);
        counter.add();
        httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs, ex) -> {
            try {
                if (Objects.isNull(ex)) {
                    parseHtml(rs.body().string(), attr, callback);
                } else {
                    logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
                }
            } catch (Exception e) {
                // Log the exception actually caught here; on this path `ex` is normally
                // null, so logging it would hide the real cause.
                logger.info("搜索结果访问失败: {}", e);
            } finally {
                // Always balance the add() above, whether the request succeeded or not.
                counter.done();
            }
        });
        return counter;
    }

    /**
     * Extracts the content from a fetched page and delivers it to the callback.
     *
     * @param result   response body of the fetched page
     * @param attr     attribution of the page; its value is passed to the matcher and
     *                 stored in the resulting {@link ContentBean}
     * @param callback receiver of the bean; when {@code null} only a warning is logged
     */
    private void parseHtml(String result, Attribution attr, ContentDataCallback callback) {
        try {
            String content = MatchContent.matchContent(attr.get().toString(), result);
            ContentBean cb = new ContentBean(attr.get().toString(), content);
            if (callback == null) {
                logger.warn("DataCallback 对象为 null,无法保存数据");
            } else {
                callback.onData(cb, attr);
            }
        } catch (Exception e) {
            logger.error("网页链接失效", e);
        }
    }
}
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.Objects
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
okhttp3.Request
;
/**
 * Fetches pages for a set of URLs asynchronously, extracts their content with
 * {@link MatchContent}, and hands each result to a {@link ContentDataCallback}.
 */
public class ContentCrawler {

    private static final Logger logger = LogManager.getLogger(ContentCrawler.class);

    /** Shared HTTP client, built with a retry count of 2. */
    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();

    /**
     * Submits the given URLs for crawling and returns the tracker used for this batch.
     *
     * @param callback receiver of each extracted {@link ContentBean}; when {@code null}
     *                 the extracted data is dropped and a warning is logged
     * @param urls     URLs to crawl; a {@code null} or empty array submits nothing and
     *                 {@code null} elements are skipped
     * @return the {@link GroupSync} created for this batch
     *         (presumably lets the caller wait for completion - confirm against GroupSync)
     */
    public GroupSync submitTask(ContentDataCallback callback, String... urls) {
        GroupSync counter = new GroupSync();
        start(counter, callback, urls);
        return counter;
    }

    /**
     * Issues one request per non-null URL, pausing before each array element so that
     * requests are not all submitted at once.
     *
     * @param counter  tracker shared by every request of the batch
     * @param callback receiver of the extracted data
     * @param urls     URLs to crawl, may be {@code null} or empty
     */
    private void start(GroupSync counter, ContentDataCallback callback, String... urls) {
        if (urls == null || urls.length == 0) {
            return;
        }
        for (String url : urls) {
            // Throttle submission; the unit of 100 is defined by ZhiWeiTools.sleep
            // (presumably milliseconds - confirm).
            ZhiWeiTools.sleep(100);
            if (url == null) {
                continue;
            }
            try {
                search(counter, url, Attribution.of(url), callback);
            } catch (Exception e) {
                // A failure while creating one request must not abort the remaining URLs.
                logger.error("搜索创建出错", e);
            }
        }
    }

    /**
     * Requests {@code url} asynchronously and passes the response body to
     * {@link #parseHtml(String, Attribution, ContentDataCallback)}.
     *
     * @param counter  tracker; {@code add()} is called before the request is issued and
     *                 {@code done()} once its completion callback has run
     * @param url      address to fetch
     * @param attr     attribution built from the URL, forwarded to the callback
     * @param callback receiver of the extracted data
     * @return the same {@code counter} instance that was passed in
     */
    private GroupSync search(GroupSync counter, String url, Attribution attr,
            ContentDataCallback callback) {
        logger.info("当前处理 URL: {}", url);
        Request request = RequestUtils.wrapGet(url);
        counter.add();
        httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs, ex) -> {
            try {
                if (Objects.isNull(ex)) {
                    parseHtml(rs.body().string(), attr, callback);
                } else {
                    logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
                }
            } catch (Exception e) {
                // Log the exception actually caught here; on this path `ex` is normally
                // null, so logging it would hide the real cause.
                logger.info("搜索结果访问失败: {}", e);
            } finally {
                // Always balance the add() above, whether the request succeeded or not.
                counter.done();
            }
        });
        return counter;
    }

    /**
     * Extracts the content from a fetched page and delivers it to the callback.
     *
     * @param result   response body of the fetched page
     * @param attr     attribution of the page; its value is passed to the matcher and
     *                 stored in the resulting {@link ContentBean}
     * @param callback receiver of the bean; when {@code null} only a warning is logged
     */
    private void parseHtml(String result, Attribution attr, ContentDataCallback callback) {
        try {
            String content = MatchContent.matchContent(attr.get().toString(), result);
            ContentBean cb = new ContentBean(attr.get().toString(), content);
            if (callback == null) {
                logger.warn("DataCallback 对象为 null,无法保存数据");
            } else {
                callback.onData(cb, attr);
            }
        } catch (Exception e) {
            logger.error("网页链接失效", e);
        }
    }
}
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
9fcfba2d
...
...
@@ -5,6 +5,7 @@ import java.util.List;
import
java.util.Map
;
import
java.util.Objects
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
...
...
@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler {
private
void
start
(
GroupSync
counter
,
MediaSelfSourceDataCallBack
callback
,
String
...
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
ZhiWeiTools
.
sleep
(
100
);
counter
.
add
();
if
(
url
!=
null
)
{
try
{
...
...
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
View file @
9fcfba2d
This diff is collapsed.
Click to expand it.
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
9fcfba2d
This diff is collapsed.
Click to expand it.
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
9fcfba2d
...
...
@@ -325,7 +325,6 @@ public class MatchSource {
source
=
source
.
replaceAll
(
".*来源:|)"
,
""
);
}
}
if
(
Objects
.
nonNull
(
source
)
&&
source
.
length
()
!=
0
)
{
return
source
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment