Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
39b30f08
Commit
39b30f08
authored
Mar 25, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
无效链接传入处理
parent
554dd201
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
18 additions
and
20 deletions
+18
-20
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+18
-20
No files found.
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
39b30f08
...
@@ -22,6 +22,8 @@ import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
...
@@ -22,6 +22,8 @@ import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
okhttp3.Request
;
/**
/**
*
*
* @ClassName UrlLiveCrawler
* @ClassName UrlLiveCrawler
...
@@ -65,28 +67,20 @@ public class UrlLiveCrawler {
...
@@ -65,28 +67,20 @@ public class UrlLiveCrawler {
if
(
url
.
contains
(
"www.toutiao.com"
)){
if
(
url
.
contains
(
"www.toutiao.com"
)){
headers
.
put
(
"referer"
,
url
);
headers
.
put
(
"referer"
,
url
);
}
}
try
{
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
if
(
Objects
.
nonNull
(
request
))
{
counter
.
add
();
counter
.
add
();
httpBoot
.
asyncCall
(
RequestUtils
.
wrapGet
(
url
,
headers
)
,
ProxyHolder
.
NAT_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
try
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
rs
.
code
()
==
200
)
{
if
(
rs
.
code
()
==
200
)
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
,
counter
);
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
}
else
{
}
else
{
if
(
attr
.
getCount
()
>
2
)
{
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
}
else
{
attr
.
AddCount
();
search
(
counter
,
attr
.
getAttr
().
toString
(),
attr
,
callback
);
}
}
}
}
else
{
}
else
{
if
(
attr
.
getCount
()
>
3
)
{
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
callBack
(
callback
,
attr
,
-
1
,
null
);
logger
.
info
(
"搜索结果访问失败: {}"
,
ex
);
}
else
{
attr
.
AddCount
();
search
(
counter
,
attr
.
getAttr
().
toString
(),
attr
,
callback
);
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
" 数据是否删除 采集出错 {} "
,
e
);
logger
.
error
(
" 数据是否删除 采集出错 {} "
,
e
);
...
@@ -96,6 +90,11 @@ public class UrlLiveCrawler {
...
@@ -96,6 +90,11 @@ public class UrlLiveCrawler {
});
});
return
counter
;
return
counter
;
}
}
}
catch
(
Exception
e2
)
{
logger
.
error
(
"数据出错 {}"
,
e2
);
}
return
counter
;
}
private
void
callBack
(
UrlLiveDataCallback
callback
,
Attribution
attr
,
int
i
,
String
title
)
{
private
void
callBack
(
UrlLiveDataCallback
callback
,
Attribution
attr
,
int
i
,
String
title
)
{
UrlLiveBean
ulb
=
null
;
UrlLiveBean
ulb
=
null
;
...
@@ -150,7 +149,7 @@ public class UrlLiveCrawler {
...
@@ -150,7 +149,7 @@ public class UrlLiveCrawler {
* @param callback
* @param callback
*/
*/
private
void
parseHtml
(
String
html
,
Attribution
attr
,
private
void
parseHtml
(
String
html
,
Attribution
attr
,
UrlLiveDataCallback
callback
,
GroupSync
counter
)
{
UrlLiveDataCallback
callback
)
{
if
(
callback
==
null
)
{
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
}
else
{
...
@@ -158,12 +157,7 @@ public class UrlLiveCrawler {
...
@@ -158,12 +157,7 @@ public class UrlLiveCrawler {
if
(
Objects
.
nonNull
(
ulb
))
{
if
(
Objects
.
nonNull
(
ulb
))
{
callback
.
onData
(
ulb
,
attr
);
callback
.
onData
(
ulb
,
attr
);
}
else
{
}
else
{
if
(
attr
.
getCount
()
>
3
)
{
callBack
(
callback
,
attr
,
-
1
,
null
);
callBack
(
callback
,
attr
,
-
1
,
null
);
}
else
{
attr
.
AddCount
();
search
(
counter
,
attr
.
getAttr
().
toString
(),
attr
,
callback
);
}
}
}
}
}
}
}
...
@@ -177,6 +171,7 @@ public class UrlLiveCrawler {
...
@@ -177,6 +171,7 @@ public class UrlLiveCrawler {
* @return boolean 返回类型
* @return boolean 返回类型
*/
*/
public
UrlLiveBean
matchDel
(
String
result
,
Attribution
attr
,
String
url
){
public
UrlLiveBean
matchDel
(
String
result
,
Attribution
attr
,
String
url
){
try
{
Document
doc
=
Jsoup
.
parse
(
result
);
Document
doc
=
Jsoup
.
parse
(
result
);
String
title
=
null
;
String
title
=
null
;
if
(
url
.
contains
(
"mp.weixin.qq.com"
)
||
url
.
contains
(
"post.mp.qq.com"
)){
if
(
url
.
contains
(
"mp.weixin.qq.com"
)
||
url
.
contains
(
"post.mp.qq.com"
)){
...
@@ -245,6 +240,9 @@ public class UrlLiveCrawler {
...
@@ -245,6 +240,9 @@ public class UrlLiveCrawler {
}
else
{
}
else
{
return
null
;
return
null
;
}
}
}
catch
(
Exception
e
)
{
return
null
;
}
}
}
/**
/**
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment