Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
a71c606b
Commit
a71c606b
authored
Jan 10, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
链接是否删除 修改
parent
37ac4e23
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
119 additions
and
42 deletions
+119
-42
src/main/java/com/zhiwei/source_forward/bean/UrlLiveBean.java
+21
-2
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+87
-37
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+11
-3
No files found.
src/main/java/com/zhiwei/source_forward/bean/UrlLiveBean.java
View file @
a71c606b
...
...
@@ -6,6 +6,8 @@ public class UrlLiveBean {
private
Integer
isLive
;
private
String
title
;
public
UrlLiveBean
()
{
super
();
}
...
...
@@ -16,9 +18,17 @@ public class UrlLiveBean {
this
.
isLive
=
isLive
;
}
public
UrlLiveBean
(
String
url
,
boolean
isLive
)
{
public
UrlLiveBean
(
String
url
,
Integer
isLive
,
String
title
)
{
super
();
this
.
url
=
url
;
this
.
isLive
=
isLive
;
this
.
title
=
title
;
}
public
UrlLiveBean
(
String
url
,
boolean
isLive
,
String
title
)
{
super
();
this
.
url
=
url
;
this
.
title
=
title
;
if
(
isLive
)
{
this
.
isLive
=
1
;
//已删除
}
else
{
...
...
@@ -26,6 +36,14 @@ public class UrlLiveBean {
}
}
public
String
getTitle
()
{
return
title
;
}
public
void
setTitle
(
String
title
)
{
this
.
title
=
title
;
}
public
String
getUrl
()
{
return
url
;
}
...
...
@@ -44,7 +62,8 @@ public class UrlLiveBean {
@Override
public
String
toString
()
{
return
"UrlLiveBean [url="
+
url
+
", isLive="
+
isLive
+
"]"
;
return
"UrlLiveBean [url="
+
url
+
", isLive="
+
isLive
+
", title="
+
title
+
"]"
;
}
/**
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
a71c606b
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.concurrent.TimeUnit
;
import
org.apache.logging.log4j.LogManager
;
...
...
@@ -34,7 +35,7 @@ public class UrlLiveCrawler {
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
public
MultiThreadingCounter
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
throws
Exception
{
MultiThreadingCounter
counter
=
new
MultiThreadingCounter
(
2
0
,
TimeUnit
.
MINUTES
,
false
);
MultiThreadingCounter
counter
=
new
MultiThreadingCounter
(
1
0
,
TimeUnit
.
MINUTES
,
false
);
start
(
counter
,
callback
,
urls
);
return
counter
;
}
...
...
@@ -57,21 +58,29 @@ public class UrlLiveCrawler {
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
url
=
dealUrl
(
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
// Map<String,String> headers = new HashMap<>();
Map
<
String
,
String
>
headers
=
HeaderTool
.
getCommonHead
();
if
(
url
.
contains
(
"www.toutiao.com"
)){
headers
.
put
(
"referer"
,
url
);
}
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
counter
.
increase
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
,
false
).
addListener
(
future
->
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
).
addListener
(
future
->
{
try
{
if
(
future
.
isSuccess
())
{
Response
response
=
future
.
result
();
try
{
if
(
response
.
code
()
==
200
)
{
parseHtml
(
response
.
body
().
string
(),
attr
,
callback
);
parseHtml
(
response
.
body
().
string
(),
attr
,
callback
,
counter
);
}
else
if
(
response
.
code
()
==
404
){
if
(
attr
.
getCount
()
>
2
)
{
callBack
(
callback
,
attr
,
1
,
"404"
);
}
else
{
callBack
(
callback
,
attr
,
1
);
attr
.
AddCount
();
search
(
counter
,
attr
.
getAttr
().
toString
(),
attr
,
callback
);
}
}
else
{
callBack
(
callback
,
attr
,
-
2
,
String
.
valueOf
(
response
.
code
()));
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析出错"
,
e
);
...
...
@@ -82,10 +91,10 @@ public class UrlLiveCrawler {
}
}
else
{
if
(
future
.
cause
().
getMessage
().
contains
(
"status code: "
))
{
callBack
(
callback
,
attr
,
1
);
callBack
(
callback
,
attr
,
1
,
null
);
}
else
{
if
(
attr
.
getCount
()
>
3
)
{
callBack
(
callback
,
attr
,
-
1
);
callBack
(
callback
,
attr
,
-
1
,
null
);
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
future
.
cause
().
getMessage
());
}
else
{
attr
.
AddCount
();
...
...
@@ -102,8 +111,13 @@ public class UrlLiveCrawler {
return
counter
;
}
private
void
callBack
(
UrlLiveDataCallback
callback
,
Attribution
attr
,
int
i
)
{
UrlLiveBean
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
i
);
private
void
callBack
(
UrlLiveDataCallback
callback
,
Attribution
attr
,
int
i
,
String
title
)
{
UrlLiveBean
ulb
=
null
;
if
(
i
==
1
)
{
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
true
,
title
);
}
else
{
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
i
);
}
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
...
...
@@ -149,19 +163,21 @@ public class UrlLiveCrawler {
* @param callback
*/
private
void
parseHtml
(
String
html
,
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
/***验证网页是否能够连通*/
boolean
f
=
true
;
try
{
f
=
matchDel
(
html
,
attr
.
getAttr
().
toString
());
}
catch
(
Exception
e
)
{
logger
.
error
(
"数据判断出错 "
,
e
);
}
UrlLiveBean
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
f
);
UrlLiveDataCallback
callback
,
MultiThreadingCounter
counter
)
{
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
UrlLiveBean
ulb
=
matchDel
(
html
,
attr
,
attr
.
getAttr
().
toString
());
if
(
Objects
.
nonNull
(
ulb
))
{
callback
.
onData
(
ulb
,
attr
);
}
else
{
if
(
attr
.
getCount
()
>
3
)
{
callBack
(
callback
,
attr
,
-
1
,
null
);
}
else
{
attr
.
AddCount
();
search
(
counter
,
attr
.
getAttr
().
toString
(),
attr
,
callback
);
}
}
}
}
...
...
@@ -173,22 +189,65 @@ public class UrlLiveCrawler {
* @param @return 设定文件
* @return boolean 返回类型
*/
public
boolean
matchDel
(
String
result
,
String
url
){
public
UrlLiveBean
matchDel
(
String
result
,
Attribution
attr
,
String
url
){
Document
doc
=
Jsoup
.
parse
(
result
);
String
title
=
null
;
boolean
f
=
false
;
if
(
url
.
contains
(
"mp.weixin.qq.com"
)
||
url
.
contains
(
"post.mp.qq.com"
)){
title
=
doc
.
select
(
"h2.rich_media_title"
).
text
().
replaceAll
(
" "
,
""
);
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"p.title"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"h3.msg-title"
).
text
();
}
}
else
if
(
url
.
contains
(
"kuaibao"
)){
title
=
doc
.
select
(
"p.title"
).
text
().
replaceAll
(
" "
,
""
);;
}
else
{
title
=
doc
.
select
(
"title"
).
text
().
replaceAll
(
" "
,
""
);;
title
=
doc
.
select
(
"p.title"
).
text
().
replaceAll
(
" "
,
""
);
}
else
if
(
url
.
contains
(
"chinadaily.com.cn"
)){
title
=
doc
.
select
(
"p.style1"
).
text
().
replaceAll
(
" "
,
""
);
}
else
if
(
url
.
contains
(
"baidu.com"
)
||
url
.
contains
(
"hao123.com"
))
{
title
=
doc
.
select
(
"p#contaniner"
).
text
();
}
else
if
(
url
.
contains
(
"kanfanews.com"
))
{
title
=
doc
.
select
(
"p#tit"
).
text
();
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"div.adiv > p > span"
).
text
().
replaceAll
(
" "
,
""
);
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"title"
).
text
().
replaceAll
(
" "
,
""
);
}
if
(
title
!=
null
&&
!
title
.
equals
(
""
)){
if
(
title
.
contains
(
"未知错误"
)
||
title
.
contains
(
"Object moved"
)
||
title
.
contains
(
"404"
)
||
title
.
contains
(
"页面没有找到"
)
||
title
.
contains
(
"页面未找到"
)
||
title
.
contains
(
"301 Moved Permanently"
)){
return
true
;
if
(
title
!=
null
&&
title
.
length
()
>
1
){
if
(
Objects
.
equals
(
"网页已删除"
,
title
)
||
Objects
.
equals
(
"页面提示"
,
title
)
||
title
.
contains
(
"正在维护中"
)
||
Objects
.
equals
(
"此文章被第三方评估为不实信息"
,
title
)
||
title
.
contains
(
"提示信息-"
)
||
Objects
.
equals
(
"财经头条"
,
title
)
||
Objects
.
equals
(
"知识100题"
,
title
)
||
Objects
.
equals
(
"502BadGateway"
,
title
)
||
Objects
.
equals
(
"提示信息"
,
title
)
||
Objects
.
equals
(
"跳转页"
,
title
)
||
Objects
.
equals
(
"跳转中..."
,
title
)
||
Objects
.
equals
(
"此帐号在冻结期,内容无法查看"
,
title
)
||
Objects
.
equals
(
"东北新闻网"
,
title
)
||
Objects
.
equals
(
"百度一下,你就知道"
,
title
)
||
Objects
.
equals
(
"帐号已迁移"
,
title
)
||
Objects
.
equals
(
"手机百度"
,
title
)
||
Objects
.
equals
(
"内容被删除"
,
title
)
||
Objects
.
equals
(
"亚博国际|首页"
,
title
)
||
Objects
.
equals
(
"中国软件网"
,
title
)
||
Objects
.
equals
(
"云广网"
,
title
)
||
Objects
.
equals
(
"新浪首页"
,
title
)
||
Objects
.
equals
(
"文章暂时找不到了"
,
title
)
||
title
.
contains
(
"此内容因违规无法查看"
)
||
title
.
contains
(
"微信公众号不存在"
)
||
title
.
contains
(
"此内容被投诉且经审核涉嫌侵权,无法查看"
)
||
Objects
.
equals
(
"-法易网"
,
title
)
||
Objects
.
equals
(
"【一点资讯】www.yidianzixun.com"
,
title
)
||
title
.
contains
(
"您访问的链接不存在"
)
||
Objects
.
equals
(
"文章暂时不能查看"
,
title
)
||
Objects
.
equals
(
"错误页面"
,
title
)
||
title
.
contains
(
"thepageyourequestedwasnotfound"
)
||
Objects
.
equals
(
"此帐号已被屏蔽, 内容无法查看"
,
title
)
||
Objects
.
equals
(
"网站暂停通知"
,
title
)
||
title
.
contains
(
"未知错误"
)
||
title
.
contains
(
"Object moved"
)
||
title
.
contains
(
"404"
)
||
title
.
contains
(
"页面没有找到"
)
||
title
.
contains
(
"页面未找到"
)
||
title
.
contains
(
"301MovedPermanently"
)){
f
=
true
;
}
}
else
{
return
null
;
}
return
false
;
return
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
f
,
title
)
;
}
// /**
...
...
@@ -200,8 +259,7 @@ public class UrlLiveCrawler {
// * @time 2016年6月3日上午9:54:00
// * @return boolean
// */
// private boolean rulerYaoyan(Document doc)
// {
// private boolean rulerYaoyan(Document doc){
// boolean flg = false;
// if ("谣言".equals(doc.select(".pic_rumor").text()))
// {
...
...
@@ -219,24 +277,16 @@ public class UrlLiveCrawler {
// * @time 2016年6月3日上午9:59:54
// * @return boolean
// */
// private boolean rulerWe
igui
(Document doc)
// private boolean rulerWe
chat
(Document doc)
// {
// boolean flg = false;
// if ((doc.select("p.title").text()).contains("此内容因违规无法查看") || doc.select("p.title").text().contains("此帐号在冻结期,内容无法查看"))
// if ((doc.select("
h3.msg-title").text()).contains("此内容被投诉且经审核涉嫌侵权,无法查看") || (doc.select("
p.title").text()).contains("此内容因违规无法查看") || doc.select("p.title").text().contains("此帐号在冻结期,内容无法查看"))
// {
// flg = true;
// }
// return flg;
// }
//
// private boolean rulerWechatWeigui(Document doc) {
// boolean flg = false;
// if ((doc.select("h3.msg-title").text()).contains("此内容被投诉且经审核涉嫌侵权,无法查看"))
// {
// flg = true;
// }
// return flg;
// }
//
// /**
// *
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
a71c606b
package
com
.
zhiwei
.
source_forward
.
run
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.crawler.UrlLiveCrawler
;
...
...
@@ -46,6 +51,7 @@ public class URLLive {
}
else
if
(
i
==
0
)
{
map
.
put
(
"是否删除"
,
false
);
}
map
.
put
(
"title"
,
ub
.
getTitle
());
dataMap
.
put
(
url
,
map
);
}
}
...
...
@@ -60,13 +66,15 @@ public class URLLive {
*/
public
static
List
<
UrlLiveBean
>
verificationURLLive
(
List
<
String
>
urlList
){
//启动验证链接是否有效程序程序
List
<
UrlLiveBean
>
dataList
=
UrlLiveCrawlerThread
.
getUrlLiveCrawle
(
urlList
);
return
dataList
;
return
UrlLiveCrawlerThread
.
getUrlLiveCrawle
(
urlList
);
}
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://www.teso.cc/html/zixun/201606/233848.html"
);
urlList
.
add
(
"https://www.hao123.com/mid/16981890690654602094"
);
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
for
(
UrlLiveBean
b
:
u
)
{
System
.
out
.
println
(
b
.
toString
());
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment