Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
4860f41e
Commit
4860f41e
authored
Mar 22, 2022
by
maojirui
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
懂车帝判断情况修改
parent
d705de1f
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
135 additions
and
142 deletions
+135
-142
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+135
-142
No files found.
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
4860f41e
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONPath
;
import
com.zhiwei.async.GroupSync
;
...
...
@@ -25,29 +10,37 @@ import com.zhiwei.source_forward.bean.UrlLiveBean;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
java.util.*
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
static
java
.
util
.
Objects
.
nonNull
;
/**
*
* @author byte-zbs
* @version 1.0.0
* @ClassName UrlLiveCrawler
* @Description 判断页面是否存在
* @author byte-zbs
* @Date 2018年8月20日 下午3:34:57
* @version 1.0.0
*/
public
class
UrlLiveCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
throwException
(
true
).
build
();
public
GroupSync
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
public
GroupSync
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
GroupSync
counter
=
new
GroupSync
();
start
(
counter
,
callback
,
urls
);
return
counter
;
}
private
void
start
(
GroupSync
counter
,
UrlLiveDataCallback
callback
,
String
...
urls
)
{
private
void
start
(
GroupSync
counter
,
UrlLiveDataCallback
callback
,
String
...
urls
)
{
if
(
nonNull
(
urls
)
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
counter
.
add
();
...
...
@@ -64,33 +57,33 @@ public class UrlLiveCrawler {
}
}
}
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
// System.out.println(url);
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
url
=
dealUrl
(
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
ProxyHolder
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
if
(
url
.
contains
(
"toutiao.com"
)){
if
(
url
.
contains
(
"toutiao.com"
))
{
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
url
=
treatZhihuUrl
(
url
);
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
url
=
treatZhihuUrl
(
url
);
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
}
try
{
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
if
(
Objects
.
nonNull
(
request
))
{
if
(
Objects
.
nonNull
(
request
))
{
counter
.
add
();
// , new Proxy(Type.HTTP, new InetSocketAddress("119.3.38.9", 31128))
httpBoot
.
asyncCall
(
request
,
ph
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
ph
).
whenComplete
((
rs
,
ex
)
->
{
try
{
System
.
out
.
println
(
rs
.
code
());
if
(
Objects
.
isNull
(
ex
))
{
if
(
rs
.
isSuccessful
())
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
}
else
if
(
rs
.
code
()
==
404
)
{
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
}
else
{
if
(
rs
.
isSuccessful
())
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
}
else
if
(
rs
.
code
()
==
404
)
{
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
}
else
{
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
}
else
{
...
...
@@ -98,24 +91,24 @@ public class UrlLiveCrawler {
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
" 数据是否删除 采集出错 {} "
,
e
);
}
finally
{
logger
.
error
(
" 数据是否删除 采集出错 {} "
,
e
);
}
finally
{
counter
.
done
();
}
});
return
counter
;
}
}
catch
(
Exception
e2
)
{
logger
.
error
(
"数据出错 {}"
,
e2
);
logger
.
error
(
"数据出错 {}"
,
e2
);
}
return
counter
;
}
private
void
callBack
(
UrlLiveDataCallback
callback
,
Attribution
attr
,
int
i
,
String
title
)
{
private
void
callBack
(
UrlLiveDataCallback
callback
,
Attribution
attr
,
int
i
,
String
title
)
{
UrlLiveBean
ulb
=
null
;
if
(
i
==
1
)
{
if
(
i
==
1
)
{
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
true
,
title
);
}
else
{
}
else
{
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
i
,
title
);
}
if
(
callback
==
null
)
{
...
...
@@ -124,20 +117,20 @@ public class UrlLiveCrawler {
callback
.
onData
(
ulb
,
attr
);
}
}
private
String
dealUrl
(
String
url
)
{
try
{
if
(
url
.
contains
(
"toutiao.com"
))
{
if
(
url
.
contains
(
"toutiao.com"
))
{
return
dealToutiaoUrl
(
url
);
}
else
if
(
url
.
contains
(
"mp.weixin.qq.com"
))
{
if
(
url
.
contains
(
"https"
))
{
}
else
{
}
else
if
(
url
.
contains
(
"mp.weixin.qq.com"
))
{
if
(
url
.
contains
(
"https"
))
{
}
else
{
url
=
url
.
replace
(
"http"
,
"https"
);
}
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
)
&&
url
.
contains
(
"wm_aid"
))
{
url
=
"http://ff.dayu.com/contents/origin/"
+
url
.
split
(
"wm_aid="
)[
1
].
split
(
"!!wm_id"
)[
0
]+
"?biz_id=1002&_fetch_author=1&_fetch_incrs=1"
;
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
)
&&
url
.
contains
(
"infoid="
))
{
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
)
&&
url
.
contains
(
"wm_aid"
))
{
url
=
"http://ff.dayu.com/contents/origin/"
+
url
.
split
(
"wm_aid="
)[
1
].
split
(
"!!wm_id"
)[
0
]
+
"?biz_id=1002&_fetch_author=1&_fetch_incrs=1"
;
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
)
&&
url
.
contains
(
"infoid="
))
{
// https://tznew.58.com/view/c/sharingDetailNew?infoid=117073473
url
=
"https://tznew.58.com/tznew/c/info-detail?infoid="
+
url
.
split
(
"infoid="
)[
1
].
split
(
"&"
)[
0
];
}
...
...
@@ -146,9 +139,9 @@ public class UrlLiveCrawler {
return
url
;
}
}
private
static
Pattern
pa
=
Pattern
.
compile
(
"\\d+"
);
private
String
dealToutiaoUrl
(
String
url
)
{
try
{
String
data
=
url
.
split
(
"\\?"
)[
0
];
...
...
@@ -162,146 +155,147 @@ public class UrlLiveCrawler {
}
return
url
;
}
/**
*
* @Description 判断是否删除
* @param html
* @param attr
* @param callback
* @Description 判断是否删除
*/
private
void
parseHtml
(
String
html
,
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
UrlLiveDataCallback
callback
)
{
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
UrlLiveBean
ulb
=
matchDel
(
html
,
attr
,
attr
.
getAttr
().
toString
());
if
(
Objects
.
nonNull
(
ulb
))
{
if
(
Objects
.
nonNull
(
ulb
))
{
callback
.
onData
(
ulb
,
attr
);
}
else
{
}
else
{
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
}
}
/***
* @Title: matchDel
* @author hero
* @Title: matchDel
* @author hero
* @Description: 验证链接是否有效
* @param @param page
* @param @return 设定文件
* @return boolean 返回类型
*/
public
UrlLiveBean
matchDel
(
String
result
,
Attribution
attr
,
String
url
)
{
public
UrlLiveBean
matchDel
(
String
result
,
Attribution
attr
,
String
url
)
{
try
{
Document
doc
=
Jsoup
.
parse
(
result
);
String
title
=
null
;
if
(
url
.
contains
(
"mp.weixin.qq.com"
)
||
url
.
contains
(
"post.mp.qq.com"
)
||
url
.
contains
(
"weixin.sogou.com"
))
{
title
=
doc
.
select
(
"h2.rich_media_title"
).
text
().
replaceAll
(
" "
,
""
);
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"p.title"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
if
(
url
.
contains
(
"mp.weixin.qq.com"
)
||
url
.
contains
(
"post.mp.qq.com"
)
||
url
.
contains
(
"weixin.sogou.com"
))
{
title
=
doc
.
select
(
"h2.rich_media_title"
).
text
().
replaceAll
(
" "
,
""
);
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"p.title"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"h3.msg-title"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"div.global_error_msg.warn"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"div.warn"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"p.tips"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"h2"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"div.weui-msg__text-area > h3"
).
text
();
}
// 获取title
// 获取title
Matcher
ma5
=
Pattern
.
compile
(
"var msg_title = \'(.*)\'"
)
.
matcher
(
result
);
if
(
ma5
.
find
())
{
title
=
ma5
.
group
(
1
).
replaceAll
(
" "
,
" "
).
trim
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
if
(
result
.
contains
(
"此帐号已被屏蔽, 内容无法查看"
)
||
result
.
contains
(
"该公众号已迁移"
)
||
result
.
contains
(
"此帐号已自主注销,内容无法查看"
)
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
if
(
result
.
contains
(
"此帐号已被屏蔽, 内容无法查看"
)
||
result
.
contains
(
"该公众号已迁移"
)
||
result
.
contains
(
"此帐号已自主注销,内容无法查看"
)
||
result
.
contains
(
"此帐号处于帐号迁移流程中"
)
||
result
.
contains
(
"该内容已被发布者删除"
)
||
result
.
contains
(
"此内容被投诉且经审核涉嫌侵权"
))
{
title
=
"网页已删除"
;
}
}
}
else
if
(
url
.
contains
(
"kuaibao"
))
{
}
else
if
(
url
.
contains
(
"kuaibao"
))
{
title
=
doc
.
select
(
"p.title"
).
text
().
replaceAll
(
" "
,
""
);
}
else
if
(
url
.
contains
(
"chinadaily.com.cn"
))
{
}
else
if
(
url
.
contains
(
"chinadaily.com.cn"
))
{
title
=
doc
.
select
(
"p.style1"
).
text
().
replaceAll
(
" "
,
""
);
}
else
if
(
url
.
contains
(
"baidu.com"
)
||
url
.
contains
(
"hao123.com"
))
{
}
else
if
(
url
.
contains
(
"baidu.com"
)
||
url
.
contains
(
"hao123.com"
))
{
title
=
doc
.
select
(
"p#contaniner"
).
text
();
}
else
if
(
url
.
contains
(
"kanfanews.com"
))
{
}
else
if
(
url
.
contains
(
"kanfanews.com"
))
{
title
=
doc
.
select
(
"p#tit"
).
text
();
}
else
if
(
url
.
contains
(
"ifeng.com"
)
&&
result
.
contains
(
"url=http://www.ifeng.com/"
))
{
}
else
if
(
url
.
contains
(
"ifeng.com"
)
&&
result
.
contains
(
"url=http://www.ifeng.com/"
))
{
title
=
"网页已删除"
;
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
))
{
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
))
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
title
=
json
.
getJSONObject
(
"data"
).
getString
(
"title"
);
}
catch
(
Exception
e
)
{
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
}
}
else
if
(
url
.
contains
(
"huanqiu.com"
)
&&
result
.
contains
(
"www.huanqiu.com/404.html"
))
{
}
else
if
(
url
.
contains
(
"huanqiu.com"
)
&&
result
.
contains
(
"www.huanqiu.com/404.html"
))
{
title
=
"网页已删除"
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
JSONObject
resultJson
=
JSONObject
.
parseObject
(
result
);
if
(
url
.
contains
(
"/answer/"
))
{
title
=
resultJson
.
getJSONObject
(
"question"
).
getString
(
"title"
);
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
)
||
url
.
contains
(
"/p/"
))
{
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
JSONObject
resultJson
=
JSONObject
.
parseObject
(
result
);
if
(
url
.
contains
(
"/answer/"
))
{
title
=
resultJson
.
getJSONObject
(
"question"
).
getString
(
"title"
);
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
)
||
url
.
contains
(
"/p/"
))
{
title
=
resultJson
.
getString
(
"title"
);
}
}
else
if
(
url
.
contains
(
"360kuai.com"
)
&&
result
.
contains
(
"您访问的文章走失了"
))
{
}
else
if
(
url
.
contains
(
"360kuai.com"
)
&&
result
.
contains
(
"您访问的文章走失了"
))
{
title
=
String
.
valueOf
(
"404"
);
}
else
if
(
result
.
contains
(
"文章没有找到哦"
)
&&
url
.
contains
(
"yidianzixun.com"
))
{
}
else
if
(
result
.
contains
(
"文章没有找到哦"
)
&&
url
.
contains
(
"yidianzixun.com"
))
{
title
=
"文章未找到"
;
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
))
{
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
))
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
title
=
json
.
getJSONObject
(
"result"
).
getString
(
"title"
);
}
catch
(
Exception
e
)
{
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
}
}
else
if
(
attr
.
getAttr
().
toString
().
contains
(
"toutiao.com"
))
{
if
(
result
.
contains
(
"\"success\":false"
))
{
}
else
if
(
attr
.
getAttr
().
toString
().
contains
(
"toutiao.com"
))
{
if
(
result
.
contains
(
"\"success\":false"
))
{
title
=
"网页已删除"
;
}
else
{
}
else
{
title
=
String
.
valueOf
(
JSONPath
.
read
(
result
,
"$..title"
));
}
}
else
if
(
url
.
contains
(
"page.om.qq.com"
))
{
if
(
result
.
contains
(
"内容被删除"
))
{
}
else
if
(
url
.
contains
(
"page.om.qq.com"
))
{
if
(
result
.
contains
(
"内容被删除"
))
{
title
=
"网页已删除"
;
}
}
else
if
(
url
.
contains
(
"m.dcdapp.com"
)
&&
!
result
.
contains
(
"title"
))
{
title
=
"网页已删除"
;
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"div.adiv > p > span"
).
text
().
replaceAll
(
" "
,
""
);
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"title"
).
text
().
replaceAll
(
" "
,
""
);
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"h1"
).
text
().
replaceAll
(
" "
,
""
);
}
//若title 为拿到 用 此方法 无法获取标题不进行程序迷惑性判断
// if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
// title = "网页已删除";
// }
if
(
Objects
.
nonNull
(
title
)
&&
title
.
length
()
>
1
)
{
return
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
isDelete
(
title
),
title
);
if
(
Objects
.
nonNull
(
title
)
&&
title
.
length
()
>
1
)
{
return
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
isDelete
(
title
),
title
);
}
else
{
return
null
;
}
...
...
@@ -309,48 +303,46 @@ public class UrlLiveCrawler {
return
null
;
}
}
/**
*
* @Description 标题判断
* @param title
* @return
* @Description 标题判断
*/
private
boolean
isDelete
(
String
title
)
{
List
<
String
>
eList
=
Arrays
.
asList
(
"系统出错"
,
"该内容已被发布者删除"
,
"网页已删除"
,
"此帐号已自主注销,内容无法查看"
,
"页面提示"
,
"正在维护中"
,
"此文章被第三方评估为不实信息"
,
"财经头条"
,
"知识100题"
,
"502BadGateway"
,
"提示信息"
,
"跳转页"
,
"跳转中..."
,
"此帐号在冻结期,内容无法查看"
,
"东北新闻网"
,
"百度一下,你就知道"
,
"帐号已迁移"
,
"手机百度"
,
"内容被删除"
,
"亚博国际|首页"
,
"中国软件网"
,
"云广网"
,
"新浪首页"
,
"文章暂时找不到了"
,
"-法易网"
,
"【一点资讯】www.yidianzixun.com"
,
"错误页面"
,
"网站暂停通知"
,
"【快资讯】你的专属资讯平台"
,
"百度新闻——全球最大的中文新闻平台"
,
"以上文章由以下机构判定为不实信息"
,
"该公众号已迁移"
,
"财经网-CAIJING.COM.CN"
,
"蚂蚁资讯"
,
"参数错误"
,
"时尚头条_YOKA时尚网"
,
"该文章已经被删除"
,
"网易"
,
"链接已过期"
,
"找不到页面"
,
"今晚网"
,
"该文章已被删除"
,
"该回答已被删除-知乎"
,
"资源不存在"
,
"文章未找到"
List
<
String
>
eList
=
Arrays
.
asList
(
"系统出错"
,
"该内容已被发布者删除"
,
"网页已删除"
,
"此帐号已自主注销,内容无法查看"
,
"页面提示"
,
"正在维护中"
,
"此文章被第三方评估为不实信息"
,
"财经头条"
,
"知识100题"
,
"502BadGateway"
,
"提示信息"
,
"跳转页"
,
"跳转中..."
,
"此帐号在冻结期,内容无法查看"
,
"东北新闻网"
,
"百度一下,你就知道"
,
"帐号已迁移"
,
"手机百度"
,
"内容被删除"
,
"亚博国际|首页"
,
"中国软件网"
,
"云广网"
,
"新浪首页"
,
"文章暂时找不到了"
,
"-法易网"
,
"【一点资讯】www.yidianzixun.com"
,
"错误页面"
,
"网站暂停通知"
,
"【快资讯】你的专属资讯平台"
,
"百度新闻——全球最大的中文新闻平台"
,
"以上文章由以下机构判定为不实信息"
,
"该公众号已迁移"
,
"财经网-CAIJING.COM.CN"
,
"蚂蚁资讯"
,
"参数错误"
,
"时尚头条_YOKA时尚网"
,
"该文章已经被删除"
,
"网易"
,
"链接已过期"
,
"找不到页面"
,
"今晚网"
,
"该文章已被删除"
,
"该回答已被删除-知乎"
,
"资源不存在"
,
"文章未找到"
,
"UC头条"
,
"该内容暂无法显示"
,
"手机搜狐网"
,
"此内容被投诉且经审核涉嫌侵权,无法查看。"
);
List
<
String
>
cList
=
Arrays
.
asList
(
"提示信息-"
,
"此内容因违规无法查看"
,
"微信公众号不存在"
,
"此内容被投诉且经审核涉嫌侵权,无法查看"
,
"thepageyourequestedwasnotfound"
,
"未知错误"
,
"Objectmoved"
,
"404"
,
"页面没有找到"
,
"页面未找到"
,
"301MovedPermanently"
,
"加载异常"
,
"此帐号已被屏蔽, 内容无法查看"
,
"链接不存在"
,
"新闻已删除"
,
"视频去哪了呢"
);
List
<
String
>
cList
=
Arrays
.
asList
(
"提示信息-"
,
"此内容因违规无法查看"
,
"微信公众号不存在"
,
"此内容被投诉且经审核涉嫌侵权,无法查看"
,
"thepageyourequestedwasnotfound"
,
"未知错误"
,
"Objectmoved"
,
"404"
,
"页面没有找到"
,
"页面未找到"
,
"301MovedPermanently"
,
"加载异常"
,
"此帐号已被屏蔽, 内容无法查看"
,
"链接不存在"
,
"新闻已删除"
,
"视频去哪了呢"
);
return
cList
.
stream
().
anyMatch
(
title:
:
contains
)
||
eList
.
stream
().
anyMatch
(
title:
:
equals
);
}
/**
* 处理知乎链接
*
* */
*/
private
static
String
treatZhihuUrl
(
String
url
)
{
if
(
url
.
contains
(
"/answer/"
))
{
url
=
"https://api.zhihu.com/answers/"
+
url
.
replaceAll
(
".*/answer/"
,
""
);
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
))
{
url
=
"https://api.zhihu.com/questions/"
+
url
.
replaceAll
(
".*/question/"
,
""
);
}
else
if
(
url
.
contains
(
"/p/"
))
{
url
=
"https://api.zhihu.com/articles/"
+
url
.
replaceAll
(
".*/p/"
,
""
);
}
return
url
;
if
(
url
.
contains
(
"/answer/"
))
{
url
=
"https://api.zhihu.com/answers/"
+
url
.
replaceAll
(
".*/answer/"
,
""
);
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
))
{
url
=
"https://api.zhihu.com/questions/"
+
url
.
replaceAll
(
".*/question/"
,
""
);
}
else
if
(
url
.
contains
(
"/p/"
))
{
url
=
"https://api.zhihu.com/articles/"
+
url
.
replaceAll
(
".*/p/"
,
""
);
}
return
url
;
}
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment