Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
d6f4e440
Commit
d6f4e440
authored
Sep 03, 2019
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
验证是否删除添加知乎验证
parent
b7e91b0a
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
55 additions
and
216 deletions
+55
-216
pom.xml
+1
-2
src/main/java/com/zhiwei/source_forward/bean/UrlLiveBean.java
+18
-2
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+34
-210
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+2
-2
No files found.
pom.xml
View file @
d6f4e440
...
...
@@ -29,8 +29,7 @@
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.3.6-RELEASE
</version>
<scope>
provided
</scope>
<version>
0.5.2-RELEASE
</version>
</dependency>
</dependencies>
...
...
src/main/java/com/zhiwei/source_forward/bean/UrlLiveBean.java
View file @
d6f4e440
...
...
@@ -77,6 +77,8 @@ public class UrlLiveBean {
private
Integer
count
;
private
Integer
code
;
/**
* Constructor
*
...
...
@@ -92,6 +94,12 @@ public class UrlLiveBean {
* @param attr
* @param count
*/
private Attribution(Object attr, Integer count, Integer code) {
    // Stores the three arguments as-is; nothing is validated or copied here.
    this.code = code;
    this.count = count;
    this.attr = attr;
}
private
Attribution
(
Object
attr
,
Integer
count
){
this
.
attr
=
attr
;
this
.
count
=
count
;
...
...
@@ -114,7 +122,11 @@ public class UrlLiveBean {
* @return Attribution
*/
public static Attribution of(Object attr, Integer count) {
    // Delegates to the private two-argument constructor. Exactly one return:
    // a second statement after it would be unreachable and rejected by javac.
    return new Attribution(attr, count);
}
/**
 * Static factory that builds an {@code Attribution} carrying a code in addition
 * to the attribute and its count.
 *
 * @param attr  value stored as the attribute
 * @param count value stored as the count
 * @param code  value stored as the code
 * @return a new {@code Attribution} created from the three arguments
 */
public static Attribution of(Object attr, Integer count, Integer code) {
    Attribution created = new Attribution(attr, count, code);
    return created;
}
/**
...
...
@@ -135,7 +147,11 @@ public class UrlLiveBean {
return
count
;
}
public
void
AddCount
()
{
/**
 * Returns the code held by this instance.
 *
 * @return the stored code; may be {@code null} since the field is a boxed {@code Integer}
 */
public Integer getCode() {
    return this.code;
}
/**
 * Increases the stored count by one.
 *
 * <p>The count is a boxed {@code Integer}, so this unboxes, adds one and re-boxes;
 * a {@code null} count fails with a {@code NullPointerException}.
 */
public void addCount() {
    count = count + 1;
}
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
d6f4e440
...
...
@@ -51,7 +51,7 @@ public class UrlLiveCrawler {
if
(
nonNull
(
url
))
{
try
{
ZhiWeiTools
.
sleep
(
10
);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错:"
,
e
);
}
...
...
@@ -68,6 +68,8 @@ public class UrlLiveCrawler {
Map
<
String
,
String
>
headers
=
HeaderTool
.
getCommonHead
();
if
(
url
.
contains
(
"www.toutiao.com"
)){
headers
.
put
(
"referer"
,
url
);
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
url
=
treatZhihuUrl
(
url
);
}
try
{
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
...
...
@@ -76,10 +78,12 @@ public class UrlLiveCrawler {
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
rs
.
code
()
==
200
)
{
if
(
rs
.
isSuccessful
()
)
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
}
else
if
(
rs
.
code
()
==
403
){
callBack
(
callback
,
attr
,
-
1
,
String
.
valueOf
(
rs
.
code
()));
}
else
{
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
}
}
else
{
callBack
(
callback
,
attr
,
1
,
"未访问成功"
);
...
...
@@ -155,7 +159,7 @@ public class UrlLiveCrawler {
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
UrlLiveBean
ulb
=
matchDel
(
html
,
attr
,
attr
.
getAttr
().
toString
());
UrlLiveBean
ulb
=
matchDel
(
html
,
attr
,
attr
.
getAttr
().
toString
());
if
(
Objects
.
nonNull
(
ulb
))
{
callback
.
onData
(
ulb
,
attr
);
}
else
{
...
...
@@ -215,6 +219,10 @@ public class UrlLiveCrawler {
}
}
else
if
(
url
.
contains
(
"huanqiu.com"
)
&&
result
.
contains
(
"www.huanqiu.com/404.html"
))
{
title
=
"网页已删除"
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
JSONObject
resultJson
=
JSONObject
.
parseObject
(
result
);
title
=
resultJson
.
getString
(
"title"
)!=
null
?
resultJson
.
getString
(
"title"
):
resultJson
.
getString
(
"message"
);
}
//若title 为拿到 用 此方法
...
...
@@ -263,7 +271,7 @@ public class UrlLiveCrawler {
,
"【一点资讯】www.yidianzixun.com"
,
"错误页面"
,
"网站暂停通知"
,
"【快资讯】你的专属资讯平台"
,
"百度新闻——全球最大的中文新闻平台"
,
"以上文章由以下机构判定为不实信息"
,
"该公众号已迁移"
,
"财经网-CAIJING.COM.CN"
,
"蚂蚁资讯"
,
"参数错误"
,
"时尚头条_YOKA时尚网"
,
"该文章已经被删除"
,
"网易"
,
"链接已过期"
,
"找不到页面"
,
"今晚网"
,
"该文章已被删除"
);
,
"网易"
,
"链接已过期"
,
"找不到页面"
,
"今晚网"
,
"该文章已被删除"
,
"该回答已被删除-知乎"
,
"资源不存在"
);
List
<
String
>
cList
=
Arrays
.
asList
(
"提示信息-"
,
"此内容因违规无法查看"
,
"微信公众号不存在"
,
"此内容被投诉且经审核涉嫌侵权,无法查看"
,
"thepageyourequestedwasnotfound"
,
"未知错误"
...
...
@@ -273,210 +281,26 @@ public class UrlLiveCrawler {
return
cList
.
stream
().
anyMatch
(
title:
:
contains
)
||
eList
.
stream
().
anyMatch
(
title:
:
equals
);
}
// /**
// *
// * ( 微信谣言的无效网址筛选规则)
// * @author 陈炜涛
// * @param doc
// * @return
// * @time 2016年6月3日上午9:54:00
// * @return boolean
// */
// private boolean rulerYaoyan(Document doc){
// boolean flg = false;
// if ("谣言".equals(doc.select(".pic_rumor").text()))
// {
// flg = true;
// }
// return flg;
// }
//
// /**
// *
// * ( 微信内容违规的无效网址筛选规则)
// * @author 陈炜涛
// * @param doc
// * @return
// * @time 2016年6月3日上午9:59:54
// * @return boolean
// */
// private boolean rulerWechat(Document doc)
// {
// boolean flg = false;
// if ((doc.select("h3.msg-title").text()).contains("此内容被投诉且经审核涉嫌侵权,无法查看") || (doc.select("p.title").text()).contains("此内容因违规无法查看") || doc.select("p.title").text().contains("此帐号在冻结期,内容无法查看"))
// {
// flg = true;
// }
// return flg;
// }
//
//
// /**
// *
// * ( 微信内容违规的无效网址筛选规则)
// * @author 陈炜涛
// * @param doc
// * @return
// * @time 2016年6月3日上午9:59:54
// * @return boolean
// */
// private boolean rulerTousu(Document doc)
// {
// boolean flg = false;
// if (0 < doc.select("i[class=\"icon_msg warn\"]").size())
// {
// flg = true;
// }
// return flg;
// }
//
// /**
// *
// * ( 环球的无效网址筛选规则)
// * @author 陈炜涛
// * @param doc
// * @return
// * @time 2016年6月3日上午9:59:54
// * @return boolean
// */
// private boolean rulerHuanqiuWuxiao(Document doc)
// {
// boolean flg = false;
// if (0 < doc.select("div[class=\"errMsg\"]").size())
// {
// flg = true;
// }
// return flg;
// }
//
// /**
// *
// * ( 空的无效网址筛选规则)
// * @author 陈炜涛
// * @param doc
// * @return
// * @time 2016年6月3日上午9:59:54
// * @return boolean
// */
// private boolean rulerKong(Document doc)
// {
// boolean flg = false;
// if (14 > doc.select("body").toString().length()
// &&
// 14 > doc.select("head").toString().length())
// {
// flg = true;
// }
// return flg;
// }
//
// /**
// *
// * ( 内容不存在)
// * @author 陈炜涛
// * @param doc
// * @return
// * @time 2016年6月3日上午9:59:54
// * @return boolean
// */
// private boolean rulerBucunzai(Document doc)
// {
// boolean flg = false;
// if (doc.text().contains("很抱歉,您访问的页面不存在")||doc.text().contains("该内容已被发布者删除"))
// {
// flg = true;
// }
// return flg;
// }
//
// /**
// *
// * ( 招商网的无效网址筛选规则)
// * @author 陈炜涛
// * @param doc
// * @return
// * @time 2016年6月3日上午9:59:54
// * @return boolean
// */
// private boolean rulerZhaoshang(Document doc)
// {
// boolean flg = false;
// try
// {
// if ("<a href=\"\"> </a>".equals(doc.select("div[class=\"paths\"]")
// .first().child(2).toString()))
// {
// flg = true;
// }
// }
// catch (Exception e)
// {
// e.printStackTrace();
// // TODO: handle exception
// }
//
// return flg;
// }
//
//
// /**
// *
// * ( 一点资讯的无效网址筛选规则)
// * @author 陈炜涛
// * @param doc
// * @return
// * @time 2016年6月3日上午9:59:54
// * @return boolean
// */
// private boolean rulerYidian(Document doc)
// {
// boolean flg = false;
// try
// {
// if (doc.select("div[class=\"content\"]").text().contains("文章没有找到"))
// {
// flg = true;
// }
// }
// catch (Exception e)
// {
// e.printStackTrace();
// // : handle exception
// }
// return flg;
// }
//
// /**
// * @Title: rulerHead
// * @author hero
// * @Description: 验证链接头部
// * @param @param doc
// * @param @return 设定文件
// * @return boolean 返回类型
// */
// private boolean rulerHead(Document doc)
// {
// List<Node> nodeList = doc.head().childNodes();
// try {
// for (Node node : nodeList) {
// if (node.outerHtml().contains("<title>")) {
// String title = node.toString().split("<title>")[1].split("</title>")[0];
// if(title.contains("未知错误") || title.contains("Object moved") || title.contains("404") || title.contains("页面没有找到") || title.contains("页面未找到") || title.contains("301 Moved Permanently")){
// return true;
// }
// }
// if (node.outerHtml().contains("meta")) {
// String meta = node.toString();
// if(meta.contains("公益404页面")) {
// return true;
// }
// }
// }
// } catch (Exception e) {
// e.printStackTrace();
// return false;
// }
// return false;
// }
/**
 * Rewrites a Zhihu page URL to the corresponding {@code api.zhihu.com} endpoint.
 *
 * <p>Markers are tried in priority order: {@code /answer/} (answers),
 * {@code /question/} (questions), then {@code /p/} (articles). Only the id segment
 * directly after the last occurrence of the marker is kept, so a trailing slash,
 * extra path segments, a query string or a fragment never end up in the API URL.
 *
 * @param url the original URL; must not be {@code null}
 * @return the API URL, or {@code url} unchanged when no marker is followed by an id
 */
private static String treatZhihuUrl(String url) {
    final String[][] routes = {
        {"/answer/", "https://api.zhihu.com/answers/"},
        {"/question/", "https://api.zhihu.com/questions/"},
        {"/p/", "https://api.zhihu.com/articles/"}
    };
    for (String[] route : routes) {
        // lastIndexOf keeps the "everything up to the last marker is dropped"
        // behaviour of the former greedy ".*<marker>" regex, without recompiling it.
        int markerAt = url.lastIndexOf(route[0]);
        if (markerAt < 0) {
            continue;
        }
        int idStart = markerAt + route[0].length();
        int idEnd = idStart;
        // The id stops at the next path, query or fragment delimiter.
        while (idEnd < url.length() && "/?#".indexOf(url.charAt(idEnd)) < 0) {
            idEnd++;
        }
        if (idEnd > idStart) {
            return route[1] + url.substring(idStart, idEnd);
        }
        // Marker present but nothing after it: fall through to the next marker
        // instead of emitting an endpoint with an empty id.
    }
    return url;
}
}
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
d6f4e440
...
...
@@ -11,6 +11,7 @@ import org.apache.logging.log4j.Logger;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.crawler.UrlLiveCrawler
;
...
...
@@ -74,8 +75,7 @@ public class URLLive {
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://www.ebrun.com/ebrungo/zb/316384.shtml"
);
urlList
.
add
(
"https://www.zhihu.com/question/340524333"
);
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
for
(
UrlLiveBean
b
:
u
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment