Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
1ecccbba
Commit
1ecccbba
authored
Jan 11, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
链接是否删除 初步完成 版本提升至0.0.9
parent
256d62f0
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
83 additions
and
49 deletions
+83
-49
pom.xml
+1
-1
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+71
-32
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+11
-16
No files found.
pom.xml
View file @
1ecccbba
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<version>
0.0.
8
-SNAPSHOT
</version>
<version>
0.0.
9
-SNAPSHOT
</version>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
1ecccbba
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.Arrays
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.concurrent.TimeUnit
;
...
...
@@ -9,6 +11,7 @@ import org.apache.logging.log4j.Logger;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.async.MultiThreadingCounter
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
...
...
@@ -58,7 +61,6 @@ public class UrlLiveCrawler {
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
url
=
dealUrl
(
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
// Map<String,String> headers = new HashMap<>();
Map
<
String
,
String
>
headers
=
HeaderTool
.
getCommonHead
();
if
(
url
.
contains
(
"www.toutiao.com"
)){
headers
.
put
(
"referer"
,
url
);
...
...
@@ -72,15 +74,13 @@ public class UrlLiveCrawler {
try
{
if
(
response
.
code
()
==
200
)
{
parseHtml
(
response
.
body
().
string
(),
attr
,
callback
,
counter
);
}
else
if
(
response
.
code
()
==
404
)
{
}
else
{
if
(
attr
.
getCount
()
>
2
)
{
callBack
(
callback
,
attr
,
1
,
"404"
);
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
response
.
code
())
);
}
else
{
attr
.
AddCount
();
search
(
counter
,
attr
.
getAttr
().
toString
(),
attr
,
callback
);
}
}
else
{
callBack
(
callback
,
attr
,
-
2
,
String
.
valueOf
(
response
.
code
()));
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析出错"
,
e
);
...
...
@@ -116,7 +116,7 @@ public class UrlLiveCrawler {
if
(
i
==
1
)
{
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
true
,
title
);
}
else
{
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
i
);
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
i
,
title
);
}
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
...
...
@@ -126,6 +126,7 @@ public class UrlLiveCrawler {
}
private
String
dealUrl
(
String
url
)
{
try
{
if
(
url
.
contains
(
"toutiao.com"
))
{
try
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
...
...
@@ -151,8 +152,13 @@ public class UrlLiveCrawler {
}
else
{
url
=
url
.
replace
(
"http"
,
"https"
);
}
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
)
&&
url
.
contains
(
"wm_aid"
))
{
url
=
"http://ff.dayu.com/contents/origin/"
+
url
.
split
(
"wm_aid="
)[
1
].
split
(
"!!wm_id"
)[
0
]+
"?biz_id=1002&_fetch_author=1&_fetch_incrs=1"
;
}
return
url
;
}
catch
(
Exception
e
)
{
return
url
;
}
}
/**
...
...
@@ -192,15 +198,23 @@ public class UrlLiveCrawler {
public
UrlLiveBean
matchDel
(
String
result
,
Attribution
attr
,
String
url
){
Document
doc
=
Jsoup
.
parse
(
result
);
String
title
=
null
;
boolean
f
=
false
;
if
(
url
.
contains
(
"mp.weixin.qq.com"
)
||
url
.
contains
(
"post.mp.qq.com"
)){
title
=
doc
.
select
(
"h2.rich_media_title"
).
text
().
replaceAll
(
" "
,
""
);
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
()
)
{
title
=
doc
.
select
(
"p.title"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
()
)
{
title
=
doc
.
select
(
"h3.msg-title"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"div.global_error_msg.warn"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"p.tips"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"h2"
).
text
();
}
}
else
if
(
url
.
contains
(
"kuaibao"
)){
title
=
doc
.
select
(
"p.title"
).
text
().
replaceAll
(
" "
,
""
);
}
else
if
(
url
.
contains
(
"chinadaily.com.cn"
)){
...
...
@@ -209,6 +223,20 @@ public class UrlLiveCrawler {
title
=
doc
.
select
(
"p#contaniner"
).
text
();
}
else
if
(
url
.
contains
(
"kanfanews.com"
))
{
title
=
doc
.
select
(
"p#tit"
).
text
();
}
else
if
(
url
.
contains
(
"ifeng.com"
)
&&
result
.
contains
(
"url=http://www.ifeng.com/"
))
{
title
=
"网页已删除"
;
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
))
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
title
=
json
.
getJSONObject
(
"data"
).
getString
(
"title"
);
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
"网页已删除"
;
}
}
catch
(
Exception
e
)
{
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
}
}
else
if
(
url
.
contains
(
"huanqiu.com"
)
&&
result
.
contains
(
"www.huanqiu.com/404.html"
))
{
title
=
"网页已删除"
;
}
//若title 为拿到 用 此方法
...
...
@@ -221,33 +249,44 @@ public class UrlLiveCrawler {
title
=
doc
.
select
(
"title"
).
text
().
replaceAll
(
" "
,
""
);
}
if
(
title
!=
null
&&
title
.
length
()
>
1
){
if
(
Objects
.
equals
(
"网页已删除"
,
title
)
||
Objects
.
equals
(
"页面提示"
,
title
)
||
title
.
contains
(
"正在维护中"
)
||
Objects
.
equals
(
"此文章被第三方评估为不实信息"
,
title
)
||
title
.
contains
(
"提示信息-"
)
||
Objects
.
equals
(
"财经头条"
,
title
)
||
Objects
.
equals
(
"知识100题"
,
title
)
||
Objects
.
equals
(
"502BadGateway"
,
title
)
||
Objects
.
equals
(
"提示信息"
,
title
)
||
Objects
.
equals
(
"跳转页"
,
title
)
||
Objects
.
equals
(
"跳转中..."
,
title
)
||
Objects
.
equals
(
"此帐号在冻结期,内容无法查看"
,
title
)
||
Objects
.
equals
(
"东北新闻网"
,
title
)
||
Objects
.
equals
(
"百度一下,你就知道"
,
title
)
||
Objects
.
equals
(
"帐号已迁移"
,
title
)
||
Objects
.
equals
(
"手机百度"
,
title
)
||
Objects
.
equals
(
"内容被删除"
,
title
)
||
Objects
.
equals
(
"亚博国际|首页"
,
title
)
||
Objects
.
equals
(
"中国软件网"
,
title
)
||
Objects
.
equals
(
"云广网"
,
title
)
||
Objects
.
equals
(
"新浪首页"
,
title
)
||
Objects
.
equals
(
"文章暂时找不到了"
,
title
)
||
title
.
contains
(
"此内容因违规无法查看"
)
||
title
.
contains
(
"微信公众号不存在"
)
||
title
.
contains
(
"此内容被投诉且经审核涉嫌侵权,无法查看"
)
||
Objects
.
equals
(
"-法易网"
,
title
)
||
Objects
.
equals
(
"【一点资讯】www.yidianzixun.com"
,
title
)
||
title
.
contains
(
"您访问的链接不存在"
)
||
Objects
.
equals
(
"文章暂时不能查看"
,
title
)
||
Objects
.
equals
(
"错误页面"
,
title
)
||
title
.
contains
(
"thepageyourequestedwasnotfound"
)
||
Objects
.
equals
(
"此帐号已被屏蔽, 内容无法查看"
,
title
)
||
Objects
.
equals
(
"网站暂停通知"
,
title
)
||
title
.
contains
(
"未知错误"
)
||
title
.
contains
(
"Object moved"
)
||
title
.
contains
(
"404"
)
||
title
.
contains
(
"页面没有找到"
)
||
title
.
contains
(
"页面未找到"
)
||
title
.
contains
(
"301MovedPermanently"
)){
f
=
true
;
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"h1"
).
text
().
replaceAll
(
" "
,
""
);
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
||
result
.
length
()
<
200
)
{
title
=
"网页已删除"
;
}
if
(
Objects
.
nonNull
(
title
)
&&
title
.
length
()
>
1
){
return
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
isDelete
(
title
),
title
);
}
else
{
return
null
;
}
return
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
f
,
title
);
}
/**
*
* @Description 标题判断
* @param title
* @return
*/
private
boolean
isDelete
(
String
title
)
{
List
<
String
>
eList
=
Arrays
.
asList
(
"系统出错"
,
"该内容已被发布者删除"
,
"网页已删除"
,
"此帐号已自主注销,内容无法查看"
,
"页面提示"
,
"正在维护中"
,
"此文章被第三方评估为不实信息"
,
"财经头条"
,
"知识100题"
,
"502BadGateway"
,
"提示信息"
,
"跳转页"
,
"跳转中..."
,
"此帐号在冻结期,内容无法查看"
,
"东北新闻网"
,
"百度一下,你就知道"
,
"帐号已迁移"
,
"手机百度"
,
"内容被删除"
,
"亚博国际|首页"
,
"中国软件网"
,
"云广网"
,
"新浪首页"
,
"文章暂时找不到了"
,
"-法易网"
,
"【一点资讯】www.yidianzixun.com"
,
"错误页面"
,
"网站暂停通知"
,
"【快资讯】你的专属资讯平台"
);
List
<
String
>
cList
=
Arrays
.
asList
(
"提示信息-"
,
"此内容因违规无法查看"
,
"微信公众号不存在"
,
"此内容被投诉且经审核涉嫌侵权,无法查看"
,
"thepageyourequestedwasnotfound"
,
"未知错误"
,
"Objectmoved"
,
"404"
,
"页面没有找到"
,
"页面未找到"
,
"301MovedPermanently"
,
"加载异常"
,
"此帐号已被屏蔽, 内容无法查看"
);
return
cList
.
stream
().
anyMatch
(
title:
:
contains
)
||
eList
.
stream
().
anyMatch
(
title:
:
equals
);
}
// /**
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
1ecccbba
package
com
.
zhiwei
.
source_forward
.
run
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.crawler.UrlLiveCrawler
;
...
...
@@ -69,17 +64,17 @@ public class URLLive {
return
UrlLiveCrawlerThread
.
getUrlLiveCrawle
(
urlList
);
}
/**
 * Manual entry point for trying out the URL liveness check.
 *
 * Initializes ProxyFactory with a hard-coded ZooKeeper address, passes a
 * single URL to URLLive.verificationURLLive and prints every returned
 * UrlLiveBean to standard output.
 *
 * NOTE(review): the URL string literal below is split across two lines in
 * this view, which looks like an extraction artifact; confirm the exact
 * literal against the repository before relying on it.
 *
 * @param args command-line arguments (not used)
 */
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"https://www.hao123.com/mid/16981890690654602094
"
);
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
for
(
UrlLiveBean
b
:
u
)
{
System
.
out
.
println
(
b
.
toString
());
}
}
//
public static void main(String[] args) {
//
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
//
List<String> urlList = new ArrayList<>();
// urlList.add("http://sh.qihoo.com/mob/transcoding?sign=360_e39369d1&n=10&pg=41&u=84c80ad777cd9a41152b4fd9c44f96e2&gzh=3093075895&news_sdk_version=&sqid=&_=1545026725607&callback=jsonp75&url=http%3A%2F%2Fzm.news.so.com%2F708e22872ce43ca08eec2a1fc57c6897&check=e0fae47326e7916f&ucheck=75e961d9583cfebe81a39e2dd972b0aa&uid=84c80ad777cd9a41152b4fd9c44f96e2&360newsdetail=1&c=detail&apiflag=detail&articlety=zmt
");
//
//
//
List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
//
for(UrlLiveBean b : u) {
//
System.out.println(b.toString());
//
}
//
}
static
class
UrlLiveCrawlerThread
extends
Thread
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment