Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
a94682af
Commit
a94682af
authored
Jun 10, 2020
by
chenweiyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
删除误导性判断
parent
7f7e4a1c
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
31 additions
and
18 deletions
+31
-18
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+29
-16
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+2
-2
No files found.
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
a94682af
...
@@ -2,6 +2,9 @@ package com.zhiwei.source_forward.crawler;
...
@@ -2,6 +2,9 @@ package com.zhiwei.source_forward.crawler;
import
static
java
.
util
.
Objects
.
nonNull
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.net.InetSocketAddress
;
import
java.net.Proxy
;
import
java.net.Proxy.Type
;
import
java.util.Arrays
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
...
@@ -52,6 +55,7 @@ public class UrlLiveCrawler {
...
@@ -52,6 +55,7 @@ public class UrlLiveCrawler {
counter
.
add
();
counter
.
add
();
if
(
nonNull
(
url
))
{
if
(
nonNull
(
url
))
{
try
{
try
{
// ZhiWeiTools.sleep(3000);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错:"
,
e
);
logger
.
error
(
"搜索创建出错:"
,
e
);
...
@@ -71,7 +75,7 @@ public class UrlLiveCrawler {
...
@@ -71,7 +75,7 @@ public class UrlLiveCrawler {
ProxyHolder
ph
=
null
;
ProxyHolder
ph
=
null
;
if
(
url
.
contains
(
"toutiao.com"
)){
if
(
url
.
contains
(
"toutiao.com"
)){
headers
.
put
(
"referer"
,
url
);
headers
.
put
(
"referer"
,
url
);
// headers.put("cookie", "
__ac_nonce=05ed0c7bb00bc34aa36be; __ac_signature=0fFbMAAgEBBBDtmbXG3W-tHxWiAAI8q; ttcid=cfbee5ddf00b4013b5236b534c8cf36c19; tt_webid=6832180195202909704; s_v_web_id=verify_kary2om5_954yc9QS_twaQ_42XG_9Sei_dsAVEudiEodo; __tasessionId=4bmcvzruo1590740924839; tt_webid=6832180195202909704; SLARDAR_WEB_ID=fb4d8abf-bdd7-4e9e-ba38-8c00f0c13846; csrftoken=6430b380cc664479dfa0b0e5061b2db9; tt_scid=kRdSxPldqsXGPvYrxh3K4HZ5ayX0isXRzk08ZTjlIGmNW3HaSLrhBfHJ.CRjNom.b0fe
");
// headers.put("cookie", "
csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; tt_webid=6833273737980659213; tt_webid=6833273737980659213; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_scid=KdPOCLtoSVDQTnptuiejH4SkyYa7RodIcBHFpAGwf17X9rUWJJadFYALAeJ5C8xI71e5; __ac_nonce=05ee037380054152ddc38; __ac_signature=6C1-YAAgEBB40vzLiGE95-gsf3AALbYjxEHG0FQERCcxB-9tebz.fovM7gew-AHObLDUegpmF7k8G57XzXokCbi72klNkdvS.ukzrfuuFk3UL836QudGNHE6IJQ47kFRkiT; __tasessionId=nz5ags6bk1591752505915
");
headers
.
put
(
"accept"
,
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
);
headers
.
put
(
"accept"
,
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
);
headers
.
put
(
"accept-encoding"
,
"gzip, deflate, br"
);
headers
.
put
(
"accept-encoding"
,
"gzip, deflate, br"
);
headers
.
put
(
"accept-language"
,
"zh-CN,zh;q=0.9"
);
headers
.
put
(
"accept-language"
,
"zh-CN,zh;q=0.9"
);
...
@@ -91,19 +95,20 @@ public class UrlLiveCrawler {
...
@@ -91,19 +95,20 @@ public class UrlLiveCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
if
(
Objects
.
nonNull
(
request
))
{
if
(
Objects
.
nonNull
(
request
))
{
counter
.
add
();
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ph
).
whenComplete
((
rs
,
ex
)
->
{
// , new Proxy(Type.HTTP, new InetSocketAddress("119.3.38.9", 31128))
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
try
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
rs
.
isSuccessful
())
{
if
(
rs
.
isSuccessful
())
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
}
else
if
(
rs
.
code
()
==
403
){
}
else
if
(
rs
.
code
()
==
404
){
callBack
(
callback
,
attr
,
-
1
,
String
.
valueOf
(
rs
.
code
()));
}
else
{
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
}
else
{
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
}
}
else
{
}
else
{
logger
.
error
(
"e"
,
ex
);
logger
.
error
(
"e"
,
ex
);
callBack
(
callback
,
attr
,
1
,
"未访问成功
"
);
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断
"
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
" 数据是否删除 采集出错 {} "
,
e
);
logger
.
error
(
" 数据是否删除 采集出错 {} "
,
e
);
...
@@ -157,6 +162,9 @@ public class UrlLiveCrawler {
...
@@ -157,6 +162,9 @@ public class UrlLiveCrawler {
}
}
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
)
&&
url
.
contains
(
"wm_aid"
))
{
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
)
&&
url
.
contains
(
"wm_aid"
))
{
url
=
"http://ff.dayu.com/contents/origin/"
+
url
.
split
(
"wm_aid="
)[
1
].
split
(
"!!wm_id"
)[
0
]+
"?biz_id=1002&_fetch_author=1&_fetch_incrs=1"
;
url
=
"http://ff.dayu.com/contents/origin/"
+
url
.
split
(
"wm_aid="
)[
1
].
split
(
"!!wm_id"
)[
0
]+
"?biz_id=1002&_fetch_author=1&_fetch_incrs=1"
;
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
)
&&
url
.
contains
(
"infoid="
))
{
// https://tznew.58.com/view/c/sharingDetailNew?infoid=117073473
url
=
"https://tznew.58.com/tznew/c/info-detail?infoid="
+
url
.
split
(
"infoid="
)[
1
].
split
(
"&"
)[
0
];
}
}
return
url
;
return
url
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -180,7 +188,7 @@ public class UrlLiveCrawler {
...
@@ -180,7 +188,7 @@ public class UrlLiveCrawler {
if
(
Objects
.
nonNull
(
ulb
))
{
if
(
Objects
.
nonNull
(
ulb
))
{
callback
.
onData
(
ulb
,
attr
);
callback
.
onData
(
ulb
,
attr
);
}
else
{
}
else
{
callBack
(
callback
,
attr
,
-
1
,
null
);
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
}
}
}
}
}
...
@@ -234,9 +242,6 @@ public class UrlLiveCrawler {
...
@@ -234,9 +242,6 @@ public class UrlLiveCrawler {
try
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
title
=
json
.
getJSONObject
(
"data"
).
getString
(
"title"
);
title
=
json
.
getJSONObject
(
"data"
).
getString
(
"title"
);
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
"网页已删除"
;
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
}
}
...
@@ -253,6 +258,13 @@ public class UrlLiveCrawler {
...
@@ -253,6 +258,13 @@ public class UrlLiveCrawler {
title
=
String
.
valueOf
(
"404"
);
title
=
String
.
valueOf
(
"404"
);
}
else
if
(
result
.
contains
(
"文章没有找到哦"
)
&&
url
.
contains
(
"yidianzixun.com"
))
{
}
else
if
(
result
.
contains
(
"文章没有找到哦"
)
&&
url
.
contains
(
"yidianzixun.com"
))
{
title
=
"文章未找到"
;
title
=
"文章未找到"
;
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
))
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
title
=
json
.
getJSONObject
(
"result"
).
getString
(
"title"
);
}
catch
(
Exception
e
)
{
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
}
}
}
//若title 为拿到 用 此方法
//若title 为拿到 用 此方法
...
@@ -270,10 +282,10 @@ public class UrlLiveCrawler {
...
@@ -270,10 +282,10 @@ public class UrlLiveCrawler {
title
=
doc
.
select
(
"h1"
).
text
().
replaceAll
(
" "
,
""
);
title
=
doc
.
select
(
"h1"
).
text
().
replaceAll
(
" "
,
""
);
}
}
//若title 为拿到 用 此方法
//若title 为拿到 用 此方法 无法获取标题不进行程序迷惑性判断
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
||
result
.
length
()
<
200
)
{
//
if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
title
=
"网页已删除"
;
//
title = "网页已删除";
}
//
}
if
(
Objects
.
nonNull
(
title
)
&&
title
.
length
()
>
1
){
if
(
Objects
.
nonNull
(
title
)
&&
title
.
length
()
>
1
){
return
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
isDelete
(
title
),
title
);
return
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
isDelete
(
title
),
title
);
...
@@ -301,12 +313,13 @@ public class UrlLiveCrawler {
...
@@ -301,12 +313,13 @@ public class UrlLiveCrawler {
,
"【一点资讯】www.yidianzixun.com"
,
"错误页面"
,
"网站暂停通知"
,
"【快资讯】你的专属资讯平台"
,
"【一点资讯】www.yidianzixun.com"
,
"错误页面"
,
"网站暂停通知"
,
"【快资讯】你的专属资讯平台"
,
"百度新闻——全球最大的中文新闻平台"
,
"以上文章由以下机构判定为不实信息"
,
"该公众号已迁移"
,
"百度新闻——全球最大的中文新闻平台"
,
"以上文章由以下机构判定为不实信息"
,
"该公众号已迁移"
,
"财经网-CAIJING.COM.CN"
,
"蚂蚁资讯"
,
"参数错误"
,
"时尚头条_YOKA时尚网"
,
"该文章已经被删除"
,
"财经网-CAIJING.COM.CN"
,
"蚂蚁资讯"
,
"参数错误"
,
"时尚头条_YOKA时尚网"
,
"该文章已经被删除"
,
"网易"
,
"链接已过期"
,
"找不到页面"
,
"今晚网"
,
"该文章已被删除"
,
"该回答已被删除-知乎"
,
"资源不存在"
,
"文章未找到"
);
,
"网易"
,
"链接已过期"
,
"找不到页面"
,
"今晚网"
,
"该文章已被删除"
,
"该回答已被删除-知乎"
,
"资源不存在"
,
"文章未找到"
,
"UC头条"
);
List
<
String
>
cList
=
Arrays
.
asList
(
"提示信息-"
,
"此内容因违规无法查看"
,
"微信公众号不存在"
List
<
String
>
cList
=
Arrays
.
asList
(
"提示信息-"
,
"此内容因违规无法查看"
,
"微信公众号不存在"
,
"此内容被投诉且经审核涉嫌侵权,无法查看"
,
"thepageyourequestedwasnotfound"
,
"未知错误"
,
"此内容被投诉且经审核涉嫌侵权,无法查看"
,
"thepageyourequestedwasnotfound"
,
"未知错误"
,
"Objectmoved"
,
"404"
,
"页面没有找到"
,
"页面未找到"
,
"301MovedPermanently"
,
"加载异常"
,
,
"Objectmoved"
,
"404"
,
"页面没有找到"
,
"页面未找到"
,
"301MovedPermanently"
,
"加载异常"
,
"此帐号已被屏蔽, 内容无法查看"
,
"链接不存在"
);
"此帐号已被屏蔽, 内容无法查看"
,
"链接不存在"
,
"新闻已删除"
);
return
cList
.
stream
().
anyMatch
(
title:
:
contains
)
||
eList
.
stream
().
anyMatch
(
title:
:
equals
);
return
cList
.
stream
().
anyMatch
(
title:
:
contains
)
||
eList
.
stream
().
anyMatch
(
title:
:
equals
);
}
}
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
a94682af
...
@@ -72,8 +72,8 @@ public class URLLive {
...
@@ -72,8 +72,8 @@ public class URLLive {
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
ProxyInit
.
initProxy
();
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://www.toutiao.com/a1665677841741827"
);
//
urlList.add("http://www.toutiao.com/a1665677841741827");
// urlList.add("https://mp.weixin.qq.com/s?__biz=MzA3NjgyNTU5Nw==&mid=2247486586&idx=2&sn=419218b3c831b17d2b9bd9a5281ea842&scene=6#wechat_redirect
");
urlList
.
add
(
"http://www.yidianzixun.com/article/0PYO4Gbh
"
);
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
for
(
UrlLiveBean
b
:
u
)
{
for
(
UrlLiveBean
b
:
u
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment