Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
9fcfba2d
Commit
9fcfba2d
authored
Aug 13, 2020
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
各个采集验证添加休眠,避免数据过多导致程序阻塞
parent
aa059934
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
594 additions
and
588 deletions
+594
-588
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+121
-119
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+2
-0
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+130
-128
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+341
-340
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+0
-1
No files found.
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
View file @
9fcfba2d
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.Objects
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
okhttp3.Request
;
public
class
ContentCrawler
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
ContentCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
*
* @Description 链接传入 并 返回采集完信号
* @param callback
* @param urls
* @return
* @throws Exception
*/
public
GroupSync
submitTask
(
ContentDataCallback
callback
,
String
...
urls
)
{
GroupSync
counter
=
new
GroupSync
();
start
(
counter
,
callback
,
urls
);
return
counter
;
}
/**
*
* @Description 提交链接
* @param counter
* @param callback
* @param urls
*/
private
void
start
(
GroupSync
counter
,
ContentDataCallback
callback
,
String
...
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
if
(
url
!=
null
)
{
try
{
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
}
}
}
}
}
/**
*
* @Description 链接获取文章信息
* @param counter
* @param url
* @param attr
* @param callback
* @return
*/
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
ContentDataCallback
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
}
}
catch
(
Exception
e
)
{
logger
.
info
(
"搜索结果访问失败: {}"
,
ex
);
}
finally
{
counter
.
done
();
}
});
return
counter
;
}
/**
*
*
* @Description 获取正文解析
* @param response
* @param attr
* @param callback
*/
private
void
parseHtml
(
String
result
,
Attribution
attr
,
ContentDataCallback
callback
)
{
try
{
String
content
=
MatchContent
.
matchContent
(
attr
.
get
().
toString
(),
result
);
ContentBean
cb
=
new
ContentBean
(
attr
.
get
().
toString
(),
content
);
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
callback
.
onData
(
cb
,
attr
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"网页链接失效"
,
e
);
}
}
}
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.Objects
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
okhttp3.Request
;
public
class
ContentCrawler
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
ContentCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
*
* @Description 链接传入 并 返回采集完信号
* @param callback
* @param urls
* @return
* @throws Exception
*/
public
GroupSync
submitTask
(
ContentDataCallback
callback
,
String
...
urls
)
{
GroupSync
counter
=
new
GroupSync
();
start
(
counter
,
callback
,
urls
);
return
counter
;
}
/**
*
* @Description 提交链接
* @param counter
* @param callback
* @param urls
*/
private
void
start
(
GroupSync
counter
,
ContentDataCallback
callback
,
String
...
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
ZhiWeiTools
.
sleep
(
100
);
if
(
url
!=
null
)
{
try
{
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
}
}
}
}
}
/**
*
* @Description 链接获取文章信息
* @param counter
* @param url
* @param attr
* @param callback
* @return
*/
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
ContentDataCallback
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
}
}
catch
(
Exception
e
)
{
logger
.
info
(
"搜索结果访问失败: {}"
,
ex
);
}
finally
{
counter
.
done
();
}
});
return
counter
;
}
/**
*
*
* @Description 获取正文解析
* @param response
* @param attr
* @param callback
*/
private
void
parseHtml
(
String
result
,
Attribution
attr
,
ContentDataCallback
callback
)
{
try
{
String
content
=
MatchContent
.
matchContent
(
attr
.
get
().
toString
(),
result
);
ContentBean
cb
=
new
ContentBean
(
attr
.
get
().
toString
(),
content
);
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
callback
.
onData
(
cb
,
attr
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"网页链接失效"
,
e
);
}
}
}
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
9fcfba2d
...
...
@@ -5,6 +5,7 @@ import java.util.List;
import
java.util.Map
;
import
java.util.Objects
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
...
...
@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler {
private
void
start
(
GroupSync
counter
,
MediaSelfSourceDataCallBack
callback
,
String
...
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
ZhiWeiTools
.
sleep
(
100
);
counter
.
add
();
if
(
url
!=
null
)
{
try
{
...
...
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
View file @
9fcfba2d
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
okhttp3.Request
;
public
class
SourceForwardCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
SourceForwardCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
List
<
String
>
sourceList
=
SourceData
.
getSourceList
();
public
GroupSync
submitTask
(
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
try
{
GroupSync
counter
=
new
GroupSync
();
start
(
counter
,
callback
,
urls
);
return
counter
;
}
catch
(
Exception
e
)
{
logger
.
error
(
" exception "
,
e
);
return
null
;
}
}
private
void
start
(
GroupSync
counter
,
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
counter
.
add
();
if
(
url
!=
null
)
{
try
{
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
}
}
counter
.
done
();
}
}
}
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
SourceForwardDataCallBack
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
// Map<String,String> headers = HeaderTool.getCommonHead();
if
(
url
.
contains
(
"www.toutiao.com"
)){
headers
.
put
(
"referer"
,
url
);
}
if
(
url
.
contains
(
"china.prcfe.com"
))
{
url
=
"http://china.prcfe.com/e/extend/ShowSource/?id="
+
url
.
split
(
"/"
)[
url
.
split
(
"/"
).
length
-
1
].
split
(
"\\."
)[
0
];
}
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
}
}
catch
(
Exception
e1
)
{
logger
.
error
(
"解析出错"
,
e1
);
}
finally
{
counter
.
done
();
}
});
return
counter
;
}
private
void
parseHtml
(
String
body
,
Attribution
attr
,
SourceForwardDataCallBack
callback
)
{
String
source
=
null
;
String
channel
=
"新闻"
;
String
isforward
=
"未知"
;
try
{
Document
document
=
Jsoup
.
parse
(
body
);
if
(
attr
.
get
().
toString
().
contains
(
"mp.weixin.qq.com"
)){
isforward
=
document
.
select
(
"div#meta_content"
).
select
(
"span#copyright_logo"
).
text
();
if
(
isforward
.
contains
(
"原创"
)){
isforward
=
"原创"
;
}
else
{
isforward
=
"未知"
;
}
}
else
if
(
attr
.
get
().
toString
().
contains
(
"www.toutiao.com"
)){
if
(
body
.
contains
(
"isOriginal"
)
&&
body
.
contains
(
"isOriginal: true"
)){
isforward
=
"原创"
;
}
}
else
{
channel
=
MatchChannel
.
verifyChannel
(
attr
.
get
().
toString
());
if
(
channel
==
null
){
List
<
Node
>
nodeList
=
document
.
head
().
childNodes
();
channel
=
MatchChannel
.
matchChannel
(
nodeList
);
}
source
=
MatchSource
.
matchSource
(
attr
.
get
().
toString
(),
document
.
toString
(),
sourceList
);
}
}
catch
(
Exception
e
)
{
source
=
null
;
channel
=
"新闻"
;
}
logger
.
info
(
attr
.
get
().
toString
()+
"======="
+
channel
+
"================="
+
source
);
SourceForwardBean
sfb
=
new
SourceForwardBean
(
attr
.
get
().
toString
(),
channel
,
source
,
isforward
);
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
callback
.
onData
(
sfb
,
attr
);
}
}
}
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
okhttp3.Request
;
public
class
SourceForwardCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
SourceForwardCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
List
<
String
>
sourceList
=
SourceData
.
getSourceList
();
public
GroupSync
submitTask
(
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
try
{
GroupSync
counter
=
new
GroupSync
();
start
(
counter
,
callback
,
urls
);
return
counter
;
}
catch
(
Exception
e
)
{
logger
.
error
(
" exception "
,
e
);
return
null
;
}
}
private
void
start
(
GroupSync
counter
,
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
counter
.
add
();
ZhiWeiTools
.
sleep
(
100
);
if
(
url
!=
null
)
{
try
{
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
}
}
counter
.
done
();
}
}
}
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
SourceForwardDataCallBack
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
// Map<String,String> headers = HeaderTool.getCommonHead();
if
(
url
.
contains
(
"www.toutiao.com"
)){
headers
.
put
(
"referer"
,
url
);
}
if
(
url
.
contains
(
"china.prcfe.com"
))
{
url
=
"http://china.prcfe.com/e/extend/ShowSource/?id="
+
url
.
split
(
"/"
)[
url
.
split
(
"/"
).
length
-
1
].
split
(
"\\."
)[
0
];
}
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
}
}
catch
(
Exception
e1
)
{
logger
.
error
(
"解析出错"
,
e1
);
}
finally
{
counter
.
done
();
}
});
return
counter
;
}
private
void
parseHtml
(
String
body
,
Attribution
attr
,
SourceForwardDataCallBack
callback
)
{
String
source
=
null
;
String
channel
=
"新闻"
;
String
isforward
=
"未知"
;
try
{
Document
document
=
Jsoup
.
parse
(
body
);
if
(
attr
.
get
().
toString
().
contains
(
"mp.weixin.qq.com"
)){
isforward
=
document
.
select
(
"div#meta_content"
).
select
(
"span#copyright_logo"
).
text
();
if
(
isforward
.
contains
(
"原创"
)){
isforward
=
"原创"
;
}
else
{
isforward
=
"未知"
;
}
}
else
if
(
attr
.
get
().
toString
().
contains
(
"www.toutiao.com"
)){
if
(
body
.
contains
(
"isOriginal"
)
&&
body
.
contains
(
"isOriginal: true"
)){
isforward
=
"原创"
;
}
}
else
{
channel
=
MatchChannel
.
verifyChannel
(
attr
.
get
().
toString
());
if
(
channel
==
null
){
List
<
Node
>
nodeList
=
document
.
head
().
childNodes
();
channel
=
MatchChannel
.
matchChannel
(
nodeList
);
}
source
=
MatchSource
.
matchSource
(
attr
.
get
().
toString
(),
document
.
toString
(),
sourceList
);
}
}
catch
(
Exception
e
)
{
source
=
null
;
channel
=
"新闻"
;
}
logger
.
info
(
attr
.
get
().
toString
()+
"======="
+
channel
+
"================="
+
source
);
SourceForwardBean
sfb
=
new
SourceForwardBean
(
attr
.
get
().
toString
(),
channel
,
source
,
isforward
);
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
callback
.
onData
(
sfb
,
attr
);
}
}
}
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
9fcfba2d
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
/**
*
* @ClassName UrlLiveCrawler
* @Description 判断页面是否存在
* @author byte-zbs
* @Date 2018年8月20日 下午3:34:57
* @version 1.0.0
*/
public
class
UrlLiveCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
throwException
(
true
).
build
();
public
GroupSync
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
GroupSync
counter
=
new
GroupSync
();
start
(
counter
,
callback
,
urls
);
return
counter
;
}
private
void
start
(
GroupSync
counter
,
UrlLiveDataCallback
callback
,
String
...
urls
)
{
if
(
nonNull
(
urls
)
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
counter
.
add
();
if
(
nonNull
(
url
))
{
try
{
// ZhiWeiTools.sleep(3000);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错:"
,
e
);
}
}
counter
.
done
();
}
}
}
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
// System.out.println(url);
url
=
dealUrl
(
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
ProxyHolder
ph
=
null
;
if
(
url
.
contains
(
"toutiao.com"
)){
// headers.put("referer", url);
// headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_webid=6837283963338622477; tt_webid=6837283963338622477; __tasessionId=dsei2aty41591951911851; tt_scid=M--AJ-FYwZ0qcYTzQCLyMeS5MlykLS6ktMkvqKKJmq-ghRxX4waEBhJ3YbheuNmi2b8a");
// headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
// headers.put("accept-encoding", "gzip, deflate, br");
// headers.put("accept-language", "zh-CN,zh;q=0.9");
// headers.put("cache-control", "no-cache");
// headers.put("sec-fetch-dest", "document");
// headers.put("sec-fetch-mode", "navigate");
// headers.put("sec-fetch-site", "same-origin");
// headers.put("sec-fetch-user", "?1");
// headers.put("upgrade-insecure-requests", "1");
// headers.put("user-agent", "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Mobile Safari/537.36");
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
url
=
treatZhihuUrl
(
url
);
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
}
try
{
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
if
(
Objects
.
nonNull
(
request
))
{
counter
.
add
();
// , new Proxy(Type.HTTP, new InetSocketAddress("119.3.38.9", 31128))
httpBoot
.
asyncCall
(
request
,
ph
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
rs
.
isSuccessful
())
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
}
else
if
(
rs
.
code
()
==
404
){
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
}
else
{
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
}
else
{
logger
.
error
(
"e"
,
ex
);
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
" 数据是否删除 采集出错 {} "
,
e
);
}
finally
{
counter
.
done
();
}
});
return
counter
;
}
}
catch
(
Exception
e2
)
{
logger
.
error
(
"数据出错 {}"
,
e2
);
}
return
counter
;
}
private
void
callBack
(
UrlLiveDataCallback
callback
,
Attribution
attr
,
int
i
,
String
title
)
{
UrlLiveBean
ulb
=
null
;
if
(
i
==
1
)
{
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
true
,
title
);
}
else
{
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
i
,
title
);
}
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
callback
.
onData
(
ulb
,
attr
);
}
}
private
String
dealUrl
(
String
url
)
{
try
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
}
else
{
url
=
url
.
replace
(
"toutiao.com"
,
"www.toutiao.com"
);
}
if
(
url
.
contains
(
"https"
))
{
}
else
{
url
=
url
.
replace
(
"http"
,
"https"
);
}
if
(
url
.
contains
(
"group"
))
{
url
=
"https://www.toutiao.com/a"
+
url
.
split
(
"/"
)[
4
]
+
"/"
;
}
}
else
if
(
url
.
contains
(
"mp.weixin.qq.com"
))
{
if
(
url
.
contains
(
"https"
))
{
}
else
{
url
=
url
.
replace
(
"http"
,
"https"
);
}
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
)
&&
url
.
contains
(
"wm_aid"
))
{
url
=
"http://ff.dayu.com/contents/origin/"
+
url
.
split
(
"wm_aid="
)[
1
].
split
(
"!!wm_id"
)[
0
]+
"?biz_id=1002&_fetch_author=1&_fetch_incrs=1"
;
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
)
&&
url
.
contains
(
"infoid="
))
{
// https://tznew.58.com/view/c/sharingDetailNew?infoid=117073473
url
=
"https://tznew.58.com/tznew/c/info-detail?infoid="
+
url
.
split
(
"infoid="
)[
1
].
split
(
"&"
)[
0
];
}
return
url
;
}
catch
(
Exception
e
)
{
return
url
;
}
}
/**
*
* @Description 判断是否删除
* @param html
* @param attr
* @param callback
*/
private
void
parseHtml
(
String
html
,
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
UrlLiveBean
ulb
=
matchDel
(
html
,
attr
,
attr
.
getAttr
().
toString
());
if
(
Objects
.
nonNull
(
ulb
))
{
callback
.
onData
(
ulb
,
attr
);
}
else
{
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
}
}
/***
* @Title: matchDel
* @author hero
* @Description: 验证链接是否有效
* @param @param page
* @param @return 设定文件
* @return boolean 返回类型
*/
public
UrlLiveBean
matchDel
(
String
result
,
Attribution
attr
,
String
url
){
try
{
Document
doc
=
Jsoup
.
parse
(
result
);
String
title
=
null
;
if
(
url
.
contains
(
"mp.weixin.qq.com"
)
||
url
.
contains
(
"post.mp.qq.com"
)
||
url
.
contains
(
"weixin.sogou.com"
)){
title
=
doc
.
select
(
"h2.rich_media_title"
).
text
().
replaceAll
(
" "
,
""
);
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"p.title"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"h3.msg-title"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"div.global_error_msg.warn"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"p.tips"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"h2"
).
text
();
}
// 获取title
Matcher
ma5
=
Pattern
.
compile
(
"var msg_title = \'(.*)\'"
)
.
matcher
(
result
);
if
(
ma5
.
find
())
{
title
=
ma5
.
group
(
1
).
replaceAll
(
" "
,
" "
).
trim
();
}
}
else
if
(
url
.
contains
(
"kuaibao"
)){
title
=
doc
.
select
(
"p.title"
).
text
().
replaceAll
(
" "
,
""
);
}
else
if
(
url
.
contains
(
"chinadaily.com.cn"
)){
title
=
doc
.
select
(
"p.style1"
).
text
().
replaceAll
(
" "
,
""
);
}
else
if
(
url
.
contains
(
"baidu.com"
)
||
url
.
contains
(
"hao123.com"
))
{
title
=
doc
.
select
(
"p#contaniner"
).
text
();
}
else
if
(
url
.
contains
(
"kanfanews.com"
))
{
title
=
doc
.
select
(
"p#tit"
).
text
();
}
else
if
(
url
.
contains
(
"ifeng.com"
)
&&
result
.
contains
(
"url=http://www.ifeng.com/"
))
{
title
=
"网页已删除"
;
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
))
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
title
=
json
.
getJSONObject
(
"data"
).
getString
(
"title"
);
}
catch
(
Exception
e
)
{
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
}
}
else
if
(
url
.
contains
(
"huanqiu.com"
)
&&
result
.
contains
(
"www.huanqiu.com/404.html"
))
{
title
=
"网页已删除"
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
JSONObject
resultJson
=
JSONObject
.
parseObject
(
result
);
if
(
url
.
contains
(
"/answer/"
))
{
title
=
resultJson
.
getJSONObject
(
"question"
).
getString
(
"title"
);
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
)
||
url
.
contains
(
"/p/"
))
{
title
=
resultJson
.
getString
(
"title"
);
}
}
else
if
(
url
.
contains
(
"360kuai.com"
)
&&
result
.
contains
(
"您访问的文章走失了"
))
{
title
=
String
.
valueOf
(
"404"
);
}
else
if
(
result
.
contains
(
"文章没有找到哦"
)
&&
url
.
contains
(
"yidianzixun.com"
))
{
title
=
"文章未找到"
;
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
))
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
title
=
json
.
getJSONObject
(
"result"
).
getString
(
"title"
);
}
catch
(
Exception
e
)
{
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
}
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"div.adiv > p > span"
).
text
().
replaceAll
(
" "
,
""
);
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"title"
).
text
().
replaceAll
(
" "
,
""
);
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"h1"
).
text
().
replaceAll
(
" "
,
""
);
}
//若title 为拿到 用 此方法 无法获取标题不进行程序迷惑性判断
// if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
// title = "网页已删除";
// }
if
(
Objects
.
nonNull
(
title
)
&&
title
.
length
()
>
1
){
return
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
isDelete
(
title
),
title
);
}
else
{
return
null
;
}
}
catch
(
Exception
e
)
{
return
null
;
}
}
/**
*
* @Description 标题判断
* @param title
* @return
*/
private
boolean
isDelete
(
String
title
)
{
List
<
String
>
eList
=
Arrays
.
asList
(
"系统出错"
,
"该内容已被发布者删除"
,
"网页已删除"
,
"此帐号已自主注销,内容无法查看"
,
"页面提示"
,
"正在维护中"
,
"此文章被第三方评估为不实信息"
,
"财经头条"
,
"知识100题"
,
"502BadGateway"
,
"提示信息"
,
"跳转页"
,
"跳转中..."
,
"此帐号在冻结期,内容无法查看"
,
"东北新闻网"
,
"百度一下,你就知道"
,
"帐号已迁移"
,
"手机百度"
,
"内容被删除"
,
"亚博国际|首页"
,
"中国软件网"
,
"云广网"
,
"新浪首页"
,
"文章暂时找不到了"
,
"-法易网"
,
"【一点资讯】www.yidianzixun.com"
,
"错误页面"
,
"网站暂停通知"
,
"【快资讯】你的专属资讯平台"
,
"百度新闻——全球最大的中文新闻平台"
,
"以上文章由以下机构判定为不实信息"
,
"该公众号已迁移"
,
"财经网-CAIJING.COM.CN"
,
"蚂蚁资讯"
,
"参数错误"
,
"时尚头条_YOKA时尚网"
,
"该文章已经被删除"
,
"网易"
,
"链接已过期"
,
"找不到页面"
,
"今晚网"
,
"该文章已被删除"
,
"该回答已被删除-知乎"
,
"资源不存在"
,
"文章未找到"
,
"UC头条"
);
List
<
String
>
cList
=
Arrays
.
asList
(
"提示信息-"
,
"此内容因违规无法查看"
,
"微信公众号不存在"
,
"此内容被投诉且经审核涉嫌侵权,无法查看"
,
"thepageyourequestedwasnotfound"
,
"未知错误"
,
"Objectmoved"
,
"404"
,
"页面没有找到"
,
"页面未找到"
,
"301MovedPermanently"
,
"加载异常"
,
"此帐号已被屏蔽, 内容无法查看"
,
"链接不存在"
,
"新闻已删除"
);
return
cList
.
stream
().
anyMatch
(
title:
:
contains
)
||
eList
.
stream
().
anyMatch
(
title:
:
equals
);
}
/**
* 处理知乎链接
*
* */
private
static
String
treatZhihuUrl
(
String
url
)
{
if
(
url
.
contains
(
"/answer/"
))
{
url
=
"https://api.zhihu.com/answers/"
+
url
.
replaceAll
(
".*/answer/"
,
""
);
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
))
{
url
=
"https://api.zhihu.com/questions/"
+
url
.
replaceAll
(
".*/question/"
,
""
);
}
else
if
(
url
.
contains
(
"/p/"
))
{
url
=
"https://api.zhihu.com/articles/"
+
url
.
replaceAll
(
".*/p/"
,
""
);
}
return
url
;
}
}
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
/**
*
* @ClassName UrlLiveCrawler
* @Description 判断页面是否存在
* @author byte-zbs
* @Date 2018年8月20日 下午3:34:57
* @version 1.0.0
*/
public
class
UrlLiveCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
throwException
(
true
).
build
();
public
GroupSync
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
GroupSync
counter
=
new
GroupSync
();
start
(
counter
,
callback
,
urls
);
return
counter
;
}
private
void
start
(
GroupSync
counter
,
UrlLiveDataCallback
callback
,
String
...
urls
)
{
if
(
nonNull
(
urls
)
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
counter
.
add
();
ZhiWeiTools
.
sleep
(
100
);
if
(
nonNull
(
url
))
{
try
{
// ZhiWeiTools.sleep(3000);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错:"
,
e
);
}
}
counter
.
done
();
}
}
}
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
// System.out.println(url);
url
=
dealUrl
(
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
ProxyHolder
ph
=
null
;
if
(
url
.
contains
(
"toutiao.com"
)){
// headers.put("referer", url);
// headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_webid=6837283963338622477; tt_webid=6837283963338622477; __tasessionId=dsei2aty41591951911851; tt_scid=M--AJ-FYwZ0qcYTzQCLyMeS5MlykLS6ktMkvqKKJmq-ghRxX4waEBhJ3YbheuNmi2b8a");
// headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
// headers.put("accept-encoding", "gzip, deflate, br");
// headers.put("accept-language", "zh-CN,zh;q=0.9");
// headers.put("cache-control", "no-cache");
// headers.put("sec-fetch-dest", "document");
// headers.put("sec-fetch-mode", "navigate");
// headers.put("sec-fetch-site", "same-origin");
// headers.put("sec-fetch-user", "?1");
// headers.put("upgrade-insecure-requests", "1");
// headers.put("user-agent", "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Mobile Safari/537.36");
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
url
=
treatZhihuUrl
(
url
);
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
}
try
{
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
if
(
Objects
.
nonNull
(
request
))
{
counter
.
add
();
// , new Proxy(Type.HTTP, new InetSocketAddress("119.3.38.9", 31128))
httpBoot
.
asyncCall
(
request
,
ph
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
rs
.
isSuccessful
())
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
}
else
if
(
rs
.
code
()
==
404
){
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
}
else
{
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
}
else
{
logger
.
error
(
"e"
,
ex
);
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
" 数据是否删除 采集出错 {} "
,
e
);
}
finally
{
counter
.
done
();
}
});
return
counter
;
}
}
catch
(
Exception
e2
)
{
logger
.
error
(
"数据出错 {}"
,
e2
);
}
return
counter
;
}
private
void
callBack
(
UrlLiveDataCallback
callback
,
Attribution
attr
,
int
i
,
String
title
)
{
UrlLiveBean
ulb
=
null
;
if
(
i
==
1
)
{
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
true
,
title
);
}
else
{
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
i
,
title
);
}
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
callback
.
onData
(
ulb
,
attr
);
}
}
private
String
dealUrl
(
String
url
)
{
try
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
}
else
{
url
=
url
.
replace
(
"toutiao.com"
,
"www.toutiao.com"
);
}
if
(
url
.
contains
(
"https"
))
{
}
else
{
url
=
url
.
replace
(
"http"
,
"https"
);
}
if
(
url
.
contains
(
"group"
))
{
url
=
"https://www.toutiao.com/a"
+
url
.
split
(
"/"
)[
4
]
+
"/"
;
}
}
else
if
(
url
.
contains
(
"mp.weixin.qq.com"
))
{
if
(
url
.
contains
(
"https"
))
{
}
else
{
url
=
url
.
replace
(
"http"
,
"https"
);
}
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
)
&&
url
.
contains
(
"wm_aid"
))
{
url
=
"http://ff.dayu.com/contents/origin/"
+
url
.
split
(
"wm_aid="
)[
1
].
split
(
"!!wm_id"
)[
0
]+
"?biz_id=1002&_fetch_author=1&_fetch_incrs=1"
;
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
)
&&
url
.
contains
(
"infoid="
))
{
// https://tznew.58.com/view/c/sharingDetailNew?infoid=117073473
url
=
"https://tznew.58.com/tznew/c/info-detail?infoid="
+
url
.
split
(
"infoid="
)[
1
].
split
(
"&"
)[
0
];
}
return
url
;
}
catch
(
Exception
e
)
{
return
url
;
}
}
/**
*
* @Description 判断是否删除
* @param html
* @param attr
* @param callback
*/
private
void
parseHtml
(
String
html
,
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
UrlLiveBean
ulb
=
matchDel
(
html
,
attr
,
attr
.
getAttr
().
toString
());
if
(
Objects
.
nonNull
(
ulb
))
{
callback
.
onData
(
ulb
,
attr
);
}
else
{
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
}
}
/***
* @Title: matchDel
* @author hero
* @Description: 验证链接是否有效
* @param @param page
* @param @return 设定文件
* @return boolean 返回类型
*/
public
UrlLiveBean
matchDel
(
String
result
,
Attribution
attr
,
String
url
){
try
{
Document
doc
=
Jsoup
.
parse
(
result
);
String
title
=
null
;
if
(
url
.
contains
(
"mp.weixin.qq.com"
)
||
url
.
contains
(
"post.mp.qq.com"
)
||
url
.
contains
(
"weixin.sogou.com"
)){
title
=
doc
.
select
(
"h2.rich_media_title"
).
text
().
replaceAll
(
" "
,
""
);
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"p.title"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"h3.msg-title"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"div.global_error_msg.warn"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"p.tips"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"h2"
).
text
();
}
// 获取title
Matcher
ma5
=
Pattern
.
compile
(
"var msg_title = \'(.*)\'"
)
.
matcher
(
result
);
if
(
ma5
.
find
())
{
title
=
ma5
.
group
(
1
).
replaceAll
(
" "
,
" "
).
trim
();
}
}
else
if
(
url
.
contains
(
"kuaibao"
)){
title
=
doc
.
select
(
"p.title"
).
text
().
replaceAll
(
" "
,
""
);
}
else
if
(
url
.
contains
(
"chinadaily.com.cn"
)){
title
=
doc
.
select
(
"p.style1"
).
text
().
replaceAll
(
" "
,
""
);
}
else
if
(
url
.
contains
(
"baidu.com"
)
||
url
.
contains
(
"hao123.com"
))
{
title
=
doc
.
select
(
"p#contaniner"
).
text
();
}
else
if
(
url
.
contains
(
"kanfanews.com"
))
{
title
=
doc
.
select
(
"p#tit"
).
text
();
}
else
if
(
url
.
contains
(
"ifeng.com"
)
&&
result
.
contains
(
"url=http://www.ifeng.com/"
))
{
title
=
"网页已删除"
;
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
))
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
title
=
json
.
getJSONObject
(
"data"
).
getString
(
"title"
);
}
catch
(
Exception
e
)
{
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
}
}
else
if
(
url
.
contains
(
"huanqiu.com"
)
&&
result
.
contains
(
"www.huanqiu.com/404.html"
))
{
title
=
"网页已删除"
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
JSONObject
resultJson
=
JSONObject
.
parseObject
(
result
);
if
(
url
.
contains
(
"/answer/"
))
{
title
=
resultJson
.
getJSONObject
(
"question"
).
getString
(
"title"
);
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
)
||
url
.
contains
(
"/p/"
))
{
title
=
resultJson
.
getString
(
"title"
);
}
}
else
if
(
url
.
contains
(
"360kuai.com"
)
&&
result
.
contains
(
"您访问的文章走失了"
))
{
title
=
String
.
valueOf
(
"404"
);
}
else
if
(
result
.
contains
(
"文章没有找到哦"
)
&&
url
.
contains
(
"yidianzixun.com"
))
{
title
=
"文章未找到"
;
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
))
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
title
=
json
.
getJSONObject
(
"result"
).
getString
(
"title"
);
}
catch
(
Exception
e
)
{
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
}
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"div.adiv > p > span"
).
text
().
replaceAll
(
" "
,
""
);
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"title"
).
text
().
replaceAll
(
" "
,
""
);
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"h1"
).
text
().
replaceAll
(
" "
,
""
);
}
//若title 为拿到 用 此方法 无法获取标题不进行程序迷惑性判断
// if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
// title = "网页已删除";
// }
if
(
Objects
.
nonNull
(
title
)
&&
title
.
length
()
>
1
){
return
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
isDelete
(
title
),
title
);
}
else
{
return
null
;
}
}
catch
(
Exception
e
)
{
return
null
;
}
}
/**
*
* @Description 标题判断
* @param title
* @return
*/
private
boolean
isDelete
(
String
title
)
{
List
<
String
>
eList
=
Arrays
.
asList
(
"系统出错"
,
"该内容已被发布者删除"
,
"网页已删除"
,
"此帐号已自主注销,内容无法查看"
,
"页面提示"
,
"正在维护中"
,
"此文章被第三方评估为不实信息"
,
"财经头条"
,
"知识100题"
,
"502BadGateway"
,
"提示信息"
,
"跳转页"
,
"跳转中..."
,
"此帐号在冻结期,内容无法查看"
,
"东北新闻网"
,
"百度一下,你就知道"
,
"帐号已迁移"
,
"手机百度"
,
"内容被删除"
,
"亚博国际|首页"
,
"中国软件网"
,
"云广网"
,
"新浪首页"
,
"文章暂时找不到了"
,
"-法易网"
,
"【一点资讯】www.yidianzixun.com"
,
"错误页面"
,
"网站暂停通知"
,
"【快资讯】你的专属资讯平台"
,
"百度新闻——全球最大的中文新闻平台"
,
"以上文章由以下机构判定为不实信息"
,
"该公众号已迁移"
,
"财经网-CAIJING.COM.CN"
,
"蚂蚁资讯"
,
"参数错误"
,
"时尚头条_YOKA时尚网"
,
"该文章已经被删除"
,
"网易"
,
"链接已过期"
,
"找不到页面"
,
"今晚网"
,
"该文章已被删除"
,
"该回答已被删除-知乎"
,
"资源不存在"
,
"文章未找到"
,
"UC头条"
);
List
<
String
>
cList
=
Arrays
.
asList
(
"提示信息-"
,
"此内容因违规无法查看"
,
"微信公众号不存在"
,
"此内容被投诉且经审核涉嫌侵权,无法查看"
,
"thepageyourequestedwasnotfound"
,
"未知错误"
,
"Objectmoved"
,
"404"
,
"页面没有找到"
,
"页面未找到"
,
"301MovedPermanently"
,
"加载异常"
,
"此帐号已被屏蔽, 内容无法查看"
,
"链接不存在"
,
"新闻已删除"
);
return
cList
.
stream
().
anyMatch
(
title:
:
contains
)
||
eList
.
stream
().
anyMatch
(
title:
:
equals
);
}
/**
* 处理知乎链接
*
* */
private
static
String
treatZhihuUrl
(
String
url
)
{
if
(
url
.
contains
(
"/answer/"
))
{
url
=
"https://api.zhihu.com/answers/"
+
url
.
replaceAll
(
".*/answer/"
,
""
);
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
))
{
url
=
"https://api.zhihu.com/questions/"
+
url
.
replaceAll
(
".*/question/"
,
""
);
}
else
if
(
url
.
contains
(
"/p/"
))
{
url
=
"https://api.zhihu.com/articles/"
+
url
.
replaceAll
(
".*/p/"
,
""
);
}
return
url
;
}
}
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
9fcfba2d
...
...
@@ -325,7 +325,6 @@ public class MatchSource {
source
=
source
.
replaceAll
(
".*来源:|)"
,
""
);
}
}
if
(
Objects
.
nonNull
(
source
)
&&
source
.
length
()
!=
0
)
{
return
source
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment