Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
9fcfba2d
Commit
9fcfba2d
authored
Aug 13, 2020
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
各个采集验证添加休眠,避免数据过多导致程序阻塞
parent
aa059934
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
594 additions
and
588 deletions
+594
-588
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+121
-119
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+2
-0
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+130
-128
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+341
-340
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+0
-1
No files found.
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
View file @
9fcfba2d
package
com
.
zhiwei
.
source_forward
.
crawler
;
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.Objects
;
import
java.util.Objects
;
import
org.apache.logging.log4j.LogManager
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
okhttp3.Request
;
import
okhttp3.Request
;
public
class
ContentCrawler
{
public
class
ContentCrawler
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
ContentCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
Logger
logger
=
LogManager
.
getLogger
(
ContentCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
*
/**
* @Description 链接传入 并 返回采集完信号
*
* @param callback
* @Description 链接传入 并 返回采集完信号
* @param urls
* @param callback
* @return
* @param urls
* @throws Exception
* @return
*/
* @throws Exception
public
GroupSync
submitTask
(
ContentDataCallback
callback
,
*/
String
...
urls
)
{
public
GroupSync
submitTask
(
ContentDataCallback
callback
,
GroupSync
counter
=
new
GroupSync
();
String
...
urls
)
{
start
(
counter
,
callback
,
urls
);
GroupSync
counter
=
new
GroupSync
();
return
counter
;
start
(
counter
,
callback
,
urls
);
}
return
counter
;
}
/**
*
/**
* @Description 提交链接
*
* @param counter
* @Description 提交链接
* @param callback
* @param counter
* @param urls
* @param callback
*/
* @param urls
private
void
start
(
GroupSync
counter
,
*/
ContentDataCallback
callback
,
String
...
urls
)
{
private
void
start
(
GroupSync
counter
,
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
ContentDataCallback
callback
,
String
...
urls
)
{
for
(
String
url
:
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
if
(
url
!=
null
)
{
for
(
String
url
:
urls
)
{
try
{
ZhiWeiTools
.
sleep
(
100
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
if
(
url
!=
null
)
{
}
catch
(
Exception
e
)
{
try
{
logger
.
error
(
"搜索创建出错"
,
e
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"搜索创建出错"
,
e
);
}
}
}
}
}
}
}
/**
}
*
* @Description 链接获取文章信息
/**
* @param counter
*
* @param url
* @Description 链接获取文章信息
* @param attr
* @param counter
* @param callback
* @param url
* @return
* @param attr
*/
* @param callback
private
GroupSync
search
(
GroupSync
counter
,
* @return
String
url
,
Attribution
attr
,
ContentDataCallback
callback
)
{
*/
logger
.
info
(
"当前处理 URL: {}"
,
url
);
private
GroupSync
search
(
GroupSync
counter
,
Request
request
=
RequestUtils
.
wrapGet
(
url
);
String
url
,
Attribution
attr
,
ContentDataCallback
callback
)
{
counter
.
add
();
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
counter
.
add
();
try
{
if
(
Objects
.
isNull
(
ex
))
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
try
{
}
else
{
if
(
Objects
.
isNull
(
ex
))
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
}
}
else
{
}
catch
(
Exception
e
)
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
logger
.
info
(
"搜索结果访问失败: {}"
,
ex
);
}
}
finally
{
}
catch
(
Exception
e
)
{
counter
.
done
();
logger
.
info
(
"搜索结果访问失败: {}"
,
ex
);
}
}
finally
{
counter
.
done
();
});
}
return
counter
;
});
}
return
counter
;
/**
}
*
*
/**
* @Description 获取正文解析
*
* @param response
*
* @param attr
* @Description 获取正文解析
* @param callback
* @param response
*/
* @param attr
private
void
parseHtml
(
String
result
,
Attribution
attr
,
* @param callback
ContentDataCallback
callback
)
{
*/
try
{
private
void
parseHtml
(
String
result
,
Attribution
attr
,
String
content
=
MatchContent
.
matchContent
(
attr
.
get
().
toString
(),
ContentDataCallback
callback
)
{
result
);
try
{
ContentBean
cb
=
new
ContentBean
(
attr
.
get
().
toString
(),
content
);
String
content
=
MatchContent
.
matchContent
(
attr
.
get
().
toString
(),
if
(
callback
==
null
)
{
result
);
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
ContentBean
cb
=
new
ContentBean
(
attr
.
get
().
toString
(),
content
);
}
else
{
if
(
callback
==
null
)
{
callback
.
onData
(
cb
,
attr
);
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
}
else
{
}
catch
(
Exception
e
)
{
callback
.
onData
(
cb
,
attr
);
logger
.
error
(
"网页链接失效"
,
e
);
}
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"网页链接失效"
,
e
);
}
}
}
}
}
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
9fcfba2d
...
@@ -5,6 +5,7 @@ import java.util.List;
...
@@ -5,6 +5,7 @@ import java.util.List;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.Objects
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
...
@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler {
...
@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler {
private
void
start
(
GroupSync
counter
,
MediaSelfSourceDataCallBack
callback
,
String
...
urls
)
{
private
void
start
(
GroupSync
counter
,
MediaSelfSourceDataCallBack
callback
,
String
...
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
for
(
String
url
:
urls
)
{
ZhiWeiTools
.
sleep
(
100
);
counter
.
add
();
counter
.
add
();
if
(
url
!=
null
)
{
if
(
url
!=
null
)
{
try
{
try
{
...
...
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
View file @
9fcfba2d
package
com
.
zhiwei
.
source_forward
.
crawler
;
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.Objects
;
import
org.apache.logging.log4j.LogManager
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.LogManager
;
import
org.jsoup.Jsoup
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Node
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
okhttp3.Request
;
import
okhttp3.Request
;
public
class
SourceForwardCrawler
{
public
class
SourceForwardCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
SourceForwardCrawler
.
class
);
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
SourceForwardCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
List
<
String
>
sourceList
=
SourceData
.
getSourceList
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
List
<
String
>
sourceList
=
SourceData
.
getSourceList
();
public
GroupSync
submitTask
(
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
try
{
public
GroupSync
submitTask
(
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
GroupSync
counter
=
new
GroupSync
();
try
{
start
(
counter
,
callback
,
urls
);
GroupSync
counter
=
new
GroupSync
();
return
counter
;
start
(
counter
,
callback
,
urls
);
}
catch
(
Exception
e
)
{
return
counter
;
logger
.
error
(
" exception "
,
e
);
}
catch
(
Exception
e
)
{
return
null
;
logger
.
error
(
" exception "
,
e
);
}
return
null
;
}
}
}
private
void
start
(
GroupSync
counter
,
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
private
void
start
(
GroupSync
counter
,
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
for
(
String
url
:
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
counter
.
add
();
for
(
String
url
:
urls
)
{
if
(
url
!=
null
)
{
counter
.
add
();
try
{
ZhiWeiTools
.
sleep
(
100
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
if
(
url
!=
null
)
{
}
catch
(
Exception
e
)
{
try
{
logger
.
error
(
"搜索创建出错"
,
e
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"搜索创建出错"
,
e
);
counter
.
done
();
}
}
}
}
counter
.
done
();
}
}
}
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
SourceForwardDataCallBack
callback
)
{
}
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
SourceForwardDataCallBack
callback
)
{
// Map<String,String> headers = HeaderTool.getCommonHead();
logger
.
info
(
"当前处理 URL: {}"
,
url
);
if
(
url
.
contains
(
"www.toutiao.com"
)){
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
headers
.
put
(
"referer"
,
url
);
// Map<String,String> headers = HeaderTool.getCommonHead();
}
if
(
url
.
contains
(
"www.toutiao.com"
)){
if
(
url
.
contains
(
"china.prcfe.com"
))
{
headers
.
put
(
"referer"
,
url
);
url
=
"http://china.prcfe.com/e/extend/ShowSource/?id="
+
url
.
split
(
"/"
)[
url
.
split
(
"/"
).
length
-
1
].
split
(
"\\."
)[
0
];
}
}
if
(
url
.
contains
(
"china.prcfe.com"
))
{
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
url
=
"http://china.prcfe.com/e/extend/ShowSource/?id="
+
url
.
split
(
"/"
)[
url
.
split
(
"/"
).
length
-
1
].
split
(
"\\."
)[
0
];
counter
.
add
();
}
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
try
{
counter
.
add
();
if
(
Objects
.
isNull
(
ex
))
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
try
{
}
else
{
if
(
Objects
.
isNull
(
ex
))
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
}
}
else
{
}
catch
(
Exception
e1
)
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
logger
.
error
(
"解析出错"
,
e1
);
}
}
finally
{
}
catch
(
Exception
e1
)
{
counter
.
done
();
logger
.
error
(
"解析出错"
,
e1
);
}
}
finally
{
});
counter
.
done
();
return
counter
;
}
}
});
return
counter
;
private
void
parseHtml
(
String
body
,
Attribution
attr
,
}
SourceForwardDataCallBack
callback
)
{
String
source
=
null
;
private
void
parseHtml
(
String
body
,
Attribution
attr
,
String
channel
=
"新闻"
;
SourceForwardDataCallBack
callback
)
{
String
isforward
=
"未知"
;
String
source
=
null
;
try
{
String
channel
=
"新闻"
;
Document
document
=
Jsoup
.
parse
(
body
);
String
isforward
=
"未知"
;
if
(
attr
.
get
().
toString
().
contains
(
"mp.weixin.qq.com"
)){
try
{
isforward
=
document
.
select
(
"div#meta_content"
).
select
(
"span#copyright_logo"
).
text
();
Document
document
=
Jsoup
.
parse
(
body
);
if
(
isforward
.
contains
(
"原创"
)){
if
(
attr
.
get
().
toString
().
contains
(
"mp.weixin.qq.com"
)){
isforward
=
"原创"
;
isforward
=
document
.
select
(
"div#meta_content"
).
select
(
"span#copyright_logo"
).
text
();
}
else
{
if
(
isforward
.
contains
(
"原创"
)){
isforward
=
"未知"
;
isforward
=
"原创"
;
}
}
else
{
}
else
if
(
attr
.
get
().
toString
().
contains
(
"www.toutiao.com"
)){
isforward
=
"未知"
;
if
(
body
.
contains
(
"isOriginal"
)
&&
body
.
contains
(
"isOriginal: true"
)){
}
isforward
=
"原创"
;
}
else
if
(
attr
.
get
().
toString
().
contains
(
"www.toutiao.com"
)){
}
if
(
body
.
contains
(
"isOriginal"
)
&&
body
.
contains
(
"isOriginal: true"
)){
}
else
{
isforward
=
"原创"
;
channel
=
MatchChannel
.
verifyChannel
(
attr
.
get
().
toString
());
}
if
(
channel
==
null
){
}
else
{
List
<
Node
>
nodeList
=
document
.
head
().
childNodes
();
channel
=
MatchChannel
.
verifyChannel
(
attr
.
get
().
toString
());
channel
=
MatchChannel
.
matchChannel
(
nodeList
);
if
(
channel
==
null
){
}
List
<
Node
>
nodeList
=
document
.
head
().
childNodes
();
source
=
MatchSource
.
matchSource
(
attr
.
get
().
toString
(),
document
.
toString
(),
sourceList
);
channel
=
MatchChannel
.
matchChannel
(
nodeList
);
}
}
}
catch
(
Exception
e
)
{
source
=
MatchSource
.
matchSource
(
attr
.
get
().
toString
(),
document
.
toString
(),
sourceList
);
source
=
null
;
}
channel
=
"新闻"
;
}
catch
(
Exception
e
)
{
}
source
=
null
;
logger
.
info
(
attr
.
get
().
toString
()+
"======="
+
channel
+
"================="
+
source
);
channel
=
"新闻"
;
SourceForwardBean
sfb
=
new
SourceForwardBean
(
attr
.
get
().
toString
(),
channel
,
source
,
isforward
);
}
if
(
callback
==
null
)
{
logger
.
info
(
attr
.
get
().
toString
()+
"======="
+
channel
+
"================="
+
source
);
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
SourceForwardBean
sfb
=
new
SourceForwardBean
(
attr
.
get
().
toString
(),
channel
,
source
,
isforward
);
}
else
{
if
(
callback
==
null
)
{
callback
.
onData
(
sfb
,
attr
);
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
}
else
{
}
callback
.
onData
(
sfb
,
attr
);
}
}
}
}
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
9fcfba2d
package
com
.
zhiwei
.
source_forward
.
crawler
;
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.util.Arrays
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.Objects
;
import
java.util.regex.Matcher
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
java.util.regex.Pattern
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
import
okhttp3.Request
;
/**
/**
*
*
* @ClassName UrlLiveCrawler
* @ClassName UrlLiveCrawler
* @Description 判断页面是否存在
* @Description 判断页面是否存在
* @author byte-zbs
* @author byte-zbs
* @Date 2018年8月20日 下午3:34:57
* @Date 2018年8月20日 下午3:34:57
* @version 1.0.0
* @version 1.0.0
*/
*/
public
class
UrlLiveCrawler
{
public
class
UrlLiveCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
throwException
(
true
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
throwException
(
true
).
build
();
public
GroupSync
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
public
GroupSync
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
GroupSync
counter
=
new
GroupSync
();
GroupSync
counter
=
new
GroupSync
();
start
(
counter
,
callback
,
urls
);
start
(
counter
,
callback
,
urls
);
return
counter
;
return
counter
;
}
}
private
void
start
(
GroupSync
counter
,
UrlLiveDataCallback
callback
,
String
...
urls
)
{
private
void
start
(
GroupSync
counter
,
UrlLiveDataCallback
callback
,
String
...
urls
)
{
if
(
nonNull
(
urls
)
&&
urls
.
length
>
0
)
{
if
(
nonNull
(
urls
)
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
for
(
String
url
:
urls
)
{
counter
.
add
();
counter
.
add
();
if
(
nonNull
(
url
))
{
ZhiWeiTools
.
sleep
(
100
);
try
{
if
(
nonNull
(
url
))
{
// ZhiWeiTools.sleep(3000);
try
{
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
// ZhiWeiTools.sleep(3000);
}
catch
(
Exception
e
)
{
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
logger
.
error
(
"搜索创建出错:"
,
e
);
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"搜索创建出错:"
,
e
);
}
}
counter
.
done
();
}
}
counter
.
done
();
}
}
}
}
}
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
// System.out.println(url);
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
url
=
dealUrl
(
url
);
// System.out.println(url);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
url
=
dealUrl
(
url
);
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
logger
.
info
(
"当前处理 URL: {}"
,
url
);
ProxyHolder
ph
=
null
;
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
if
(
url
.
contains
(
"toutiao.com"
)){
ProxyHolder
ph
=
null
;
// headers.put("referer", url);
if
(
url
.
contains
(
"toutiao.com"
)){
// headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_webid=6837283963338622477; tt_webid=6837283963338622477; __tasessionId=dsei2aty41591951911851; tt_scid=M--AJ-FYwZ0qcYTzQCLyMeS5MlykLS6ktMkvqKKJmq-ghRxX4waEBhJ3YbheuNmi2b8a");
// headers.put("referer", url);
// headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
// headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_webid=6837283963338622477; tt_webid=6837283963338622477; __tasessionId=dsei2aty41591951911851; tt_scid=M--AJ-FYwZ0qcYTzQCLyMeS5MlykLS6ktMkvqKKJmq-ghRxX4waEBhJ3YbheuNmi2b8a");
// headers.put("accept-encoding", "gzip, deflate, br");
// headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
// headers.put("accept-language", "zh-CN,zh;q=0.9");
// headers.put("accept-encoding", "gzip, deflate, br");
// headers.put("cache-control", "no-cache");
// headers.put("accept-language", "zh-CN,zh;q=0.9");
// headers.put("sec-fetch-dest", "document");
// headers.put("cache-control", "no-cache");
// headers.put("sec-fetch-mode", "navigate");
// headers.put("sec-fetch-dest", "document");
// headers.put("sec-fetch-site", "same-origin");
// headers.put("sec-fetch-mode", "navigate");
// headers.put("sec-fetch-user", "?1");
// headers.put("sec-fetch-site", "same-origin");
// headers.put("upgrade-insecure-requests", "1");
// headers.put("sec-fetch-user", "?1");
// headers.put("user-agent", "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Mobile Safari/537.36");
// headers.put("upgrade-insecure-requests", "1");
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
// headers.put("user-agent", "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Mobile Safari/537.36");
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
url
=
treatZhihuUrl
(
url
);
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
url
=
treatZhihuUrl
(
url
);
}
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
try
{
}
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
try
{
if
(
Objects
.
nonNull
(
request
))
{
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
counter
.
add
();
if
(
Objects
.
nonNull
(
request
))
{
// , new Proxy(Type.HTTP, new InetSocketAddress("119.3.38.9", 31128))
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ph
).
whenComplete
((
rs
,
ex
)
->
{
// , new Proxy(Type.HTTP, new InetSocketAddress("119.3.38.9", 31128))
try
{
httpBoot
.
asyncCall
(
request
,
ph
).
whenComplete
((
rs
,
ex
)
->
{
if
(
Objects
.
isNull
(
ex
))
{
try
{
if
(
rs
.
isSuccessful
())
{
if
(
Objects
.
isNull
(
ex
))
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
if
(
rs
.
isSuccessful
())
{
}
else
if
(
rs
.
code
()
==
404
){
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
}
else
if
(
rs
.
code
()
==
404
){
}
else
{
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
else
{
}
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
else
{
}
logger
.
error
(
"e"
,
ex
);
}
else
{
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
logger
.
error
(
"e"
,
ex
);
}
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
catch
(
Exception
e
)
{
}
logger
.
error
(
" 数据是否删除 采集出错 {} "
,
e
);
}
catch
(
Exception
e
)
{
}
finally
{
logger
.
error
(
" 数据是否删除 采集出错 {} "
,
e
);
counter
.
done
();
}
finally
{
}
counter
.
done
();
});
}
return
counter
;
});
}
return
counter
;
}
catch
(
Exception
e2
)
{
}
logger
.
error
(
"数据出错 {}"
,
e2
);
}
catch
(
Exception
e2
)
{
}
logger
.
error
(
"数据出错 {}"
,
e2
);
return
counter
;
}
}
return
counter
;
}
private
void
callBack
(
UrlLiveDataCallback
callback
,
Attribution
attr
,
int
i
,
String
title
)
{
UrlLiveBean
ulb
=
null
;
private
void
callBack
(
UrlLiveDataCallback
callback
,
Attribution
attr
,
int
i
,
String
title
)
{
if
(
i
==
1
)
{
UrlLiveBean
ulb
=
null
;
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
true
,
title
);
if
(
i
==
1
)
{
}
else
{
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
true
,
title
);
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
i
,
title
);
}
else
{
}
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
i
,
title
);
if
(
callback
==
null
)
{
}
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
if
(
callback
==
null
)
{
}
else
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
callback
.
onData
(
ulb
,
attr
);
}
else
{
}
callback
.
onData
(
ulb
,
attr
);
}
}
}
private
String
dealUrl
(
String
url
)
{
try
{
private
String
dealUrl
(
String
url
)
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
try
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
}
else
{
url
=
url
.
replace
(
"toutiao.com"
,
"www.toutiao.com"
);
}
else
{
}
url
=
url
.
replace
(
"toutiao.com"
,
"www.toutiao.com"
);
if
(
url
.
contains
(
"https"
))
{
}
if
(
url
.
contains
(
"https"
))
{
}
else
{
url
=
url
.
replace
(
"http"
,
"https"
);
}
else
{
}
url
=
url
.
replace
(
"http"
,
"https"
);
if
(
url
.
contains
(
"group"
))
{
}
url
=
"https://www.toutiao.com/a"
+
url
.
split
(
"/"
)[
4
]
+
"/"
;
if
(
url
.
contains
(
"group"
))
{
}
url
=
"https://www.toutiao.com/a"
+
url
.
split
(
"/"
)[
4
]
+
"/"
;
}
else
if
(
url
.
contains
(
"mp.weixin.qq.com"
))
{
}
if
(
url
.
contains
(
"https"
))
{
}
else
if
(
url
.
contains
(
"mp.weixin.qq.com"
))
{
if
(
url
.
contains
(
"https"
))
{
}
else
{
url
=
url
.
replace
(
"http"
,
"https"
);
}
else
{
}
url
=
url
.
replace
(
"http"
,
"https"
);
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
)
&&
url
.
contains
(
"wm_aid"
))
{
}
url
=
"http://ff.dayu.com/contents/origin/"
+
url
.
split
(
"wm_aid="
)[
1
].
split
(
"!!wm_id"
)[
0
]+
"?biz_id=1002&_fetch_author=1&_fetch_incrs=1"
;
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
)
&&
url
.
contains
(
"wm_aid"
))
{
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
)
&&
url
.
contains
(
"infoid="
))
{
url
=
"http://ff.dayu.com/contents/origin/"
+
url
.
split
(
"wm_aid="
)[
1
].
split
(
"!!wm_id"
)[
0
]+
"?biz_id=1002&_fetch_author=1&_fetch_incrs=1"
;
// https://tznew.58.com/view/c/sharingDetailNew?infoid=117073473
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
)
&&
url
.
contains
(
"infoid="
))
{
url
=
"https://tznew.58.com/tznew/c/info-detail?infoid="
+
url
.
split
(
"infoid="
)[
1
].
split
(
"&"
)[
0
];
// https://tznew.58.com/view/c/sharingDetailNew?infoid=117073473
}
url
=
"https://tznew.58.com/tznew/c/info-detail?infoid="
+
url
.
split
(
"infoid="
)[
1
].
split
(
"&"
)[
0
];
return
url
;
}
}
catch
(
Exception
e
)
{
return
url
;
return
url
;
}
catch
(
Exception
e
)
{
}
return
url
;
}
}
}
/**
*
/**
* @Description 判断是否删除
*
* @param html
* @Description 判断是否删除
* @param attr
* @param html
* @param callback
* @param attr
*/
* @param callback
private
void
parseHtml
(
String
html
,
Attribution
attr
,
*/
UrlLiveDataCallback
callback
)
{
private
void
parseHtml
(
String
html
,
Attribution
attr
,
if
(
callback
==
null
)
{
UrlLiveDataCallback
callback
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
if
(
callback
==
null
)
{
}
else
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
UrlLiveBean
ulb
=
matchDel
(
html
,
attr
,
attr
.
getAttr
().
toString
());
}
else
{
if
(
Objects
.
nonNull
(
ulb
))
{
UrlLiveBean
ulb
=
matchDel
(
html
,
attr
,
attr
.
getAttr
().
toString
());
callback
.
onData
(
ulb
,
attr
);
if
(
Objects
.
nonNull
(
ulb
))
{
}
else
{
callback
.
onData
(
ulb
,
attr
);
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
else
{
}
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
}
}
}
}
/***
* @Title: matchDel
/***
* @author hero
* @Title: matchDel
* @Description: 验证链接是否有效
* @author hero
* @param @param page
* @Description: 验证链接是否有效
* @param @return 设定文件
* @param @param page
* @return boolean 返回类型
* @param @return 设定文件
*/
* @return boolean 返回类型
public
UrlLiveBean
matchDel
(
String
result
,
Attribution
attr
,
String
url
){
*/
try
{
public
UrlLiveBean
matchDel
(
String
result
,
Attribution
attr
,
String
url
){
Document
doc
=
Jsoup
.
parse
(
result
);
try
{
String
title
=
null
;
Document
doc
=
Jsoup
.
parse
(
result
);
if
(
url
.
contains
(
"mp.weixin.qq.com"
)
||
url
.
contains
(
"post.mp.qq.com"
)
||
url
.
contains
(
"weixin.sogou.com"
)){
String
title
=
null
;
title
=
doc
.
select
(
"h2.rich_media_title"
).
text
().
replaceAll
(
" "
,
""
);
if
(
url
.
contains
(
"mp.weixin.qq.com"
)
||
url
.
contains
(
"post.mp.qq.com"
)
||
url
.
contains
(
"weixin.sogou.com"
)){
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"h2.rich_media_title"
).
text
().
replaceAll
(
" "
,
""
);
title
=
doc
.
select
(
"p.title"
).
text
();
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
}
title
=
doc
.
select
(
"p.title"
).
text
();
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
}
title
=
doc
.
select
(
"h3.msg-title"
).
text
();
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
}
title
=
doc
.
select
(
"h3.msg-title"
).
text
();
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
}
title
=
doc
.
select
(
"div.global_error_msg.warn"
).
text
();
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
}
title
=
doc
.
select
(
"div.global_error_msg.warn"
).
text
();
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
}
title
=
doc
.
select
(
"p.tips"
).
text
();
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
}
title
=
doc
.
select
(
"p.tips"
).
text
();
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
}
title
=
doc
.
select
(
"h2"
).
text
();
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
}
title
=
doc
.
select
(
"h2"
).
text
();
// 获取title
}
Matcher
ma5
=
Pattern
.
compile
(
"var msg_title = \'(.*)\'"
)
// 获取title
.
matcher
(
result
);
Matcher
ma5
=
Pattern
.
compile
(
"var msg_title = \'(.*)\'"
)
if
(
ma5
.
find
())
{
.
matcher
(
result
);
title
=
ma5
.
group
(
1
).
replaceAll
(
" "
,
" "
).
trim
();
if
(
ma5
.
find
())
{
}
title
=
ma5
.
group
(
1
).
replaceAll
(
" "
,
" "
).
trim
();
}
else
if
(
url
.
contains
(
"kuaibao"
)){
}
title
=
doc
.
select
(
"p.title"
).
text
().
replaceAll
(
" "
,
""
);
}
else
if
(
url
.
contains
(
"kuaibao"
)){
}
else
if
(
url
.
contains
(
"chinadaily.com.cn"
)){
title
=
doc
.
select
(
"p.title"
).
text
().
replaceAll
(
" "
,
""
);
title
=
doc
.
select
(
"p.style1"
).
text
().
replaceAll
(
" "
,
""
);
}
else
if
(
url
.
contains
(
"chinadaily.com.cn"
)){
}
else
if
(
url
.
contains
(
"baidu.com"
)
||
url
.
contains
(
"hao123.com"
))
{
title
=
doc
.
select
(
"p.style1"
).
text
().
replaceAll
(
" "
,
""
);
title
=
doc
.
select
(
"p#contaniner"
).
text
();
}
else
if
(
url
.
contains
(
"baidu.com"
)
||
url
.
contains
(
"hao123.com"
))
{
}
else
if
(
url
.
contains
(
"kanfanews.com"
))
{
title
=
doc
.
select
(
"p#contaniner"
).
text
();
title
=
doc
.
select
(
"p#tit"
).
text
();
}
else
if
(
url
.
contains
(
"kanfanews.com"
))
{
}
else
if
(
url
.
contains
(
"ifeng.com"
)
&&
result
.
contains
(
"url=http://www.ifeng.com/"
))
{
title
=
doc
.
select
(
"p#tit"
).
text
();
title
=
"网页已删除"
;
}
else
if
(
url
.
contains
(
"ifeng.com"
)
&&
result
.
contains
(
"url=http://www.ifeng.com/"
))
{
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
))
{
title
=
"网页已删除"
;
try
{
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
))
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
try
{
title
=
json
.
getJSONObject
(
"data"
).
getString
(
"title"
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
}
catch
(
Exception
e
)
{
title
=
json
.
getJSONObject
(
"data"
).
getString
(
"title"
);
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
}
catch
(
Exception
e
)
{
}
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
}
else
if
(
url
.
contains
(
"huanqiu.com"
)
&&
result
.
contains
(
"www.huanqiu.com/404.html"
))
{
}
title
=
"网页已删除"
;
}
else
if
(
url
.
contains
(
"huanqiu.com"
)
&&
result
.
contains
(
"www.huanqiu.com/404.html"
))
{
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
title
=
"网页已删除"
;
JSONObject
resultJson
=
JSONObject
.
parseObject
(
result
);
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
if
(
url
.
contains
(
"/answer/"
))
{
JSONObject
resultJson
=
JSONObject
.
parseObject
(
result
);
title
=
resultJson
.
getJSONObject
(
"question"
).
getString
(
"title"
);
if
(
url
.
contains
(
"/answer/"
))
{
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
)
||
url
.
contains
(
"/p/"
))
{
title
=
resultJson
.
getJSONObject
(
"question"
).
getString
(
"title"
);
title
=
resultJson
.
getString
(
"title"
);
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
)
||
url
.
contains
(
"/p/"
))
{
}
title
=
resultJson
.
getString
(
"title"
);
}
else
if
(
url
.
contains
(
"360kuai.com"
)
&&
result
.
contains
(
"您访问的文章走失了"
))
{
}
title
=
String
.
valueOf
(
"404"
);
}
else
if
(
url
.
contains
(
"360kuai.com"
)
&&
result
.
contains
(
"您访问的文章走失了"
))
{
}
else
if
(
result
.
contains
(
"文章没有找到哦"
)
&&
url
.
contains
(
"yidianzixun.com"
))
{
title
=
String
.
valueOf
(
"404"
);
title
=
"文章未找到"
;
}
else
if
(
result
.
contains
(
"文章没有找到哦"
)
&&
url
.
contains
(
"yidianzixun.com"
))
{
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
))
{
title
=
"文章未找到"
;
try
{
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
))
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
try
{
title
=
json
.
getJSONObject
(
"result"
).
getString
(
"title"
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
}
catch
(
Exception
e
)
{
title
=
json
.
getJSONObject
(
"result"
).
getString
(
"title"
);
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
}
catch
(
Exception
e
)
{
}
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
}
}
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
//若title 为拿到 用 此方法
title
=
doc
.
select
(
"div.adiv > p > span"
).
text
().
replaceAll
(
" "
,
""
);
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
}
title
=
doc
.
select
(
"div.adiv > p > span"
).
text
().
replaceAll
(
" "
,
""
);
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
//若title 为拿到 用 此方法
title
=
doc
.
select
(
"title"
).
text
().
replaceAll
(
" "
,
""
);
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
}
title
=
doc
.
select
(
"title"
).
text
().
replaceAll
(
" "
,
""
);
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
//若title 为拿到 用 此方法
title
=
doc
.
select
(
"h1"
).
text
().
replaceAll
(
" "
,
""
);
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
}
title
=
doc
.
select
(
"h1"
).
text
().
replaceAll
(
" "
,
""
);
}
//若title 为拿到 用 此方法 无法获取标题不进行程序迷惑性判断
// if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
//若title 为拿到 用 此方法 无法获取标题不进行程序迷惑性判断
// title = "网页已删除";
// if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
// }
// title = "网页已删除";
// }
if
(
Objects
.
nonNull
(
title
)
&&
title
.
length
()
>
1
){
return
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
isDelete
(
title
),
title
);
if
(
Objects
.
nonNull
(
title
)
&&
title
.
length
()
>
1
){
}
else
{
return
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
isDelete
(
title
),
title
);
return
null
;
}
else
{
}
return
null
;
}
catch
(
Exception
e
)
{
}
return
null
;
}
catch
(
Exception
e
)
{
}
return
null
;
}
}
}
/**
*
/**
* @Description 标题判断
*
* @param title
* @Description 标题判断
* @return
* @param title
*/
* @return
private
boolean
isDelete
(
String
title
)
{
*/
List
<
String
>
eList
=
Arrays
.
asList
(
"系统出错"
,
"该内容已被发布者删除"
,
"网页已删除"
private
boolean
isDelete
(
String
title
)
{
,
"此帐号已自主注销,内容无法查看"
,
"页面提示"
,
"正在维护中"
List
<
String
>
eList
=
Arrays
.
asList
(
"系统出错"
,
"该内容已被发布者删除"
,
"网页已删除"
,
"此文章被第三方评估为不实信息"
,
"财经头条"
,
"知识100题"
,
"502BadGateway"
,
"此帐号已自主注销,内容无法查看"
,
"页面提示"
,
"正在维护中"
,
"提示信息"
,
"跳转页"
,
"跳转中..."
,
"此帐号在冻结期,内容无法查看"
,
"东北新闻网"
,
"此文章被第三方评估为不实信息"
,
"财经头条"
,
"知识100题"
,
"502BadGateway"
,
"百度一下,你就知道"
,
"帐号已迁移"
,
"手机百度"
,
"内容被删除"
,
"亚博国际|首页"
,
"提示信息"
,
"跳转页"
,
"跳转中..."
,
"此帐号在冻结期,内容无法查看"
,
"东北新闻网"
,
"中国软件网"
,
"云广网"
,
"新浪首页"
,
"文章暂时找不到了"
,
"-法易网"
,
"百度一下,你就知道"
,
"帐号已迁移"
,
"手机百度"
,
"内容被删除"
,
"亚博国际|首页"
,
"【一点资讯】www.yidianzixun.com"
,
"错误页面"
,
"网站暂停通知"
,
"【快资讯】你的专属资讯平台"
,
"中国软件网"
,
"云广网"
,
"新浪首页"
,
"文章暂时找不到了"
,
"-法易网"
,
"百度新闻——全球最大的中文新闻平台"
,
"以上文章由以下机构判定为不实信息"
,
"该公众号已迁移"
,
"【一点资讯】www.yidianzixun.com"
,
"错误页面"
,
"网站暂停通知"
,
"【快资讯】你的专属资讯平台"
,
"财经网-CAIJING.COM.CN"
,
"蚂蚁资讯"
,
"参数错误"
,
"时尚头条_YOKA时尚网"
,
"该文章已经被删除"
,
"百度新闻——全球最大的中文新闻平台"
,
"以上文章由以下机构判定为不实信息"
,
"该公众号已迁移"
,
"网易"
,
"链接已过期"
,
"找不到页面"
,
"今晚网"
,
"该文章已被删除"
,
"该回答已被删除-知乎"
,
"资源不存在"
,
"文章未找到"
,
"财经网-CAIJING.COM.CN"
,
"蚂蚁资讯"
,
"参数错误"
,
"时尚头条_YOKA时尚网"
,
"该文章已经被删除"
,
"UC头条"
);
,
"网易"
,
"链接已过期"
,
"找不到页面"
,
"今晚网"
,
"该文章已被删除"
,
"该回答已被删除-知乎"
,
"资源不存在"
,
"文章未找到"
,
"UC头条"
);
List
<
String
>
cList
=
Arrays
.
asList
(
"提示信息-"
,
"此内容因违规无法查看"
,
"微信公众号不存在"
,
"此内容被投诉且经审核涉嫌侵权,无法查看"
,
"thepageyourequestedwasnotfound"
,
"未知错误"
List
<
String
>
cList
=
Arrays
.
asList
(
"提示信息-"
,
"此内容因违规无法查看"
,
"微信公众号不存在"
,
"Objectmoved"
,
"404"
,
"页面没有找到"
,
"页面未找到"
,
"301MovedPermanently"
,
"加载异常"
,
,
"此内容被投诉且经审核涉嫌侵权,无法查看"
,
"thepageyourequestedwasnotfound"
,
"未知错误"
"此帐号已被屏蔽, 内容无法查看"
,
"链接不存在"
,
"新闻已删除"
);
,
"Objectmoved"
,
"404"
,
"页面没有找到"
,
"页面未找到"
,
"301MovedPermanently"
,
"加载异常"
,
"此帐号已被屏蔽, 内容无法查看"
,
"链接不存在"
,
"新闻已删除"
);
return
cList
.
stream
().
anyMatch
(
title:
:
contains
)
||
eList
.
stream
().
anyMatch
(
title:
:
equals
);
}
return
cList
.
stream
().
anyMatch
(
title:
:
contains
)
||
eList
.
stream
().
anyMatch
(
title:
:
equals
);
}
/**
* 处理知乎链接
/**
*
* 处理知乎链接
* */
*
private
static
String
treatZhihuUrl
(
String
url
)
{
* */
if
(
url
.
contains
(
"/answer/"
))
{
private
static
String
treatZhihuUrl
(
String
url
)
{
url
=
"https://api.zhihu.com/answers/"
+
url
.
replaceAll
(
".*/answer/"
,
""
);
if
(
url
.
contains
(
"/answer/"
))
{
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
))
{
url
=
"https://api.zhihu.com/answers/"
+
url
.
replaceAll
(
".*/answer/"
,
""
);
url
=
"https://api.zhihu.com/questions/"
+
url
.
replaceAll
(
".*/question/"
,
""
);
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
))
{
}
else
if
(
url
.
contains
(
"/p/"
))
{
url
=
"https://api.zhihu.com/questions/"
+
url
.
replaceAll
(
".*/question/"
,
""
);
url
=
"https://api.zhihu.com/articles/"
+
url
.
replaceAll
(
".*/p/"
,
""
);
}
else
if
(
url
.
contains
(
"/p/"
))
{
}
url
=
"https://api.zhihu.com/articles/"
+
url
.
replaceAll
(
".*/p/"
,
""
);
return
url
;
}
}
return
url
;
}
}
}
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
9fcfba2d
...
@@ -325,7 +325,6 @@ public class MatchSource {
...
@@ -325,7 +325,6 @@ public class MatchSource {
source
=
source
.
replaceAll
(
".*来源:|)"
,
""
);
source
=
source
.
replaceAll
(
".*来源:|)"
,
""
);
}
}
}
}
if
(
Objects
.
nonNull
(
source
)
&&
source
.
length
()
!=
0
)
{
if
(
Objects
.
nonNull
(
source
)
&&
source
.
length
()
!=
0
)
{
return
source
;
return
source
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment