Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
b8ed38f4
Commit
b8ed38f4
authored
Nov 09, 2020
by
chenweiyang
Browse files
Options
Browse Files
Download
Plain Diff
链接是否删除部分修改
parents
bd0353ac
7003572f
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
616 additions
and
598 deletions
+616
-598
Log/crawler.log
+0
-0
pom.xml
+1
-1
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+121
-119
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+3
-5
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+141
-128
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+345
-339
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
+1
-1
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
+2
-3
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+2
-2
No files found.
Log/crawler.log
View file @
b8ed38f4
This source diff could not be displayed because it is too large. You can
view the blob
instead.
pom.xml
View file @
b8ed38f4
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<artifactId>
source-forward
</artifactId>
<version>
0.2.
7
-SNAPSHOT
</version>
<version>
0.2.
8
-SNAPSHOT
</version>
<name>
source-forward
</name>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
...
...
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
View file @
b8ed38f4
package
com
.
zhiwei
.
source_forward
.
crawler
;
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.Objects
;
import
java.util.Objects
;
import
org.apache.logging.log4j.LogManager
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
okhttp3.Request
;
import
okhttp3.Request
;
public
class
ContentCrawler
{
public
class
ContentCrawler
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
ContentCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
Logger
logger
=
LogManager
.
getLogger
(
ContentCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
*
/**
* @Description 链接传入 并 返回采集完信号
*
* @param callback
* @Description 链接传入 并 返回采集完信号
* @param urls
* @param callback
* @return
* @param urls
* @throws Exception
* @return
*/
* @throws Exception
public
GroupSync
submitTask
(
ContentDataCallback
callback
,
*/
String
...
urls
)
{
public
GroupSync
submitTask
(
ContentDataCallback
callback
,
GroupSync
counter
=
new
GroupSync
();
String
...
urls
)
{
start
(
counter
,
callback
,
urls
);
GroupSync
counter
=
new
GroupSync
();
return
counter
;
start
(
counter
,
callback
,
urls
);
}
return
counter
;
}
/**
*
/**
* @Description 提交链接
*
* @param counter
* @Description 提交链接
* @param callback
* @param counter
* @param urls
* @param callback
*/
* @param urls
private
void
start
(
GroupSync
counter
,
*/
ContentDataCallback
callback
,
String
...
urls
)
{
private
void
start
(
GroupSync
counter
,
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
ContentDataCallback
callback
,
String
...
urls
)
{
for
(
String
url
:
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
if
(
url
!=
null
)
{
for
(
String
url
:
urls
)
{
try
{
ZhiWeiTools
.
sleep
(
100
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
if
(
url
!=
null
)
{
}
catch
(
Exception
e
)
{
try
{
logger
.
error
(
"搜索创建出错"
,
e
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"搜索创建出错"
,
e
);
}
}
}
}
}
}
}
/**
}
*
* @Description 链接获取文章信息
/**
* @param counter
*
* @param url
* @Description 链接获取文章信息
* @param attr
* @param counter
* @param callback
* @param url
* @return
* @param attr
*/
* @param callback
private
GroupSync
search
(
GroupSync
counter
,
* @return
String
url
,
Attribution
attr
,
ContentDataCallback
callback
)
{
*/
logger
.
info
(
"当前处理 URL: {}"
,
url
);
private
GroupSync
search
(
GroupSync
counter
,
Request
request
=
RequestUtils
.
wrapGet
(
url
);
String
url
,
Attribution
attr
,
ContentDataCallback
callback
)
{
counter
.
add
();
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
counter
.
add
();
try
{
if
(
Objects
.
isNull
(
ex
))
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
try
{
}
else
{
if
(
Objects
.
isNull
(
ex
))
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
}
}
else
{
}
catch
(
Exception
e
)
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
logger
.
info
(
"搜索结果访问失败: {}"
,
ex
);
}
}
finally
{
}
catch
(
Exception
e
)
{
counter
.
done
();
logger
.
info
(
"搜索结果访问失败: {}"
,
ex
);
}
}
finally
{
counter
.
done
();
});
}
return
counter
;
});
}
return
counter
;
/**
}
*
*
/**
* @Description 获取正文解析
*
* @param response
*
* @param attr
* @Description 获取正文解析
* @param callback
* @param response
*/
* @param attr
private
void
parseHtml
(
String
result
,
Attribution
attr
,
* @param callback
ContentDataCallback
callback
)
{
*/
try
{
private
void
parseHtml
(
String
result
,
Attribution
attr
,
String
content
=
MatchContent
.
matchContent
(
attr
.
get
().
toString
(),
ContentDataCallback
callback
)
{
result
);
try
{
ContentBean
cb
=
new
ContentBean
(
attr
.
get
().
toString
(),
content
);
String
content
=
MatchContent
.
matchContent
(
attr
.
get
().
toString
(),
if
(
callback
==
null
)
{
result
);
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
ContentBean
cb
=
new
ContentBean
(
attr
.
get
().
toString
(),
content
);
}
else
{
if
(
callback
==
null
)
{
callback
.
onData
(
cb
,
attr
);
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
}
else
{
}
catch
(
Exception
e
)
{
callback
.
onData
(
cb
,
attr
);
logger
.
error
(
"网页链接失效"
,
e
);
}
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"网页链接失效"
,
e
);
}
}
}
}
}
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
b8ed38f4
...
@@ -5,6 +5,7 @@ import java.util.List;
...
@@ -5,6 +5,7 @@ import java.util.List;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.Objects
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
...
@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler {
...
@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler {
private
void
start
(
GroupSync
counter
,
MediaSelfSourceDataCallBack
callback
,
String
...
urls
)
{
private
void
start
(
GroupSync
counter
,
MediaSelfSourceDataCallBack
callback
,
String
...
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
for
(
String
url
:
urls
)
{
ZhiWeiTools
.
sleep
(
100
);
counter
.
add
();
counter
.
add
();
if
(
url
!=
null
)
{
if
(
url
!=
null
)
{
try
{
try
{
...
@@ -89,12 +91,9 @@ public class MediaSelfSourceCrawler {
...
@@ -89,12 +91,9 @@ public class MediaSelfSourceCrawler {
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
attr
.
get
());
logger
.
info
(
"当前处理 URL: {}"
,
attr
.
get
());
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
ProxyHolder
ph
=
null
;
ProxyHolder
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
if
(
url
.
contains
(
"toutiao.com"
))
{
if
(
url
.
contains
(
"toutiao.com"
))
{
map
.
put
(
"referer"
,
url
);
map
.
put
(
"referer"
,
url
);
ph
=
ProxyHolder
.
SOUGOU_OUTER_PROXY
;
}
else
{
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
}
}
url
=
dealUrl
(
url
);
url
=
dealUrl
(
url
);
if
(
Objects
.
nonNull
(
url
))
{
if
(
Objects
.
nonNull
(
url
))
{
...
@@ -168,7 +167,6 @@ public class MediaSelfSourceCrawler {
...
@@ -168,7 +167,6 @@ public class MediaSelfSourceCrawler {
String
url
=
attr
.
get
().
toString
();
String
url
=
attr
.
get
().
toString
();
try
{
try
{
source
=
MatchSource
.
matchMediaSelfSource
(
url
+
eUrl
,
result
);
source
=
MatchSource
.
matchMediaSelfSource
(
url
+
eUrl
,
result
);
logger
.
info
(
url
+
"======="
+
source
);
channel
=
MatchChannel
.
verifyChannel
(
url
);
channel
=
MatchChannel
.
verifyChannel
(
url
);
if
(
channel
==
null
){
if
(
channel
==
null
){
List
<
Node
>
nodeList
=
Jsoup
.
parse
(
result
).
head
().
childNodes
();
List
<
Node
>
nodeList
=
Jsoup
.
parse
(
result
).
head
().
childNodes
();
...
...
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
View file @
b8ed38f4
package
com
.
zhiwei
.
source_forward
.
crawler
;
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.Objects
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Node
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
import
okhttp3.Request
;
public
class
SourceForwardCrawler
{
public
class
SourceForwardCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
SourceForwardCrawler
.
class
);
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
SourceForwardCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
List
<
String
>
sourceList
=
SourceData
.
getSourceList
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
List
<
String
>
sourceList
=
SourceData
.
getSourceList
();
public
GroupSync
submitTask
(
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
try
{
public
GroupSync
submitTask
(
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
GroupSync
counter
=
new
GroupSync
();
try
{
start
(
counter
,
callback
,
urls
);
GroupSync
counter
=
new
GroupSync
();
return
counter
;
start
(
counter
,
callback
,
urls
);
}
catch
(
Exception
e
)
{
return
counter
;
logger
.
error
(
" exception "
,
e
);
}
catch
(
Exception
e
)
{
return
null
;
logger
.
error
(
" exception "
,
e
);
}
return
null
;
}
}
}
private
void
start
(
GroupSync
counter
,
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
private
void
start
(
GroupSync
counter
,
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
for
(
String
url
:
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
counter
.
add
();
for
(
String
url
:
urls
)
{
if
(
url
!=
null
)
{
counter
.
add
();
try
{
ZhiWeiTools
.
sleep
(
100
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
if
(
url
!=
null
)
{
}
catch
(
Exception
e
)
{
try
{
logger
.
error
(
"搜索创建出错"
,
e
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"搜索创建出错"
,
e
);
counter
.
done
();
}
}
}
}
counter
.
done
();
}
}
}
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
SourceForwardDataCallBack
callback
)
{
}
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
SourceForwardDataCallBack
callback
)
{
// Map<String,String> headers = HeaderTool.getCommonHead();
logger
.
info
(
"当前处理 URL: {}"
,
url
);
if
(
url
.
contains
(
"www.toutiao.com"
)){
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
headers
.
put
(
"referer"
,
url
);
// Map<String,String> headers = HeaderTool.getCommonHead();
}
if
(
url
.
contains
(
"www.toutiao.com"
)){
if
(
url
.
contains
(
"china.prcfe.com"
))
{
headers
.
put
(
"referer"
,
url
);
url
=
"http://china.prcfe.com/e/extend/ShowSource/?id="
+
url
.
split
(
"/"
)[
url
.
split
(
"/"
).
length
-
1
].
split
(
"\\."
)[
0
];
}
}
if
(
url
.
contains
(
"china.prcfe.com"
))
{
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
url
=
"http://china.prcfe.com/e/extend/ShowSource/?id="
+
url
.
split
(
"/"
)[
url
.
split
(
"/"
).
length
-
1
].
split
(
"\\."
)[
0
];
counter
.
add
();
}
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
if
(
url
.
contains
(
"gu.qq.com"
))
{
try
{
String
id
=
url
.
split
(
"\\?id="
)[
1
];
if
(
Objects
.
isNull
(
ex
))
{
url
=
"https://snp.tenpay.com/cgi-bin/snpgw_unified_newsinfo.fcgi?&filter=0&zappid=zxg_h5&sign=b2aceeb8a8ef093862608d806c1d6ab8&nonce=8464&reserve=1572995&&channel=zxg&user_openid=undefined&user_skey=undefined&&news_id="
+
id
;
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
headers
.
put
(
"referer"
,
"https://gu.qq.com/resources/shy/news/detail-v2/index.html"
);
}
else
{
}
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
}
counter
.
add
();
}
catch
(
Exception
e1
)
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
logger
.
error
(
"解析出错"
,
e1
);
try
{
}
finally
{
if
(
Objects
.
isNull
(
ex
))
{
counter
.
done
();
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
}
}
else
{
});
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
return
counter
;
}
}
}
catch
(
Exception
e1
)
{
logger
.
error
(
"解析出错"
,
e1
);
private
void
parseHtml
(
String
body
,
Attribution
attr
,
}
finally
{
SourceForwardDataCallBack
callback
)
{
counter
.
done
();
String
source
=
null
;
}
String
channel
=
"新闻"
;
});
String
isforward
=
"未知"
;
return
counter
;
try
{
}
Document
document
=
Jsoup
.
parse
(
body
);
if
(
attr
.
get
().
toString
().
contains
(
"mp.weixin.qq.com"
)){
private
void
parseHtml
(
String
body
,
Attribution
attr
,
isforward
=
document
.
select
(
"div#meta_content"
).
select
(
"span#copyright_logo"
).
text
();
SourceForwardDataCallBack
callback
)
{
if
(
isforward
.
contains
(
"原创"
)){
String
source
=
null
;
isforward
=
"原创"
;
String
channel
=
"新闻"
;
}
else
{
String
isforward
=
"未知"
;
isforward
=
"未知"
;
try
{
}
if
(
attr
.
get
().
toString
().
contains
(
"mp.weixin.qq.com"
)){
}
else
if
(
attr
.
get
().
toString
().
contains
(
"www.toutiao.com"
)){
Document
document
=
Jsoup
.
parse
(
body
);
if
(
body
.
contains
(
"isOriginal"
)
&&
body
.
contains
(
"isOriginal: true"
)){
isforward
=
document
.
select
(
"div#meta_content"
).
select
(
"span#copyright_logo"
).
text
();
isforward
=
"原创"
;
if
(
isforward
.
contains
(
"原创"
)){
}
isforward
=
"原创"
;
}
else
{
}
else
{
channel
=
MatchChannel
.
verifyChannel
(
attr
.
get
().
toString
());
isforward
=
"未知"
;
if
(
channel
==
null
){
}
List
<
Node
>
nodeList
=
document
.
head
().
childNodes
();
}
else
if
(
attr
.
get
().
toString
().
contains
(
"www.toutiao.com"
)){
channel
=
MatchChannel
.
matchChannel
(
nodeList
);
if
(
body
.
contains
(
"isOriginal"
)
&&
body
.
contains
(
"isOriginal: true"
)){
}
isforward
=
"原创"
;
source
=
MatchSource
.
matchSource
(
attr
.
get
().
toString
(),
document
.
toString
(),
sourceList
);
}
}
}
else
if
(
attr
.
get
().
toString
().
contains
(
"snp.tenpay.com"
)
||
attr
.
get
().
toString
().
contains
(
"gu.qq.com"
)){
}
catch
(
Exception
e
)
{
if
(
body
.
contains
(
"source"
)){
source
=
null
;
source
=
body
.
split
(
"\"source\":\""
)[
1
].
split
(
"\""
)[
0
];
channel
=
"新闻"
;
}
}
}
else
{
logger
.
info
(
attr
.
get
().
toString
()+
"======="
+
channel
+
"================="
+
source
);
Document
document
=
Jsoup
.
parse
(
body
);
SourceForwardBean
sfb
=
new
SourceForwardBean
(
attr
.
get
().
toString
(),
channel
,
source
,
isforward
);
source
=
MatchSource
.
matchSource
(
attr
.
get
().
toString
(),
document
.
toString
(),
sourceList
);
if
(
callback
==
null
)
{
channel
=
MatchChannel
.
verifyChannel
(
attr
.
get
().
toString
());
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
if
(
channel
==
null
){
}
else
{
List
<
Node
>
nodeList
=
document
.
head
().
childNodes
();
callback
.
onData
(
sfb
,
attr
);
channel
=
MatchChannel
.
matchChannel
(
nodeList
);
}
}
}
}
}
catch
(
Exception
e
)
{
}
e
.
printStackTrace
();
source
=
null
;
channel
=
"新闻"
;
}
logger
.
info
(
attr
.
get
().
toString
()+
"======="
+
channel
+
"================="
+
source
);
SourceForwardBean
sfb
=
new
SourceForwardBean
(
attr
.
get
().
toString
(),
channel
,
source
,
isforward
);
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
callback
.
onData
(
sfb
,
attr
);
}
}
}
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
b8ed38f4
package
com
.
zhiwei
.
source_forward
.
crawler
;
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.util.Arrays
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.Objects
;
import
java.util.regex.Matcher
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
java.util.regex.Pattern
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
import
okhttp3.Request
;
/**
/**
*
*
* @ClassName UrlLiveCrawler
* @ClassName UrlLiveCrawler
* @Description 判断页面是否存在
* @Description 判断页面是否存在
* @author byte-zbs
* @author byte-zbs
* @Date 2018年8月20日 下午3:34:57
* @Date 2018年8月20日 下午3:34:57
* @version 1.0.0
* @version 1.0.0
*/
*/
public
class
UrlLiveCrawler
{
public
class
UrlLiveCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
throwException
(
true
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
throwException
(
true
).
build
();
public
GroupSync
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
public
GroupSync
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
GroupSync
counter
=
new
GroupSync
();
GroupSync
counter
=
new
GroupSync
();
start
(
counter
,
callback
,
urls
);
start
(
counter
,
callback
,
urls
);
return
counter
;
return
counter
;
}
}
private
void
start
(
GroupSync
counter
,
UrlLiveDataCallback
callback
,
String
...
urls
)
{
private
void
start
(
GroupSync
counter
,
UrlLiveDataCallback
callback
,
String
...
urls
)
{
if
(
nonNull
(
urls
)
&&
urls
.
length
>
0
)
{
if
(
nonNull
(
urls
)
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
for
(
String
url
:
urls
)
{
counter
.
add
();
counter
.
add
();
if
(
nonNull
(
url
))
{
ZhiWeiTools
.
sleep
(
100
);
try
{
if
(
nonNull
(
url
))
{
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
try
{
}
catch
(
Exception
e
)
{
// ZhiWeiTools.sleep(3000);
logger
.
error
(
"搜索创建出错:"
,
e
);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
}
}
catch
(
Exception
e
)
{
}
logger
.
error
(
"搜索创建出错:"
,
e
);
counter
.
done
();
}
}
}
}
counter
.
done
();
}
}
}
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
}
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
// System.out.println(url);
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
url
=
dealUrl
(
url
);
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
url
);
// System.out.println(url);
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
url
=
dealUrl
(
url
);
ProxyHolder
ph
=
null
;
logger
.
info
(
"当前处理 URL: {}"
,
url
);
if
(
url
.
contains
(
"toutiao.com"
)){
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
// headers.put("referer", url);
ProxyHolder
ph
=
null
;
// headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_webid=6837283963338622477; tt_webid=6837283963338622477; __tasessionId=dsei2aty41591951911851; tt_scid=M--AJ-FYwZ0qcYTzQCLyMeS5MlykLS6ktMkvqKKJmq-ghRxX4waEBhJ3YbheuNmi2b8a");
if
(
url
.
contains
(
"toutiao.com"
)){
// headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
// headers.put("referer", url);
// headers.put("accept-encoding", "gzip, deflate, br");
// headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_webid=6837283963338622477; tt_webid=6837283963338622477; __tasessionId=dsei2aty41591951911851; tt_scid=M--AJ-FYwZ0qcYTzQCLyMeS5MlykLS6ktMkvqKKJmq-ghRxX4waEBhJ3YbheuNmi2b8a");
// headers.put("accept-language", "zh-CN,zh;q=0.9");
// headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
// headers.put("cache-control", "no-cache");
// headers.put("accept-encoding", "gzip, deflate, br");
// headers.put("sec-fetch-dest", "document");
// headers.put("accept-language", "zh-CN,zh;q=0.9");
// headers.put("sec-fetch-mode", "navigate");
// headers.put("cache-control", "no-cache");
// headers.put("sec-fetch-site", "same-origin");
// headers.put("sec-fetch-dest", "document");
// headers.put("sec-fetch-user", "?1");
// headers.put("sec-fetch-mode", "navigate");
// headers.put("upgrade-insecure-requests", "1");
// headers.put("sec-fetch-site", "same-origin");
// headers.put("user-agent", "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Mobile Safari/537.36");
// headers.put("sec-fetch-user", "?1");
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
// headers.put("upgrade-insecure-requests", "1");
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
// headers.put("user-agent", "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Mobile Safari/537.36");
url
=
treatZhihuUrl
(
url
);
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
}
url
=
treatZhihuUrl
(
url
);
try
{
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
}
if
(
Objects
.
nonNull
(
request
))
{
try
{
counter
.
add
();
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
// , new Proxy(Type.HTTP, new InetSocketAddress("119.3.38.9", 31128))
if
(
Objects
.
nonNull
(
request
))
{
httpBoot
.
asyncCall
(
request
,
ph
).
whenComplete
((
rs
,
ex
)
->
{
counter
.
add
();
try
{
// , new Proxy(Type.HTTP, new InetSocketAddress("119.3.38.9", 31128))
if
(
Objects
.
isNull
(
ex
))
{
httpBoot
.
asyncCall
(
request
,
ph
).
whenComplete
((
rs
,
ex
)
->
{
if
(
rs
.
isSuccessful
())
{
try
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
if
(
Objects
.
isNull
(
ex
))
{
}
else
if
(
rs
.
code
()
==
404
){
if
(
rs
.
isSuccessful
())
{
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
}
else
{
}
else
if
(
rs
.
code
()
==
404
){
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
}
}
else
{
}
else
{
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
logger
.
error
(
"e"
,
ex
);
}
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
else
{
}
logger
.
error
(
"e"
,
ex
);
}
catch
(
Exception
e
)
{
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
logger
.
error
(
" 数据是否删除 采集出错 {} "
,
e
);
}
}
finally
{
}
catch
(
Exception
e
)
{
counter
.
done
();
logger
.
error
(
" 数据是否删除 采集出错 {} "
,
e
);
}
}
finally
{
});
counter
.
done
();
return
counter
;
}
}
});
}
catch
(
Exception
e2
)
{
return
counter
;
logger
.
error
(
"数据出错 {}"
,
e2
);
}
}
}
catch
(
Exception
e2
)
{
return
counter
;
logger
.
error
(
"数据出错 {}"
,
e2
);
}
}
return
counter
;
private
void
callBack
(
UrlLiveDataCallback
callback
,
Attribution
attr
,
int
i
,
String
title
)
{
}
UrlLiveBean
ulb
=
null
;
if
(
i
==
1
)
{
private
void
callBack
(
UrlLiveDataCallback
callback
,
Attribution
attr
,
int
i
,
String
title
)
{
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
true
,
title
);
UrlLiveBean
ulb
=
null
;
}
else
{
if
(
i
==
1
)
{
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
i
,
title
);
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
true
,
title
);
}
}
else
{
if
(
callback
==
null
)
{
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
i
,
title
);
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
}
else
{
if
(
callback
==
null
)
{
callback
.
onData
(
ulb
,
attr
);
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
}
else
{
}
callback
.
onData
(
ulb
,
attr
);
}
private
String
dealUrl
(
String
url
)
{
}
try
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
private
String
dealUrl
(
String
url
)
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
try
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
}
else
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
url
=
url
.
replace
(
"toutiao.com"
,
"www.toutiao.com"
);
}
}
else
{
if
(
url
.
contains
(
"https"
))
{
url
=
url
.
replace
(
"toutiao.com"
,
"www.toutiao.com"
);
}
}
else
{
if
(
url
.
contains
(
"https"
))
{
url
=
url
.
replace
(
"http"
,
"https"
);
}
}
else
{
if
(
url
.
contains
(
"group"
))
{
url
=
url
.
replace
(
"http"
,
"https"
);
url
=
"https://www.toutiao.com/a"
+
url
.
split
(
"/"
)[
4
]
+
"/"
;
}
}
if
(
url
.
contains
(
"group"
))
{
}
else
if
(
url
.
contains
(
"mp.weixin.qq.com"
))
{
url
=
"https://www.toutiao.com/a"
+
url
.
split
(
"/"
)[
4
]
+
"/"
;
if
(
url
.
contains
(
"https"
))
{
}
}
else
if
(
url
.
contains
(
"mp.weixin.qq.com"
))
{
}
else
{
if
(
url
.
contains
(
"https"
))
{
url
=
url
.
replace
(
"http"
,
"https"
);
}
}
else
{
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
)
&&
url
.
contains
(
"wm_aid"
))
{
url
=
url
.
replace
(
"http"
,
"https"
);
url
=
"http://ff.dayu.com/contents/origin/"
+
url
.
split
(
"wm_aid="
)[
1
].
split
(
"!!wm_id"
)[
0
]+
"?biz_id=1002&_fetch_author=1&_fetch_incrs=1"
;
}
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
)
&&
url
.
contains
(
"infoid="
))
{
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
)
&&
url
.
contains
(
"wm_aid"
))
{
// https://tznew.58.com/view/c/sharingDetailNew?infoid=117073473
url
=
"http://ff.dayu.com/contents/origin/"
+
url
.
split
(
"wm_aid="
)[
1
].
split
(
"!!wm_id"
)[
0
]+
"?biz_id=1002&_fetch_author=1&_fetch_incrs=1"
;
url
=
"https://tznew.58.com/tznew/c/info-detail?infoid="
+
url
.
split
(
"infoid="
)[
1
].
split
(
"&"
)[
0
];
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
)
&&
url
.
contains
(
"infoid="
))
{
}
// https://tznew.58.com/view/c/sharingDetailNew?infoid=117073473
return
url
;
url
=
"https://tznew.58.com/tznew/c/info-detail?infoid="
+
url
.
split
(
"infoid="
)[
1
].
split
(
"&"
)[
0
];
}
catch
(
Exception
e
)
{
}
return
url
;
return
url
;
}
}
catch
(
Exception
e
)
{
}
return
url
;
}
/**
}
*
* @Description 判断是否删除
/**
* @param html
*
* @param attr
* @Description 判断是否删除
* @param callback
* @param html
*/
* @param attr
private
void
parseHtml
(
String
html
,
Attribution
attr
,
* @param callback
UrlLiveDataCallback
callback
)
{
*/
if
(
callback
==
null
)
{
private
void
parseHtml
(
String
html
,
Attribution
attr
,
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
UrlLiveDataCallback
callback
)
{
}
else
{
if
(
callback
==
null
)
{
UrlLiveBean
ulb
=
matchDel
(
html
,
attr
,
attr
.
getAttr
().
toString
());
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
if
(
Objects
.
nonNull
(
ulb
))
{
}
else
{
callback
.
onData
(
ulb
,
attr
);
UrlLiveBean
ulb
=
matchDel
(
html
,
attr
,
attr
.
getAttr
().
toString
());
}
else
{
if
(
Objects
.
nonNull
(
ulb
))
{
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
callback
.
onData
(
ulb
,
attr
);
}
}
else
{
}
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
}
}
/***
}
* @Title: matchDel
* @author hero
/***
* @Description: 验证链接是否有效
* @Title: matchDel
* @param @param page
* @author hero
* @param @return 设定文件
* @Description: 验证链接是否有效
* @return boolean 返回类型
* @param @param page
*/
* @param @return 设定文件
public
UrlLiveBean
matchDel
(
String
result
,
Attribution
attr
,
String
url
){
* @return boolean 返回类型
try
{
*/
Document
doc
=
Jsoup
.
parse
(
result
);
public
UrlLiveBean
matchDel
(
String
result
,
Attribution
attr
,
String
url
){
String
title
=
null
;
try
{
if
(
url
.
contains
(
"mp.weixin.qq.com"
)
||
url
.
contains
(
"post.mp.qq.com"
)
||
url
.
contains
(
"weixin.sogou.com"
)){
Document
doc
=
Jsoup
.
parse
(
result
);
title
=
doc
.
select
(
"h2.rich_media_title"
).
text
().
replaceAll
(
" "
,
""
);
String
title
=
null
;
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
if
(
url
.
contains
(
"mp.weixin.qq.com"
)
||
url
.
contains
(
"post.mp.qq.com"
)
||
url
.
contains
(
"weixin.sogou.com"
)){
title
=
doc
.
select
(
"p.title"
).
text
();
title
=
doc
.
select
(
"h2.rich_media_title"
).
text
().
replaceAll
(
" "
,
""
);
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"p.title"
).
text
();
title
=
doc
.
select
(
"h3.msg-title"
).
text
();
}
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"h3.msg-title"
).
text
();
title
=
doc
.
select
(
"div.global_error_msg.warn"
).
text
();
}
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"div.global_error_msg.warn"
).
text
();
title
=
doc
.
select
(
"p.tips"
).
text
();
}
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"div.warn"
).
text
();
title
=
doc
.
select
(
"h2"
).
text
();
}
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
// 获取title
title
=
doc
.
select
(
"p.tips"
).
text
();
Matcher
ma5
=
Pattern
.
compile
(
"var msg_title = \'(.*)\'"
)
}
.
matcher
(
result
);
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
if
(
ma5
.
find
())
{
title
=
doc
.
select
(
"h2"
).
text
();
title
=
ma5
.
group
(
1
).
replaceAll
(
" "
,
" "
).
trim
();
}
}
// 获取title
}
else
if
(
url
.
contains
(
"kuaibao"
)){
Matcher
ma5
=
Pattern
.
compile
(
"var msg_title = \'(.*)\'"
)
title
=
doc
.
select
(
"p.title"
).
text
().
replaceAll
(
" "
,
""
);
.
matcher
(
result
);
}
else
if
(
url
.
contains
(
"chinadaily.com.cn"
)){
if
(
ma5
.
find
())
{
title
=
doc
.
select
(
"p.style1"
).
text
().
replaceAll
(
" "
,
""
);
title
=
ma5
.
group
(
1
).
replaceAll
(
" "
,
" "
).
trim
();
}
else
if
(
url
.
contains
(
"baidu.com"
)
||
url
.
contains
(
"hao123.com"
))
{
}
title
=
doc
.
select
(
"p#contaniner"
).
text
();
}
else
if
(
url
.
contains
(
"kuaibao"
)){
}
else
if
(
url
.
contains
(
"kanfanews.com"
))
{
title
=
doc
.
select
(
"p.title"
).
text
().
replaceAll
(
" "
,
""
);
title
=
doc
.
select
(
"p#tit"
).
text
();
}
else
if
(
url
.
contains
(
"chinadaily.com.cn"
)){
}
else
if
(
url
.
contains
(
"ifeng.com"
)
&&
result
.
contains
(
"url=http://www.ifeng.com/"
))
{
title
=
doc
.
select
(
"p.style1"
).
text
().
replaceAll
(
" "
,
""
);
title
=
"网页已删除"
;
}
else
if
(
url
.
contains
(
"baidu.com"
)
||
url
.
contains
(
"hao123.com"
))
{
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
))
{
title
=
doc
.
select
(
"p#contaniner"
).
text
();
try
{
}
else
if
(
url
.
contains
(
"kanfanews.com"
))
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
title
=
doc
.
select
(
"p#tit"
).
text
();
title
=
json
.
getJSONObject
(
"data"
).
getString
(
"title"
);
}
else
if
(
url
.
contains
(
"ifeng.com"
)
&&
result
.
contains
(
"url=http://www.ifeng.com/"
))
{
}
catch
(
Exception
e
)
{
title
=
"网页已删除"
;
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
))
{
}
try
{
}
else
if
(
url
.
contains
(
"huanqiu.com"
)
&&
result
.
contains
(
"www.huanqiu.com/404.html"
))
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
title
=
"网页已删除"
;
title
=
json
.
getJSONObject
(
"data"
).
getString
(
"title"
);
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
}
catch
(
Exception
e
)
{
JSONObject
resultJson
=
JSONObject
.
parseObject
(
result
);
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
if
(
url
.
contains
(
"/answer/"
))
{
}
title
=
resultJson
.
getJSONObject
(
"question"
).
getString
(
"title"
);
}
else
if
(
url
.
contains
(
"huanqiu.com"
)
&&
result
.
contains
(
"www.huanqiu.com/404.html"
))
{
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
)
||
url
.
contains
(
"/p/"
))
{
title
=
"网页已删除"
;
title
=
resultJson
.
getString
(
"title"
);
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
}
JSONObject
resultJson
=
JSONObject
.
parseObject
(
result
);
}
else
if
(
url
.
contains
(
"360kuai.com"
)
&&
result
.
contains
(
"您访问的文章走失了"
))
{
if
(
url
.
contains
(
"/answer/"
))
{
title
=
String
.
valueOf
(
"404"
);
title
=
resultJson
.
getJSONObject
(
"question"
).
getString
(
"title"
);
}
else
if
(
result
.
contains
(
"文章没有找到哦"
)
&&
url
.
contains
(
"yidianzixun.com"
))
{
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
)
||
url
.
contains
(
"/p/"
))
{
title
=
"文章未找到"
;
title
=
resultJson
.
getString
(
"title"
);
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
))
{
}
try
{
}
else
if
(
url
.
contains
(
"360kuai.com"
)
&&
result
.
contains
(
"您访问的文章走失了"
))
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
title
=
String
.
valueOf
(
"404"
);
title
=
json
.
getJSONObject
(
"result"
).
getString
(
"title"
);
}
else
if
(
result
.
contains
(
"文章没有找到哦"
)
&&
url
.
contains
(
"yidianzixun.com"
))
{
}
catch
(
Exception
e
)
{
title
=
"文章未找到"
;
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
}
else
if
(
url
.
contains
(
"tznew.58.com/view"
))
{
}
try
{
}
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
title
=
json
.
getJSONObject
(
"result"
).
getString
(
"title"
);
//若title 为拿到 用 此方法
}
catch
(
Exception
e
)
{
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
title
=
doc
.
select
(
"div.adiv > p > span"
).
text
().
replaceAll
(
" "
,
""
);
}
}
}
//若title 为拿到 用 此方法
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"title"
).
text
().
replaceAll
(
" "
,
""
);
title
=
doc
.
select
(
"div.adiv > p > span"
).
text
().
replaceAll
(
" "
,
""
);
}
}
//若title 为拿到 用 此方法
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"h1"
).
text
().
replaceAll
(
" "
,
""
);
title
=
doc
.
select
(
"title"
).
text
().
replaceAll
(
" "
,
""
);
}
}
//若title 为拿到 用 此方法 无法获取标题不进行程序迷惑性判断
//若title 为拿到 用 此方法
// if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
// title = "网页已删除";
title
=
doc
.
select
(
"h1"
).
text
().
replaceAll
(
" "
,
""
);
// }
}
if
(
Objects
.
nonNull
(
title
)
&&
title
.
length
()
>
1
){
//若title 为拿到 用 此方法 无法获取标题不进行程序迷惑性判断
return
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
isDelete
(
title
),
title
);
// if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
}
else
{
// title = "网页已删除";
return
null
;
// }
}
}
catch
(
Exception
e
)
{
if
(
Objects
.
nonNull
(
title
)
&&
title
.
length
()
>
1
){
return
null
;
return
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
isDelete
(
title
),
title
);
}
}
else
{
}
return
null
;
}
/**
}
catch
(
Exception
e
)
{
*
return
null
;
* @Description 标题判断
}
* @param title
}
* @return
*/
/**
private
boolean
isDelete
(
String
title
)
{
*
List
<
String
>
eList
=
Arrays
.
asList
(
"系统出错"
,
"该内容已被发布者删除"
,
"网页已删除"
* @Description 标题判断
,
"此帐号已自主注销,内容无法查看"
,
"页面提示"
,
"正在维护中"
* @param title
,
"此文章被第三方评估为不实信息"
,
"财经头条"
,
"知识100题"
,
"502BadGateway"
* @return
,
"提示信息"
,
"跳转页"
,
"跳转中..."
,
"此帐号在冻结期,内容无法查看"
,
"东北新闻网"
*/
,
"百度一下,你就知道"
,
"帐号已迁移"
,
"手机百度"
,
"内容被删除"
,
"亚博国际|首页"
private
boolean
isDelete
(
String
title
)
{
,
"中国软件网"
,
"云广网"
,
"新浪首页"
,
"文章暂时找不到了"
,
"-法易网"
List
<
String
>
eList
=
Arrays
.
asList
(
"系统出错"
,
"该内容已被发布者删除"
,
"网页已删除"
,
"【一点资讯】www.yidianzixun.com"
,
"错误页面"
,
"网站暂停通知"
,
"【快资讯】你的专属资讯平台"
,
"此帐号已自主注销,内容无法查看"
,
"页面提示"
,
"正在维护中"
,
"百度新闻——全球最大的中文新闻平台"
,
"以上文章由以下机构判定为不实信息"
,
"该公众号已迁移"
,
"此文章被第三方评估为不实信息"
,
"财经头条"
,
"知识100题"
,
"502BadGateway"
,
"财经网-CAIJING.COM.CN"
,
"蚂蚁资讯"
,
"参数错误"
,
"时尚头条_YOKA时尚网"
,
"该文章已经被删除"
,
"提示信息"
,
"跳转页"
,
"跳转中..."
,
"此帐号在冻结期,内容无法查看"
,
"东北新闻网"
,
"网易"
,
"链接已过期"
,
"找不到页面"
,
"今晚网"
,
"该文章已被删除"
,
"该回答已被删除-知乎"
,
"资源不存在"
,
"文章未找到"
,
"百度一下,你就知道"
,
"帐号已迁移"
,
"手机百度"
,
"内容被删除"
,
"亚博国际|首页"
,
"UC头条"
);
,
"中国软件网"
,
"云广网"
,
"新浪首页"
,
"文章暂时找不到了"
,
"-法易网"
,
"【一点资讯】www.yidianzixun.com"
,
"错误页面"
,
"网站暂停通知"
,
"【快资讯】你的专属资讯平台"
List
<
String
>
cList
=
Arrays
.
asList
(
"提示信息-"
,
"此内容因违规无法查看"
,
"微信公众号不存在"
,
"百度新闻——全球最大的中文新闻平台"
,
"以上文章由以下机构判定为不实信息"
,
"该公众号已迁移"
,
"此内容被投诉且经审核涉嫌侵权,无法查看"
,
"thepageyourequestedwasnotfound"
,
"未知错误"
,
"财经网-CAIJING.COM.CN"
,
"蚂蚁资讯"
,
"参数错误"
,
"时尚头条_YOKA时尚网"
,
"该文章已经被删除"
,
"Objectmoved"
,
"404"
,
"页面没有找到"
,
"页面未找到"
,
"301MovedPermanently"
,
"加载异常"
,
,
"网易"
,
"链接已过期"
,
"找不到页面"
,
"今晚网"
,
"该文章已被删除"
,
"该回答已被删除-知乎"
,
"资源不存在"
,
"文章未找到"
"此帐号已被屏蔽, 内容无法查看"
,
"链接不存在"
,
"新闻已删除"
);
,
"UC头条"
);
return
cList
.
stream
().
anyMatch
(
title:
:
contains
)
||
eList
.
stream
().
anyMatch
(
title:
:
equals
);
List
<
String
>
cList
=
Arrays
.
asList
(
"提示信息-"
,
"此内容因违规无法查看"
,
"微信公众号不存在"
}
,
"此内容被投诉且经审核涉嫌侵权,无法查看"
,
"thepageyourequestedwasnotfound"
,
"未知错误"
,
"Objectmoved"
,
"404"
,
"页面没有找到"
,
"页面未找到"
,
"301MovedPermanently"
,
"加载异常"
,
"此帐号已被屏蔽, 内容无法查看"
,
"链接不存在"
,
"新闻已删除"
);
/**
* 处理知乎链接
return
cList
.
stream
().
anyMatch
(
title:
:
contains
)
||
eList
.
stream
().
anyMatch
(
title:
:
equals
);
*
}
* */
private
static
String
treatZhihuUrl
(
String
url
)
{
if
(
url
.
contains
(
"/answer/"
))
{
/**
url
=
"https://api.zhihu.com/answers/"
+
url
.
replaceAll
(
".*/answer/"
,
""
);
* 处理知乎链接
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
))
{
*
url
=
"https://api.zhihu.com/questions/"
+
url
.
replaceAll
(
".*/question/"
,
""
);
* */
}
else
if
(
url
.
contains
(
"/p/"
))
{
private
static
String
treatZhihuUrl
(
String
url
)
{
url
=
"https://api.zhihu.com/articles/"
+
url
.
replaceAll
(
".*/p/"
,
""
);
if
(
url
.
contains
(
"/answer/"
))
{
}
url
=
"https://api.zhihu.com/answers/"
+
url
.
replaceAll
(
".*/answer/"
,
""
);
return
url
;
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
))
{
}
url
=
"https://api.zhihu.com/questions/"
+
url
.
replaceAll
(
".*/question/"
,
""
);
}
else
if
(
url
.
contains
(
"/p/"
))
{
}
url
=
"https://api.zhihu.com/articles/"
+
url
.
replaceAll
(
".*/p/"
,
""
);
}
return
url
;
}
}
\ No newline at end of file
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
View file @
b8ed38f4
...
@@ -32,7 +32,7 @@ public class MediaSelfSource {
...
@@ -32,7 +32,7 @@ public class MediaSelfSource {
ProxyInit
.
initProxy
();
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"https://
new.qq.com/omn/20200507/20200507A0Q9JV00.html
"
);
urlList
.
add
(
"https://
k.sina.com.cn/article_1060093724_3f2fbf1c00100vsqd.html?from=mood
"
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
for
(
MediaSelfSourceBean
b
:
u
)
{
for
(
MediaSelfSourceBean
b
:
u
)
{
...
...
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
View file @
b8ed38f4
...
@@ -80,10 +80,10 @@ public class SourceForward {
...
@@ -80,10 +80,10 @@ public class SourceForward {
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
ProxyInit
.
initProxy
();
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://
www.wangjiaozixun.com/html/zx20/2020/0730/1396388.html
"
);
urlList
.
add
(
"http://
gu.qq.com/resources/shy/news/detail-v2/index.html?#/index?id=SN202006091653447945411f
"
);
List
<
SourceForwardBean
>
da
=
SourceForward
.
getSourceForward
(
urlList
);
List
<
SourceForwardBean
>
da
=
SourceForward
.
getSourceForward
(
urlList
);
for
(
SourceForwardBean
sfb
:
da
)
{
for
(
SourceForwardBean
sfb
:
da
)
{
System
.
out
.
println
(
sfb
.
toString
());
System
.
out
.
println
(
"=============="
+
sfb
.
toString
());
}
}
}
}
...
@@ -94,7 +94,6 @@ public class SourceForward {
...
@@ -94,7 +94,6 @@ public class SourceForward {
try
{
try
{
SourceForwardCrawler
crawler
=
new
SourceForwardCrawler
();
SourceForwardCrawler
crawler
=
new
SourceForwardCrawler
();
SourceForwardDataCallBack
callback
=
new
SourceForwardDataCallBack
()
{
SourceForwardDataCallBack
callback
=
new
SourceForwardDataCallBack
()
{
@Override
@Override
public
void
onData
(
SourceForwardBean
data
,
Attribution
attr
)
{
public
void
onData
(
SourceForwardBean
data
,
Attribution
attr
)
{
list
.
add
(
data
);
list
.
add
(
data
);
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
b8ed38f4
...
@@ -72,13 +72,13 @@ public class URLLive {
...
@@ -72,13 +72,13 @@ public class URLLive {
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
ProxyInit
.
initProxy
();
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://
www.toutiao.com/item/1668646006370318/
"
);
urlList
.
add
(
"http://
mp.weixin.qq.com/s?__biz=Mzg3MDMzNTc5Mg==&mid=2247485220&idx=1&sn=9118543ca120489cccbdc102be58f881
"
);
// urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh");
// urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh");
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
for
(
UrlLiveBean
b
:
u
)
{
for
(
UrlLiveBean
b
:
u
)
{
System
.
out
.
println
(
b
.
toString
());
System
.
out
.
println
(
b
.
toString
());
}
}
}
}
static
class
UrlLiveCrawlerThread
extends
Thread
{
static
class
UrlLiveCrawlerThread
extends
Thread
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment