Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
b8ed38f4
Commit
b8ed38f4
authored
Nov 09, 2020
by
chenweiyang
Browse files
Options
Browse Files
Download
Plain Diff
链接是否删除部分修改
parents
bd0353ac
7003572f
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
30 additions
and
13 deletions
+30
-13
Log/crawler.log
+0
-0
pom.xml
+1
-1
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+2
-0
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+3
-5
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+15
-2
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+5
-0
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
+1
-1
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
+2
-3
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+1
-1
No files found.
Log/crawler.log
View file @
b8ed38f4
This source diff could not be displayed because it is too large. You can
view the blob
instead.
pom.xml
View file @
b8ed38f4
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<artifactId>
source-forward
</artifactId>
<version>
0.2.
7
-SNAPSHOT
</version>
<version>
0.2.
8
-SNAPSHOT
</version>
<name>
source-forward
</name>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
...
...
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
View file @
b8ed38f4
...
@@ -2,6 +2,7 @@ package com.zhiwei.source_forward.crawler;
...
@@ -2,6 +2,7 @@ package com.zhiwei.source_forward.crawler;
import
java.util.Objects
;
import
java.util.Objects
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
...
@@ -47,6 +48,7 @@ public class ContentCrawler {
...
@@ -47,6 +48,7 @@ public class ContentCrawler {
ContentDataCallback
callback
,
String
...
urls
)
{
ContentDataCallback
callback
,
String
...
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
for
(
String
url
:
urls
)
{
ZhiWeiTools
.
sleep
(
100
);
if
(
url
!=
null
)
{
if
(
url
!=
null
)
{
try
{
try
{
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
...
...
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
b8ed38f4
...
@@ -5,6 +5,7 @@ import java.util.List;
...
@@ -5,6 +5,7 @@ import java.util.List;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.Objects
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
...
@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler {
...
@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler {
private
void
start
(
GroupSync
counter
,
MediaSelfSourceDataCallBack
callback
,
String
...
urls
)
{
private
void
start
(
GroupSync
counter
,
MediaSelfSourceDataCallBack
callback
,
String
...
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
for
(
String
url
:
urls
)
{
ZhiWeiTools
.
sleep
(
100
);
counter
.
add
();
counter
.
add
();
if
(
url
!=
null
)
{
if
(
url
!=
null
)
{
try
{
try
{
...
@@ -89,12 +91,9 @@ public class MediaSelfSourceCrawler {
...
@@ -89,12 +91,9 @@ public class MediaSelfSourceCrawler {
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
attr
.
get
());
logger
.
info
(
"当前处理 URL: {}"
,
attr
.
get
());
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
ProxyHolder
ph
=
null
;
ProxyHolder
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
if
(
url
.
contains
(
"toutiao.com"
))
{
if
(
url
.
contains
(
"toutiao.com"
))
{
map
.
put
(
"referer"
,
url
);
map
.
put
(
"referer"
,
url
);
ph
=
ProxyHolder
.
SOUGOU_OUTER_PROXY
;
}
else
{
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
}
}
url
=
dealUrl
(
url
);
url
=
dealUrl
(
url
);
if
(
Objects
.
nonNull
(
url
))
{
if
(
Objects
.
nonNull
(
url
))
{
...
@@ -168,7 +167,6 @@ public class MediaSelfSourceCrawler {
...
@@ -168,7 +167,6 @@ public class MediaSelfSourceCrawler {
String
url
=
attr
.
get
().
toString
();
String
url
=
attr
.
get
().
toString
();
try
{
try
{
source
=
MatchSource
.
matchMediaSelfSource
(
url
+
eUrl
,
result
);
source
=
MatchSource
.
matchMediaSelfSource
(
url
+
eUrl
,
result
);
logger
.
info
(
url
+
"======="
+
source
);
channel
=
MatchChannel
.
verifyChannel
(
url
);
channel
=
MatchChannel
.
verifyChannel
(
url
);
if
(
channel
==
null
){
if
(
channel
==
null
){
List
<
Node
>
nodeList
=
Jsoup
.
parse
(
result
).
head
().
childNodes
();
List
<
Node
>
nodeList
=
Jsoup
.
parse
(
result
).
head
().
childNodes
();
...
...
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
View file @
b8ed38f4
...
@@ -21,6 +21,7 @@ import com.zhiwei.source_forward.util.MatchChannel;
...
@@ -21,6 +21,7 @@ import com.zhiwei.source_forward.util.MatchChannel;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
import
okhttp3.Request
;
...
@@ -46,6 +47,7 @@ public class SourceForwardCrawler {
...
@@ -46,6 +47,7 @@ public class SourceForwardCrawler {
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
for
(
String
url
:
urls
)
{
counter
.
add
();
counter
.
add
();
ZhiWeiTools
.
sleep
(
100
);
if
(
url
!=
null
)
{
if
(
url
!=
null
)
{
try
{
try
{
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
...
@@ -68,6 +70,11 @@ public class SourceForwardCrawler {
...
@@ -68,6 +70,11 @@ public class SourceForwardCrawler {
if
(
url
.
contains
(
"china.prcfe.com"
))
{
if
(
url
.
contains
(
"china.prcfe.com"
))
{
url
=
"http://china.prcfe.com/e/extend/ShowSource/?id="
+
url
.
split
(
"/"
)[
url
.
split
(
"/"
).
length
-
1
].
split
(
"\\."
)[
0
];
url
=
"http://china.prcfe.com/e/extend/ShowSource/?id="
+
url
.
split
(
"/"
)[
url
.
split
(
"/"
).
length
-
1
].
split
(
"\\."
)[
0
];
}
}
if
(
url
.
contains
(
"gu.qq.com"
))
{
String
id
=
url
.
split
(
"\\?id="
)[
1
];
url
=
"https://snp.tenpay.com/cgi-bin/snpgw_unified_newsinfo.fcgi?&filter=0&zappid=zxg_h5&sign=b2aceeb8a8ef093862608d806c1d6ab8&nonce=8464&reserve=1572995&&channel=zxg&user_openid=undefined&user_skey=undefined&&news_id="
+
id
;
headers
.
put
(
"referer"
,
"https://gu.qq.com/resources/shy/news/detail-v2/index.html"
);
}
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
counter
.
add
();
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
...
@@ -92,8 +99,8 @@ public class SourceForwardCrawler {
...
@@ -92,8 +99,8 @@ public class SourceForwardCrawler {
String
channel
=
"新闻"
;
String
channel
=
"新闻"
;
String
isforward
=
"未知"
;
String
isforward
=
"未知"
;
try
{
try
{
Document
document
=
Jsoup
.
parse
(
body
);
if
(
attr
.
get
().
toString
().
contains
(
"mp.weixin.qq.com"
)){
if
(
attr
.
get
().
toString
().
contains
(
"mp.weixin.qq.com"
)){
Document
document
=
Jsoup
.
parse
(
body
);
isforward
=
document
.
select
(
"div#meta_content"
).
select
(
"span#copyright_logo"
).
text
();
isforward
=
document
.
select
(
"div#meta_content"
).
select
(
"span#copyright_logo"
).
text
();
if
(
isforward
.
contains
(
"原创"
)){
if
(
isforward
.
contains
(
"原创"
)){
isforward
=
"原创"
;
isforward
=
"原创"
;
...
@@ -104,15 +111,21 @@ public class SourceForwardCrawler {
...
@@ -104,15 +111,21 @@ public class SourceForwardCrawler {
if
(
body
.
contains
(
"isOriginal"
)
&&
body
.
contains
(
"isOriginal: true"
)){
if
(
body
.
contains
(
"isOriginal"
)
&&
body
.
contains
(
"isOriginal: true"
)){
isforward
=
"原创"
;
isforward
=
"原创"
;
}
}
}
else
if
(
attr
.
get
().
toString
().
contains
(
"snp.tenpay.com"
)
||
attr
.
get
().
toString
().
contains
(
"gu.qq.com"
)){
if
(
body
.
contains
(
"source"
)){
source
=
body
.
split
(
"\"source\":\""
)[
1
].
split
(
"\""
)[
0
];
}
}
else
{
}
else
{
Document
document
=
Jsoup
.
parse
(
body
);
source
=
MatchSource
.
matchSource
(
attr
.
get
().
toString
(),
document
.
toString
(),
sourceList
);
channel
=
MatchChannel
.
verifyChannel
(
attr
.
get
().
toString
());
channel
=
MatchChannel
.
verifyChannel
(
attr
.
get
().
toString
());
if
(
channel
==
null
){
if
(
channel
==
null
){
List
<
Node
>
nodeList
=
document
.
head
().
childNodes
();
List
<
Node
>
nodeList
=
document
.
head
().
childNodes
();
channel
=
MatchChannel
.
matchChannel
(
nodeList
);
channel
=
MatchChannel
.
matchChannel
(
nodeList
);
}
}
source
=
MatchSource
.
matchSource
(
attr
.
get
().
toString
(),
document
.
toString
(),
sourceList
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
source
=
null
;
source
=
null
;
channel
=
"新闻"
;
channel
=
"新闻"
;
}
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
b8ed38f4
...
@@ -50,8 +50,10 @@ public class UrlLiveCrawler {
...
@@ -50,8 +50,10 @@ public class UrlLiveCrawler {
if
(
nonNull
(
urls
)
&&
urls
.
length
>
0
)
{
if
(
nonNull
(
urls
)
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
for
(
String
url
:
urls
)
{
counter
.
add
();
counter
.
add
();
ZhiWeiTools
.
sleep
(
100
);
if
(
nonNull
(
url
))
{
if
(
nonNull
(
url
))
{
try
{
try
{
// ZhiWeiTools.sleep(3000);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错:"
,
e
);
logger
.
error
(
"搜索创建出错:"
,
e
);
...
@@ -213,6 +215,9 @@ public class UrlLiveCrawler {
...
@@ -213,6 +215,9 @@ public class UrlLiveCrawler {
title
=
doc
.
select
(
"div.global_error_msg.warn"
).
text
();
title
=
doc
.
select
(
"div.global_error_msg.warn"
).
text
();
}
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"div.warn"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"p.tips"
).
text
();
title
=
doc
.
select
(
"p.tips"
).
text
();
}
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
...
...
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
View file @
b8ed38f4
...
@@ -32,7 +32,7 @@ public class MediaSelfSource {
...
@@ -32,7 +32,7 @@ public class MediaSelfSource {
ProxyInit
.
initProxy
();
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"https://
new.qq.com/omn/20200507/20200507A0Q9JV00.html
"
);
urlList
.
add
(
"https://
k.sina.com.cn/article_1060093724_3f2fbf1c00100vsqd.html?from=mood
"
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
for
(
MediaSelfSourceBean
b
:
u
)
{
for
(
MediaSelfSourceBean
b
:
u
)
{
...
...
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
View file @
b8ed38f4
...
@@ -80,10 +80,10 @@ public class SourceForward {
...
@@ -80,10 +80,10 @@ public class SourceForward {
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
ProxyInit
.
initProxy
();
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://
www.wangjiaozixun.com/html/zx20/2020/0730/1396388.html
"
);
urlList
.
add
(
"http://
gu.qq.com/resources/shy/news/detail-v2/index.html?#/index?id=SN202006091653447945411f
"
);
List
<
SourceForwardBean
>
da
=
SourceForward
.
getSourceForward
(
urlList
);
List
<
SourceForwardBean
>
da
=
SourceForward
.
getSourceForward
(
urlList
);
for
(
SourceForwardBean
sfb
:
da
)
{
for
(
SourceForwardBean
sfb
:
da
)
{
System
.
out
.
println
(
sfb
.
toString
());
System
.
out
.
println
(
"=============="
+
sfb
.
toString
());
}
}
}
}
...
@@ -94,7 +94,6 @@ public class SourceForward {
...
@@ -94,7 +94,6 @@ public class SourceForward {
try
{
try
{
SourceForwardCrawler
crawler
=
new
SourceForwardCrawler
();
SourceForwardCrawler
crawler
=
new
SourceForwardCrawler
();
SourceForwardDataCallBack
callback
=
new
SourceForwardDataCallBack
()
{
SourceForwardDataCallBack
callback
=
new
SourceForwardDataCallBack
()
{
@Override
@Override
public
void
onData
(
SourceForwardBean
data
,
Attribution
attr
)
{
public
void
onData
(
SourceForwardBean
data
,
Attribution
attr
)
{
list
.
add
(
data
);
list
.
add
(
data
);
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
b8ed38f4
...
@@ -72,7 +72,7 @@ public class URLLive {
...
@@ -72,7 +72,7 @@ public class URLLive {
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
ProxyInit
.
initProxy
();
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://
www.toutiao.com/item/1668646006370318/
"
);
urlList
.
add
(
"http://
mp.weixin.qq.com/s?__biz=Mzg3MDMzNTc5Mg==&mid=2247485220&idx=1&sn=9118543ca120489cccbdc102be58f881
"
);
// urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh");
// urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh");
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment