Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
dad70819
You need to sign in or sign up before continuing.
Commit
dad70819
authored
May 20, 2020
by
cwy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
自媒体获取修改
parent
391fcd6c
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
67 additions
and
36 deletions
+67
-36
pom.xml
+1
-1
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+3
-3
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+30
-15
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
+2
-1
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+3
-2
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+28
-14
No files found.
pom.xml
View file @
dad70819
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<artifactId>
source-forward
</artifactId>
<version>
0.2.
3
-SNAPSHOT
</version>
<version>
0.2.
4
-SNAPSHOT
</version>
<name>
source-forward
</name>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
...
...
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
dad70819
...
@@ -101,7 +101,7 @@ public class MediaSelfSourceCrawler {
...
@@ -101,7 +101,7 @@ public class MediaSelfSourceCrawler {
try
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
Objects
.
isNull
(
ex
))
{
try
{
try
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
,
rs
.
request
().
url
().
uri
().
toString
()
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析出错"
,
e
);
logger
.
error
(
"解析出错"
,
e
);
}
}
...
@@ -151,12 +151,12 @@ public class MediaSelfSourceCrawler {
...
@@ -151,12 +151,12 @@ public class MediaSelfSourceCrawler {
* @param callback
* @param callback
*/
*/
private
void
parseHtml
(
String
result
,
Attribution
attr
,
private
void
parseHtml
(
String
result
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
MediaSelfSourceDataCallBack
callback
,
String
eUrl
)
{
String
source
=
null
;
String
source
=
null
;
String
channel
=
null
;
String
channel
=
null
;
String
url
=
attr
.
get
().
toString
();
String
url
=
attr
.
get
().
toString
();
try
{
try
{
source
=
MatchSource
.
matchMediaSelfSource
(
url
,
result
);
source
=
MatchSource
.
matchMediaSelfSource
(
url
+
eUrl
,
result
);
logger
.
info
(
url
+
"======="
+
source
);
logger
.
info
(
url
+
"======="
+
source
);
channel
=
MatchChannel
.
verifyChannel
(
url
);
channel
=
MatchChannel
.
verifyChannel
(
url
);
if
(
channel
==
null
){
if
(
channel
==
null
){
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
dad70819
...
@@ -3,9 +3,12 @@ package com.zhiwei.source_forward.crawler;
...
@@ -3,9 +3,12 @@ package com.zhiwei.source_forward.crawler;
import
static
java
.
util
.
Objects
.
nonNull
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.util.Arrays
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.Objects
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
...
@@ -20,7 +23,6 @@ import com.zhiwei.crawler.utils.RequestUtils;
...
@@ -20,7 +23,6 @@ import com.zhiwei.crawler.utils.RequestUtils;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
import
okhttp3.Request
;
...
@@ -50,7 +52,7 @@ public class UrlLiveCrawler {
...
@@ -50,7 +52,7 @@ public class UrlLiveCrawler {
counter
.
add
();
counter
.
add
();
if
(
nonNull
(
url
))
{
if
(
nonNull
(
url
))
{
try
{
try
{
ZhiWeiTools
.
sleep
(
1
0
);
// ZhiWeiTools.sleep(300
0);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错:"
,
e
);
logger
.
error
(
"搜索创建出错:"
,
e
);
...
@@ -63,13 +65,23 @@ public class UrlLiveCrawler {
...
@@ -63,13 +65,23 @@ public class UrlLiveCrawler {
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
System
.
out
.
println
(
url
);
//
System.out.println(url);
url
=
dealUrl
(
url
);
url
=
dealUrl
(
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
// Map<String,String> headers = new HashMap<>();
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
Map
<
String
,
String
>
headers
=
HeaderTool
.
getCommonHead
();
if
(
url
.
contains
(
"toutiao.com"
)){
if
(
url
.
contains
(
"www.toutiao.com"
)){
headers
.
put
(
"referer"
,
url
);
headers
.
put
(
"referer"
,
url
);
headers
.
put
(
"cookie"
,
"csrftoken=6d0e5967684dbb57cea14dc12858d263; tt_webid=6763913092738418180; tt_webid=6763913092738418180; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; s_v_web_id=verify_k9wn4wvx_J8Tm9B3v_4KQj_4pYw_B3C5_Bz00jljwk2Ik; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; __utmc=24953151; __utma=24953151.1570444666.1576744435.1589341084.1589355043.4; CNZZDATA1259612802=2091325281-1587691681-%7C1589354688; __ac_nonce=05ec2023000312916dbf0; __ac_signature=YYVItAAgEBDxesof46KjamGESaAAD9LCPu9LY3i693yRwgjuLokObvXcXAHluuEslefdgz60kyPRc1WnihwB4acMsJgn1wYE8IuqB3toZpnIZRexNBULILeZxouOJAtnxO6; __tasessionId=402dor9vo1589772849201; tt_scid=yP.oipZ1w-SChWahT4a7rhJ2gsjG-rJO.4UkyTROzer4MBRJ4bAv7POpDKAcZwzc497f"
);
headers
.
put
(
"accept"
,
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
);
headers
.
put
(
"accept-encoding"
,
"gzip, deflate, br"
);
headers
.
put
(
"accept-language"
,
"zh-CN,zh;q=0.9"
);
headers
.
put
(
"cache-control"
,
"no-cache"
);
headers
.
put
(
"sec-fetch-dest"
,
"document"
);
headers
.
put
(
"sec-fetch-mode"
,
"navigate"
);
headers
.
put
(
"sec-fetch-site"
,
"same-origin"
);
headers
.
put
(
"sec-fetch-user"
,
"?1"
);
headers
.
put
(
"upgrade-insecure-requests"
,
"1"
);
headers
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
);
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
url
=
treatZhihuUrl
(
url
);
url
=
treatZhihuUrl
(
url
);
}
}
...
@@ -77,7 +89,7 @@ public class UrlLiveCrawler {
...
@@ -77,7 +89,7 @@ public class UrlLiveCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
if
(
Objects
.
nonNull
(
request
))
{
if
(
Objects
.
nonNull
(
request
))
{
counter
.
add
();
counter
.
add
();
httpBoot
.
asyncCall
(
RequestUtils
.
wrapGet
(
url
)
,
ProxyHolder
.
NAT_HEAVY_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
try
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
rs
.
isSuccessful
())
{
if
(
rs
.
isSuccessful
())
{
...
@@ -200,6 +212,12 @@ public class UrlLiveCrawler {
...
@@ -200,6 +212,12 @@ public class UrlLiveCrawler {
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"h2"
).
text
();
title
=
doc
.
select
(
"h2"
).
text
();
}
}
// 获取title
Matcher
ma5
=
Pattern
.
compile
(
"var msg_title = \'(.*)\'"
)
.
matcher
(
result
);
if
(
ma5
.
find
())
{
title
=
ma5
.
group
(
1
).
replaceAll
(
" "
,
" "
).
trim
();
}
}
else
if
(
url
.
contains
(
"kuaibao"
)){
}
else
if
(
url
.
contains
(
"kuaibao"
)){
title
=
doc
.
select
(
"p.title"
).
text
().
replaceAll
(
" "
,
""
);
title
=
doc
.
select
(
"p.title"
).
text
().
replaceAll
(
" "
,
""
);
}
else
if
(
url
.
contains
(
"chinadaily.com.cn"
)){
}
else
if
(
url
.
contains
(
"chinadaily.com.cn"
)){
...
@@ -224,8 +242,11 @@ public class UrlLiveCrawler {
...
@@ -224,8 +242,11 @@ public class UrlLiveCrawler {
title
=
"网页已删除"
;
title
=
"网页已删除"
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
JSONObject
resultJson
=
JSONObject
.
parseObject
(
result
);
JSONObject
resultJson
=
JSONObject
.
parseObject
(
result
);
if
(
url
.
contains
(
"/answer/"
))
{
title
=
resultJson
.
getString
(
"title"
)!=
null
?
resultJson
.
getString
(
"title"
):
resultJson
.
getString
(
"message"
);
title
=
resultJson
.
getJSONObject
(
"question"
).
getString
(
"title"
);
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
)
||
url
.
contains
(
"/p/"
))
{
title
=
resultJson
.
getString
(
"title"
);
}
}
else
if
(
url
.
contains
(
"360kuai.com"
)
&&
result
.
contains
(
"您访问的文章走失了"
))
{
}
else
if
(
url
.
contains
(
"360kuai.com"
)
&&
result
.
contains
(
"您访问的文章走失了"
))
{
title
=
String
.
valueOf
(
"404"
);
title
=
String
.
valueOf
(
"404"
);
}
else
if
(
result
.
contains
(
"文章没有找到哦"
)
&&
url
.
contains
(
"yidianzixun.com"
))
{
}
else
if
(
result
.
contains
(
"文章没有找到哦"
)
&&
url
.
contains
(
"yidianzixun.com"
))
{
...
@@ -304,10 +325,4 @@ public class UrlLiveCrawler {
...
@@ -304,10 +325,4 @@ public class UrlLiveCrawler {
return
url
;
return
url
;
}
}
}
}
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
View file @
dad70819
...
@@ -32,7 +32,8 @@ public class MediaSelfSource {
...
@@ -32,7 +32,8 @@ public class MediaSelfSource {
ProxyInit
.
initProxy
();
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"https://www.dcdapp.com/article/6819085953756299789"
);
urlList
.
add
(
"http://iphone.myzaker.com/l.php?l=5ec0d951b15ec0157b6b4e46"
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
for
(
MediaSelfSourceBean
b
:
u
)
{
for
(
MediaSelfSourceBean
b
:
u
)
{
System
.
out
.
println
(
b
.
toString
());
System
.
out
.
println
(
b
.
toString
());
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
dad70819
...
@@ -72,12 +72,13 @@ public class URLLive {
...
@@ -72,12 +72,13 @@ public class URLLive {
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
ProxyInit
.
initProxy
();
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt#!wm_aid=038b8207b444418c845f43e4d2d3a754"
);
urlList
.
add
(
"http://www.toutiao.com/a1665677841741827"
);
// urlList.add("https://mp.weixin.qq.com/s?__biz=MzA3NjgyNTU5Nw==&mid=2247486586&idx=2&sn=419218b3c831b17d2b9bd9a5281ea842&scene=6#wechat_redirect");
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
for
(
UrlLiveBean
b
:
u
)
{
for
(
UrlLiveBean
b
:
u
)
{
System
.
out
.
println
(
b
.
toString
());
System
.
out
.
println
(
b
.
toString
());
}
}
}
}
static
class
UrlLiveCrawlerThread
extends
Thread
{
static
class
UrlLiveCrawlerThread
extends
Thread
{
...
...
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
dad70819
...
@@ -432,8 +432,11 @@ public class MatchSource {
...
@@ -432,8 +432,11 @@ public class MatchSource {
source
=
"快资讯-"
+
source
;
source
=
"快资讯-"
+
source
;
}
}
}
else
if
(
url
.
contains
(
"cj.sina.com.cn"
)
||
url
.
contains
(
"finance.sina.cn"
)
||
}
else
if
(
url
.
contains
(
"cj.sina.com.cn"
)
||
url
.
contains
(
"finance.sina.cn"
)
||
url
.
contains
(
"tech.sina.cn"
)
||
url
.
contains
(
"news.sina.cn"
)){
url
.
contains
(
"tech.sina.cn"
)
||
url
.
contains
(
"news.sina.cn"
)
||
url
.
contains
(
"k.sina.cn"
)
){
source
=
document
.
select
(
"h2.weibo_user"
).
text
();
source
=
document
.
select
(
"h2.weibo_user"
).
text
();
if
(
Objects
.
isNull
(
source
)
||
source
.
length
()
<
1
)
{
source
=
document
.
select
(
"#top_bar > div > div.date-source > a"
).
text
();
}
if
((
Objects
.
isNull
(
source
)
||
source
.
length
()
<
1
)
&&
html
.
contains
(
"<meta name=\"mediaid\""
)){
if
((
Objects
.
isNull
(
source
)
||
source
.
length
()
<
1
)
&&
html
.
contains
(
"<meta name=\"mediaid\""
)){
//新浪科技头条号
//新浪科技头条号
source
=
html
.
split
(
"<meta name=\"mediaid\" content=\""
)[
1
].
split
(
"\""
)[
0
].
trim
();
source
=
html
.
split
(
"<meta name=\"mediaid\" content=\""
)[
1
].
split
(
"\""
)[
0
].
trim
();
...
@@ -453,6 +456,16 @@ public class MatchSource {
...
@@ -453,6 +456,16 @@ public class MatchSource {
if
(
source
!=
null
&&
source
.
length
()>
1
){
if
(
source
!=
null
&&
source
.
length
()>
1
){
source
=
"新浪-"
+
source
;
source
=
"新浪-"
+
source
;
}
}
}
else
if
(
url
.
contains
(
"finance.ifeng.com"
)){
source
=
html
.
split
(
"weMediaName\":\""
)[
1
].
split
(
"\","
)[
0
];
if
(
source
!=
null
&&
source
.
length
()>
1
){
source
=
"大风号-"
+
source
;
}
}
else
if
(
url
.
contains
(
"ihouse.ifeng.com"
)){
source
=
document
.
select
(
"body > section.article > span > a"
).
text
();
if
(
source
!=
null
&&
source
.
length
()>
1
){
source
=
"大风号-"
+
source
;
}
}
else
if
(
url
.
contains
(
"k.sina.cn"
)){
}
else
if
(
url
.
contains
(
"k.sina.cn"
)){
source
=
document
.
select
(
"h2.weibo_user"
).
text
();
source
=
document
.
select
(
"h2.weibo_user"
).
text
();
if
(
source
!=
null
&&
source
.
length
()>
1
){
if
(
source
!=
null
&&
source
.
length
()>
1
){
...
@@ -635,19 +648,6 @@ public class MatchSource {
...
@@ -635,19 +648,6 @@ public class MatchSource {
if
(
StringUtils
.
isNotBlank
(
source
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"创业邦-"
+
source
;
source
=
"创业邦-"
+
source
;
}
}
}
else
if
(
url
.
contains
(
"36kr.com"
)){
source
=
document
.
select
(
"div.info-header-text > a.author-name"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
return
"36氪-"
+
source
;
}
source
=
document
.
select
(
"h4.author-name"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
return
"36氪-"
+
source
;
}
source
=
document
.
select
(
"span.author-nickname"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
return
"36氪-"
+
source
;
}
}
else
if
(
url
.
contains
(
"lianxianjia.com"
)){
}
else
if
(
url
.
contains
(
"lianxianjia.com"
)){
source
=
document
.
select
(
"span.author-name"
).
text
();
source
=
document
.
select
(
"span.author-name"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
...
@@ -760,6 +760,20 @@ public class MatchSource {
...
@@ -760,6 +760,20 @@ public class MatchSource {
source
=
"推酷-"
+
source
;
source
=
"推酷-"
+
source
;
}
}
}
}
if
(
url
.
contains
(
"36kr.com"
)){
source
=
document
.
select
(
"div.info-header-text > a.author-name"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
return
"36氪-"
+
source
;
}
source
=
document
.
select
(
"h4.author-name"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
return
"36氪-"
+
source
;
}
source
=
document
.
select
(
"span.author-nickname"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
return
"36氪-"
+
source
;
}
}
return
source
;
return
source
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
printStackTrace
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment