Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
dad70819
Commit
dad70819
authored
May 20, 2020
by
cwy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
自媒体获取修改
parent
391fcd6c
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
67 additions
and
36 deletions
+67
-36
pom.xml
+1
-1
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+3
-3
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+30
-15
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
+2
-1
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+3
-2
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+28
-14
No files found.
pom.xml
View file @
dad70819
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<version>
0.2.
3
-SNAPSHOT
</version>
<version>
0.2.
4
-SNAPSHOT
</version>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
...
...
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
dad70819
...
...
@@ -101,7 +101,7 @@ public class MediaSelfSourceCrawler {
try
{
if
(
Objects
.
isNull
(
ex
))
{
try
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
,
rs
.
request
().
url
().
uri
().
toString
()
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析出错"
,
e
);
}
...
...
@@ -151,12 +151,12 @@ public class MediaSelfSourceCrawler {
* @param callback
*/
private
void
parseHtml
(
String
result
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
MediaSelfSourceDataCallBack
callback
,
String
eUrl
)
{
String
source
=
null
;
String
channel
=
null
;
String
url
=
attr
.
get
().
toString
();
try
{
source
=
MatchSource
.
matchMediaSelfSource
(
url
,
result
);
source
=
MatchSource
.
matchMediaSelfSource
(
url
+
eUrl
,
result
);
logger
.
info
(
url
+
"======="
+
source
);
channel
=
MatchChannel
.
verifyChannel
(
url
);
if
(
channel
==
null
){
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
dad70819
...
...
@@ -3,9 +3,12 @@ package com.zhiwei.source_forward.crawler;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
...
...
@@ -20,7 +23,6 @@ import com.zhiwei.crawler.utils.RequestUtils;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
...
...
@@ -50,7 +52,7 @@ public class UrlLiveCrawler {
counter
.
add
();
if
(
nonNull
(
url
))
{
try
{
ZhiWeiTools
.
sleep
(
1
0
);
// ZhiWeiTools.sleep(300
0);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错:"
,
e
);
...
...
@@ -63,13 +65,23 @@ public class UrlLiveCrawler {
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
System
.
out
.
println
(
url
);
//
System.out.println(url);
url
=
dealUrl
(
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
// Map<String,String> headers = new HashMap<>();
Map
<
String
,
String
>
headers
=
HeaderTool
.
getCommonHead
();
if
(
url
.
contains
(
"www.toutiao.com"
)){
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
if
(
url
.
contains
(
"toutiao.com"
)){
headers
.
put
(
"referer"
,
url
);
headers
.
put
(
"cookie"
,
"csrftoken=6d0e5967684dbb57cea14dc12858d263; tt_webid=6763913092738418180; tt_webid=6763913092738418180; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; s_v_web_id=verify_k9wn4wvx_J8Tm9B3v_4KQj_4pYw_B3C5_Bz00jljwk2Ik; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; __utmc=24953151; __utma=24953151.1570444666.1576744435.1589341084.1589355043.4; CNZZDATA1259612802=2091325281-1587691681-%7C1589354688; __ac_nonce=05ec2023000312916dbf0; __ac_signature=YYVItAAgEBDxesof46KjamGESaAAD9LCPu9LY3i693yRwgjuLokObvXcXAHluuEslefdgz60kyPRc1WnihwB4acMsJgn1wYE8IuqB3toZpnIZRexNBULILeZxouOJAtnxO6; __tasessionId=402dor9vo1589772849201; tt_scid=yP.oipZ1w-SChWahT4a7rhJ2gsjG-rJO.4UkyTROzer4MBRJ4bAv7POpDKAcZwzc497f"
);
headers
.
put
(
"accept"
,
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
);
headers
.
put
(
"accept-encoding"
,
"gzip, deflate, br"
);
headers
.
put
(
"accept-language"
,
"zh-CN,zh;q=0.9"
);
headers
.
put
(
"cache-control"
,
"no-cache"
);
headers
.
put
(
"sec-fetch-dest"
,
"document"
);
headers
.
put
(
"sec-fetch-mode"
,
"navigate"
);
headers
.
put
(
"sec-fetch-site"
,
"same-origin"
);
headers
.
put
(
"sec-fetch-user"
,
"?1"
);
headers
.
put
(
"upgrade-insecure-requests"
,
"1"
);
headers
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
);
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
url
=
treatZhihuUrl
(
url
);
}
...
...
@@ -77,7 +89,7 @@ public class UrlLiveCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
if
(
Objects
.
nonNull
(
request
))
{
counter
.
add
();
httpBoot
.
asyncCall
(
RequestUtils
.
wrapGet
(
url
)
,
ProxyHolder
.
NAT_HEAVY_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
rs
.
isSuccessful
())
{
...
...
@@ -200,6 +212,12 @@ public class UrlLiveCrawler {
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"h2"
).
text
();
}
// 获取title
Matcher
ma5
=
Pattern
.
compile
(
"var msg_title = \'(.*)\'"
)
.
matcher
(
result
);
if
(
ma5
.
find
())
{
title
=
ma5
.
group
(
1
).
replaceAll
(
" "
,
" "
).
trim
();
}
}
else
if
(
url
.
contains
(
"kuaibao"
)){
title
=
doc
.
select
(
"p.title"
).
text
().
replaceAll
(
" "
,
""
);
}
else
if
(
url
.
contains
(
"chinadaily.com.cn"
)){
...
...
@@ -224,8 +242,11 @@ public class UrlLiveCrawler {
title
=
"网页已删除"
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
JSONObject
resultJson
=
JSONObject
.
parseObject
(
result
);
title
=
resultJson
.
getString
(
"title"
)!=
null
?
resultJson
.
getString
(
"title"
):
resultJson
.
getString
(
"message"
);
if
(
url
.
contains
(
"/answer/"
))
{
title
=
resultJson
.
getJSONObject
(
"question"
).
getString
(
"title"
);
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
)
||
url
.
contains
(
"/p/"
))
{
title
=
resultJson
.
getString
(
"title"
);
}
}
else
if
(
url
.
contains
(
"360kuai.com"
)
&&
result
.
contains
(
"您访问的文章走失了"
))
{
title
=
String
.
valueOf
(
"404"
);
}
else
if
(
result
.
contains
(
"文章没有找到哦"
)
&&
url
.
contains
(
"yidianzixun.com"
))
{
...
...
@@ -304,10 +325,4 @@ public class UrlLiveCrawler {
return
url
;
}
}
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
View file @
dad70819
...
...
@@ -32,7 +32,8 @@ public class MediaSelfSource {
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"https://www.dcdapp.com/article/6819085953756299789"
);
urlList
.
add
(
"http://iphone.myzaker.com/l.php?l=5ec0d951b15ec0157b6b4e46"
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
for
(
MediaSelfSourceBean
b
:
u
)
{
System
.
out
.
println
(
b
.
toString
());
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
dad70819
...
...
@@ -72,12 +72,13 @@ public class URLLive {
public
static
void
main
(
String
[]
args
)
{
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt#!wm_aid=038b8207b444418c845f43e4d2d3a754"
);
urlList
.
add
(
"http://www.toutiao.com/a1665677841741827"
);
// urlList.add("https://mp.weixin.qq.com/s?__biz=MzA3NjgyNTU5Nw==&mid=2247486586&idx=2&sn=419218b3c831b17d2b9bd9a5281ea842&scene=6#wechat_redirect");
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
for
(
UrlLiveBean
b
:
u
)
{
System
.
out
.
println
(
b
.
toString
());
}
}
}
static
class
UrlLiveCrawlerThread
extends
Thread
{
...
...
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
dad70819
...
...
@@ -432,8 +432,11 @@ public class MatchSource {
source
=
"快资讯-"
+
source
;
}
}
else
if
(
url
.
contains
(
"cj.sina.com.cn"
)
||
url
.
contains
(
"finance.sina.cn"
)
||
url
.
contains
(
"tech.sina.cn"
)
||
url
.
contains
(
"news.sina.cn"
)){
url
.
contains
(
"tech.sina.cn"
)
||
url
.
contains
(
"news.sina.cn"
)
||
url
.
contains
(
"k.sina.cn"
)
){
source
=
document
.
select
(
"h2.weibo_user"
).
text
();
if
(
Objects
.
isNull
(
source
)
||
source
.
length
()
<
1
)
{
source
=
document
.
select
(
"#top_bar > div > div.date-source > a"
).
text
();
}
if
((
Objects
.
isNull
(
source
)
||
source
.
length
()
<
1
)
&&
html
.
contains
(
"<meta name=\"mediaid\""
)){
//新浪科技头条号
source
=
html
.
split
(
"<meta name=\"mediaid\" content=\""
)[
1
].
split
(
"\""
)[
0
].
trim
();
...
...
@@ -453,6 +456,16 @@ public class MatchSource {
if
(
source
!=
null
&&
source
.
length
()>
1
){
source
=
"新浪-"
+
source
;
}
}
else
if
(
url
.
contains
(
"finance.ifeng.com"
)){
source
=
html
.
split
(
"weMediaName\":\""
)[
1
].
split
(
"\","
)[
0
];
if
(
source
!=
null
&&
source
.
length
()>
1
){
source
=
"大风号-"
+
source
;
}
}
else
if
(
url
.
contains
(
"ihouse.ifeng.com"
)){
source
=
document
.
select
(
"body > section.article > span > a"
).
text
();
if
(
source
!=
null
&&
source
.
length
()>
1
){
source
=
"大风号-"
+
source
;
}
}
else
if
(
url
.
contains
(
"k.sina.cn"
)){
source
=
document
.
select
(
"h2.weibo_user"
).
text
();
if
(
source
!=
null
&&
source
.
length
()>
1
){
...
...
@@ -635,19 +648,6 @@ public class MatchSource {
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"创业邦-"
+
source
;
}
}
else
if
(
url
.
contains
(
"36kr.com"
)){
source
=
document
.
select
(
"div.info-header-text > a.author-name"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
return
"36氪-"
+
source
;
}
source
=
document
.
select
(
"h4.author-name"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
return
"36氪-"
+
source
;
}
source
=
document
.
select
(
"span.author-nickname"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
return
"36氪-"
+
source
;
}
}
else
if
(
url
.
contains
(
"lianxianjia.com"
)){
source
=
document
.
select
(
"span.author-name"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
...
...
@@ -760,6 +760,20 @@ public class MatchSource {
source
=
"推酷-"
+
source
;
}
}
if
(
url
.
contains
(
"36kr.com"
)){
source
=
document
.
select
(
"div.info-header-text > a.author-name"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
return
"36氪-"
+
source
;
}
source
=
document
.
select
(
"h4.author-name"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
return
"36氪-"
+
source
;
}
source
=
document
.
select
(
"span.author-nickname"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
return
"36氪-"
+
source
;
}
}
return
source
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment