Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
7f7e4a1c
Commit
7f7e4a1c
authored
Jun 03, 2020
by
cwy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
头条数据更新使用白名单
parent
ea0833d3
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
13 additions
and
7 deletions
+13
-7
pom.xml
+1
-1
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+5
-1
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+6
-4
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
+1
-1
No files found.
pom.xml
View file @
7f7e4a1c
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<artifactId>
source-forward
</artifactId>
<version>
0.2.
4
-SNAPSHOT
</version>
<version>
0.2.
5
-SNAPSHOT
</version>
<name>
source-forward
</name>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
...
...
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
7f7e4a1c
...
@@ -89,15 +89,19 @@ public class MediaSelfSourceCrawler {
...
@@ -89,15 +89,19 @@ public class MediaSelfSourceCrawler {
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
attr
.
get
());
logger
.
info
(
"当前处理 URL: {}"
,
attr
.
get
());
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
ProxyHolder
ph
=
null
;
if
(
url
.
contains
(
"toutiao.com"
))
{
if
(
url
.
contains
(
"toutiao.com"
))
{
map
.
put
(
"referer"
,
url
);
map
.
put
(
"referer"
,
url
);
ph
=
ProxyHolder
.
SOUGOU_OUTER_PROXY
;
}
else
{
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
}
}
url
=
dealUrl
(
url
);
url
=
dealUrl
(
url
);
if
(
Objects
.
nonNull
(
url
))
{
if
(
Objects
.
nonNull
(
url
))
{
Request
request
=
RequestUtils
.
wrapGet
(
url
,
map
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
map
);
counter
.
add
();
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
ph
).
whenComplete
((
rs
,
ex
)
->
{
try
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
Objects
.
isNull
(
ex
))
{
try
{
try
{
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
7f7e4a1c
...
@@ -52,7 +52,6 @@ public class UrlLiveCrawler {
...
@@ -52,7 +52,6 @@ public class UrlLiveCrawler {
counter
.
add
();
counter
.
add
();
if
(
nonNull
(
url
))
{
if
(
nonNull
(
url
))
{
try
{
try
{
// ZhiWeiTools.sleep(3000);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错:"
,
e
);
logger
.
error
(
"搜索创建出错:"
,
e
);
...
@@ -69,9 +68,10 @@ public class UrlLiveCrawler {
...
@@ -69,9 +68,10 @@ public class UrlLiveCrawler {
url
=
dealUrl
(
url
);
url
=
dealUrl
(
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
ProxyHolder
ph
=
null
;
if
(
url
.
contains
(
"toutiao.com"
)){
if
(
url
.
contains
(
"toutiao.com"
)){
headers
.
put
(
"referer"
,
url
);
headers
.
put
(
"referer"
,
url
);
headers
.
put
(
"cookie"
,
"csrftoken=6d0e5967684dbb57cea14dc12858d263; tt_webid=6763913092738418180; tt_webid=6763913092738418180; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; s_v_web_id=verify_k9wn4wvx_J8Tm9B3v_4KQj_4pYw_B3C5_Bz00jljwk2Ik; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; __utmc=24953151; __utma=24953151.1570444666.1576744435.1589341084.1589355043.4; CNZZDATA1259612802=2091325281-1587691681-%7C1589354688; __ac_nonce=05ec2023000312916dbf0; __ac_signature=YYVItAAgEBDxesof46KjamGESaAAD9LCPu9LY3i693yRwgjuLokObvXcXAHluuEslefdgz60kyPRc1WnihwB4acMsJgn1wYE8IuqB3toZpnIZRexNBULILeZxouOJAtnxO6; __tasessionId=402dor9vo1589772849201; tt_scid=yP.oipZ1w-SChWahT4a7rhJ2gsjG-rJO.4UkyTROzer4MBRJ4bAv7POpDKAcZwzc497f
"
);
// headers.put("cookie", "__ac_nonce=05ed0c7bb00bc34aa36be; __ac_signature=0fFbMAAgEBBBDtmbXG3W-tHxWiAAI8q; ttcid=cfbee5ddf00b4013b5236b534c8cf36c19; tt_webid=6832180195202909704; s_v_web_id=verify_kary2om5_954yc9QS_twaQ_42XG_9Sei_dsAVEudiEodo; __tasessionId=4bmcvzruo1590740924839; tt_webid=6832180195202909704; SLARDAR_WEB_ID=fb4d8abf-bdd7-4e9e-ba38-8c00f0c13846; csrftoken=6430b380cc664479dfa0b0e5061b2db9; tt_scid=kRdSxPldqsXGPvYrxh3K4HZ5ayX0isXRzk08ZTjlIGmNW3HaSLrhBfHJ.CRjNom.b0fe
");
headers
.
put
(
"accept"
,
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
);
headers
.
put
(
"accept"
,
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
);
headers
.
put
(
"accept-encoding"
,
"gzip, deflate, br"
);
headers
.
put
(
"accept-encoding"
,
"gzip, deflate, br"
);
headers
.
put
(
"accept-language"
,
"zh-CN,zh;q=0.9"
);
headers
.
put
(
"accept-language"
,
"zh-CN,zh;q=0.9"
);
...
@@ -81,15 +81,17 @@ public class UrlLiveCrawler {
...
@@ -81,15 +81,17 @@ public class UrlLiveCrawler {
headers
.
put
(
"sec-fetch-site"
,
"same-origin"
);
headers
.
put
(
"sec-fetch-site"
,
"same-origin"
);
headers
.
put
(
"sec-fetch-user"
,
"?1"
);
headers
.
put
(
"sec-fetch-user"
,
"?1"
);
headers
.
put
(
"upgrade-insecure-requests"
,
"1"
);
headers
.
put
(
"upgrade-insecure-requests"
,
"1"
);
headers
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
);
headers
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
);
ph
=
ProxyHolder
.
SOUGOU_OUTER_PROXY
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
url
=
treatZhihuUrl
(
url
);
url
=
treatZhihuUrl
(
url
);
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
}
}
try
{
try
{
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
if
(
Objects
.
nonNull
(
request
))
{
if
(
Objects
.
nonNull
(
request
))
{
counter
.
add
();
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
ph
).
whenComplete
((
rs
,
ex
)
->
{
try
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
rs
.
isSuccessful
())
{
if
(
rs
.
isSuccessful
())
{
...
...
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
View file @
7f7e4a1c
...
@@ -32,7 +32,7 @@ public class MediaSelfSource {
...
@@ -32,7 +32,7 @@ public class MediaSelfSource {
ProxyInit
.
initProxy
();
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"https://new.qq.com/
rain/a/20200511A0LUU600
"
);
urlList
.
add
(
"https://new.qq.com/
omn/20200507/20200507A0Q9JV00.html
"
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
for
(
MediaSelfSourceBean
b
:
u
)
{
for
(
MediaSelfSourceBean
b
:
u
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment