Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
0c98f43b
Commit
0c98f43b
authored
Dec 05, 2019
by
cwy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
升级版本 和修复快资讯自媒体号获取
parent
7c541080
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
43 additions
and
23 deletions
+43
-23
pom.xml
+2
-2
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+0
-2
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+0
-2
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+3
-4
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+13
-6
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawlerNew.java
+0
-0
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
+9
-2
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
+1
-1
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+13
-2
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+2
-2
No files found.
pom.xml
View file @
0c98f43b
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<version>
0.
1.9
-SNAPSHOT
</version>
<version>
0.
2.1
-SNAPSHOT
</version>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
...
...
@@ -29,7 +29,7 @@
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.5.
2-RELEASE
</version>
<version>
0.5.
5.6-SNAPSHOT
</version>
</dependency>
</dependencies>
...
...
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
View file @
0c98f43b
...
...
@@ -13,7 +13,6 @@ import com.zhiwei.source_forward.bean.ContentBean;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
...
...
@@ -50,7 +49,6 @@ public class ContentCrawler {
for
(
String
url
:
urls
)
{
if
(
url
!=
null
)
{
try
{
ZhiWeiTools
.
sleep
(
10
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
...
...
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
0c98f43b
...
...
@@ -19,7 +19,6 @@ import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
...
...
@@ -68,7 +67,6 @@ public class MediaSelfSourceCrawler {
counter
.
add
();
if
(
url
!=
null
)
{
try
{
ZhiWeiTools
.
sleep
(
10
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
...
...
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
View file @
0c98f43b
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
...
...
@@ -20,8 +21,6 @@ import com.zhiwei.source_forward.util.MatchChannel;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
...
...
@@ -50,7 +49,6 @@ public class SourceForwardCrawler {
if
(
url
!=
null
)
{
try
{
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
ZhiWeiTools
.
sleep
(
10
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
}
...
...
@@ -62,7 +60,8 @@ public class SourceForwardCrawler {
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
SourceForwardDataCallBack
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Map
<
String
,
String
>
headers
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
// Map<String,String> headers = HeaderTool.getCommonHead();
if
(
url
.
contains
(
"www.toutiao.com"
)){
headers
.
put
(
"referer"
,
url
);
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
0c98f43b
...
...
@@ -36,14 +36,14 @@ import okhttp3.Request;
public
class
UrlLiveCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
throwException
(
true
).
build
();
public
GroupSync
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
GroupSync
counter
=
new
GroupSync
();
start
(
counter
,
callback
,
urls
);
return
counter
;
}
private
void
start
(
GroupSync
counter
,
UrlLiveDataCallback
callback
,
String
...
urls
)
{
if
(
nonNull
(
urls
)
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
...
...
@@ -60,11 +60,13 @@ public class UrlLiveCrawler {
}
}
}
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
System
.
out
.
println
(
url
);
url
=
dealUrl
(
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
// Map<String,String> headers = new HashMap<>();
Map
<
String
,
String
>
headers
=
HeaderTool
.
getCommonHead
();
if
(
url
.
contains
(
"www.toutiao.com"
)){
headers
.
put
(
"referer"
,
url
);
...
...
@@ -75,7 +77,7 @@ public class UrlLiveCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
if
(
Objects
.
nonNull
(
request
))
{
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
RequestUtils
.
wrapGet
(
url
)
,
ProxyHolder
.
NAT_HEAVY_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
rs
.
isSuccessful
())
{
...
...
@@ -86,6 +88,7 @@ public class UrlLiveCrawler {
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
}
}
else
{
logger
.
error
(
"e"
,
ex
);
callBack
(
callback
,
attr
,
1
,
"未访问成功"
);
}
}
catch
(
Exception
e
)
{
...
...
@@ -118,7 +121,7 @@ public class UrlLiveCrawler {
private
String
dealUrl
(
String
url
)
{
try
{
if
(
url
.
contains
(
"toutiao.com"
))
{
if
(
url
.
contains
(
"
www.
toutiao.com"
))
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
}
else
{
...
...
@@ -223,6 +226,10 @@ public class UrlLiveCrawler {
JSONObject
resultJson
=
JSONObject
.
parseObject
(
result
);
title
=
resultJson
.
getString
(
"title"
)!=
null
?
resultJson
.
getString
(
"title"
):
resultJson
.
getString
(
"message"
);
}
else
if
(
url
.
contains
(
"360kuai.com"
)
&&
result
.
contains
(
"您访问的文章走失了"
))
{
title
=
String
.
valueOf
(
"404"
);
}
else
if
(
result
.
contains
(
"文章没有找到哦"
)
&&
url
.
contains
(
"yidianzixun.com"
))
{
title
=
"文章未找到"
;
}
//若title 为拿到 用 此方法
...
...
@@ -271,7 +278,7 @@ public class UrlLiveCrawler {
,
"【一点资讯】www.yidianzixun.com"
,
"错误页面"
,
"网站暂停通知"
,
"【快资讯】你的专属资讯平台"
,
"百度新闻——全球最大的中文新闻平台"
,
"以上文章由以下机构判定为不实信息"
,
"该公众号已迁移"
,
"财经网-CAIJING.COM.CN"
,
"蚂蚁资讯"
,
"参数错误"
,
"时尚头条_YOKA时尚网"
,
"该文章已经被删除"
,
"网易"
,
"链接已过期"
,
"找不到页面"
,
"今晚网"
,
"该文章已被删除"
,
"该回答已被删除-知乎"
,
"资源不存在"
);
,
"网易"
,
"链接已过期"
,
"找不到页面"
,
"今晚网"
,
"该文章已被删除"
,
"该回答已被删除-知乎"
,
"资源不存在"
,
"文章未找到"
);
List
<
String
>
cList
=
Arrays
.
asList
(
"提示信息-"
,
"此内容因违规无法查看"
,
"微信公众号不存在"
,
"此内容被投诉且经审核涉嫌侵权,无法查看"
,
"thepageyourequestedwasnotfound"
,
"未知错误"
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawlerNew.java
0 → 100644
View file @
0c98f43b
This diff is collapsed.
Click to expand it.
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
View file @
0c98f43b
...
...
@@ -14,6 +14,13 @@ import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import
com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler
;
import
com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack
;
/**
*
* @ClassName: MediaSelfSource
* @Description: 自媒体号匹配
* @author 0xff
* @date 2019年12月5日 下午4:05:08
*/
public
class
MediaSelfSource
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
MediaSelfSource
.
class
);
...
...
@@ -23,9 +30,9 @@ public class MediaSelfSource {
}
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.
36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
ProxyFactory
.
init
(
"zookeeper://192.168.0.
11:2181?backup=192.168.0.30:2181,192.168.0.35:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"https://
v.qq.com/x/page/g0904sm9wti.html
"
);
urlList
.
add
(
"https://
www.360kuai.com/pc/922e4596800e5ef0a?cota=3&kuai_so=1&sign=360_e39369d1&refer_scene=so_3
"
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
for
(
MediaSelfSourceBean
b
:
u
)
{
System
.
out
.
println
(
b
.
toString
());
...
...
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
View file @
0c98f43b
...
...
@@ -79,7 +79,7 @@ public class SourceForward {
}
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002
);
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://software.it168.com/a2019/0621/6005/000006005693.shtml"
);
List
<
SourceForwardBean
>
da
=
SourceForward
.
getSourceForward
(
urlList
);
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
0c98f43b
package
com
.
zhiwei
.
source_forward
.
run
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
...
...
@@ -9,14 +11,21 @@ import java.util.Map.Entry;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.async.TaskBoot
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.crawler.UrlLiveCrawler
;
import
com.zhiwei.source_forward.crawler.UrlLiveCrawlerNew
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
okhttp3.Request
;
import
okhttp3.Response
;
/**
* @ClassName: URLLive
* @Description: 验证链接是否已删除
...
...
@@ -27,6 +36,8 @@ public class URLLive {
private
static
Logger
logger
=
LogManager
.
getLogger
(
URLLive
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
build
();
/**
* @Title: verificationURLLive
* @author hero
...
...
@@ -73,9 +84,9 @@ public class URLLive {
}
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002
);
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://
weixin.sogou.com/api/share?timestamp=1569677503&signature=qIbwY*nI6KU9tBso4VCd8lYSesxOYgLcHX5tlbqlMR8N6flDHs4LLcFgRw7FjTAOm-VL1HR*9bkHkS0mWu-ZWc0ngS8ZsOYF7bq3mJCtAXbdMD8klA3ZAVBmYq2GVTJu2*fqwGdiiXgkPsBKht7mUN0o-rO8uYoVU6yfvrHHg29Hj1YBH4TG2Jtkz-zMRkQYKDOXTQgexDeAYfmgWMyar1GxXsDGbOjibPJZpqlwY-A=
"
);
urlList
.
add
(
"http://
a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt#!wm_aid=038b8207b444418c845f43e4d2d3a754
"
);
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
for
(
UrlLiveBean
b
:
u
)
{
...
...
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
0c98f43b
...
...
@@ -395,7 +395,7 @@ public class MatchSource {
source
=
document
.
select
(
"p.article-info"
).
select
(
"a"
).
text
().
trim
();
}
if
(
source
.
length
()
<
1
&&
html
.
contains
(
"window.__INITIAL_DATA__ ="
))
{
Matcher
ma
=
Pattern
.
compile
(
"window.__INITIAL_DATA__ =[\\s\\S]+?
\\</script\\>
"
).
matcher
(
html
);
Matcher
ma
=
Pattern
.
compile
(
"window.__INITIAL_DATA__ =[\\s\\S]+?
}};
"
).
matcher
(
html
);
if
(
ma
.
find
())
{
String
result
=
ma
.
group
().
replaceAll
(
"window.__INITIAL_DATA__ =|\\</script\\>|"
,
""
).
trim
();
if
(
result
.
contains
(
"window.autohomePVDDWhiteList"
))
{
...
...
@@ -404,7 +404,7 @@ public class MatchSource {
JSONObject
json
=
JSONObject
.
parseObject
(
result
.
trim
().
substring
(
0
,
result
.
lastIndexOf
(
";"
)));
source
=
json
.
getJSONObject
(
"detail"
).
getString
(
"sec_src"
);
if
(
Objects
.
isNull
(
source
)
||
source
.
length
()
<
1
)
{
source
=
json
.
getJSONObject
(
"detail"
).
getString
(
"src"
);
source
=
json
.
getJSONObject
(
"detail"
).
getString
(
"src"
);
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment