Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
0abfbd4a
Commit
0abfbd4a
authored
Apr 02, 2020
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加自媒体匹配
parent
4e02a60f
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
136 additions
and
66 deletions
+136
-66
Log/crawler.log
+0
-0
pom.xml
+3
-3
src/main/java/com/zhiwei/source_forward/config/ProxyConfig.java
+3
-0
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+7
-10
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
+5
-4
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
+2
-3
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+2
-12
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+86
-33
src/main/java/com/zhiwei/source_forward/util/ProxyInit.java
+25
-0
src/main/resources/proxyip.properties
+3
-1
src/test/java/com/zhiwei/source_forward/sourceforward/test/MediaSelfSourceTest.java
+0
-0
No files found.
Log/crawler.log
View file @
0abfbd4a
This source diff could not be displayed because it is too large. You can
view the blob
instead.
pom.xml
View file @
0abfbd4a
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<version>
0.2.
1-SNAPSHOT
</version>
<version>
0.2.
2-SNAPSHOT
</version>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
...
...
@@ -24,12 +24,12 @@
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.1.
3-SNAPSHOT
</version>
<version>
0.1.
6-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.
5.5.6-SNAPSHOT
</version>
<version>
0.
6.1.0-SNAPSHOT
</version>
</dependency>
</dependencies>
...
...
src/main/java/com/zhiwei/source_forward/config/ProxyConfig.java
View file @
0abfbd4a
...
...
@@ -13,7 +13,9 @@ public class ProxyConfig {
conf
.
load
(
is
);
is
.
close
();
registry
=
conf
.
getProperty
(
"registry"
);
proxyid
=
Long
.
valueOf
(
conf
.
getProperty
(
"proxyid"
));
group
=
conf
.
getProperty
(
"group"
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
...
...
@@ -21,6 +23,7 @@ public class ProxyConfig {
public
static
String
registry
;
public
static
Long
proxyid
;
public
static
String
group
;
}
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
0abfbd4a
...
...
@@ -5,6 +5,7 @@ import java.util.List;
import
java.util.Map
;
import
java.util.Objects
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
...
...
@@ -87,12 +88,11 @@ public class MediaSelfSourceCrawler {
* @return
*/
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
url
);
logger
.
info
(
"当前处理 URL: {}"
,
attr
.
get
());
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
if
(
url
.
contains
(
"toutiao.com"
))
{
map
.
put
(
"referer"
,
url
);
}
map
.
put
(
"Connection"
,
"close"
);
url
=
dealUrl
(
url
);
if
(
Objects
.
nonNull
(
url
))
{
Request
request
=
RequestUtils
.
wrapGet
(
url
,
map
);
...
...
@@ -148,7 +148,6 @@ public class MediaSelfSourceCrawler {
/**
*
* @Description 解析文章获取相关数据
* @param response
* @param attr
* @param callback
*/
...
...
@@ -156,12 +155,11 @@ public class MediaSelfSourceCrawler {
MediaSelfSourceDataCallBack
callback
)
{
String
source
=
null
;
String
channel
=
null
;
String
url
=
attr
.
get
().
toString
();
try
{
source
=
MatchSource
.
matchMediaSelfSource
(
attr
.
get
().
toString
(),
result
);
if
(
source
==
null
||
source
.
equals
(
""
)){
source
=
null
;
}
channel
=
MatchChannel
.
verifyChannel
(
attr
.
get
().
toString
());
source
=
MatchSource
.
matchMediaSelfSource
(
url
,
result
);
logger
.
info
(
url
+
"======="
+
source
);
channel
=
MatchChannel
.
verifyChannel
(
url
);
if
(
channel
==
null
){
List
<
Node
>
nodeList
=
Jsoup
.
parse
(
result
).
head
().
childNodes
();
channel
=
MatchChannel
.
matchChannel
(
nodeList
);
...
...
@@ -170,8 +168,7 @@ public class MediaSelfSourceCrawler {
logger
.
error
(
"exception "
,
e
);
source
=
null
;
}
logger
.
info
(
attr
.
get
()+
"=================来源"
+
source
);
MediaSelfSourceBean
msfb
=
new
MediaSelfSourceBean
(
attr
.
get
().
toString
(),
source
,
channel
);
MediaSelfSourceBean
msfb
=
new
MediaSelfSourceBean
(
url
,
source
,
channel
);
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
...
...
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
View file @
0abfbd4a
...
...
@@ -4,11 +4,11 @@ import java.util.ArrayList;
import
java.util.Collections
;
import
java.util.List
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.source_forward.util.ProxyInit
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution
;
import
com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler
;
...
...
@@ -30,9 +30,10 @@ public class MediaSelfSource {
}
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"https://w
ap.peopleapp.com/article/rmh12074926/0"
);
urlList
.
add
(
"https://w
ww.tuicool.com/articles/nIfmu2B"
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
for
(
MediaSelfSourceBean
b
:
u
)
{
System
.
out
.
println
(
b
.
toString
());
...
...
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
View file @
0abfbd4a
...
...
@@ -6,11 +6,10 @@ import java.util.List;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
com.zhiwei.source_forward.util.ProxyInit
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.crawler.SourceForwardCrawler
;
...
...
@@ -79,7 +78,7 @@ public class SourceForward {
}
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002
);
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://software.it168.com/a2019/0621/6005/000006005693.shtml"
);
List
<
SourceForwardBean
>
da
=
SourceForward
.
getSourceForward
(
urlList
);
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
0abfbd4a
package
com
.
zhiwei
.
source_forward
.
run
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
com.zhiwei.source_forward.util.ProxyInit
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.async.TaskBoot
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.crawler.UrlLiveCrawler
;
import
com.zhiwei.source_forward.crawler.UrlLiveCrawlerNew
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
okhttp3.Request
;
import
okhttp3.Response
;
/**
* @ClassName: URLLive
* @Description: 验证链接是否已删除
...
...
@@ -84,7 +74,7 @@ public class URLLive {
}
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002
);
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt#!wm_aid=038b8207b444418c845f43e4d2d3a754"
);
...
...
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
0abfbd4a
...
...
@@ -5,6 +5,7 @@ import java.util.Objects;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -60,7 +61,7 @@ public class MatchSource {
if
(
url
.
contains
(
"thepaper.cn"
)){
//单独处理澎湃数据
source
=
document
.
select
(
"div.news_about"
).
select
(
"p"
).
select
(
"span"
).
text
().
replaceAll
(
".*来源:"
,
""
);
if
(
source
.
length
()
==
0
)
{
if
(
StringUtils
.
isNotBlank
(
source
))
{
source
=
document
.
select
(
"div.news_about"
).
text
().
replaceAll
(
" \\d{4}.*|.*/"
,
""
);
}
}
else
if
(
url
.
contains
(
"sports.eastday.com"
)){
...
...
@@ -372,14 +373,15 @@ public class MatchSource {
}
}
}
else
if
(
url
.
contains
(
"tznew.58.com"
)){
//58
source
=
JSONObject
.
parseObject
(
html
).
getJSONObject
(
"result"
).
getString
(
"author"
);
if
(
source
!=
null
&&
source
.
length
()>
1
){
source
=
"58-"
+
source
;
}
}
else
if
(
url
.
contains
(
"c.m.163.com"
)){
//58
source
=
document
.
select
(
"section.g-article.js-article > div.js-article-inner > div > b"
).
text
();
if
(
StringUtils
.
isBlank
(
source
)){
source
=
document
.
select
(
"div.info > h3"
).
text
();
}
if
(
source
!=
null
&&
source
.
length
()>
1
){
source
=
"网易新闻-"
+
source
;
}
...
...
@@ -445,10 +447,23 @@ public class MatchSource {
if
(
source
!=
null
&&
source
.
length
()>
1
){
source
=
"新浪-"
+
source
;
}
}
else
if
(
url
.
contains
(
"baijiahao.baidu.com"
)){
//百度百家
source
=
document
.
select
(
"p.author-name"
).
first
().
text
().
trim
();
}
else
if
(
url
.
contains
(
"k.sina.cn"
)){
source
=
document
.
select
(
"h2.weibo_user"
).
text
();
if
(
source
!=
null
&&
source
.
length
()>
1
){
source
=
"新浪-"
+
source
;
}
}
else
if
(
url
.
contains
(
"blog.sina.com.cn"
)){
source
=
document
.
select
(
"strong#ownernick"
).
text
();
if
(
source
!=
null
&&
source
.
length
()>
1
){
source
=
"新浪博客-"
+
source
;
}
}
else
if
(
url
.
contains
(
"baijiahao.baidu.com"
)
||
url
.
contains
(
"mbd.baidu.com"
)){
//百度百家
source
=
document
.
select
(
"span.userNameSpan"
).
text
();
if
(
StringUtils
.
isBlank
(
source
)){
source
=
document
.
select
(
"p.author-name:nth-child(1)"
).
text
();
}
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"百度百家-"
+
source
;
}
}
else
if
(
url
.
contains
(
"app.myzaker.com"
)){
...
...
@@ -528,12 +543,12 @@ public class MatchSource {
}
}
else
if
(
url
.
contains
(
"mp.qq.com"
)){
source
=
document
.
select
(
"div#account_top > div.puin_text > div.pname"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"QQ看点-"
+
source
;
}
}
else
if
(
url
.
contains
(
"v.qq.com"
))
{
source
=
document
.
select
(
"span.user_name"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"腾讯视频-"
+
source
;
}
}
else
if
(
url
.
contains
(
"qq.com/"
)){
...
...
@@ -569,137 +584,175 @@ public class MatchSource {
}
else
if
(
url
.
contains
(
"3g.163.com"
)){
source
=
document
.
select
(
"div.info"
).
select
(
"[class=\"source js-source\"]"
)
.
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"网易号-"
+
source
;
}
}
else
if
(
url
.
contains
(
"myzaker.com"
)){
source
=
document
.
select
(
"div.article_header > div > a > span.auther"
)
.
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"zaker-"
+
source
;
}
}
else
if
(
url
.
contains
(
"edushi.com"
)){
source
=
document
.
select
(
"div.eds-name-box > div.eds-name > a > div.name"
)
.
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"今日潮闻-"
+
source
;
}
}
else
if
(
url
.
contains
(
"ijiandao.com"
)){
source
=
document
.
select
(
"div.article-author > span.author-name > a"
)
.
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"爱尖刀-"
+
source
;
}
}
else
if
(
url
.
contains
(
"chuangyejia.com"
)){
source
=
document
.
select
(
"div.article-title > ul.article-author > li:nth-child(1)"
)
.
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"创业家-"
+
source
;
}
}
else
if
(
url
.
contains
(
"kejixun.com"
)){
source
=
document
.
select
(
"div.r-box.r-box-none.clearfix > div > div.big-man > h3 > a"
)
.
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"科技讯-"
+
source
;
}
}
else
if
(
url
.
contains
(
"tmtpost.com"
)){
source
=
document
.
select
(
"article > div.post-info > a"
)
.
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"钛媒体-"
+
source
;
}
}
else
if
(
url
.
contains
(
"cyzone.cn"
)){
source
=
document
.
select
(
"div.article-author-info > div.author-main > div > div.a-word > div.a-name > a"
)
.
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"创业邦-"
+
source
;
}
}
else
if
(
url
.
contains
(
"36kr.com"
)){
source
=
document
.
select
(
"div.info-header-text > a.author-name"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
return
"36氪-"
+
source
;
}
source
=
document
.
select
(
"h4.author-name"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
return
"36氪-"
+
source
;
}
source
=
document
.
select
(
"span.author-nickname"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
return
"36氪-"
+
source
;
}
}
else
if
(
url
.
contains
(
"lianxianjia.com"
)){
source
=
document
.
select
(
"span.author-name"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"连线家-"
+
source
;
}
}
else
if
(
url
.
contains
(
"itouchtv.cn"
)){
source
=
document
.
select
(
"div.index__article-media-20Tg_ > span:nth-child(1)"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"触电新闻-"
+
source
;
}
}
else
if
(
url
.
contains
(
"whb.cn"
)){
source
=
document
.
select
(
"div.yidian-info > span:nth-child(1)"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"文汇APP-"
+
source
;
}
}
else
if
(
url
.
contains
(
"blogchina.com"
)){
source
=
document
.
select
(
"div.meta-top > label.lm_name > span > a"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"博客中国-"
+
source
;
}
}
else
if
(
url
.
contains
(
".iqiyi.com"
))
{
source
=
JSONObject
.
parseObject
(
html
.
split
(
"page-info='"
)[
1
].
split
(
"'"
)[
0
]).
getJSONObject
(
"user"
).
getString
(
"name"
);
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"爱奇艺-"
+
source
;
}
}
else
if
(
url
.
contains
(
"v.youku.com"
))
{
source
=
document
.
select
(
"a.sub-name"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"优酷-"
+
source
;
}
}
else
if
(
url
.
contains
(
"jiemian.com"
))
{
source
=
document
.
select
(
"div.article-info > p > span.author > a"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"界面新闻-"
+
source
;
}
}
else
if
(
url
.
contains
(
"iyiou.com"
))
{
source
=
document
.
select
(
"div#post_author > a"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
))
{
if
(
StringUtils
.
isNotBlank
(
source
))
{
source
=
"亿欧网-"
+
source
;
}
}
else
if
(
url
.
contains
(
"lanjingtmt.com"
))
{
source
=
document
.
select
(
"div.scd-title > a:nth-child(2)"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
))
{
if
(
StringUtils
.
isNotBlank
(
source
))
{
source
=
"蓝鲸-"
+
source
;
}
}
else
if
(
url
.
contains
(
"lanjinger.com"
))
{
if
(
document
.
select
(
"div.content_left > div:nth-child(2) > span"
).
text
().
contains
(
"专栏"
))
{
source
=
document
.
select
(
"a.author_name"
).
text
().
replaceAll
(
".*编辑| "
,
""
);
if
(
source
!=
null
&&
!
source
.
equals
(
""
))
{
if
(
StringUtils
.
isNotBlank
(
source
))
{
source
=
"蓝鲸财经-"
+
source
;
}
}
}
else
if
(
url
.
contains
(
"huxiu.com"
))
{
source
=
document
.
select
(
"div.article__author-info-box > a.article-author-info > span.author-info__username"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
))
{
if
(
StringUtils
.
isNotBlank
(
source
))
{
source
=
"虎嗅-"
+
source
;
}
}
else
if
(
url
.
contains
(
"chuansongme.com"
))
{
source
=
document
.
select
(
"div.rich_media_meta_list > span.rich_media_meta.rich_media_meta_text"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
))
{
if
(
StringUtils
.
isNotBlank
(
source
))
{
source
=
"传送门-"
+
source
;
}
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
))
{
JSONObject
json
=
JSONObject
.
parseObject
(
html
);
source
=
json
.
getJSONObject
(
"data"
).
getJSONObject
(
"_author"
).
getString
(
"author_name"
);
if
(
source
!=
null
&&
!
source
.
equals
(
""
))
{
if
(
StringUtils
.
isNotBlank
(
source
))
{
source
=
"uc-"
+
source
;
}
}
else
if
(
url
.
contains
(
"m.uczzd.cn"
))
{
if
(
html
.
contains
(
"var xissJsonData ="
)){
html
=
html
.
split
(
"var xissJsonData = "
)[
1
].
split
(
"};"
)[
0
]+
"}"
;
source
=
JSONObject
.
parseObject
(
html
).
getString
(
"source_name"
);
}
if
(
StringUtils
.
isNotBlank
(
source
))
{
source
=
"uc-"
+
source
;
}
}
else
if
(
url
.
contains
(
"kd.youth.cn"
))
{
source
=
document
.
select
(
"body > div > div > div.rich_media_meta_list > a"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
))
{
if
(
StringUtils
.
isNotBlank
(
source
))
{
source
=
"中青在线-"
+
source
;
}
}
else
if
(
url
.
contains
(
"zhuanlan.zhihu.com"
))
{
source
=
document
.
select
(
"a.UserLink-link"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
))
{
source
=
"知乎专栏-"
+
source
;
}
}
else
if
(
url
.
contains
(
"wulizixun.com"
))
{
source
=
document
.
select
(
"span.newdetailOrigin"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
))
{
source
=
"唔哩头条-"
+
source
;
}
}
else
if
(
url
.
contains
(
"t.10jqka.com.cn"
)){
source
=
document
.
select
(
"a[class=\"link777 post-author db fl\"]"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
))
{
source
=
"同花顺-"
+
source
;
}
}
else
if
(
url
.
contains
(
"shangyexinzhi.com"
)){
source
=
document
.
select
(
"span.hover-color_change"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
))
{
source
=
"商业新知-"
+
source
;
}
}
else
if
(
url
.
contains
(
"thepaper.cn"
)){
source
=
document
.
select
(
"a> div.name"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"澎湃新闻-"
+
source
;
}
}
else
if
(
url
.
contains
(
"tuicool.com"
)){
source
=
document
.
select
(
"span.from> a"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"推酷-"
+
source
;
}
}
return
source
;
}
catch
(
Exception
e
)
{
...
...
src/main/java/com/zhiwei/source_forward/util/ProxyInit.java
0 → 100644
View file @
0abfbd4a
package
com
.
zhiwei
.
source_forward
.
util
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.source_forward.config.ProxyConfig
;
/**
 * Initializes the crawler proxy from the settings exposed by {@link ProxyConfig}.
 *
 * @author xMx
 * @date 2020-01-06 09:29:04
 */
public class ProxyInit {

	/** Application name reported to the proxy registry. */
	private static final String APP_NAME = "xumiaoxin";

	/**
	 * Initializes the proxy factory with the registry address, application id and
	 * group read from {@link ProxyConfig}.
	 *
	 * @throws NullPointerException if {@code ProxyConfig.proxyid} is {@code null};
	 *         presumably this means the properties file failed to load or has no
	 *         valid {@code proxyid} entry - verify against ProxyConfig
	 */
	public static void initProxy() {
		String address = ProxyConfig.registry;
		// ProxyConfig.proxyid is a boxed Long; unboxing a null value would raise a
		// NullPointerException without any message, so check it explicitly and
		// report which setting is missing. The exception type is unchanged.
		long appId = java.util.Objects.requireNonNull(
				ProxyConfig.proxyid,
				"ProxyConfig.proxyid is null; proxy configuration was not loaded");
		ProxyFactory.init(
				SimpleConfig.builder()
						.registry(address)
						.appName(APP_NAME)
						.appId(appId)
						.group(ProxyConfig.group)
						.build());
	}
}
src/main/resources/proxyip.properties
View file @
0abfbd4a
#registry=zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181
#group=hangzhou
##########################测试地址##############################
registry
=
zookeeper://192.168.0.36:2181
registry
=
zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181
proxyid
=
10000002
group
=
local
\ No newline at end of file
src/test/java/com/zhiwei/source_forward/sourceforward/test/MediaSelfSourceTest.java
View file @
0abfbd4a
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment