Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
0abfbd4a
Commit
0abfbd4a
authored
Apr 02, 2020
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加自媒体匹配
parent
4e02a60f
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
50 additions
and
33 deletions
+50
-33
Log/crawler.log
+0
-0
pom.xml
+3
-3
src/main/java/com/zhiwei/source_forward/config/ProxyConfig.java
+3
-0
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+7
-10
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
+5
-4
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
+2
-3
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+2
-12
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+0
-0
src/main/java/com/zhiwei/source_forward/util/ProxyInit.java
+25
-0
src/main/resources/proxyip.properties
+3
-1
src/test/java/com/zhiwei/source_forward/sourceforward/test/MediaSelfSourceTest.java
+0
-0
No files found.
Log/crawler.log
View file @
0abfbd4a
This source diff could not be displayed because it is too large. You can
view the blob
instead.
pom.xml
View file @
0abfbd4a
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<version>
0.2.
1-SNAPSHOT
</version>
<version>
0.2.
2-SNAPSHOT
</version>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
...
...
@@ -24,12 +24,12 @@
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.1.
3-SNAPSHOT
</version>
<version>
0.1.
6-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.
5.5.6-SNAPSHOT
</version>
<version>
0.
6.1.0-SNAPSHOT
</version>
</dependency>
</dependencies>
...
...
src/main/java/com/zhiwei/source_forward/config/ProxyConfig.java
View file @
0abfbd4a
...
...
@@ -13,7 +13,9 @@ public class ProxyConfig {
conf
.
load
(
is
);
is
.
close
();
registry
=
conf
.
getProperty
(
"registry"
);
proxyid
=
Long
.
valueOf
(
conf
.
getProperty
(
"proxyid"
));
group
=
conf
.
getProperty
(
"group"
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
...
...
@@ -21,6 +23,7 @@ public class ProxyConfig {
public
static
String
registry
;
public
static
Long
proxyid
;
public
static
String
group
;
}
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
0abfbd4a
...
...
@@ -5,6 +5,7 @@ import java.util.List;
import
java.util.Map
;
import
java.util.Objects
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
...
...
@@ -87,12 +88,11 @@ public class MediaSelfSourceCrawler {
* @return
*/
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
url
);
logger
.
info
(
"当前处理 URL: {}"
,
attr
.
get
());
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
if
(
url
.
contains
(
"toutiao.com"
))
{
map
.
put
(
"referer"
,
url
);
}
map
.
put
(
"Connection"
,
"close"
);
url
=
dealUrl
(
url
);
if
(
Objects
.
nonNull
(
url
))
{
Request
request
=
RequestUtils
.
wrapGet
(
url
,
map
);
...
...
@@ -148,7 +148,6 @@ public class MediaSelfSourceCrawler {
/**
*
* @Description 解析文章获取相关数据
* @param response
* @param attr
* @param callback
*/
...
...
@@ -156,12 +155,11 @@ public class MediaSelfSourceCrawler {
MediaSelfSourceDataCallBack
callback
)
{
String
source
=
null
;
String
channel
=
null
;
String
url
=
attr
.
get
().
toString
();
try
{
source
=
MatchSource
.
matchMediaSelfSource
(
attr
.
get
().
toString
(),
result
);
if
(
source
==
null
||
source
.
equals
(
""
)){
source
=
null
;
}
channel
=
MatchChannel
.
verifyChannel
(
attr
.
get
().
toString
());
source
=
MatchSource
.
matchMediaSelfSource
(
url
,
result
);
logger
.
info
(
url
+
"======="
+
source
);
channel
=
MatchChannel
.
verifyChannel
(
url
);
if
(
channel
==
null
){
List
<
Node
>
nodeList
=
Jsoup
.
parse
(
result
).
head
().
childNodes
();
channel
=
MatchChannel
.
matchChannel
(
nodeList
);
...
...
@@ -170,8 +168,7 @@ public class MediaSelfSourceCrawler {
logger
.
error
(
"exception "
,
e
);
source
=
null
;
}
logger
.
info
(
attr
.
get
()+
"=================来源"
+
source
);
MediaSelfSourceBean
msfb
=
new
MediaSelfSourceBean
(
attr
.
get
().
toString
(),
source
,
channel
);
MediaSelfSourceBean
msfb
=
new
MediaSelfSourceBean
(
url
,
source
,
channel
);
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
...
...
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
View file @
0abfbd4a
...
...
@@ -4,11 +4,11 @@ import java.util.ArrayList;
import
java.util.Collections
;
import
java.util.List
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.source_forward.util.ProxyInit
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution
;
import
com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler
;
...
...
@@ -30,9 +30,10 @@ public class MediaSelfSource {
}
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"https://w
ap.peopleapp.com/article/rmh12074926/0"
);
urlList
.
add
(
"https://w
ww.tuicool.com/articles/nIfmu2B"
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
for
(
MediaSelfSourceBean
b
:
u
)
{
System
.
out
.
println
(
b
.
toString
());
...
...
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
View file @
0abfbd4a
...
...
@@ -6,11 +6,10 @@ import java.util.List;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
com.zhiwei.source_forward.util.ProxyInit
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.crawler.SourceForwardCrawler
;
...
...
@@ -79,7 +78,7 @@ public class SourceForward {
}
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002
);
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://software.it168.com/a2019/0621/6005/000006005693.shtml"
);
List
<
SourceForwardBean
>
da
=
SourceForward
.
getSourceForward
(
urlList
);
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
0abfbd4a
package
com
.
zhiwei
.
source_forward
.
run
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
com.zhiwei.source_forward.util.ProxyInit
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.async.TaskBoot
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.crawler.UrlLiveCrawler
;
import
com.zhiwei.source_forward.crawler.UrlLiveCrawlerNew
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
okhttp3.Request
;
import
okhttp3.Response
;
/**
* @ClassName: URLLive
* @Description: 验证链接是否已删除
...
...
@@ -84,7 +74,7 @@ public class URLLive {
}
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002
);
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt#!wm_aid=038b8207b444418c845f43e4d2d3a754"
);
...
...
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
0abfbd4a
This diff is collapsed.
Click to expand it.
src/main/java/com/zhiwei/source_forward/util/ProxyInit.java
0 → 100644
View file @
0abfbd4a
package
com
.
zhiwei
.
source_forward
.
util
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.source_forward.config.ProxyConfig
;
/**
 * Initializes the crawler proxy using the settings exposed by {@link ProxyConfig}.
 *
 * @author xMx
 * @date 2020-01-06 09:29:04
 */
public class ProxyInit {

    /**
     * Initializes the proxy.
     *
     * <p>Builds a {@code SimpleConfig} from {@code ProxyConfig.registry},
     * {@code ProxyConfig.proxyid} and {@code ProxyConfig.group}, together with a
     * fixed application name, and passes it to {@code ProxyFactory.init}.
     *
     * @throws NullPointerException if {@code ProxyConfig.proxyid} is {@code null}
     */
    public static void initProxy() {
        // ProxyConfig.proxyid is a boxed Long; unboxing a null value would raise a
        // NullPointerException without any message. Check explicitly so the failure
        // names the missing setting (same exception type as before).
        Long configuredId = ProxyConfig.proxyid;
        if (configuredId == null) {
            throw new NullPointerException(
                    "ProxyConfig.proxyid is null; check that the 'proxyid' property was loaded");
        }

        String address = ProxyConfig.registry;
        String appName = "xumiaoxin";
        long appId = configuredId;

        ProxyFactory.init(
                SimpleConfig.builder()
                        .registry(address)
                        .appName(appName)
                        .appId(appId)
                        .group(ProxyConfig.group)
                        .build());
    }
}
src/main/resources/proxyip.properties
View file @
0abfbd4a
#registry=zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181
#group=hangzhou
##########################测试地址##############################
registry
=
zookeeper://192.168.0.36:2181
registry
=
zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181
proxyid
=
10000002
group
=
local
\ No newline at end of file
src/test/java/com/zhiwei/source_forward/sourceforward/test/MediaSelfSourceTest.java
View file @
0abfbd4a
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment