Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
7f4a87a2
Commit
7f4a87a2
authored
Mar 19, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改 crawler-core 提升版本
parent
1b20782c
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
106 additions
and
125 deletions
+106
-125
pom.xml
+3
-3
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+25
-32
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+19
-27
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+18
-30
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+17
-32
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
+1
-1
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+23
-0
No files found.
pom.xml
View file @
7f4a87a2
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<artifactId>
source-forward
</artifactId>
<version>
0.1.
1
-SNAPSHOT
</version>
<version>
0.1.
3
-SNAPSHOT
</version>
<name>
source-forward
</name>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
...
@@ -24,12 +24,12 @@
...
@@ -24,12 +24,12 @@
<dependency>
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.1.
1
-SNAPSHOT
</version>
<version>
0.1.
2
-SNAPSHOT
</version>
</dependency>
</dependency>
<dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<artifactId>
crawler-core
</artifactId>
<version>
0.
1.1
-RELEASE
</version>
<version>
0.
3.0
-RELEASE
</version>
<scope>
provided
</scope>
<scope>
provided
</scope>
</dependency>
</dependency>
</dependencies>
</dependencies>
...
...
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
View file @
7f4a87a2
package
com
.
zhiwei
.
source_forward
.
crawler
;
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.
concurrent.TimeUnit
;
import
java.util.
Objects
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.
crawler.async.MultiThreadingCounter
;
import
com.zhiwei.
async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
public
class
ContentCrawler
{
public
class
ContentCrawler
{
...
@@ -30,10 +29,10 @@ public class ContentCrawler {
...
@@ -30,10 +29,10 @@ public class ContentCrawler {
* @return
* @return
* @throws Exception
* @throws Exception
*/
*/
public
MultiThreadingCounter
submitTask
(
ContentDataCallback
callback
,
public
GroupSync
submitTask
(
ContentDataCallback
callback
,
String
...
urls
)
{
String
...
urls
)
{
try
{
try
{
MultiThreadingCounter
counter
=
new
MultiThreadingCounter
(
15
,
TimeUnit
.
MINUTES
,
false
);
GroupSync
counter
=
new
GroupSync
(
);
start
(
counter
,
callback
,
urls
);
start
(
counter
,
callback
,
urls
);
return
counter
;
return
counter
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -49,7 +48,7 @@ public class ContentCrawler {
...
@@ -49,7 +48,7 @@ public class ContentCrawler {
* @param callback
* @param callback
* @param urls
* @param urls
*/
*/
private
void
start
(
MultiThreadingCounter
counter
,
private
void
start
(
GroupSync
counter
,
ContentDataCallback
callback
,
String
...
urls
)
{
ContentDataCallback
callback
,
String
...
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
for
(
String
url
:
urls
)
{
...
@@ -73,23 +72,25 @@ public class ContentCrawler {
...
@@ -73,23 +72,25 @@ public class ContentCrawler {
* @param callback
* @param callback
* @return
* @return
*/
*/
private
MultiThreadingCounter
search
(
MultiThreadingCounter
counter
,
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
ContentDataCallback
callback
)
{
String
url
,
Attribution
attr
,
ContentDataCallback
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
);
counter
.
increase
();
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
,
true
).
addListener
(
future
->
{
if
(
future
.
isSuccess
())
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
Response
response
=
future
.
result
();
try
{
try
{
parseHtml
(
response
,
attr
,
callback
);
if
(
Objects
.
isNull
(
ex
))
{
}
catch
(
Exception
e
)
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
logger
.
error
(
"解析出错"
,
e
);
}
}
else
{
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
future
.
cause
().
getMessage
());
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
}
}
catch
(
Exception
e
)
{
logger
.
info
(
"搜索结果访问失败: {}"
,
ex
);
}
finally
{
counter
.
done
();
}
}
counter
.
reduce
();
});
});
return
counter
;
return
counter
;
...
@@ -103,28 +104,20 @@ public class ContentCrawler {
...
@@ -103,28 +104,20 @@ public class ContentCrawler {
* @param attr
* @param attr
* @param callback
* @param callback
*/
*/
private
void
parseHtml
(
Response
response
,
Attribution
attr
,
private
void
parseHtml
(
String
result
,
Attribution
attr
,
ContentDataCallback
callback
)
{
ContentDataCallback
callback
)
{
String
content
=
null
;
try
{
try
{
if
(
response
.
isSuccessful
())
{
String
content
=
MatchContent
.
matchContent
(
attr
.
get
().
toString
(),
String
html
=
response
.
body
().
string
();
result
);
content
=
MatchContent
.
matchContent
(
attr
.
get
().
toString
(),
html
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"网页链接失效"
,
e
);
}
finally
{
if
(
response
!=
null
)
{
response
.
close
();
}
}
ContentBean
cb
=
new
ContentBean
(
attr
.
get
().
toString
(),
content
);
ContentBean
cb
=
new
ContentBean
(
attr
.
get
().
toString
(),
content
);
if
(
callback
==
null
)
{
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
}
else
{
callback
.
onData
(
cb
,
attr
);
callback
.
onData
(
cb
,
attr
);
}
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"网页链接失效"
,
e
);
}
}
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
7f4a87a2
...
@@ -3,17 +3,17 @@ package com.zhiwei.source_forward.crawler;
...
@@ -3,17 +3,17 @@ package com.zhiwei.source_forward.crawler;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.
concurrent.TimeUnit
;
import
java.util.
Objects
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Node
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.
crawler.async.MultiThreadingCounter
;
import
com.zhiwei.
async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchChannel
;
...
@@ -21,7 +21,6 @@ import com.zhiwei.source_forward.util.MatchSource;
...
@@ -21,7 +21,6 @@ import com.zhiwei.source_forward.util.MatchSource;
import
com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack
;
import
com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
/**
/**
*
*
...
@@ -44,9 +43,9 @@ public class MediaSelfSourceCrawler {
...
@@ -44,9 +43,9 @@ public class MediaSelfSourceCrawler {
* @return
* @return
* @throws Exception
* @throws Exception
*/
*/
public
MultiThreadingCounter
submitTask
(
MediaSelfSourceDataCallBack
callback
,
String
...
urls
)
{
public
GroupSync
submitTask
(
MediaSelfSourceDataCallBack
callback
,
String
...
urls
)
{
try
{
try
{
MultiThreadingCounter
counter
=
new
MultiThreadingCounter
(
"任务======= "
,
15
,
TimeUnit
.
MINUTES
,
true
);
GroupSync
counter
=
new
GroupSync
(
);
start
(
counter
,
callback
,
urls
);
start
(
counter
,
callback
,
urls
);
return
counter
;
return
counter
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -62,10 +61,10 @@ public class MediaSelfSourceCrawler {
...
@@ -62,10 +61,10 @@ public class MediaSelfSourceCrawler {
* @param callback
* @param callback
* @param urls
* @param urls
*/
*/
private
void
start
(
MultiThreadingCounter
counter
,
MediaSelfSourceDataCallBack
callback
,
String
...
urls
)
{
private
void
start
(
GroupSync
counter
,
MediaSelfSourceDataCallBack
callback
,
String
...
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
for
(
String
url
:
urls
)
{
counter
.
increase
();
counter
.
add
();
if
(
url
!=
null
)
{
if
(
url
!=
null
)
{
try
{
try
{
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
...
@@ -73,7 +72,7 @@ public class MediaSelfSourceCrawler {
...
@@ -73,7 +72,7 @@ public class MediaSelfSourceCrawler {
logger
.
error
(
"搜索创建出错"
,
e
);
logger
.
error
(
"搜索创建出错"
,
e
);
}
}
}
}
counter
.
reduc
e
();
counter
.
don
e
();
}
}
}
}
}
}
...
@@ -87,28 +86,28 @@ public class MediaSelfSourceCrawler {
...
@@ -87,28 +86,28 @@ public class MediaSelfSourceCrawler {
* @param callback
* @param callback
* @return
* @return
*/
*/
private
MultiThreadingCounter
search
(
MultiThreadingCounter
counter
,
String
url
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
if
(
url
.
contains
(
"toutiao.com"
))
{
if
(
url
.
contains
(
"toutiao.com"
))
{
map
.
put
(
"referer"
,
url
);
map
.
put
(
"referer"
,
url
);
}
}
Request
request
=
RequestUtils
.
wrapGet
(
url
,
map
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
map
);
counter
.
increase
();
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
,
true
).
addListener
(
future
->
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
try
{
try
{
if
(
future
.
isSuccess
())
{
if
(
Objects
.
isNull
(
ex
))
{
Response
response
=
future
.
result
();
try
{
try
{
parseHtml
(
r
esponse
,
attr
,
callback
);
parseHtml
(
r
s
.
body
().
string
()
,
attr
,
callback
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析出错"
,
e
);
logger
.
error
(
"解析出错"
,
e
);
}
}
}
else
{
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
future
.
cause
().
getMessage
()
);
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
}
}
}
finally
{
}
finally
{
counter
.
reduc
e
();
counter
.
don
e
();
}
}
});
});
return
counter
;
return
counter
;
...
@@ -121,30 +120,23 @@ public class MediaSelfSourceCrawler {
...
@@ -121,30 +120,23 @@ public class MediaSelfSourceCrawler {
* @param attr
* @param attr
* @param callback
* @param callback
*/
*/
private
void
parseHtml
(
Response
response
,
Attribution
attr
,
private
void
parseHtml
(
String
result
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
MediaSelfSourceDataCallBack
callback
)
{
String
source
=
null
;
String
source
=
null
;
String
channel
=
null
;
String
channel
=
null
;
try
{
try
{
if
(
response
.
isSuccessful
()){
source
=
MatchSource
.
matchMediaSelfSource
(
attr
.
get
().
toString
(),
result
);
String
html
=
response
.
body
().
string
();
source
=
MatchSource
.
matchMediaSelfSource
(
attr
.
get
().
toString
(),
html
);
if
(
source
==
null
||
source
.
equals
(
""
)){
if
(
source
==
null
||
source
.
equals
(
""
)){
source
=
null
;
source
=
null
;
}
}
channel
=
MatchChannel
.
verifyChannel
(
attr
.
get
().
toString
());
channel
=
MatchChannel
.
verifyChannel
(
attr
.
get
().
toString
());
if
(
channel
==
null
){
if
(
channel
==
null
){
List
<
Node
>
nodeList
=
Jsoup
.
parse
(
html
).
head
().
childNodes
();
List
<
Node
>
nodeList
=
Jsoup
.
parse
(
result
).
head
().
childNodes
();
channel
=
MatchChannel
.
matchChannel
(
nodeList
);
channel
=
MatchChannel
.
matchChannel
(
nodeList
);
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"exception "
,
e
);
logger
.
error
(
"exception "
,
e
);
source
=
null
;
source
=
null
;
}
finally
{
if
(
response
!=
null
)
{
response
.
close
();
}
}
}
logger
.
info
(
attr
.
get
()+
"================="
+
source
);
logger
.
info
(
attr
.
get
()+
"================="
+
source
);
MediaSelfSourceBean
msfb
=
new
MediaSelfSourceBean
(
attr
.
get
().
toString
(),
source
,
channel
);
MediaSelfSourceBean
msfb
=
new
MediaSelfSourceBean
(
attr
.
get
().
toString
(),
source
,
channel
);
...
...
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
View file @
7f4a87a2
...
@@ -2,7 +2,7 @@ package com.zhiwei.source_forward.crawler;
...
@@ -2,7 +2,7 @@ package com.zhiwei.source_forward.crawler;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.
concurrent.TimeUnit
;
import
java.util.
Objects
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
...
@@ -10,10 +10,10 @@ import org.jsoup.Jsoup;
...
@@ -10,10 +10,10 @@ import org.jsoup.Jsoup;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Node
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.
crawler.async.MultiThreadingCounter
;
import
com.zhiwei.
async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchChannel
;
...
@@ -23,7 +23,6 @@ import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
...
@@ -23,7 +23,6 @@ import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
public
class
SourceForwardCrawler
{
public
class
SourceForwardCrawler
{
...
@@ -32,9 +31,9 @@ public class SourceForwardCrawler {
...
@@ -32,9 +31,9 @@ public class SourceForwardCrawler {
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
List
<
String
>
sourceList
=
SourceData
.
getSourceList
();
private
static
List
<
String
>
sourceList
=
SourceData
.
getSourceList
();
public
MultiThreadingCounter
submitTask
(
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
public
GroupSync
submitTask
(
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
try
{
try
{
MultiThreadingCounter
counter
=
new
MultiThreadingCounter
(
15
,
TimeUnit
.
MINUTES
,
false
);
GroupSync
counter
=
new
GroupSync
(
);
start
(
counter
,
callback
,
urls
);
start
(
counter
,
callback
,
urls
);
return
counter
;
return
counter
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -43,10 +42,10 @@ public class SourceForwardCrawler {
...
@@ -43,10 +42,10 @@ public class SourceForwardCrawler {
}
}
}
}
private
void
start
(
MultiThreadingCounter
counter
,
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
private
void
start
(
GroupSync
counter
,
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
for
(
String
url
:
urls
)
{
counter
.
increase
();
counter
.
add
();
if
(
url
!=
null
)
{
if
(
url
!=
null
)
{
try
{
try
{
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
...
@@ -54,12 +53,12 @@ public class SourceForwardCrawler {
...
@@ -54,12 +53,12 @@ public class SourceForwardCrawler {
logger
.
error
(
"搜索创建出错"
,
e
);
logger
.
error
(
"搜索创建出错"
,
e
);
}
}
}
}
counter
.
reduc
e
();
counter
.
don
e
();
}
}
}
}
}
}
private
MultiThreadingCounter
search
(
MultiThreadingCounter
counter
,
String
url
,
Attribution
attr
,
SourceForwardDataCallBack
callback
)
{
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
SourceForwardDataCallBack
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Map
<
String
,
String
>
headers
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headers
=
HeaderTool
.
getCommonHead
();
if
(
url
.
contains
(
"www.toutiao.com"
)){
if
(
url
.
contains
(
"www.toutiao.com"
)){
...
@@ -67,34 +66,29 @@ public class SourceForwardCrawler {
...
@@ -67,34 +66,29 @@ public class SourceForwardCrawler {
}
}
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
counter
.
increase
();
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
,
true
).
addListener
(
future
->
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
try
{
try
{
if
(
future
.
isSuccess
())
{
if
(
Objects
.
isNull
(
ex
))
{
Response
response
=
future
.
result
();
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
try
{
parseHtml
(
response
,
attr
,
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析出错"
,
e
);
}
}
else
{
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
future
.
cause
().
getMessage
()
);
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
}
}
}
catch
(
Exception
e1
)
{
logger
.
error
(
"解析出错"
,
e1
);
}
finally
{
}
finally
{
counter
.
reduc
e
();
counter
.
don
e
();
}
}
});
});
return
counter
;
return
counter
;
}
}
private
void
parseHtml
(
Response
response
,
Attribution
attr
,
private
void
parseHtml
(
String
body
,
Attribution
attr
,
SourceForwardDataCallBack
callback
)
{
SourceForwardDataCallBack
callback
)
{
String
source
=
null
;
String
source
=
null
;
String
channel
=
"新闻"
;
String
channel
=
"新闻"
;
String
isforward
=
"未知"
;
String
isforward
=
"未知"
;
try
{
try
{
if
(
response
.
isSuccessful
()){
String
body
=
response
.
body
().
string
();
Document
document
=
Jsoup
.
parse
(
body
);
Document
document
=
Jsoup
.
parse
(
body
);
if
(
attr
.
get
().
toString
().
contains
(
"mp.weixin.qq.com"
)){
if
(
attr
.
get
().
toString
().
contains
(
"mp.weixin.qq.com"
)){
isforward
=
document
.
select
(
"div#meta_content"
).
select
(
"span#copyright_logo"
).
text
();
isforward
=
document
.
select
(
"div#meta_content"
).
select
(
"span#copyright_logo"
).
text
();
...
@@ -115,15 +109,9 @@ public class SourceForwardCrawler {
...
@@ -115,15 +109,9 @@ public class SourceForwardCrawler {
}
}
source
=
MatchSource
.
matchSource
(
attr
.
get
().
toString
(),
document
.
toString
(),
sourceList
);
source
=
MatchSource
.
matchSource
(
attr
.
get
().
toString
(),
document
.
toString
(),
sourceList
);
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
source
=
null
;
source
=
null
;
channel
=
"新闻"
;
channel
=
"新闻"
;
}
finally
{
if
(
response
!=
null
)
{
response
.
close
();
}
}
}
logger
.
info
(
attr
.
get
().
toString
()+
"======="
+
channel
+
"================="
+
source
);
logger
.
info
(
attr
.
get
().
toString
()+
"======="
+
channel
+
"================="
+
source
);
SourceForwardBean
sfb
=
new
SourceForwardBean
(
attr
.
get
().
toString
(),
channel
,
source
,
isforward
);
SourceForwardBean
sfb
=
new
SourceForwardBean
(
attr
.
get
().
toString
(),
channel
,
source
,
isforward
);
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
7f4a87a2
...
@@ -6,7 +6,6 @@ import java.util.Arrays;
...
@@ -6,7 +6,6 @@ import java.util.Arrays;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.Objects
;
import
java.util.concurrent.TimeUnit
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
...
@@ -14,17 +13,16 @@ import org.jsoup.Jsoup;
...
@@ -14,17 +13,16 @@ import org.jsoup.Jsoup;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.
crawler.async.MultiThreadingCounter
;
import
com.zhiwei.
async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
/**
/**
*
*
...
@@ -39,9 +37,9 @@ public class UrlLiveCrawler {
...
@@ -39,9 +37,9 @@ public class UrlLiveCrawler {
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
public
MultiThreadingCounter
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
public
GroupSync
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
try
{
try
{
MultiThreadingCounter
counter
=
new
MultiThreadingCounter
(
10
,
TimeUnit
.
MINUTES
,
false
);
GroupSync
counter
=
new
GroupSync
(
);
start
(
counter
,
callback
,
urls
);
start
(
counter
,
callback
,
urls
);
return
counter
;
return
counter
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -50,10 +48,10 @@ public class UrlLiveCrawler {
...
@@ -50,10 +48,10 @@ public class UrlLiveCrawler {
}
}
}
}
private
void
start
(
MultiThreadingCounter
counter
,
UrlLiveDataCallback
callback
,
String
...
urls
)
{
private
void
start
(
GroupSync
counter
,
UrlLiveDataCallback
callback
,
String
...
urls
)
{
if
(
nonNull
(
urls
)
&&
urls
.
length
>
0
)
{
if
(
nonNull
(
urls
)
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
for
(
String
url
:
urls
)
{
counter
.
increase
();
counter
.
add
();
if
(
nonNull
(
url
))
{
if
(
nonNull
(
url
))
{
try
{
try
{
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
...
@@ -61,12 +59,12 @@ public class UrlLiveCrawler {
...
@@ -61,12 +59,12 @@ public class UrlLiveCrawler {
logger
.
error
(
"搜索创建出错:"
,
e
);
logger
.
error
(
"搜索创建出错:"
,
e
);
}
}
}
}
counter
.
reduc
e
();
counter
.
don
e
();
}
}
}
}
}
}
private
MultiThreadingCounter
search
(
MultiThreadingCounter
counter
,
String
url
,
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
url
=
dealUrl
(
url
);
url
=
dealUrl
(
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
...
@@ -75,46 +73,33 @@ public class UrlLiveCrawler {
...
@@ -75,46 +73,33 @@ public class UrlLiveCrawler {
headers
.
put
(
"referer"
,
url
);
headers
.
put
(
"referer"
,
url
);
}
}
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
counter
.
increase
();
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
).
addListener
(
future
->
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
try
{
try
{
if
(
future
.
isSuccess
())
{
if
(
Objects
.
isNull
(
ex
))
{
Response
response
=
future
.
result
();
if
(
rs
.
code
()
==
200
)
{
try
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
,
counter
);
if
(
response
.
code
()
==
200
)
{
parseHtml
(
response
.
body
().
string
(),
attr
,
callback
,
counter
);
}
else
{
}
else
{
if
(
attr
.
getCount
()
>
2
)
{
if
(
attr
.
getCount
()
>
2
)
{
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
response
.
code
()));
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
}
else
{
}
else
{
attr
.
AddCount
();
attr
.
AddCount
();
search
(
counter
,
attr
.
getAttr
().
toString
(),
attr
,
callback
);
search
(
counter
,
attr
.
getAttr
().
toString
(),
attr
,
callback
);
}
}
}
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析出错 {}"
,
e
);
}
finally
{
if
(
response
!=
null
)
{
response
.
close
();
}
}
}
else
{
}
else
{
if
(
future
.
cause
().
getMessage
().
contains
(
"status code: "
))
{
callBack
(
callback
,
attr
,
1
,
null
);
}
else
{
if
(
attr
.
getCount
()
>
3
)
{
if
(
attr
.
getCount
()
>
3
)
{
callBack
(
callback
,
attr
,
-
1
,
null
);
callBack
(
callback
,
attr
,
-
1
,
null
);
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
future
.
cause
().
getMessage
()
);
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
}
else
{
}
else
{
attr
.
AddCount
();
attr
.
AddCount
();
search
(
counter
,
attr
.
getAttr
().
toString
(),
attr
,
callback
);
search
(
counter
,
attr
.
getAttr
().
toString
(),
attr
,
callback
);
}
}
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
" 数据是否删除 采集出错 {} "
,
e
);
logger
.
error
(
" 数据是否删除 采集出错 {} "
,
e
);
}
finally
{
}
finally
{
counter
.
reduc
e
();
counter
.
don
e
();
}
}
});
});
return
counter
;
return
counter
;
...
@@ -178,7 +163,7 @@ public class UrlLiveCrawler {
...
@@ -178,7 +163,7 @@ public class UrlLiveCrawler {
* @param callback
* @param callback
*/
*/
private
void
parseHtml
(
String
html
,
Attribution
attr
,
private
void
parseHtml
(
String
html
,
Attribution
attr
,
UrlLiveDataCallback
callback
,
MultiThreadingCounter
counter
)
{
UrlLiveDataCallback
callback
,
GroupSync
counter
)
{
if
(
callback
==
null
)
{
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
}
else
{
...
...
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
View file @
7f4a87a2
...
@@ -25,7 +25,7 @@ public class MediaSelfSource {
...
@@ -25,7 +25,7 @@ public class MediaSelfSource {
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
List
<
String
>
urlList
=
new
ArrayList
<>();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"https://
sports.qq.com/a/20190227/001177.htm
"
);
urlList
.
add
(
"https://
www.toutiao.com/a6669697912458445059/
"
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
for
(
MediaSelfSourceBean
b
:
u
)
{
for
(
MediaSelfSourceBean
b
:
u
)
{
System
.
out
.
println
(
b
.
toString
());
System
.
out
.
println
(
b
.
toString
());
...
...
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
7f4a87a2
package
com
.
zhiwei
.
source_forward
.
util
;
package
com
.
zhiwei
.
source_forward
.
util
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Objects
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
...
@@ -148,6 +149,16 @@ public class MatchSource {
...
@@ -148,6 +149,16 @@ public class MatchSource {
source
=
"东方头条-"
+
source
;
source
=
"东方头条-"
+
source
;
}
}
}
else
if
(
url
.
contains
(
"fashion.eastday.com"
)){
//处理东方头条网-自媒体号匹配
source
=
document
.
select
(
"div.J-title_detail.title_detail > div > div.fl > i:nth-child(2)"
).
text
().
trim
();
if
(
Objects
.
isNull
(
source
)
||
source
.
length
()
<
1
)
{
source
=
document
.
select
(
"div.J-title_detail.title_detail > div > div.fl > a"
).
text
().
trim
();
}
if
(
source
!=
null
&&
source
.
length
()>
1
){
source
=
"东方看点-"
+
source
;
}
}
else
if
(
url
.
contains
(
"sh.qihoo.com"
)){
}
else
if
(
url
.
contains
(
"sh.qihoo.com"
)){
//今日报点解析
//今日报点解析
source
=
document
.
select
(
"span.source"
).
text
().
trim
();
source
=
document
.
select
(
"span.source"
).
text
().
trim
();
...
@@ -237,6 +248,18 @@ public class MatchSource {
...
@@ -237,6 +248,18 @@ public class MatchSource {
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
source
=
"网易号-"
+
source
;
source
=
"网易号-"
+
source
;
}
}
}
else
if
(
url
.
contains
(
"myzaker.com"
)){
source
=
document
.
select
(
"div.article_header > div > a > span.auther"
)
.
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
source
=
"zaker-"
+
source
;
}
}
else
if
(
url
.
contains
(
"edushi.com"
)){
source
=
document
.
select
(
"div.eds-name-box > div.eds-name > a > div.name"
)
.
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
source
=
"今日潮闻-"
+
source
;
}
}
}
return
source
;
return
source
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment