Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
196e523d
Commit
196e523d
authored
Apr 26, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
crawler-core 版本提升
parent
39b30f08
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
39 additions
and
36 deletions
+39
-36
pom.xml
+3
-3
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+7
-10
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+4
-2
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+4
-2
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+4
-2
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
+17
-17
No files found.
pom.xml
View file @
196e523d
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<version>
0.1.
3
-SNAPSHOT
</version>
<version>
0.1.
5
-SNAPSHOT
</version>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
...
...
@@ -24,12 +24,12 @@
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.1.
2
-SNAPSHOT
</version>
<version>
0.1.
3
-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.3.
1
-RELEASE
</version>
<version>
0.3.
6
-RELEASE
</version>
<scope>
provided
</scope>
</dependency>
</dependencies>
...
...
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
View file @
196e523d
...
...
@@ -13,13 +13,14 @@ import com.zhiwei.source_forward.bean.ContentBean;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
public
class
ContentCrawler
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
ContentCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
*
...
...
@@ -31,14 +32,9 @@ public class ContentCrawler {
*/
public
GroupSync
submitTask
(
ContentDataCallback
callback
,
String
...
urls
)
{
try
{
GroupSync
counter
=
new
GroupSync
();
start
(
counter
,
callback
,
urls
);
return
counter
;
}
catch
(
Exception
e
)
{
logger
.
error
(
" exception {}"
,
e
);
return
null
;
}
GroupSync
counter
=
new
GroupSync
();
start
(
counter
,
callback
,
urls
);
return
counter
;
}
/**
...
...
@@ -54,6 +50,7 @@ public class ContentCrawler {
for
(
String
url
:
urls
)
{
if
(
url
!=
null
)
{
try
{
ZhiWeiTools
.
sleep
(
10
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
...
...
@@ -78,7 +75,7 @@ public class ContentCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_
HEAVY_
PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
...
...
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
196e523d
...
...
@@ -19,6 +19,7 @@ import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
...
...
@@ -33,7 +34,7 @@ import okhttp3.Request;
public
class
MediaSelfSourceCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
MediaSelfSourceCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
*
...
...
@@ -67,6 +68,7 @@ public class MediaSelfSourceCrawler {
counter
.
add
();
if
(
url
!=
null
)
{
try
{
ZhiWeiTools
.
sleep
(
10
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
...
...
@@ -96,7 +98,7 @@ public class MediaSelfSourceCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
map
);
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_
HEAVY_
PROXY
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
try
{
...
...
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
View file @
196e523d
...
...
@@ -21,6 +21,7 @@ import com.zhiwei.source_forward.util.MatchSource;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
...
...
@@ -28,7 +29,7 @@ public class SourceForwardCrawler {
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
SourceForwardCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
List
<
String
>
sourceList
=
SourceData
.
getSourceList
();
public
GroupSync
submitTask
(
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
...
...
@@ -49,6 +50,7 @@ public class SourceForwardCrawler {
if
(
url
!=
null
)
{
try
{
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
ZhiWeiTools
.
sleep
(
10
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
}
...
...
@@ -67,7 +69,7 @@ public class SourceForwardCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_
HEAVY_
PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
);
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
196e523d
...
...
@@ -21,6 +21,7 @@ import com.zhiwei.source_forward.bean.UrlLiveBean;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
...
...
@@ -35,7 +36,7 @@ import okhttp3.Request;
public
class
UrlLiveCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
(
false
,
2
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
(
);
public
GroupSync
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
GroupSync
counter
=
new
GroupSync
();
...
...
@@ -49,6 +50,7 @@ public class UrlLiveCrawler {
counter
.
add
();
if
(
nonNull
(
url
))
{
try
{
ZhiWeiTools
.
sleep
(
10
);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错:"
,
e
);
...
...
@@ -71,7 +73,7 @@ public class UrlLiveCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
if
(
Objects
.
nonNull
(
request
))
{
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_
HEAVY_
PROXY
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
rs
.
code
()
==
200
)
{
...
...
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
View file @
196e523d
...
...
@@ -35,7 +35,7 @@ public class MatchContent {
try
{
Document
document
=
Jsoup
.
parse
(
html
);
if
(
url
.
contains
(
"weixin.qq.com"
))
{
content
=
matchContentWeixin
(
document
);
content
=
matchContentWeixin
(
html
);
}
else
if
(
url
.
contains
(
"toutiao.com"
))
{
content
=
matchContentToutiao
(
html
);
}
...
...
@@ -71,22 +71,22 @@ public class MatchContent {
* @param html
* @return
*/
private
static
String
matchContentWeixin
(
Document
document
)
{
try
{
String
content
=
document
.
select
(
"div.rich_media_content"
).
text
(
);
if
(
document
.
toString
().
contains
(
"<script id=\"content_tpl\"
"
))
{
Pattern
pa
=
Pattern
.
compile
(
"\\<script id=\"content_tpl(.*?)\\</script\\>"
);
Matcher
ma
=
pa
.
matcher
(
document
.
toString
());
while
(
ma
.
find
())
{
return
ma
.
group
(
0
).
replaceAll
(
"<script id=\"content_tpl\" type=\"text/html\">"
,
""
).
replaceAll
(
"</script>"
,
""
);
}
return
content
;
}
return
content
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
(
);
return
""
;
}
private
static
String
matchContentWeixin
(
String
contentHtml
)
{
try
{
Document
document
=
Jsoup
.
parse
(
contentHtml
);
if
(
contentHtml
.
contains
(
"js_article
"
))
{
return
document
.
select
(
"div#js_article"
).
text
(
);
}
else
if
(
contentHtml
.
contains
(
"js_share_content"
))
{
return
document
.
select
(
"div#js_share_content"
).
text
();
}
if
(
contentHtml
.
contains
(
"content_tpl"
))
{
String
text
=
document
.
select
(
"script#content_tpl"
).
html
()
;
return
Jsoup
.
parse
(
text
).
text
();
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"微信全文解析出错 {}"
,
e
);
}
return
""
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment