Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
dd6b6b30
Commit
dd6b6b30
authored
Aug 18, 2020
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
自媒体匹配百家号添加相应规则
来源转发添加腾讯自选股及中青看点
parent
9fcfba2d
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
27 additions
and
17 deletions
+27
-17
pom.xml
+1
-1
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+1
-5
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+25
-11
No files found.
pom.xml
View file @
dd6b6b30
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<artifactId>
source-forward
</artifactId>
<version>
0.2.
7
-SNAPSHOT
</version>
<version>
0.2.
8
-SNAPSHOT
</version>
<name>
source-forward
</name>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
...
...
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
dd6b6b30
...
@@ -91,12 +91,9 @@ public class MediaSelfSourceCrawler {
...
@@ -91,12 +91,9 @@ public class MediaSelfSourceCrawler {
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
attr
.
get
());
logger
.
info
(
"当前处理 URL: {}"
,
attr
.
get
());
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
ProxyHolder
ph
=
null
;
ProxyHolder
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
if
(
url
.
contains
(
"toutiao.com"
))
{
if
(
url
.
contains
(
"toutiao.com"
))
{
map
.
put
(
"referer"
,
url
);
map
.
put
(
"referer"
,
url
);
ph
=
ProxyHolder
.
SOUGOU_OUTER_PROXY
;
}
else
{
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
}
}
url
=
dealUrl
(
url
);
url
=
dealUrl
(
url
);
if
(
Objects
.
nonNull
(
url
))
{
if
(
Objects
.
nonNull
(
url
))
{
...
@@ -170,7 +167,6 @@ public class MediaSelfSourceCrawler {
...
@@ -170,7 +167,6 @@ public class MediaSelfSourceCrawler {
String
url
=
attr
.
get
().
toString
();
String
url
=
attr
.
get
().
toString
();
try
{
try
{
source
=
MatchSource
.
matchMediaSelfSource
(
url
+
eUrl
,
result
);
source
=
MatchSource
.
matchMediaSelfSource
(
url
+
eUrl
,
result
);
logger
.
info
(
url
+
"======="
+
source
);
channel
=
MatchChannel
.
verifyChannel
(
url
);
channel
=
MatchChannel
.
verifyChannel
(
url
);
if
(
channel
==
null
){
if
(
channel
==
null
){
List
<
Node
>
nodeList
=
Jsoup
.
parse
(
result
).
head
().
childNodes
();
List
<
Node
>
nodeList
=
Jsoup
.
parse
(
result
).
head
().
childNodes
();
...
...
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
dd6b6b30
package
com
.
zhiwei
.
source_forward
.
util
;
package
com
.
zhiwei
.
source_forward
.
util
;
import
java.util.List
;
import
com.alibaba.fastjson.JSONObject
;
import
java.util.Objects
;
import
com.zhiwei.source_forward.content.ContentExtractor
;
import
java.util.regex.Matcher
;
import
com.zhiwei.source_forward.content.News
;
import
java.util.regex.Pattern
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
com.alibaba.fastjson.JSONObject
;
import
java.util.List
;
import
com.zhiwei.source_forward.content.ContentExtractor
;
import
java.util.Objects
;
import
com.zhiwei.source_forward.content.News
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
/**
/**
* @ClassName: MatchSource
* @ClassName: MatchSource
...
@@ -324,7 +323,19 @@ public class MatchSource {
...
@@ -324,7 +323,19 @@ public class MatchSource {
if
(
StringUtils
.
isNotBlank
(
source
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
source
.
replaceAll
(
".*来源:|)"
,
""
);
source
=
source
.
replaceAll
(
".*来源:|)"
,
""
);
}
}
}
else
if
(
url
.
contains
(
"gu.qq.com"
)){
source
=
document
.
select
(
"span#news_source"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
return
source
;
}
}
else
if
(
url
.
contains
(
"kandian.youth.cn"
)){
source
=
document
.
select
(
"div.fl > a"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
return
source
;
}
}
}
if
(
Objects
.
nonNull
(
source
)
&&
source
.
length
()
!=
0
)
{
if
(
Objects
.
nonNull
(
source
)
&&
source
.
length
()
!=
0
)
{
return
source
;
return
source
;
}
}
...
@@ -487,9 +498,12 @@ public class MatchSource {
...
@@ -487,9 +498,12 @@ public class MatchSource {
}
}
}
else
if
(
url
.
contains
(
"baijiahao.baidu.com"
)
||
url
.
contains
(
"mbd.baidu.com"
)){
}
else
if
(
url
.
contains
(
"baijiahao.baidu.com"
)
||
url
.
contains
(
"mbd.baidu.com"
)){
//百度百家
//百度百家
if
(
StringUtils
.
isNotBlank
(
document
.
select
(
"span.userNameSpan"
).
text
())){
source
=
document
.
select
(
"span.userNameSpan"
).
text
();
source
=
document
.
select
(
"span.userNameSpan"
).
text
();
if
(
StringUtils
.
isBlank
(
source
)){
}
else
if
(
StringUtils
.
isNotBlank
(
document
.
select
(
"p.author-name:nth-child(1)"
).
text
()
)){
source
=
document
.
select
(
"p.author-name:nth-child(1)"
).
text
();
source
=
document
.
select
(
"p.author-name:nth-child(1)"
).
text
();
}
else
if
(
StringUtils
.
isNotBlank
(
document
.
select
(
"a.authorName"
).
text
())){
source
=
document
.
select
(
"a.authorName"
).
text
();
}
}
if
(
StringUtils
.
isNotBlank
(
source
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"百度百家-"
+
source
;
source
=
"百度百家-"
+
source
;
...
@@ -768,8 +782,7 @@ public class MatchSource {
...
@@ -768,8 +782,7 @@ public class MatchSource {
if
(
StringUtils
.
isNotBlank
(
source
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"推酷-"
+
source
;
source
=
"推酷-"
+
source
;
}
}
}
}
else
if
(
url
.
contains
(
"36kr.com"
)){
if
(
url
.
contains
(
"36kr.com"
)){
source
=
document
.
select
(
"div.info-header-text > a.author-name"
).
text
();
source
=
document
.
select
(
"div.info-header-text > a.author-name"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
return
"36氪-"
+
source
;
return
"36氪-"
+
source
;
...
@@ -783,6 +796,7 @@ public class MatchSource {
...
@@ -783,6 +796,7 @@ public class MatchSource {
return
"36氪-"
+
source
;
return
"36氪-"
+
source
;
}
}
}
}
return
source
;
return
source
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
printStackTrace
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment