Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
e229722e
Commit
e229722e
authored
Jan 25, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
根据链接匹配自媒体号添加频道匹配
parent
c34e21d1
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
64 additions
and
54 deletions
+64
-54
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourcePageProcessor.java
+11
-0
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardPageProcessor.java
+1
-54
src/main/java/com/zhiwei/source_forward/util/TreateData.java
+52
-0
No files found.
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourcePageProcessor.java
View file @
e229722e
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.source_forward.util.TreateData
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
...
...
@@ -26,9 +30,15 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
public
void
process
(
Page
page
)
{
Map
<
String
,
String
>
data
=
new
HashMap
<
String
,
String
>();
String
source
=
null
;
String
channel
=
null
;
try
{
if
(
page
.
getStatusCode
()!=
404
){
source
=
TreateData
.
matchMediaSelfSource
(
page
.
getUrl
().
get
(),
page
.
getHtml
().
toString
());
channel
=
TreateData
.
verifyChannel
(
page
.
getUrl
().
get
());
if
(
channel
==
null
){
List
<
Node
>
nodeList
=
page
.
getHtml
().
getDocument
().
head
().
childNodes
();
channel
=
TreateData
.
matchChannel
(
nodeList
);
}
}
}
catch
(
Exception
e
)
{
source
=
null
;
...
...
@@ -36,6 +46,7 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
System
.
out
.
println
(
page
.
getUrl
().
get
()+
"================="
+
source
);
data
.
put
(
"url"
,
page
.
getUrl
().
get
());
data
.
put
(
"mediaself"
,
source
);
data
.
put
(
"channel"
,
channel
);
page
.
putField
(
"data"
,
data
);
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardPageProcessor.java
View file @
e229722e
...
...
@@ -34,7 +34,7 @@ public class SourceForwardPageProcessor implements PageProcessor {
String
channel
=
"新闻"
;
try
{
if
(
page
.
getStatusCode
()!=
404
){
channel
=
verifyChannel
(
page
.
getUrl
().
get
());
channel
=
TreateData
.
verifyChannel
(
page
.
getUrl
().
get
());
if
(
channel
==
null
){
List
<
Node
>
nodeList
=
page
.
getHtml
().
getDocument
().
head
().
childNodes
();
channel
=
TreateData
.
matchChannel
(
nodeList
);
...
...
@@ -53,57 +53,4 @@ public class SourceForwardPageProcessor implements PageProcessor {
page
.
putField
(
"data"
,
data
);
}
/**
 * Ordered URL-fragment rules used by {@link #verifyChannel(String)}.
 * In each row, element 0 is the channel label and the remaining elements are
 * the URL fragments that select it. Row order is significant: the first row
 * with a matching fragment wins (e.g. "pcedu." is tested before "edu.").
 */
private static final String[][] URL_CHANNEL_RULES = {
    {"新闻", "news.", "cj.sina.com.cn", "wemedia.ifeng.com"},
    {"财经", "finance.", "business.", "money.", "stock.", "10jqka.com.cn"},
    {"科技", "tech.", "it.", "pcedu.", "mobile.", "vr."},
    {"体育", "sports."},
    {"娱乐", "ent.", "yule."},
    {"汽车", "auto."},
    {"时尚", "fashion."},
    {"教育", "learning.", "edu."},
    {"母婴", "baobao."},
    {"房产", "house.", "leju.", "focus."},
    {"游戏", "games."},
    {"国际", "intl."},
    {"科学", "science."},
    {"城市", "city."},
    {"市场", "sc."},
};

/**
 * Determines an article's channel from its URL.
 *
 * <p>The URL is checked against {@link #URL_CHANNEL_RULES} in order and the
 * channel of the first rule having a fragment contained in the URL is
 * returned. Matching is a plain, case-sensitive substring test against the
 * whole URL (host and path alike), so short fragments such as "it." or "sc."
 * also match inside longer words; that behavior is kept unchanged.
 *
 * @author hero
 * @param url the article URL; may be {@code null}
 * @return the channel label, or {@code null} when {@code url} is
 *         {@code null} or no rule matches
 */
private static String verifyChannel(String url) {
    // A missing URL cannot be classified; report "no channel" instead of throwing.
    if (url == null) {
        return null;
    }
    for (String[] rule : URL_CHANNEL_RULES) {
        // Index 0 holds the channel label, so fragments start at index 1.
        for (int i = 1; i < rule.length; i++) {
            if (url.contains(rule[i])) {
                return rule[0];
            }
        }
    }
    return null;
}
}
src/main/java/com/zhiwei/source_forward/util/TreateData.java
View file @
e229722e
...
...
@@ -459,6 +459,58 @@ public class TreateData {
}
/**
 * Ordered URL-fragment rules used by {@link #verifyChannel(String)}.
 * In each row, element 0 is the channel label and the remaining elements are
 * the URL fragments that select it. Row order is significant: the first row
 * with a matching fragment wins (e.g. "pcedu." is tested before "edu.").
 */
private static final String[][] URL_CHANNEL_RULES = {
    {"新闻", "news.", "cj.sina.com.cn", "wemedia.ifeng.com"},
    {"财经", "finance.", "business.", "money.", "stock.", "10jqka.com.cn"},
    {"科技", "tech.", "it.", "pcedu.", "mobile.", "vr."},
    {"体育", "sports."},
    {"娱乐", "ent.", "yule."},
    {"汽车", "auto."},
    {"时尚", "fashion."},
    {"教育", "learning.", "edu."},
    {"母婴", "baobao."},
    {"房产", "house.", "leju.", "focus."},
    {"游戏", "games."},
    {"国际", "intl."},
    {"科学", "science."},
    {"城市", "city."},
    {"市场", "sc."},
};

/**
 * Determines an article's channel from its URL.
 *
 * <p>The URL is checked against {@link #URL_CHANNEL_RULES} in order and the
 * channel of the first rule having a fragment contained in the URL is
 * returned. Matching is a plain, case-sensitive substring test against the
 * whole URL (host and path alike), so short fragments such as "it." or "sc."
 * also match inside longer words; that behavior is kept unchanged.
 *
 * @author hero
 * @param url the article URL; may be {@code null}
 * @return the channel label, or {@code null} when {@code url} is
 *         {@code null} or no rule matches
 */
public static String verifyChannel(String url) {
    // A missing URL cannot be classified; report "no channel" instead of throwing.
    if (url == null) {
        return null;
    }
    for (String[] rule : URL_CHANNEL_RULES) {
        // Index 0 holds the channel label, so fragments start at index 1.
        for (int i = 1; i < rule.length; i++) {
            if (url.contains(rule[i])) {
                return rule[0];
            }
        }
    }
    return null;
}
public
static
String
filterSpecialCharacter
(
String
str
)
{
try
{
String
regEx
=
"【[`~!@#$%^&*()+=|{}';'//[//].<>/?~!@#%……&*——+|{}“”;‘’,。、·]】"
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment