Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
7a6d49e2
Commit
7a6d49e2
authored
Jun 30, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加内容匹配
parent
82632f70
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
846 additions
and
654 deletions
+846
-654
src/main/java/com/zhiwei/source_forward/crawler/ContentPageProcessor.java
+51
-0
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourcePageProcessor.java
+6
-5
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardPageProcessor.java
+7
-5
src/main/java/com/zhiwei/source_forward/crawler/UrlLivePageProcessor.java
+1
-1
src/main/java/com/zhiwei/source_forward/pipeline/DataPipeline.java
+99
-0
src/main/java/com/zhiwei/source_forward/pipeline/MediaSelfSourceDataPipeline.java
+0
-40
src/main/java/com/zhiwei/source_forward/pipeline/SourceForwardDataPipeline.java
+0
-40
src/main/java/com/zhiwei/source_forward/pipeline/UrlLivePipeline.java
+0
-40
src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
+46
-0
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
+9
-10
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+3
-3
src/main/java/com/zhiwei/source_forward/util/MatchChannel.java
+143
-0
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
+60
-0
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+385
-0
src/main/java/com/zhiwei/source_forward/util/TreateData.java
+0
-510
src/test/java/com/zhiwei/source_forward/sourceforward/test/MediaSelfSourceTest.java
+36
-0
No files found.
src/main/java/com/zhiwei/source_forward/crawler/ContentPageProcessor.java
0 → 100644
View file @
7a6d49e2
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
com.zhiwei.source_forward.util.TreateData
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.processor.PageProcessor
;
/**
 * WebMagic page processor that extracts the article body text of every crawled page.
 *
 * <p>For each page a map with the keys {@code "url"} (the page URL) and {@code "content"}
 * (the extracted body, or {@code null} for a 404 page or when extraction fails) is published
 * under the result field {@code "content"}.
 *
 * @author hero
 */
public class ContentPageProcessor implements PageProcessor {

    /** Crawl settings: 3 cycle retries, 1.5 s sleep between requests, 10 s timeout. */
    private final Site site = Site.me().setCycleRetryTimes(3).setSleepTime(1500)
            .setTimeOut(10000)
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .addHeader("Accept-Encoding", "gzip, deflate, br");

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts the body text of {@code page} and publishes it together with the page URL.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        Map<String, String> data = new HashMap<String, String>();
        String content = null;
        try {
            if (page.getStatusCode() != 404) {
                // The extraction result must be kept: previously the return value was
                // discarded, so "content" was always published as null.
                content = MatchContent.matchContent(page.getUrl().get(), page.getHtml().toString());
            }
        } catch (Exception e) {
            // Best effort: a page whose body cannot be extracted is still reported,
            // just without content.
            content = null;
        }
        data.put("url", page.getUrl().get());
        data.put("content", content);
        page.putField("content", data);
    }
}
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourcePageProcessor.java
View file @
7a6d49e2
...
@@ -6,7 +6,8 @@ import java.util.Map;
...
@@ -6,7 +6,8 @@ import java.util.Map;
import
org.jsoup.nodes.Node
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.source_forward.util.TreateData
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
...
@@ -31,14 +32,14 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
...
@@ -31,14 +32,14 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
String
channel
=
null
;
String
channel
=
null
;
try
{
try
{
if
(
page
.
getStatusCode
()!=
404
){
if
(
page
.
getStatusCode
()!=
404
){
source
=
TreateData
.
matchMediaSelfSource
(
page
.
getUrl
().
get
(),
page
.
getHtml
().
toString
());
source
=
MatchSource
.
matchMediaSelfSource
(
page
.
getUrl
().
get
(),
page
.
getHtml
().
toString
());
if
(
source
==
null
||
source
.
equals
(
""
)){
if
(
source
==
null
||
source
.
equals
(
""
)){
source
=
null
;
source
=
null
;
}
}
channel
=
TreateData
.
verifyChannel
(
page
.
getUrl
().
get
());
channel
=
MatchChannel
.
verifyChannel
(
page
.
getUrl
().
get
());
if
(
channel
==
null
){
if
(
channel
==
null
){
List
<
Node
>
nodeList
=
page
.
getHtml
().
getDocument
().
head
().
childNodes
();
List
<
Node
>
nodeList
=
page
.
getHtml
().
getDocument
().
head
().
childNodes
();
channel
=
TreateData
.
matchChannel
(
nodeList
);
channel
=
MatchChannel
.
matchChannel
(
nodeList
);
}
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -49,7 +50,7 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
...
@@ -49,7 +50,7 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
data
.
put
(
"mediaself"
,
source
);
data
.
put
(
"mediaself"
,
source
);
data
.
put
(
"channel"
,
channel
);
data
.
put
(
"channel"
,
channel
);
page
.
putField
(
"
data
"
,
data
);
page
.
putField
(
"
mediaSelf
"
,
data
);
}
}
}
}
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardPageProcessor.java
View file @
7a6d49e2
...
@@ -6,8 +6,10 @@ import java.util.Map;
...
@@ -6,8 +6,10 @@ import java.util.Map;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Node
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.TreateData
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
...
@@ -45,12 +47,12 @@ public class SourceForwardPageProcessor implements PageProcessor {
...
@@ -45,12 +47,12 @@ public class SourceForwardPageProcessor implements PageProcessor {
}
}
data
.
put
(
"isforward"
,
isforward
);
data
.
put
(
"isforward"
,
isforward
);
}
else
{
}
else
{
channel
=
TreateData
.
verifyChannel
(
page
.
getUrl
().
get
());
channel
=
MatchChannel
.
verifyChannel
(
page
.
getUrl
().
get
());
if
(
channel
==
null
){
if
(
channel
==
null
){
List
<
Node
>
nodeList
=
page
.
getHtml
().
getDocument
().
head
().
childNodes
();
List
<
Node
>
nodeList
=
page
.
getHtml
().
getDocument
().
head
().
childNodes
();
channel
=
TreateData
.
matchChannel
(
nodeList
);
channel
=
MatchChannel
.
matchChannel
(
nodeList
);
}
}
source
=
TreateData
.
matchSource
(
page
.
getUrl
().
get
(),
page
.
getHtml
().
toString
(),
sourceList
);
source
=
MatchSource
.
matchSource
(
page
.
getUrl
().
get
(),
page
.
getHtml
().
toString
(),
sourceList
);
}
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -62,7 +64,7 @@ public class SourceForwardPageProcessor implements PageProcessor {
...
@@ -62,7 +64,7 @@ public class SourceForwardPageProcessor implements PageProcessor {
data
.
put
(
"channel"
,
channel
);
data
.
put
(
"channel"
,
channel
);
data
.
put
(
"root_source"
,
source
);
data
.
put
(
"root_source"
,
source
);
page
.
putField
(
"
data
"
,
data
);
page
.
putField
(
"
sourceForward
"
,
data
);
}
}
}
}
src/main/java/com/zhiwei/source_forward/crawler/UrlLivePageProcessor.java
View file @
7a6d49e2
...
@@ -35,7 +35,7 @@ public class UrlLivePageProcessor implements PageProcessor{
...
@@ -35,7 +35,7 @@ public class UrlLivePageProcessor implements PageProcessor{
Map
<
String
,
Object
>
data
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
data
=
new
HashMap
<
String
,
Object
>();
data
.
put
(
"url"
,
page
.
getUrl
().
get
());
data
.
put
(
"url"
,
page
.
getUrl
().
get
());
data
.
put
(
"live"
,
f
);
data
.
put
(
"live"
,
f
);
page
.
putField
(
"
data
"
,
data
);
page
.
putField
(
"
urlLive
"
,
data
);
}
}
@Override
@Override
...
...
src/main/java/com/zhiwei/source_forward/pipeline/DataPipeline.java
0 → 100644
View file @
7a6d49e2
package
com
.
zhiwei
.
source_forward
.
pipeline
;
import
java.util.List
;
import
java.util.Map
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
/**
 * WebMagic pipeline that collects the result maps published by the crawler page processors.
 *
 * <p>A processed page may carry a map under one of the result fields {@code "content"},
 * {@code "mediaSelf"}, {@code "sourceForward"} or {@code "urlLive"}. Every non-null map is
 * appended to the list of the matching kind so that callers can read the lists once the
 * crawl has finished.
 *
 * @author hero
 */
public class DataPipeline implements Pipeline {

    /** Maps published under the result field {@code "content"}. */
    private List<Map<String, Object>> contentDataList;

    /** Maps published under the result field {@code "mediaSelf"}. */
    private List<Map<String, Object>> mediaSelfDataList;

    /** Maps published under the result field {@code "sourceForward"}. */
    private List<Map<String, Object>> sourceForwardDataList;

    /** Maps published under the result field {@code "urlLive"}. */
    private List<Map<String, Object>> urlLivedataList;

    /**
     * Creates a pipeline that collects into caller-supplied lists.
     *
     * @param dataList              unused; kept only so existing callers keep compiling
     * @param contentDataList       target list for {@code "content"} results
     * @param mediaSelfDataList     target list for {@code "mediaSelf"} results
     * @param sourceForwardDataList target list for {@code "sourceForward"} results
     * @param urlLivedataList       target list for {@code "urlLive"} results
     */
    public DataPipeline(List<Map<String, Object>> dataList, List<Map<String, Object>> contentDataList,
            List<Map<String, Object>> mediaSelfDataList, List<Map<String, Object>> sourceForwardDataList,
            List<Map<String, Object>> urlLivedataList) {
        super();
        this.contentDataList = contentDataList;
        this.mediaSelfDataList = mediaSelfDataList;
        this.sourceForwardDataList = sourceForwardDataList;
        this.urlLivedataList = urlLivedataList;
    }

    /**
     * Creates a pipeline with four empty result lists.
     *
     * <p>Previously this constructor left every list {@code null}, so the first collected
     * result caused a {@link NullPointerException} in {@link #process}.
     */
    public DataPipeline() {
        super();
        this.contentDataList = newResultList();
        this.mediaSelfDataList = newResultList();
        this.sourceForwardDataList = newResultList();
        this.urlLivedataList = newResultList();
    }

    /**
     * Builds an empty, synchronized result list.
     *
     * <p>Callers in this project run the spider with several threads; the pipeline is
     * presumably invoked from those worker threads, so the default lists are synchronized.
     * Fully-qualified names are used so that no additional import is required.
     */
    private static List<Map<String, Object>> newResultList() {
        return java.util.Collections.synchronizedList(new java.util.ArrayList<Map<String, Object>>());
    }

    /**
     * Appends every result map present on {@code resultItems} to the list of its kind.
     *
     * @param resultItems results published by the page processor for one page
     * @param task        the running crawl task (not used)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> contentData = resultItems.get("content");
        Map<String, Object> mediaSelfData = resultItems.get("mediaSelf");
        Map<String, Object> sourceForwardData = resultItems.get("sourceForward");
        Map<String, Object> urlLivedata = resultItems.get("urlLive");
        if (contentData != null) {
            contentDataList.add(contentData);
        }
        if (mediaSelfData != null) {
            mediaSelfDataList.add(mediaSelfData);
        }
        if (sourceForwardData != null) {
            sourceForwardDataList.add(sourceForwardData);
        }
        if (urlLivedata != null) {
            urlLivedataList.add(urlLivedata);
        }
    }

    /** @return the list collecting {@code "content"} results */
    public List<Map<String, Object>> getContentDataList() {
        return contentDataList;
    }

    /** @param contentDataList the list that should collect {@code "content"} results */
    public void setContentDataList(List<Map<String, Object>> contentDataList) {
        this.contentDataList = contentDataList;
    }

    /** @return the list collecting {@code "mediaSelf"} results */
    public List<Map<String, Object>> getMediaSelfDataList() {
        return mediaSelfDataList;
    }

    /** @param mediaSelfDataList the list that should collect {@code "mediaSelf"} results */
    public void setMediaSelfDataList(List<Map<String, Object>> mediaSelfDataList) {
        this.mediaSelfDataList = mediaSelfDataList;
    }

    /** @return the list collecting {@code "sourceForward"} results */
    public List<Map<String, Object>> getSourceForwardDataList() {
        return sourceForwardDataList;
    }

    /** @param sourceForwardDataList the list that should collect {@code "sourceForward"} results */
    public void setSourceForwardDataList(List<Map<String, Object>> sourceForwardDataList) {
        this.sourceForwardDataList = sourceForwardDataList;
    }

    /** @return the list collecting {@code "urlLive"} results */
    public List<Map<String, Object>> getUrlLivedataList() {
        return urlLivedataList;
    }

    /** @param urlLivedataList the list that should collect {@code "urlLive"} results */
    public void setUrlLivedataList(List<Map<String, Object>> urlLivedataList) {
        this.urlLivedataList = urlLivedataList;
    }
}
src/main/java/com/zhiwei/source_forward/pipeline/MediaSelfSourceDataPipeline.java
deleted
100644 → 0
View file @
82632f70
package
com
.
zhiwei
.
source_forward
.
pipeline
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
/**
 * Pipeline that gathers the map each page publishes under the result field {@code "data"}.
 */
public class MediaSelfSourceDataPipeline implements Pipeline {

    /** Collected result maps, in the order they were received. */
    private List<Map<String, Object>> dataList;

    /**
     * Creates a pipeline that appends into the given list.
     *
     * @param dataList list that receives the collected maps
     */
    public MediaSelfSourceDataPipeline(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }

    /** Creates a pipeline backed by a fresh, empty list. */
    public MediaSelfSourceDataPipeline() {
        this(new ArrayList<Map<String, Object>>());
    }

    /** @return the list holding the collected maps */
    public List<Map<String, Object>> getDataList() {
        return this.dataList;
    }

    /** @param dataList the list that should receive collected maps from now on */
    public void setDataList(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }

    /**
     * Stores the {@code "data"} map of one processed page, if the page published one.
     *
     * @param resultItems results of the processed page
     * @param task        the running crawl task (not used)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> pageData = resultItems.get("data");
        if (pageData == null) {
            return;
        }
        dataList.add(pageData);
    }
}
src/main/java/com/zhiwei/source_forward/pipeline/SourceForwardDataPipeline.java
deleted
100644 → 0
View file @
82632f70
package
com
.
zhiwei
.
source_forward
.
pipeline
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
/**
 * Pipeline that gathers the map each page publishes under the result field {@code "data"}.
 */
public class SourceForwardDataPipeline implements Pipeline {

    /** Collected result maps, in the order they were received. */
    private List<Map<String, Object>> dataList;

    /**
     * Creates a pipeline that appends into the given list.
     *
     * @param dataList list that receives the collected maps
     */
    public SourceForwardDataPipeline(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }

    /** Creates a pipeline backed by a fresh, empty list. */
    public SourceForwardDataPipeline() {
        this(new ArrayList<Map<String, Object>>());
    }

    /** @return the list holding the collected maps */
    public List<Map<String, Object>> getDataList() {
        return this.dataList;
    }

    /** @param dataList the list that should receive collected maps from now on */
    public void setDataList(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }

    /**
     * Stores the {@code "data"} map of one processed page, if the page published one.
     *
     * @param resultItems results of the processed page
     * @param task        the running crawl task (not used)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> pageData = resultItems.get("data");
        if (pageData == null) {
            return;
        }
        dataList.add(pageData);
    }
}
src/main/java/com/zhiwei/source_forward/pipeline/UrlLivePipeline.java
deleted
100644 → 0
View file @
82632f70
package
com
.
zhiwei
.
source_forward
.
pipeline
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
/**
 * Pipeline that gathers the map each page publishes under the result field {@code "data"}.
 */
public class UrlLivePipeline implements Pipeline {

    /** Collected result maps, in the order they were received. */
    private List<Map<String, Object>> dataList;

    /**
     * Creates a pipeline that appends into the given list.
     *
     * @param dataList list that receives the collected maps
     */
    public UrlLivePipeline(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }

    /** Creates a pipeline backed by a fresh, empty list. */
    public UrlLivePipeline() {
        this(new ArrayList<Map<String, Object>>());
    }

    /** @return the list holding the collected maps */
    public List<Map<String, Object>> getDataList() {
        return this.dataList;
    }

    /** @param dataList the list that should receive collected maps from now on */
    public void setDataList(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }

    /**
     * Stores the {@code "data"} map of one processed page, if the page published one.
     *
     * @param resultItems results of the processed page
     * @param task        the running crawl task (not used)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> pageData = resultItems.get("data");
        if (pageData == null) {
            return;
        }
        dataList.add(pageData);
    }
}
src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
0 → 100644
View file @
7a6d49e2
package
com
.
zhiwei
.
source_forward
.
run
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
com.zhiwei.source_forward.crawler.ContentPageProcessor
;
import
com.zhiwei.source_forward.downloader.MyDownLoader
;
import
com.zhiwei.source_forward.pipeline.DataPipeline
;
import
us.codecraft.webmagic.Spider
;
/**
 * Entry point for crawling article pages and attaching their extracted body text.
 */
public class ContentMatch {

    /**
     * Crawls every URL that is a key of {@code dataMap} and stores the extracted article body
     * under the key {@code "content"} of the matching entry.
     *
     * <p>Entries whose URL produced no crawl result are left untouched. When a page was crawled
     * but no body could be extracted, {@code "content"} is set to {@code null}.
     *
     * @param dataMap article data keyed by article URL; its entries are updated in place
     * @return the same {@code dataMap} instance
     * @author hero
     */
    public static Map<String, Map<String, Object>> getContent(Map<String, Map<String, Object>> dataMap) {
        // The spider runs with 5 threads below, so results are collected into a synchronized
        // list. Supplying the list explicitly also means this method does not rely on the
        // pipeline's no-arg constructor having initialised one. Fully-qualified names avoid
        // the need for additional imports.
        List<Map<String, Object>> contentList =
                java.util.Collections.synchronizedList(new java.util.ArrayList<Map<String, Object>>());
        DataPipeline pipeline = new DataPipeline();
        pipeline.setContentDataList(contentList);

        Spider spider = Spider.create(new ContentPageProcessor());
        for (Entry<String, Map<String, Object>> entry : dataMap.entrySet()) {
            spider.addUrl(entry.getKey());
        }
        spider.setDownloader(new MyDownLoader());
        spider.addPipeline(pipeline);
        spider.thread(5).run();

        // Merge the crawled content back into the caller's data.
        for (Map<String, Object> sourceMap : contentList) {
            String url = sourceMap.get("url") + "";
            Map<String, Object> data = dataMap.get(url);
            if (data == null) {
                continue;
            }
            // Read the content from the crawl result. The previous code read it from the
            // caller's own entry, so the crawled text was never stored and a missing value
            // became the literal string "null".
            Object crawled = sourceMap.get("content");
            data.put("content", crawled != null ? crawled.toString() : null);
        }
        return dataMap;
    }
}
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
View file @
7a6d49e2
...
@@ -8,8 +8,7 @@ import java.util.Map.Entry;
...
@@ -8,8 +8,7 @@ import java.util.Map.Entry;
import
com.zhiwei.source_forward.crawler.MediaSelfSourcePageProcessor
;
import
com.zhiwei.source_forward.crawler.MediaSelfSourcePageProcessor
;
import
com.zhiwei.source_forward.crawler.SourceForwardPageProcessor
;
import
com.zhiwei.source_forward.crawler.SourceForwardPageProcessor
;
import
com.zhiwei.source_forward.downloader.MyDownLoader
;
import
com.zhiwei.source_forward.downloader.MyDownLoader
;
import
com.zhiwei.source_forward.pipeline.MediaSelfSourceDataPipeline
;
import
com.zhiwei.source_forward.pipeline.DataPipeline
;
import
com.zhiwei.source_forward.pipeline.SourceForwardDataPipeline
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.Spider
;
...
@@ -31,7 +30,7 @@ public class SourceForward {
...
@@ -31,7 +30,7 @@ public class SourceForward {
*/
*/
public
static
Map
<
String
,
Map
<
String
,
Object
>>
getSourceForward
(
Map
<
String
,
Map
<
String
,
Object
>>
dataMap
){
public
static
Map
<
String
,
Map
<
String
,
Object
>>
getSourceForward
(
Map
<
String
,
Map
<
String
,
Object
>>
dataMap
){
//启动验证来源程序
//启动验证来源程序
SourceForwardDataPipeline
pipeline
=
new
SourceForward
DataPipeline
();
DataPipeline
pipeline
=
new
DataPipeline
();
Spider
spider
=
Spider
.
create
(
new
SourceForwardPageProcessor
());
Spider
spider
=
Spider
.
create
(
new
SourceForwardPageProcessor
());
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
spider
.
addUrl
(
entry
.
getKey
());
spider
.
addUrl
(
entry
.
getKey
());
...
@@ -40,7 +39,7 @@ public class SourceForward {
...
@@ -40,7 +39,7 @@ public class SourceForward {
spider
.
addPipeline
(
pipeline
);
spider
.
addPipeline
(
pipeline
);
spider
.
thread
(
5
).
run
();
spider
.
thread
(
5
).
run
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
getDataList
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
get
SourceForward
DataList
();
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
String
url
=
sourceMap
.
get
(
"url"
)+
""
;
String
url
=
sourceMap
.
get
(
"url"
)+
""
;
String
root_source
=
sourceMap
.
get
(
"root_source"
)!=
null
?
sourceMap
.
get
(
"root_source"
).
toString
():
null
;
String
root_source
=
sourceMap
.
get
(
"root_source"
)!=
null
?
sourceMap
.
get
(
"root_source"
).
toString
():
null
;
...
@@ -85,7 +84,7 @@ public class SourceForward {
...
@@ -85,7 +84,7 @@ public class SourceForward {
*/
*/
public
static
Map
<
String
,
Map
<
String
,
Object
>>
getMediaSelfSource
(
Map
<
String
,
Map
<
String
,
Object
>>
dataMap
){
public
static
Map
<
String
,
Map
<
String
,
Object
>>
getMediaSelfSource
(
Map
<
String
,
Map
<
String
,
Object
>>
dataMap
){
//启动验证来源程序
//启动验证来源程序
MediaSelfSourceDataPipeline
pipeline
=
new
MediaSelfSource
DataPipeline
();
DataPipeline
pipeline
=
new
DataPipeline
();
Spider
spider
=
Spider
.
create
(
new
MediaSelfSourcePageProcessor
());
Spider
spider
=
Spider
.
create
(
new
MediaSelfSourcePageProcessor
());
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
spider
.
addUrl
(
entry
.
getKey
());
spider
.
addUrl
(
entry
.
getKey
());
...
@@ -94,7 +93,7 @@ public class SourceForward {
...
@@ -94,7 +93,7 @@ public class SourceForward {
spider
.
addPipeline
(
pipeline
);
spider
.
addPipeline
(
pipeline
);
spider
.
thread
(
5
).
run
();
spider
.
thread
(
5
).
run
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
getDataList
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
get
SourceForward
DataList
();
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
String
url
=
sourceMap
.
get
(
"url"
)+
""
;
String
url
=
sourceMap
.
get
(
"url"
)+
""
;
//整合数据及验证转发原创
//整合数据及验证转发原创
...
@@ -119,7 +118,7 @@ public class SourceForward {
...
@@ -119,7 +118,7 @@ public class SourceForward {
public
static
Map
<
String
,
String
>
getMediaSelfSource
(
List
<
String
>
urlList
){
public
static
Map
<
String
,
String
>
getMediaSelfSource
(
List
<
String
>
urlList
){
//启动验证来源程序
//启动验证来源程序
Map
<
String
,
String
>
dataMap
=
new
HashMap
<
String
,
String
>();
Map
<
String
,
String
>
dataMap
=
new
HashMap
<
String
,
String
>();
MediaSelfSourceDataPipeline
pipeline
=
new
MediaSelfSource
DataPipeline
();
DataPipeline
pipeline
=
new
DataPipeline
();
Spider
spider
=
Spider
.
create
(
new
MediaSelfSourcePageProcessor
());
Spider
spider
=
Spider
.
create
(
new
MediaSelfSourcePageProcessor
());
for
(
String
url
:
urlList
){
for
(
String
url
:
urlList
){
spider
.
addUrl
(
url
);
spider
.
addUrl
(
url
);
...
@@ -129,7 +128,7 @@ public class SourceForward {
...
@@ -129,7 +128,7 @@ public class SourceForward {
spider
.
addPipeline
(
pipeline
);
spider
.
addPipeline
(
pipeline
);
spider
.
thread
(
5
).
run
();
spider
.
thread
(
5
).
run
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
getDataList
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
get
MediaSelf
DataList
();
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
String
url
=
sourceMap
.
get
(
"url"
)+
""
;
String
url
=
sourceMap
.
get
(
"url"
)+
""
;
//整合数据及验证转发原创
//整合数据及验证转发原创
...
@@ -152,14 +151,14 @@ public class SourceForward {
...
@@ -152,14 +151,14 @@ public class SourceForward {
*/
*/
public
static
String
getMediaSelfSource
(
String
url
){
public
static
String
getMediaSelfSource
(
String
url
){
//启动验证来源程序
//启动验证来源程序
MediaSelfSourceDataPipeline
pipeline
=
new
MediaSelfSource
DataPipeline
();
DataPipeline
pipeline
=
new
DataPipeline
();
Spider
spider
=
Spider
.
create
(
new
MediaSelfSourcePageProcessor
());
Spider
spider
=
Spider
.
create
(
new
MediaSelfSourcePageProcessor
());
spider
.
addUrl
(
url
);
spider
.
addUrl
(
url
);
spider
.
setDownloader
(
new
MyDownLoader
());
spider
.
setDownloader
(
new
MyDownLoader
());
spider
.
addPipeline
(
pipeline
);
spider
.
addPipeline
(
pipeline
);
spider
.
thread
(
1
).
run
();
spider
.
thread
(
1
).
run
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
getDataList
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
get
MediaSelf
DataList
();
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
return
sourceMap
.
get
(
"mediaself"
).
toString
();
return
sourceMap
.
get
(
"mediaself"
).
toString
();
}
}
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
7a6d49e2
...
@@ -5,7 +5,7 @@ import java.util.Map;
...
@@ -5,7 +5,7 @@ import java.util.Map;
import
java.util.Map.Entry
;
import
java.util.Map.Entry
;
import
com.zhiwei.source_forward.crawler.UrlLivePageProcessor
;
import
com.zhiwei.source_forward.crawler.UrlLivePageProcessor
;
import
com.zhiwei.source_forward.pipeline.
UrlLive
Pipeline
;
import
com.zhiwei.source_forward.pipeline.
Data
Pipeline
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.Spider
;
...
@@ -28,7 +28,7 @@ public class URLLive {
...
@@ -28,7 +28,7 @@ public class URLLive {
*/
*/
public
static
Map
<
String
,
Map
<
String
,
Object
>>
verificationURLLive
(
Map
<
String
,
Map
<
String
,
Object
>>
dataMap
){
public
static
Map
<
String
,
Map
<
String
,
Object
>>
verificationURLLive
(
Map
<
String
,
Map
<
String
,
Object
>>
dataMap
){
//启动验证链接是否有效程序程序
//启动验证链接是否有效程序程序
UrlLivePipeline
pipeline
=
new
UrlLive
Pipeline
();
DataPipeline
pipeline
=
new
Data
Pipeline
();
Spider
spider
=
Spider
.
create
(
new
UrlLivePageProcessor
());
Spider
spider
=
Spider
.
create
(
new
UrlLivePageProcessor
());
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
spider
.
addUrl
(
entry
.
getKey
());
spider
.
addUrl
(
entry
.
getKey
());
...
@@ -37,7 +37,7 @@ public class URLLive {
...
@@ -37,7 +37,7 @@ public class URLLive {
spider
.
thread
(
5
).
run
();
spider
.
thread
(
5
).
run
();
//验证数据是否已删除
//验证数据是否已删除
List
<
Map
<
String
,
Object
>>
dataList
=
pipeline
.
get
D
ataList
();
List
<
Map
<
String
,
Object
>>
dataList
=
pipeline
.
get
UrlLived
ataList
();
for
(
Map
<
String
,
Object
>
data
:
dataList
){
for
(
Map
<
String
,
Object
>
data
:
dataList
){
String
url
=
data
.
get
(
"url"
)+
""
;
String
url
=
data
.
get
(
"url"
)+
""
;
if
(!
url
.
contains
(
"http"
)){
if
(!
url
.
contains
(
"http"
)){
...
...
src/main/java/com/zhiwei/source_forward/util/MatchChannel.java
0 → 100644
View file @
7a6d49e2
package
com
.
zhiwei
.
source_forward
.
util
;
import
java.util.List
;
import
org.jsoup.nodes.Node
;
/**
 * Helpers that decide which channel (content category) an article belongs to, either from
 * its URL or from the page title.
 *
 * @author hero
 */
public class MatchChannel {

    /** Channel reported when no title keyword matches. */
    private static final String DEFAULT_CHANNEL = "新闻";

    /**
     * URL rules in priority order. Element 0 of each row is the channel; the remaining
     * elements are the URL fragments that select it. The first row with a matching fragment wins.
     */
    private static final String[][] URL_RULES = {
        {"新闻", "news.", "cj.sina.com.cn", "wemedia.ifeng.com"},
        {"财经", "finance.", "business.", "money.", "stock.", "10jqka.com.cn"},
        {"科技", "tech.", "it.", "pcedu.", "mobile.", "vr."},
        {"体育", "sports."},
        {"娱乐", "ent.", "yule."},
        {"汽车", "auto."},
        {"时尚", "fashion."},
        {"教育", "learning.", "edu."},
        {"母婴", "baobao."},
        {"房产", "house.", "leju.", "focus."},
        {"游戏", "games."},
        {"国际", "intl."},
        {"科学", "science."},
        {"城市", "city."},
        {"市场", "sc."},
    };

    /**
     * Title keywords in priority order; a matching keyword is itself the channel name.
     */
    private static final String[] TITLE_KEYWORDS = {
        "财经", "金融", "经济", "科技", "时尚", "互联网", "数码", "科学",
        "TMT", "通讯", "社会", "IT", "房产", "母婴", "3C",
    };

    /**
     * Derives the channel from the page title found among the given nodes.
     *
     * <p>The first node whose HTML contains {@code <title>} is used. Its title text is split
     * on {@code "_"}, the first segment is dropped, and the remaining segments are passed to
     * {@link #getChannel(String)}. If no node has a title, or anything goes wrong, the
     * default channel is returned.
     *
     * @param list nodes to inspect (callers pass the children of the document head)
     * @return the matched channel, never {@code null}
     * @author hero
     */
    public static String matchChannel(List<Node> list) {
        try {
            for (Node node : list) {
                if (!node.outerHtml().contains("<title>")) {
                    continue;
                }
                String title = node.toString().split("<title>")[1].split("</title>")[0];
                String[] segments = title.split("_");
                // Everything after the first "_"-separated segment, each followed by "_".
                StringBuilder tail = new StringBuilder();
                for (int idx = 1; idx < segments.length; idx++) {
                    tail.append(segments[idx]).append("_");
                }
                return getChannel(tail.toString());
            }
        } catch (Exception e) {
            return DEFAULT_CHANNEL;
        }
        return DEFAULT_CHANNEL;
    }

    /**
     * Determines the channel from fragments contained in the article URL.
     *
     * @param url the article URL
     * @return the channel of the first matching rule, or {@code null} when no rule matches
     * @author hero
     */
    public static String verifyChannel(String url) {
        for (String[] rule : URL_RULES) {
            for (int idx = 1; idx < rule.length; idx++) {
                if (url.contains(rule[idx])) {
                    return rule[0];
                }
            }
        }
        return null;
    }

    /**
     * Maps a piece of text to a channel by looking for known channel keywords in it.
     *
     * @param source text to search
     * @return the first keyword contained in {@code source}, or the default channel
     *         {@code "新闻"} when none is found
     * @author hero
     */
    public static String getChannel(String source) {
        for (String keyword : TITLE_KEYWORDS) {
            if (source.contains(keyword)) {
                return keyword;
            }
        }
        return DEFAULT_CHANNEL;
    }
}
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
0 → 100644
View file @
7a6d49e2
package
com
.
zhiwei
.
source_forward
.
util
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
cn.edu.hfut.dmic.contentextractor.ContentExtractor
;
import
cn.edu.hfut.dmic.contentextractor.News
;
/**
 * Extracts the main article text from a page's HTML.
 *
 * @author hero
 */
public class MatchContent {

    /**
     * Returns the article body extracted from {@code html}.
     *
     * @param url  the page URL (not used by the extraction itself)
     * @param html the raw HTML of the page
     * @return the extracted text, or {@code null} if extraction throws
     * @author hero
     */
    public static String matchContent(String url, String html) {
        // Parsed before entering the try block, so a parse failure is not swallowed here.
        Document document = Jsoup.parse(html);
        try {
            return mathchContent(html, document);
        } catch (Exception e) {
            return null;
        }
    }

    /**
     * Runs the content extractor on {@code html}; if the extractor fails, falls back to the
     * full text of the parsed {@code document}.
     *
     * @param html     the raw HTML of the page
     * @param document the same page parsed by jsoup, used only for the fallback
     * @return the filtered extracted body, or the document text when extraction failed
     * @author hero
     */
    private static String mathchContent(String html, Document document) {
        try {
            News extracted = ContentExtractor.getNewsByHtml(html);
            return TreateData.filterSpecialCharacter(extracted.getContent());
        } catch (Exception e) {
            // Extraction failed: use the whole page text instead and report the failure.
            String fallback = document.text();
            System.out.println("正文抽取失败处理........");
            e.printStackTrace();
            return fallback;
        }
    }
}
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
0 → 100644
View file @
7a6d49e2
package
com
.
zhiwei
.
source_forward
.
util
;
import
java.util.List
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
cn.edu.hfut.dmic.contentextractor.ContentExtractor
;
import
cn.edu.hfut.dmic.contentextractor.News
;
/**
* @ClassName: MatchSource
* @Description: 匹配来源
* @author hero
* @date 2018年6月30日 上午10:27:29
*/
public
class
MatchSource
{
private
static
String
fromRegex
=
"(来源:(\\S)*(\\s)*[\\S]*)|(来源:(\\S)*(\\s)*[\\S]*)|(本文来源于(\\S)*(\\s)*[\\S]*)"
+
"|(源:(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)"
+
"|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)"
+
"|(出自:(\\S)*(\\s)*[\\S]*)|(出自:(\\S)*(\\s)*[\\S]*)"
+
"|(转自:(\\S)*(\\s)*[\\S]*)|(转自:(\\S)*(\\s)*[\\S]*)"
+
"|(出处\\/作者:(\\S)*(\\s)*[\\S]*)|(出处\\/作者:(\\S)*(\\s)*[\\S]*)"
+
"|(出处:(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)|(稿源:(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)"
;
private
static
String
timeRegex
=
""
+
"([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})"
+
"|([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})"
+
"|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})"
+
"|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
+
"|(\\d{4}年\\d{0,2}月\\d{0,2}日)"
+
"|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})"
+
"|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
+
"|(\\d{0,2}月\\d{0,2}日)"
+
"|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
+
"|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
+
"|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2})"
;
/**
 * Determines which of the known sources an article page originates from.
 *
 * <p>A raw source string is first obtained from the page, using site-specific rules for a
 * number of known hosts and {@code mathchOtherSource} for everything else. The first entry
 * of {@code sourceList} contained in that raw string is returned.
 *
 * @param url        the article URL, used to pick the site-specific rule
 * @param html       the raw HTML of the page
 * @param sourceList known source names to match against
 * @return the matching entry of {@code sourceList}, or {@code null} when no raw source was
 *         found, nothing matched, or an exception occurred during matching
 * @author hero
 */
public static String matchSource(String url, String html, List<String> sourceList) {
    Document document = Jsoup.parse(html);
    // Computed up front and outside the try block, so failures here propagate to the caller.
    String htmlBody = TreateData.filterSpecialCharacter(document.select("body").text().toUpperCase());
    try {
        String rawSource = extractRawSource(url, html, htmlBody, document, sourceList);
        if (rawSource == null) {
            return null;
        }
        // Validate the raw source against the list of known sources.
        for (String known : sourceList) {
            if (rawSource.contains(known)) {
                return known;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}

/**
 * Obtains the unvalidated source string of a page; rules are tried in a fixed order.
 */
private static String extractRawSource(String url, String html, String htmlBody, Document document,
        List<String> sourceList) {
    // thepaper.cn
    if (url.contains("thepaper.cn")) {
        return document.select("div.news_about").text();
    }
    // sports.eastday.com
    if (url.contains("sports.eastday.com")) {
        return document.select("div.article").select("span").text();
    }
    // lesports.com
    if (url.contains("lesports.com")) {
        return document.select("div.article-source").select("strong").text();
    }
    // myzaker.com
    if (url.contains("myzaker.com")) {
        return document.select("div#article").select("span.auther").text();
    }
    // sina.com.cn / sohu.com: the source is taken from the "mediaid" meta tag when present
    if (url.contains("sina.com.cn") || url.contains("sohu.com")) {
        if (!html.contains("<meta name=\"mediaid\"")) {
            return null;
        }
        return html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0];
    }
    // a.mini.eastday.com: fixed source name (a selector-based lookup was disabled earlier)
    if (url.contains("a.mini.eastday.com")) {
        return "东方头条";
    }
    // orz520.com: fixed source name
    if (url.contains("orz520.com")) {
        return "千寻生活";
    }
    // sh.qihoo.com: fixed source name
    if (url.contains("sh.qihoo.com")) {
        return "今日爆点";
    }
    // itouchtv.cn: fixed source name
    if (url.contains("itouchtv.cn")) {
        return "触电新闻";
    }
    // yidianzixun.com: fixed name for "related_wemedia" pages, otherwise read from embedded data
    if (url.contains("yidianzixun.com")) {
        if (html.contains("related_wemedia")) {
            return "一点资讯";
        }
        return html.split("source\":\"")[1].split("\",\"")[0];
    }
    // Every other site goes through the generic matcher.
    return mathchOtherSource(html, htmlBody, sourceList);
}
/**
 * Extracts the self-media (we-media) account that published a page and prefixes it with the
 * platform name, e.g. {@code "今日头条-<account>"}.
 *
 * <p>The platform is chosen by testing {@code url} with {@code contains}, in the order the
 * branches appear below. A {@code null} or empty account is returned without a prefix.
 *
 * @param url  page URL used to select the platform rule
 * @param html raw HTML of the page
 * @return the prefixed account name, the raw extracted value where no prefix applies, or
 *         {@code null} when no rule matches or extraction throws
 */
public static String matchMediaSelfSource(String url, String html) {
    Document page = Jsoup.parse(html);
    try {
        if (url.contains("toutiao.com")) {
            // Toutiao: account name embedded in inline script data.
            String account = null;
            if (html.contains("name: '")) {
                String afterMediaInfo = html.split("mediaInfo:")[1];
                account = afterMediaInfo.split("name: '")[1].split("',")[0].trim();
            } else if (html.contains("screen_name:")) {
                account = html.split("screen_name:'")[1].split("',")[0].trim();
            }
            return tagWithPlatform("今日头条-", account);
        }
        if (url.contains("sohu.com")) {
            // Sohu we-media account
            return tagWithPlatform("搜狐-", mediaIdOf(html));
        }
        if (url.contains("a.mini.eastday.com")) {
            // Eastday Toutiao: second <i> inside the share block
            String account = page.select("[class=\"share_cnt_p clearfix\"]")
                    .select("div.fl").select("i").get(1).text().trim();
            return tagWithPlatform("东方头条-", account);
        }
        if (url.contains("sh.qihoo.com")) {
            String account = page.select("p.info").select("span.source").text().trim();
            return tagWithPlatform("快资讯-", account);
        }
        if (url.contains("cj.sina.com.cn")) {
            // Sina finance headline account
            return tagWithPlatform("财经头条-", mediaIdOf(html));
        }
        if (url.contains("baijia.baidu.com")) {
            // Baidu Baijia
            String account = page.select("section.info").select("span.author").text().trim();
            return tagWithPlatform("百度百家-", account);
        }
        if (url.contains("yidianzixun.com")) {
            // Yidian Zixun
            if (html.contains("related_wemedia")) {
                String account = html.split("media_name\":\"")[1].split("\",\"")[0].trim();
                return tagWithPlatform("一点资讯-", account);
            }
            return html.split("source\":\"")[1].split("\",\"")[0];
        }
        if (url.contains("news.bitauto.com")) {
            String account = page.select("[class=\"gz-box clearfix\"]").select("div.txt-box")
                    .select("p.p-n").select("a").text();
            return tagWithPlatform("易车网-", account);
        }
        if (url.contains("chejiahao.autohome.com.cn")) {
            String account = page.select("div.authorMes")
                    .select("[class=\"name text-overflow\"]").select("a").text();
            return tagWithPlatform("汽车之家-", account);
        }
        return null;
    } catch (Exception e) {
        return null;
    }
}

/** Reads the trimmed {@code mediaid} meta value, or {@code null} when the page has no such tag. */
private static String mediaIdOf(String html) {
    if (!html.contains("<meta name=\"mediaid\"")) {
        return null;
    }
    return html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
}

/** Prepends {@code platformPrefix} unless {@code account} is {@code null} or empty. */
private static String tagWithPlatform(String platformPrefix, String account) {
    boolean missing = account == null || account.equals("");
    return missing ? account : platformPrefix + account;
}
/**
 * Generic source matcher for pages without a site-specific rule.
 *
 * <p>The article content is extracted and cut out of {@code htmlBody}; the text before and
 * after it is searched first for a "source label" ({@code fromRegex}), then for a known
 * source near a date ({@code timeRegex}) around the title, and finally the content itself is
 * searched the same way. When the body cannot be split into a part before and a part after
 * the content, or extraction throws, the whole body is matched instead.
 *
 * <p>Changes from the previous version: a missing text segment no longer triggers the
 * whole-body fallback through an {@code ArrayIndexOutOfBoundsException} (and a misleading
 * stack trace), the title is split on literally instead of as a regular expression, and each
 * regex is evaluated once.
 *
 * @param html       raw HTML of the page
 * @param htmlBody   normalised, upper-cased body text of the page
 * @param sourceList known source names, tested in list order
 * @return the matching entry of {@code sourceList}, or {@code null} when none is found
 */
private static String mathchOtherSource(String html, String htmlBody, List<String> sourceList) {
    try {
        /* Extract the article content so that label matching can avoid the article text itself. */
        News news = ContentExtractor.getNewsByHtml(html);
        String content = TreateData.filterSpecialCharacter(news.getContent().toUpperCase());
        String title = TreateData.filterSpecialCharacter(news.getTitle().toUpperCase());

        /* Cut the content out of the body and keep the text around it. */
        String[] segments = htmlBody.replace(content, "@@@@@@@@@@").split("@@@@@@@@@@");

        String labelled = segments.length > 0 ? TreateData.regex(fromRegex, segments[0]) : null;
        if (labelled == null && segments.length > 1) {
            labelled = TreateData.regex(fromRegex, segments[1]);
        }
        if (labelled != null) {
            // A source label outside the content decides the result, listed or not.
            return firstListedSource(labelled, sourceList);
        }

        if (segments.length >= 2) {
            /* No label around the content: look for "<date> <source>" or "<source> <date>" near the title. */
            for (int i = 0; i < 2; i++) {
                if (segments[i].contains(title)) {
                    String found = sourceNearTitleDates(segments[i], title, sourceList);
                    if (found != null) {
                        return found;
                    }
                }
            }

            /* Nothing outside the content: match inside the content. */
            String inContent = TreateData.regex(fromRegex, content);
            if (inContent != null) {
                return firstListedSource(inContent, sourceList);
            }
            if (content.contains(title)) {
                return sourceNearTitleDates(content, title, sourceList);
            }
            String contentTime = TreateData.regex(timeRegex, content);
            return contentTime == null ? null : getSourceByTime(contentTime, content, sourceList);
        }
        // Fewer than two segments and no label: fall through to whole-body matching.
    } catch (Exception e) {
        System.out.println("正文抽取失败处理........");
        e.printStackTrace();
    }
    return matchWholeBody(htmlBody, sourceList);
}

/** Returns the first entry of {@code sourceList} contained in {@code text}, or {@code null}. */
private static String firstListedSource(String text, List<String> sourceList) {
    for (String candidate : sourceList) {
        if (text.contains(candidate)) {
            return candidate;
        }
    }
    return null;
}

/** Splits {@code text} on the literal title and looks for a known source near a date in each piece. */
private static String sourceNearTitleDates(String text, String title, List<String> sourceList) {
    // Quote the title: it is page text, not a regular expression.
    String[] pieces = text.split(java.util.regex.Pattern.quote(title));
    for (String piece : pieces) {
        String timeSource = TreateData.regex(timeRegex, piece);
        if (timeSource != null) {
            String found = getSourceByTime(timeSource, piece, sourceList);
            if (found != null) {
                return found;
            }
        }
    }
    return null;
}

/** Fallback: matches a source label, then a source near a date, over the entire body text. */
private static String matchWholeBody(String htmlBody, List<String> sourceList) {
    String labelled = TreateData.regex(fromRegex, htmlBody);
    if (labelled != null) {
        return firstListedSource(labelled, sourceList);
    }
    String timeSource = TreateData.regex(timeRegex, htmlBody);
    return timeSource == null ? null : getSourceByTime(timeSource, htmlBody, sourceList);
}
/**
 * Looks for a known source name right next to a date, covering layouts such as
 * "YYYY-MM-dd SomeDaily" and "SomeDaily YYYY-MM-dd".
 *
 * <p>{@code htmlBody} is split on every literal occurrence of {@code timeSource}. For the
 * first piece (text before the first date) the last 30 characters are inspected; for every
 * later piece the first 30 characters are inspected.
 *
 * @param timeSource date text previously matched in {@code htmlBody}; treated literally
 * @param htmlBody   text to search
 * @param sourceList known source names, tested in list order
 * @return the first entry of {@code sourceList} found in an inspected window, else {@code null}
 */
private static String getSourceByTime(String timeSource, String htmlBody, List<String> sourceList) {
    final int window = 30;
    // timeSource is matched page text and may contain regex metacharacters, so quote it.
    String[] times = htmlBody.split(java.util.regex.Pattern.quote(timeSource));
    for (int j = 0; j < times.length; j++) {
        String timecontent = times[j];
        if (timecontent.length() >= window) {
            timecontent = (j == 0)
                    ? timecontent.substring(timecontent.length() - window)
                    : timecontent.substring(0, window);
        }
        for (String sourceMatch : sourceList) {
            if (timecontent.contains(sourceMatch)) {
                return sourceMatch;
            }
        }
    }
    return null;
}
}
src/main/java/com/zhiwei/source_forward/util/TreateData.java
View file @
7a6d49e2
package
com
.
zhiwei
.
source_forward
.
util
;
package
com
.
zhiwei
.
source_forward
.
util
;
import
java.util.List
;
import
java.util.regex.Matcher
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
java.util.regex.Pattern
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Node
;
import
cn.edu.hfut.dmic.contentextractor.ContentExtractor
;
import
cn.edu.hfut.dmic.contentextractor.News
;
/**
/**
* @ClassName: TreateData
* @ClassName: TreateData
* @Description: TODO(数据处理类)
* @Description: TODO(数据处理类)
...
@@ -19,365 +10,6 @@ import cn.edu.hfut.dmic.contentextractor.News;
...
@@ -19,365 +10,6 @@ import cn.edu.hfut.dmic.contentextractor.News;
*/
*/
public
class
TreateData
{
public
class
TreateData
{
// Alternation of "source label" patterns. Every alternative is a label (e.g. 来源: / 本文来源于 /
// 来自: / 出自: / 转自: / 出处: / 稿源:) followed by a run of non-whitespace, optional whitespace
// and one more run of non-whitespace. Passed to regex(...) by mathchOtherSource.
// NOTE(review): several alternatives are textually identical as written here; presumably they
// once differed (e.g. half-width vs full-width colon) - confirm against the repository.
private static String fromRegex = "(来源:(\\S)*(\\s)*[\\S]*)|(来源:(\\S)*(\\s)*[\\S]*)|(本文来源于(\\S)*(\\s)*[\\S]*)"
        + "|(源:(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)"
        + "|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)"
        + "|(出自:(\\S)*(\\s)*[\\S]*)|(出自:(\\S)*(\\s)*[\\S]*)"
        + "|(转自:(\\S)*(\\s)*[\\S]*)|(转自:(\\S)*(\\s)*[\\S]*)"
        + "|(出处\\/作者:(\\S)*(\\s)*[\\S]*)|(出处\\/作者:(\\S)*(\\s)*[\\S]*)"
        + "|(出处:(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)|(稿源:(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)";

// Alternation of date / date-time shapes: a four-digit year with month and day separated by
// non-digits, the 年/月/日 forms with optional trailing time fields, and month-day-only forms.
// Passed to regex(...) to locate a publication time near which a source name is sought.
// NOTE(review): the second alternative starts with the exact text of the first one, and regex
// alternation is tried left to right, so the longer date-time form can probably never be the
// one selected - confirm how regex(...) applies the pattern.
private static String timeRegex = ""
        + "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})"
        + "|([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})"
        + "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})"
        + "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
        + "|(\\d{4}年\\d{0,2}月\\d{0,2}日)"
        + "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})"
        + "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
        + "|(\\d{0,2}月\\d{0,2}日)"
        + "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
        + "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
        + "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2})";
/**
 * Resolves the publishing source of a page and validates it against a list of known sources.
 *
 * <p>Pages from a few sites are read with a dedicated CSS selector, a {@code mediaid} meta
 * tag, an embedded {@code source":"} value, or are mapped to a fixed name; every other page
 * goes through {@code mathchOtherSource}. The first entry of {@code sourceList} contained in
 * the extracted text is returned.
 *
 * @param url        page URL; only inspected with {@code contains} to choose the site rule
 * @param html       raw HTML of the page
 * @param sourceList known source names, tested in list order
 * @return the matching entry of {@code sourceList}, or {@code null} when any argument is
 *         {@code null}, nothing could be extracted, no entry matches, or extraction threw
 */
public static String matchSource(String url, String html, List<String> sourceList) {
    // Previously a null html reached Jsoup.parse outside the try block and escaped as an exception.
    if (url == null || html == null || sourceList == null) {
        return null;
    }
    Document document = Jsoup.parse(html);
    try {
        String source;
        /* Site-specific rules first. */
        if (url.contains("thepaper.cn")) {
            // The Paper
            source = document.select("div.news_about").text();
        } else if (url.contains("sports.eastday.com")) {
            // Eastday sports
            source = document.select("div.article").select("span").text();
        } else if (url.contains("lesports.com")) {
            // LeSports
            source = document.select("div.article-source").select("strong").text();
        } else if (url.contains("myzaker.com")) {
            // ZAKER
            source = document.select("div#article").select("span.auther").text();
        } else if (url.contains("sina.com.cn") || url.contains("sohu.com")) {
            // Sina / Sohu: value of the mediaid meta tag, null when the tag is absent.
            source = extractBetween(html, "<meta name=\"mediaid\" content=\"", "\"");
        } else if (url.contains("a.mini.eastday.com")) {
            source = "东方头条";
        } else if (url.contains("orz520.com")) {
            source = "千寻生活";
        } else if (url.contains("sh.qihoo.com")) {
            source = "今日爆点";
        } else if (url.contains("itouchtv.cn")) {
            source = "触电新闻";
        } else if (url.contains("yidianzixun.com")) {
            // Yidian Zixun: we-media pages map to the platform name, others embed the source.
            source = html.contains("related_wemedia")
                    ? "一点资讯"
                    : extractBetween(html, "source\":\"", "\",\"");
        } else {
            // Generic pages: the normalised body text is only needed on this path.
            String htmlBody = filterSpecialCharacter(document.select("body").text().toUpperCase());
            source = mathchOtherSource(html, htmlBody, sourceList);
        }

        if (source != null) {
            // Validate the extracted text against the known sources.
            for (String sourceMatch : sourceList) {
                if (source.contains(sourceMatch)) {
                    return sourceMatch;
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}

/**
 * Returns the text that follows the first occurrence of {@code startMarker}, cut at the next
 * occurrence of {@code startMarker} (if any) and then at the first {@code endMarker} (if any).
 * Replaces the former {@code split(start)[1].split(end)[0]} chain, which threw when the
 * marker was missing.
 *
 * @return the enclosed text (possibly empty), or {@code null} when {@code startMarker} is absent
 */
private static String extractBetween(String text, String startMarker, String endMarker) {
    int start = text.indexOf(startMarker);
    if (start < 0) {
        return null;
    }
    start += startMarker.length();
    int nextStart = text.indexOf(startMarker, start);
    String segment = nextStart < 0 ? text.substring(start) : text.substring(start, nextStart);
    int end = segment.indexOf(endMarker);
    return end < 0 ? segment : segment.substring(0, end);
}
/**
 * Extracts the self-media (we-media) account that published a page and prefixes it with the
 * platform name, e.g. {@code "今日头条-<account>"}.
 *
 * <p>The platform is chosen by testing {@code url} with {@code contains}, in the order the
 * branches appear below. A {@code null} or empty account is returned without a prefix.
 *
 * @param url  page URL used to select the platform rule
 * @param html raw HTML of the page
 * @return the prefixed account name, the raw extracted value where no prefix applies, or
 *         {@code null} when no rule matches or extraction throws
 */
public static String matchMediaSelfSource(String url, String html) {
    Document page = Jsoup.parse(html);
    try {
        if (url.contains("toutiao.com")) {
            // Toutiao: account name embedded in inline script data.
            String account = null;
            if (html.contains("name: '")) {
                String afterMediaInfo = html.split("mediaInfo:")[1];
                account = afterMediaInfo.split("name: '")[1].split("',")[0].trim();
            } else if (html.contains("screen_name:")) {
                account = html.split("screen_name:'")[1].split("',")[0].trim();
            }
            return tagWithPlatform("今日头条-", account);
        }
        if (url.contains("sohu.com")) {
            // Sohu we-media account
            return tagWithPlatform("搜狐-", mediaIdOf(html));
        }
        if (url.contains("a.mini.eastday.com")) {
            // Eastday Toutiao: second <i> inside the share block
            String account = page.select("[class=\"share_cnt_p clearfix\"]")
                    .select("div.fl").select("i").get(1).text().trim();
            return tagWithPlatform("东方头条-", account);
        }
        if (url.contains("sh.qihoo.com")) {
            String account = page.select("p.info").select("span.source").text().trim();
            return tagWithPlatform("快资讯-", account);
        }
        if (url.contains("cj.sina.com.cn")) {
            // Sina finance headline account
            return tagWithPlatform("财经头条-", mediaIdOf(html));
        }
        if (url.contains("baijia.baidu.com")) {
            // Baidu Baijia
            String account = page.select("section.info").select("span.author").text().trim();
            return tagWithPlatform("百度百家-", account);
        }
        if (url.contains("yidianzixun.com")) {
            // Yidian Zixun
            if (html.contains("related_wemedia")) {
                String account = html.split("media_name\":\"")[1].split("\",\"")[0].trim();
                return tagWithPlatform("一点资讯-", account);
            }
            return html.split("source\":\"")[1].split("\",\"")[0];
        }
        if (url.contains("news.bitauto.com")) {
            String account = page.select("[class=\"gz-box clearfix\"]").select("div.txt-box")
                    .select("p.p-n").select("a").text();
            return tagWithPlatform("易车网-", account);
        }
        if (url.contains("chejiahao.autohome.com.cn")) {
            String account = page.select("div.authorMes")
                    .select("[class=\"name text-overflow\"]").select("a").text();
            return tagWithPlatform("汽车之家-", account);
        }
        return null;
    } catch (Exception e) {
        return null;
    }
}

/** Reads the trimmed {@code mediaid} meta value, or {@code null} when the page has no such tag. */
private static String mediaIdOf(String html) {
    if (!html.contains("<meta name=\"mediaid\"")) {
        return null;
    }
    return html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
}

/** Prepends {@code platformPrefix} unless {@code account} is {@code null} or empty. */
private static String tagWithPlatform(String platformPrefix, String account) {
    boolean missing = account == null || account.equals("");
    return missing ? account : platformPrefix + account;
}
/**
 * Derives the channel of a page from its {@code <title>}.
 *
 * <p>The first node whose outer HTML contains {@code <title>} is used: the title text is
 * split on {@code "_"}, every part after the first is re-joined (each followed by
 * {@code "_"}) and the result is classified with {@code getChannel}.
 *
 * @param list nodes to scan
 * @return the channel reported by {@code getChannel}, or {@code "新闻"} when no node carries
 *         a title or any step throws
 */
public static String matchChannel(List<Node> list) {
    final String fallback = "新闻";
    try {
        for (Node candidate : list) {
            if (!candidate.outerHtml().contains("<title>")) {
                continue;
            }
            String titleText = candidate.toString().split("<title>")[1].split("</title>")[0];
            String[] parts = titleText.split("_");
            // Skip the leading part; keep the remaining suffixes.
            StringBuilder suffixes = new StringBuilder();
            for (int i = 1; i < parts.length; i++) {
                suffixes.append(parts[i]).append("_");
            }
            return getChannel(suffixes.toString());
        }
        return fallback;
    } catch (Exception e) {
        return fallback;
    }
}
/**
 * Generic source matcher for pages without a site-specific rule.
 *
 * <p>The article content is extracted and cut out of {@code htmlBody}; the text before and
 * after it is searched first for a "source label" ({@code fromRegex}), then for a known
 * source near a date ({@code timeRegex}) around the title, and finally the content itself is
 * searched the same way. When the body cannot be split into a part before and a part after
 * the content, or extraction throws, the whole body is matched instead.
 *
 * <p>Changes from the previous version: a missing text segment no longer triggers the
 * whole-body fallback through an {@code ArrayIndexOutOfBoundsException} (and a misleading
 * stack trace), the title is split on literally instead of as a regular expression, and each
 * regex is evaluated once.
 *
 * @param html       raw HTML of the page
 * @param htmlBody   normalised, upper-cased body text of the page
 * @param sourceList known source names, tested in list order
 * @return the matching entry of {@code sourceList}, or {@code null} when none is found
 */
private static String mathchOtherSource(String html, String htmlBody, List<String> sourceList) {
    try {
        /* Extract the article content so that label matching can avoid the article text itself. */
        News news = ContentExtractor.getNewsByHtml(html);
        String content = filterSpecialCharacter(news.getContent().toUpperCase());
        String title = filterSpecialCharacter(news.getTitle().toUpperCase());

        /* Cut the content out of the body and keep the text around it. */
        String[] segments = htmlBody.replace(content, "@@@@@@@@@@").split("@@@@@@@@@@");

        String labelled = segments.length > 0 ? regex(fromRegex, segments[0]) : null;
        if (labelled == null && segments.length > 1) {
            labelled = regex(fromRegex, segments[1]);
        }
        if (labelled != null) {
            // A source label outside the content decides the result, listed or not.
            return firstListedSource(labelled, sourceList);
        }

        if (segments.length >= 2) {
            /* No label around the content: look for "<date> <source>" or "<source> <date>" near the title. */
            for (int i = 0; i < 2; i++) {
                if (segments[i].contains(title)) {
                    String found = sourceNearTitleDates(segments[i], title, sourceList);
                    if (found != null) {
                        return found;
                    }
                }
            }

            /* Nothing outside the content: match inside the content. */
            String inContent = regex(fromRegex, content);
            if (inContent != null) {
                return firstListedSource(inContent, sourceList);
            }
            if (content.contains(title)) {
                return sourceNearTitleDates(content, title, sourceList);
            }
            String contentTime = regex(timeRegex, content);
            return contentTime == null ? null : getSourceByTime(contentTime, content, sourceList);
        }
        // Fewer than two segments and no label: fall through to whole-body matching.
    } catch (Exception e) {
        System.out.println("正文抽取失败处理........");
        e.printStackTrace();
    }
    return matchWholeBody(htmlBody, sourceList);
}

/** Returns the first entry of {@code sourceList} contained in {@code text}, or {@code null}. */
private static String firstListedSource(String text, List<String> sourceList) {
    for (String candidate : sourceList) {
        if (text.contains(candidate)) {
            return candidate;
        }
    }
    return null;
}

/** Splits {@code text} on the literal title and looks for a known source near a date in each piece. */
private static String sourceNearTitleDates(String text, String title, List<String> sourceList) {
    // Quote the title: it is page text, not a regular expression.
    String[] pieces = text.split(java.util.regex.Pattern.quote(title));
    for (String piece : pieces) {
        String timeSource = regex(timeRegex, piece);
        if (timeSource != null) {
            String found = getSourceByTime(timeSource, piece, sourceList);
            if (found != null) {
                return found;
            }
        }
    }
    return null;
}

/** Fallback: matches a source label, then a source near a date, over the entire body text. */
private static String matchWholeBody(String htmlBody, List<String> sourceList) {
    String labelled = regex(fromRegex, htmlBody);
    if (labelled != null) {
        return firstListedSource(labelled, sourceList);
    }
    String timeSource = regex(timeRegex, htmlBody);
    return timeSource == null ? null : getSourceByTime(timeSource, htmlBody, sourceList);
}
/***
/***
*
*
...
@@ -404,148 +36,6 @@ public class TreateData {
...
@@ -404,148 +36,6 @@ public class TreateData {
}
}
/**
 * Looks for a known source name right next to a date, covering layouts such as
 * "YYYY-MM-dd SomeDaily" and "SomeDaily YYYY-MM-dd".
 *
 * <p>{@code htmlBody} is split on every literal occurrence of {@code timeSource}. For the
 * first piece (text before the first date) the last 30 characters are inspected; for every
 * later piece the first 30 characters are inspected.
 *
 * @param timeSource date text previously matched in {@code htmlBody}; treated literally
 * @param htmlBody   text to search
 * @param sourceList known source names, tested in list order
 * @return the first entry of {@code sourceList} found in an inspected window, else {@code null}
 */
private static String getSourceByTime(String timeSource, String htmlBody, List<String> sourceList) {
    final int window = 30;
    // timeSource is matched page text and may contain regex metacharacters, so quote it.
    String[] times = htmlBody.split(java.util.regex.Pattern.quote(timeSource));
    for (int j = 0; j < times.length; j++) {
        String timecontent = times[j];
        if (timecontent.length() >= window) {
            timecontent = (j == 0)
                    ? timecontent.substring(timecontent.length() - window)
                    : timecontent.substring(0, window);
        }
        for (String sourceMatch : sourceList) {
            if (timecontent.contains(sourceMatch)) {
                return sourceMatch;
            }
        }
    }
    return null;
}
/**
 * Classifies a piece of text into a channel by keyword.
 *
 * <p>The keywords are tested in a fixed priority order; the first one contained in
 * {@code source} is returned as the channel name.
 *
 * @param source text to classify (must not be {@code null})
 * @return the first matching keyword, or {@code "新闻"} when none matches
 */
public static String getChannel(String source) {
    // Order matters: earlier keywords win when several are present.
    final String[] keywordsByPriority = {
        "财经", "金融", "经济", "科技", "时尚", "互联网", "数码", "科学",
        "TMT", "通讯", "社会", "IT", "房产", "母婴", "3C"
    };
    for (String keyword : keywordsByPriority) {
        if (source.contains(keyword)) {
            return keyword;
        }
    }
    return "新闻";
}
/**
 * Infers the channel of an article from fragments of its URL.
 *
 * <p>Rules are evaluated top to bottom; the first rule with any fragment contained in
 * {@code url} decides the channel.
 *
 * @param url article URL (must not be {@code null})
 * @return the channel name, or {@code null} when no rule matches
 */
public static String verifyChannel(String url) {
    // Each row: channel name followed by the URL fragments that select it.
    final String[][] rules = {
        {"新闻", "news.", "cj.sina.com.cn", "wemedia.ifeng.com"},
        {"财经", "finance.", "business.", "money.", "stock.", "10jqka.com.cn"},
        {"科技", "tech.", "it.", "pcedu.", "mobile.", "vr."},
        {"体育", "sports."},
        {"娱乐", "ent.", "yule."},
        {"汽车", "auto."},
        {"时尚", "fashion."},
        {"教育", "learning.", "edu."},
        {"母婴", "baobao."},
        {"房产", "house.", "leju.", "focus."},
        {"游戏", "games."},
        {"国际", "intl."},
        {"科学", "science."},
        {"城市", "city."},
        {"市场", "sc."}
    };
    for (String[] rule : rules) {
        for (int k = 1; k < rule.length; k++) {
            if (url.contains(rule[k])) {
                return rule[0];
            }
        }
    }
    return null;
}
public
static
String
filterSpecialCharacter
(
String
str
)
{
public
static
String
filterSpecialCharacter
(
String
str
)
{
try
{
try
{
String
regEx
=
"【[`~!@#$%^&*()+=|{}';'//[//].<>/?~!@#%……&*——+|{}“”;‘’,。、·]】"
;
String
regEx
=
"【[`~!@#$%^&*()+=|{}';'//[//].<>/?~!@#%……&*——+|{}“”;‘’,。、·]】"
;
...
...
src/test/java/com/zhiwei/source_forward/sourceforward/test/MediaSelfSourceTest.java
0 → 100644
View file @
7a6d49e2
package
com
.
zhiwei
.
source_forward
.
sourceforward
.
test
;
import
java.util.HashMap
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.source_forward.run.SourceForward
;
/**
 * Smoke test for the self-media (we-media) account lookup: hands a single Toutiao article
 * URL with an empty attribute map to {@code SourceForward.getMediaSelfSource}. It contains
 * no assertions.
 *
 * @author hero
 * @date 2017-12-06 09:55:13
 */
public class MediaSelfSourceTest {

    @Test
    public void sourceForwardTest() {
        final String articleUrl = "https://www.toutiao.com/a6549872248428167687/";
        Map<String, Map<String, Object>> requests = new HashMap<String, Map<String, Object>>();
        Map<String, Object> attributes = new HashMap<String, Object>();
        requests.put(articleUrl, attributes);
        SourceForward.getMediaSelfSource(requests);
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment