Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
7a6d49e2
Commit
7a6d49e2
authored
Jun 30, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加内容匹配
parent
82632f70
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
461 additions
and
144 deletions
+461
-144
src/main/java/com/zhiwei/source_forward/crawler/ContentPageProcessor.java
+51
-0
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourcePageProcessor.java
+6
-5
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardPageProcessor.java
+7
-5
src/main/java/com/zhiwei/source_forward/crawler/UrlLivePageProcessor.java
+1
-1
src/main/java/com/zhiwei/source_forward/pipeline/DataPipeline.java
+99
-0
src/main/java/com/zhiwei/source_forward/pipeline/MediaSelfSourceDataPipeline.java
+0
-40
src/main/java/com/zhiwei/source_forward/pipeline/SourceForwardDataPipeline.java
+0
-40
src/main/java/com/zhiwei/source_forward/pipeline/UrlLivePipeline.java
+0
-40
src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
+46
-0
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
+9
-10
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+3
-3
src/main/java/com/zhiwei/source_forward/util/MatchChannel.java
+143
-0
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
+60
-0
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+0
-0
src/main/java/com/zhiwei/source_forward/util/TreateData.java
+0
-0
src/test/java/com/zhiwei/source_forward/sourceforward/test/MediaSelfSourceTest.java
+36
-0
No files found.
src/main/java/com/zhiwei/source_forward/crawler/ContentPageProcessor.java
0 → 100644
View file @
7a6d49e2
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
com.zhiwei.source_forward.util.TreateData
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.processor.PageProcessor
;
/**
 * Page processor that extracts the article body text of every crawled page.
 *
 * <p>For each page a map with the keys {@code "url"} and {@code "content"} is published under
 * the result field {@code "content"}. The {@code "content"} value is {@code null} when the page
 * answered with HTTP 404 or when extraction failed.
 *
 * @author hero
 */
public class ContentPageProcessor implements PageProcessor {

    private Site site = Site.me().setCycleRetryTimes(3).setSleepTime(1500)
            .setTimeOut(10000)
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .addHeader("Accept-Encoding", "gzip, deflate, br");

    /**
     * @return the crawl settings (retries, sleep time, timeout, user agent, headers) of this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts the article text of {@code page} and publishes it together with the page URL.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        Map<String, Object> data = new HashMap<String, Object>();
        String content = null;
        try {
            if (page.getStatusCode() != 404) {
                // Keep the extracted text; previously the return value was dropped,
                // so "content" was always published as null.
                content = MatchContent.matchContent(page.getUrl().get(), page.getHtml().toString());
            }
        } catch (Exception e) {
            // Best effort: a page that cannot be parsed is reported with null content.
            content = null;
        }
        data.put("url", page.getUrl().get());
        data.put("content", content);
        page.putField("content", data);
    }
}
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourcePageProcessor.java
View file @
7a6d49e2
...
@@ -6,7 +6,8 @@ import java.util.Map;
...
@@ -6,7 +6,8 @@ import java.util.Map;
import
org.jsoup.nodes.Node
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.source_forward.util.TreateData
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
...
@@ -31,14 +32,14 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
...
@@ -31,14 +32,14 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
String
channel
=
null
;
String
channel
=
null
;
try
{
try
{
if
(
page
.
getStatusCode
()!=
404
){
if
(
page
.
getStatusCode
()!=
404
){
source
=
TreateData
.
matchMediaSelfSource
(
page
.
getUrl
().
get
(),
page
.
getHtml
().
toString
());
source
=
MatchSource
.
matchMediaSelfSource
(
page
.
getUrl
().
get
(),
page
.
getHtml
().
toString
());
if
(
source
==
null
||
source
.
equals
(
""
)){
if
(
source
==
null
||
source
.
equals
(
""
)){
source
=
null
;
source
=
null
;
}
}
channel
=
TreateData
.
verifyChannel
(
page
.
getUrl
().
get
());
channel
=
MatchChannel
.
verifyChannel
(
page
.
getUrl
().
get
());
if
(
channel
==
null
){
if
(
channel
==
null
){
List
<
Node
>
nodeList
=
page
.
getHtml
().
getDocument
().
head
().
childNodes
();
List
<
Node
>
nodeList
=
page
.
getHtml
().
getDocument
().
head
().
childNodes
();
channel
=
TreateData
.
matchChannel
(
nodeList
);
channel
=
MatchChannel
.
matchChannel
(
nodeList
);
}
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -49,7 +50,7 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
...
@@ -49,7 +50,7 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
data
.
put
(
"mediaself"
,
source
);
data
.
put
(
"mediaself"
,
source
);
data
.
put
(
"channel"
,
channel
);
data
.
put
(
"channel"
,
channel
);
page
.
putField
(
"
data
"
,
data
);
page
.
putField
(
"
mediaSelf
"
,
data
);
}
}
}
}
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardPageProcessor.java
View file @
7a6d49e2
...
@@ -6,8 +6,10 @@ import java.util.Map;
...
@@ -6,8 +6,10 @@ import java.util.Map;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Node
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.TreateData
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.processor.PageProcessor
;
...
@@ -45,12 +47,12 @@ public class SourceForwardPageProcessor implements PageProcessor {
...
@@ -45,12 +47,12 @@ public class SourceForwardPageProcessor implements PageProcessor {
}
}
data
.
put
(
"isforward"
,
isforward
);
data
.
put
(
"isforward"
,
isforward
);
}
else
{
}
else
{
channel
=
TreateData
.
verifyChannel
(
page
.
getUrl
().
get
());
channel
=
MatchChannel
.
verifyChannel
(
page
.
getUrl
().
get
());
if
(
channel
==
null
){
if
(
channel
==
null
){
List
<
Node
>
nodeList
=
page
.
getHtml
().
getDocument
().
head
().
childNodes
();
List
<
Node
>
nodeList
=
page
.
getHtml
().
getDocument
().
head
().
childNodes
();
channel
=
TreateData
.
matchChannel
(
nodeList
);
channel
=
MatchChannel
.
matchChannel
(
nodeList
);
}
}
source
=
TreateData
.
matchSource
(
page
.
getUrl
().
get
(),
page
.
getHtml
().
toString
(),
sourceList
);
source
=
MatchSource
.
matchSource
(
page
.
getUrl
().
get
(),
page
.
getHtml
().
toString
(),
sourceList
);
}
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -62,7 +64,7 @@ public class SourceForwardPageProcessor implements PageProcessor {
...
@@ -62,7 +64,7 @@ public class SourceForwardPageProcessor implements PageProcessor {
data
.
put
(
"channel"
,
channel
);
data
.
put
(
"channel"
,
channel
);
data
.
put
(
"root_source"
,
source
);
data
.
put
(
"root_source"
,
source
);
page
.
putField
(
"
data
"
,
data
);
page
.
putField
(
"
sourceForward
"
,
data
);
}
}
}
}
src/main/java/com/zhiwei/source_forward/crawler/UrlLivePageProcessor.java
View file @
7a6d49e2
...
@@ -35,7 +35,7 @@ public class UrlLivePageProcessor implements PageProcessor{
...
@@ -35,7 +35,7 @@ public class UrlLivePageProcessor implements PageProcessor{
Map
<
String
,
Object
>
data
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
data
=
new
HashMap
<
String
,
Object
>();
data
.
put
(
"url"
,
page
.
getUrl
().
get
());
data
.
put
(
"url"
,
page
.
getUrl
().
get
());
data
.
put
(
"live"
,
f
);
data
.
put
(
"live"
,
f
);
page
.
putField
(
"
data
"
,
data
);
page
.
putField
(
"
urlLive
"
,
data
);
}
}
@Override
@Override
...
...
src/main/java/com/zhiwei/source_forward/pipeline/DataPipeline.java
0 → 100644
View file @
7a6d49e2
package
com
.
zhiwei
.
source_forward
.
pipeline
;
import
java.util.List
;
import
java.util.Map
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
/**
 * Pipeline that collects the result maps published by the crawler page processors.
 *
 * <p>Each processed page may carry a map under one of the result fields {@code "content"},
 * {@code "mediaSelf"}, {@code "sourceForward"} or {@code "urlLive"}; every non-null map is
 * appended to the list of the matching kind.
 *
 * @author hero
 */
public class DataPipeline implements Pipeline {

    private List<Map<String, Object>> contentDataList;
    private List<Map<String, Object>> mediaSelfDataList;
    private List<Map<String, Object>> sourceForwardDataList;
    private List<Map<String, Object>> urlLivedataList;

    /**
     * Creates a pipeline that appends to the supplied lists.
     *
     * @param dataList              not used by this class; kept so existing callers still compile
     * @param contentDataList       receives the {@code "content"} results
     * @param mediaSelfDataList     receives the {@code "mediaSelf"} results
     * @param sourceForwardDataList receives the {@code "sourceForward"} results
     * @param urlLivedataList       receives the {@code "urlLive"} results
     */
    public DataPipeline(List<Map<String, Object>> dataList, List<Map<String, Object>> contentDataList,
            List<Map<String, Object>> mediaSelfDataList, List<Map<String, Object>> sourceForwardDataList,
            List<Map<String, Object>> urlLivedataList) {
        super();
        this.contentDataList = contentDataList;
        this.mediaSelfDataList = mediaSelfDataList;
        this.sourceForwardDataList = sourceForwardDataList;
        this.urlLivedataList = urlLivedataList;
    }

    /**
     * Creates a pipeline with four empty result lists.
     *
     * <p>The lists used to stay {@code null} here, which made {@link #process} throw a
     * {@link NullPointerException} on the first result.
     */
    public DataPipeline() {
        super();
        this.contentDataList = newResultList();
        this.mediaSelfDataList = newResultList();
        this.sourceForwardDataList = newResultList();
        this.urlLivedataList = newResultList();
    }

    /**
     * Creates an empty result list. It is synchronized because the callers in this project run
     * the spider with several threads ({@code spider.thread(5)}).
     */
    private static List<Map<String, Object>> newResultList() {
        return java.util.Collections.synchronizedList(new java.util.ArrayList<Map<String, Object>>());
    }

    /**
     * Appends every result map present in {@code resultItems} to the list of its kind.
     *
     * @param resultItems the fields published by the page processor for one page
     * @param task        the running crawl task (not used)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> contentData = resultItems.get("content");
        Map<String, Object> mediaSelfData = resultItems.get("mediaSelf");
        Map<String, Object> sourceForwardData = resultItems.get("sourceForward");
        Map<String, Object> urlLivedata = resultItems.get("urlLive");
        if (contentData != null) {
            contentDataList.add(contentData);
        }
        if (mediaSelfData != null) {
            mediaSelfDataList.add(mediaSelfData);
        }
        if (sourceForwardData != null) {
            sourceForwardDataList.add(sourceForwardData);
        }
        if (urlLivedata != null) {
            urlLivedataList.add(urlLivedata);
        }
    }

    /** @return the collected {@code "content"} results */
    public List<Map<String, Object>> getContentDataList() {
        return contentDataList;
    }

    /** @param contentDataList the list that receives {@code "content"} results */
    public void setContentDataList(List<Map<String, Object>> contentDataList) {
        this.contentDataList = contentDataList;
    }

    /** @return the collected {@code "mediaSelf"} results */
    public List<Map<String, Object>> getMediaSelfDataList() {
        return mediaSelfDataList;
    }

    /** @param mediaSelfDataList the list that receives {@code "mediaSelf"} results */
    public void setMediaSelfDataList(List<Map<String, Object>> mediaSelfDataList) {
        this.mediaSelfDataList = mediaSelfDataList;
    }

    /** @return the collected {@code "sourceForward"} results */
    public List<Map<String, Object>> getSourceForwardDataList() {
        return sourceForwardDataList;
    }

    /** @param sourceForwardDataList the list that receives {@code "sourceForward"} results */
    public void setSourceForwardDataList(List<Map<String, Object>> sourceForwardDataList) {
        this.sourceForwardDataList = sourceForwardDataList;
    }

    /** @return the collected {@code "urlLive"} results */
    public List<Map<String, Object>> getUrlLivedataList() {
        return urlLivedataList;
    }

    /** @param urlLivedataList the list that receives {@code "urlLive"} results */
    public void setUrlLivedataList(List<Map<String, Object>> urlLivedataList) {
        this.urlLivedataList = urlLivedataList;
    }
}
src/main/java/com/zhiwei/source_forward/pipeline/MediaSelfSourceDataPipeline.java
deleted
100644 → 0
View file @
82632f70
package
com
.
zhiwei
.
source_forward
.
pipeline
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
/**
 * Pipeline that gathers the {@code "data"} result map of every processed page into a list.
 */
public class MediaSelfSourceDataPipeline implements Pipeline {

    // Collected "data" maps, one per page that published a result.
    private List<Map<String, Object>> dataList;

    /** Creates a pipeline backed by a new, empty list. */
    public MediaSelfSourceDataPipeline() {
        this(new ArrayList<Map<String, Object>>());
    }

    /**
     * Creates a pipeline that appends to the given list.
     *
     * @param dataList the list that receives the results
     */
    public MediaSelfSourceDataPipeline(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }

    /**
     * Appends the page's {@code "data"} map to the list when one is present.
     *
     * @param resultItems the fields published for one page
     * @param task        the running crawl task (not used)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> record = resultItems.get("data");
        if (record == null) {
            return;
        }
        dataList.add(record);
    }

    /** @return the list of collected results */
    public List<Map<String, Object>> getDataList() {
        return dataList;
    }

    /** @param dataList the list that receives the results from now on */
    public void setDataList(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }
}
src/main/java/com/zhiwei/source_forward/pipeline/SourceForwardDataPipeline.java
deleted
100644 → 0
View file @
82632f70
package
com
.
zhiwei
.
source_forward
.
pipeline
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
/**
 * Pipeline that gathers the {@code "data"} result map of every processed page into a list.
 */
public class SourceForwardDataPipeline implements Pipeline {

    // Collected "data" maps, one per page that published a result.
    private List<Map<String, Object>> dataList;

    /** Creates a pipeline backed by a new, empty list. */
    public SourceForwardDataPipeline() {
        this(new ArrayList<Map<String, Object>>());
    }

    /**
     * Creates a pipeline that appends to the given list.
     *
     * @param dataList the list that receives the results
     */
    public SourceForwardDataPipeline(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }

    /**
     * Appends the page's {@code "data"} map to the list when one is present.
     *
     * @param resultItems the fields published for one page
     * @param task        the running crawl task (not used)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> record = resultItems.get("data");
        if (record == null) {
            return;
        }
        dataList.add(record);
    }

    /** @return the list of collected results */
    public List<Map<String, Object>> getDataList() {
        return dataList;
    }

    /** @param dataList the list that receives the results from now on */
    public void setDataList(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }
}
src/main/java/com/zhiwei/source_forward/pipeline/UrlLivePipeline.java
deleted
100644 → 0
View file @
82632f70
package
com
.
zhiwei
.
source_forward
.
pipeline
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
/**
 * Pipeline that gathers the {@code "data"} result map of every processed page into a list.
 */
public class UrlLivePipeline implements Pipeline {

    // Collected "data" maps, one per page that published a result.
    private List<Map<String, Object>> dataList;

    /** Creates a pipeline backed by a new, empty list. */
    public UrlLivePipeline() {
        this(new ArrayList<Map<String, Object>>());
    }

    /**
     * Creates a pipeline that appends to the given list.
     *
     * @param dataList the list that receives the results
     */
    public UrlLivePipeline(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }

    /**
     * Appends the page's {@code "data"} map to the list when one is present.
     *
     * @param resultItems the fields published for one page
     * @param task        the running crawl task (not used)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> record = resultItems.get("data");
        if (record == null) {
            return;
        }
        dataList.add(record);
    }

    /** @return the list of collected results */
    public List<Map<String, Object>> getDataList() {
        return dataList;
    }

    /** @param dataList the list that receives the results from now on */
    public void setDataList(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }
}
src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
0 → 100644
View file @
7a6d49e2
package
com
.
zhiwei
.
source_forward
.
run
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
com.zhiwei.source_forward.crawler.ContentPageProcessor
;
import
com.zhiwei.source_forward.downloader.MyDownLoader
;
import
com.zhiwei.source_forward.pipeline.DataPipeline
;
import
us.codecraft.webmagic.Spider
;
/**
 * Entry point that crawls a set of URLs and attaches the extracted article text to each entry.
 */
public class ContentMatch {

    /**
     * Crawls every URL that is a key of {@code dataMap} and stores the extracted article text
     * under the key {@code "content"} of the matching entry.
     *
     * <p>Entries whose URL produced no crawl result are left untouched. When extraction yielded
     * nothing for a crawled URL, {@code "content"} is set to {@code null}.
     *
     * @param dataMap the records to enrich, keyed by article URL
     * @return the same {@code dataMap} instance, enriched in place
     * @author hero
     */
    public static Map<String, Map<String, Object>> getContent(Map<String, Map<String, Object>> dataMap) {
        // Configure and run the content crawler over all URLs.
        DataPipeline pipeline = new DataPipeline();
        Spider spider = Spider.create(new ContentPageProcessor());
        for (Entry<String, Map<String, Object>> entry : dataMap.entrySet()) {
            spider.addUrl(entry.getKey());
        }
        spider.setDownloader(new MyDownLoader());
        spider.addPipeline(pipeline);
        spider.thread(5).run();

        List<Map<String, Object>> contentList = pipeline.getContentDataList();
        if (contentList == null) {
            // Nothing was collected by the pipeline; return the input unchanged.
            return dataMap;
        }
        for (Map<String, Object> sourceMap : contentList) {
            String url = sourceMap.get("url") + "";
            Map<String, Object> data = dataMap.get(url);
            if (data == null) {
                continue;
            }
            // Take the text from the crawl result. The previous code read it back from the
            // caller's own entry, so the crawled content was never copied over.
            Object content = sourceMap.get("content");
            data.put("content", content != null ? content.toString() : null);
        }
        return dataMap;
    }
}
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
View file @
7a6d49e2
...
@@ -8,8 +8,7 @@ import java.util.Map.Entry;
...
@@ -8,8 +8,7 @@ import java.util.Map.Entry;
import
com.zhiwei.source_forward.crawler.MediaSelfSourcePageProcessor
;
import
com.zhiwei.source_forward.crawler.MediaSelfSourcePageProcessor
;
import
com.zhiwei.source_forward.crawler.SourceForwardPageProcessor
;
import
com.zhiwei.source_forward.crawler.SourceForwardPageProcessor
;
import
com.zhiwei.source_forward.downloader.MyDownLoader
;
import
com.zhiwei.source_forward.downloader.MyDownLoader
;
import
com.zhiwei.source_forward.pipeline.MediaSelfSourceDataPipeline
;
import
com.zhiwei.source_forward.pipeline.DataPipeline
;
import
com.zhiwei.source_forward.pipeline.SourceForwardDataPipeline
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.Spider
;
...
@@ -31,7 +30,7 @@ public class SourceForward {
...
@@ -31,7 +30,7 @@ public class SourceForward {
*/
*/
public
static
Map
<
String
,
Map
<
String
,
Object
>>
getSourceForward
(
Map
<
String
,
Map
<
String
,
Object
>>
dataMap
){
public
static
Map
<
String
,
Map
<
String
,
Object
>>
getSourceForward
(
Map
<
String
,
Map
<
String
,
Object
>>
dataMap
){
//启动验证来源程序
//启动验证来源程序
SourceForwardDataPipeline
pipeline
=
new
SourceForward
DataPipeline
();
DataPipeline
pipeline
=
new
DataPipeline
();
Spider
spider
=
Spider
.
create
(
new
SourceForwardPageProcessor
());
Spider
spider
=
Spider
.
create
(
new
SourceForwardPageProcessor
());
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
spider
.
addUrl
(
entry
.
getKey
());
spider
.
addUrl
(
entry
.
getKey
());
...
@@ -40,7 +39,7 @@ public class SourceForward {
...
@@ -40,7 +39,7 @@ public class SourceForward {
spider
.
addPipeline
(
pipeline
);
spider
.
addPipeline
(
pipeline
);
spider
.
thread
(
5
).
run
();
spider
.
thread
(
5
).
run
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
getDataList
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
get
SourceForward
DataList
();
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
String
url
=
sourceMap
.
get
(
"url"
)+
""
;
String
url
=
sourceMap
.
get
(
"url"
)+
""
;
String
root_source
=
sourceMap
.
get
(
"root_source"
)!=
null
?
sourceMap
.
get
(
"root_source"
).
toString
():
null
;
String
root_source
=
sourceMap
.
get
(
"root_source"
)!=
null
?
sourceMap
.
get
(
"root_source"
).
toString
():
null
;
...
@@ -85,7 +84,7 @@ public class SourceForward {
...
@@ -85,7 +84,7 @@ public class SourceForward {
*/
*/
public
static
Map
<
String
,
Map
<
String
,
Object
>>
getMediaSelfSource
(
Map
<
String
,
Map
<
String
,
Object
>>
dataMap
){
public
static
Map
<
String
,
Map
<
String
,
Object
>>
getMediaSelfSource
(
Map
<
String
,
Map
<
String
,
Object
>>
dataMap
){
//启动验证来源程序
//启动验证来源程序
MediaSelfSourceDataPipeline
pipeline
=
new
MediaSelfSource
DataPipeline
();
DataPipeline
pipeline
=
new
DataPipeline
();
Spider
spider
=
Spider
.
create
(
new
MediaSelfSourcePageProcessor
());
Spider
spider
=
Spider
.
create
(
new
MediaSelfSourcePageProcessor
());
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
spider
.
addUrl
(
entry
.
getKey
());
spider
.
addUrl
(
entry
.
getKey
());
...
@@ -94,7 +93,7 @@ public class SourceForward {
...
@@ -94,7 +93,7 @@ public class SourceForward {
spider
.
addPipeline
(
pipeline
);
spider
.
addPipeline
(
pipeline
);
spider
.
thread
(
5
).
run
();
spider
.
thread
(
5
).
run
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
getDataList
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
get
SourceForward
DataList
();
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
String
url
=
sourceMap
.
get
(
"url"
)+
""
;
String
url
=
sourceMap
.
get
(
"url"
)+
""
;
//整合数据及验证转发原创
//整合数据及验证转发原创
...
@@ -119,7 +118,7 @@ public class SourceForward {
...
@@ -119,7 +118,7 @@ public class SourceForward {
public
static
Map
<
String
,
String
>
getMediaSelfSource
(
List
<
String
>
urlList
){
public
static
Map
<
String
,
String
>
getMediaSelfSource
(
List
<
String
>
urlList
){
//启动验证来源程序
//启动验证来源程序
Map
<
String
,
String
>
dataMap
=
new
HashMap
<
String
,
String
>();
Map
<
String
,
String
>
dataMap
=
new
HashMap
<
String
,
String
>();
MediaSelfSourceDataPipeline
pipeline
=
new
MediaSelfSource
DataPipeline
();
DataPipeline
pipeline
=
new
DataPipeline
();
Spider
spider
=
Spider
.
create
(
new
MediaSelfSourcePageProcessor
());
Spider
spider
=
Spider
.
create
(
new
MediaSelfSourcePageProcessor
());
for
(
String
url
:
urlList
){
for
(
String
url
:
urlList
){
spider
.
addUrl
(
url
);
spider
.
addUrl
(
url
);
...
@@ -129,7 +128,7 @@ public class SourceForward {
...
@@ -129,7 +128,7 @@ public class SourceForward {
spider
.
addPipeline
(
pipeline
);
spider
.
addPipeline
(
pipeline
);
spider
.
thread
(
5
).
run
();
spider
.
thread
(
5
).
run
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
getDataList
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
get
MediaSelf
DataList
();
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
String
url
=
sourceMap
.
get
(
"url"
)+
""
;
String
url
=
sourceMap
.
get
(
"url"
)+
""
;
//整合数据及验证转发原创
//整合数据及验证转发原创
...
@@ -152,14 +151,14 @@ public class SourceForward {
...
@@ -152,14 +151,14 @@ public class SourceForward {
*/
*/
public
static
String
getMediaSelfSource
(
String
url
){
public
static
String
getMediaSelfSource
(
String
url
){
//启动验证来源程序
//启动验证来源程序
MediaSelfSourceDataPipeline
pipeline
=
new
MediaSelfSource
DataPipeline
();
DataPipeline
pipeline
=
new
DataPipeline
();
Spider
spider
=
Spider
.
create
(
new
MediaSelfSourcePageProcessor
());
Spider
spider
=
Spider
.
create
(
new
MediaSelfSourcePageProcessor
());
spider
.
addUrl
(
url
);
spider
.
addUrl
(
url
);
spider
.
setDownloader
(
new
MyDownLoader
());
spider
.
setDownloader
(
new
MyDownLoader
());
spider
.
addPipeline
(
pipeline
);
spider
.
addPipeline
(
pipeline
);
spider
.
thread
(
1
).
run
();
spider
.
thread
(
1
).
run
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
getDataList
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
get
MediaSelf
DataList
();
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
return
sourceMap
.
get
(
"mediaself"
).
toString
();
return
sourceMap
.
get
(
"mediaself"
).
toString
();
}
}
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
7a6d49e2
...
@@ -5,7 +5,7 @@ import java.util.Map;
...
@@ -5,7 +5,7 @@ import java.util.Map;
import
java.util.Map.Entry
;
import
java.util.Map.Entry
;
import
com.zhiwei.source_forward.crawler.UrlLivePageProcessor
;
import
com.zhiwei.source_forward.crawler.UrlLivePageProcessor
;
import
com.zhiwei.source_forward.pipeline.
UrlLive
Pipeline
;
import
com.zhiwei.source_forward.pipeline.
Data
Pipeline
;
import
us.codecraft.webmagic.Spider
;
import
us.codecraft.webmagic.Spider
;
...
@@ -28,7 +28,7 @@ public class URLLive {
...
@@ -28,7 +28,7 @@ public class URLLive {
*/
*/
public
static
Map
<
String
,
Map
<
String
,
Object
>>
verificationURLLive
(
Map
<
String
,
Map
<
String
,
Object
>>
dataMap
){
public
static
Map
<
String
,
Map
<
String
,
Object
>>
verificationURLLive
(
Map
<
String
,
Map
<
String
,
Object
>>
dataMap
){
//启动验证链接是否有效程序程序
//启动验证链接是否有效程序程序
UrlLivePipeline
pipeline
=
new
UrlLive
Pipeline
();
DataPipeline
pipeline
=
new
Data
Pipeline
();
Spider
spider
=
Spider
.
create
(
new
UrlLivePageProcessor
());
Spider
spider
=
Spider
.
create
(
new
UrlLivePageProcessor
());
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
spider
.
addUrl
(
entry
.
getKey
());
spider
.
addUrl
(
entry
.
getKey
());
...
@@ -37,7 +37,7 @@ public class URLLive {
...
@@ -37,7 +37,7 @@ public class URLLive {
spider
.
thread
(
5
).
run
();
spider
.
thread
(
5
).
run
();
//验证数据是否已删除
//验证数据是否已删除
List
<
Map
<
String
,
Object
>>
dataList
=
pipeline
.
get
D
ataList
();
List
<
Map
<
String
,
Object
>>
dataList
=
pipeline
.
get
UrlLived
ataList
();
for
(
Map
<
String
,
Object
>
data
:
dataList
){
for
(
Map
<
String
,
Object
>
data
:
dataList
){
String
url
=
data
.
get
(
"url"
)+
""
;
String
url
=
data
.
get
(
"url"
)+
""
;
if
(!
url
.
contains
(
"http"
)){
if
(!
url
.
contains
(
"http"
)){
...
...
src/main/java/com/zhiwei/source_forward/util/MatchChannel.java
0 → 100644
View file @
7a6d49e2
package
com
.
zhiwei
.
source_forward
.
util
;
import
java.util.List
;
import
org.jsoup.nodes.Node
;
/**
 * Helpers that determine the channel (news category) of an article, either from the page
 * title or from the article URL.
 *
 * @author hero
 */
public class MatchChannel {

    /** Channel returned when no title keyword matches. */
    private static final String DEFAULT_CHANNEL = "新闻";

    /**
     * URL rules in evaluation order. The first element of each row is the channel, the
     * remaining elements are the URL fragments that select it.
     */
    private static final String[][] URL_RULES = {
        {"新闻", "news.", "cj.sina.com.cn", "wemedia.ifeng.com"},
        {"财经", "finance.", "business.", "money.", "stock.", "10jqka.com.cn"},
        {"科技", "tech.", "it.", "pcedu.", "mobile.", "vr."},
        {"体育", "sports."},
        {"娱乐", "ent.", "yule."},
        {"汽车", "auto."},
        {"时尚", "fashion."},
        {"教育", "learning.", "edu."},
        {"母婴", "baobao."},
        {"房产", "house.", "leju.", "focus."},
        {"游戏", "games."},
        {"国际", "intl."},
        {"科学", "science."},
        {"城市", "city."},
        {"市场", "sc."},
    };

    /** Title keywords in evaluation order; a matching keyword is itself the channel name. */
    private static final String[] TITLE_KEYWORDS = {
        "财经", "金融", "经济", "科技", "时尚", "互联网", "数码", "科学",
        "TMT", "通讯", "社会", "IT", "房产", "母婴", "3C",
    };

    /**
     * Derives the channel from the {@code <title>} element found among the given nodes.
     *
     * <p>The title text is split on {@code "_"}; every segment except the first is searched
     * for a channel keyword via {@link #getChannel(String)}. Only the first node containing
     * {@code "<title>"} is examined.
     *
     * @param list the nodes to search (callers pass the child nodes of the document head)
     * @return the matched channel, or {@code "新闻"} when there is no title, no keyword
     *         matches, or any exception occurs
     * @author hero
     */
    public static String matchChannel(List<Node> list) {
        try {
            for (Node node : list) {
                if (!node.outerHtml().contains("<title>")) {
                    continue;
                }
                String[] segments = node.toString().split("<title>")[1].split("</title>")[0].split("_");
                // Re-join everything after the first segment, each followed by "_".
                StringBuilder tail = new StringBuilder();
                for (int i = 1; i < segments.length; i++) {
                    tail.append(segments[i]).append("_");
                }
                return getChannel(tail.toString());
            }
        } catch (Exception e) {
            return DEFAULT_CHANNEL;
        }
        return DEFAULT_CHANNEL;
    }

    /**
     * Determines the channel from fragments contained in the article URL.
     *
     * @param url the article URL
     * @return the channel of the first rule with a fragment contained in {@code url}, or
     *         {@code null} when no rule matches
     * @author hero
     */
    public static String verifyChannel(String url) {
        for (String[] rule : URL_RULES) {
            for (int i = 1; i < rule.length; i++) {
                if (url.contains(rule[i])) {
                    return rule[0];
                }
            }
        }
        return null;
    }

    /**
     * Maps a piece of text to a channel by keyword lookup.
     *
     * @param source the text to search
     * @return the first keyword contained in {@code source}, or {@code "新闻"} when none is
     * @author hero
     */
    public static String getChannel(String source) {
        for (String keyword : TITLE_KEYWORDS) {
            if (source.contains(keyword)) {
                return keyword;
            }
        }
        return DEFAULT_CHANNEL;
    }
}
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
0 → 100644
View file @
7a6d49e2
package
com
.
zhiwei
.
source_forward
.
util
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
cn.edu.hfut.dmic.contentextractor.ContentExtractor
;
import
cn.edu.hfut.dmic.contentextractor.News
;
/**
 * Helper that extracts the article body text from a page's HTML.
 *
 * @author hero
 */
public class MatchContent {

    /**
     * Extracts the article body text from the given HTML.
     *
     * @param url  the page URL (not used by the extraction)
     * @param html the page HTML
     * @return the extracted text, or {@code null} when extraction throws
     * @author hero
     */
    public static String matchContent(String url, String html) {
        // Parsed up front; the document is only read by the fallback path below.
        Document parsed = Jsoup.parse(html);
        try {
            return extractContent(html, parsed);
        } catch (Exception e) {
            return null;
        }
    }

    /**
     * Runs the content extractor on the HTML and, if that fails, falls back to the plain
     * text of the parsed document.
     *
     * @param html     the page HTML
     * @param document the same page parsed with jsoup, used for the fallback
     * @return the filtered article text, or the whole document text on extractor failure
     * @author hero
     */
    private static String extractContent(String html, Document document) {
        try {
            News news = ContentExtractor.getNewsByHtml(html);
            return TreateData.filterSpecialCharacter(news.getContent());
        } catch (Exception e) {
            String fallback = document.text();
            System.out.println("正文抽取失败处理........");
            e.printStackTrace();
            return fallback;
        }
    }
}
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
0 → 100644
View file @
7a6d49e2
This diff is collapsed.
Click to expand it.
src/main/java/com/zhiwei/source_forward/util/TreateData.java
View file @
7a6d49e2
This diff is collapsed.
Click to expand it.
src/test/java/com/zhiwei/source_forward/sourceforward/test/MediaSelfSourceTest.java
0 → 100644
View file @
7a6d49e2
package
com
.
zhiwei
.
source_forward
.
sourceforward
.
test
;
import
java.util.HashMap
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.source_forward.run.SourceForward
;
/**
 * Smoke test that runs the media-self source lookup for a single article URL.
 *
 * @author hero
 */
public class MediaSelfSourceTest {

    /** Calls {@code SourceForward.getMediaSelfSource} with one URL mapped to an empty record. */
    @Test
    public void sourceForwardTest() {
        String url = "https://www.toutiao.com/a6549872248428167687/";
        Map<String, Map<String, Object>> dataMap = new HashMap<String, Map<String, Object>>();
        dataMap.put(url, new HashMap<String, Object>());
        SourceForward.getMediaSelfSource(dataMap);
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment