Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
cd456869
Commit
cd456869
authored
Jan 12, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
1.来源转发匹配精确化搜狐及新浪
2.来源转发匹配将东方头条、今日爆点、千寻生活、触电新闻自媒体匹配为原创 3.添加自媒体号媒体:今日头条、搜狐、东方头条、今日爆点、财经头条、百家号
parent
6ce658e0
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
189 additions
and
16 deletions
+189
-16
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourcePageProcessor.java
+46
-0
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardPageProcessor.java
+2
-3
src/main/java/com/zhiwei/source_forward/pipeline/MediaSelfSourceDataPipeline.java
+40
-0
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
+33
-2
src/main/java/com/zhiwei/source_forward/util/TreateData.java
+68
-11
No files found.
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourcePageProcessor.java
0 → 100644
View file @
cd456869
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.TreateData
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.processor.PageProcessor
;
/**
 * WebMagic {@link PageProcessor} that looks up the self-media account for each downloaded page.
 *
 * <p>For every page it stores a map under the result field {@code "data"} with two entries:
 * {@code "url"} (the page URL) and {@code "mediaself"} (the value returned by
 * {@link TreateData#matchMediaSelfSource(String, String)}, or {@code null} when the page
 * answered 404 or the lookup threw).
 */
public class MediaSelfSourcePageProcessor implements PageProcessor {

    /** Crawl settings: 3 cycle retries, 1500 ms sleep, 10000 ms timeout, browser-like headers. */
    private Site site = Site.me()
            .setCycleRetryTimes(3)
            .setSleepTime(1500)
            .setTimeOut(10000)
            .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Resolves the self-media account of {@code page} and publishes it as the {@code "data"} field.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        String mediaSelf = null;
        try {
            // Pages answering 404 are not inspected; their account stays null.
            if (page.getStatusCode() != 404) {
                mediaSelf = TreateData.matchMediaSelfSource(page.getUrl().get(), page.getHtml().toString());
            }
        } catch (Exception e) {
            // Best effort: a page that cannot be analysed is reported without an account.
            mediaSelf = null;
        }

        String pageUrl = page.getUrl().get();
        System.out.println(pageUrl + "=================" + mediaSelf);

        Map<String, String> result = new HashMap<String, String>();
        result.put("url", pageUrl);
        result.put("mediaself", mediaSelf);
        page.putField("data", result);
    }
}
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardPageProcessor.java
View file @
cd456869
...
...
@@ -36,9 +36,8 @@ public class SourceForwardPageProcessor implements PageProcessor {
if
(
page
.
getStatusCode
()!=
404
){
channel
=
verifyChannel
(
page
.
getUrl
().
get
());
if
(
channel
==
null
){
List
<
Node
>
nodeList
=
page
.
getHtml
().
getDocument
().
head
().
childNodes
();
channel
=
TreateData
.
matchChannel
(
nodeList
);
List
<
Node
>
nodeList
=
page
.
getHtml
().
getDocument
().
head
().
childNodes
();
channel
=
TreateData
.
matchChannel
(
nodeList
);
}
source
=
TreateData
.
matchSource
(
page
.
getUrl
().
get
(),
page
.
getHtml
().
toString
(),
sourceList
);
}
...
...
src/main/java/com/zhiwei/source_forward/pipeline/MediaSelfSourceDataPipeline.java
0 → 100644
View file @
cd456869
package
com
.
zhiwei
.
source_forward
.
pipeline
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
/**
 * Pipeline that collects the {@code "data"} result field of every processed page into a list.
 *
 * <p>All access to the backing list goes through synchronized methods. A spider configured with
 * more than one worker thread may call {@link #process(ResultItems, Task)} concurrently, and the
 * default backing list is a plain {@link ArrayList}, which is not safe for concurrent appends.
 * Callers that pass in their own list and mutate it elsewhere while a crawl is running must
 * provide their own coordination.
 */
public class MediaSelfSourceDataPipeline implements Pipeline {

    /** Collected per-page result maps, in the order the pipeline received them. */
    private List<Map<String, Object>> dataList;

    /**
     * Creates a pipeline that appends to the supplied list.
     *
     * @param dataList list that receives one map per processed page
     */
    public MediaSelfSourceDataPipeline(List<Map<String, Object>> dataList) {
        super();
        this.dataList = dataList;
    }

    /** Creates a pipeline backed by a new, empty list. */
    public MediaSelfSourceDataPipeline() {
        super();
        this.dataList = new ArrayList<>();
    }

    /**
     * @return the live backing list (not a copy)
     */
    public synchronized List<Map<String, Object>> getDataList() {
        return dataList;
    }

    /**
     * Replaces the backing list.
     *
     * @param dataList list that receives subsequent results
     */
    public synchronized void setDataList(List<Map<String, Object>> dataList) {
        this.dataList = dataList;
    }

    /**
     * Appends the {@code "data"} field of {@code resultItems} to the list; pages without that
     * field are ignored.
     *
     * @param resultItems results extracted from one page
     * @param task        the crawl task the page belongs to (unused)
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        Map<String, Object> data = resultItems.get("data");
        if (data != null) {
            dataList.add(data);
        }
    }
}
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
View file @
cd456869
...
...
@@ -4,8 +4,10 @@ import java.util.List;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
com.zhiwei.source_forward.crawler.MediaSelfSourcePageProcessor
;
import
com.zhiwei.source_forward.crawler.SourceForwardPageProcessor
;
import
com.zhiwei.source_forward.downloader.MyDownLoader
;
import
com.zhiwei.source_forward.pipeline.MediaSelfSourceDataPipeline
;
import
com.zhiwei.source_forward.pipeline.SourceForwardDataPipeline
;
import
us.codecraft.webmagic.Spider
;
...
...
@@ -66,8 +68,37 @@ public class SourceForward {
/**
 * Crawls every URL key of {@code dataMap} and records the matched self-media account name
 * on the corresponding entry.
 *
 * <p>The crawl uses {@link MediaSelfSourcePageProcessor} with {@link MyDownLoader} on 5 threads
 * and blocks until the spider finishes. For each crawled URL that is a key of {@code dataMap},
 * the value found under {@code "mediaself"} is stored in that entry's map under the key
 * {@code "自媒体号"}.
 *
 * @param dataMap records keyed by article URL; modified in place
 * @return the same {@code dataMap} instance that was passed in
 */
public static Map<String, Map<String, Object>> getMediaSelfSource(Map<String, Map<String, Object>> dataMap) {
    // Start the crawl that resolves the account for each URL.
    MediaSelfSourceDataPipeline collector = new MediaSelfSourceDataPipeline();
    Spider crawler = Spider.create(new MediaSelfSourcePageProcessor());
    for (String seedUrl : dataMap.keySet()) {
        crawler.addUrl(seedUrl);
    }
    crawler.setDownloader(new MyDownLoader());
    crawler.addPipeline(collector);
    crawler.thread(5).run();

    // Merge the crawl results back into the caller's records.
    List<Map<String, Object>> crawledPages = collector.getDataList();
    for (Map<String, Object> crawled : crawledPages) {
        String url = String.valueOf(crawled.get("url"));
        if (!dataMap.containsKey(url)) {
            continue;
        }
        Map<String, Object> record = dataMap.get(url);
        record.put("自媒体号", crawled.get("mediaself"));
        dataMap.put(url, record);
    }
    return dataMap;
}
...
...
src/main/java/com/zhiwei/source_forward/util/TreateData.java
View file @
cd456869
...
...
@@ -70,15 +70,24 @@ public class TreateData {
}
else
if
(
url
.
contains
(
"myzaker.com"
)){
//单独处理扎克网数据
source
=
document
.
select
(
"div#article"
).
select
(
"span.auther"
).
text
();
}
else
if
(
url
.
contains
(
"tech.sina.com.cn"
)){
//单独处理新浪网-科技频道数据
source
=
document
.
select
(
"span.source"
).
text
();
}
else
if
(
url
.
contains
(
"finance.sina.com.cn"
)
||
url
.
contains
(
"news.sina.com.cn"
)){
//单独处理新浪网-财经及新闻数据
source
=
document
.
select
(
"div.page-info"
).
select
(
"span[data-sudaclick=\"media_name\"]"
).
text
();
}
else
if
(
url
.
contains
(
"ent.sina.com.cn"
)){
//单独处理新浪网-娱乐
source
=
document
.
select
(
"div#top_bar"
).
select
(
"div.date-source"
).
select
(
"a"
).
text
();
}
else
if
(
url
.
contains
(
"sina.com.cn"
)
||
url
.
contains
(
"sohu.com"
)){
//单独处理新浪网
if
(
html
.
contains
(
"<meta name=\"mediaid\""
)){
source
=
html
.
split
(
"<meta name=\"mediaid\" content=\""
)[
1
].
split
(
"\""
)[
0
];
}
}
else
if
(
url
.
contains
(
"a.mini.eastday.com"
)){
//处理东方头条网-自媒体号匹配
// source = document.select("[class=\"share_cnt_p clearfix\"]").select("div.fl").select("i").get(1).text();
source
=
"东方头条"
;
}
else
if
(
url
.
contains
(
"orz520.com"
)){
//千寻生活网解析
source
=
"千寻生活"
;
}
else
if
(
url
.
contains
(
"sh.qihoo.com"
)){
//今日报点解析
source
=
"今日爆点"
;
}
else
if
(
url
.
contains
(
"itouchtv.cn"
)){
//触电新闻解析
source
=
"触电新闻"
;
}
else
{
//其他网站处理
source
=
mathchOtherSource
(
html
,
htmlBody
,
sourceList
);
...
...
@@ -92,11 +101,60 @@ public class TreateData {
}
}
}
catch
(
Exception
e
)
{
System
.
out
.
println
(
"+++++++++++++++++"
);
e
.
printStackTrace
();
}
return
null
;
}
/**
 * Identifies the self-media account that published the page at {@code url}.
 *
 * <p>The platform is chosen by substring match on the URL, checked in this order:
 * {@code toutiao.com}, {@code sohu.com}, {@code a.mini.eastday.com}, {@code sh.qihoo.com},
 * {@code cj.sina.com.cn}, {@code baijia.baidu.com}. The account name is then read from the
 * page, either from a marker in the raw HTML or from an element located with jsoup.
 *
 * @param url  address of the crawled page
 * @param html raw HTML of that page
 * @return the platform label followed by the account name (for example {@code "搜狐-" + name}),
 *         or {@code null} when either argument is null, the URL matches no supported platform,
 *         the account name is missing or blank, or extraction fails
 */
public static String matchMediaSelfSource(String url, String html) {
    if (url == null || html == null) {
        return null;
    }
    try {
        String platformLabel = null;
        String account = null;
        /*** Site-specific handling ***/
        if (url.contains("toutiao.com")) {
            // Toutiao: the account is embedded in an inline script as " source: '<name>',".
            platformLabel = "今日头条-";
            if (html.contains(" source: '")) {
                account = html.split(" source: '")[1].split("',")[0];
            }
        } else if (url.contains("sohu.com")) {
            // Sohu self-media account
            platformLabel = "搜狐-";
            account = readMediaIdMeta(html);
        } else if (url.contains("a.mini.eastday.com")) {
            // Eastday Toutiao: second <i> inside the share bar; a missing element throws and yields null.
            platformLabel = "东方头条-";
            account = Jsoup.parse(html).select("[class=\"share_cnt_p clearfix\"]").select("div.fl")
                    .select("i").get(1).text();
        } else if (url.contains("sh.qihoo.com")) {
            // Jinri Baodian (sh.qihoo.com)
            platformLabel = "今日爆点-";
            account = Jsoup.parse(html).select("p.info").select("span.source").text();
        } else if (url.contains("cj.sina.com.cn")) {
            // Sina Finance headline account
            platformLabel = "财经头条-";
            account = readMediaIdMeta(html);
        } else if (url.contains("baijia.baidu.com")) {
            // Baidu Baijia
            platformLabel = "百家号-";
            account = Jsoup.parse(html).select("section.info").select("span.author").text();
        }
        // A label without a name carries no information; report "not found" instead.
        if (platformLabel == null || account == null || account.trim().isEmpty()) {
            return null;
        }
        return platformLabel + account;
    } catch (Exception ignored) {
        // Best effort: pages whose markup does not match the expected layout yield no account.
        return null;
    }
}

/** Returns the content of the page's {@code <meta name="mediaid" content="...">} tag, or null if the tag is absent. */
private static String readMediaIdMeta(String html) {
    if (!html.contains("<meta name=\"mediaid\"")) {
        return null;
    }
    return html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0];
}
/**
* @Title: matchChannel
...
...
@@ -154,7 +212,6 @@ public class TreateData {
/**分割正文**/
String
[]
matchTextArr
=
text
.
split
(
"@@@@@@@@@@"
);
if
(
regex
(
fromRegex
,
matchTextArr
[
0
])
!=
null
||
regex
(
fromRegex
,
matchTextArr
[
1
])!=
null
){
if
(
regex
(
fromRegex
,
matchTextArr
[
0
])!=
null
){
source
=
regex
(
fromRegex
,
matchTextArr
[
0
]);
for
(
String
sourceMatch
:
sourceList
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment