Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
574cb605
Commit
574cb605
authored
Jun 30, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修复获取正文bug
parent
ded4bfdb
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
61 additions
and
60 deletions
+61
-60
src/main/java/com/zhiwei/source_forward/crawler/ContentPageProcessor.java
+8
-3
src/main/java/com/zhiwei/source_forward/pipeline/DataPipeline.java
+5
-13
src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
+4
-4
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
+8
-4
src/test/java/com/zhiwei/source_forward/sourceforward/test/MediaSelfSourceTest.java
+36
-36
No files found.
src/main/java/com/zhiwei/source_forward/crawler/ContentPageProcessor.java
View file @
574cb605
...
...
@@ -2,6 +2,10 @@ package com.zhiwei.source_forward.crawler;
import
java.util.HashMap
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
...
...
@@ -15,10 +19,11 @@ import us.codecraft.webmagic.processor.PageProcessor;
*/
public
class
ContentPageProcessor
implements
PageProcessor
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
ContentPageProcessor
.
class
);
private
Site
site
=
Site
.
me
().
setCycleRetryTimes
(
3
).
setSleepTime
(
1500
)
.
setTimeOut
(
10000
)
.
setUserAgent
(
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
)
.
addHeader
(
"Accept-Encoding"
,
"
gzip,
deflate, br"
)
.
addHeader
(
"Accept-Encoding"
,
"deflate, br"
)
;
@Override
...
...
@@ -32,14 +37,14 @@ public class ContentPageProcessor implements PageProcessor {
String
content
=
null
;
try
{
if
(
page
.
getStatusCode
()!=
404
){
MatchContent
.
matchContent
(
page
.
getUrl
().
get
(),
page
.
getHtml
().
toString
());
content
=
MatchContent
.
matchContent
(
page
.
getUrl
().
get
(),
page
.
getHtml
().
toString
());
}
}
catch
(
Exception
e
)
{
logger
.
info
(
"网页链接失效"
,
e
.
fillInStackTrace
());
content
=
null
;
}
data
.
put
(
"url"
,
page
.
getUrl
().
get
());
data
.
put
(
"content"
,
content
);
page
.
putField
(
"content"
,
data
);
}
...
...
src/main/java/com/zhiwei/source_forward/pipeline/DataPipeline.java
View file @
574cb605
package
com
.
zhiwei
.
source_forward
.
pipeline
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
...
...
@@ -13,19 +14,10 @@ import us.codecraft.webmagic.pipeline.Pipeline;
* @date 2018年6月30日 上午9:54:27
*/
public
class
DataPipeline
implements
Pipeline
{
private
List
<
Map
<
String
,
Object
>>
contentDataList
;
private
List
<
Map
<
String
,
Object
>>
mediaSelfDataList
;
private
List
<
Map
<
String
,
Object
>>
sourceForwardDataList
;
private
List
<
Map
<
String
,
Object
>>
urlLivedataList
;
public
DataPipeline
(
List
<
Map
<
String
,
Object
>>
dataList
,
List
<
Map
<
String
,
Object
>>
contentDataList
,
List
<
Map
<
String
,
Object
>>
mediaSelfDataList
,
List
<
Map
<
String
,
Object
>>
sourceForwardDataList
,
List
<
Map
<
String
,
Object
>>
urlLivedataList
)
{
super
();
this
.
contentDataList
=
contentDataList
;
this
.
mediaSelfDataList
=
mediaSelfDataList
;
this
.
sourceForwardDataList
=
sourceForwardDataList
;
this
.
urlLivedataList
=
urlLivedataList
;
}
private
List
<
Map
<
String
,
Object
>>
contentDataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
private
List
<
Map
<
String
,
Object
>>
mediaSelfDataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
private
List
<
Map
<
String
,
Object
>>
sourceForwardDataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
private
List
<
Map
<
String
,
Object
>>
urlLivedataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
public
DataPipeline
()
{
...
...
src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
View file @
574cb605
...
...
@@ -31,12 +31,12 @@ public class ContentMatch {
spider
.
thread
(
5
).
run
();
List
<
Map
<
String
,
Object
>>
contentList
=
pipeline
.
getContentDataList
();
for
(
Map
<
String
,
Object
>
source
Map
:
contentList
){
String
url
=
source
Map
.
get
(
"url"
)+
""
;
//
整合数据及验证转发原创
for
(
Map
<
String
,
Object
>
content
Map
:
contentList
){
String
url
=
content
Map
.
get
(
"url"
)+
""
;
//
搜集原文
if
(
dataMap
.
containsKey
(
url
)){
Map
<
String
,
Object
>
data
=
dataMap
.
get
(
url
);
String
content
=
data
.
get
(
"content"
)+
""
;
String
content
=
contentMap
.
get
(
"content"
)+
""
;
data
.
put
(
"content"
,
content
);
dataMap
.
put
(
url
,
data
);
}
...
...
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
View file @
574cb605
...
...
@@ -2,6 +2,9 @@ package com.zhiwei.source_forward.util;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
cn.edu.hfut.dmic.contentextractor.ContentExtractor
;
import
cn.edu.hfut.dmic.contentextractor.News
;
...
...
@@ -13,7 +16,7 @@ import cn.edu.hfut.dmic.contentextractor.News;
*/
public
class
MatchContent
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
MatchContent
.
class
);
/**
* @Title: matchContent
* @author hero
...
...
@@ -25,10 +28,12 @@ public class MatchContent {
*/
public
static
String
matchContent
(
String
url
,
String
html
)
{
String
content
=
null
;
Document
document
=
Jsoup
.
parse
(
html
);
try
{
Document
document
=
Jsoup
.
parse
(
html
);
content
=
mathchContent
(
html
,
document
);
return
content
;
}
catch
(
Exception
e
)
{
logger
.
debug
(
"获取全文失败"
,
e
.
fillInStackTrace
());
content
=
null
;
}
return
content
;
...
...
@@ -51,9 +56,8 @@ public class MatchContent {
News
news
=
ContentExtractor
.
getNewsByHtml
(
html
);
content
=
TreateData
.
filterSpecialCharacter
(
news
.
getContent
());
}
catch
(
Exception
e
)
{
logger
.
info
(
"正文抽取失败,获取全文文本:{}"
);
content
=
document
.
text
();
System
.
out
.
println
(
"正文抽取失败处理........"
);
e
.
printStackTrace
();
}
return
content
;
}
...
...
src/test/java/com/zhiwei/source_forward/sourceforward/test/MediaSelfSourceTest.java
View file @
574cb605
package
com
.
zhiwei
.
source_forward
.
sourceforward
.
test
;
import
java.util.HashMap
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.source_forward.run.SourceForward
;
/**
* @ClassName: SourceForwardTest
* @Description: 来源验证
* @author hero
* @date 2017年12月6日 上午9:55:13
*/
public
class
MediaSelfSourceTest
{
@Test
public
void
sourceForwardTest
(){
Map
<
String
,
Map
<
String
,
Object
>>
dataMap
=
new
HashMap
<
String
,
Map
<
String
,
Object
>>();
String
url
=
"https://www.toutiao.com/a6549872248428167687/"
;
Map
<
String
,
Object
>
data
=
new
HashMap
<
String
,
Object
>();
dataMap
.
put
(
url
,
data
);
SourceForward
.
getMediaSelfSource
(
dataMap
);
}
}
//
package com.zhiwei.source_forward.sourceforward.test;
//
//
import java.util.HashMap;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.source_forward.run.SourceForward;
//
/
//
**
//
* @ClassName: SourceForwardTest
//
* @Description: 来源验证
//
* @author hero
//
* @date 2017年12月6日 上午9:55:13
//
*/
//
public class MediaSelfSourceTest {
//
//
@Test
//
public void sourceForwardTest(){
//
Map<String,Map<String,Object>> dataMap = new HashMap<String,Map<String,Object>>();
//
String url = "https://www.toutiao.com/a6549872248428167687/";
//
Map<String,Object> data = new HashMap<String,Object>();
//
dataMap.put(url, data);
//
//
SourceForward.getMediaSelfSource(dataMap);
//
//
}
//
//
//
//
//
//
//
//
//
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment