Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
19bb2414
Commit
19bb2414
authored
Aug 22, 2018
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改ok初版提交
parent
76581f38
Show whitespace changes
Inline
Side-by-side
Showing
32 changed files
with
1787 additions
and
1562 deletions
+1787
-1562
pom.xml
+21
-31
src/main/java/com/zhiwei/source_forward/bean/ContentBean.java
+78
-0
src/main/java/com/zhiwei/source_forward/bean/MediaSelfSourceBean.java
+92
-0
src/main/java/com/zhiwei/source_forward/bean/SourceForwardBean.java
+102
-0
src/main/java/com/zhiwei/source_forward/bean/UrlLiveBean.java
+78
-0
src/main/java/com/zhiwei/source_forward/content/ContentExtractor.java
+471
-0
src/main/java/com/zhiwei/source_forward/content/News.java
+71
-0
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+121
-0
src/main/java/com/zhiwei/source_forward/crawler/ContentPageProcessor.java
+0
-51
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+143
-0
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourcePageProcessor.java
+0
-56
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+113
-0
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardPageProcessor.java
+0
-70
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+108
-46
src/main/java/com/zhiwei/source_forward/downloader/MyDownLoader.java
+0
-267
src/main/java/com/zhiwei/source_forward/pipeline/DataPipeline.java
+0
-91
src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
+60
-34
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
+58
-0
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
+47
-139
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+41
-15
src/main/java/com/zhiwei/source_forward/spider/MySpider.java
+0
-714
src/main/java/com/zhiwei/source_forward/util/ContentDataCallback.java
+10
-0
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
+2
-2
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+2
-2
src/main/java/com/zhiwei/source_forward/util/MediaSelfSourceDataCallBack.java
+10
-0
src/main/java/com/zhiwei/source_forward/util/ProxyClientUtil.java
+37
-0
src/main/java/com/zhiwei/source_forward/util/SourceData.java
+3
-2
src/main/java/com/zhiwei/source_forward/util/SourceForwardDataCallBack.java
+17
-0
src/main/java/com/zhiwei/source_forward/util/TreateData.java
+0
-0
src/main/java/com/zhiwei/source_forward/util/UrlLiveDataCallback.java
+28
-0
src/main/resources/log4j2.xml
+32
-0
src/test/java/com/zhiwei/source_forward/sourceforward/test/URLLiveTest.java
+42
-42
No files found.
pom.xml
View file @
19bb2414
<project
xmlns=
"http://maven.apache.org/POM/4.0.0"
xmlns:xsi=
"http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation=
"http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
>
<project
xmlns=
"http://maven.apache.org/POM/4.0.0"
xmlns:xsi=
"http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation=
"http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source_
forward
</artifactId>
<version>
0.0.2
-SNAPSHOT
</version>
<name>
source_
forward
</name>
<artifactId>
source-
forward
</artifactId>
<version>
0.0.3
-SNAPSHOT
</version>
<name>
source-
forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
<properties>
...
...
@@ -21,35 +22,14 @@
<dependencies>
<dependency>
<groupId>
cn.edu.hfut.dmic.webcollector
</groupId>
<artifactId>
WebCollector
</artifactId>
<version>
2.71
</version>
</dependency>
<dependency>
<groupId>
us.codecraft
</groupId>
<artifactId>
webmagic-core
</artifactId>
<version>
0.6.1
</version>
</dependency>
<dependency>
<groupId>
us.codecraft
</groupId>
<artifactId>
webmagic-extension
</artifactId>
<version>
0.6.1
</version>
<exclusions>
<exclusion>
<groupId>
org.slf4j
</groupId>
<artifactId>
slf4j-log4j12
</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>
us.codecraft
</groupId>
<artifactId>
webmagic-saxon
</artifactId>
<version>
0.6.1
</version>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.0.2-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei
</groupId>
<artifactId>
zhiweiTools
</artifactId>
<version>
0.0.6-SNAPSHOT
</version>
<groupId>
com.zhiwei.middleware
</groupId>
<artifactId>
proxy-client
</artifactId>
<version>
0.0.1-RELEASE
</version>
</dependency>
</dependencies>
...
...
@@ -109,4 +89,13 @@
<dependencyManagement>
<dependencies>
<dependency>
<groupId>
com.squareup.okhttp3
</groupId>
<artifactId>
okhttp
</artifactId>
<version>
3.11.0
</version>
</dependency>
</dependencies>
</dependencyManagement>
</project>
\ No newline at end of file
src/main/java/com/zhiwei/source_forward/bean/ContentBean.java
0 → 100644
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
bean
;
/**
 * Holds a URL together with the content string obtained for it.
 */
public class ContentBean {

    private String url;
    private String content;

    /** Creates an empty bean; both fields start out {@code null}. */
    public ContentBean() {
        super();
    }

    /**
     * Creates a fully populated bean.
     *
     * @param url     the page URL
     * @param content the content string associated with {@code url}
     */
    public ContentBean(String url, String content) {
        super();
        this.url = url;
        this.content = content;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    @Override
    public String toString() {
        return new StringBuilder("ContentBean [url=")
                .append(url)
                .append(", content=")
                .append(content)
                .append("]")
                .toString();
    }

    /**
     * Opaque wrapper around an arbitrary attribute object.
     *
     * @author 0xff
     * @date 2018-07-03 17:53:22
     */
    public static class Attribution {

        private Object attr;

        /**
         * Instances are created through {@link #of(Object)} only.
         *
         * @param attr the value to wrap
         */
        private Attribution(Object attr) {
            this.attr = attr;
        }

        /**
         * Wraps the given value.
         *
         * @param attr the value to wrap (may be {@code null})
         * @return a new wrapper holding {@code attr}
         */
        public static Attribution of(Object attr) {
            return new Attribution(attr);
        }

        /**
         * Returns the wrapped value.
         *
         * @return the object passed to {@link #of(Object)}
         */
        public Object get() {
            return attr;
        }
    }
}
src/main/java/com/zhiwei/source_forward/bean/MediaSelfSourceBean.java
0 → 100644
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
bean
;
/**
 * Holds a URL together with the source and channel strings resolved for it.
 */
public class MediaSelfSourceBean {

    private String url;
    private String source;
    private String channel;

    /** Creates an empty bean; every field starts out {@code null}. */
    public MediaSelfSourceBean() {
        super();
    }

    /**
     * Creates a fully populated bean.
     *
     * @param url     the page URL
     * @param source  the source string for the page
     * @param channel the channel string for the page
     */
    public MediaSelfSourceBean(String url, String source, String channel) {
        super();
        this.url = url;
        this.source = source;
        this.channel = channel;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public String getChannel() {
        return channel;
    }

    public void setChannel(String channel) {
        this.channel = channel;
    }

    @Override
    public String toString() {
        return new StringBuilder("MediaSelfSourceBean [url=")
                .append(url)
                .append(", source=")
                .append(source)
                .append(", channel=")
                .append(channel)
                .append("]")
                .toString();
    }

    /**
     * Opaque wrapper around an arbitrary attribute object.
     *
     * @author 0xff
     * @date 2018-07-03 17:53:22
     */
    public static class Attribution {

        private Object attr;

        /**
         * Instances are created through {@link #of(Object)} only.
         *
         * @param attr the value to wrap
         */
        private Attribution(Object attr) {
            this.attr = attr;
        }

        /**
         * Wraps the given value.
         *
         * @param attr the value to wrap (may be {@code null})
         * @return a new wrapper holding {@code attr}
         */
        public static Attribution of(Object attr) {
            return new Attribution(attr);
        }

        /**
         * Returns the wrapped value.
         *
         * @return the object passed to {@link #of(Object)}
         */
        public Object get() {
            return attr;
        }
    }
}
src/main/java/com/zhiwei/source_forward/bean/SourceForwardBean.java
0 → 100644
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
bean
;
/**
 * Holds a URL together with its channel, root source and forward flag.
 */
public class SourceForwardBean {

    private String url;
    private String channel;
    private String root_source;
    private String isforward;

    /** Creates an empty bean; every field starts out {@code null}. */
    public SourceForwardBean() {
        super();
    }

    /**
     * Creates a fully populated bean.
     *
     * @param url         the page URL
     * @param channel     the channel string for the page
     * @param root_source the root-source string for the page
     * @param isforward   the forward flag, stored as given
     */
    public SourceForwardBean(String url, String channel, String root_source, String isforward) {
        super();
        this.url = url;
        this.channel = channel;
        this.root_source = root_source;
        this.isforward = isforward;
    }

    public String getIsforward() {
        return isforward;
    }

    public void setIsforward(String isforward) {
        this.isforward = isforward;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getChannel() {
        return channel;
    }

    public void setChannel(String channel) {
        this.channel = channel;
    }

    public String getRoot_source() {
        return root_source;
    }

    public void setRoot_source(String root_source) {
        this.root_source = root_source;
    }

    /**
     * Renders every field of the bean. The forward flag is included so that a
     * logged bean shows its complete state.
     */
    @Override
    public String toString() {
        return "SourceForwardBean [url=" + url
                + ", channel=" + channel
                + ", root_source=" + root_source
                + ", isforward=" + isforward
                + "]";
    }

    /**
     * Opaque wrapper around an arbitrary attribute object.
     *
     * @author 0xff
     * @date 2018-07-03 17:53:22
     */
    public static class Attribution {

        private Object attr;

        /**
         * Instances are created through {@link #of(Object)} only.
         *
         * @param attr the value to wrap
         */
        private Attribution(Object attr) {
            this.attr = attr;
        }

        /**
         * Wraps the given value.
         *
         * @param attr the value to wrap (may be {@code null})
         * @return a new wrapper holding {@code attr}
         */
        public static Attribution of(Object attr) {
            return new Attribution(attr);
        }

        /**
         * Returns the wrapped value.
         *
         * @return the object passed to {@link #of(Object)}
         */
        public Object get() {
            return attr;
        }
    }
}
src/main/java/com/zhiwei/source_forward/bean/UrlLiveBean.java
0 → 100644
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
bean
;
/**
 * Holds a URL together with a flag recording whether it is live.
 */
public class UrlLiveBean {

    private String url;
    private boolean isLive;

    /** Creates an empty bean: {@code url} is {@code null}, the flag is {@code false}. */
    public UrlLiveBean() {
        super();
    }

    /**
     * Creates a fully populated bean.
     *
     * @param url    the URL
     * @param isLive the liveness flag to store
     */
    public UrlLiveBean(String url, boolean isLive) {
        super();
        this.url = url;
        this.isLive = isLive;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public boolean isLive() {
        return isLive;
    }

    public void setLive(boolean isLive) {
        this.isLive = isLive;
    }

    @Override
    public String toString() {
        return new StringBuilder("UrlLiveBean [url=")
                .append(url)
                .append(", isLive=")
                .append(isLive)
                .append("]")
                .toString();
    }

    /**
     * Opaque wrapper around an arbitrary attribute object.
     *
     * @author 0xff
     * @date 2018-07-03 17:53:22
     */
    public static class Attribution {

        private Object attr;

        /**
         * Instances are created through {@link #of(Object)} only.
         *
         * @param attr the value to wrap
         */
        private Attribution(Object attr) {
            this.attr = attr;
        }

        /**
         * Wraps the given value.
         *
         * @param attr the value to wrap (may be {@code null})
         * @return a new wrapper holding {@code attr}
         */
        public static Attribution of(Object attr) {
            return new Attribution(attr);
        }

        /**
         * Returns the wrapped value.
         *
         * @return the object passed to {@link #of(Object)}
         */
        public Object get() {
            return attr;
        }
    }
}
src/main/java/com/zhiwei/source_forward/content/ContentExtractor.java
0 → 100644
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
content
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.Map
;
import
java.util.concurrent.atomic.AtomicInteger
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.nodes.Node
;
import
org.jsoup.nodes.TextNode
;
import
org.jsoup.select.Elements
;
import
org.jsoup.select.NodeVisitor
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
/**
* ContentExtractor could extract content,title,time from news webpage
*
* @author hu
*/
public
class
ContentExtractor
{
/** Logger shared by all extractor instances. */
public static final Logger LOG = LoggerFactory.getLogger(ContentExtractor.class);

/** Statistics recorded for each element visited by {@code computeInfo}. */
protected HashMap<Element, CountInfo> infoMap = new HashMap<>();

/** The document this extractor works on. */
protected Document doc;

/**
 * Creates an extractor bound to one parsed document.
 *
 * @param doc the document to analyse
 */
ContentExtractor(Document doc) {
    this.doc = doc;
}
/**
 * Mutable tally of text and tag statistics for one node and its subtree.
 * All numeric fields rely on Java's zero default.
 */
class CountInfo {
    /** Number of text characters in the subtree. */
    int textCount;
    /** Number of text characters that sit inside links. */
    int linkTextCount;
    /** Number of elements in the subtree. */
    int tagCount;
    /** Number of link elements in the subtree. */
    int linkTagCount;
    /** Number of {@code p} elements in the subtree. */
    int pCount;
    /** Text density of this node. */
    double density;
    /** Sum of the densities of the direct children. */
    double densitySum;
    double score;
    /** Text length of every text leaf in the subtree. */
    ArrayList<Integer> leafList = new ArrayList<>();
}
/**
 * Strips the elements that never carry article text (scripts, styles,
 * iframes and line breaks) from the document.
 */
protected void clean() {
    Elements noise = doc.select("script,noscript,style,iframe,br");
    noise.remove();
}
/**
 * Recursively gathers text and tag statistics for {@code node}.
 * <p>
 * A text node yields its text length. An element aggregates the statistics of
 * all its children, counts itself, and is stored in {@link #infoMap}. Any
 * other node yields an empty tally.
 *
 * @param node the node to analyse
 * @return the statistics for {@code node} and everything below it
 */
protected CountInfo computeInfo(Node node) {
    CountInfo stats = new CountInfo();

    if (node instanceof TextNode) {
        int length = ((TextNode) node).text().length();
        stats.textCount = length;
        stats.leafList.add(length);
        return stats;
    }
    if (!(node instanceof Element)) {
        return stats;
    }

    Element element = (Element) node;
    for (Node child : element.childNodes()) {
        CountInfo sub = computeInfo(child);
        stats.textCount += sub.textCount;
        stats.linkTextCount += sub.linkTextCount;
        stats.tagCount += sub.tagCount;
        stats.linkTagCount += sub.linkTagCount;
        stats.leafList.addAll(sub.leafList);
        stats.densitySum += sub.density;
        stats.pCount += sub.pCount;
    }

    // The element counts itself after its children have been summed up.
    stats.tagCount++;
    String name = element.tagName();
    if ("a".equals(name)) {
        // Everything inside a link counts as link text.
        stats.linkTextCount = stats.textCount;
        stats.linkTagCount++;
    } else if ("p".equals(name)) {
        stats.pCount++;
    }

    // Density = non-link characters per non-link tag; zero when either is zero.
    int plainChars = stats.textCount - stats.linkTextCount;
    int plainTags = stats.tagCount - stats.linkTagCount;
    stats.density = (plainChars == 0 || plainTags == 0) ? 0 : (plainChars + 0.0) / plainTags;

    infoMap.put(element, stats);
    return stats;
}
/**
 * Scores an element as a candidate for the main content block.
 * <p>
 * The score is the product of four factors: the log of the leaf-length
 * spread, the summed child density, the log of the non-link text length, and
 * the log10 of the paragraph count.
 *
 * @param tag an element already recorded in {@link #infoMap}
 * @return the candidate score
 */
protected double computeScore(Element tag) {
    CountInfo stats = infoMap.get(tag);
    double spreadFactor = Math.log(Math.sqrt(computeVar(stats.leafList) + 1));
    double textFactor = Math.log(stats.textCount - stats.linkTextCount + 1);
    double paragraphFactor = Math.log10(stats.pCount + 2);
    // Multiplication order is kept as-is so the floating-point result is unchanged.
    return spreadFactor * stats.densitySum * textFactor * paragraphFactor;
}
/**
 * Computes the population variance of the given values.
 * <p>
 * Special cases: an empty list yields {@code 0}; a single value yields half
 * of that value using integer division.
 *
 * @param data the values
 * @return the variance, or the special-case value described above
 */
protected double computeVar(ArrayList<Integer> data) {
    final int n = data.size();
    if (n == 0) {
        return 0;
    }
    if (n == 1) {
        // NOTE(review): int / int truncates (5 -> 2.0); kept because scores depend on it.
        return data.get(0) / 2;
    }

    double total = 0;
    for (int k = 0; k < n; k++) {
        total += data.get(k);
    }
    double mean = total / n;

    double squaredDeviation = 0;
    for (int k = 0; k < n; k++) {
        double delta = data.get(k) - mean;
        squaredDeviation += delta * delta;
    }
    return squaredDeviation / n;
}
/**
 * Finds the element most likely to hold the main content.
 * <p>
 * The document is cleaned, statistics are computed from the body down, and
 * the highest-scoring element wins. Links and the body itself are never
 * candidates.
 *
 * @return the best-scoring element
 * @throws Exception if no element scores above zero
 */
public Element getContentElement() throws Exception {
    clean();
    computeInfo(doc.body());

    Element best = null;
    double bestScore = 0;
    for (Element candidate : infoMap.keySet()) {
        boolean excluded = candidate.tagName().equals("a") || candidate == doc.body();
        if (!excluded) {
            double candidateScore = computeScore(candidate);
            if (candidateScore > bestScore) {
                bestScore = candidateScore;
                best = candidate;
            }
        }
    }

    if (best == null) {
        throw new Exception("extraction failed");
    }
    return best;
}
/**
 * Extracts the content element, URL, time and title of the document.
 * <p>
 * Content extraction is mandatory: if it fails the method aborts. Time and
 * title are best-effort: a failure is logged and the field stays {@code null}.
 *
 * @return the populated {@link News}
 * @throws Exception if the content element cannot be found
 */
public News getNews() throws Exception {
    News news = new News();

    Element contentElement;
    try {
        contentElement = getContentElement();
        news.setContentElement(contentElement);
    } catch (Exception ex) {
        LOG.info("news content extraction failed,extraction abort", ex);
        throw new Exception(ex);
    }

    if (doc.baseUri() != null) {
        news.setUrl(doc.baseUri());
    }

    try {
        news.setTime(getTime(contentElement));
    } catch (Exception ex) {
        // This branch handles the time, not the title; the message now says so.
        LOG.info("news time extraction failed", ex);
    }

    try {
        news.setTitle(getTitle(contentElement));
    } catch (Exception ex) {
        LOG.info("title extraction failed", ex);
    }

    return news;
}
/**
 * Matches year, month, day, hour, minute and second, each separated by one to
 * five non-digit characters. The hour group accepts any one- or two-digit
 * value starting with 0-2, so hours such as 00, 10 and 20 match.
 */
private static final Pattern TIME_PATTERN = Pattern.compile(
        "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})");

/**
 * Looks for a publication timestamp near the content element.
 * <p>
 * The search starts up to two ancestors above {@code contentElement} and then
 * widens one ancestor at a time, for at most six scans and never beyond the
 * document body. If no full timestamp is found, {@link #getDate(Element)} is
 * tried as a fallback.
 *
 * @param contentElement the element holding the main content
 * @return the timestamp as {@code year-month-day hour:minute:second}, or the
 *         date-only result of {@link #getDate(Element)}
 * @throws Exception if neither a timestamp nor a date can be found
 */
protected String getTime(Element contentElement) throws Exception {
    final Element body = doc.body();

    // Start the search slightly above the content element.
    Element current = contentElement;
    for (int i = 0; i < 2; i++) {
        if (current != null && current != body) {
            Element parent = current.parent();
            if (parent != null) {
                current = parent;
            }
        }
    }

    for (int i = 0; i < 6 && current != null; i++) {
        Matcher matcher = TIME_PATTERN.matcher(current.outerHtml());
        if (matcher.find()) {
            return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3)
                    + " " + matcher.group(4) + ":" + matcher.group(5) + ":" + matcher.group(6);
        }
        if (current == body) {
            // Nothing wider to scan; re-matching the same HTML cannot succeed.
            break;
        }
        current = current.parent();
    }

    try {
        return getDate(contentElement);
    } catch (Exception ex) {
        throw new Exception("time not found", ex);
    }
}
/**
 * Looks for a year-month-day date near the content element.
 * <p>
 * The search starts up to two ancestors above {@code contentElement} and then
 * scans at most six times, moving one ancestor up after each miss until the
 * document body is reached.
 *
 * @param contentElement the element holding the main content
 * @return the date as {@code year-month-day}
 * @throws Exception if no date is found
 */
protected String getDate(Element contentElement) throws Exception {
    Pattern datePattern = Pattern.compile(
            "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})");

    Element cursor = contentElement;
    for (int step = 0; step < 2; step++) {
        boolean canClimb = cursor != null && cursor != doc.body() && cursor.parent() != null;
        if (canClimb) {
            cursor = cursor.parent();
        }
    }

    for (int attempt = 0; attempt < 6 && cursor != null; attempt++) {
        Matcher found = datePattern.matcher(cursor.outerHtml());
        if (found.find()) {
            return String.join("-", found.group(1), found.group(2), found.group(3));
        }
        if (cursor != doc.body()) {
            cursor = cursor.parent();
        }
    }

    throw new Exception("date not found");
}
/**
 * Measures how similar two strings are, on a scale from 0 to 1.
 * <p>
 * The result is the length of the longest common subsequence divided by the
 * longer string's length. It is {@code 0} when either string is empty or when
 * one string is at least three times as long as the other.
 *
 * @param a first string
 * @param b second string
 * @return the similarity
 */
protected double strSim(String a, String b) {
    int lengthA = a.length();
    int lengthB = b.length();
    if (lengthA == 0 || lengthB == 0) {
        return 0;
    }

    int longer = Math.max(lengthA, lengthB);
    int shorter = Math.min(lengthA, lengthB);
    if ((longer + 0.0) / shorter >= 3) {
        return 0;
    }

    return (lcs(a, b) + 0.0) / longer;
}
/**
 * Determines the article title, trying three strategies in order.
 * <ol>
 * <li>Among the {@code h1}-{@code h6} headings visited before the content
 * element, pick the one whose similarity to the page title, weighted by its
 * position, is highest.</li>
 * <li>Take the first element whose id or class starts or ends with
 * {@code title}, if its text is between 6 and 39 characters long.</li>
 * <li>Fall back to {@link #getTitleByEditDistance(Element)}.</li>
 * </ol>
 *
 * @param contentElement the element holding the main content
 * @return the title text
 * @throws Exception if no strategy yields a title
 */
protected String getTitle(final Element contentElement) throws Exception {
    final String pageTitle = doc.title().trim();

    if (!pageTitle.isEmpty()) {
        final ArrayList<Element> headings = new ArrayList<>();
        final ArrayList<Double> similarities = new ArrayList<>();
        // Number of headings collected at the moment the content element is reached.
        final int[] headingsBeforeContent = {0};

        doc.body().traverse(new NodeVisitor() {
            @Override
            public void head(Node node, int depth) {
                if (!(node instanceof Element)) {
                    return;
                }
                Element element = (Element) node;
                if (element == contentElement) {
                    headingsBeforeContent[0] = headings.size();
                    return;
                }
                if (element.tagName().matches("h[1-6]")) {
                    String headingText = element.text().trim();
                    similarities.add(strSim(headingText, pageTitle));
                    headings.add(element);
                }
            }

            @Override
            public void tail(Node node, int depth) {
            }
        });

        int bestIndex = -1;
        double bestScore = 0;
        for (int k = 0; k < headingsBeforeContent[0]; k++) {
            // Later headings get a larger weight.
            double weighted = (k + 1) * similarities.get(k);
            if (weighted > bestScore) {
                bestScore = weighted;
                bestIndex = k;
            }
        }
        if (bestIndex != -1) {
            return headings.get(bestIndex).text();
        }
    }

    Elements titleLike = doc.body().select("*[id^=title],*[id$=title],*[class^=title],*[class$=title]");
    if (!titleLike.isEmpty()) {
        String candidate = titleLike.first().text();
        if (candidate.length() > 5 && candidate.length() < 40) {
            return candidate;
        }
    }

    try {
        return getTitleByEditDistance(contentElement);
    } catch (Exception ex) {
        throw new Exception("title not found");
    }
}
/**
 * Picks the text node in the body that is most similar to the page title.
 * <p>
 * Despite the name, similarity is measured with {@link #strSim(String, String)}.
 * On a tie the earlier text node wins.
 *
 * @param contentElement the element holding the main content (not used here)
 * @return the trimmed text of the best-matching text node
 * @throws Exception if no text node has a similarity above zero
 */
protected String getTitleByEditDistance(Element contentElement) throws Exception {
    final String pageTitle = doc.title();
    final double[] bestSimilarity = {0.0};
    final StringBuilder bestText = new StringBuilder();

    doc.body().traverse(new NodeVisitor() {
        @Override
        public void head(Node node, int depth) {
            if (!(node instanceof TextNode)) {
                return;
            }
            String text = ((TextNode) node).text().trim();
            double similarity = strSim(text, pageTitle);
            // The best value starts at zero, so this also rejects zero similarity.
            if (similarity > bestSimilarity[0]) {
                bestSimilarity[0] = similarity;
                bestText.setLength(0);
                bestText.append(text);
            }
        }

        @Override
        public void tail(Node node, int depth) {
        }
    });

    if (bestText.length() == 0) {
        throw new Exception();
    }
    return bestText.toString();
}
/**
 * Computes the length of the longest common subsequence of two strings.
 *
 * @param x first string
 * @param y second string
 * @return the subsequence length; {@code 0} if either string is empty
 */
protected int lcs(String x, String y) {
    final int rows = x.length();
    final int cols = y.length();
    if (rows == 0 || cols == 0) {
        return 0;
    }

    // table[r][c] = LCS length of the first r chars of x and the first c chars of y.
    int[][] table = new int[rows + 1][cols + 1];
    for (int r = 1; r <= rows; r++) {
        char fromX = x.charAt(r - 1);
        for (int c = 1; c <= cols; c++) {
            if (fromX == y.charAt(c - 1)) {
                table[r][c] = table[r - 1][c - 1] + 1;
            } else {
                table[r][c] = Math.max(table[r - 1][c], table[r][c - 1]);
            }
        }
    }
    return table[rows][cols];
}
/**
 * Computes the edit distance between two strings, where inserting, deleting
 * or replacing one character each cost 1.
 *
 * @param word1 first string
 * @param word2 second string
 * @return the minimum number of single-character edits
 */
protected int editDistance(String word1, String word2) {
    final int rows = word1.length();
    final int cols = word2.length();

    // cost[r][c] = distance between the first r chars of word1 and the first c chars of word2.
    int[][] cost = new int[rows + 1][cols + 1];
    for (int r = 0; r <= rows; r++) {
        cost[r][0] = r;
    }
    for (int c = 0; c <= cols; c++) {
        cost[0][c] = c;
    }

    for (int r = 1; r <= rows; r++) {
        char left = word1.charAt(r - 1);
        for (int c = 1; c <= cols; c++) {
            if (left == word2.charAt(c - 1)) {
                cost[r][c] = cost[r - 1][c - 1];
            } else {
                int cheapestNeighbour = Math.min(cost[r - 1][c - 1],
                        Math.min(cost[r - 1][c], cost[r][c - 1]));
                cost[r][c] = cheapestNeighbour + 1;
            }
        }
    }
    return cost[rows][cols];
}
/* Given a Jsoup Document, returns the Element that holds the main content. */
public static Element getContentElementByDoc(Document doc) throws Exception {
    return new ContentExtractor(doc).getContentElement();
}
/* Given HTML, returns the Element that holds the main content. */
public static Element getContentElementByHtml(String html) throws Exception {
    return getContentElementByDoc(Jsoup.parse(html));
}
/* Given HTML and its URL, returns the Element that holds the main content. */
public static Element getContentElementByHtml(String html, String url) throws Exception {
    return getContentElementByDoc(Jsoup.parse(html, url));
}
/* Given a Jsoup Document, returns the main content as plain text. */
public static String getContentByDoc(Document doc) throws Exception {
    Element content = new ContentExtractor(doc).getContentElement();
    return content.text();
}
/* Given HTML, returns the main content as plain text. */
public static String getContentByHtml(String html) throws Exception {
    Element content = getContentElementByDoc(Jsoup.parse(html));
    return content.text();
}
/* Given HTML and its URL, returns the main content as plain text. */
public static String getContentByHtml(String html, String url) throws Exception {
    Element content = getContentElementByDoc(Jsoup.parse(html, url));
    return content.text();
}
/* Given a Jsoup Document, returns the structured news information. */
public static News getNewsByDoc(Document doc) throws Exception {
    return new ContentExtractor(doc).getNews();
}
/* Given HTML, returns the structured news information. */
public static News getNewsByHtml(String html) throws Exception {
    return getNewsByDoc(Jsoup.parse(html));
}
/* Given HTML and its URL, returns the structured news information. */
public static News getNewsByHtml(String html, String url) throws Exception {
    return getNewsByDoc(Jsoup.parse(html, url));
}
}
src/main/java/com/zhiwei/source_forward/content/News.java
0 → 100644
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
content
;
import
org.jsoup.nodes.Element
;
/**
 * Structured result of a news extraction: URL, title, time, and the content
 * both as text and as the element it came from.
 *
 * @author hu
 */
public class News {

    protected String url;
    protected String title;
    protected String content;
    protected String time;
    protected Element contentElement;

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * Returns the content text. If none has been set, it is taken from the
     * content element on first access and cached.
     *
     * @return the content text, or {@code null} if neither text nor element is set
     */
    public String getContent() {
        if (content == null && contentElement != null) {
            content = contentElement.text();
        }
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public Element getContentElement() {
        return contentElement;
    }

    public void setContentElement(Element contentElement) {
        this.contentElement = contentElement;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("URL:\n").append(url);
        text.append("\nTITLE:\n").append(title);
        text.append("\nTIME:\n").append(time);
        text.append("\nCONTENT:\n").append(getContent());
        text.append("\nCONTENT(SOURCE):\n").append(contentElement);
        return text.toString();
    }
}
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
0 → 100644
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
com.zhiwei.source_forward.util.ProxyClientUtil
;
import
com.zhiwei.tools.httpclient.HttpBoot
;
import
com.zhiwei.tools.httpclient.HttpRequestBuilder
;
import
com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter
;
import
okhttp3.Request
;
import
okhttp3.Response
;
/**
 * Fetches pages asynchronously and hands the content matched on each page to
 * a {@link ContentDataCallback}.
 */
public class ContentCrawler {

    private static Logger logger = LogManager.getLogger(ContentCrawler.class);

    /**
     * Submits the given URLs and returns the counter used to track them.
     *
     * @param callback receiver for each resulting {@link ContentBean}
     * @param urls     the URLs to fetch; {@code null} entries are skipped
     * @return the counter that is increased per request and reduced when it completes
     * @throws Exception declared for callers; see the private helpers for what can fail
     */
    public MultiThreadingCounter submitTask(ContentDataCallback callback, String... urls) throws Exception {
        MultiThreadingCounter counter = new MultiThreadingCounter();
        start(counter, callback, urls);
        return counter;
    }

    /**
     * Starts one request per non-null URL.
     * <p>
     * The counter is held for the duration of each submission and released in
     * {@code finally}; the request itself holds its own count inside
     * {@link #search}.
     *
     * @param counter  the counter tracking outstanding work
     * @param callback receiver for results
     * @param urls     the URLs to fetch
     */
    private void start(MultiThreadingCounter counter, ContentDataCallback callback, String... urls) {
        if (urls == null || urls.length == 0) {
            return;
        }
        for (String url : urls) {
            if (url == null) {
                continue;
            }
            try {
                counter.increase();
                search(counter, url, Attribution.of(url), callback);
            } catch (Exception e) {
                // Both placeholders are now filled, and the exception is passed
                // last so that its stack trace is logged.
                logger.error("关键词 {} 搜索创建出错: {}", url, e.getMessage(), e);
            } finally {
                counter.reduce();
            }
        }
    }

    /**
     * Issues an asynchronous GET for {@code url} and parses the response when
     * it arrives.
     *
     * @param counter  increased before the call and reduced when the listener finishes
     * @param url      the URL to fetch
     * @param attr     attribute carried through to the callback (wraps the URL)
     * @param callback receiver for the result
     * @return the same {@code counter}
     */
    private MultiThreadingCounter search(MultiThreadingCounter counter, String url, Attribution attr,
            ContentDataCallback callback) {
        logger.info("当前处理 URL: {}", url);
        Request request = HttpRequestBuilder.newGetRequest(url, null);
        counter.increase();
        HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false).addListeners(future -> {
            try {
                if (future.isSuccess()) {
                    Response response = future.result();
                    try {
                        parseHtml(response, attr, callback);
                    } catch (Exception e) {
                        logger.error("解析出错", e);
                    }
                } else {
                    logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
                }
            } finally {
                // Always release the count, even if the listener throws.
                counter.reduce();
            }
        });
        return counter;
    }

    /**
     * Extracts the content from a response and passes it to the callback.
     * <p>
     * The callback is invoked even when extraction fails or the response is
     * unsuccessful; the bean's content is then {@code null}. The response is
     * always closed.
     *
     * @param response the HTTP response
     * @param attr     attribute wrapping the requested URL
     * @param callback receiver for the result; if {@code null} a warning is logged instead
     */
    private void parseHtml(Response response, Attribution attr, ContentDataCallback callback) {
        String content = null;
        try {
            if (response.isSuccessful()) {
                String html = response.body().string();
                content = MatchContent.matchContent(attr.get().toString(), html);
            }
        } catch (Exception e) {
            // Log the exception itself; fillInStackTrace() would overwrite its origin.
            logger.info("网页链接失效", e);
        } finally {
            if (response != null) {
                response.close();
            }
        }

        ContentBean bean = new ContentBean(attr.get().toString(), content);
        if (callback == null) {
            logger.warn("DataCallback 对象为 null,无法保存数据");
        } else {
            callback.onData(bean, attr);
        }
    }
}
src/main/java/com/zhiwei/source_forward/crawler/ContentPageProcessor.java
deleted
100644 → 0
View file @
76581f38
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.HashMap
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.processor.PageProcessor
;
/**
 * Page processor that extracts the article content of each crawled page.
 *
 * @author hero
 * @date 2018-06-30 09:54:02
 */
public class ContentPageProcessor implements PageProcessor {

    private static Logger logger = LoggerFactory.getLogger(ContentPageProcessor.class);

    /** Crawl settings: 3 cycle retries, 1500 sleep time, 10000 time-out, fixed user agent. */
    private Site site = Site.me()
            .setCycleRetryTimes(3)
            .setSleepTime(1500)
            .setTimeOut(10000)
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .addHeader("Accept-Encoding", "deflate, br");

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts the content of {@code page} and stores a map with the keys
     * {@code url} and {@code content} under the field {@code content}.
     * <p>
     * Content stays {@code null} for a 404 page or when extraction throws.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String content = null;
        try {
            if (page.getStatusCode() != 404) {
                content = MatchContent.matchContent(page.getUrl().get(), page.getHtml().toString());
            }
        } catch (Exception e) {
            // Log the exception itself; fillInStackTrace() would overwrite its origin.
            logger.info("网页链接失效", e);
        }

        Map<String, String> data = new HashMap<String, String>();
        data.put("url", page.getUrl().get());
        data.put("content", content);
        page.putField("content", data);
    }
}
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
0 → 100644
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.List
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack
;
import
com.zhiwei.source_forward.util.ProxyClientUtil
;
import
com.zhiwei.tools.httpclient.HttpBoot
;
import
com.zhiwei.tools.httpclient.HttpRequestBuilder
;
import
com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter
;
import
okhttp3.Request
;
import
okhttp3.Response
;
/**
 * Fetches pages asynchronously and resolves the self-media source and the
 * channel of each one.
 *
 * @author byte-zbs
 * @Date 2018-08-21 15:54:03
 * @version 1.0.0
 */
public class MediaSelfSourceCrawler {

    private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class);

    /**
     * Submits the given URLs and returns the counter used to track them.
     *
     * @param callback receiver for each resulting {@link MediaSelfSourceBean}
     * @param urls     the URLs to fetch; {@code null} entries are skipped
     * @return the counter that is increased per request and reduced when it completes
     * @throws Exception declared for callers; see the private helpers for what can fail
     */
    public MultiThreadingCounter submitTask(MediaSelfSourceDataCallBack callback, String... urls) throws Exception {
        MultiThreadingCounter counter = new MultiThreadingCounter();
        start(counter, callback, urls);
        return counter;
    }

    /**
     * Starts one request per non-null URL.
     *
     * @param counter  the counter tracking outstanding work
     * @param callback receiver for results
     * @param urls     the URLs to fetch
     */
    private void start(MultiThreadingCounter counter, MediaSelfSourceDataCallBack callback, String... urls) {
        if (urls == null || urls.length == 0) {
            return;
        }
        for (String url : urls) {
            if (url == null) {
                continue;
            }
            try {
                counter.increase();
                search(counter, url, Attribution.of(url), callback);
            } catch (Exception e) {
                // Both placeholders are now filled, and the exception is passed
                // last so that its stack trace is logged.
                logger.error("关键词 {} 搜索创建出错: {}", url, e.getMessage(), e);
            } finally {
                counter.reduce();
            }
        }
    }

    /**
     * Issues an asynchronous GET for {@code url} and parses the response when
     * it arrives.
     *
     * @param counter  increased before the call and reduced when the listener finishes
     * @param url      the URL to fetch
     * @param attr     attribute carried through to the callback (wraps the URL)
     * @param callback receiver for the result
     * @return the same {@code counter}
     */
    private MultiThreadingCounter search(MultiThreadingCounter counter, String url, Attribution attr,
            MediaSelfSourceDataCallBack callback) {
        logger.info("当前处理 URL: {}", url);
        Request request = HttpRequestBuilder.newGetRequest(url, null);
        counter.increase();
        HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false).addListeners(future -> {
            try {
                if (future.isSuccess()) {
                    Response response = future.result();
                    try {
                        parseHtml(response, attr, callback);
                    } catch (Exception e) {
                        logger.error("解析出错", e);
                    }
                } else {
                    logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
                }
            } finally {
                // Always release the count, even if the listener throws.
                counter.reduce();
            }
        });
        return counter;
    }

    /**
     * Resolves source and channel from a response and passes them to the callback.
     * <p>
     * The channel is first derived from the URL; only if that yields nothing
     * is it looked up in the nodes of the HTML head. An empty source is
     * normalised to {@code null}. If anything throws, the source is reset to
     * {@code null} and the failure is logged. The response is always closed.
     *
     * @param response the HTTP response
     * @param attr     attribute wrapping the requested URL
     * @param callback receiver for the result; if {@code null} a warning is logged instead
     */
    private void parseHtml(Response response, Attribution attr, MediaSelfSourceDataCallBack callback) {
        String source = null;
        String channel = null;
        try {
            if (response.isSuccessful()) {
                String html = response.body().string();
                source = MatchSource.matchMediaSelfSource(attr.get().toString(), html);
                if (source == null || source.equals("")) {
                    source = null;
                }
                channel = MatchChannel.verifyChannel(attr.get().toString());
                if (channel == null) {
                    List<Node> nodeList = Jsoup.parse(html).head().childNodes();
                    channel = MatchChannel.matchChannel(nodeList);
                }
            }
        } catch (Exception e) {
            // Previously swallowed silently; keep the fallback but record the failure.
            logger.warn("{} 解析出错", attr.get(), e);
            source = null;
        } finally {
            if (response != null) {
                response.close();
            }
        }

        logger.info("{}================={}", attr.get(), source);
        MediaSelfSourceBean bean = new MediaSelfSourceBean(attr.get().toString(), source, channel);
        if (callback == null) {
            logger.warn("DataCallback 对象为 null,无法保存数据");
        } else {
            callback.onData(bean, attr);
        }
    }
}
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourcePageProcessor.java
deleted
100644 → 0
View file @
76581f38
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.processor.PageProcessor
;
/**
 * Page processor that resolves the self-media source and the channel of each
 * crawled page.
 */
public class MediaSelfSourcePageProcessor implements PageProcessor {

    /** Crawl settings: 3 cycle retries, 1500 sleep time, 10000 time-out, fixed user agent. */
    private Site site = Site.me()
            .setCycleRetryTimes(3)
            .setSleepTime(1500)
            .setTimeOut(10000)
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .addHeader("Accept-Encoding", "gzip, deflate, br");

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Resolves source and channel for {@code page} and stores a map with the
     * keys {@code url}, {@code mediaself} and {@code channel} under the field
     * {@code mediaSelf}.
     * <p>
     * Nothing is resolved for a 404 page. If resolution throws, the source is
     * reset to {@code null} and processing continues.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String source = null;
        String channel = null;
        try {
            if (page.getStatusCode() != 404) {
                source = MatchSource.matchMediaSelfSource(page.getUrl().get(), page.getHtml().toString());
                if ("".equals(source)) {
                    // An empty match is treated like no match.
                    source = null;
                }
                channel = MatchChannel.verifyChannel(page.getUrl().get());
                if (channel == null) {
                    // Fall back to the nodes of the HTML head.
                    List<Node> headNodes = page.getHtml().getDocument().head().childNodes();
                    channel = MatchChannel.matchChannel(headNodes);
                }
            }
        } catch (Exception e) {
            source = null;
        }

        String url = page.getUrl().get();
        System.out.println(url + "=================" + source);

        Map<String, String> data = new HashMap<String, String>();
        data.put("url", url);
        data.put("mediaself", source);
        data.put("channel", channel);
        page.putField("mediaSelf", data);
    }
}
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
0 → 100644
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.List
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.ProxyClientUtil
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
com.zhiwei.tools.httpclient.HttpBoot
;
import
com.zhiwei.tools.httpclient.HttpRequestBuilder
;
import
com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter
;
import
okhttp3.Request
;
import
okhttp3.Response
;
/**
 * Fetches article URLs asynchronously and, for each of them, determines the channel, the root source
 * and (for mp.weixin.qq.com pages) whether the article is marked as original. Results are delivered
 * through a {@link SourceForwardDataCallBack}.
 */
public class SourceForwardCrawler {

    private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class);

    /** Source names passed to {@code MatchSource.matchSource}; loaded once from {@code SourceData}. */
    private static List<String> sourceList = SourceData.getSourceList();

    /**
     * Submits one asynchronous request per non-null URL.
     *
     * @param callback receiver of each parsed bean, may be {@code null} (results are then only logged)
     * @param urls     URLs to process; {@code null} entries are skipped
     * @return the counter that tracks the outstanding requests
     * @throws Exception declared for callers; nothing in this method throws a checked exception itself
     */
    public MultiThreadingCounter submitTask(SourceForwardDataCallBack callback, String... urls) throws Exception {
        MultiThreadingCounter counter = new MultiThreadingCounter();
        start(counter, callback, urls);
        return counter;
    }

    /**
     * Schedules every URL. The counter is raised while a request is being created and lowered again
     * afterwards, so a failure to create one request does not affect the others.
     */
    private void start(MultiThreadingCounter counter, SourceForwardDataCallBack callback, String... urls) {
        if (urls == null || urls.length == 0) {
            return;
        }
        for (String url : urls) {
            if (url == null) {
                continue;
            }
            try {
                counter.increase();
                search(counter, url, Attribution.of(url), callback);
            } catch (Exception e) {
                // The message has two placeholders; the original passed only the exception message,
                // so the URL was never logged. The trailing throwable keeps the stack trace.
                logger.error("关键词 {} 搜索创建出错: {}", url, e.getMessage(), e);
            } finally {
                counter.reduce();
            }
        }
    }

    /**
     * Issues the asynchronous GET request for one URL and parses the response in the listener.
     *
     * @return the same counter that was passed in
     */
    private MultiThreadingCounter search(MultiThreadingCounter counter, String url, Attribution attr,
            SourceForwardDataCallBack callback) {
        logger.info("当前处理 URL: {}", url);
        Request request = HttpRequestBuilder.newGetRequest(url, null);
        // NOTE(review): if asyncCall itself throws, this increase is never matched by a reduce — confirm
        // whether HttpBoot.asyncCall can fail synchronously.
        counter.increase();
        HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false).addListeners(future -> {
            try {
                if (future.isSuccess()) {
                    Response response = future.result();
                    try {
                        parseHtml(response, attr, callback);
                    } catch (Exception e) {
                        logger.error("解析出错", e);
                    }
                } else {
                    logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
                }
            } finally {
                // Always release the counter, even if the listener body throws.
                counter.reduce();
            }
        });
        return counter;
    }

    /**
     * Parses one response into a {@link SourceForwardBean} and hands it to the callback.
     *
     * <p>Defaults are channel "新闻", forward flag "未知" and a {@code null} source. For
     * mp.weixin.qq.com URLs only the forward flag is derived (kept only when it equals "原创");
     * for all other URLs channel and source are matched. Any failure is logged and resets source and
     * channel to their defaults. The response is always closed.
     */
    private void parseHtml(Response response, Attribution attr, SourceForwardDataCallBack callback) {
        String source = null;
        String channel = "新闻";
        String isforward = "未知";
        try {
            if (response.isSuccessful()) {
                Document document = Jsoup.parse(response.body().string());
                String url = attr.get().toString();
                if (url.contains("mp.weixin.qq.com")) {
                    isforward = document.select("div#meta_content").select("span#copyright_logo").text();
                    if (!"原创".equals(isforward)) {
                        isforward = "未知";
                    }
                } else {
                    channel = MatchChannel.verifyChannel(url);
                    if (channel == null) {
                        // URL alone did not identify the channel: inspect the nodes of the HTML head.
                        channel = MatchChannel.matchChannel(document.head().childNodes());
                    }
                    source = MatchSource.matchSource(url, document.toString(), sourceList);
                }
            }
        } catch (Exception e) {
            // Previously swallowed silently; fall back to the defaults but record the cause.
            logger.warn("解析出错: {}", attr.get(), e);
            source = null;
            channel = "新闻";
        } finally {
            if (response != null) {
                response.close();
            }
        }
        logger.info("{}======={}================={}", attr.get().toString(), channel, source);
        SourceForwardBean sfb = new SourceForwardBean(attr.get().toString(), channel, source, isforward);
        if (callback == null) {
            logger.warn("DataCallback 对象为 null,无法保存数据");
        } else {
            callback.onData(sfb, attr);
        }
    }
}
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardPageProcessor.java
deleted
100644 → 0
View file @
76581f38
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.SourceData
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.processor.PageProcessor
;
/**
 * WebMagic page processor that derives channel and root source of a downloaded page (or, for
 * mp.weixin.qq.com pages, the "original" marker) and stores them in the "sourceForward" result field.
 */
public class SourceForwardPageProcessor implements PageProcessor {

    /** Source names passed to {@code MatchSource.matchSource}. */
    private static List<String> sourceList = SourceData.getSourceList();

    /** Crawl settings (retries, delay, timeout, headers) used for every request of this processor. */
    private Site site = Site.me()
            .setCycleRetryTimes(3)
            .setSleepTime(1500)
            .setTimeOut(10000)
            .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Processes every page whose status code is not 404. WeChat pages only contribute the
     * "isforward" entry; all other pages get channel and root source matched. Any exception resets
     * the source to {@code null} and the channel to "新闻".
     *
     * @param page downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        Map<String, String> fields = new HashMap<>();
        String rootSource = null;
        String channelName = "新闻";
        try {
            if (page.getStatusCode() != 404) {
                boolean weChatArticle = page.getUrl().get().contains("mp.weixin.qq.com");
                if (weChatArticle) {
                    String marker = page.getHtml().getDocument()
                            .select("div#meta_content")
                            .select("span#copyright_logo")
                            .text();
                    // Only the exact "原创" marker is kept; anything else is reported as unknown.
                    fields.put("isforward", "原创".equals(marker) ? marker : "未知");
                } else {
                    channelName = MatchChannel.verifyChannel(page.getUrl().get());
                    if (channelName == null) {
                        // Fall back to the nodes of the document head.
                        channelName = MatchChannel.matchChannel(page.getHtml().getDocument().head().childNodes());
                    }
                    rootSource = MatchSource.matchSource(page.getUrl().get(), page.getHtml().toString(), sourceList);
                }
            }
        } catch (Exception e) {
            rootSource = null;
            channelName = "新闻";
        }

        String pageUrl = page.getUrl().get();
        System.out.println(pageUrl + "=======" + channelName + "=================" + rootSource);

        fields.put("url", pageUrl);
        fields.put("channel", channelName);
        fields.put("root_source", rootSource);
        page.putField("sourceForward", fields);
    }
}
src/main/java/com/zhiwei/source_forward/crawler/UrlLive
PageProcesso
r.java
→
src/main/java/com/zhiwei/source_forward/crawler/UrlLive
Crawle
r.java
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.HashMap
;
import
java.io.IOException
;
import
java.util.List
;
import
java.util.Map
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Node
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.processor.PageProcessor
;
public
class
UrlLivePageProcessor
implements
PageProcessor
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
UrlLivePageProcessor
.
class
);
private
Site
site
=
Site
.
me
().
setCycleRetryTimes
(
3
).
setSleepTime
(
1500
)
.
setTimeOut
(
15000
)
.
addHeader
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
)
.
addHeader
(
"Accept"
,
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
);
@Override
public
void
process
(
Page
page
)
{
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.util.ProxyClientUtil
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.tools.httpclient.HttpBoot
;
import
com.zhiwei.tools.httpclient.HttpRequestBuilder
;
import
com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter
;
import
okhttp3.Request
;
import
okhttp3.Response
;
/**
*
* @ClassName UrlLiveCrawler
* @Description 判断页面是否存在
* @author byte-zbs
* @Date 2018年8月20日 下午3:34:57
* @version 1.0.0
*/
public
class
UrlLiveCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
/**
 * Creates a fresh counter, schedules a request for every given URL and returns the counter.
 *
 * @param callback receiver of each result
 * @param urls     URLs to check
 * @return the counter tracking the scheduled requests
 * @throws Exception declared for callers
 */
public MultiThreadingCounter submitTask(UrlLiveDataCallback callback, String... urls) throws Exception {
    final MultiThreadingCounter pending = new MultiThreadingCounter();
    start(pending, callback, urls);
    return pending;
}
/**
 * Schedules a request for every non-null URL. The counter is raised while a request is being created
 * and lowered again afterwards, so a failure to create one request does not affect the others.
 *
 * @param counter  counter tracking outstanding work
 * @param callback receiver of each result
 * @param urls     URLs to check; {@code null} or empty means nothing to do
 */
private void start(MultiThreadingCounter counter, UrlLiveDataCallback callback, String... urls) {
    if (urls == null || urls.length == 0) {
        return;
    }
    for (String url : urls) {
        if (url == null) {
            continue;
        }
        try {
            counter.increase();
            search(counter, url, Attribution.of(url), callback);
        } catch (Exception e) {
            // The message has two placeholders; the original passed only the exception message,
            // so the URL was never logged. The trailing throwable keeps the stack trace.
            logger.error("关键词 {} 搜索创建出错: {}", url, e.getMessage(), e);
        } finally {
            counter.reduce();
        }
    }
}
/**
 * Issues the asynchronous GET request for one URL and evaluates the response in the listener.
 * The counter is raised before the call and lowered once the listener has finished.
 *
 * @param counter  counter tracking outstanding requests
 * @param url      URL to fetch
 * @param attr     attribution passed through to the callback
 * @param callback receiver of the result
 * @return the same counter that was passed in
 */
private MultiThreadingCounter search(MultiThreadingCounter counter, String url, Attribution attr,
        UrlLiveDataCallback callback) {
    logger.info("当前处理 URL: {}", url);
    Request request = HttpRequestBuilder.newGetRequest(url, null);
    // NOTE(review): if asyncCall itself throws, this increase is never matched by a reduce — confirm
    // whether HttpBoot.asyncCall can fail synchronously.
    counter.increase();
    HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false).addListeners(future -> {
        try {
            if (future.isSuccess()) {
                Response response = future.result();
                try {
                    parseHtml(response, attr, callback);
                } catch (Exception e) {
                    logger.error("解析出错", e);
                }
            } else {
                logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
            }
        } finally {
            // Always release the counter, even if the listener body throws.
            counter.reduce();
        }
    });
    return counter;
}
private
void
parseHtml
(
Response
response
,
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
/***验证网页是否能够连通*/
boolean
f
=
true
;
if
(
page
!=
null
){
if
(
page
.
getStatusCode
()==
200
){
f
=
matchDel
(
page
);
}
else
if
(
page
.
getStatusCode
()==
404
){
f
=
true
;
if
(!
response
.
isSuccessful
()){
try
{
f
=
matchDel
(
response
.
body
().
string
(),
attr
.
get
().
toString
());
}
catch
(
IOException
e
)
{
logger
.
info
(
"数据判断出错 {}"
,
e
.
getMessage
());
}
finally
{
if
(
response
!=
null
)
{
response
.
close
();
}
}
}
else
{
f
=
false
;
}
UrlLiveBean
ulb
=
new
UrlLiveBean
(
attr
.
get
().
toString
(),
f
);
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
callback
.
onData
(
ulb
,
attr
);
}
Map
<
String
,
Object
>
data
=
new
HashMap
<
String
,
Object
>();
data
.
put
(
"url"
,
page
.
getUrl
().
get
());
data
.
put
(
"live"
,
f
);
page
.
putField
(
"urlLive"
,
data
);
}
@Override
public
Site
getSite
()
{
return
site
;
}
/***
* @Title: matchDel
* @author hero
...
...
@@ -53,59 +109,59 @@ public class UrlLivePageProcessor implements PageProcessor{
* @param @return 设定文件
* @return boolean 返回类型
*/
public
boolean
matchDel
(
Page
page
){
public
boolean
matchDel
(
String
result
,
String
url
){
int
step
=
1
;
Document
doc
=
page
.
getHtml
().
getDocument
(
);
Document
doc
=
Jsoup
.
parse
(
result
);
if
(
rulerHead
(
doc
)){
logger
.
info
(
"{}检测规则:第{}步"
,
page
.
getUrl
()
,
step
);
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
return
true
;
}
step
++;
if
(
rulerYaoyan
(
doc
))
{
logger
.
info
(
"{}检测规则:第{}步"
,
page
.
getUrl
()
,
step
);
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
return
true
;
}
step
++;
if
(
rulerWeigui
(
doc
))
{
logger
.
info
(
"{}检测规则:第{}步"
,
page
.
getUrl
()
,
step
);
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
return
true
;
}
step
++;
if
(
rulerTousu
(
doc
))
{
logger
.
info
(
"{}检测规则:第{}步"
,
page
.
getUrl
()
,
step
);
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
return
true
;
}
step
++;
if
(
page
.
getUrl
().
get
()
.
contains
(
"huanqiu.com"
))
if
(
url
.
contains
(
"huanqiu.com"
))
{
logger
.
info
(
"{}检测规则:第{}步"
,
page
.
getUrl
()
,
step
);
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
return
rulerHuanqiuWuxiao
(
doc
);
}
step
++;
//7
if
(
rulerBucunzai
(
doc
))
{
logger
.
info
(
"{}检测规则:第{}步"
,
page
.
getUrl
()
,
step
);
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
return
true
;
}
step
++;
//8
if
(
rulerKong
(
doc
))
{
logger
.
info
(
"{}检测规则:第{}步"
,
page
.
getUrl
()
,
step
);
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
return
true
;
}
step
++;
//9
if
(
rulerZhaoshang
(
doc
))
{
logger
.
info
(
"{}检测规则:第{}步"
,
page
.
getUrl
()
,
step
);
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
return
true
;
}
step
++;
//11
if
(
rulerYidian
(
doc
))
{
logger
.
info
(
"{}检测规则:第{}步"
,
page
.
getUrl
()
,
step
);
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
return
true
;
}
return
false
;
...
...
@@ -302,6 +358,12 @@ public class UrlLivePageProcessor implements PageProcessor{
return
true
;
}
}
if
(
node
.
outerHtml
().
contains
(
"meta"
))
{
String
meta
=
node
.
toString
();
if
(
meta
.
contains
(
"公益404页面"
))
{
return
true
;
}
}
}
}
catch
(
Exception
e
)
{
return
false
;
...
...
src/main/java/com/zhiwei/source_forward/downloader/MyDownLoader.java
deleted
100644 → 0
View file @
76581f38
package
com
.
zhiwei
.
source_forward
.
downloader
;
import
java.io.IOException
;
import
java.net.UnknownHostException
;
import
java.nio.charset.Charset
;
import
java.util.HashMap
;
import
java.util.Map
;
import
java.util.Set
;
import
org.apache.commons.io.IOUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.http.HttpHost
;
import
org.apache.http.HttpResponse
;
import
org.apache.http.NameValuePair
;
import
org.apache.http.client.config.CookieSpecs
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpUriRequest
;
import
org.apache.http.client.methods.RequestBuilder
;
import
org.apache.http.conn.ConnectTimeoutException
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.util.EntityUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.downloader.AbstractDownloader
;
import
us.codecraft.webmagic.downloader.HttpClientGenerator
;
import
us.codecraft.webmagic.proxy.Proxy
;
import
us.codecraft.webmagic.selector.PlainText
;
import
us.codecraft.webmagic.utils.HttpConstant
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
us.codecraft.webmagic.utils.WMCollections
;
/**
 * WebMagic downloader built on Apache HttpClient.
 *
 * <p>Compared with a plain failure, a connect timeout or an unknown host is reported as a synthetic
 * page with status code 404 and no content, so page processors can still see the URL. Other I/O
 * errors are retried through the site's cycle-retry mechanism when it is enabled.
 */
public class MyDownLoader extends AbstractDownloader {

    private Logger logger = LoggerFactory.getLogger(getClass());

    /** HTTP clients keyed by site domain; entries are created under the instance lock. */
    private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();

    private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();

    /**
     * Returns the HTTP client for the site's domain, creating and caching it on first use.
     * Without a site a client is generated for the proxy only and is not cached.
     */
    private CloseableHttpClient getHttpClient(Site site, Proxy proxy) {
        if (site == null) {
            return httpClientGenerator.getClient(null, proxy);
        }
        String domain = site.getDomain();
        // NOTE(review): this first lookup reads the plain HashMap outside the lock — confirm that is
        // acceptable for the thread counts used with this downloader.
        CloseableHttpClient httpClient = httpClients.get(domain);
        if (httpClient == null) {
            synchronized (this) {
                httpClient = httpClients.get(domain);
                if (httpClient == null) {
                    httpClient = httpClientGenerator.getClient(site, proxy);
                    httpClients.put(domain, httpClient);
                }
            }
        }
        return httpClient;
    }

    /**
     * Downloads one request.
     *
     * @param request request to execute; its status code (and proxy, if any) are stored as extras
     * @param task    owning task, supplies the {@link Site} configuration
     * @return the downloaded page; a synthetic 404 page on connect timeout or unknown host;
     *         {@code null} when the status code is not accepted or another I/O error is not retried
     */
    @Override
    public Page download(Request request, Task task) {
        Site site = null;
        if (task != null) {
            site = task.getSite();
        }
        Set<Integer> acceptStatCode;
        String charset = null;
        Map<String, String> headers = null;
        if (site != null) {
            acceptStatCode = site.getAcceptStatCode();
            charset = site.getCharset();
            headers = site.getHeaders();
        } else {
            acceptStatCode = WMCollections.newHashSet(200);
        }
        logger.info("downloading page {}", request.getUrl());
        CloseableHttpResponse httpResponse = null;
        int statusCode = 0;
        try {
            // NOTE(review): site is dereferenced below (and in the finally block) although the code
            // above tolerates a null site — confirm that a task always provides one.
            HttpHost proxyHost = null;
            Proxy proxy = null;
            if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
                proxy = site.getHttpProxyFromPool();
                proxyHost = proxy.getHttpHost();
            } else if (site.getHttpProxy() != null) {
                proxyHost = site.getHttpProxy();
            }
            // Build the request with the selected proxy host applied to its request config.
            HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);
            // Execute with the client obtained for this site and proxy.
            httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);
            statusCode = httpResponse.getStatusLine().getStatusCode();
            request.putExtra(Request.STATUS_CODE, statusCode);
            if (statusAccept(acceptStatCode, statusCode)) {
                Page page = handleResponse(request, charset, httpResponse, task);
                onSuccess(request);
                return page;
            } else {
                logger.warn("get page {} error, status code {} ", request.getUrl(), statusCode);
                return null;
            }
        } catch (ConnectTimeoutException | UnknownHostException e) {
            logger.warn("download page {} error", request.getUrl(), e);
            onError(request);
            return unreachablePage(request);
        } catch (IOException e) {
            logger.warn("download page {} error", request.getUrl(), e);
            if (site.getCycleRetryTimes() > 0) {
                return addToCycleRetry(request, site);
            }
            onError(request);
            return null;
        } finally {
            request.putExtra(Request.STATUS_CODE, statusCode);
            if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
                site.returnHttpProxyToPool((HttpHost) request.getExtra(Request.PROXY),
                        (Integer) request.getExtra(Request.STATUS_CODE));
            }
            try {
                if (httpResponse != null) {
                    // Ensure the connection is released back to the pool.
                    EntityUtils.consume(httpResponse.getEntity());
                }
            } catch (IOException e) {
                logger.warn("close response fail", e);
            }
        }
    }

    /** Builds the placeholder page (status 404, no raw text) returned for unreachable hosts. */
    private Page unreachablePage(Request request) {
        Page page = new Page();
        page.setStatusCode(404);
        page.setUrl(new PlainText(request.getUrl()));
        page.setRawText(null);
        return page;
    }

    @Override
    public void setThread(int thread) {
        httpClientGenerator.setPoolSize(thread);
    }

    /** Returns whether the given status code is one of the accepted codes. */
    protected boolean statusAccept(Set<Integer> acceptStatCode, int statusCode) {
        return acceptStatCode.contains(statusCode);
    }

    /**
     * Builds the Apache HttpClient request: method and URI from the request, the given headers, the
     * site timeout for connection-request, socket and connect timeouts, and the optional proxy.
     * When a proxy is used it is also stored on the request under {@code Request.PROXY}.
     */
    protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers, HttpHost proxy) {
        RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl());
        if (headers != null) {
            for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
                requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
            }
        }
        @SuppressWarnings("deprecation")
        RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
                .setConnectionRequestTimeout(site.getTimeOut())
                .setSocketTimeout(site.getTimeOut())
                .setConnectTimeout(site.getTimeOut())
                .setCookieSpec(CookieSpecs.BEST_MATCH);
        if (proxy != null) {
            requestConfigBuilder.setProxy(proxy);
            request.putExtra(Request.PROXY, proxy);
        }
        requestBuilder.setConfig(requestConfigBuilder.build());
        return requestBuilder.build();
    }

    /**
     * Maps the request's HTTP method name to a request builder; a missing method means GET.
     * For POST, the optional "nameValuePair" extra supplies the form parameters.
     *
     * @throws IllegalArgumentException if the method is none of GET, POST, HEAD, PUT, DELETE, TRACE
     */
    protected RequestBuilder selectRequestMethod(Request request) {
        String method = request.getMethod();
        if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
            // GET is the default.
            return RequestBuilder.get();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
            RequestBuilder requestBuilder = RequestBuilder.post();
            NameValuePair[] nameValuePair = (NameValuePair[]) request.getExtra("nameValuePair");
            if (nameValuePair != null && nameValuePair.length > 0) {
                requestBuilder.addParameters(nameValuePair);
            }
            return requestBuilder;
        } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
            return RequestBuilder.head();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
            return RequestBuilder.put();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
            return RequestBuilder.delete();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
            return RequestBuilder.trace();
        }
        throw new IllegalArgumentException("Illegal HTTP Method " + method);
    }

    /** Converts the HTTP response into a page carrying the decoded body, URL, request and status code. */
    protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
        String content = getContent(charset, httpResponse);
        Page page = new Page();
        page.setRawText(content);
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
        return page;
    }

    /**
     * Reads the response body as text. With an explicit charset the body is decoded with it;
     * otherwise the charset is detected via {@link #getHtmlCharset} and the platform default is
     * used when detection yields nothing.
     */
    protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
        if (charset == null) {
            byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
            String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
            if (htmlCharset != null) {
                return new String(contentBytes, htmlCharset);
            } else {
                logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
                return new String(contentBytes);
            }
        } else {
            return IOUtils.toString(httpResponse.getEntity().getContent(), charset);
        }
    }

    /**
     * Detects the charset of an HTML response: first from the Content-Type header, then from the
     * page's meta tags (decoded with the platform default charset for that scan).
     *
     * @return the detected charset name, or {@code null} when none was found
     */
    protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
        String charset = null;
        // 1. Encoding in the HTTP Content-Type header. The header is optional, so a missing header or
        //    value must not abort the detection with a NullPointerException.
        if (httpResponse.getEntity().getContentType() != null) {
            String value = httpResponse.getEntity().getContentType().getValue();
            if (value != null) {
                charset = UrlUtils.getCharset(value);
            }
        }
        if (StringUtils.isNotBlank(charset)) {
            logger.debug("Auto get charset: {}", charset);
            return charset;
        }
        // Decode with the default charset first, only to be able to read the meta tags.
        Charset defaultCharset = Charset.defaultCharset();
        String content = new String(contentBytes, defaultCharset.name());
        // 2. Charset declared in a meta tag.
        if (StringUtils.isNotEmpty(content)) {
            Document document = Jsoup.parse(content);
            Elements links = document.select("meta");
            for (Element link : links) {
                String metaContent = link.attr("content");
                String metaCharset = link.attr("charset");
                int charsetIndex = metaContent.indexOf("charset");
                if (charsetIndex != -1) {
                    // 2.1 HTML 4.01: <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
                    String[] parts = metaContent.substring(charsetIndex).split("=");
                    if (parts.length > 1) {
                        charset = parts[1].trim();
                        break;
                    }
                    // "charset" without a value: skip this tag instead of failing on parts[1].
                    continue;
                }
                if (StringUtils.isNotEmpty(metaCharset)) {
                    // 2.2 HTML5: <meta charset="UTF-8" />
                    charset = metaCharset;
                    break;
                }
            }
        }
        logger.debug("Auto get charset: {}", charset);
        // 3. TODO: use a tool such as cpdetector for content-based detection.
        return charset;
    }
}
src/main/java/com/zhiwei/source_forward/pipeline/DataPipeline.java
deleted
100644 → 0
View file @
76581f38
package
com
.
zhiwei
.
source_forward
.
pipeline
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
us.codecraft.webmagic.ResultItems
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
/**
 * WebMagic pipeline that collects the result maps published by the page processors.
 *
 * <p>Four result fields are recognised — "content", "mediaSelf", "sourceForward" and "urlLive" — and
 * each non-null value is appended to its own list. {@link #process} is synchronized because the
 * spiders that use this pipeline run with several worker threads, and the backing lists are plain
 * {@link ArrayList}s.
 *
 * @author hero
 */
public class DataPipeline implements Pipeline {

    private List<Map<String, Object>> contentDataList = new ArrayList<Map<String, Object>>();
    private List<Map<String, Object>> mediaSelfDataList = new ArrayList<Map<String, Object>>();
    private List<Map<String, Object>> sourceForwardDataList = new ArrayList<Map<String, Object>>();
    private List<Map<String, Object>> urlLivedataList = new ArrayList<Map<String, Object>>();

    public DataPipeline() {
        super();
    }

    /**
     * Appends every recognised, non-null result field of the given items to its list.
     *
     * @param resultItems results published for one page
     * @param task        owning task (unused)
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        collect(resultItems, "content", contentDataList);
        collect(resultItems, "mediaSelf", mediaSelfDataList);
        collect(resultItems, "sourceForward", sourceForwardDataList);
        collect(resultItems, "urlLive", urlLivedataList);
    }

    /** Adds the value stored under {@code key} to {@code target} when it is present. */
    private static void collect(ResultItems resultItems, String key, List<Map<String, Object>> target) {
        Map<String, Object> item = resultItems.get(key);
        if (item != null) {
            target.add(item);
        }
    }

    public List<Map<String, Object>> getContentDataList() {
        return contentDataList;
    }

    public void setContentDataList(List<Map<String, Object>> contentDataList) {
        this.contentDataList = contentDataList;
    }

    public List<Map<String, Object>> getMediaSelfDataList() {
        return mediaSelfDataList;
    }

    public void setMediaSelfDataList(List<Map<String, Object>> mediaSelfDataList) {
        this.mediaSelfDataList = mediaSelfDataList;
    }

    public List<Map<String, Object>> getSourceForwardDataList() {
        return sourceForwardDataList;
    }

    public void setSourceForwardDataList(List<Map<String, Object>> sourceForwardDataList) {
        this.sourceForwardDataList = sourceForwardDataList;
    }

    public List<Map<String, Object>> getUrlLivedataList() {
        return urlLivedataList;
    }

    public void setUrlLivedataList(List<Map<String, Object>> urlLivedataList) {
        this.urlLivedataList = urlLivedataList;
    }
}
src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
run
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
com.zhiwei.source_forward.crawler.ContentPageProcessor
;
import
com.zhiwei.source_forward.downloader.MyDownLoader
;
import
com.zhiwei.source_forward.pipeline.DataPipeline
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
us.codecraft.webmagic.Spider
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.crawler.ContentCrawler
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
public
class
ContentMatch
{
/**
* @Title: getSourceForward
* @author hero
* @Description: 验证文章是否转发
* @param @param dataMap
* @param @return 设定文件
* @return Map<String,Map<String,Object>> 返回类型
*/
public
static
Map
<
String
,
Map
<
String
,
Object
>>
getContent
(
Map
<
String
,
Map
<
String
,
Object
>>
dataMap
){
//启动验证来源程序
DataPipeline
pipeline
=
new
DataPipeline
();
Spider
spider
=
Spider
.
create
(
new
ContentPageProcessor
());
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
spider
.
addUrl
(
entry
.
getKey
());
private
static
Logger
logger
=
LogManager
.
getLogger
(
ContentMatch
.
class
);
/**
 * Fetches the given URLs and returns one {@link ContentBean} per delivered result.
 * Delegates to {@code ContentMatchCrawlerThread.getContentMatch}.
 *
 * @param urlList URLs to fetch
 * @return whatever the delegate returns for these URLs
 */
public static List<ContentBean> getContentMatch(List<String> urlList) {
    return ContentMatchCrawlerThread.getContentMatch(urlList);
}
spider
.
setDownloader
(
new
MyDownLoader
());
spider
.
addPipeline
(
pipeline
);
spider
.
thread
(
5
).
run
();
List
<
Map
<
String
,
Object
>>
contentList
=
pipeline
.
getContentDataList
();
for
(
Map
<
String
,
Object
>
contentMap
:
contentList
){
String
url
=
contentMap
.
get
(
"url"
)+
""
;
//搜集原文
if
(
dataMap
.
containsKey
(
url
)){
Map
<
String
,
Object
>
data
=
dataMap
.
get
(
url
);
String
content
=
contentMap
.
get
(
"content"
)+
""
;
data
.
put
(
"content"
,
content
);
dataMap
.
put
(
url
,
data
);
/**
 * Manual smoke run: fetches a fixed set of sample article URLs and prints every resulting bean.
 *
 * @param args unused
 */
public static void main(String[] args) {
    List<String> sampleUrls = new ArrayList<>();
    Collections.addAll(sampleUrls,
            "http://sh.qihoo.com/pc/99493b3bf136d8e20?sign=360_e39369d1",
            "http://news.ctocio.com.cn/383/14543883.shtml",
            "http://www.jn001.com/news/2018-07/05/content_561091.htm",
            "http://www.ca800.com/fFa8D/bOTUBC1QfF/40944.aspx",
            "http://sh.qihoo.com/pc/988470164f6c5ca14?sign=360_e39369d1",
            "http://news.jstv.com/a/20180705/1530731642686.shtml?jsbcApp=1",
            "https://tech.sina.cn/i/gn/2018-07-05/detail-ihexfcvi8155439.d.html?pos=18",
            "http://sh.qihoo.com/pc/983b3d157f91af18b?sign=360_e39369d1",
            "http://china.rednet.cn/c/2018/07/05/4671927.htm",
            "http://news.enorth.com.cn/system/2018/07/05/035782857.shtml",
            "https://www.toutiao.com/i6573922350037729796/",
            "http://news.cnhubei.com/xw/sh/201807/t4132048.shtml",
            "https://www.toutiao.com/a6573774143949373956/");

    List<ContentBean> results = ContentMatch.getContentMatch(sampleUrls);
    for (ContentBean bean : results) {
        System.out.println(bean.toString());
    }
}
return
dataMap
;
static
class
ContentMatchCrawlerThread
extends
Thread
{
private
static
List
<
ContentBean
>
getContentMatch
(
List
<
String
>
urlList
){
try
{
ContentCrawler
crawler
=
new
ContentCrawler
();
List
<
ContentBean
>
list
=
Collections
.
synchronizedList
(
new
ArrayList
<
ContentBean
>());
ContentDataCallback
callback
=
new
ContentDataCallback
()
{
@Override
public
void
onData
(
ContentBean
data
,
Attribution
attr
)
{
list
.
add
(
data
);
logger
.
info
(
"列表大小:::{}"
,
list
.
size
());
}
};
crawler
.
submitTask
(
callback
,
urlList
.
toArray
(
new
String
[
urlList
.
size
()])).
await
();
return
list
;
}
catch
(
Exception
e
){
e
.
fillInStackTrace
();
}
return
null
;
}
}
}
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
0 → 100644
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
run
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution
;
import
com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler
;
import
com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack
;
/**
 * Entry point for resolving the self-media source of article URLs with {@link MediaSelfSourceCrawler}.
 */
public class MediaSelfSource {

    private static Logger logger = LogManager.getLogger(MediaSelfSource.class);

    /**
     * Fetches the given URLs and returns one {@link MediaSelfSourceBean} per delivered result.
     *
     * @param urlList URLs to fetch
     * @return the collected beans; an empty list when the crawl could not be run
     */
    public static List<MediaSelfSourceBean> getMediaSelfSource(List<String> urlList) {
        return MediaSelfSourceCrawlerThread.getMediaSelfSource(urlList);
    }

    /**
     * Manual smoke run against a single sample URL; prints every resulting bean.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> urlList = new ArrayList<>();
        urlList.add("https://baijiahao.baidu.com/s?id=1606950814338460255&wfr=spider&for=pc&qq-pf-to=pcqq.c2c");
        List<MediaSelfSourceBean> da = MediaSelfSource.getMediaSelfSource(urlList);
        for (MediaSelfSourceBean mssb : da) {
            System.out.println(mssb.toString());
        }
    }

    /** Helper that runs the crawler over a list of URLs and gathers the beans it delivers. */
    static class MediaSelfSourceCrawlerThread extends Thread {

        /**
         * Submits all URLs, waits for the crawler's counter and returns the collected beans.
         *
         * <p>Failures used to be swallowed ({@code fillInStackTrace()} has no logging effect) and
         * answered with {@code null}, which made {@code main} fail with a
         * {@code NullPointerException}. They are now logged and answered with an empty list.
         */
        private static List<MediaSelfSourceBean> getMediaSelfSource(List<String> urlList) {
            try {
                MediaSelfSourceCrawler crawler = new MediaSelfSourceCrawler();
                // Callbacks arrive from the HTTP client's threads, hence the synchronized list.
                List<MediaSelfSourceBean> list = Collections.synchronizedList(new ArrayList<MediaSelfSourceBean>());
                MediaSelfSourceDataCallBack callback = new MediaSelfSourceDataCallBack() {
                    @Override
                    public void onData(MediaSelfSourceBean data, Attribution attr) {
                        list.add(data);
                        logger.info("列表大小:::{}", list.size());
                    }
                };
                crawler.submitTask(callback, urlList.toArray(new String[urlList.size()])).await();
                return list;
            } catch (Exception e) {
                if (e instanceof InterruptedException) {
                    // Preserve the interrupt for whoever owns this thread.
                    Thread.currentThread().interrupt();
                }
                logger.error("抓取任务执行出错", e);
            }
            return new ArrayList<MediaSelfSourceBean>();
        }
    }
}
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
run
;
import
java.util.HashMap
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
com.zhiwei.source_forward.crawler.MediaSelfSourcePageProcessor
;
import
com.zhiwei.source_forward.crawler.SourceForwardPageProcessor
;
import
com.zhiwei.source_forward.downloader.MyDownLoader
;
import
com.zhiwei.source_forward.pipeline.DataPipeline
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
us.codecraft.webmagic.Spider
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.crawler.SourceForwardCrawler
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
/**
* @ClassName: SourceForward
...
...
@@ -20,149 +20,57 @@ import us.codecraft.webmagic.Spider;
*/
public
class
SourceForward
{
/**
* @Title: getSourceForward
* @author hero
* @Description: 验证文章是否转发
* @param @param dataMap
* @param @return 设定文件
* @return Map<String,Map<String,Object>> 返回类型
*/
public
static
Map
<
String
,
Map
<
String
,
Object
>>
getSourceForward
(
Map
<
String
,
Map
<
String
,
Object
>>
dataMap
){
//启动验证来源程序
DataPipeline
pipeline
=
new
DataPipeline
();
Spider
spider
=
Spider
.
create
(
new
SourceForwardPageProcessor
());
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
spider
.
addUrl
(
entry
.
getKey
());
}
spider
.
setDownloader
(
new
MyDownLoader
());
spider
.
addPipeline
(
pipeline
);
spider
.
thread
(
5
).
run
();
private
static
Logger
logger
=
LogManager
.
getLogger
(
SourceForward
.
class
);
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
getSourceForwardDataList
();
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
String
url
=
sourceMap
.
get
(
"url"
)+
""
;
String
root_source
=
sourceMap
.
get
(
"root_source"
)!=
null
?
sourceMap
.
get
(
"root_source"
).
toString
():
null
;
String
isForwardWX
=
sourceMap
.
get
(
"isforward"
)!=
null
?
sourceMap
.
get
(
"isforward"
).
toString
():
null
;
String
channel
=
sourceMap
.
get
(
"channel"
)+
""
;
//整合数据及验证转发原创
if
(
dataMap
.
containsKey
(
url
)){
Map
<
String
,
Object
>
data
=
dataMap
.
get
(
url
);
String
source
=
data
.
get
(
"来源"
)+
""
;
String
isForward
=
"转发"
;
if
(
root_source
==
null
){
isForward
=
"原创"
;
}
else
if
(
root_source
.
toUpperCase
().
trim
().
equals
(
source
.
toUpperCase
().
trim
())){
isForward
=
"原创"
;
/**
 * Looks up the source information for each URL by delegating to
 * {@code SourceForwardCrawlerThread.getSourceForward}.
 *
 * @param urlList URLs to crawl
 * @return whatever the crawler helper returns for these URLs
 */
public static List<SourceForwardBean> getSourceForward(List<String> urlList) {
    // Start fetching the source of each link.
    return SourceForwardCrawlerThread.getSourceForward(urlList);
}
if
(
url
.
contains
(
"mp.weixin.qq.com"
)){
isForward
=
isForwardWX
;
}
else
{
data
.
put
(
"原来源"
,
root_source
);
data
.
put
(
"频道"
,
channel
);
public
static
void
main
(
String
[]
args
)
{
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://sh.qihoo.com/pc/99493b3bf136d8e20?sign=360_e39369d1"
);
urlList
.
add
(
"http://news.ctocio.com.cn/383/14543883.shtml"
);
urlList
.
add
(
"http://www.jn001.com/news/2018-07/05/content_561091.htm"
);
urlList
.
add
(
"http://www.ca800.com/fFa8D/bOTUBC1QfF/40944.aspx"
);
urlList
.
add
(
"http://sh.qihoo.com/pc/988470164f6c5ca14?sign=360_e39369d1"
);
urlList
.
add
(
"http://news.jstv.com/a/20180705/1530731642686.shtml?jsbcApp=1"
);
urlList
.
add
(
"https://tech.sina.cn/i/gn/2018-07-05/detail-ihexfcvi8155439.d.html?pos=18"
);
urlList
.
add
(
"http://sh.qihoo.com/pc/983b3d157f91af18b?sign=360_e39369d1"
);
urlList
.
add
(
"http://china.rednet.cn/c/2018/07/05/4671927.htm"
);
urlList
.
add
(
"http://news.enorth.com.cn/system/2018/07/05/035782857.shtml"
);
urlList
.
add
(
"https://www.toutiao.com/i6573922350037729796/"
);
urlList
.
add
(
"http://news.cnhubei.com/xw/sh/201807/t4132048.shtml"
);
urlList
.
add
(
"https://www.toutiao.com/a6573774143949373956/"
);
List
<
SourceForwardBean
>
da
=
SourceForward
.
getSourceForward
(
urlList
);
for
(
SourceForwardBean
sfb
:
da
)
{
System
.
out
.
println
(
sfb
.
toString
());
}
data
.
put
(
"是否转发"
,
isForward
);
dataMap
.
put
(
url
,
data
);
}
}
return
dataMap
;
}
static
class
SourceForwardCrawlerThread
extends
Thread
{
private
static
List
<
SourceForwardBean
>
getSourceForward
(
List
<
String
>
urlList
){
try
{
SourceForwardCrawler
crawler
=
new
SourceForwardCrawler
();
List
<
SourceForwardBean
>
list
=
Collections
.
synchronizedList
(
new
ArrayList
<
SourceForwardBean
>());
SourceForwardDataCallBack
callback
=
new
SourceForwardDataCallBack
()
{
/**
* @Title: getMediaSelfSource
* @author hero
* @Description: 根据链接匹配自媒体号名称
* @param @param dataMap
* @param @return 设定文件
* @return Map<String,Map<String,Object>> 返回类型
*/
public
static
Map
<
String
,
Map
<
String
,
Object
>>
getMediaSelfSource
(
Map
<
String
,
Map
<
String
,
Object
>>
dataMap
){
//启动验证来源程序
DataPipeline
pipeline
=
new
DataPipeline
();
Spider
spider
=
Spider
.
create
(
new
MediaSelfSourcePageProcessor
());
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
spider
.
addUrl
(
entry
.
getKey
());
@Override
public
void
onData
(
SourceForwardBean
data
,
Attribution
attr
)
{
list
.
add
(
data
);
logger
.
info
(
"列表大小:::{}"
,
list
.
size
());
}
spider
.
setDownloader
(
new
MyDownLoader
());
spider
.
addPipeline
(
pipeline
);
spider
.
thread
(
5
).
run
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
getMediaSelfDataList
();
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
String
url
=
sourceMap
.
get
(
"url"
)+
""
;
//整合数据及验证转发原创
if
(
dataMap
.
containsKey
(
url
)){
Map
<
String
,
Object
>
data
=
dataMap
.
get
(
url
);
data
.
put
(
"自媒体号"
,
sourceMap
.
get
(
"mediaself"
));
data
.
put
(
"频道"
,
sourceMap
.
get
(
"channel"
));
dataMap
.
put
(
url
,
data
);
}
}
return
dataMap
;
}
/**
* @Title: getMediaSelfSource
* @author hero
* @Description: 根据链接匹配自媒体账号
* @param @param urlList
* @param @return 设定文件
* @return Map<String,String> 返回类型
*/
public
static
Map
<
String
,
String
>
getMediaSelfSource
(
List
<
String
>
urlList
){
//启动验证来源程序
Map
<
String
,
String
>
dataMap
=
new
HashMap
<
String
,
String
>();
DataPipeline
pipeline
=
new
DataPipeline
();
Spider
spider
=
Spider
.
create
(
new
MediaSelfSourcePageProcessor
());
for
(
String
url
:
urlList
){
spider
.
addUrl
(
url
);
dataMap
.
put
(
url
,
null
);
}
spider
.
setDownloader
(
new
MyDownLoader
());
spider
.
addPipeline
(
pipeline
);
spider
.
thread
(
5
).
run
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
getMediaSelfDataList
();
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
String
url
=
sourceMap
.
get
(
"url"
)+
""
;
//整合数据及验证转发原创
if
(
dataMap
.
containsKey
(
url
)){
dataMap
.
put
(
url
,
sourceMap
.
get
(
"mediaself"
).
toString
());
}
}
return
dataMap
;
}
/**
*
* @Title: getMediaSelfSource
* @author hero
* @Description: 根据链接匹配自媒体账号
* @param @param url
* @param @return 设定文件
* @return String 返回类型
*/
public
static
String
getMediaSelfSource
(
String
url
){
//启动验证来源程序
DataPipeline
pipeline
=
new
DataPipeline
();
Spider
spider
=
Spider
.
create
(
new
MediaSelfSourcePageProcessor
());
spider
.
addUrl
(
url
);
spider
.
setDownloader
(
new
MyDownLoader
());
spider
.
addPipeline
(
pipeline
);
spider
.
thread
(
1
).
run
();
List
<
Map
<
String
,
Object
>>
sourceForwardList
=
pipeline
.
getMediaSelfDataList
();
for
(
Map
<
String
,
Object
>
sourceMap
:
sourceForwardList
){
return
sourceMap
.
get
(
"mediaself"
).
toString
();
};
crawler
.
submitTask
(
callback
,
urlList
.
toArray
(
new
String
[
urlList
.
size
()])).
await
();
return
list
;
}
catch
(
Exception
e
){
e
.
fillInStackTrace
();
}
return
null
;
}
}
}
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
run
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
com.zhiwei.source_forward.
crawler.UrlLivePageProcessor
;
import
com.zhiwei.source_forward.
pipeline.DataPipeline
;
import
us.codecraft.webmagic.Spider
;
import
com.zhiwei.source_forward.
bean.UrlLiveBean
;
import
com.zhiwei.source_forward.
bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.crawler.UrlLiveCrawler
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
/**
* @ClassName: URLLive
...
...
@@ -17,7 +19,6 @@ import us.codecraft.webmagic.Spider;
*/
public
class
URLLive
{
/**
* @Title: verificationURLLive
* @author hero
...
...
@@ -27,35 +28,60 @@ public class URLLive {
* @return Map<String,Map<String,Object>> 返回类型
*/
public
static
Map
<
String
,
Map
<
String
,
Object
>>
verificationURLLive
(
Map
<
String
,
Map
<
String
,
Object
>>
dataMap
){
List
<
String
>
urlList
=
new
ArrayList
<>();
//启动验证链接是否有效程序程序
DataPipeline
pipeline
=
new
DataPipeline
();
Spider
spider
=
Spider
.
create
(
new
UrlLivePageProcessor
());
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
spider
.
addUrl
(
entry
.
getKey
());
urlList
.
add
(
entry
.
getKey
());
}
spider
.
addPipeline
(
pipeline
);
spider
.
thread
(
5
).
run
();
//验证数据是否已删除
List
<
Map
<
String
,
Object
>>
dataList
=
pipeline
.
getUrlLivedataList
(
);
for
(
Map
<
String
,
Object
>
data
:
dataList
){
String
url
=
data
.
get
(
"url"
)+
""
;
List
<
UrlLiveBean
>
dataList
=
UrlLiveCrawlerThread
.
getUrlLiveCrawle
(
urlList
);
for
(
UrlLiveBean
ub
:
dataList
){
String
url
=
ub
.
getUrl
()
;
if
(!
url
.
contains
(
"http"
)){
url
=
"http://"
+
url
;
}
if
(!
url
.
contains
(
"www"
)){
url
=
url
.
replace
(
"://"
,
"://www."
);
}
boolean
live
=
(
boolean
)
data
.
get
(
"live"
);
boolean
live
=
ub
.
isLive
(
);
if
(
dataMap
.
containsKey
(
url
)){
Map
<
String
,
Object
>
map
=
dataMap
.
get
(
url
);
map
.
put
(
"是否删除"
,
live
);
dataMap
.
put
(
url
,
map
);
}
}
return
dataMap
;
}
/**
 * Checks a batch of URLs by delegating to
 * {@code UrlLiveCrawlerThread.getUrlLiveCrawle}.
 *
 * @param urlList URLs to check
 * @return whatever the crawler helper returns for these URLs
 */
public static List<UrlLiveBean> verificationURLLive(List<String> urlList) {
    // Start the link-validity check.
    return UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
}
/**
 * Holder for the static URL-liveness crawl helper. It extends {@link Thread}
 * but nothing visible here starts it as a thread.
 */
static class UrlLiveCrawlerThread extends Thread {

    /**
     * Submits all URLs to a new {@link UrlLiveCrawler}, blocks on {@code await()}
     * and returns the beans gathered by the callback.
     *
     * @param urlList URLs to submit
     * @return the gathered beans, or {@code null} if anything threw
     */
    private static List<UrlLiveBean> getUrlLiveCrawle(List<String> urlList) {
        try {
            UrlLiveCrawler crawler = new UrlLiveCrawler();
            // Synchronized wrapper: presumably the callback fires from crawler worker threads.
            List<UrlLiveBean> list = Collections.synchronizedList(new ArrayList<UrlLiveBean>());
            UrlLiveDataCallback callback = new UrlLiveDataCallback() {
                @Override
                public void onData(UrlLiveBean data, Attribution attr) {
                    list.add(data);
                    System.out.println("列表大小:::" + list.size());
                }
            };
            crawler.submitTask(callback, urlList.toArray(new String[urlList.size()])).await();
            return list;
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // Keep the interrupt visible to the caller instead of clearing it.
                Thread.currentThread().interrupt();
            }
            // The original called e.fillInStackTrace(), which reports nothing;
            // print the failure so it is not lost silently.
            e.printStackTrace();
        }
        return null;
    }
}
}
src/main/java/com/zhiwei/source_forward/spider/MySpider.java
deleted
100644 → 0
View file @
76581f38
package
com
.
zhiwei
.
source_forward
.
spider
;
import
java.io.Closeable
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.Collection
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.UUID
;
import
java.util.concurrent.ExecutorService
;
import
java.util.concurrent.TimeUnit
;
import
java.util.concurrent.atomic.AtomicInteger
;
import
java.util.concurrent.atomic.AtomicLong
;
import
java.util.concurrent.locks.Condition
;
import
java.util.concurrent.locks.ReentrantLock
;
import
org.apache.commons.collections.CollectionUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
us.codecraft.webmagic.Page
;
import
us.codecraft.webmagic.Request
;
import
us.codecraft.webmagic.Site
;
import
us.codecraft.webmagic.SpiderListener
;
import
us.codecraft.webmagic.Task
;
import
us.codecraft.webmagic.downloader.Downloader
;
import
us.codecraft.webmagic.downloader.HttpClientDownloader
;
import
us.codecraft.webmagic.pipeline.CollectorPipeline
;
import
us.codecraft.webmagic.pipeline.ConsolePipeline
;
import
us.codecraft.webmagic.pipeline.Pipeline
;
import
us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline
;
import
us.codecraft.webmagic.processor.PageProcessor
;
import
us.codecraft.webmagic.scheduler.QueueScheduler
;
import
us.codecraft.webmagic.scheduler.Scheduler
;
import
us.codecraft.webmagic.thread.CountableThreadPool
;
import
us.codecraft.webmagic.utils.UrlUtils
;
import
us.codecraft.webmagic.utils.WMCollections
;
/**
 * A crawl driver: polls {@link Request}s from a {@link Scheduler}, downloads
 * each with a {@link Downloader}, feeds the page to a {@link PageProcessor} and
 * passes the result items to every configured {@link Pipeline}.
 *
 * <p>Configuration methods throw {@link IllegalStateException} once the spider
 * is running.
 */
public class MySpider implements Runnable, Task {

    protected final static int STAT_INIT = 0;

    protected final static int STAT_RUNNING = 1;

    protected final static int STAT_STOPPED = 2;

    protected Downloader downloader;

    protected List<Pipeline> pipelines = new ArrayList<Pipeline>();

    protected PageProcessor pageProcessor;

    protected List<Request> startRequests;

    protected Site site;

    protected String uuid;

    protected Scheduler scheduler = new QueueScheduler();

    protected Logger logger = LoggerFactory.getLogger(getClass());

    protected CountableThreadPool threadPool;

    protected ExecutorService executorService;

    protected int threadNum = 1;

    protected AtomicInteger stat = new AtomicInteger(STAT_INIT);

    protected boolean exitWhenComplete = true;

    protected boolean spawnUrl = true;

    protected boolean destroyWhenExit = true;

    private ReentrantLock newUrlLock = new ReentrantLock();

    private Condition newUrlCondition = newUrlLock.newCondition();

    private List<SpiderListener> spiderListeners;

    private final AtomicLong pageCount = new AtomicLong(0);

    private Date startTime;

    private int emptySleepTime = 30000;

    /**
     * create a spider with pageProcessor.
     *
     * @param pageProcessor pageProcessor
     * @return new spider
     * @see PageProcessor
     */
    public static MySpider create(PageProcessor pageProcessor) {
        return new MySpider(pageProcessor);
    }

    /**
     * create a spider with pageProcessor.
     *
     * @param pageProcessor pageProcessor
     */
    public MySpider(PageProcessor pageProcessor) {
        this.pageProcessor = pageProcessor;
        this.site = pageProcessor.getSite();
        this.startRequests = pageProcessor.getSite().getStartRequests();
    }

    /**
     * Set startUrls of Spider.<br>
     * Prior to startUrls of Site.
     *
     * @param startUrls startUrls
     * @return this
     */
    public MySpider startUrls(List<String> startUrls) {
        checkIfRunning();
        this.startRequests = UrlUtils.convertToRequests(startUrls);
        return this;
    }

    /**
     * Set startUrls of Spider.<br>
     * Prior to startUrls of Site.
     *
     * @param startRequests startRequests
     * @return this
     */
    public MySpider startRequest(List<Request> startRequests) {
        checkIfRunning();
        this.startRequests = startRequests;
        return this;
    }

    /**
     * Set an uuid for spider.<br>
     * Default uuid is domain of site.<br>
     *
     * @param uuid uuid
     * @return this
     */
    public MySpider setUUID(String uuid) {
        this.uuid = uuid;
        return this;
    }

    /**
     * set scheduler for Spider
     *
     * @param scheduler scheduler
     * @return this
     * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
     */
    @Deprecated
    public MySpider scheduler(Scheduler scheduler) {
        return setScheduler(scheduler);
    }

    /**
     * set scheduler for Spider. Requests still queued in the previous
     * scheduler are moved into the new one.
     *
     * @param scheduler scheduler
     * @return this
     * @see Scheduler
     * @since 0.2.1
     */
    public MySpider setScheduler(Scheduler scheduler) {
        checkIfRunning();
        Scheduler oldScheduler = this.scheduler;
        this.scheduler = scheduler;
        if (oldScheduler != null) {
            Request request;
            while ((request = oldScheduler.poll(this)) != null) {
                this.scheduler.push(request, this);
            }
        }
        return this;
    }

    /**
     * add a pipeline for Spider
     *
     * @param pipeline pipeline
     * @return this
     * @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
     * @deprecated use {@link #addPipeline(Pipeline)}
     */
    @Deprecated
    public MySpider pipeline(Pipeline pipeline) {
        return addPipeline(pipeline);
    }

    /**
     * add a pipeline for Spider
     *
     * @param pipeline pipeline
     * @return this
     * @see Pipeline
     * @since 0.2.1
     */
    public MySpider addPipeline(Pipeline pipeline) {
        checkIfRunning();
        this.pipelines.add(pipeline);
        return this;
    }

    /**
     * set pipelines for Spider
     *
     * @param pipelines pipelines
     * @return this
     * @see Pipeline
     * @since 0.4.1
     */
    public MySpider setPipelines(List<Pipeline> pipelines) {
        checkIfRunning();
        this.pipelines = pipelines;
        return this;
    }

    /**
     * clear the pipelines set
     *
     * @return this
     */
    public MySpider clearPipeline() {
        pipelines = new ArrayList<Pipeline>();
        return this;
    }

    /**
     * set the downloader of spider
     *
     * @param downloader downloader
     * @return this
     * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
     * @deprecated use {@link #setDownloader(Downloader)}
     */
    @Deprecated
    public MySpider downloader(Downloader downloader) {
        return setDownloader(downloader);
    }

    /**
     * set the downloader of spider
     *
     * @param downloader downloader
     * @return this
     * @see Downloader
     */
    public MySpider setDownloader(Downloader downloader) {
        checkIfRunning();
        this.downloader = downloader;
        return this;
    }

    /**
     * Fills in defaults (HTTP client downloader, console pipeline, thread pool),
     * pushes the start requests into the scheduler and records the start time.
     */
    protected void initComponent() {
        if (downloader == null) {
            this.downloader = new HttpClientDownloader();
        }
        if (pipelines.isEmpty()) {
            pipelines.add(new ConsolePipeline());
        }
        downloader.setThread(threadNum);
        if (threadPool == null || threadPool.isShutdown()) {
            if (executorService != null && !executorService.isShutdown()) {
                threadPool = new CountableThreadPool(threadNum, executorService);
            } else {
                threadPool = new CountableThreadPool(threadNum);
            }
        }
        if (startRequests != null) {
            for (Request request : startRequests) {
                addRequest(request);
            }
            startRequests.clear();
        }
        startTime = new Date();
    }

    /**
     * Main loop: polls requests and hands each to the thread pool until the
     * calling thread is interrupted, the spider is stopped, or (with
     * {@code exitWhenComplete}) the queue is empty and no worker is alive.
     *
     * @throws IllegalStateException if the spider is already running
     */
    @Override
    public void run() {
        checkRunningStat();
        initComponent();
        logger.info("Spider " + getUUID() + " started!");
        while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
            Request request = scheduler.poll(this);
            if (request == null) {
                if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
                    break;
                }
                // wait until new url added
                waitNewUrl();
            } else {
                final Request requestFinal = request;
                threadPool.execute(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            processRequest(requestFinal);
                            onSuccess(requestFinal);
                        } catch (Exception e) {
                            onError(requestFinal);
                            logger.error("process request " + requestFinal + " error", e);
                        } finally {
                            pageCount.incrementAndGet();
                            signalNewUrl();
                        }
                    }
                });
            }
        }
        stat.set(STAT_STOPPED);
        // release some resources
        if (destroyWhenExit) {
            close();
        }
    }

    /**
     * Notifies every registered listener that a request failed.
     *
     * @param request the failed request
     */
    protected void onError(Request request) {
        if (CollectionUtils.isNotEmpty(spiderListeners)) {
            for (SpiderListener spiderListener : spiderListeners) {
                spiderListener.onError(request);
            }
        }
    }

    /**
     * Notifies every registered listener that a request succeeded.
     *
     * @param request the processed request
     */
    protected void onSuccess(Request request) {
        if (CollectionUtils.isNotEmpty(spiderListeners)) {
            for (SpiderListener spiderListener : spiderListeners) {
                spiderListener.onSuccess(request);
            }
        }
    }

    /**
     * Atomically moves the state to running.
     *
     * @throws IllegalStateException if the spider is already running
     */
    private void checkRunningStat() {
        while (true) {
            int statNow = stat.get();
            if (statNow == STAT_RUNNING) {
                throw new IllegalStateException("Spider is already running!");
            }
            if (stat.compareAndSet(statNow, STAT_RUNNING)) {
                break;
            }
        }
    }

    /**
     * Closes every component that implements {@link Closeable} and shuts down
     * the thread pool.
     */
    public void close() {
        destroyEach(downloader);
        destroyEach(pageProcessor);
        destroyEach(scheduler);
        for (Pipeline pipeline : pipelines) {
            destroyEach(pipeline);
        }
        // The pool only exists after initComponent(); guard so close() is safe before run().
        if (threadPool != null) {
            threadPool.shutdown();
        }
    }

    /**
     * Closes the object if it is {@link Closeable}; a failure is logged, not rethrown.
     *
     * @param object component to close
     */
    private void destroyEach(Object object) {
        if (object instanceof Closeable) {
            try {
                ((Closeable) object).close();
            } catch (IOException e) {
                logger.warn("failed to close " + object, e);
            }
        }
    }

    /**
     * Process specific urls without url discovering.
     *
     * @param urls urls to process
     */
    public void test(String... urls) {
        initComponent();
        for (String url : urls) {
            processRequest(new Request(url));
        }
    }

    /**
     * Downloads one request, runs the page processor, queues discovered
     * requests and feeds non-skipped results to the pipelines.
     *
     * @param request request to process
     */
    protected void processRequest(Request request) {
        Page page = downloader.download(request, this);
        if (page == null) {
            sleep(site.getSleepTime());
            // NOTE(review): the worker in run() still calls onSuccess after this
            // return, so listeners get both callbacks for a failed download.
            onError(request);
            return;
        }
        // for cycle retry
        if (page.isNeedCycleRetry()) {
            extractAndAddRequests(page, true);
            sleep(site.getRetrySleepTime());
            return;
        }
        pageProcessor.process(page);
        extractAndAddRequests(page, spawnUrl);
        if (!page.getResultItems().isSkip()) {
            for (Pipeline pipeline : pipelines) {
                pipeline.process(page.getResultItems(), this);
            }
        }
        //for proxy status management
        request.putExtra(Request.STATUS_CODE, page.getStatusCode());
        sleep(site.getSleepTime());
    }

    /**
     * Sleeps for the given time; on interruption the interrupt flag is restored.
     *
     * @param time sleep time in milliseconds
     */
    protected void sleep(int time) {
        try {
            Thread.sleep(time);
        } catch (InterruptedException e) {
            // Restore the flag so callers can observe the interrupt.
            Thread.currentThread().interrupt();
            logger.warn("sleep interrupted", e);
        }
    }

    /**
     * Queues the page's target requests when {@code spawnUrl} is true.
     *
     * @param page     processed page
     * @param spawnUrl whether discovered requests should be queued
     */
    protected void extractAndAddRequests(Page page, boolean spawnUrl) {
        if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) {
            for (Request request : page.getTargetRequests()) {
                addRequest(request);
            }
        }
    }

    /**
     * Pushes a request to the scheduler, deriving the site domain from the
     * first request URL when the site has none.
     *
     * @param request request to queue
     */
    private void addRequest(Request request) {
        if (site.getDomain() == null && request != null && request.getUrl() != null) {
            site.setDomain(UrlUtils.getDomain(request.getUrl()));
        }
        scheduler.push(request, this);
    }

    /**
     * @throws IllegalStateException if the spider is running
     */
    protected void checkIfRunning() {
        if (stat.get() == STAT_RUNNING) {
            throw new IllegalStateException("Spider is already running!");
        }
    }

    /**
     * Runs the spider on a new non-daemon thread.
     */
    public void runAsync() {
        Thread thread = new Thread(this);
        thread.setDaemon(false);
        thread.start();
    }

    /**
     * Add urls to crawl. <br>
     *
     * @param urls urls
     * @return this
     */
    public MySpider addUrl(String... urls) {
        for (String url : urls) {
            addRequest(new Request(url));
        }
        signalNewUrl();
        return this;
    }

    /**
     * Download urls synchronizing.
     *
     * @param urls urls
     * @param <T>  type of the collected items
     * @return list downloaded
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    public <T> List<T> getAll(Collection<String> urls) {
        destroyWhenExit = false;
        spawnUrl = false;
        // startRequest(List) may have stored null; initComponent() guards the same way.
        if (startRequests != null) {
            startRequests.clear();
        }
        CollectorPipeline collectorPipeline = getCollectorPipeline();
        try {
            for (Request request : UrlUtils.convertToRequests(urls)) {
                addRequest(request);
            }
            pipelines.add(collectorPipeline);
            run();
        } finally {
            // Restore the flags even when run() throws (e.g. already running).
            spawnUrl = true;
            destroyWhenExit = true;
        }
        return collectorPipeline.getCollected();
    }

    /**
     * @return the collector pipeline used by {@link #getAll(Collection)}
     */
    @SuppressWarnings("rawtypes")
    protected CollectorPipeline getCollectorPipeline() {
        return new ResultItemsCollectorPipeline();
    }

    /**
     * Downloads a single url synchronously.
     *
     * @param url url to download
     * @param <T> type of the collected item
     * @return the first collected item, or {@code null} when nothing was collected
     */
    public <T> T get(String url) {
        List<String> urls = WMCollections.newArrayList(url);
        List<T> resultItemses = getAll(urls);
        if (resultItemses != null && resultItemses.size() > 0) {
            return resultItemses.get(0);
        } else {
            return null;
        }
    }

    /**
     * Add urls with information to crawl.<br>
     *
     * @param requests requests
     * @return this
     */
    public MySpider addRequest(Request... requests) {
        for (Request request : requests) {
            addRequest(request);
        }
        signalNewUrl();
        return this;
    }

    /**
     * Blocks for up to {@code emptySleepTime} ms, or until signalled that a
     * request was added or a worker finished.
     */
    private void waitNewUrl() {
        newUrlLock.lock();
        try {
            //double check
            if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
                return;
            }
            newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
            // Restore the flag so the isInterrupted() check in run() ends the loop.
            Thread.currentThread().interrupt();
            logger.warn("waitNewUrl - interrupted, error {}", e);
        } finally {
            newUrlLock.unlock();
        }
    }

    /**
     * Wakes every thread waiting in {@link #waitNewUrl()}.
     */
    private void signalNewUrl() {
        // Acquire outside the try so unlock() only runs when the lock is held.
        newUrlLock.lock();
        try {
            newUrlCondition.signalAll();
        } finally {
            newUrlLock.unlock();
        }
    }

    /**
     * Starts the spider asynchronously; same as {@link #runAsync()}.
     */
    public void start() {
        runAsync();
    }

    /**
     * Requests a stop by moving the state from running to stopped and logs
     * whether the transition happened.
     */
    public void stop() {
        if (stat.compareAndSet(STAT_RUNNING, STAT_STOPPED)) {
            logger.info("Spider " + getUUID() + " stop success!");
        } else {
            logger.info("Spider " + getUUID() + " stop fail!");
        }
    }

    /**
     * start with more than one threads
     *
     * @param threadNum threadNum
     * @return this
     * @throws IllegalArgumentException if {@code threadNum} is not positive
     */
    public MySpider thread(int threadNum) {
        checkIfRunning();
        // Validate before assigning so a rejected value never reaches the field.
        if (threadNum <= 0) {
            throw new IllegalArgumentException("threadNum should be more than one!");
        }
        this.threadNum = threadNum;
        return this;
    }

    /**
     * start with more than one threads
     *
     * @param executorService executorService to run the spider
     * @param threadNum threadNum
     * @return this
     * @throws IllegalArgumentException if {@code threadNum} is not positive
     */
    public MySpider thread(ExecutorService executorService, int threadNum) {
        checkIfRunning();
        if (threadNum <= 0) {
            throw new IllegalArgumentException("threadNum should be more than one!");
        }
        this.threadNum = threadNum;
        // The original dropped this argument, so the supplied executor was never used.
        this.executorService = executorService;
        return this;
    }

    public boolean isExitWhenComplete() {
        return exitWhenComplete;
    }

    /**
     * Exit when complete. <br>
     * True: exit when all url of the site is downloaded. <br>
     * False: not exit until call stop() manually.<br>
     *
     * @param exitWhenComplete exitWhenComplete
     * @return this
     */
    public MySpider setExitWhenComplete(boolean exitWhenComplete) {
        this.exitWhenComplete = exitWhenComplete;
        return this;
    }

    public boolean isSpawnUrl() {
        return spawnUrl;
    }

    /**
     * Get page count downloaded by spider.
     *
     * @return total downloaded page count
     * @since 0.4.1
     */
    public long getPageCount() {
        return pageCount.get();
    }

    /**
     * Get running status by spider.
     *
     * @return running status
     * @see Status
     * @since 0.4.1
     */
    public Status getStatus() {
        return Status.fromValue(stat.get());
    }

    /**
     * Typed view of the {@code STAT_*} integer states.
     */
    public enum Status {
        Init(0), Running(1), Stopped(2);

        private Status(int value) {
            this.value = value;
        }

        private int value;

        int getValue() {
            return value;
        }

        /**
         * @param value integer state
         * @return the matching status, or {@link #Init} when none matches
         */
        public static Status fromValue(int value) {
            for (Status status : Status.values()) {
                if (status.getValue() == value) {
                    return status;
                }
            }
            //default value
            return Init;
        }
    }

    /**
     * Get thread count which is running
     *
     * @return thread count which is running
     * @since 0.4.1
     */
    public int getThreadAlive() {
        if (threadPool == null) {
            return 0;
        }
        return threadPool.getThreadAlive();
    }

    /**
     * Whether add urls extracted to download.<br>
     * Add urls to download when it is true, and just download seed urls when it is false. <br>
     * DO NOT set it unless you know what it means!
     *
     * @param spawnUrl spawnUrl
     * @return this
     * @since 0.4.0
     */
    public MySpider setSpawnUrl(boolean spawnUrl) {
        this.spawnUrl = spawnUrl;
        return this;
    }

    /**
     * @return the explicit uuid if set, else the site domain, else a random
     *         UUID that is then remembered
     */
    @Override
    public String getUUID() {
        if (uuid != null) {
            return uuid;
        }
        if (site != null) {
            return site.getDomain();
        }
        uuid = UUID.randomUUID().toString();
        return uuid;
    }

    public MySpider setExecutorService(ExecutorService executorService) {
        checkIfRunning();
        this.executorService = executorService;
        return this;
    }

    @Override
    public Site getSite() {
        return site;
    }

    public List<SpiderListener> getSpiderListeners() {
        return spiderListeners;
    }

    public MySpider setSpiderListeners(List<SpiderListener> spiderListeners) {
        this.spiderListeners = spiderListeners;
        return this;
    }

    public Date getStartTime() {
        return startTime;
    }

    public Scheduler getScheduler() {
        return scheduler;
    }

    /**
     * Set wait time when no url is polled.<br><br>
     *
     * @param emptySleepTime In MILLISECONDS.
     */
    public void setEmptySleepTime(int emptySleepTime) {
        this.emptySleepTime = emptySleepTime;
    }
}
src/main/java/com/zhiwei/source_forward/util/ContentDataCallback.java
0 → 100644
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
util
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
/**
 * Receiver for {@link ContentBean} results.
 */
public interface ContentDataCallback {

    /**
     * Receives one result.
     *
     * @param data the content bean being delivered
     * @param attr the attribution delivered together with {@code data}
     */
    void onData(ContentBean data, Attribution attr);
}
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
View file @
19bb2414
...
...
@@ -5,8 +5,8 @@ import org.jsoup.nodes.Document;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
c
n.edu.hfut.dmic.contentextractor
.ContentExtractor
;
import
c
n.edu.hfut.dmic.contentextractor
.News
;
import
c
om.zhiwei.source_forward.content
.ContentExtractor
;
import
c
om.zhiwei.source_forward.content
.News
;
/**
* @ClassName: MatchChannel
...
...
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
19bb2414
...
...
@@ -5,8 +5,8 @@ import java.util.List;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
c
n.edu.hfut.dmic.contentextractor
.ContentExtractor
;
import
c
n.edu.hfut.dmic.contentextractor
.News
;
import
c
om.zhiwei.source_forward.content
.ContentExtractor
;
import
c
om.zhiwei.source_forward.content
.News
;
/**
* @ClassName: MatchSource
...
...
src/main/java/com/zhiwei/source_forward/util/MediaSelfSourceDataCallBack.java
0 → 100644
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
util
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution
;
/**
 * Receiver for {@link MediaSelfSourceBean} results.
 */
public interface MediaSelfSourceDataCallBack {

    /**
     * Receives one result.
     *
     * @param data the bean being delivered
     * @param attr the attribution delivered together with {@code data}
     */
    void onData(MediaSelfSourceBean data, Attribution attr);
}
src/main/java/com/zhiwei/source_forward/util/ProxyClientUtil.java
0 → 100644
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
util
;
import
java.net.Proxy
;
import
com.zhiwei.proxy.common.Definition.GroupType
;
import
com.zhiwei.proxy.core.ProxyClient
;
import
com.zhiwei.proxy.core.ProxyClientFactory
;
/**
 * Static access point to a single, lazily built {@link ProxyClient}.
 */
public class ProxyClientUtil {

    /** Shared client; stays null until the first {@link #getClient()} call. */
    private static volatile ProxyClient client;

    /**
     * Gets the proxy of the NAT machine from the shared client.
     *
     * @return the proxy returned by {@code ProxyClient.getNATProxy()}
     */
    public static Proxy getNATProxy() {
        ProxyClient shared = getClient();
        return shared.getNATProxy();
    }

    /**
     * Returns the shared client, building it on first use.
     *
     * @return the shared {@link ProxyClient}
     */
    public static ProxyClient getClient() {
        ProxyClient existing = client;
        if (existing != null) {
            return existing;
        }
        // Re-check under the class lock so only one caller builds the client.
        synchronized (ProxyClientUtil.class) {
            if (client == null) {
                client = ProxyClientFactory.build("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
            }
            return client;
        }
    }
}
src/main/java/com/zhiwei/source_forward/util/SourceData.java
View file @
19bb2414
...
...
@@ -9,7 +9,8 @@ import java.util.List;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
com.zhiwei.zhiweiTools.order.TreatOrder
;
import
com.zhiwei.tools.order.TreatOrder
;
/**
* @ClassName: SourceData
...
...
@@ -82,7 +83,7 @@ public class SourceData {
public
static
List
<
String
>
getSourceList
(){
List
<
String
>
result
=
null
;
if
(
sourceMap
!=
null
&&
sourceMap
.
size
()>
0
){
result
=
new
ArrayList
<
String
>();
result
=
new
ArrayList
<>();
List
<
Entry
<
String
,
Integer
>>
dataList
=
TreatOrder
.
treatOrderByCountDesc
(
sourceMap
);
for
(
Entry
<
String
,
Integer
>
entry
:
dataList
){
result
.
add
(
entry
.
getKey
());
...
...
src/main/java/com/zhiwei/source_forward/util/SourceForwardDataCallBack.java
0 → 100644
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
util
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
/**
 * Receiver for {@link SourceForwardBean} results.
 */
public interface SourceForwardDataCallBack {

    /**
     * Called when a result is handed over.
     *
     * @param data the bean being delivered
     * @param attr the attribution delivered together with {@code data}
     */
    void onData(SourceForwardBean data, Attribution attr);
}
src/main/java/com/zhiwei/source_forward/util/TreateData.java
View file @
19bb2414
src/main/java/com/zhiwei/source_forward/util/UrlLiveDataCallback.java
0 → 100644
View file @
19bb2414
/**
* @Title: DataCallback.java
* @Package com.zhiwei.crawler.baidu
* @author 0xff
* @date 2018年6月29日 下午4:44:38
*/
package
com
.
zhiwei
.
source_forward
.
util
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
/**
 * @ClassName: UrlLiveDataCallback
 * @Description: Callback interface for storing whether a link has been deleted
 * @author 0xff
 * @date 2018-06-29 16:44:38
 */
public interface UrlLiveDataCallback {

    /**
     * Called when a result is handed over.
     *
     * @param data the bean being delivered
     * @param attr the attribution delivered together with {@code data}
     */
    void onData(UrlLiveBean data, Attribution attr);
}
src/main/resources/log4j2.xml
0 → 100644
View file @
19bb2414
<?xml version="1.0" encoding="UTF-8"?>
<!-- log4j2 自身的日志级别 -->
<Configuration
status=
"WARN"
>
<properties>
<property
name=
"LOG_HOME"
>
Log/
</property>
<property
name=
"LOG_FILE"
>
crawler
</property>
</properties>
<Appenders>
<!-- 定义日志输出地 -->
<Console
name=
"Console"
target=
"SYSTEM_OUT"
>
<PatternLayout
pattern=
"%d{yyyy-MM-dd HH:mm:ss.SSS} %-5level %logger{36} - %msg%n"
/>
</Console>
<RollingRandomAccessFile
name=
"LogFile"
fileName=
"${LOG_HOME}/${LOG_FILE}.log"
filePattern=
"${LOG_HOME}/$${date:yyyy-MM}/${LOG_FILE}-%d{yyyy-MM-dd}-%i.log"
>
<PatternLayout
pattern=
"%d{yyyy-MM-dd HH:mm:ss.SSS} %-5level %logger{36} - %msg%n"
/>
<Policies>
<TimeBasedTriggeringPolicy
interval=
"1"
/>
<SizeBasedTriggeringPolicy
size=
"20 MB"
/>
</Policies>
<DefaultRolloverStrategy
max=
"20"
/>
</RollingRandomAccessFile>
</Appenders>
<Loggers>
<Root
level=
"all"
>
<AppenderRef
ref=
"Console"
level=
"info"
/>
<AppenderRef
ref=
"LogFile"
level=
"info"
/>
</Root>
</Loggers>
</Configuration>
\ No newline at end of file
src/test/java/com/zhiwei/source_forward/sourceforward/test/URLLiveTest.java
View file @
19bb2414
package
com
.
zhiwei
.
source_forward
.
sourceforward
.
test
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
org.junit.Test
;
import
com.zhiwei.source_forward.run.URLLive
;
/**
* @ClassName: URLLiveTest
* @Description: Verifies link validity
* @author hero
* @date 2017-12-06 13:30:26
*/
public
class
URLLiveTest
{
// The only test is disabled: the whole method below is commented out, so this
// class currently declares no test methods. The disabled code references
// PoiExcelUtil and ReadMediaData, which this file does not import.
// @Test
// public void urlLiveTest(){
// String path = "E://稿件汇总网媒数据//福莱网媒.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// Map<String,Object> data = poi.importExcel(path, 0);
// @SuppressWarnings("unchecked")
// List<String> headList = (List<String>)data.get("head");
// headList.add("是否删除");
// @SuppressWarnings("unchecked")
// List<Map<String,Object>> dataList = (List<Map<String,Object>>)data.get("body");
// Map<String,Map<String,Object>> dataMap = ReadMediaData.getUrlLive(dataList);
// dataMap = URLLive.verificationURLLive(dataMap);
//
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(Entry<String,Map<String,Object>> dataEntry : dataMap.entrySet()){
// bodyList.add(dataEntry.getValue());
// }
// poi.exportExcel(path ,"匹配后数据", headList, bodyList);
// }
}
//
package com.zhiwei.source_forward.sourceforward.test;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
import java.util.Map.Entry;
//
//
import org.junit.Test;
//
//
import com.zhiwei.source_forward.run.URLLive;
//
/
//
**
//
* @ClassName: URLLiveTest
//
* @Description: 验证链接有效性
//
* @author hero
//
* @date 2017年12月6日 下午1:30:26
//
*/
//
public class URLLiveTest {
//
//
//
//
@Test
//
//
public void urlLiveTest(){
//
//
String path = "E://稿件汇总网媒数据//福莱网媒.xlsx";
//
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
//
Map<String,Object> data = poi.importExcel(path, 0);
//
//
@SuppressWarnings("unchecked")
//
//
List<String> headList = (List<String>)data.get("head");
//
//
headList.add("是否删除");
//
//
@SuppressWarnings("unchecked")
//
//
List<Map<String,Object>> dataList = (List<Map<String,Object>>)data.get("body");
//
//
Map<String,Map<String,Object>> dataMap = ReadMediaData.getUrlLive(dataList);
//
//
dataMap = URLLive.verificationURLLive(dataMap);
//
//
//
//
List<Map<String,Object>> bodyList = new ArrayList<>();
//
//
for(Entry<String,Map<String,Object>> dataEntry : dataMap.entrySet()){
//
//
bodyList.add(dataEntry.getValue());
//
//
}
//
//
poi.exportExcel(path ,"匹配后数据", headList, bodyList);
//
//
}
//
//
//
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment