Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
5a79e3d2
Commit
5a79e3d2
authored
Dec 14, 2018
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
发布版本 修改
parent
4a1a7343
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
77 additions
and
65 deletions
+77
-65
pom.xml
+7
-2
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+7
-8
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+3
-2
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+6
-2
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+6
-3
src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
+11
-12
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
+8
-11
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
+14
-17
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+3
-4
src/main/java/com/zhiwei/source_forward/util/MatchChannel.java
+1
-1
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
+2
-2
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+7
-0
src/main/java/com/zhiwei/source_forward/util/ReadMediaData.java
+1
-1
src/main/java/com/zhiwei/source_forward/util/TreateData.java
+1
-0
No files found.
pom.xml
View file @
5a79e3d2
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<artifactId>
source-forward
</artifactId>
<version>
0.0.
5
-SNAPSHOT
</version>
<version>
0.0.
7
-SNAPSHOT
</version>
<name>
source-forward
</name>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
...
@@ -24,7 +24,12 @@
...
@@ -24,7 +24,12 @@
<dependency>
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.0.9-SNAPSHOT
</version>
<version>
0.1.1-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.1.1-RELEASE
</version>
</dependency>
</dependency>
</dependencies>
</dependencies>
...
...
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
View file @
5a79e3d2
...
@@ -7,7 +7,7 @@ import org.apache.logging.log4j.Logger;
...
@@ -7,7 +7,7 @@ import org.apache.logging.log4j.Logger;
import
com.zhiwei.crawler.async.MultiThreadingCounter
;
import
com.zhiwei.crawler.async.MultiThreadingCounter
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.
HttpRequestBuilder
;
import
com.zhiwei.crawler.core.
RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
...
@@ -31,8 +31,8 @@ public class ContentCrawler {
...
@@ -31,8 +31,8 @@ public class ContentCrawler {
* @throws Exception
* @throws Exception
*/
*/
public
MultiThreadingCounter
submitTask
(
ContentDataCallback
callback
,
public
MultiThreadingCounter
submitTask
(
ContentDataCallback
callback
,
String
...
urls
)
throws
Exception
{
String
...
urls
)
{
MultiThreadingCounter
counter
=
new
MultiThreadingCounter
(
20
,
TimeUnit
.
MINUTES
,
false
);
MultiThreadingCounter
counter
=
new
MultiThreadingCounter
(
15
,
TimeUnit
.
MINUTES
,
false
);
start
(
counter
,
callback
,
urls
);
start
(
counter
,
callback
,
urls
);
return
counter
;
return
counter
;
}
}
...
@@ -52,7 +52,7 @@ public class ContentCrawler {
...
@@ -52,7 +52,7 @@ public class ContentCrawler {
try
{
try
{
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"
关键词 {} 搜索创建出错: {}"
,
e
.
getMessage
()
);
logger
.
error
(
"
搜索创建出错"
,
e
);
}
}
}
}
}
}
...
@@ -71,7 +71,7 @@ public class ContentCrawler {
...
@@ -71,7 +71,7 @@ public class ContentCrawler {
private
MultiThreadingCounter
search
(
MultiThreadingCounter
counter
,
private
MultiThreadingCounter
search
(
MultiThreadingCounter
counter
,
String
url
,
Attribution
attr
,
ContentDataCallback
callback
)
{
String
url
,
Attribution
attr
,
ContentDataCallback
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Request
request
=
HttpRequestBuilder
.
newGetRequest
(
url
,
nul
l
);
Request
request
=
RequestUtils
.
wrapGet
(
ur
l
);
counter
.
increase
();
counter
.
increase
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
,
true
).
addListener
(
future
->
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
,
true
).
addListener
(
future
->
{
if
(
future
.
isSuccess
())
{
if
(
future
.
isSuccess
())
{
...
@@ -84,9 +84,9 @@ public class ContentCrawler {
...
@@ -84,9 +84,9 @@ public class ContentCrawler {
}
else
{
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
future
.
cause
().
getMessage
());
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
future
.
cause
().
getMessage
());
}
}
counter
.
reduce
();
counter
.
reduce
();
});
});
return
counter
;
return
counter
;
}
}
...
@@ -104,12 +104,11 @@ public class ContentCrawler {
...
@@ -104,12 +104,11 @@ public class ContentCrawler {
try
{
try
{
if
(
response
.
isSuccessful
())
{
if
(
response
.
isSuccessful
())
{
String
html
=
response
.
body
().
string
();
String
html
=
response
.
body
().
string
();
System
.
out
.
println
(
html
);
content
=
MatchContent
.
matchContent
(
attr
.
get
().
toString
(),
content
=
MatchContent
.
matchContent
(
attr
.
get
().
toString
(),
html
);
html
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
info
(
"网页链接失效"
,
e
.
fillInStackTrace
()
);
logger
.
error
(
"网页链接失效"
,
e
);
}
finally
{
}
finally
{
if
(
response
!=
null
)
{
if
(
response
!=
null
)
{
response
.
close
();
response
.
close
();
...
...
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
5a79e3d2
...
@@ -45,7 +45,7 @@ public class MediaSelfSourceCrawler {
...
@@ -45,7 +45,7 @@ public class MediaSelfSourceCrawler {
* @throws Exception
* @throws Exception
*/
*/
public
MultiThreadingCounter
submitTask
(
MediaSelfSourceDataCallBack
callback
,
String
...
urls
)
throws
Exception
{
public
MultiThreadingCounter
submitTask
(
MediaSelfSourceDataCallBack
callback
,
String
...
urls
)
throws
Exception
{
MultiThreadingCounter
counter
=
new
MultiThreadingCounter
(
"任务======= "
,
1
0
,
TimeUnit
.
SECONDS
,
true
);
MultiThreadingCounter
counter
=
new
MultiThreadingCounter
(
"任务======= "
,
1
5
,
TimeUnit
.
SECONDS
,
true
);
start
(
counter
,
callback
,
urls
);
start
(
counter
,
callback
,
urls
);
return
counter
;
return
counter
;
}
}
...
@@ -65,7 +65,7 @@ public class MediaSelfSourceCrawler {
...
@@ -65,7 +65,7 @@ public class MediaSelfSourceCrawler {
counter
.
increase
();
counter
.
increase
();
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"
关键词 {} 搜索创建出错: {}"
,
e
.
getMessage
()
);
logger
.
error
(
"
搜索创建出错"
,
e
);
}
finally
{
}
finally
{
counter
.
reduce
();
counter
.
reduce
();
}
}
...
@@ -135,6 +135,7 @@ public class MediaSelfSourceCrawler {
...
@@ -135,6 +135,7 @@ public class MediaSelfSourceCrawler {
}
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
source
=
null
;
source
=
null
;
}
finally
{
}
finally
{
if
(
response
!=
null
)
{
if
(
response
!=
null
)
{
...
...
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
View file @
5a79e3d2
...
@@ -31,7 +31,7 @@ public class SourceForwardCrawler {
...
@@ -31,7 +31,7 @@ public class SourceForwardCrawler {
private
static
List
<
String
>
sourceList
=
SourceData
.
getSourceList
();
private
static
List
<
String
>
sourceList
=
SourceData
.
getSourceList
();
public
MultiThreadingCounter
submitTask
(
SourceForwardDataCallBack
callback
,
String
...
urls
)
throws
Exception
{
public
MultiThreadingCounter
submitTask
(
SourceForwardDataCallBack
callback
,
String
...
urls
)
throws
Exception
{
MultiThreadingCounter
counter
=
new
MultiThreadingCounter
(
20
,
TimeUnit
.
MINUTES
,
false
);
MultiThreadingCounter
counter
=
new
MultiThreadingCounter
(
5
,
TimeUnit
.
MINUTES
,
false
);
start
(
counter
,
callback
,
urls
);
start
(
counter
,
callback
,
urls
);
return
counter
;
return
counter
;
}
}
...
@@ -44,7 +44,7 @@ public class SourceForwardCrawler {
...
@@ -44,7 +44,7 @@ public class SourceForwardCrawler {
counter
.
increase
();
counter
.
increase
();
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"
关键词 {} 搜索创建出错: {}"
,
e
.
getMessage
()
);
logger
.
error
(
"
搜索创建出错"
,
e
);
}
finally
{
}
finally
{
counter
.
reduce
();
counter
.
reduce
();
}
}
...
@@ -58,6 +58,7 @@ public class SourceForwardCrawler {
...
@@ -58,6 +58,7 @@ public class SourceForwardCrawler {
Request
request
=
HttpRequestBuilder
.
newGetRequest
(
url
,
null
);
Request
request
=
HttpRequestBuilder
.
newGetRequest
(
url
,
null
);
counter
.
increase
();
counter
.
increase
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
,
true
).
addListener
(
future
->
{
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
,
true
).
addListener
(
future
->
{
try
{
if
(
future
.
isSuccess
())
{
if
(
future
.
isSuccess
())
{
Response
response
=
future
.
result
();
Response
response
=
future
.
result
();
try
{
try
{
...
@@ -68,7 +69,9 @@ public class SourceForwardCrawler {
...
@@ -68,7 +69,9 @@ public class SourceForwardCrawler {
}
else
{
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
future
.
cause
().
getMessage
());
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
future
.
cause
().
getMessage
());
}
}
}
finally
{
counter
.
reduce
();
counter
.
reduce
();
}
});
});
return
counter
;
return
counter
;
}
}
...
@@ -98,6 +101,7 @@ public class SourceForwardCrawler {
...
@@ -98,6 +101,7 @@ public class SourceForwardCrawler {
}
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
source
=
null
;
source
=
null
;
channel
=
"新闻"
;
channel
=
"新闻"
;
}
finally
{
}
finally
{
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
5a79e3d2
...
@@ -46,7 +46,7 @@ public class UrlLiveCrawler {
...
@@ -46,7 +46,7 @@ public class UrlLiveCrawler {
try
{
try
{
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"
关键词 {} 搜索创建出错: {}"
,
e
.
getMessage
()
);
logger
.
error
(
"
搜索创建出错:"
,
e
);
}
}
}
}
}
}
...
@@ -124,7 +124,7 @@ public class UrlLiveCrawler {
...
@@ -124,7 +124,7 @@ public class UrlLiveCrawler {
url
=
"https://www.toutiao.com/a"
+
url
.
split
(
"/"
)[
4
]
+
"/"
;
url
=
"https://www.toutiao.com/a"
+
url
.
split
(
"/"
)[
4
]
+
"/"
;
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
info
(
"url 解析出错 {}"
,
url
);
logger
.
error
(
"url 解析出错 "
,
e
);
return
url
;
return
url
;
}
}
}
else
if
(
url
.
contains
(
"mp.weixin.qq.com"
))
{
}
else
if
(
url
.
contains
(
"mp.weixin.qq.com"
))
{
...
@@ -151,7 +151,7 @@ public class UrlLiveCrawler {
...
@@ -151,7 +151,7 @@ public class UrlLiveCrawler {
try
{
try
{
f
=
matchDel
(
html
,
attr
.
getAttr
().
toString
());
f
=
matchDel
(
html
,
attr
.
getAttr
().
toString
());
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
info
(
"数据判断出错 {}"
,
e
.
getMessage
()
);
logger
.
error
(
"数据判断出错 "
,
e
);
}
}
UrlLiveBean
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
f
);
UrlLiveBean
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
f
);
if
(
callback
==
null
)
{
if
(
callback
==
null
)
{
...
@@ -378,6 +378,7 @@ public class UrlLiveCrawler {
...
@@ -378,6 +378,7 @@ public class UrlLiveCrawler {
}
}
catch
(
Exception
e
)
catch
(
Exception
e
)
{
{
e
.
printStackTrace
();
// TODO: handle exception
// TODO: handle exception
}
}
...
@@ -406,6 +407,7 @@ public class UrlLiveCrawler {
...
@@ -406,6 +407,7 @@ public class UrlLiveCrawler {
}
}
catch
(
Exception
e
)
catch
(
Exception
e
)
{
{
e
.
printStackTrace
();
// : handle exception
// : handle exception
}
}
return
flg
;
return
flg
;
...
@@ -438,6 +440,7 @@ public class UrlLiveCrawler {
...
@@ -438,6 +440,7 @@ public class UrlLiveCrawler {
}
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
false
;
return
false
;
}
}
return
false
;
return
false
;
...
...
src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
View file @
5a79e3d2
...
@@ -51,25 +51,25 @@ public class ContentMatch {
...
@@ -51,25 +51,25 @@ public class ContentMatch {
public
static
List
<
ContentBean
>
getContentMatch
(
List
<
String
>
urlList
){
public
static
List
<
ContentBean
>
getContentMatch
(
List
<
String
>
urlList
){
//启动获取链接正文
//启动获取链接正文
List
<
ContentBean
>
dataList
=
ContentMatchCrawlerThread
.
getContentMatch
(
urlList
);
return
ContentMatchCrawlerThread
.
getContentMatch
(
urlList
);
return
dataList
;
}
}
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
List
<
String
>
urlList
=
new
ArrayList
<>();
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
urlList
.
add
(
"https://mp.weixin.qq.com/s?src=11×tamp=1539828001&ver=1189&signature=SAyiGuX8VfwlPsIlq*V7I8epXKcMc9Zr6RptkDT34vDk7tSYQCwix6qJxMm25JK9gxo0t9HKAeqm70V2J1FhcDiSlf1eMhSSMz8EiCk*Hu50B7sJFkoH46HHo1uiC4f7&new=1"
);
// List<String> urlList = new ArrayList<>();
List
<
ContentBean
>
l
=
getContentMatch
(
urlList
);
// urlList.add("http://www.egsea.com/news/detail?id=324048");
for
(
ContentBean
cb
:
l
)
{
// List<ContentBean> l = getContentMatch(urlList);
System
.
out
.
println
(
cb
.
getContent
());
// for(ContentBean cb : l) {
}
// System.out.println(cb.getContent());
// }
}
}
static
class
ContentMatchCrawlerThread
extends
Thread
{
static
class
ContentMatchCrawlerThread
extends
Thread
{
private
static
List
<
ContentBean
>
getContentMatch
(
List
<
String
>
urlList
){
private
static
List
<
ContentBean
>
getContentMatch
(
List
<
String
>
urlList
){
List
<
ContentBean
>
list
=
Collections
.
synchronizedList
(
new
ArrayList
<
ContentBean
>());
try
{
try
{
ContentCrawler
crawler
=
new
ContentCrawler
();
ContentCrawler
crawler
=
new
ContentCrawler
();
List
<
ContentBean
>
list
=
Collections
.
synchronizedList
(
new
ArrayList
<
ContentBean
>());
ContentDataCallback
callback
=
new
ContentDataCallback
()
{
ContentDataCallback
callback
=
new
ContentDataCallback
()
{
@Override
@Override
...
@@ -80,11 +80,10 @@ public class ContentMatch {
...
@@ -80,11 +80,10 @@ public class ContentMatch {
};
};
crawler
.
submitTask
(
callback
,
urlList
.
toArray
(
new
String
[
urlList
.
size
()])).
await
();
crawler
.
submitTask
(
callback
,
urlList
.
toArray
(
new
String
[
urlList
.
size
()])).
await
();
return
list
;
}
catch
(
Exception
e
){
}
catch
(
Exception
e
){
e
.
fillInStackTrace
(
);
logger
.
error
(
" Exception {} "
,
e
);
}
}
return
null
;
return
list
;
}
}
}
}
...
...
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
View file @
5a79e3d2
...
@@ -7,8 +7,6 @@ import java.util.List;
...
@@ -7,8 +7,6 @@ import java.util.List;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution
;
import
com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler
;
import
com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler
;
...
@@ -24,13 +22,13 @@ public class MediaSelfSource {
...
@@ -24,13 +22,13 @@ public class MediaSelfSource {
}
}
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
//
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List
<
String
>
urlList
=
new
ArrayList
<>();
//
List<String> urlList = new ArrayList<>();
urlList
.
add
(
"https://www.toutiao.com/a6452936157751968013/"
);
//
urlList.add("https://www.toutiao.com/a6452936157751968013/");
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
//
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for
(
MediaSelfSourceBean
b
:
u
)
{
//
for(MediaSelfSourceBean b : u) {
System
.
out
.
println
(
b
.
toString
());
//
System.out.println(b.toString());
}
//
}
}
}
static
class
MediaSelfSourceCrawlerThread
extends
Thread
{
static
class
MediaSelfSourceCrawlerThread
extends
Thread
{
...
@@ -49,9 +47,8 @@ public class MediaSelfSource {
...
@@ -49,9 +47,8 @@ public class MediaSelfSource {
};
};
crawler
.
submitTask
(
callback
,
urlList
.
toArray
(
new
String
[
urlList
.
size
()])).
await
();
crawler
.
submitTask
(
callback
,
urlList
.
toArray
(
new
String
[
urlList
.
size
()])).
await
();
return
list
;
}
catch
(
Exception
e
){
}
catch
(
Exception
e
){
e
.
fillIn
StackTrace
();
e
.
print
StackTrace
();
}
}
return
list
;
return
list
;
}
}
...
...
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
View file @
5a79e3d2
...
@@ -10,8 +10,6 @@ import java.util.Map.Entry;
...
@@ -10,8 +10,6 @@ import java.util.Map.Entry;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
...
@@ -119,8 +117,9 @@ public class SourceForward {
...
@@ -119,8 +117,9 @@ public class SourceForward {
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
urlList
.
add
(
entry
.
getKey
());
urlList
.
add
(
entry
.
getKey
());
}
}
System
.
out
.
println
(
urlList
.
size
());
List
<
SourceForwardBean
>
dataList
=
SourceForwardCrawlerThread
.
getSourceForward
(
urlList
);
List
<
SourceForwardBean
>
dataList
=
SourceForwardCrawlerThread
.
getSourceForward
(
urlList
);
System
.
out
.
println
(
dataList
.
size
());
for
(
SourceForwardBean
sfb
:
dataList
){
for
(
SourceForwardBean
sfb
:
dataList
){
String
url
=
sfb
.
getUrl
();
String
url
=
sfb
.
getUrl
();
String
root_source
=
sfb
.
getRoot_source
();
String
root_source
=
sfb
.
getRoot_source
();
...
@@ -144,7 +143,7 @@ public class SourceForward {
...
@@ -144,7 +143,7 @@ public class SourceForward {
dataMap
.
put
(
url
,
data
);
dataMap
.
put
(
url
,
data
);
}
}
}
}
System
.
out
.
println
(
"success"
);
return
dataMap
;
return
dataMap
;
}
}
...
@@ -156,26 +155,25 @@ public class SourceForward {
...
@@ -156,26 +155,25 @@ public class SourceForward {
*/
*/
public
static
List
<
SourceForwardBean
>
getSourceForward
(
List
<
String
>
urlList
){
public
static
List
<
SourceForwardBean
>
getSourceForward
(
List
<
String
>
urlList
){
//启动获取链接来源
//启动获取链接来源
List
<
SourceForwardBean
>
dataList
=
SourceForwardCrawlerThread
.
getSourceForward
(
urlList
);
return
SourceForwardCrawlerThread
.
getSourceForward
(
urlList
);
return
dataList
;
}
}
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
//
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List
<
String
>
urlList
=
new
ArrayList
<>();
//
List<String> urlList = new ArrayList<>();
urlList
.
add
(
"http://www.toutiao.com/a6452936157751968013/"
);
//
urlList.add("http://www.toutiao.com/a6452936157751968013/");
List
<
SourceForwardBean
>
da
=
SourceForward
.
getSourceForward
(
urlList
);
//
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
for
(
SourceForwardBean
sfb
:
da
)
{
//
for(SourceForwardBean sfb : da) {
System
.
out
.
println
(
sfb
.
toString
());
//
System.out.println(sfb.toString());
}
//
}
}
}
static
class
SourceForwardCrawlerThread
extends
Thread
{
static
class
SourceForwardCrawlerThread
extends
Thread
{
private
static
List
<
SourceForwardBean
>
getSourceForward
(
List
<
String
>
urlList
){
private
static
List
<
SourceForwardBean
>
getSourceForward
(
List
<
String
>
urlList
){
List
<
SourceForwardBean
>
list
=
Collections
.
synchronizedList
(
new
ArrayList
<
SourceForwardBean
>());
try
{
try
{
SourceForwardCrawler
crawler
=
new
SourceForwardCrawler
();
SourceForwardCrawler
crawler
=
new
SourceForwardCrawler
();
List
<
SourceForwardBean
>
list
=
Collections
.
synchronizedList
(
new
ArrayList
<
SourceForwardBean
>());
SourceForwardDataCallBack
callback
=
new
SourceForwardDataCallBack
()
{
SourceForwardDataCallBack
callback
=
new
SourceForwardDataCallBack
()
{
@Override
@Override
...
@@ -186,11 +184,10 @@ public class SourceForward {
...
@@ -186,11 +184,10 @@ public class SourceForward {
};
};
crawler
.
submitTask
(
callback
,
urlList
.
toArray
(
new
String
[
urlList
.
size
()])).
await
();
crawler
.
submitTask
(
callback
,
urlList
.
toArray
(
new
String
[
urlList
.
size
()])).
await
();
return
list
;
}
catch
(
Exception
e
){
}
catch
(
Exception
e
){
e
.
fillIn
StackTrace
();
e
.
print
StackTrace
();
}
}
return
null
;
return
list
;
}
}
}
}
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
5a79e3d2
...
@@ -76,9 +76,9 @@ public class URLLive {
...
@@ -76,9 +76,9 @@ public class URLLive {
static
class
UrlLiveCrawlerThread
extends
Thread
{
static
class
UrlLiveCrawlerThread
extends
Thread
{
private
static
List
<
UrlLiveBean
>
getUrlLiveCrawle
(
List
<
String
>
urlList
){
private
static
List
<
UrlLiveBean
>
getUrlLiveCrawle
(
List
<
String
>
urlList
){
List
<
UrlLiveBean
>
list
=
Collections
.
synchronizedList
(
new
ArrayList
<
UrlLiveBean
>());
try
{
try
{
UrlLiveCrawler
crawler
=
new
UrlLiveCrawler
();
UrlLiveCrawler
crawler
=
new
UrlLiveCrawler
();
List
<
UrlLiveBean
>
list
=
Collections
.
synchronizedList
(
new
ArrayList
<
UrlLiveBean
>());
UrlLiveDataCallback
callback
=
new
UrlLiveDataCallback
()
{
UrlLiveDataCallback
callback
=
new
UrlLiveDataCallback
()
{
@Override
@Override
...
@@ -89,11 +89,10 @@ public class URLLive {
...
@@ -89,11 +89,10 @@ public class URLLive {
};
};
crawler
.
submitTask
(
callback
,
urlList
.
toArray
(
new
String
[
urlList
.
size
()])).
await
();
crawler
.
submitTask
(
callback
,
urlList
.
toArray
(
new
String
[
urlList
.
size
()])).
await
();
return
list
;
}
catch
(
Exception
e
){
}
catch
(
Exception
e
){
e
.
fillIn
StackTrace
();
e
.
print
StackTrace
();
}
}
return
null
;
return
list
;
}
}
}
}
...
...
src/main/java/com/zhiwei/source_forward/util/MatchChannel.java
View file @
5a79e3d2
...
@@ -40,7 +40,7 @@ public class MatchChannel {
...
@@ -40,7 +40,7 @@ public class MatchChannel {
}
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
return
channel
;
e
.
printStackTrace
()
;
}
}
return
channel
;
return
channel
;
}
}
...
...
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
View file @
5a79e3d2
...
@@ -44,7 +44,7 @@ public class MatchContent {
...
@@ -44,7 +44,7 @@ public class MatchContent {
}
}
return
ZhiWeiTools
.
delHTMLTag
(
content
);
return
ZhiWeiTools
.
delHTMLTag
(
content
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
debug
(
"获取全文失败"
,
e
.
fillInStackTrace
()
);
logger
.
error
(
"获取全文失败"
,
e
);
content
=
null
;
content
=
null
;
}
}
return
content
;
return
content
;
...
@@ -106,7 +106,7 @@ public class MatchContent {
...
@@ -106,7 +106,7 @@ public class MatchContent {
News
news
=
ContentExtractor
.
getNewsByHtml
(
html
);
News
news
=
ContentExtractor
.
getNewsByHtml
(
html
);
content
=
TreateData
.
filterSpecialCharacter
(
news
.
getContent
());
content
=
TreateData
.
filterSpecialCharacter
(
news
.
getContent
());
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
info
(
"正文抽取失败,获取全文文本:{}"
);
logger
.
error
(
"正文抽取失败,获取全文文本:"
,
e
);
content
=
document
.
text
();
content
=
document
.
text
();
}
}
return
content
;
return
content
;
...
...
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
5a79e3d2
...
@@ -192,6 +192,7 @@ public class MatchSource {
...
@@ -192,6 +192,7 @@ public class MatchSource {
}
}
return
source
;
return
source
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
null
;
return
null
;
}
}
}
}
...
@@ -353,6 +354,8 @@ public class MatchSource {
...
@@ -353,6 +354,8 @@ public class MatchSource {
*/
*/
private
static
String
getSourceByTime
(
String
timeSource
,
String
htmlBody
,
List
<
String
>
sourceList
){
private
static
String
getSourceByTime
(
String
timeSource
,
String
htmlBody
,
List
<
String
>
sourceList
){
try
{
/**以时间做分割,匹配来源信息。
/**以时间做分割,匹配来源信息。
* 主要匹配 YYYY-MM-dd xx日报
* 主要匹配 YYYY-MM-dd xx日报
* 或 xx日报 YYYY-MM-dd
* 或 xx日报 YYYY-MM-dd
...
@@ -381,5 +384,9 @@ public class MatchSource {
...
@@ -381,5 +384,9 @@ public class MatchSource {
}
}
}
}
return
null
;
return
null
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
null
;
}
}
}
}
}
src/main/java/com/zhiwei/source_forward/util/ReadMediaData.java
View file @
5a79e3d2
...
@@ -32,7 +32,7 @@ public class ReadMediaData {
...
@@ -32,7 +32,7 @@ public class ReadMediaData {
//添加来源到自定义来源列表
//添加来源到自定义来源列表
SourceData
.
addUserSource
(
source
);
SourceData
.
addUserSource
(
source
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
}
}
}
return
result
;
return
result
;
...
...
src/main/java/com/zhiwei/source_forward/util/TreateData.java
View file @
5a79e3d2
...
@@ -43,6 +43,7 @@ public class TreateData {
...
@@ -43,6 +43,7 @@ public class TreateData {
Matcher
m
=
p
.
matcher
(
str
);
Matcher
m
=
p
.
matcher
(
str
);
return
m
.
replaceAll
(
""
);
return
m
.
replaceAll
(
""
);
}
catch
(
Exception
ex
)
{
}
catch
(
Exception
ex
)
{
ex
.
printStackTrace
();
return
str
;
return
str
;
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment