Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
98e0d120
Commit
98e0d120
authored
Sep 11, 2018
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
sourceforward 链接匹配修改
parent
aa2a108b
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
216 additions
and
104 deletions
+216
-104
pom.xml
+2
-12
src/main/java/com/zhiwei/source_forward/bean/UrlLiveBean.java
+51
-5
src/main/java/com/zhiwei/source_forward/content/ContentExtractor.java
+13
-13
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+17
-16
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+3
-3
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+3
-3
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+80
-39
src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
+8
-8
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+22
-3
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
+17
-2
No files found.
pom.xml
View file @
98e0d120
...
@@ -24,12 +24,12 @@
...
@@ -24,12 +24,12 @@
<dependency>
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.0.
2
-SNAPSHOT
</version>
<version>
0.0.
5
-SNAPSHOT
</version>
</dependency>
</dependency>
<dependency>
<dependency>
<groupId>
com.zhiwei.middleware
</groupId>
<groupId>
com.zhiwei.middleware
</groupId>
<artifactId>
proxy-client
</artifactId>
<artifactId>
proxy-client
</artifactId>
<version>
0.0.
1
-RELEASE
</version>
<version>
0.0.
2
-RELEASE
</version>
</dependency>
</dependency>
</dependencies>
</dependencies>
...
@@ -89,13 +89,4 @@
...
@@ -89,13 +89,4 @@
<dependencyManagement>
<dependencies>
<dependency>
<groupId>
com.squareup.okhttp3
</groupId>
<artifactId>
okhttp
</artifactId>
<version>
3.11.0
</version>
</dependency>
</dependencies>
</dependencyManagement>
</project>
</project>
\ No newline at end of file
src/main/java/com/zhiwei/source_forward/bean/UrlLiveBean.java
View file @
98e0d120
...
@@ -4,18 +4,28 @@ public class UrlLiveBean {
...
@@ -4,18 +4,28 @@ public class UrlLiveBean {
private
String
url
;
private
String
url
;
private
boolean
isLive
;
private
Integer
isLive
;
public
UrlLiveBean
()
{
public
UrlLiveBean
()
{
super
();
super
();
}
}
public
UrlLiveBean
(
String
url
,
boolean
isLive
)
{
public
UrlLiveBean
(
String
url
,
Integer
isLive
)
{
super
();
super
();
this
.
url
=
url
;
this
.
url
=
url
;
this
.
isLive
=
isLive
;
this
.
isLive
=
isLive
;
}
}
public
UrlLiveBean
(
String
url
,
boolean
isLive
)
{
super
();
this
.
url
=
url
;
if
(
isLive
)
{
this
.
isLive
=
1
;
//已删除
}
else
{
this
.
isLive
=
0
;
}
}
public
String
getUrl
()
{
public
String
getUrl
()
{
return
url
;
return
url
;
}
}
...
@@ -24,11 +34,11 @@ public class UrlLiveBean {
...
@@ -24,11 +34,11 @@ public class UrlLiveBean {
this
.
url
=
url
;
this
.
url
=
url
;
}
}
public
boolean
isLive
()
{
public
Integer
isLive
()
{
return
isLive
;
return
isLive
;
}
}
public
void
setLive
(
boolean
isLive
)
{
public
void
setLive
(
Integer
isLive
)
{
this
.
isLive
=
isLive
;
this
.
isLive
=
isLive
;
}
}
...
@@ -46,6 +56,8 @@ public class UrlLiveBean {
...
@@ -46,6 +56,8 @@ public class UrlLiveBean {
public
static
class
Attribution
{
public
static
class
Attribution
{
private
Object
attr
;
private
Object
attr
;
private
Integer
count
;
/**
/**
* Constructor
* Constructor
*
*
...
@@ -56,6 +68,17 @@ public class UrlLiveBean {
...
@@ -56,6 +68,17 @@ public class UrlLiveBean {
}
}
/**
/**
*
* @Description 使用属性值和计数构造 Attribution
* @param attr
* @param count
*/
private
Attribution
(
Object
attr
,
Integer
count
){
this
.
attr
=
attr
;
this
.
count
=
count
;
}
/**
* 创建属性
* 创建属性
*
*
* @param attr
* @param attr
...
@@ -66,13 +89,36 @@ public class UrlLiveBean {
...
@@ -66,13 +89,36 @@ public class UrlLiveBean {
}
}
/**
/**
* 创建带计数的属性
*
* @param attr
* @param count
* @return Attribution
*/
public
static
Attribution
of
(
Object
attr
,
Integer
count
)
{
return
new
Attribution
(
attr
,
count
);
}
/**
* 获取属性
* 获取属性
*
*
* @return Object
* @return Object
*/
*/
public
Object
get
()
{
public
Object
get
Attr
()
{
return
attr
;
return
attr
;
}
}
/**
* 获取计数
*
* @return Integer
*/
public
Integer
getCount
()
{
return
count
;
}
public
void
AddCount
()
{
count
++;
}
}
}
}
}
src/main/java/com/zhiwei/source_forward/content/ContentExtractor.java
View file @
98e0d120
...
@@ -143,9 +143,9 @@ public class ContentExtractor {
...
@@ -143,9 +143,9 @@ public class ContentExtractor {
content
=
tag
;
content
=
tag
;
}
}
}
}
if
(
content
==
null
)
{
//
if (content == null) {
throw
new
Exception
(
"extraction failed"
);
//
throw new Exception("extraction failed");
}
//
}
return
content
;
return
content
;
}
}
...
@@ -164,17 +164,17 @@ public class ContentExtractor {
...
@@ -164,17 +164,17 @@ public class ContentExtractor {
news
.
setUrl
(
doc
.
baseUri
());
news
.
setUrl
(
doc
.
baseUri
());
}
}
try
{
//
try {
news
.
setTime
(
getTime
(
contentElement
));
//
news.setTime(getTime(contentElement));
}
catch
(
Exception
ex
)
{
//
} catch (Exception ex) {
LOG
.
info
(
"news title extraction failed"
,
ex
);
//
LOG.info("news title extraction failed", ex);
}
//
}
try
{
//
try {
news
.
setTitle
(
getTitle
(
contentElement
));
//
news.setTitle(getTitle(contentElement));
}
catch
(
Exception
ex
)
{
//
} catch (Exception ex) {
LOG
.
info
(
"title extraction failed"
,
ex
);
//
LOG.info("title extraction failed", ex);
}
//
}
return
news
;
return
news
;
}
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
View file @
98e0d120
...
@@ -3,16 +3,15 @@ package com.zhiwei.source_forward.crawler;
...
@@ -3,16 +3,15 @@ package com.zhiwei.source_forward.crawler;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.crawler.async.MultiThreadingCounter
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpRequestBuilder
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
com.zhiwei.source_forward.util.ProxyClientUtil
;
import
com.zhiwei.source_forward.util.ProxyClientUtil
;
import
com.zhiwei.tools.httpclient.HttpBoot
;
import
com.zhiwei.tools.httpclient.HttpRequestBuilder
;
import
com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter
;
import
okhttp3.Headers
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
okhttp3.Response
;
...
@@ -28,7 +27,8 @@ public class ContentCrawler {
...
@@ -28,7 +27,8 @@ public class ContentCrawler {
* @return
* @return
* @throws Exception
* @throws Exception
*/
*/
public
MultiThreadingCounter
submitTask
(
ContentDataCallback
callback
,
String
...
urls
)
throws
Exception
{
public
MultiThreadingCounter
submitTask
(
ContentDataCallback
callback
,
String
...
urls
)
throws
Exception
{
MultiThreadingCounter
counter
=
new
MultiThreadingCounter
();
MultiThreadingCounter
counter
=
new
MultiThreadingCounter
();
start
(
counter
,
callback
,
urls
);
start
(
counter
,
callback
,
urls
);
return
counter
;
return
counter
;
...
@@ -41,17 +41,15 @@ public class ContentCrawler {
...
@@ -41,17 +41,15 @@ public class ContentCrawler {
* @param callback
* @param callback
* @param urls
* @param urls
*/
*/
private
void
start
(
MultiThreadingCounter
counter
,
ContentDataCallback
callback
,
String
...
urls
)
{
private
void
start
(
MultiThreadingCounter
counter
,
ContentDataCallback
callback
,
String
...
urls
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
if
(
urls
!=
null
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
for
(
String
url
:
urls
)
{
if
(
url
!=
null
)
{
if
(
url
!=
null
)
{
try
{
try
{
counter
.
increase
();
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"关键词 {} 搜索创建出错: {}"
,
e
.
getMessage
());
logger
.
error
(
"关键词 {} 搜索创建出错: {}"
,
e
.
getMessage
());
}
finally
{
counter
.
reduce
();
}
}
}
}
}
}
...
@@ -67,7 +65,8 @@ public class ContentCrawler {
...
@@ -67,7 +65,8 @@ public class ContentCrawler {
* @param callback
* @param callback
* @return
* @return
*/
*/
private
MultiThreadingCounter
search
(
MultiThreadingCounter
counter
,
String
url
,
Attribution
attr
,
ContentDataCallback
callback
)
{
private
MultiThreadingCounter
search
(
MultiThreadingCounter
counter
,
String
url
,
Attribution
attr
,
ContentDataCallback
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Request
request
=
HttpRequestBuilder
.
newGetRequest
(
url
,
null
);
Request
request
=
HttpRequestBuilder
.
newGetRequest
(
url
,
null
);
counter
.
increase
();
counter
.
increase
();
...
@@ -80,8 +79,9 @@ public class ContentCrawler {
...
@@ -80,8 +79,9 @@ public class ContentCrawler {
logger
.
error
(
"解析出错"
,
e
);
logger
.
error
(
"解析出错"
,
e
);
}
}
}
else
{
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
future
.
cause
().
getMessage
());
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
future
.
cause
().
getMessage
());
}
}
counter
.
reduce
();
counter
.
reduce
();
});
});
return
counter
;
return
counter
;
...
@@ -99,14 +99,15 @@ public class ContentCrawler {
...
@@ -99,14 +99,15 @@ public class ContentCrawler {
ContentDataCallback
callback
)
{
ContentDataCallback
callback
)
{
String
content
=
null
;
String
content
=
null
;
try
{
try
{
if
(
response
.
isSuccessful
())
{
if
(
response
.
isSuccessful
())
{
String
html
=
response
.
body
().
string
();
String
html
=
response
.
body
().
string
();
content
=
MatchContent
.
matchContent
(
attr
.
get
().
toString
(),
html
);
content
=
MatchContent
.
matchContent
(
attr
.
get
().
toString
(),
html
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
info
(
"网页链接失效"
,
e
.
fillInStackTrace
());
logger
.
info
(
"网页链接失效"
,
e
.
fillInStackTrace
());
}
finally
{
}
finally
{
if
(
response
!=
null
)
{
if
(
response
!=
null
)
{
response
.
close
();
response
.
close
();
}
}
}
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
98e0d120
...
@@ -7,15 +7,15 @@ import org.apache.logging.log4j.Logger;
...
@@ -7,15 +7,15 @@ import org.apache.logging.log4j.Logger;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Node
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.crawler.async.MultiThreadingCounter
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpRequestBuilder
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack
;
import
com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack
;
import
com.zhiwei.source_forward.util.ProxyClientUtil
;
import
com.zhiwei.source_forward.util.ProxyClientUtil
;
import
com.zhiwei.tools.httpclient.HttpBoot
;
import
com.zhiwei.tools.httpclient.HttpRequestBuilder
;
import
com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
okhttp3.Response
;
...
...
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
View file @
98e0d120
...
@@ -8,6 +8,9 @@ import org.jsoup.Jsoup;
...
@@ -8,6 +8,9 @@ import org.jsoup.Jsoup;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Node
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.crawler.async.MultiThreadingCounter
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpRequestBuilder
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchChannel
;
...
@@ -15,9 +18,6 @@ import com.zhiwei.source_forward.util.MatchSource;
...
@@ -15,9 +18,6 @@ import com.zhiwei.source_forward.util.MatchSource;
import
com.zhiwei.source_forward.util.ProxyClientUtil
;
import
com.zhiwei.source_forward.util.ProxyClientUtil
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
com.zhiwei.tools.httpclient.HttpBoot
;
import
com.zhiwei.tools.httpclient.HttpRequestBuilder
;
import
com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
okhttp3.Response
;
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
98e0d120
package
com
.
zhiwei
.
source_forward
.
crawler
;
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.io.IOException
;
import
java.util.List
;
import
java.util.List
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
...
@@ -9,13 +8,13 @@ import org.jsoup.Jsoup;
...
@@ -9,13 +8,13 @@ import org.jsoup.Jsoup;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Node
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.crawler.async.MultiThreadingCounter
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpRequestBuilder
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.util.ProxyClientUtil
;
import
com.zhiwei.source_forward.util.ProxyClientUtil
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.tools.httpclient.HttpBoot
;
import
com.zhiwei.tools.httpclient.HttpRequestBuilder
;
import
com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter
;
import
okhttp3.Request
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
okhttp3.Response
;
...
@@ -43,12 +42,9 @@ public class UrlLiveCrawler {
...
@@ -43,12 +42,9 @@ public class UrlLiveCrawler {
for
(
String
url
:
urls
)
{
for
(
String
url
:
urls
)
{
if
(
url
!=
null
)
{
if
(
url
!=
null
)
{
try
{
try
{
counter
.
increase
();
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"关键词 {} 搜索创建出错: {}"
,
e
.
getMessage
());
logger
.
error
(
"关键词 {} 搜索创建出错: {}"
,
e
.
getMessage
());
}
finally
{
counter
.
reduce
();
}
}
}
}
}
}
...
@@ -57,6 +53,7 @@ public class UrlLiveCrawler {
...
@@ -57,6 +53,7 @@ public class UrlLiveCrawler {
private
MultiThreadingCounter
search
(
MultiThreadingCounter
counter
,
String
url
,
private
MultiThreadingCounter
search
(
MultiThreadingCounter
counter
,
String
url
,
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
url
=
dealUrl
(
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Request
request
=
HttpRequestBuilder
.
newGetRequest
(
url
,
null
);
Request
request
=
HttpRequestBuilder
.
newGetRequest
(
url
,
null
);
counter
.
increase
();
counter
.
increase
();
...
@@ -64,36 +61,82 @@ public class UrlLiveCrawler {
...
@@ -64,36 +61,82 @@ public class UrlLiveCrawler {
if
(
future
.
isSuccess
())
{
if
(
future
.
isSuccess
())
{
Response
response
=
future
.
result
();
Response
response
=
future
.
result
();
try
{
try
{
parseHtml
(
response
,
attr
,
callback
);
if
(
response
.
code
()
==
200
)
{
parseHtml
(
response
.
body
().
string
(),
attr
,
callback
);
}
else
{
callBack
(
callback
,
attr
,
1
);
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析出错"
,
e
);
logger
.
error
(
"解析出错"
,
e
);
}
finally
{
if
(
response
!=
null
)
{
response
.
close
();
}
}
}
}
else
{
}
else
{
if
(
attr
.
getCount
()
>
3
)
{
callBack
(
callback
,
attr
,
-
1
);
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
future
.
cause
().
getMessage
());
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
future
.
cause
().
getMessage
());
}
else
{
attr
.
AddCount
();
search
(
counter
,
attr
.
getAttr
().
toString
(),
attr
,
callback
);
}
}
}
counter
.
reduce
();
counter
.
reduce
();
});
});
return
counter
;
return
counter
;
}
}
private
void
parseHtml
(
Response
response
,
Attribution
attr
,
private
void
callBack
(
UrlLiveDataCallback
callback
,
Attribution
attr
,
int
i
)
{
UrlLiveBean
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
i
);
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
callback
.
onData
(
ulb
,
attr
);
}
}
private
String
dealUrl
(
String
url
)
{
if
(
url
.
contains
(
"toutiao.com"
))
{
try
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
}
else
{
url
=
url
.
replace
(
"toutiao.com"
,
"www.toutiao.com"
);
}
if
(
url
.
contains
(
"https"
))
{
}
else
{
url
=
url
.
replace
(
"http"
,
"https"
);
}
if
(
url
.
contains
(
"group"
))
{
url
=
"https://www.toutiao.com/a"
+
url
.
split
(
"/"
)[
4
]
+
"/"
;
}
}
catch
(
Exception
e
)
{
logger
.
info
(
"url 解析出错 {}"
,
url
);
return
url
;
}
}
return
url
;
}
/**
*
* @Description 判断是否删除
* @param html
* @param attr
* @param callback
*/
private
void
parseHtml
(
String
html
,
Attribution
attr
,
UrlLiveDataCallback
callback
)
{
UrlLiveDataCallback
callback
)
{
/***验证网页是否能够连通*/
/***验证网页是否能够连通*/
boolean
f
=
true
;
boolean
f
=
true
;
if
(!
response
.
isSuccessful
()){
try
{
try
{
f
=
matchDel
(
response
.
body
().
string
(),
attr
.
get
().
toString
());
f
=
matchDel
(
html
,
attr
.
getAttr
().
toString
());
}
catch
(
IO
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
info
(
"数据判断出错 {}"
,
e
.
getMessage
());
logger
.
info
(
"数据判断出错 {}"
,
e
.
getMessage
());
}
finally
{
if
(
response
!=
null
)
{
response
.
close
();
}
}
}
UrlLiveBean
ulb
=
new
UrlLiveBean
(
attr
.
getAttr
().
toString
(),
f
);
}
else
{
f
=
false
;
}
UrlLiveBean
ulb
=
new
UrlLiveBean
(
attr
.
get
().
toString
(),
f
);
if
(
callback
==
null
)
{
if
(
callback
==
null
)
{
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
logger
.
warn
(
"DataCallback 对象为 null,无法保存数据"
);
}
else
{
}
else
{
...
@@ -123,12 +166,6 @@ public class UrlLiveCrawler {
...
@@ -123,12 +166,6 @@ public class UrlLiveCrawler {
return
true
;
return
true
;
}
}
step
++;
step
++;
if
(
rulerWeigui
(
doc
))
{
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
return
true
;
}
step
++;
if
(
rulerTousu
(
doc
))
if
(
rulerTousu
(
doc
))
{
{
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
...
@@ -158,6 +195,11 @@ public class UrlLiveCrawler {
...
@@ -158,6 +195,11 @@ public class UrlLiveCrawler {
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
return
true
;
return
true
;
}
}
step
++;
//10
if
(
rulerWeigui
(
doc
))
{
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
return
true
;
}
step
++;
//11
step
++;
//11
if
(
rulerYidian
(
doc
))
if
(
rulerYidian
(
doc
))
{
{
...
@@ -169,7 +211,7 @@ public class UrlLiveCrawler {
...
@@ -169,7 +211,7 @@ public class UrlLiveCrawler {
/**
/**
*
*
*
@TODO(TODO
微信谣言的无效网址筛选规则)
*
(
微信谣言的无效网址筛选规则)
* @author 陈炜涛
* @author 陈炜涛
* @param doc
* @param doc
* @return
* @return
...
@@ -188,7 +230,7 @@ public class UrlLiveCrawler {
...
@@ -188,7 +230,7 @@ public class UrlLiveCrawler {
/**
/**
*
*
*
@TODO(TODO
微信内容违规的无效网址筛选规则)
*
(
微信内容违规的无效网址筛选规则)
* @author 陈炜涛
* @author 陈炜涛
* @param doc
* @param doc
* @return
* @return
...
@@ -198,8 +240,7 @@ public class UrlLiveCrawler {
...
@@ -198,8 +240,7 @@ public class UrlLiveCrawler {
private
boolean
rulerWeigui
(
Document
doc
)
private
boolean
rulerWeigui
(
Document
doc
)
{
{
boolean
flg
=
false
;
boolean
flg
=
false
;
if
(
"此内容因违规无法查看"
.
equals
(
doc
.
select
(
".text_area > p:nth-child(1)"
)
if
((
doc
.
select
(
"p.title"
).
text
()).
contains
(
"此内容因违规无法查看"
))
.
text
()))
{
{
flg
=
true
;
flg
=
true
;
}
}
...
@@ -208,7 +249,7 @@ public class UrlLiveCrawler {
...
@@ -208,7 +249,7 @@ public class UrlLiveCrawler {
/**
/**
*
*
*
@TODO(TODO
微信内容违规的无效网址筛选规则)
*
(
微信内容违规的无效网址筛选规则)
* @author 陈炜涛
* @author 陈炜涛
* @param doc
* @param doc
* @return
* @return
...
@@ -227,7 +268,7 @@ public class UrlLiveCrawler {
...
@@ -227,7 +268,7 @@ public class UrlLiveCrawler {
/**
/**
*
*
*
@TODO(TODO
环球的无效网址筛选规则)
*
(
环球的无效网址筛选规则)
* @author 陈炜涛
* @author 陈炜涛
* @param doc
* @param doc
* @return
* @return
...
@@ -246,7 +287,7 @@ public class UrlLiveCrawler {
...
@@ -246,7 +287,7 @@ public class UrlLiveCrawler {
/**
/**
*
*
*
@TODO(TODO
空的无效网址筛选规则)
*
(
空的无效网址筛选规则)
* @author 陈炜涛
* @author 陈炜涛
* @param doc
* @param doc
* @return
* @return
...
@@ -267,7 +308,7 @@ public class UrlLiveCrawler {
...
@@ -267,7 +308,7 @@ public class UrlLiveCrawler {
/**
/**
*
*
*
@TODO(TODO
内容不存在)
*
(
内容不存在)
* @author 陈炜涛
* @author 陈炜涛
* @param doc
* @param doc
* @return
* @return
...
@@ -286,7 +327,7 @@ public class UrlLiveCrawler {
...
@@ -286,7 +327,7 @@ public class UrlLiveCrawler {
/**
/**
*
*
*
@TODO(TODO
招商网的无效网址筛选规则)
*
(
招商网的无效网址筛选规则)
* @author 陈炜涛
* @author 陈炜涛
* @param doc
* @param doc
* @return
* @return
...
@@ -315,7 +356,7 @@ public class UrlLiveCrawler {
...
@@ -315,7 +356,7 @@ public class UrlLiveCrawler {
/**
/**
*
*
*
@TODO(TODO
一点资讯的无效网址筛选规则)
*
(
一点资讯的无效网址筛选规则)
* @author 陈炜涛
* @author 陈炜涛
* @param doc
* @param doc
* @return
* @return
...
@@ -334,7 +375,7 @@ public class UrlLiveCrawler {
...
@@ -334,7 +375,7 @@ public class UrlLiveCrawler {
}
}
catch
(
Exception
e
)
catch
(
Exception
e
)
{
{
//
TODO
: handle exception
// : handle exception
}
}
return
flg
;
return
flg
;
}
}
...
@@ -354,7 +395,7 @@ public class UrlLiveCrawler {
...
@@ -354,7 +395,7 @@ public class UrlLiveCrawler {
for
(
Node
node
:
nodeList
)
{
for
(
Node
node
:
nodeList
)
{
if
(
node
.
outerHtml
().
contains
(
"<title>"
))
{
if
(
node
.
outerHtml
().
contains
(
"<title>"
))
{
String
title
=
node
.
toString
().
split
(
"<title>"
)[
1
].
split
(
"</title>"
)[
0
];
String
title
=
node
.
toString
().
split
(
"<title>"
)[
1
].
split
(
"</title>"
)[
0
];
if
(
title
.
contains
(
"
404
"
)){
if
(
title
.
contains
(
"
未知错误"
)
||
title
.
contains
(
"Object moved"
)
||
title
.
contains
(
"404"
)
||
title
.
contains
(
"页面没有找到"
)
||
title
.
contains
(
"页面未找到"
)
||
title
.
contains
(
"301 Moved Permanently
"
)){
return
true
;
return
true
;
}
}
}
}
...
...
src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
View file @
98e0d120
...
@@ -55,14 +55,14 @@ public class ContentMatch {
...
@@ -55,14 +55,14 @@ public class ContentMatch {
return
dataList
;
return
dataList
;
}
}
//
public static void main(String[] args) {
public
static
void
main
(
String
[]
args
)
{
//
List<String> urlList = new ArrayList<>();
List
<
String
>
urlList
=
new
ArrayList
<>();
// urlList.add("http://www.toutiao.com/a6571343464292680196/
");
urlList
.
add
(
"https://mp.weixin.qq.com/s?src=11&timestamp=1535697915&ver=1093&signature=HNXpB8owyjfkyX-p2UDMga5R-qEpgjEpRQAjVmy7xqdrfsjZNdW0xa56dgCWMD9I*eo**yak46juxNEzryhKVLRT48DG0g9SUJSVrKSaPrhHEuJ1JOA86mSaY7TrHMMT&new=1
"
);
//
List<ContentBean> l = getContentMatch(urlList);
List
<
ContentBean
>
l
=
getContentMatch
(
urlList
);
//
for(ContentBean cb : l) {
for
(
ContentBean
cb
:
l
)
{
//
System.out.println(cb.getContent());
System
.
out
.
println
(
cb
.
getContent
());
//
}
}
//
}
}
static
class
ContentMatchCrawlerThread
extends
Thread
{
static
class
ContentMatchCrawlerThread
extends
Thread
{
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
98e0d120
...
@@ -33,27 +33,46 @@ public class URLLive {
...
@@ -33,27 +33,46 @@ public class URLLive {
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
for
(
Entry
<
String
,
Map
<
String
,
Object
>>
entry
:
dataMap
.
entrySet
()){
urlList
.
add
(
entry
.
getKey
());
urlList
.
add
(
entry
.
getKey
());
}
}
System
.
out
.
println
(
urlList
.
size
());
//验证数据是否已删除
//验证数据是否已删除
List
<
UrlLiveBean
>
dataList
=
UrlLiveCrawlerThread
.
getUrlLiveCrawle
(
urlList
);
List
<
UrlLiveBean
>
dataList
=
UrlLiveCrawlerThread
.
getUrlLiveCrawle
(
urlList
);
for
(
UrlLiveBean
ub
:
dataList
){
for
(
UrlLiveBean
ub
:
dataList
){
String
url
=
ub
.
getUrl
();
String
url
=
ub
.
getUrl
();
boolean
live
=
ub
.
isLive
();
int
i
=
ub
.
isLive
();
if
(
dataMap
.
containsKey
(
url
)){
if
(
dataMap
.
containsKey
(
url
)){
Map
<
String
,
Object
>
map
=
dataMap
.
get
(
url
);
Map
<
String
,
Object
>
map
=
dataMap
.
get
(
url
);
map
.
put
(
"是否删除"
,
live
);
if
(
i
==
1
)
{
map
.
put
(
"是否删除"
,
true
);
}
else
if
(
i
==
0
)
{
map
.
put
(
"是否删除"
,
false
);
}
dataMap
.
put
(
url
,
map
);
dataMap
.
put
(
url
,
map
);
}
}
}
}
return
dataMap
;
return
dataMap
;
}
}
/**
*
* @Description 批量验证链接是否已被删除
* @param urlList
* @return UrlLiveBean 1 已删除 0 未删除 -1 访问失败
*/
public
static
List
<
UrlLiveBean
>
verificationURLLive
(
List
<
String
>
urlList
){
public
static
List
<
UrlLiveBean
>
verificationURLLive
(
List
<
String
>
urlList
){
//启动验证链接是否有效程序程序
//启动验证链接是否有效程序程序
List
<
UrlLiveBean
>
dataList
=
UrlLiveCrawlerThread
.
getUrlLiveCrawle
(
urlList
);
List
<
UrlLiveBean
>
dataList
=
UrlLiveCrawlerThread
.
getUrlLiveCrawle
(
urlList
);
return
dataList
;
return
dataList
;
}
}
public
static
void
main
(
String
[]
args
)
{
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://www.zyzpes.com/toutiao/5048828/20180419A1AFBC00.html"
);
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
for
(
UrlLiveBean
b
:
u
)
{
System
.
out
.
println
(
b
.
toString
());
}
}
static
class
UrlLiveCrawlerThread
extends
Thread
{
static
class
UrlLiveCrawlerThread
extends
Thread
{
private
static
List
<
UrlLiveBean
>
getUrlLiveCrawle
(
List
<
String
>
urlList
){
private
static
List
<
UrlLiveBean
>
getUrlLiveCrawle
(
List
<
String
>
urlList
){
...
...
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
View file @
98e0d120
...
@@ -38,7 +38,8 @@ public class MatchContent {
...
@@ -38,7 +38,8 @@ public class MatchContent {
content
=
matchContentWeixin
(
document
);
content
=
matchContentWeixin
(
document
);
}
else
if
(
url
.
contains
(
"toutiao.com"
))
{
}
else
if
(
url
.
contains
(
"toutiao.com"
))
{
content
=
matchContentToutiao
(
html
);
content
=
matchContentToutiao
(
html
);
}
else
{
}
if
(
content
==
null
||
content
.
length
()
<
10
)
{
content
=
mathchContent
(
html
,
document
);
content
=
mathchContent
(
html
,
document
);
}
}
return
ZhiWeiTools
.
delHTMLTag
(
content
);
return
ZhiWeiTools
.
delHTMLTag
(
content
);
...
@@ -71,7 +72,21 @@ public class MatchContent {
...
@@ -71,7 +72,21 @@ public class MatchContent {
* @return
* @return
*/
*/
private
static
String
matchContentWeixin
(
Document
document
)
{
private
static
String
matchContentWeixin
(
Document
document
)
{
return
document
.
select
(
"div.rich_media_content"
).
text
();
try
{
String
content
=
document
.
select
(
"div.rich_media_content"
).
text
();
if
(
document
.
toString
().
contains
(
"<script id=\"content_tpl\""
))
{
Pattern
pa
=
Pattern
.
compile
(
"\\<script id=\"content_tpl(.*?)\\</script\\>"
);
Matcher
ma
=
pa
.
matcher
(
document
.
toString
());
while
(
ma
.
find
())
{
return
ma
.
group
(
0
).
replaceAll
(
"<script id=\"content_tpl\" type=\"text/html\">"
,
""
).
replaceAll
(
"</script>"
,
""
);
}
return
content
;
}
return
content
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
""
;
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment