Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
2c9d4fa2
Commit
2c9d4fa2
authored
Nov 02, 2022
by
chenweiyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
代理升级版本
parent
4860f41e
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
522 additions
and
459 deletions
+522
-459
pom.xml
+27
-5
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+15
-10
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+14
-9
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+14
-9
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+40
-29
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawlerNew.java
+270
-267
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
+1
-1
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+1
-1
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
+121
-121
src/main/java/com/zhiwei/source_forward/util/ProxyInit.java
+19
-7
No files found.
pom.xml
View file @
2c9d4fa2
...
...
@@ -3,13 +3,17 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<version>
0.3.
0
-SNAPSHOT
</version>
<version>
0.3.
1
-SNAPSHOT
</version>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
<properties>
<project.build.sourceEncoding>
UTF-8
</project.build.sourceEncoding>
<project.reporting.outputEncoding>
UTF-8
</project.reporting.outputEncoding>
<http-boot.version>
0.1.0.8-SNAPSHOT
</http-boot.version>
<task-boot.version>
1.1.2-SNAPSHOT
</task-boot.version>
<boilerpipe.version>
0.0.1-SNAPSHOT
</boilerpipe.version>
<conomys-consumer.version>
0.0.3-SNAPSHOT
</conomys-consumer.version>
</properties>
<developers>
...
...
@@ -30,12 +34,30 @@
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.
2.4
-SNAPSHOT
</version>
<version>
0.
4.5
-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.6.6.8-SNAPSHOT
</version>
<groupId>
com.kohlschutter.boilerpipe
</groupId>
<artifactId>
boilerpipe-extractor
</artifactId>
<version>
${boilerpipe.version}
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.http
</groupId>
<artifactId>
http-boot
</artifactId>
<version>
${http-boot.version}
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.async
</groupId>
<artifactId>
task-boot
</artifactId>
<version>
${task-boot.version}
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.network
</groupId>
<artifactId>
cynomys-consumer
</artifactId>
<version>
${conomys-consumer.version}
</version>
</dependency>
</dependencies>
...
...
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
View file @
2c9d4fa2
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.Objects
;
import
java.util.concurrent.Semaphore
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
com.zhiwei.task.sync.GroupSync
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
public
class
ContentCrawler
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
ContentCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
2
).
build
();
private
static
Semaphore
semaphore
=
new
Semaphore
(
5
);
/**
*
* @Description 链接传入 并 返回采集完信号
...
...
@@ -51,9 +53,12 @@ public class ContentCrawler {
ZhiWeiTools
.
sleep
(
100
);
if
(
url
!=
null
)
{
try
{
semaphore
.
acquire
();
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
}
finally
{
semaphore
.
release
();
}
}
}
...
...
@@ -75,15 +80,15 @@ public class ContentCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
Proxy
Hold
er
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
Proxy
ServerSuppli
er
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
parseHtml
(
rs
.
body
().
s
tring
(),
attr
,
callback
);
parseHtml
(
rs
.
body
S
tring
(),
attr
,
callback
);
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
}
}
catch
(
Exception
e
)
{
logger
.
info
(
"搜索结果访问失败:
{}
"
,
ex
);
logger
.
info
(
"搜索结果访问失败: "
,
ex
);
}
finally
{
counter
.
done
();
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
2c9d4fa2
...
...
@@ -4,22 +4,23 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.concurrent.Semaphore
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack
;
import
com.zhiwei.task.sync.GroupSync
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
...
...
@@ -34,8 +35,9 @@ import okhttp3.Request;
public
class
MediaSelfSourceCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
MediaSelfSourceCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
2
).
build
();
private
static
Semaphore
semaphore
=
new
Semaphore
(
5
);
/**
*
* @Description 链接传入 并 返回采集完信号
...
...
@@ -69,9 +71,12 @@ public class MediaSelfSourceCrawler {
counter
.
add
();
if
(
url
!=
null
)
{
try
{
semaphore
.
acquire
();
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
}
finally
{
semaphore
.
release
();
}
}
counter
.
done
();
...
...
@@ -91,7 +96,7 @@ public class MediaSelfSourceCrawler {
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
attr
.
get
());
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Proxy
Holder
ph
=
ProxyHold
er
.
NAT_HEAVY_PROXY
;
Proxy
ServerSupplier
ph
=
ProxyServerSuppli
er
.
NAT_HEAVY_PROXY
;
if
(
url
.
contains
(
"toutiao.com"
))
{
map
.
put
(
"referer"
,
url
);
}
...
...
@@ -104,7 +109,7 @@ public class MediaSelfSourceCrawler {
try
{
if
(
Objects
.
isNull
(
ex
))
{
try
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
,
rs
.
r
equest
().
url
().
uri
().
toString
());
parseHtml
(
rs
.
body
String
(),
attr
,
callback
,
rs
.
bootR
equest
().
url
().
uri
().
toString
());
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析出错"
,
e
);
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
View file @
2c9d4fa2
...
...
@@ -4,6 +4,7 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.concurrent.Semaphore
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
...
...
@@ -11,16 +12,16 @@ import org.jsoup.Jsoup;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
com.zhiwei.task.sync.GroupSync
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
...
...
@@ -29,9 +30,10 @@ public class SourceForwardCrawler {
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
SourceForwardCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
2
).
build
();
private
static
List
<
String
>
sourceList
=
SourceData
.
getSourceList
();
private
static
Semaphore
semaphore
=
new
Semaphore
(
5
);
public
GroupSync
submitTask
(
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
try
{
GroupSync
counter
=
new
GroupSync
();
...
...
@@ -50,10 +52,13 @@ public class SourceForwardCrawler {
ZhiWeiTools
.
sleep
(
100
);
if
(
url
!=
null
)
{
try
{
semaphore
.
acquire
();
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
}
}
finally
{
semaphore
.
release
();
}
}
counter
.
done
();
}
...
...
@@ -77,10 +82,10 @@ public class SourceForwardCrawler {
}
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
Proxy
Hold
er
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
Proxy
ServerSuppli
er
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
parseHtml
(
rs
.
body
().
s
tring
(),
attr
,
callback
);
parseHtml
(
rs
.
body
S
tring
(),
attr
,
callback
);
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
2c9d4fa2
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.concurrent.Semaphore
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONPath
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.task.sync.GroupSync
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
java.util.*
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
okhttp3.Request
;
/**
* @author byte-zbs
...
...
@@ -32,8 +39,9 @@ import static java.util.Objects.nonNull;
public
class
UrlLiveCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
throwException
(
true
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
2
).
build
();
private
static
Semaphore
semaphore
=
new
Semaphore
(
5
);
public
GroupSync
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
GroupSync
counter
=
new
GroupSync
();
start
(
counter
,
callback
,
urls
);
...
...
@@ -43,17 +51,20 @@ public class UrlLiveCrawler {
private
void
start
(
GroupSync
counter
,
UrlLiveDataCallback
callback
,
String
...
urls
)
{
if
(
nonNull
(
urls
)
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
counter
.
add
();
ZhiWeiTools
.
sleep
(
100
);
if
(
nonNull
(
url
))
{
try
{
try
{
counter
.
add
();
semaphore
.
acquire
();
ZhiWeiTools
.
sleep
(
200
);
if
(
nonNull
(
url
))
{
// ZhiWeiTools.sleep(3000);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错:"
,
e
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错:"
,
e
);
}
finally
{
counter
.
done
();
semaphore
.
release
();
}
counter
.
done
();
}
}
}
...
...
@@ -63,12 +74,12 @@ public class UrlLiveCrawler {
url
=
dealUrl
(
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
Proxy
Holder
ph
=
ProxyHold
er
.
NAT_HEAVY_PROXY
;
Proxy
ServerSupplier
ph
=
ProxyServerSuppli
er
.
NAT_HEAVY_PROXY
;
if
(
url
.
contains
(
"toutiao.com"
))
{
ph
=
Proxy
Hold
er
.
NAT_HEAVY_PROXY
;
ph
=
Proxy
ServerSuppli
er
.
NAT_HEAVY_PROXY
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
url
=
treatZhihuUrl
(
url
);
ph
=
Proxy
Hold
er
.
NAT_HEAVY_PROXY
;
ph
=
Proxy
ServerSuppli
er
.
NAT_HEAVY_PROXY
;
}
try
{
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
...
...
@@ -80,7 +91,7 @@ public class UrlLiveCrawler {
System
.
out
.
println
(
rs
.
code
());
if
(
Objects
.
isNull
(
ex
))
{
if
(
rs
.
isSuccessful
())
{
parseHtml
(
rs
.
body
().
s
tring
(),
attr
,
callback
);
parseHtml
(
rs
.
body
S
tring
(),
attr
,
callback
);
}
else
if
(
rs
.
code
()
==
404
)
{
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
}
else
{
...
...
@@ -91,7 +102,7 @@ public class UrlLiveCrawler {
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
" 数据是否删除 采集出错
{}
"
,
e
);
logger
.
error
(
" 数据是否删除 采集出错 "
,
e
);
}
finally
{
counter
.
done
();
}
...
...
@@ -99,7 +110,7 @@ public class UrlLiveCrawler {
return
counter
;
}
}
catch
(
Exception
e2
)
{
logger
.
error
(
"数据出错
{}
"
,
e2
);
logger
.
error
(
"数据出错 "
,
e2
);
}
return
counter
;
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawlerNew.java
View file @
2c9d4fa2
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.async.TaskBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
okhttp3.Request
;
import
okhttp3.Response
;
public
class
UrlLiveCrawlerNew
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawlerNew
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
throwException
(
false
).
build
();
public
List
<
UrlLiveBean
>
judgeIsDelete
(
List
<
String
>
urlList
)
{
GroupSync
counter
=
new
GroupSync
();
List
<
UrlLiveBean
>
ulbList
=
new
ArrayList
<>();
urlList
.
forEach
(
url
->
{
try
{
counter
.
add
();
TaskBoot
.
blockingAsync
(()
->
{
try
{
counter
.
add
();
UrlLiveBean
ulb
=
dealUrlLive
(
url
);
if
(
Objects
.
nonNull
(
ulb
))
{
ulbList
.
add
(
ulb
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"链接是否删除新"
,
e
);
}
finally
{
counter
.
done
();
}
});
}
catch
(
Exception
e2
)
{
logger
.
error
(
"数据出错 {}"
,
e2
);
}
finally
{
counter
.
done
();
}
});
try
{
counter
.
await
();
}
catch
(
InterruptedException
e
)
{
e
.
printStackTrace
();
}
return
ulbList
;
}
private
UrlLiveBean
dealUrlLive
(
String
url
)
{
try
{
url
=
dealUrl
(
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
// Map<String,String> headers = HeaderTool.getCommonHead();
if
(
url
.
contains
(
"www.toutiao.com"
)){
headers
.
put
(
"referer"
,
url
);
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
url
=
treatZhihuUrl
(
url
);
}
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
int
code
=
404
;
for
(
int
i
=
0
;
i
<
2
;
i
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
)){
if
(
response
.
isSuccessful
())
{
return
matchDel
(
response
.
body
().
string
(),
url
);
}
else
{
code
=
response
.
code
();
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析"
,
e
);
}
}
if
(
code
==
403
){
return
callBack
(
url
,
-
1
,
String
.
valueOf
(
code
));
}
else
{
return
callBack
(
url
,
1
,
String
.
valueOf
(
code
));
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
null
;
}
}
private
UrlLiveBean
callBack
(
String
url
,
int
i
,
String
title
)
{
if
(
i
==
1
)
{
return
new
UrlLiveBean
(
url
,
true
,
title
);
}
else
{
return
new
UrlLiveBean
(
url
,
i
,
title
);
}
}
private
String
dealUrl
(
String
url
)
{
try
{
if
(
url
.
contains
(
"toutiao.com"
))
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
}
else
{
url
=
url
.
replace
(
"toutiao.com"
,
"www.toutiao.com"
);
}
if
(
url
.
contains
(
"https"
))
{
}
else
{
url
=
url
.
replace
(
"http"
,
"https"
);
}
if
(
url
.
contains
(
"group"
))
{
url
=
"https://www.toutiao.com/a"
+
url
.
split
(
"/"
)[
4
]
+
"/"
;
}
}
else
if
(
url
.
contains
(
"mp.weixin.qq.com"
))
{
if
(
url
.
contains
(
"https"
))
{
}
else
{
url
=
url
.
replace
(
"http"
,
"https"
);
}
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
)
&&
url
.
contains
(
"wm_aid"
))
{
url
=
"http://ff.dayu.com/contents/origin/"
+
url
.
split
(
"wm_aid="
)[
1
].
split
(
"!!wm_id"
)[
0
]+
"?biz_id=1002&_fetch_author=1&_fetch_incrs=1"
;
}
return
url
;
}
catch
(
Exception
e
)
{
return
url
;
}
}
/***
* @Title: matchDel
* @author hero
* @Description: 验证链接是否有效
* @param @param page
* @param @return 设定文件
* @return boolean 返回类型
*/
public
UrlLiveBean
matchDel
(
String
result
,
String
url
){
try
{
Document
doc
=
Jsoup
.
parse
(
result
);
String
title
=
null
;
if
(
url
.
contains
(
"mp.weixin.qq.com"
)
||
url
.
contains
(
"post.mp.qq.com"
)
||
url
.
contains
(
"weixin.sogou.com"
)){
title
=
doc
.
select
(
"h2.rich_media_title"
).
text
().
replaceAll
(
" "
,
""
);
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"p.title"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"h3.msg-title"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"div.global_error_msg.warn"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"p.tips"
).
text
();
}
if
(
Objects
.
isNull
(
title
)
||
title
.
isEmpty
())
{
title
=
doc
.
select
(
"h2"
).
text
();
}
}
else
if
(
url
.
contains
(
"kuaibao"
)){
title
=
doc
.
select
(
"p.title"
).
text
().
replaceAll
(
" "
,
""
);
}
else
if
(
url
.
contains
(
"chinadaily.com.cn"
)){
title
=
doc
.
select
(
"p.style1"
).
text
().
replaceAll
(
" "
,
""
);
}
else
if
(
url
.
contains
(
"baidu.com"
)
||
url
.
contains
(
"hao123.com"
))
{
title
=
doc
.
select
(
"p#contaniner"
).
text
();
}
else
if
(
url
.
contains
(
"kanfanews.com"
))
{
title
=
doc
.
select
(
"p#tit"
).
text
();
}
else
if
(
url
.
contains
(
"ifeng.com"
)
&&
result
.
contains
(
"url=http://www.ifeng.com/"
))
{
title
=
"网页已删除"
;
}
else
if
(
url
.
contains
(
"a.mp.uc.cn"
))
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
title
=
json
.
getJSONObject
(
"data"
).
getString
(
"title"
);
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
"网页已删除"
;
}
}
catch
(
Exception
e
)
{
logger
.
error
(
" uc 数据 json 转换失败"
,
e
);
}
}
else
if
(
url
.
contains
(
"huanqiu.com"
)
&&
result
.
contains
(
"www.huanqiu.com/404.html"
))
{
title
=
"网页已删除"
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
JSONObject
resultJson
=
JSONObject
.
parseObject
(
result
);
title
=
resultJson
.
getString
(
"title"
)!=
null
?
resultJson
.
getString
(
"title"
):
resultJson
.
getString
(
"message"
);
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"div.adiv > p > span"
).
text
().
replaceAll
(
" "
,
""
);
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"title"
).
text
().
replaceAll
(
" "
,
""
);
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
)
{
title
=
doc
.
select
(
"h1"
).
text
().
replaceAll
(
" "
,
""
);
}
//若title 为拿到 用 此方法
if
(
Objects
.
isNull
(
title
)
||
title
.
length
()
<
1
||
result
.
length
()
<
200
)
{
title
=
"网页已删除"
;
}
if
(
Objects
.
nonNull
(
title
)
&&
title
.
length
()
>
1
){
return
new
UrlLiveBean
(
url
,
isDelete
(
title
),
title
);
}
else
{
return
null
;
}
}
catch
(
Exception
e
)
{
return
null
;
}
}
/**
*
* @Description 标题判断
* @param title
* @return
*/
private
boolean
isDelete
(
String
title
)
{
List
<
String
>
eList
=
Arrays
.
asList
(
"系统出错"
,
"该内容已被发布者删除"
,
"网页已删除"
,
"此帐号已自主注销,内容无法查看"
,
"页面提示"
,
"正在维护中"
,
"此文章被第三方评估为不实信息"
,
"财经头条"
,
"知识100题"
,
"502BadGateway"
,
"提示信息"
,
"跳转页"
,
"跳转中..."
,
"此帐号在冻结期,内容无法查看"
,
"东北新闻网"
,
"百度一下,你就知道"
,
"帐号已迁移"
,
"手机百度"
,
"内容被删除"
,
"亚博国际|首页"
,
"中国软件网"
,
"云广网"
,
"新浪首页"
,
"文章暂时找不到了"
,
"-法易网"
,
"【一点资讯】www.yidianzixun.com"
,
"错误页面"
,
"网站暂停通知"
,
"【快资讯】你的专属资讯平台"
,
"百度新闻——全球最大的中文新闻平台"
,
"以上文章由以下机构判定为不实信息"
,
"该公众号已迁移"
,
"财经网-CAIJING.COM.CN"
,
"蚂蚁资讯"
,
"参数错误"
,
"时尚头条_YOKA时尚网"
,
"该文章已经被删除"
,
"网易"
,
"链接已过期"
,
"找不到页面"
,
"今晚网"
,
"该文章已被删除"
,
"该回答已被删除-知乎"
,
"资源不存在"
);
List
<
String
>
cList
=
Arrays
.
asList
(
"提示信息-"
,
"此内容因违规无法查看"
,
"微信公众号不存在"
,
"此内容被投诉且经审核涉嫌侵权,无法查看"
,
"thepageyourequestedwasnotfound"
,
"未知错误"
,
"Objectmoved"
,
"404"
,
"页面没有找到"
,
"页面未找到"
,
"301MovedPermanently"
,
"加载异常"
,
"此帐号已被屏蔽, 内容无法查看"
,
"链接不存在"
);
return
cList
.
stream
().
anyMatch
(
title:
:
contains
)
||
eList
.
stream
().
anyMatch
(
title:
:
equals
);
}
/**
* 处理知乎链接
*
* */
private
static
String
treatZhihuUrl
(
String
url
)
{
if
(
url
.
contains
(
"/answer/"
))
{
url
=
"https://api.zhihu.com/answers/"
+
url
.
replaceAll
(
".*/answer/"
,
""
);
}
else
if
(
url
.
contains
(
"/question/"
)
&&
!
url
.
contains
(
"/answer/"
))
{
url
=
"https://api.zhihu.com/questions/"
+
url
.
replaceAll
(
".*/question/"
,
""
);
}
else
if
(
url
.
contains
(
"/p/"
))
{
url
=
"https://api.zhihu.com/articles/"
+
url
.
replaceAll
(
".*/p/"
,
""
);
}
return
url
;
}
}
//package com.zhiwei.source_forward.crawler;
//
//import java.util.ArrayList;
//import java.util.Arrays;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//import java.util.Objects;
//import java.util.concurrent.Semaphore;
//
//import org.apache.logging.log4j.LogManager;
//import org.apache.logging.log4j.Logger;
//import org.jsoup.Jsoup;
//import org.jsoup.nodes.Document;
//
//import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.http.boot.HttpBoot;
//import com.zhiwei.http.util.RequestUtils;
//import com.zhiwei.source_forward.bean.UrlLiveBean;
//import com.zhiwei.task.async.TaskBoot;
//import com.zhiwei.task.sync.GroupSync;
//
//import okhttp3.Request;
//import okhttp3.Response;
//
//public class UrlLiveCrawlerNew {
//
// private static final Logger logger = LogManager.getLogger(UrlLiveCrawlerNew.class);
// private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).throwException(false).build();
// private static Semaphore semaphore = new Semaphore(5);
//
//
// public List<UrlLiveBean> judgeIsDelete(List<String> urlList) {
// GroupSync counter = new GroupSync();
// List<UrlLiveBean> ulbList = new ArrayList<>();
// urlList.forEach(url -> {
// try {
// counter.add();
// TaskBoot.blockingAsync(() -> {
// try {
// counter.add();
// UrlLiveBean ulb = dealUrlLive(url);
// if(Objects.nonNull(ulb)) {
// ulbList.add(ulb);
// }
// } catch (Exception e) {
// logger.error("链接是否删除新", e);
// } finally {
// counter.done();
// }
// });
// } catch (Exception e2) {
// logger.error("数据出错 {}" ,e2);
// } finally {
// counter.done();
// }
// });
// try {
// counter.await();
// } catch (InterruptedException e) {
// e.printStackTrace();
// }
// return ulbList;
// }
//
// private UrlLiveBean dealUrlLive(String url) {
// try {
// url = dealUrl(url);
// logger.info("当前处理 URL: {}", url);
// Map<String,String> headers = new HashMap<>();
//// Map<String,String> headers = HeaderTool.getCommonHead();
// if(url.contains("www.toutiao.com")){
// headers.put("referer", url);
// }else if(url.contains("zhihu.com")) {
// url = treatZhihuUrl(url);
// }
// Request request = RequestUtils.wrapGet(url, headers);
// int code = 404;
// for(int i = 0; i < 2; i++) {
// try (Response response = httpBoot.syncCall(request)){
// if(response.isSuccessful()) {
// return matchDel(response.body().string(), url);
// }else {
// code = response.code();
// }
// } catch (Exception e) {
// logger.error("解析", e);
// }
// }
// if(code == 403){
// return callBack(url, -1, String.valueOf(code));
// }else {
// return callBack(url, 1, String.valueOf(code));
// }
// } catch (Exception e) {
// e.printStackTrace();
// return null;
// }
// }
//
// private UrlLiveBean callBack(String url,int i,String title) {
// if(i == 1) {
// return new UrlLiveBean(url, true, title);
// }else {
// return new UrlLiveBean(url, i, title);
// }
// }
//
// private String dealUrl(String url) {
// try {
// if(url.contains("toutiao.com")) {
// if(url.contains("www.toutiao.com")) {
//
// }else {
// url = url.replace("toutiao.com", "www.toutiao.com");
// }
// if(url.contains("https")) {
//
// }else {
// url = url.replace("http", "https");
// }
// if(url.contains("group")) {
// url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
// }
// }else if(url.contains("mp.weixin.qq.com")) {
// if(url.contains("https")) {
//
// }else {
// url = url.replace("http", "https");
// }
// }else if(url.contains("a.mp.uc.cn") && url.contains("wm_aid")) {
// url = "http://ff.dayu.com/contents/origin/"+url.split("wm_aid=")[1].split("!!wm_id")[0]+"?biz_id=1002&_fetch_author=1&_fetch_incrs=1";
// }
// return url;
// } catch (Exception e) {
// return url;
// }
// }
//
// /***
// * @Title: matchDel
// * @author hero
// * @Description: 验证链接是否有效
// * @param @param page
// * @param @return 设定文件
// * @return boolean 返回类型
// */
// public UrlLiveBean matchDel(String result,String url){
// try {
// Document doc = Jsoup.parse(result);
// String title = null;
// if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com") || url.contains("weixin.sogou.com")){
// title = doc.select("h2.rich_media_title").text().replaceAll(" ", "");
// if(Objects.isNull(title) || title.isEmpty()) {
// title = doc.select("p.title").text();
// }
// if(Objects.isNull(title) || title.isEmpty()) {
// title = doc.select("h3.msg-title").text();
// }
// if(Objects.isNull(title) || title.isEmpty()) {
// title = doc.select("div.global_error_msg.warn").text();
// }
// if(Objects.isNull(title) || title.isEmpty()) {
// title = doc.select("p.tips").text();
// }
// if(Objects.isNull(title) || title.isEmpty()) {
// title = doc.select("h2").text();
// }
// }else if(url.contains("kuaibao")){
// title = doc.select("p.title").text().replaceAll(" ", "");
// }else if(url.contains("chinadaily.com.cn")){
// title = doc.select("p.style1").text().replaceAll(" ", "");
// }else if(url.contains("baidu.com") || url.contains("hao123.com")) {
// title = doc.select("p#contaniner").text();
// }else if(url.contains("kanfanews.com")) {
// title = doc.select("p#tit").text();
// }else if(url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) {
// title = "网页已删除";
// }else if(url.contains("a.mp.uc.cn")) {
// try {
// JSONObject json = JSONObject.parseObject(result);
// title = json.getJSONObject("data").getString("title");
// if(Objects.isNull(title) || title.length() < 1) {
// title = "网页已删除";
// }
// } catch (Exception e) {
// logger.error(" uc 数据 json 转换失败", e);
// }
// }else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) {
// title = "网页已删除";
// }else if(url.contains("zhihu.com")) {
// JSONObject resultJson = JSONObject.parseObject(result);
//
// title = resultJson.getString("title")!=null?resultJson.getString("title"):resultJson.getString("message");
// }
//
// //若title 为拿到 用 此方法
// if(Objects.isNull(title) || title.length() < 1) {
// title = doc.select("div.adiv > p > span").text().replaceAll(" ", "");
// }
//
// //若title 为拿到 用 此方法
// if(Objects.isNull(title) || title.length() < 1) {
// title = doc.select("title").text().replaceAll(" ", "");
// }
//
// //若title 为拿到 用 此方法
// if(Objects.isNull(title) || title.length() < 1) {
// title = doc.select("h1").text().replaceAll(" ", "");
// }
//
// //若title 为拿到 用 此方法
// if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
// title = "网页已删除";
// }
//
// if(Objects.nonNull(title) && title.length() > 1){
// return new UrlLiveBean(url, isDelete(title),title);
// } else {
// return null;
// }
// } catch (Exception e) {
// return null;
// }
// }
//
// /**
// *
// * @Description 标题判断
// * @param title
// * @return
// */
// private boolean isDelete(String title) {
// List<String> eList = Arrays.asList("系统出错","该内容已被发布者删除","网页已删除"
// ,"此帐号已自主注销,内容无法查看","页面提示","正在维护中"
// ,"此文章被第三方评估为不实信息","财经头条","知识100题","502BadGateway"
// ,"提示信息","跳转页","跳转中...","此帐号在冻结期,内容无法查看","东北新闻网"
// ,"百度一下,你就知道","帐号已迁移","手机百度","内容被删除","亚博国际|首页"
// ,"中国软件网","云广网","新浪首页","文章暂时找不到了","-法易网"
// ,"【一点资讯】www.yidianzixun.com","错误页面","网站暂停通知","【快资讯】你的专属资讯平台"
// ,"百度新闻——全球最大的中文新闻平台","以上文章由以下机构判定为不实信息","该公众号已迁移"
// ,"财经网-CAIJING.COM.CN","蚂蚁资讯","参数错误","时尚头条_YOKA时尚网","该文章已经被删除"
// ,"网易","链接已过期","找不到页面","今晚网","该文章已被删除", "该回答已被删除-知乎", "资源不存在");
//
// List<String> cList = Arrays.asList("提示信息-","此内容因违规无法查看","微信公众号不存在"
// ,"此内容被投诉且经审核涉嫌侵权,无法查看","thepageyourequestedwasnotfound","未知错误"
// ,"Objectmoved","404","页面没有找到","页面未找到","301MovedPermanently","加载异常",
// "此帐号已被屏蔽, 内容无法查看","链接不存在");
//
// return cList.stream().anyMatch(title::contains) || eList.stream().anyMatch(title::equals);
// }
//
//
// /**
// * 处理知乎链接
// *
// * */
// private static String treatZhihuUrl(String url) {
// if(url.contains("/answer/")) {
// url = "https://api.zhihu.com/answers/" + url.replaceAll(".*/answer/", "");
// }else if(url.contains("/question/") && !url.contains("/answer/")) {
// url = "https://api.zhihu.com/questions/" + url.replaceAll(".*/question/", "");
// }else if(url.contains("/p/")) {
// url = "https://api.zhihu.com/articles/" + url.replaceAll(".*/p/", "");
// }
// return url;
// }
//
//
//}
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
View file @
2c9d4fa2
...
...
@@ -80,7 +80,7 @@ public class SourceForward {
public
static
void
main
(
String
[]
args
)
{
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http
://gu.qq.com/resources/shy/news/detail-v2/index.html?#/index?id=SN202006091653447945411f
"
);
urlList
.
add
(
"http
s://ypstatic.cnnb.com.cn/yppage-share/news/share/news_detail?newsId=627223d9e4b042b45e211c5a
"
);
List
<
SourceForwardBean
>
da
=
SourceForward
.
getSourceForward
(
urlList
);
for
(
SourceForwardBean
sfb
:
da
)
{
System
.
out
.
println
(
"=============="
+
sfb
.
toString
());
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
2c9d4fa2
...
...
@@ -98,7 +98,7 @@ public class URLLive {
};
crawler
.
submitTask
(
callback
,
urlList
.
toArray
(
new
String
[
urlList
.
size
()])).
await
();
}
catch
(
Exception
e
){
logger
.
error
(
" 数据采集运行有问题
{}
"
,
e
);
logger
.
error
(
" 数据采集运行有问题 "
,
e
);
}
return
list
;
}
...
...
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
View file @
2c9d4fa2
package
com
.
zhiwei
.
source_forward
.
util
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.kohlschutter.boilerpipe.extractors.ArticleExtractor
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Extracts the main article text from a fetched HTML page.
 *
 * <p>WeChat ({@code weixin.qq.com}) and Toutiao ({@code toutiao.com}) URLs are first tried with
 * site-specific rules. Every other page, or a site-specific result shorter than
 * {@link #MIN_CONTENT_LENGTH} characters, falls back to boilerpipe's {@code ArticleExtractor}.
 *
 * @author hero
 * @date 2018-06-30 10:27:58
 */
public class MatchContent {

    private static final Logger logger = LoggerFactory.getLogger(MatchContent.class);

    /** Site-specific results shorter than this are treated as a failed extraction. */
    private static final int MIN_CONTENT_LENGTH = 10;

    /**
     * Captures whatever follows {@code content:} up to the first {@code ',} in a Toutiao page.
     * Compiled once because {@link Pattern} instances are immutable and reusable.
     */
    private static final Pattern TOUTIAO_CONTENT = Pattern.compile("content:(.*?)',");

    /**
     * Extracts the article body text of a page.
     *
     * @param url  the page URL; only used to choose the site-specific rule
     * @param html the raw HTML of the page
     * @return the extracted text after {@code ZhiWeiTools.delHTMLTag} has been applied, or
     *         {@code null} when either argument is {@code null} or extraction fails
     */
    public static String matchContent(String url, String html) {
        // Explicit guard instead of relying on a caught NullPointerException further down.
        if (url == null || html == null) {
            logger.warn("matchContent called with null url or html, url={}", url);
            return null;
        }
        try {
            String content = null;
            if (url.contains("weixin.qq.com")) {
                content = matchContentWeixin(html);
            } else if (url.contains("toutiao.com")) {
                content = matchContentToutiao(html);
            }
            // No site rule applied, or it produced too little text: use the generic extractor.
            if (content == null || content.length() < MIN_CONTENT_LENGTH) {
                content = extractGenericContent(html);
            }
            return ZhiWeiTools.delHTMLTag(content);
        } catch (Exception e) {
            logger.error("获取全文失败", e);
            return null;
        }
    }

    /**
     * Extracts the Toutiao article body embedded in the page source.
     *
     * @param html the raw HTML of the page
     * @return the first captured {@code content:} value, or {@code null} when there is no match
     */
    private static String matchContentToutiao(String html) {
        Matcher matcher = TOUTIAO_CONTENT.matcher(html);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Extracts the text of a WeChat article.
     *
     * <p>The first marker found in the raw HTML decides which element is read:
     * {@code div#js_article}, then {@code div#js_share_content}, then the
     * {@code script#content_tpl} template (whose inner HTML is parsed again to obtain text).
     *
     * @param contentHtml the raw HTML of the page
     * @return the selected element's text, or an empty string when no marker is present or
     *         parsing fails
     */
    private static String matchContentWeixin(String contentHtml) {
        try {
            Document document = Jsoup.parse(contentHtml);
            if (contentHtml.contains("js_article")) {
                return document.select("div#js_article").text();
            }
            if (contentHtml.contains("js_share_content")) {
                return document.select("div#js_share_content").text();
            }
            if (contentHtml.contains("content_tpl")) {
                String template = document.select("script#content_tpl").html();
                return Jsoup.parse(template).text();
            }
        } catch (Exception e) {
            // The throwable is passed as the last argument without a "{}" placeholder so that
            // SLF4J logs the stack trace and no literal "{}" is left in the message.
            logger.error("微信全文解析出错", e);
        }
        return "";
    }

    /**
     * Generic body extraction used when no site-specific rule produced enough text.
     *
     * <p>Original note (translated): the body is extracted so that matching rules are not
     * affected by text outside the article body.
     *
     * @param html the raw HTML of the page
     * @return the text returned by {@code ArticleExtractor}; if that extractor throws, the text
     *         of the whole parsed document
     */
    private static String extractGenericContent(String html) {
        try {
            return ArticleExtractor.getInstance().getText(html);
        } catch (Exception e) {
            logger.error("正文抽取失败,获取全文文本:", e);
            // The page is parsed only on this fallback path, not on every call.
            return Jsoup.parse(html).text();
        }
    }
}
package
com
.
zhiwei
.
source_forward
.
util
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.kohlschutter.boilerpipe.extractors.ArticleExtractor
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Extracts the main article text from a fetched HTML page.
 *
 * <p>WeChat ({@code weixin.qq.com}) and Toutiao ({@code toutiao.com}) URLs are first tried with
 * site-specific rules. Every other page, or a site-specific result shorter than
 * {@link #MIN_CONTENT_LENGTH} characters, falls back to boilerpipe's {@code ArticleExtractor}.
 *
 * @author hero
 * @date 2018-06-30 10:27:58
 */
public class MatchContent {

    private static final Logger logger = LoggerFactory.getLogger(MatchContent.class);

    /** Site-specific results shorter than this are treated as a failed extraction. */
    private static final int MIN_CONTENT_LENGTH = 10;

    /**
     * Captures whatever follows {@code content:} up to the first {@code ',} in a Toutiao page.
     * Compiled once because {@link Pattern} instances are immutable and reusable.
     */
    private static final Pattern TOUTIAO_CONTENT = Pattern.compile("content:(.*?)',");

    /**
     * Extracts the article body text of a page.
     *
     * @param url  the page URL; only used to choose the site-specific rule
     * @param html the raw HTML of the page
     * @return the extracted text after {@code ZhiWeiTools.delHTMLTag} has been applied, or
     *         {@code null} when either argument is {@code null} or extraction fails
     */
    public static String matchContent(String url, String html) {
        // Explicit guard instead of relying on a caught NullPointerException further down.
        if (url == null || html == null) {
            logger.warn("matchContent called with null url or html, url={}", url);
            return null;
        }
        try {
            String content = null;
            if (url.contains("weixin.qq.com")) {
                content = matchContentWeixin(html);
            } else if (url.contains("toutiao.com")) {
                content = matchContentToutiao(html);
            }
            // No site rule applied, or it produced too little text: use the generic extractor.
            if (content == null || content.length() < MIN_CONTENT_LENGTH) {
                content = extractGenericContent(html);
            }
            return ZhiWeiTools.delHTMLTag(content);
        } catch (Exception e) {
            logger.error("获取全文失败", e);
            return null;
        }
    }

    /**
     * Extracts the Toutiao article body embedded in the page source.
     *
     * @param html the raw HTML of the page
     * @return the first captured {@code content:} value, or {@code null} when there is no match
     */
    private static String matchContentToutiao(String html) {
        Matcher matcher = TOUTIAO_CONTENT.matcher(html);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Extracts the text of a WeChat article.
     *
     * <p>The first marker found in the raw HTML decides which element is read:
     * {@code div#js_article}, then {@code div#js_share_content}, then the
     * {@code script#content_tpl} template (whose inner HTML is parsed again to obtain text).
     *
     * @param contentHtml the raw HTML of the page
     * @return the selected element's text, or an empty string when no marker is present or
     *         parsing fails
     */
    private static String matchContentWeixin(String contentHtml) {
        try {
            Document document = Jsoup.parse(contentHtml);
            if (contentHtml.contains("js_article")) {
                return document.select("div#js_article").text();
            }
            if (contentHtml.contains("js_share_content")) {
                return document.select("div#js_share_content").text();
            }
            if (contentHtml.contains("content_tpl")) {
                String template = document.select("script#content_tpl").html();
                return Jsoup.parse(template).text();
            }
        } catch (Exception e) {
            // The throwable is passed as the last argument without a "{}" placeholder so that
            // SLF4J logs the stack trace and no literal "{}" is left in the message.
            logger.error("微信全文解析出错", e);
        }
        return "";
    }

    /**
     * Generic body extraction used when no site-specific rule produced enough text.
     *
     * <p>Original note (translated): the body is extracted so that matching rules are not
     * affected by text outside the article body.
     *
     * @param html the raw HTML of the page
     * @return the text returned by {@code ArticleExtractor}; if that extractor throws, the text
     *         of the whole parsed document
     */
    private static String extractGenericContent(String html) {
        try {
            return ArticleExtractor.getInstance().getText(html);
        } catch (Exception e) {
            logger.error("正文抽取失败,获取全文文本:", e);
            // The page is parsed only on this fallback path, not on every call.
            return Jsoup.parse(html).text();
        }
    }
}
src/main/java/com/zhiwei/source_forward/util/ProxyInit.java
View file @
2c9d4fa2
package
com
.
zhiwei
.
source_forward
.
util
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.source_forward.config.ProxyConfig
;
import
org.apache.dubbo.config.ApplicationConfig
;
import
org.apache.dubbo.config.ConsumerConfig
;
import
org.apache.dubbo.config.RegistryConfig
;
import
com.zhiwei.http.proxy.CynomysFactory
;
import
com.zhiwei.network.cynomys.consumer.CynomysConsumer
;
import
com.zhiwei.network.cynomys.consumer.CynomysConsumerFactory
;
/**
* 初始化代理
...
...
@@ -16,10 +20,18 @@ public class ProxyInit {
* void
*/
// Initializes proxy access. As rendered here the body does two things in sequence:
//   1. calls ProxyFactory.init with a SimpleConfig built from ProxyConfig.registry,
//      ProxyConfig.proxyid, ProxyConfig.group and the app name "xumiaoxin";
//   2. builds Dubbo Application/Registry/Consumer configs, creates a CynomysConsumer from them
//      plus a username/password pair, and passes it to CynomysFactory.init.
// NOTE(review): this text is taken from a rendered diff hunk; step 1 is presumably the removed
// side and step 2 the added side of the commit — confirm against the repository before
// treating both as live code.
// NOTE(review): the registry address, consumer group and account credentials below are
// hard-coded string literals; consider moving them to configuration and keeping the
// password out of source control.
public
static
void
initProxy
()
{
String
address
=
ProxyConfig
.
registry
;
String
appName
=
"xumiaoxin"
;
long
appId
=
ProxyConfig
.
proxyid
;
ProxyFactory
.
init
(
SimpleConfig
.
builder
().
registry
(
address
).
appName
(
appName
).
appId
(
appId
).
group
(
ProxyConfig
.
group
).
build
());
ApplicationConfig
applicationConfig
=
new
ApplicationConfig
();
applicationConfig
.
setName
(
"actool"
);
RegistryConfig
registryConfig
=
new
RegistryConfig
();
registryConfig
.
setAddress
(
"zookeeper://192.168.0.30:2181"
);
ConsumerConfig
consumerConfig
=
new
ConsumerConfig
();
// Set the consumer group
consumerConfig
.
setGroup
(
"local"
);
String
username
=
"18271694195"
;
String
password
=
"Zhiwei289"
;
// Create the consumer; applicationConfig is an optional parameter
CynomysConsumer
consumer
=
CynomysConsumerFactory
.
create
(
applicationConfig
,
registryConfig
,
consumerConfig
,
username
,
password
);
CynomysFactory
.
init
(
consumer
);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment