Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
2c9d4fa2
Commit
2c9d4fa2
authored
Nov 02, 2022
by
chenweiyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
代理升级版本
parent
4860f41e
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
252 additions
and
192 deletions
+252
-192
pom.xml
+27
-5
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+15
-10
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+14
-9
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+14
-9
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+40
-29
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawlerNew.java
+0
-0
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
+1
-1
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+1
-1
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
+121
-121
src/main/java/com/zhiwei/source_forward/util/ProxyInit.java
+19
-7
No files found.
pom.xml
View file @
2c9d4fa2
...
...
@@ -3,13 +3,17 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<version>
0.3.
0
-SNAPSHOT
</version>
<version>
0.3.
1
-SNAPSHOT
</version>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
<properties>
<project.build.sourceEncoding>
UTF-8
</project.build.sourceEncoding>
<project.reporting.outputEncoding>
UTF-8
</project.reporting.outputEncoding>
<http-boot.version>
0.1.0.8-SNAPSHOT
</http-boot.version>
<task-boot.version>
1.1.2-SNAPSHOT
</task-boot.version>
<boilerpipe.version>
0.0.1-SNAPSHOT
</boilerpipe.version>
<conomys-consumer.version>
0.0.3-SNAPSHOT
</conomys-consumer.version>
</properties>
<developers>
...
...
@@ -30,12 +34,30 @@
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.
2.4
-SNAPSHOT
</version>
<version>
0.
4.5
-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.6.6.8-SNAPSHOT
</version>
<groupId>
com.kohlschutter.boilerpipe
</groupId>
<artifactId>
boilerpipe-extractor
</artifactId>
<version>
${boilerpipe.version}
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.http
</groupId>
<artifactId>
http-boot
</artifactId>
<version>
${http-boot.version}
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.async
</groupId>
<artifactId>
task-boot
</artifactId>
<version>
${task-boot.version}
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.network
</groupId>
<artifactId>
cynomys-consumer
</artifactId>
<version>
${conomys-consumer.version}
</version>
</dependency>
</dependencies>
...
...
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
View file @
2c9d4fa2
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
java.util.Objects
;
import
java.util.concurrent.Semaphore
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
import
com.zhiwei.source_forward.util.MatchContent
;
import
com.zhiwei.task.sync.GroupSync
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
public
class
ContentCrawler
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
ContentCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
2
).
build
();
private
static
Semaphore
semaphore
=
new
Semaphore
(
5
);
/**
*
* @Description 链接传入 并 返回采集完信号
...
...
@@ -51,9 +53,12 @@ public class ContentCrawler {
ZhiWeiTools
.
sleep
(
100
);
if
(
url
!=
null
)
{
try
{
semaphore
.
acquire
();
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
}
finally
{
semaphore
.
release
();
}
}
}
...
...
@@ -75,15 +80,15 @@ public class ContentCrawler {
Request
request
=
RequestUtils
.
wrapGet
(
url
);
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
Proxy
Hold
er
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
Proxy
ServerSuppli
er
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
parseHtml
(
rs
.
body
().
s
tring
(),
attr
,
callback
);
parseHtml
(
rs
.
body
S
tring
(),
attr
,
callback
);
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
}
}
catch
(
Exception
e
)
{
logger
.
info
(
"搜索结果访问失败:
{}
"
,
ex
);
logger
.
info
(
"搜索结果访问失败: "
,
ex
);
}
finally
{
counter
.
done
();
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
2c9d4fa2
...
...
@@ -4,22 +4,23 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.concurrent.Semaphore
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack
;
import
com.zhiwei.task.sync.GroupSync
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
...
...
@@ -34,8 +35,9 @@ import okhttp3.Request;
public
class
MediaSelfSourceCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
MediaSelfSourceCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
2
).
build
();
private
static
Semaphore
semaphore
=
new
Semaphore
(
5
);
/**
*
* @Description 链接传入 并 返回采集完信号
...
...
@@ -69,9 +71,12 @@ public class MediaSelfSourceCrawler {
counter
.
add
();
if
(
url
!=
null
)
{
try
{
semaphore
.
acquire
();
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
}
finally
{
semaphore
.
release
();
}
}
counter
.
done
();
...
...
@@ -91,7 +96,7 @@ public class MediaSelfSourceCrawler {
private
GroupSync
search
(
GroupSync
counter
,
String
url
,
Attribution
attr
,
MediaSelfSourceDataCallBack
callback
)
{
logger
.
info
(
"当前处理 URL: {}"
,
attr
.
get
());
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Proxy
Holder
ph
=
ProxyHold
er
.
NAT_HEAVY_PROXY
;
Proxy
ServerSupplier
ph
=
ProxyServerSuppli
er
.
NAT_HEAVY_PROXY
;
if
(
url
.
contains
(
"toutiao.com"
))
{
map
.
put
(
"referer"
,
url
);
}
...
...
@@ -104,7 +109,7 @@ public class MediaSelfSourceCrawler {
try
{
if
(
Objects
.
isNull
(
ex
))
{
try
{
parseHtml
(
rs
.
body
().
string
(),
attr
,
callback
,
rs
.
r
equest
().
url
().
uri
().
toString
());
parseHtml
(
rs
.
body
String
(),
attr
,
callback
,
rs
.
bootR
equest
().
url
().
uri
().
toString
());
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析出错"
,
e
);
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
View file @
2c9d4fa2
...
...
@@ -4,6 +4,7 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.concurrent.Semaphore
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
...
...
@@ -11,16 +12,16 @@ import org.jsoup.Jsoup;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Node
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
import
com.zhiwei.source_forward.util.MatchSource
;
import
com.zhiwei.source_forward.util.SourceData
;
import
com.zhiwei.source_forward.util.SourceForwardDataCallBack
;
import
com.zhiwei.task.sync.GroupSync
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
...
...
@@ -29,9 +30,10 @@ public class SourceForwardCrawler {
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
SourceForwardCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
new
Builder
().
retryTimes
(
2
).
build
();
private
static
List
<
String
>
sourceList
=
SourceData
.
getSourceList
();
private
static
Semaphore
semaphore
=
new
Semaphore
(
5
);
public
GroupSync
submitTask
(
SourceForwardDataCallBack
callback
,
String
...
urls
)
{
try
{
GroupSync
counter
=
new
GroupSync
();
...
...
@@ -50,10 +52,13 @@ public class SourceForwardCrawler {
ZhiWeiTools
.
sleep
(
100
);
if
(
url
!=
null
)
{
try
{
semaphore
.
acquire
();
search
(
counter
,
url
,
Attribution
.
of
(
url
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错"
,
e
);
}
}
finally
{
semaphore
.
release
();
}
}
counter
.
done
();
}
...
...
@@ -77,10 +82,10 @@ public class SourceForwardCrawler {
}
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
Proxy
Hold
er
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
Proxy
ServerSuppli
er
.
NAT_HEAVY_PROXY
,
true
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
parseHtml
(
rs
.
body
().
s
tring
(),
attr
,
callback
);
parseHtml
(
rs
.
body
S
tring
(),
attr
,
callback
);
}
else
{
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
ex
);
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
2c9d4fa2
package
com
.
zhiwei
.
source_forward
.
crawler
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.concurrent.Semaphore
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONPath
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.http.boot.HttpBoot
;
import
com.zhiwei.http.proxy.ProxyServerSupplier
;
import
com.zhiwei.http.util.RequestUtils
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.task.sync.GroupSync
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
java.util.*
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
okhttp3.Request
;
/**
* @author byte-zbs
...
...
@@ -32,8 +39,9 @@ import static java.util.Objects.nonNull;
public
class
UrlLiveCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
throwException
(
true
).
build
();
private
static
HttpBoot
httpBoot
=
HttpBoot
.
newBuilder
().
retryTimes
(
2
).
build
();
private
static
Semaphore
semaphore
=
new
Semaphore
(
5
);
public
GroupSync
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
GroupSync
counter
=
new
GroupSync
();
start
(
counter
,
callback
,
urls
);
...
...
@@ -43,17 +51,20 @@ public class UrlLiveCrawler {
private
void
start
(
GroupSync
counter
,
UrlLiveDataCallback
callback
,
String
...
urls
)
{
if
(
nonNull
(
urls
)
&&
urls
.
length
>
0
)
{
for
(
String
url
:
urls
)
{
counter
.
add
();
ZhiWeiTools
.
sleep
(
100
);
if
(
nonNull
(
url
))
{
try
{
try
{
counter
.
add
();
semaphore
.
acquire
();
ZhiWeiTools
.
sleep
(
200
);
if
(
nonNull
(
url
))
{
// ZhiWeiTools.sleep(3000);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错:"
,
e
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错:"
,
e
);
}
finally
{
counter
.
done
();
semaphore
.
release
();
}
counter
.
done
();
}
}
}
...
...
@@ -63,12 +74,12 @@ public class UrlLiveCrawler {
url
=
dealUrl
(
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
Proxy
Holder
ph
=
ProxyHold
er
.
NAT_HEAVY_PROXY
;
Proxy
ServerSupplier
ph
=
ProxyServerSuppli
er
.
NAT_HEAVY_PROXY
;
if
(
url
.
contains
(
"toutiao.com"
))
{
ph
=
Proxy
Hold
er
.
NAT_HEAVY_PROXY
;
ph
=
Proxy
ServerSuppli
er
.
NAT_HEAVY_PROXY
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
url
=
treatZhihuUrl
(
url
);
ph
=
Proxy
Hold
er
.
NAT_HEAVY_PROXY
;
ph
=
Proxy
ServerSuppli
er
.
NAT_HEAVY_PROXY
;
}
try
{
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
...
...
@@ -80,7 +91,7 @@ public class UrlLiveCrawler {
System
.
out
.
println
(
rs
.
code
());
if
(
Objects
.
isNull
(
ex
))
{
if
(
rs
.
isSuccessful
())
{
parseHtml
(
rs
.
body
().
s
tring
(),
attr
,
callback
);
parseHtml
(
rs
.
body
S
tring
(),
attr
,
callback
);
}
else
if
(
rs
.
code
()
==
404
)
{
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
}
else
{
...
...
@@ -91,7 +102,7 @@ public class UrlLiveCrawler {
callBack
(
callback
,
attr
,
-
1
,
"程序无法判断"
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
" 数据是否删除 采集出错
{}
"
,
e
);
logger
.
error
(
" 数据是否删除 采集出错 "
,
e
);
}
finally
{
counter
.
done
();
}
...
...
@@ -99,7 +110,7 @@ public class UrlLiveCrawler {
return
counter
;
}
}
catch
(
Exception
e2
)
{
logger
.
error
(
"数据出错
{}
"
,
e2
);
logger
.
error
(
"数据出错 "
,
e2
);
}
return
counter
;
}
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawlerNew.java
View file @
2c9d4fa2
This diff is collapsed.
Click to expand it.
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
View file @
2c9d4fa2
...
...
@@ -80,7 +80,7 @@ public class SourceForward {
public
static
void
main
(
String
[]
args
)
{
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http
://gu.qq.com/resources/shy/news/detail-v2/index.html?#/index?id=SN202006091653447945411f
"
);
urlList
.
add
(
"http
s://ypstatic.cnnb.com.cn/yppage-share/news/share/news_detail?newsId=627223d9e4b042b45e211c5a
"
);
List
<
SourceForwardBean
>
da
=
SourceForward
.
getSourceForward
(
urlList
);
for
(
SourceForwardBean
sfb
:
da
)
{
System
.
out
.
println
(
"=============="
+
sfb
.
toString
());
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
2c9d4fa2
...
...
@@ -98,7 +98,7 @@ public class URLLive {
};
crawler
.
submitTask
(
callback
,
urlList
.
toArray
(
new
String
[
urlList
.
size
()])).
await
();
}
catch
(
Exception
e
){
logger
.
error
(
" 数据采集运行有问题
{}
"
,
e
);
logger
.
error
(
" 数据采集运行有问题 "
,
e
);
}
return
list
;
}
...
...
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
View file @
2c9d4fa2
package
com
.
zhiwei
.
source_forward
.
util
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.kohlschutter.boilerpipe.extractors.ArticleExtractor
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Extracts the main body text of an article from its HTML page.
 *
 * <p>WeChat ({@code weixin.qq.com}) and Toutiao ({@code toutiao.com}) pages are tried with a
 * site-specific extractor first. Every other page, and any page whose site-specific result is
 * missing or shorter than {@link #MIN_CONTENT_LENGTH} characters, falls back to boilerpipe's
 * {@link ArticleExtractor}.
 *
 * @author hero
 * @date 2018-06-30 10:27:58
 */
public class MatchContent {

    private static final Logger logger = LoggerFactory.getLogger(MatchContent.class);

    /** A site-specific result shorter than this is treated as a failed extraction. */
    private static final int MIN_CONTENT_LENGTH = 10;

    /** Captures the value that follows {@code content:} in Toutiao's inline page script. */
    private static final Pattern TOUTIAO_CONTENT = Pattern.compile("content:(.*?)',");

    /**
     * Extracts the article body for the given page.
     *
     * @param url  page URL, used only to choose the site-specific extractor
     * @param html raw HTML of the page
     * @return the extracted text after it has been passed through
     *         {@code ZhiWeiTools.delHTMLTag}, or {@code null} when either argument is
     *         {@code null} or extraction fails
     */
    public static String matchContent(String url, String html) {
        // Check explicitly instead of relying on the catch block to absorb the resulting exception.
        if (url == null || html == null) {
            return null;
        }
        try {
            Document document = Jsoup.parse(html);
            String content = null;
            if (url.contains("weixin.qq.com")) {
                content = matchContentWeixin(html);
            } else if (url.contains("toutiao.com")) {
                content = matchContentToutiao(html);
            }
            // Fall back to generic extraction when the site-specific one found little or nothing.
            if (content == null || content.length() < MIN_CONTENT_LENGTH) {
                content = extractArticleText(html, document);
            }
            return ZhiWeiTools.delHTMLTag(content);
        } catch (Exception e) {
            logger.error("获取全文失败", e);
            return null;
        }
    }

    /**
     * Extracts the article body from a Toutiao page.
     *
     * @param html raw HTML of the page
     * @return the first capture of {@link #TOUTIAO_CONTENT}, or {@code null} when it does not match
     */
    private static String matchContentToutiao(String html) {
        Matcher matcher = TOUTIAO_CONTENT.matcher(html);
        // Only the first match is ever used.
        if (matcher.find()) {
            return matcher.group(1);
        }
        return null;
    }

    /**
     * Extracts the article text from a WeChat page.
     *
     * @param contentHtml raw HTML of the page
     * @return the text of the first recognised container ({@code div#js_article},
     *         {@code div#js_share_content} or {@code script#content_tpl}), or an empty string
     *         when none is present or parsing fails
     */
    private static String matchContentWeixin(String contentHtml) {
        try {
            Document document = Jsoup.parse(contentHtml);
            if (contentHtml.contains("js_article")) {
                return document.select("div#js_article").text();
            } else if (contentHtml.contains("js_share_content")) {
                return document.select("div#js_share_content").text();
            }
            if (contentHtml.contains("content_tpl")) {
                // The template script holds HTML markup, so parse it again to get plain text.
                String template = document.select("script#content_tpl").html();
                return Jsoup.parse(template).text();
            }
        } catch (Exception e) {
            // No "{}" placeholder: the throwable is passed as the exception argument.
            logger.error("微信全文解析出错", e);
        }
        return "";
    }

    /**
     * Generic article extraction, used so that matching runs on the article body only.
     *
     * @param html     raw HTML of the page
     * @param document the same page already parsed by jsoup, used as the fallback source
     * @return the text produced by {@link ArticleExtractor}; if that throws, the full text of
     *         {@code document}
     */
    private static String extractArticleText(String html, Document document) {
        try {
            return ArticleExtractor.getInstance().getText(html);
        } catch (Exception e) {
            logger.error("正文抽取失败,获取全文文本:", e);
            return document.text();
        }
    }
}
package
com
.
zhiwei
.
source_forward
.
util
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.kohlschutter.boilerpipe.extractors.ArticleExtractor
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Extracts the main body text of an article from its HTML page.
 *
 * <p>WeChat ({@code weixin.qq.com}) and Toutiao ({@code toutiao.com}) pages are tried with a
 * site-specific extractor first. Every other page, and any page whose site-specific result is
 * missing or shorter than {@link #MIN_CONTENT_LENGTH} characters, falls back to boilerpipe's
 * {@link ArticleExtractor}.
 *
 * @author hero
 * @date 2018-06-30 10:27:58
 */
public class MatchContent {

    private static final Logger logger = LoggerFactory.getLogger(MatchContent.class);

    /** A site-specific result shorter than this is treated as a failed extraction. */
    private static final int MIN_CONTENT_LENGTH = 10;

    /** Captures the value that follows {@code content:} in Toutiao's inline page script. */
    private static final Pattern TOUTIAO_CONTENT = Pattern.compile("content:(.*?)',");

    /**
     * Extracts the article body for the given page.
     *
     * @param url  page URL, used only to choose the site-specific extractor
     * @param html raw HTML of the page
     * @return the extracted text after it has been passed through
     *         {@code ZhiWeiTools.delHTMLTag}, or {@code null} when either argument is
     *         {@code null} or extraction fails
     */
    public static String matchContent(String url, String html) {
        // Check explicitly instead of relying on the catch block to absorb the resulting exception.
        if (url == null || html == null) {
            return null;
        }
        try {
            Document document = Jsoup.parse(html);
            String content = null;
            if (url.contains("weixin.qq.com")) {
                content = matchContentWeixin(html);
            } else if (url.contains("toutiao.com")) {
                content = matchContentToutiao(html);
            }
            // Fall back to generic extraction when the site-specific one found little or nothing.
            if (content == null || content.length() < MIN_CONTENT_LENGTH) {
                content = extractArticleText(html, document);
            }
            return ZhiWeiTools.delHTMLTag(content);
        } catch (Exception e) {
            logger.error("获取全文失败", e);
            return null;
        }
    }

    /**
     * Extracts the article body from a Toutiao page.
     *
     * @param html raw HTML of the page
     * @return the first capture of {@link #TOUTIAO_CONTENT}, or {@code null} when it does not match
     */
    private static String matchContentToutiao(String html) {
        Matcher matcher = TOUTIAO_CONTENT.matcher(html);
        // Only the first match is ever used.
        if (matcher.find()) {
            return matcher.group(1);
        }
        return null;
    }

    /**
     * Extracts the article text from a WeChat page.
     *
     * @param contentHtml raw HTML of the page
     * @return the text of the first recognised container ({@code div#js_article},
     *         {@code div#js_share_content} or {@code script#content_tpl}), or an empty string
     *         when none is present or parsing fails
     */
    private static String matchContentWeixin(String contentHtml) {
        try {
            Document document = Jsoup.parse(contentHtml);
            if (contentHtml.contains("js_article")) {
                return document.select("div#js_article").text();
            } else if (contentHtml.contains("js_share_content")) {
                return document.select("div#js_share_content").text();
            }
            if (contentHtml.contains("content_tpl")) {
                // The template script holds HTML markup, so parse it again to get plain text.
                String template = document.select("script#content_tpl").html();
                return Jsoup.parse(template).text();
            }
        } catch (Exception e) {
            // No "{}" placeholder: the throwable is passed as the exception argument.
            logger.error("微信全文解析出错", e);
        }
        return "";
    }

    /**
     * Generic article extraction, used so that matching runs on the article body only.
     *
     * @param html     raw HTML of the page
     * @param document the same page already parsed by jsoup, used as the fallback source
     * @return the text produced by {@link ArticleExtractor}; if that throws, the full text of
     *         {@code document}
     */
    private static String extractArticleText(String html, Document document) {
        try {
            return ArticleExtractor.getInstance().getText(html);
        } catch (Exception e) {
            logger.error("正文抽取失败,获取全文文本:", e);
            return document.text();
        }
    }
}
src/main/java/com/zhiwei/source_forward/util/ProxyInit.java
View file @
2c9d4fa2
package
com
.
zhiwei
.
source_forward
.
util
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.source_forward.config.ProxyConfig
;
import
org.apache.dubbo.config.ApplicationConfig
;
import
org.apache.dubbo.config.ConsumerConfig
;
import
org.apache.dubbo.config.RegistryConfig
;
import
com.zhiwei.http.proxy.CynomysFactory
;
import
com.zhiwei.network.cynomys.consumer.CynomysConsumer
;
import
com.zhiwei.network.cynomys.consumer.CynomysConsumerFactory
;
/**
* 初始化代理
...
...
@@ -16,10 +20,18 @@ public class ProxyInit {
* void
*/
// Initialises the proxy layer used by the crawlers. Takes no arguments and returns nothing.
// NOTE(review): this span comes from a commit diff and appears to contain both the earlier
// ProxyFactory/SimpleConfig initialisation and the newer Dubbo/Cynomys initialisation back to
// back - confirm which of the two is meant to be live.
public
static
void
initProxy
()
{
// Registry address, application name and application id for ProxyFactory; address and id are
// read from ProxyConfig, the application name is a literal.
String
address
=
ProxyConfig
.
registry
;
String
appName
=
"xumiaoxin"
;
long
appId
=
ProxyConfig
.
proxyid
;
ProxyFactory
.
init
(
SimpleConfig
.
builder
().
registry
(
address
).
appName
(
appName
).
appId
(
appId
).
group
(
ProxyConfig
.
group
).
build
());
// Dubbo application, registry and consumer settings handed to the Cynomys consumer below.
ApplicationConfig
applicationConfig
=
new
ApplicationConfig
();
applicationConfig
.
setName
(
"actool"
);
RegistryConfig
registryConfig
=
new
RegistryConfig
();
// NOTE(review): the ZooKeeper registry address is a hard-coded literal here, whereas the
// ProxyFactory path above reads its registry from ProxyConfig - consider doing the same.
registryConfig
.
setAddress
(
"zookeeper://192.168.0.30:2181"
);
ConsumerConfig
consumerConfig
=
new
ConsumerConfig
();
// Set the consumer group.
consumerConfig
.
setGroup
(
"local"
);
// NOTE(review): the account name and password are hard-coded string literals committed to
// source control - move them to external configuration and rotate the password.
String
username
=
"18271694195"
;
String
password
=
"Zhiwei289"
;
// Create the consumer; applicationConfig is an optional parameter.
CynomysConsumer
consumer
=
CynomysConsumerFactory
.
create
(
applicationConfig
,
registryConfig
,
consumerConfig
,
username
,
password
);
// Register the consumer with CynomysFactory.
CynomysFactory
.
init
(
consumer
);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment