Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
554dd201
Commit
554dd201
authored
Mar 22, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
增加自媒体 创业家 科技讯 爱尖刀 来源获取
parent
7f4a87a2
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
25 additions
and
19 deletions
+25
-19
pom.xml
+1
-1
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+1
-0
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+3
-16
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
+1
-1
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+19
-1
No files found.
pom.xml
View file @
554dd201
...
...
@@ -29,7 +29,7 @@
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.3.
0
-RELEASE
</version>
<version>
0.3.
1
-RELEASE
</version>
<scope>
provided
</scope>
</dependency>
</dependencies>
...
...
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
554dd201
...
...
@@ -92,6 +92,7 @@ public class MediaSelfSourceCrawler {
if
(
url
.
contains
(
"toutiao.com"
))
{
map
.
put
(
"referer"
,
url
);
}
map
.
put
(
"Connection"
,
"close"
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
map
);
counter
.
add
();
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
554dd201
...
...
@@ -22,8 +22,6 @@ import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
okhttp3.Request
;
/**
*
* @ClassName UrlLiveCrawler
...
...
@@ -35,17 +33,12 @@ import okhttp3.Request;
public
class
UrlLiveCrawler
{
private
static
final
Logger
logger
=
LogManager
.
getLogger
(
UrlLiveCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
(
false
,
2
);
public
GroupSync
submitTask
(
UrlLiveDataCallback
callback
,
String
...
urls
)
{
try
{
GroupSync
counter
=
new
GroupSync
();
start
(
counter
,
callback
,
urls
);
return
counter
;
}
catch
(
Exception
e
)
{
logger
.
error
(
" 判断链接是否删除 {} "
,
e
);
return
null
;
}
}
private
void
start
(
GroupSync
counter
,
UrlLiveDataCallback
callback
,
String
...
urls
)
{
...
...
@@ -72,9 +65,8 @@ public class UrlLiveCrawler {
if
(
url
.
contains
(
"www.toutiao.com"
)){
headers
.
put
(
"referer"
,
url
);
}
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
counter
.
add
();
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
RequestUtils
.
wrapGet
(
url
,
headers
)
,
ProxyHolder
.
NAT_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
rs
.
code
()
==
200
)
{
...
...
@@ -90,7 +82,7 @@ public class UrlLiveCrawler {
}
else
{
if
(
attr
.
getCount
()
>
3
)
{
callBack
(
callback
,
attr
,
-
1
,
null
);
logger
.
info
(
"
{} 搜索结果访问失败: {}"
,
request
.
url
().
url
()
,
ex
);
logger
.
info
(
"
搜索结果访问失败: {}"
,
ex
);
}
else
{
attr
.
AddCount
();
search
(
counter
,
attr
.
getAttr
().
toString
(),
attr
,
callback
);
...
...
@@ -122,7 +114,6 @@ public class UrlLiveCrawler {
private
String
dealUrl
(
String
url
)
{
try
{
if
(
url
.
contains
(
"toutiao.com"
))
{
try
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
}
else
{
...
...
@@ -136,10 +127,6 @@ public class UrlLiveCrawler {
if
(
url
.
contains
(
"group"
))
{
url
=
"https://www.toutiao.com/a"
+
url
.
split
(
"/"
)[
4
]
+
"/"
;
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"url 解析出错 "
,
e
);
return
url
;
}
}
else
if
(
url
.
contains
(
"mp.weixin.qq.com"
))
{
if
(
url
.
contains
(
"https"
))
{
...
...
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
View file @
554dd201
...
...
@@ -25,7 +25,7 @@ public class MediaSelfSource {
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http
s://www.toutiao.com/a6669697912458445059/
"
);
urlList
.
add
(
"http
://dy.163.com/v2/article/detail/EANTKV6H0512ES8F.html
"
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
for
(
MediaSelfSourceBean
b
:
u
)
{
System
.
out
.
println
(
b
.
toString
());
...
...
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
554dd201
...
...
@@ -226,7 +226,7 @@ public class MatchSource {
}
}
else
if
(
url
.
contains
(
"dy.163.com"
)){
//网易订阅-网易号
source
=
document
.
select
(
"div.colum_info
>
h4"
).
text
();
source
=
document
.
select
(
"div.colum_info
>
h4"
).
text
();
if
(
source
!=
null
&&
source
.
length
()>
1
){
source
=
"网易号-"
+
source
;
}
...
...
@@ -260,6 +260,24 @@ public class MatchSource {
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
source
=
"今日潮闻-"
+
source
;
}
}
else
if
(
url
.
contains
(
"ijiandao.com"
)){
source
=
document
.
select
(
"div.article-author > span.author-name > a"
)
.
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
source
=
"爱尖刀-"
+
source
;
}
}
else
if
(
url
.
contains
(
"chuangyejia.com"
)){
source
=
document
.
select
(
"div.article-title > ul.article-author > li:nth-child(1)"
)
.
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
source
=
"创业家-"
+
source
;
}
}
else
if
(
url
.
contains
(
"kejixun.com"
)){
source
=
document
.
select
(
"div.r-box.r-box-none.clearfix > div > div.big-man > h3 > a"
)
.
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
source
=
"科技讯-"
+
source
;
}
}
return
source
;
}
catch
(
Exception
e
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment