Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
2d3871ad
Commit
2d3871ad
authored
May 24, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
部分自媒体解析修改和来源解析修改
parent
9557316d
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
18 additions
and
13 deletions
+18
-13
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+1
-1
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
+1
-1
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
+1
-1
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+15
-10
No files found.
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
2d3871ad
...
...
@@ -82,7 +82,7 @@ public class UrlLiveCrawler {
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
()));
}
}
else
{
callBack
(
callback
,
attr
,
1
,
String
.
valueOf
(
rs
.
code
())
);
callBack
(
callback
,
attr
,
1
,
"未访问成功"
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
" 数据是否删除 采集出错 {} "
,
e
);
...
...
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
View file @
2d3871ad
...
...
@@ -25,7 +25,7 @@ public class MediaSelfSource {
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http
://sh.qihoo.com/pc/9dcfa48989d33df34?cota=1&sign=360_e39369d1&refer_scene=so_3
"
);
urlList
.
add
(
"http
s://item.btime.com/m_9bf5d805a257ddc87
"
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
for
(
MediaSelfSourceBean
b
:
u
)
{
System
.
out
.
println
(
b
.
toString
());
...
...
src/main/java/com/zhiwei/source_forward/run/SourceForward.java
View file @
2d3871ad
...
...
@@ -81,7 +81,7 @@ public class SourceForward {
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http
s://www.jiemian.com/article/2782869.
html"
);
urlList
.
add
(
"http
://industry.caijing.com.cn/20190423/4582310.s
html"
);
List
<
SourceForwardBean
>
da
=
SourceForward
.
getSourceForward
(
urlList
);
for
(
SourceForwardBean
sfb
:
da
)
{
System
.
out
.
println
(
sfb
.
toString
());
...
...
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
2d3871ad
...
...
@@ -3,7 +3,6 @@ package com.zhiwei.source_forward.util;
import
java.util.List
;
import
java.util.Objects
;
import
org.checkerframework.checker.units.qual.s
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -101,10 +100,7 @@ public class MatchSource {
}
else
if
(
url
.
contains
(
"caijing.com.cn"
)){
//财经网产经
source
=
document
.
select
(
"#source_baidu"
).
text
();
}
else
if
(
url
.
contains
(
"news.eastday.com"
)){
}
else
if
(
url
.
contains
(
"news.eastday.com"
)){
//单独处理东方网
source
=
document
.
select
(
"div#sectionleft"
).
select
(
"div"
).
select
(
"p"
).
select
(
"a"
).
text
();
}
else
if
(
url
.
contains
(
"ny.chinacenn.com"
)){
...
...
@@ -217,7 +213,7 @@ public class MatchSource {
source
=
document
.
select
(
"div.tip.fl"
).
select
(
"a"
).
text
();
}
else
if
(
url
.
contains
(
"finance.jrj.com.cn"
)){
//单独处理金融界
source
=
document
.
select
(
"p.inftop"
).
select
(
"span"
).
select
(
"a"
).
text
().
replaceAll
(
"价值.*| "
,
""
);
source
=
document
.
select
(
"p.inftop"
).
select
(
"span"
).
get
(
1
).
select
(
"a"
).
text
().
replaceAll
(
"价值.*| "
,
""
);
}
else
if
(
url
.
contains
(
"tech.china.com.cn"
)){
//单独处理中国网
source
=
document
.
select
(
"span.fl.time2"
).
select
(
"a"
).
text
();
...
...
@@ -298,8 +294,12 @@ public class MatchSource {
source
=
document
.
select
(
"span.label_nr"
).
text
();
}
else
if
(
url
.
contains
(
"jiemian.com"
)){
//单独处理界面新闻
source
=
document
.
select
(
"div.article-info"
).
select
(
"span"
).
text
().
replaceAll
(
".*来源:| 字体[\\w\\W]*"
,
""
);
}
// source = document.select("div.article-info").select("span").text().replaceAll(".*来源:| 字体[\\w\\W]*", "");
return
"界面新闻"
;
}
else
if
(
url
.
contains
(
"finance.youth.cn"
)){
//单独处理中国青年网
source
=
document
.
select
(
"span#source_baidu"
).
text
().
replaceAll
(
"来源:|作者.*"
,
""
);
}
if
(
Objects
.
nonNull
(
source
)
&&
source
.
length
()
!=
0
)
{
return
source
;
...
...
@@ -339,9 +339,11 @@ public class MatchSource {
/***特定网站单独处理**/
if
(
url
.
contains
(
"toutiao.com"
)){
//今日头条帐号匹配
if
(
html
.
contains
(
"name: '"
)){
if
(
html
.
contains
(
"name: '"
)
&&
html
.
contains
(
"mediaInfo"
)
){
source
=
html
.
split
(
"mediaInfo:"
)[
1
].
split
(
"name: '"
)[
1
].
split
(
"',"
)[
0
].
trim
();
}
else
if
(
html
.
contains
(
"screen_name:"
)){
}
else
if
(
html
.
contains
(
"name: '"
)
&&
html
.
contains
(
"ugcInfo"
)){
source
=
html
.
split
(
"ugcInfo:"
)[
1
].
split
(
"name: '"
)[
1
].
split
(
"',"
)[
0
].
trim
();
}
else
if
(
html
.
contains
(
"screen_name:"
)){
source
=
html
.
split
(
"screen_name:'"
)[
1
].
split
(
"',"
)[
0
].
trim
();
}
if
(
source
!=
null
&&
source
.
length
()>
1
){
...
...
@@ -423,6 +425,9 @@ public class MatchSource {
else
if
(
url
.
contains
(
"item.btime.com"
)){
//北京时间
source
=
document
.
select
(
"a.author"
).
text
();
if
(
Objects
.
isNull
(
source
)
||
source
.
length
()
<
1
){
source
=
document
.
select
(
"div.content-info > span.col.cite"
).
text
();
}
if
(
source
!=
null
&&
source
.
length
()>
1
){
source
=
"北京时间-"
+
source
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment