Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
0e4b6f49
Commit
0e4b6f49
authored
Aug 22, 2019
by
win 10
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
新增界面新闻、亿欧网、蓝鲸、蓝鲸财经、虎嗅、连线家六个自媒体来源的匹配
parent
364e507d
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
33 additions
and
13 deletions
+33
-13
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+33
-13
No files found.
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
0e4b6f49
...
...
@@ -174,7 +174,7 @@ public class MatchSource {
source
=
document
.
select
(
"div.new-content-info.clearfix"
).
select
(
"span"
).
text
().
replaceAll
(
".*作者:"
,
""
);
}
else
if
(
url
.
contains
(
"finance.eastmoney.com"
)){
//单独处理东方财富网
source
=
document
.
select
(
"div.source.data-source"
).
attr
(
"data-source"
)
.
toString
()
;
source
=
document
.
select
(
"div.source.data-source"
).
attr
(
"data-source"
);
}
else
if
(
url
.
contains
(
"emwap.eastmoney.com"
)){
//单独处理东方财富网客户端
source
=
document
.
select
(
"div.where"
).
select
(
"span.source"
).
attr
(
"title"
);
...
...
@@ -298,7 +298,7 @@ public class MatchSource {
}
else
if
(
url
.
contains
(
"stock.10jqka.com.cn"
)){
//单独处理重庆晨报
source
=
document
.
select
(
"span.label_nr"
).
text
();
}
else
if
(
url
.
contains
(
"jiemian.com"
)){
}
else
if
(
url
.
contains
(
"jiemian.com"
)
){
//单独处理界面新闻
// source = document.select("div.article-info").select("span").text().replaceAll(".*来源:| 字体[\\w\\W]*", "");
return
"界面新闻"
;
...
...
@@ -331,7 +331,7 @@ public class MatchSource {
}
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
toString
();
}
return
null
;
}
...
...
@@ -411,11 +411,9 @@ public class MatchSource {
}
else
if
(
url
.
contains
(
"cj.sina.com.cn"
)
||
url
.
contains
(
"finance.sina.cn"
)
||
url
.
contains
(
"tech.sina.cn"
)
||
url
.
contains
(
"news.sina.cn"
)){
source
=
document
.
select
(
"h2.weibo_user"
).
text
();
if
(
Objects
.
isNull
(
source
)
||
source
.
length
()
<
1
){
if
(
(
Objects
.
isNull
(
source
)
||
source
.
length
()
<
1
)
&&
html
.
contains
(
"<meta name=\"mediaid\""
)
){
//新浪科技头条号
if
(
html
.
contains
(
"<meta name=\"mediaid\""
)){
source
=
html
.
split
(
"<meta name=\"mediaid\" content=\""
)[
1
].
split
(
"\""
)[
0
].
trim
();
}
source
=
html
.
split
(
"<meta name=\"mediaid\" content=\""
)[
1
].
split
(
"\""
)[
0
].
trim
();
}
if
(
Objects
.
isNull
(
source
)
||
source
.
length
()
<
1
){
//新浪财经头条号
...
...
@@ -471,9 +469,6 @@ public class MatchSource {
if
(
source
!=
null
&&
source
.
length
()>
1
){
source
=
"北京时间-"
+
source
;
}
}
else
if
(
url
.
contains
(
"item.btime.com"
)){
//北京时间
source
=
document
.
select
(
"span.col cite"
).
text
();
}
else
if
(
url
.
contains
(
"mp.qq.com"
)){
source
=
document
.
select
(
"div#account_top > div.puin_text > div.pname"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
...
...
@@ -588,6 +583,31 @@ public class MatchSource {
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
source
=
"优酷-"
+
source
;
}
}
else
if
(
url
.
contains
(
"jiemian.com"
))
{
source
=
document
.
select
(
"div.article-info > p > span.author > a"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
)){
source
=
"界面新闻-"
+
source
;
}
}
else
if
(
url
.
contains
(
"iyiou.com"
))
{
source
=
document
.
select
(
"div#post_author > a"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
))
{
source
=
"亿欧网-"
+
source
;
}
}
else
if
(
url
.
contains
(
"lanjingtmt.com"
))
{
source
=
document
.
select
(
"div.scd-title > a:nth-child(2)"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
))
{
source
=
"蓝鲸-"
+
source
;
}
}
else
if
(
url
.
contains
(
"lanjinger.com"
))
{
source
=
document
.
select
(
"div.article_info > span.info.author_name"
).
text
().
replaceAll
(
".*编辑| "
,
""
);
if
(
source
!=
null
&&
!
source
.
equals
(
""
))
{
source
=
"蓝鲸财经-"
+
source
;
}
}
else
if
(
url
.
contains
(
"huxiu.com"
))
{
source
=
document
.
select
(
"div.article__author-info-box > a.article-author-info > span.author-info__username"
).
text
();
if
(
source
!=
null
&&
!
source
.
equals
(
""
))
{
source
=
"虎嗅-"
+
source
;
}
}
return
source
;
}
catch
(
Exception
e
)
{
...
...
@@ -712,7 +732,7 @@ public class MatchSource {
}
}
catch
(
Exception
e
)
{
System
.
out
.
println
(
"正文抽取失败处理........"
);
e
.
printStackTrace
();
e
.
toString
();
/***
* 匹配正文失败
* 匹配命中包含来源等规则的数据
...
...
@@ -758,7 +778,7 @@ public class MatchSource {
* 主要匹配 YYYY-MM-dd xx日报
* 或 xx日报 YYYY-MM-dd
***/
String
times
[]
=
htmlBody
.
split
(
timeSource
);
String
[]
times
=
htmlBody
.
split
(
timeSource
);
for
(
int
j
=
0
;
j
<
times
.
length
;
j
++)
{
String
timecontent
=
times
[
j
];
if
(
j
==
0
)
{
...
...
@@ -783,7 +803,7 @@ public class MatchSource {
}
return
null
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
toString
();
return
null
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment