Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
bd0353ac
Commit
bd0353ac
authored
Nov 09, 2020
by
chenweiyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
冲突
parent
eff378d9
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
36 additions
and
34 deletions
+36
-34
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+0
-1
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+36
-33
No files found.
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
bd0353ac
...
@@ -52,7 +52,6 @@ public class UrlLiveCrawler {
...
@@ -52,7 +52,6 @@ public class UrlLiveCrawler {
counter
.
add
();
counter
.
add
();
if
(
nonNull
(
url
))
{
if
(
nonNull
(
url
))
{
try
{
try
{
// ZhiWeiTools.sleep(3000);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
search
(
counter
,
url
,
Attribution
.
of
(
url
,
1
),
callback
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜索创建出错:"
,
e
);
logger
.
error
(
"搜索创建出错:"
,
e
);
...
...
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
bd0353ac
package
com
.
zhiwei
.
source_forward
.
util
;
package
com
.
zhiwei
.
source_forward
.
util
;
import
java.util.List
;
import
com.alibaba.fastjson.JSONObject
;
import
java.util.Objects
;
import
com.zhiwei.source_forward.content.ContentExtractor
;
import
java.util.regex.Matcher
;
import
com.zhiwei.source_forward.content.News
;
import
java.util.regex.Pattern
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
com.alibaba.fastjson.JSONObject
;
import
java.util.List
;
import
com.zhiwei.source_forward.content.ContentExtractor
;
import
java.util.Objects
;
import
com.zhiwei.source_forward.content.News
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
/**
/**
* @ClassName: MatchSource
* @ClassName: MatchSource
...
@@ -53,7 +52,8 @@ public class MatchSource {
...
@@ -53,7 +52,8 @@ public class MatchSource {
* 设定文件
* 设定文件
* @return String 返回类型
* @return String 返回类型
*/
*/
public
static
String
matchSource
(
String
url
,
String
html
,
List
<
String
>
sourceList
)
{
public
static
String
matchSource
(
String
url
,
String
html
,
List
<
String
>
sourceList
)
{
String
source
=
null
;
String
source
=
null
;
Document
document
=
Jsoup
.
parse
(
html
);
Document
document
=
Jsoup
.
parse
(
html
);
String
htmlBody
=
TreateData
.
filterSpecialCharacter
(
document
.
select
(
"body"
).
text
().
toUpperCase
());
String
htmlBody
=
TreateData
.
filterSpecialCharacter
(
document
.
select
(
"body"
).
text
().
toUpperCase
());
...
@@ -324,8 +324,19 @@ public class MatchSource {
...
@@ -324,8 +324,19 @@ public class MatchSource {
if
(
StringUtils
.
isNotBlank
(
source
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
source
.
replaceAll
(
".*来源:|)"
,
""
);
source
=
source
.
replaceAll
(
".*来源:|)"
,
""
);
}
}
}
else
if
(
url
.
contains
(
"gu.qq.com"
)){
source
=
document
.
select
(
"span#news_source"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
return
source
;
}
}
else
if
(
url
.
contains
(
"kandian.youth.cn"
)){
source
=
document
.
select
(
"div.fl > a"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
return
source
;
}
}
}
if
(
Objects
.
nonNull
(
source
)
&&
source
.
length
()
!=
0
)
{
if
(
Objects
.
nonNull
(
source
)
&&
source
.
length
()
!=
0
)
{
return
source
;
return
source
;
}
}
...
@@ -438,29 +449,18 @@ public class MatchSource {
...
@@ -438,29 +449,18 @@ public class MatchSource {
}
}
}
}
}
}
if
(
source
.
length
()
<
1
&&
html
.
contains
(
"window.__INITIAL_DATA__ ="
))
{
Matcher
ma
=
Pattern
.
compile
(
"window.__INITIAL_DATA__ =[\\s\\S]+?}}"
).
matcher
(
html
);
if
(
ma
.
find
())
{
String
result
=
ma
.
group
().
replaceAll
(
"window.__INITIAL_DATA__ =|\\</script\\>|"
,
""
).
trim
();
if
(
result
.
contains
(
"window.autohomePVDDWhiteList"
))
{
result
=
result
.
split
(
"window.autohomePVDDWhiteList"
)[
0
];
}
JSONObject
json
=
JSONObject
.
parseObject
(
result
.
trim
());
source
=
json
.
getJSONObject
(
"detail"
).
getString
(
"sec_src"
);
if
(
Objects
.
isNull
(
source
)
||
source
.
length
()
<
1
)
{
source
=
json
.
getJSONObject
(
"detail"
).
getString
(
"src"
);
}
}
}
if
(
Objects
.
nonNull
(
source
)
&&
source
.
length
()>
1
){
if
(
Objects
.
nonNull
(
source
)
&&
source
.
length
()>
1
){
source
=
"快资讯-"
+
source
;
source
=
"快资讯-"
+
source
;
}
}
}
else
if
(
url
.
contains
(
"
cj.sina.com.cn"
)
||
url
.
contains
(
"finance.sina.cn"
)
||
}
else
if
(
url
.
contains
(
"
k.sina.com.cn"
)
||
url
.
contains
(
"cj.sina.com.cn"
)
||
url
.
contains
(
"finance.sina.cn"
)
||
url
.
contains
(
"tech.sina.cn"
)
||
url
.
contains
(
"news.sina.cn"
)
||
url
.
contains
(
"k.sina.cn"
)){
url
.
contains
(
"tech.sina.cn"
)
||
url
.
contains
(
"news.sina.cn"
)
||
url
.
contains
(
"k.sina.cn"
)){
source
=
document
.
select
(
"h2.weibo_user"
).
text
();
source
=
document
.
select
(
"h2.weibo_user"
).
text
();
if
(
Objects
.
isNull
(
source
)
||
source
.
length
()
<
1
)
{
if
(
Objects
.
isNull
(
source
)
||
source
.
length
()
<
1
)
{
source
=
document
.
select
(
"#top_bar > div > div.date-source > a"
).
text
();
source
=
document
.
select
(
"#top_bar > div > div.date-source >span > a"
).
text
();
}
}
if
(
Objects
.
isNull
(
source
)
||
source
.
length
()
<
1
)
{
source
=
document
.
select
(
"#top_bar > div > div.date-source > a"
).
text
();
}
if
((
Objects
.
isNull
(
source
)
||
source
.
length
()
<
1
)
&&
html
.
contains
(
"<meta name=\"mediaid\""
)){
if
((
Objects
.
isNull
(
source
)
||
source
.
length
()
<
1
)
&&
html
.
contains
(
"<meta name=\"mediaid\""
)){
//新浪科技头条号
//新浪科技头条号
source
=
html
.
split
(
"<meta name=\"mediaid\" content=\""
)[
1
].
split
(
"\""
)[
0
].
trim
();
source
=
html
.
split
(
"<meta name=\"mediaid\" content=\""
)[
1
].
split
(
"\""
)[
0
].
trim
();
...
@@ -502,9 +502,12 @@ public class MatchSource {
...
@@ -502,9 +502,12 @@ public class MatchSource {
}
}
}
else
if
(
url
.
contains
(
"baijiahao.baidu.com"
)
||
url
.
contains
(
"mbd.baidu.com"
)){
}
else
if
(
url
.
contains
(
"baijiahao.baidu.com"
)
||
url
.
contains
(
"mbd.baidu.com"
)){
//百度百家
//百度百家
source
=
document
.
select
(
"span.userNameSpan"
).
text
();
if
(
StringUtils
.
isNotBlank
(
document
.
select
(
"span.userNameSpan"
).
text
())){
if
(
StringUtils
.
isBlank
(
source
)){
source
=
document
.
select
(
"span.userNameSpan"
).
text
();
}
else
if
(
StringUtils
.
isNotBlank
(
document
.
select
(
"p.author-name:nth-child(1)"
).
text
())){
source
=
document
.
select
(
"p.author-name:nth-child(1)"
).
text
();
source
=
document
.
select
(
"p.author-name:nth-child(1)"
).
text
();
}
else
if
(
StringUtils
.
isNotBlank
(
document
.
select
(
"a.authorName"
).
text
())){
source
=
document
.
select
(
"a.authorName"
).
text
();
}
}
if
(
StringUtils
.
isNotBlank
(
source
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"百度百家-"
+
source
;
source
=
"百度百家-"
+
source
;
...
@@ -783,8 +786,7 @@ public class MatchSource {
...
@@ -783,8 +786,7 @@ public class MatchSource {
if
(
StringUtils
.
isNotBlank
(
source
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"推酷-"
+
source
;
source
=
"推酷-"
+
source
;
}
}
}
}
else
if
(
url
.
contains
(
"36kr.com"
)){
if
(
url
.
contains
(
"36kr.com"
)){
source
=
document
.
select
(
"div.info-header-text > a.author-name"
).
text
();
source
=
document
.
select
(
"div.info-header-text > a.author-name"
).
text
();
if
(
StringUtils
.
isNotBlank
(
source
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
return
"36氪-"
+
source
;
return
"36氪-"
+
source
;
...
@@ -798,6 +800,7 @@ public class MatchSource {
...
@@ -798,6 +800,7 @@ public class MatchSource {
return
"36氪-"
+
source
;
return
"36氪-"
+
source
;
}
}
}
}
return
source
;
return
source
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
printStackTrace
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment