Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
aa2a108b
Commit
aa2a108b
authored
Aug 29, 2018
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
头条获取修改
parent
b6fe1572
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
31 additions
and
9 deletions
+31
-9
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+1
-0
src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
+8
-8
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
+22
-1
No files found.
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
View file @
aa2a108b
...
...
@@ -12,6 +12,7 @@ import com.zhiwei.tools.httpclient.HttpBoot;
import
com.zhiwei.tools.httpclient.HttpRequestBuilder
;
import
com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter
;
import
okhttp3.Headers
;
import
okhttp3.Request
;
import
okhttp3.Response
;
...
...
src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
View file @
aa2a108b
...
...
@@ -55,14 +55,14 @@ public class ContentMatch {
return
dataList
;
}
public
static
void
main
(
String
[]
args
)
{
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"https://mp.weixin.qq.com/s?src=11×tamp=1535449515&ver=1088&signature=9kByOydse2KaausR0FP5HoQpSeSXs097LR-akxhJxfCV*onfJuoWkznZ8UEk5OfFox4aVzDqx0n0xwbtTm6KUzPpNz2desfNiQ4Uevp4LaTSyoH3OKysG2qxy2jisojb&new=1
"
);
List
<
ContentBean
>
l
=
getContentMatch
(
urlList
);
for
(
ContentBean
cb
:
l
)
{
System
.
out
.
println
(
cb
.
toString
());
}
}
//
public static void main(String[] args) {
//
List<String> urlList = new ArrayList<>();
// urlList.add("http://www.toutiao.com/a6571343464292680196/
");
//
List<ContentBean> l = getContentMatch(urlList);
//
for(ContentBean cb : l) {
// System.out.println(cb.getContent
());
//
}
//
}
static
class
ContentMatchCrawlerThread
extends
Thread
{
...
...
src/main/java/com/zhiwei/source_forward/util/MatchContent.java
View file @
aa2a108b
package
com
.
zhiwei
.
source_forward
.
util
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.slf4j.Logger
;
...
...
@@ -7,6 +10,7 @@ import org.slf4j.LoggerFactory;
import
com.zhiwei.source_forward.content.ContentExtractor
;
import
com.zhiwei.source_forward.content.News
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
* @ClassName: MatchContent
...
...
@@ -32,10 +36,12 @@ public class MatchContent {
Document
document
=
Jsoup
.
parse
(
html
);
if
(
url
.
contains
(
"weixin.qq.com"
))
{
content
=
matchContentWeixin
(
document
);
}
else
if
(
url
.
contains
(
"toutiao.com"
))
{
content
=
matchContentToutiao
(
html
);
}
else
{
content
=
mathchContent
(
html
,
document
);
}
return
content
;
return
ZhiWeiTools
.
delHTMLTag
(
content
)
;
}
catch
(
Exception
e
)
{
logger
.
debug
(
"获取全文失败"
,
e
.
fillInStackTrace
());
content
=
null
;
...
...
@@ -45,6 +51,21 @@ public class MatchContent {
/** Compiled once: text between {@code content:} and the next {@code ',} on the same line. */
private static final Pattern TOUTIAO_CONTENT_PATTERN = Pattern.compile("content:(.*?)',");

/**
 * Extracts the article body from the raw source of a Toutiao page.
 *
 * <p>Looks for the first occurrence of {@code content:} and captures, as
 * briefly as possible, everything up to the following {@code ',}. Because
 * {@code .} does not match line terminators here, the captured text never
 * spans more than one line. Only the first match is used.
 *
 * @param html raw page source; may be {@code null}
 * @return the text captured by the first match, or {@code null} when
 *         {@code html} is {@code null} or contains no match
 */
private static String matchContentToutiao(String html) {
    if (html == null) {
        // Nothing to search; report "not found" rather than throwing.
        return null;
    }
    Matcher matcher = TOUTIAO_CONTENT_PATTERN.matcher(html);
    return matcher.find() ? matcher.group(1) : null;
}
/**
*
* @Description 微信文本获取
* @param html
* @return
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment