Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
8e2e2cc2
Commit
8e2e2cc2
authored
Mar 22, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
贴吧 增加 采集全文,百度知道增加 返回参数
parent
d12dad92
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
60 additions
and
1 deletions
+60
-1
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
+36
-0
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduZhidaoCrawlerParse.java
+22
-1
src/main/java/com/zhiwei/media_data_crawler/crawler/DoubanCrawlerParse.java
+2
-0
No files found.
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
View file @
8e2e2cc2
...
...
@@ -5,6 +5,7 @@ import java.util.ArrayList;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
...
...
@@ -29,6 +30,7 @@ import okhttp3.Response;
public
class
BaiduTiebaCrawlerParse
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
Logger
logger
=
LogManager
.
getLogger
(
BaiduTiebaCrawlerParse
.
class
);
/**
* @Title: getBaiduTiebaData
* @author hero
...
...
@@ -173,6 +175,16 @@ public class BaiduTiebaCrawlerParse {
return
resultMap
;
}
/**
 * Extracts the text of the first-floor post from the HTML of a Tieba thread page.
 *
 * @param result raw HTML of the thread page
 * @param url    address the HTML came from; not read by this method
 * @return the text of the elements matched by the first-floor content selector
 *         (an empty string when nothing matches)
 */
private static String ganalysisData(String result, String url) {
    // CSS path to the content container of the first post in the post list.
    String firstFloorSelector =
            "#j_p_postlist > div:nth-child(1) > div.d_post_content_main.d_post_content_firstfloor > div.p_content > cc > div.j_d_post_content";
    // Parse the page and pull out the matched text in one pass.
    return Jsoup.parse(result).select(firstFloorSelector).text();
}
/**
*
* @Description 百度贴吧获取时间
...
...
@@ -269,6 +281,7 @@ public class BaiduTiebaCrawlerParse {
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_PROXY
);
}
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
...
...
@@ -363,4 +376,27 @@ public class BaiduTiebaCrawlerParse {
}
return
url
;
}
/**
 * Downloads a Tieba thread page and returns the text that {@code ganalysisData}
 * extracts from it.
 *
 * <p>Everything from the first {@code ?} onward is removed from {@code url} before the
 * page is requested.
 *
 * @param url   thread address; may carry a query string, which is dropped
 * @param proxy not used by this method: {@code downloadHtml} is called with {@code null}
 *              in its place (NOTE(review): confirm whether the proxy should be forwarded)
 * @return the extracted text, or {@code null} when {@code url} is {@code null}, when
 *         {@code downloadHtml} returns {@code null}, or when an exception is raised
 */
public static String getTiebaData(String url, Proxy proxy) {
    if (url == null) {
        return null;
    }
    // substring instead of split("\\?")[0]: split yields an empty array for the input "?",
    // which made the original index access throw outside the try block.
    int queryStart = url.indexOf('?');
    if (queryStart >= 0) {
        url = url.substring(0, queryStart);
    }
    try {
        String htmlBody = downloadHtml(url, null);
        if (htmlBody != null) {
            return ganalysisData(htmlBody, url);
        }
    } catch (Exception e) {
        // Route the failure through the class logger rather than stderr.
        logger.error("Failed to fetch Tieba post content: " + url, e);
    }
    return null;
}
}
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduZhidaoCrawlerParse.java
View file @
8e2e2cc2
...
...
@@ -3,6 +3,7 @@ package com.zhiwei.media_data_crawler.crawler;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
...
...
@@ -18,11 +19,13 @@ import com.zhiwei.crawler.core.HttpBoot;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
okhttp3.Response
;
public
class
BaiduZhidaoCrawlerParse
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaiduZhidaoCrawlerParse
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
(
false
,
2
);
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
ProxyHolder
proxy
)
{
try
{
...
...
@@ -43,11 +46,17 @@ public class BaiduZhidaoCrawlerParse {
String
content
=
element
.
select
(
"dd.answer"
).
text
();
String
time
=
element
.
select
(
"dd.dd.explain.f-light > span:nth-child(1)"
).
text
();
String
source
=
element
.
select
(
"dd.dd.explain.f-light > span:nth-child(2) > a"
).
text
();
String
answerCount
=
element
.
select
(
"dd.dd.explain.f-light > span:nth-child(3) > a"
).
text
();
String
like
=
element
.
select
(
"dd.dd.explain.f-light > span:nth-child(4)"
).
text
();
map
.
put
(
"url"
,
ur
);
map
.
put
(
"title"
,
title
);
map
.
put
(
"content"
,
content
);
map
.
put
(
"time"
,
time
);
map
.
put
(
"source"
,
source
);
map
.
put
(
"answerCount"
,
answerCount
);
map
.
put
(
"like"
,
like
);
map
.
put
(
"word"
,
word
);
System
.
out
.
println
(
map
.
toString
());
dataList
.
add
(
map
);
}
if
(
dataList
.
size
()
-
count
<
8
)
{
...
...
@@ -63,6 +72,18 @@ public class BaiduZhidaoCrawlerParse {
}
/**
 * Requests a Baidu Zhidao page and reads the question's creation time from its body.
 *
 * <p>The body is searched for the marker {@code createTime: '}; the characters between
 * that marker and the next single quote are taken as the timestamp.
 *
 * @param url   address of the page to request
 * @param proxy proxy handed to {@code httpBoot.syncCall}
 * @return the parsed time, or {@code null} when the request or the parsing fails
 *         (the failure is logged)
 */
public static Date getBaiduZhidaoTime(String url, ProxyHolder proxy) {
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
        String result = response.body().string();
        String createTime = result.split("createTime: '")[1].split("'")[0];
        // "000" is appended before parsing — presumably the page stores epoch seconds
        // and Date expects milliseconds; TODO confirm against a live page.
        return new Date(Long.parseLong(createTime + "000"));
    } catch (Exception e) {
        // Pass url for the placeholder and the exception last so the stack trace is kept;
        // previously the "{}" was emitted literally and the url was not logged.
        logger.error("百度知道问题时间获取失败, url: {}", url, e);
    }
    return null;
}
// public static void main(String[] argss
}
src/main/java/com/zhiwei/media_data_crawler/crawler/DoubanCrawlerParse.java
View file @
8e2e2cc2
...
...
@@ -103,6 +103,7 @@ public class DoubanCrawlerParse {
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_PROXY
);
}
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
...
...
@@ -129,6 +130,7 @@ public class DoubanCrawlerParse {
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_PROXY
);
}
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment