Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
a56fa9e1
Commit
a56fa9e1
authored
Nov 08, 2019
by
win 10
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
更新天涯论坛采集
parent
aacd8761
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
149 additions
and
87 deletions
+149
-87
src/main/java/com/zhiwei/media_data_crawler/crawler/TianYaCrawlerParse.java
+78
-81
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
+2
-6
src/test/java/com/zhiwei/media_data_crawler/test/GetTiayaDataTest.java
+69
-0
No files found.
src/main/java/com/zhiwei/media_data_crawler/crawler/TianYaCrawlerParse.java
View file @
a56fa9e1
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
...
@@ -17,12 +15,10 @@ import org.jsoup.select.Elements;
...
@@ -17,12 +15,10 @@ import org.jsoup.select.Elements;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.LunTanData
;
import
com.zhiwei.media_data_crawler.entity.LunTanData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
import
okhttp3.Response
;
...
@@ -35,7 +31,7 @@ public class TianYaCrawlerParse {
...
@@ -35,7 +31,7 @@ public class TianYaCrawlerParse {
/**
/**
* @Title: getBaiduTiebaData
* @Title: getBaiduTiebaData
* @author hero
* @author hero
* @Description: 根
據關鍵詞獲取百度貼吧數據
(最多50頁)
* @Description: 根
据关键词获取天涯论坛数据
(最多50頁)
* @param @param word
* @param @param word
* @param @param proxy
* @param @param proxy
* @param @param tiebaName
* @param @param tiebaName
...
@@ -44,29 +40,36 @@ public class TianYaCrawlerParse {
...
@@ -44,29 +40,36 @@ public class TianYaCrawlerParse {
* @return List<TiebaData> 返回类型
* @return List<TiebaData> 返回类型
*/
*/
@SuppressWarnings
(
"unchecked"
)
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
LunTanData
>
getLunTanData
(
String
word
,
Proxy
proxy
,
String
endTime
)
throws
Exception
{
public
static
List
<
LunTanData
>
getLunTanData
(
String
word
,
Proxy
Holder
proxy
,
String
startTime
,
String
endTime
)
{
List
<
LunTanData
>
list
=
new
ArrayList
<
LunTanData
>();
List
<
LunTanData
>
list
=
new
ArrayList
<
LunTanData
>();
int
page
=
0
;
int
page
=
0
;
boolean
more
=
true
;
boolean
more
=
true
;
while
(
more
)
{
while
(
more
)
{
// 最大页数为20
for
(
int
i
=
0
;
i
<
4
;
i
++)
{
if
(
page
>
50
)
{
// 最大页数为50
more
=
false
;
String
htmlBody
=
downloadHtml
(
word
,
proxy
,
page
);
}
/** 解析页面 */
String
htmlBody
=
downloadHtml
(
word
,
proxy
,
page
);
if
(
htmlBody
!=
null
)
{
if
(
htmlBody
!=
null
)
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Map
<
String
,
Object
>
dataMap
=
analysisData
(
htmlBody
,
proxy
,
word
,
endTime
);
if
(!
document
.
select
(
"div.long-pages"
).
select
(
"a"
).
text
().
contains
(
"下一页"
)
&&
(
i
<
3
))
{
List
<
LunTanData
>
dataList
=
(
List
<
LunTanData
>)
dataMap
.
get
(
"data"
);
continue
;
list
.
addAll
(
dataList
);
}
more
=
(
Boolean
)
dataMap
.
get
(
"more"
);
}
else
{
Map
<
String
,
Object
>
dataMap
=
analysisData
(
document
,
word
,
startTime
,
endTime
);
more
=
false
;
List
<
LunTanData
>
dataList
=
(
List
<
LunTanData
>)
dataMap
.
get
(
"data"
);
}
list
.
addAll
(
dataList
);
page
++;
more
=
(
Boolean
)
dataMap
.
get
(
"more"
);
if
(
DataCrawler
.
sleepTime
!=
null
){
}
else
{
ZhiWeiTools
.
sleep
(
DataCrawler
.
sleepTime
);
more
=
false
;
}
}
page
++;
if
(
page
>
50
)
{
more
=
false
;
}
break
;
}
}
}
return
list
;
return
list
;
}
}
...
@@ -75,7 +78,7 @@ public class TianYaCrawlerParse {
...
@@ -75,7 +78,7 @@ public class TianYaCrawlerParse {
/**
/**
* @Title: downloadHtml
* @Title: downloadHtml
* @author hero
* @author hero
* @Description: 下
載百度貼吧數據
* @Description: 下
载天涯论坛数据
* @param @param word
* @param @param word
* @param @param proxy
* @param @param proxy
* @param @param tiebaName
* @param @param tiebaName
...
@@ -84,8 +87,7 @@ public class TianYaCrawlerParse {
...
@@ -84,8 +87,7 @@ public class TianYaCrawlerParse {
* @param @throws Exception 设定文件
* @param @throws Exception 设定文件
* @return String 返回类型
* @return String 返回类型
*/
*/
private
static
String
downloadHtml
(
String
word
,
Proxy
proxy
,
private
static
String
downloadHtml
(
String
word
,
ProxyHolder
proxy
,
int
page
)
{
int
page
)
throws
Exception
{
// 获取通用请求头
// 获取通用请求头
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
// 获取链接地址
// 获取链接地址
...
@@ -94,21 +96,10 @@ public class TianYaCrawlerParse {
...
@@ -94,21 +96,10 @@ public class TianYaCrawlerParse {
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"Referer"
,
url
);
// 下载数据页面
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
==
null
?
ProxyHolder
.
NAT_HEAVY_PROXY
:
proxy
))
{
Response
response
=
null
;
if
(
proxy
!=
null
)
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
);
}
else
{
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
);
}
return
response
.
body
().
string
();
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
toString
());
if
(
i
==
3
){
throw
e
;
}
else
{
continue
;
}
}
}
}
}
return
null
;
return
null
;
...
@@ -118,56 +109,62 @@ public class TianYaCrawlerParse {
...
@@ -118,56 +109,62 @@ public class TianYaCrawlerParse {
/**
/**
* @Title: analysisData
* @Title: analysisData
* @author hero
* @author hero
* @Description: 解析
Baidu貼吧數據
* @Description: 解析
天涯论坛数据
* @param @param htmlBody
* @param @param htmlBody
* @param @param proxy
* @param @param word
* @param @param word
* @param @return
* @param @return
* @param @throws Exception 设定文件
* @param @throws Exception 设定文件
* @return Map<String,Object> 返回类型
* @return Map<String,Object> 返回类型
*/
*/
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
,
String
endTime
)
throws
Exception
{
private
static
Map
<
String
,
Object
>
analysisData
(
Document
document
,
String
word
,
String
startTime
,
String
endTime
)
{
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
List
<
LunTanData
>
list
=
new
ArrayList
<
LunTanData
>();
try
{
boolean
more
=
true
;
List
<
LunTanData
>
list
=
new
ArrayList
<
LunTanData
>();
/** 解析页面 */
boolean
more
=
true
;
Document
document
=
Jsoup
.
parse
(
htmlBody
);
/** 判断是否有下一页 **/
/** 判断是否有下一页 **/
if
(!
document
.
select
(
"div.long-pages"
).
select
(
"a"
).
text
().
contains
(
"下一页"
))
{
if
(!
document
.
select
(
"div.long-pages"
).
select
(
"a"
).
text
().
contains
(
"下一页"
))
{
more
=
false
;
more
=
false
;
}
}
// 开始解析
// 开始解析
Elements
elementes
=
document
.
select
(
"div.searchListOne"
).
select
(
"ul"
).
select
(
"li"
);
Elements
elementes
=
document
.
select
(
"div.searchListOne"
).
select
(
"ul"
).
select
(
"li"
);
String
time
=
null
;
String
time
=
null
;
String
source
=
null
;
String
source
=
null
;
String
link
=
null
;
String
link
=
null
;
String
title
=
null
;
String
title
=
null
;
String
content
=
null
;
String
content
=
null
;
String
author
=
null
;
String
author
=
null
;
Integer
replyCount
=
0
;
Integer
reply_count
=
0
;
for
(
Element
element
:
elementes
)
{
long
startDate
=
TimeParse
.
stringFormartDate
(
startTime
).
getTime
();
title
=
element
.
select
(
"div"
).
select
(
"h3"
).
select
(
"a"
).
text
();
long
endDate
=
TimeParse
.
stringFormartDate
(
endTime
).
getTime
();
link
=
element
.
select
(
"div"
).
select
(
"h3"
).
select
(
"a"
).
attr
(
"href"
);
for
(
Element
element
:
elementes
)
{
content
=
element
.
select
(
"div"
).
select
(
"p"
).
text
();
if
(
element
.
toString
().
contains
(
"search_msg"
))
{
source
=
element
.
select
(
"p.source"
).
select
(
"a"
).
get
(
0
).
text
();
break
;
author
=
element
.
select
(
"p.source"
).
select
(
"a"
).
get
(
1
).
text
();
}
time
=
element
.
select
(
"p.source"
).
select
(
"span"
).
get
(
0
).
text
();
title
=
element
.
select
(
"div"
).
select
(
"h3"
).
select
(
"a"
).
text
();
reply_count
=
Integer
.
valueOf
(
element
.
select
(
"p.source"
).
select
(
"span"
).
get
(
1
).
text
());
link
=
element
.
select
(
"div"
).
select
(
"h3"
).
select
(
"a"
).
attr
(
"href"
);
LunTanData
luntanData
=
new
LunTanData
(
link
,
title
,
time
,
source
,
author
,
content
,
reply_count
,
pt
,
word
);
content
=
element
.
select
(
"div"
).
select
(
"p"
).
text
();
Date
date
=
TimeParse
.
stringFormartDate
(
time
);
source
=
element
.
select
(
"p.source"
).
select
(
"a:nth-child(1)"
).
text
();
Date
endDate
=
TimeParse
.
stringFormartDate
(
endTime
);
author
=
element
.
select
(
"p.source"
).
select
(
"a:nth-child(2)"
).
text
();
if
(
date
.
before
(
endDate
)){
time
=
element
.
select
(
"p.source"
).
select
(
"span"
).
get
(
0
).
text
();
replyCount
=
Integer
.
valueOf
(
element
.
select
(
"p.source"
).
select
(
"span"
).
get
(
1
).
text
());
LunTanData
luntanData
=
new
LunTanData
(
link
,
title
,
time
,
source
,
author
,
content
,
replyCount
,
pt
,
word
);
long
date
=
TimeParse
.
stringFormartDate
(
time
).
getTime
();
if
(
date
>=
startDate
&&
(
date
<=
endDate
)){
list
.
add
(
luntanData
);
}
else
if
(
date
<
startDate
){
more
=
false
;
}
}
if
(
elementes
.
isEmpty
()){
more
=
false
;
more
=
false
;
}
else
{
// System.out.println(luntanData);
list
.
add
(
luntanData
);
}
}
resultMap
.
put
(
"data"
,
list
);
resultMap
.
put
(
"more"
,
more
);
}
catch
(
Exception
e
)
{
e
.
toString
();
}
}
if
(
elementes
.
size
()==
0
){
more
=
false
;
}
resultMap
.
put
(
"data"
,
list
);
resultMap
.
put
(
"more"
,
more
);
return
resultMap
;
return
resultMap
;
}
}
...
@@ -187,7 +184,7 @@ public class TianYaCrawlerParse {
...
@@ -187,7 +184,7 @@ public class TianYaCrawlerParse {
url
=
"http://search.tianya.cn/bbs?q="
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
)
url
=
"http://search.tianya.cn/bbs?q="
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
)
+
"&s=4&f=0&pn="
+
page
;
+
"&s=4&f=0&pn="
+
page
;
}
}
System
.
out
.
println
(
url
);
System
.
out
.
println
(
word
+
" == "
+
url
);
return
url
;
return
url
;
}
}
...
...
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
View file @
a56fa9e1
...
@@ -329,12 +329,8 @@ public class DataCrawler {
...
@@ -329,12 +329,8 @@ public class DataCrawler {
* 设定文件
* 设定文件
* @return List<LunTanData> 返回类型
* @return List<LunTanData> 返回类型
*/
*/
public
static
List
<
LunTanData
>
getLunTanData
(
String
word
,
Proxy
proxy
,
String
endTime
)
{
public
static
List
<
LunTanData
>
getLunTanData
(
String
word
,
ProxyHolder
proxy
,
String
startTime
,
String
endTime
)
{
try
{
return
TianYaCrawlerParse
.
getLunTanData
(
word
,
proxy
,
startTime
,
endTime
);
return
TianYaCrawlerParse
.
getLunTanData
(
word
,
proxy
,
endTime
);
}
catch
(
Exception
e
)
{
return
Collections
.
emptyList
();
}
}
}
/**
/**
...
...
src/test/java/com/zhiwei/media_data_crawler/test/GetTiayaDataTest.java
0 → 100644
View file @
a56fa9e1
package
com
.
zhiwei
.
media_data_crawler
.
test
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.media_data_crawler.crawler.WordsReadFile
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.LunTanData
;
/**
 * Manual driver that fetches Tianya forum data for a list of keywords and
 * exports the collected records to an Excel workbook.
 *
 * @author xMx
 * @date 2019-11-08 16:08:29
 */
public class GetTiayaDataTest {

    /**
     * Runs the crawl-and-export sequence with the hard-coded paths and time
     * window below.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Keyword input file.
        String wordFilePath = "D:\\crawlerdata\\关键词6.txt";
        // Excel output file.
        String filePath = "D:\\crawlerdata\\天涯论坛-精装房.xlsx";
        // Start of the time window.
        String startTime = "2019-01-01 00:00:00";
        // End of the time window.
        String endTime = "2019-11-08 23:59:59";

        // Must run before any crawling call is made.
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER, 10000008);

        // Phase 1: collect the records for every keyword; null is passed as the proxy argument.
        List<LunTanData> records = new ArrayList<>();
        for (String word : WordsReadFile.getWords(wordFilePath)) {
            records.addAll(DataCrawler.getLunTanData(word, null, startTime, endTime));
        }

        // Phase 2: turn every record into a row keyed by column title.
        List<Map<String, Object>> rows = new ArrayList<>();
        for (LunTanData record : records) {
            rows.add(toRow(record));
        }

        // Column titles in output order; each one matches a key written by toRow.
        String[] titles = {"地址", "标题", "时间", "来源", "回复者或楼主", "回复内容", "回复数", "平台", "关键词"};
        List<String> headers = new ArrayList<>();
        for (String title : titles) {
            headers.add(title);
        }

        PoiExcelUtil.getInstance().exportExcel(filePath, "数据", headers, rows);
        System.out.println("导出成功");
    }

    /**
     * Copies the fields of one forum record into a map keyed by the export
     * column titles.
     *
     * @param data the record to convert
     * @return a new map holding one entry per exported column
     */
    private static Map<String, Object> toRow(LunTanData data) {
        Map<String, Object> row = new HashMap<>();
        row.put("地址", data.getUrl());
        row.put("标题", data.getTitle());
        row.put("时间", data.getTime());
        row.put("来源", data.getSource());
        row.put("回复者或楼主", data.getAuthor());
        row.put("回复内容", data.getContent());
        row.put("回复数", data.getReply_count());
        row.put("平台", data.getPt());
        row.put("关键词", data.getWord());
        return row;
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment