Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
L
live-crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
live-crawler
Commits
5bbec004
Commit
5bbec004
authored
Jan 28, 2019
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
提交根据房间id查询房间信息
parent
eb80b6f4
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
327 additions
and
0 deletions
+327
-0
src/main/java/com/zhiwei/live/bean/RoomInfo.java
+81
-0
src/main/java/com/zhiwei/live/roominfo/BilibiliRoomInfoCrawler.java
+74
-0
src/main/java/com/zhiwei/live/roominfo/DouYuRoomInfoCrawler.java
+46
-0
src/main/java/com/zhiwei/live/roominfo/HuYaRoomInfoCrawler.java
+45
-0
src/main/java/com/zhiwei/live/roominfo/PandamTVRoomInfoCrawler.java
+81
-0
No files found.
src/main/java/com/zhiwei/live/bean/RoomInfo.java
0 → 100644
View file @
5bbec004
package
com
.
zhiwei
.
live
.
bean
;
/**
 * Mutable bean describing one live-streaming room as collected by a crawler.
 *
 * <p>Every field is {@code null} until it is set through the full constructor
 * or a setter.
 */
public class RoomInfo {

    /** Platform type. */
    String pt;

    /** Room number. */
    String roomId;

    /** Streamer nickname. */
    String nickName;

    /** Room title. */
    String roomName;

    /** Room popularity ("heat") value; may be {@code null} when never set. */
    Integer hotNum;

    /** Creates an empty bean with every field {@code null}. */
    public RoomInfo() {
    }

    /**
     * Creates a fully populated bean.
     *
     * <p>Note the order of the two name arguments: the streamer nickname
     * comes before the room title.
     *
     * @param pt       platform type
     * @param roomId   room number
     * @param nickName streamer nickname
     * @param roomName room title
     * @param hotNum   popularity value, may be {@code null}
     */
    public RoomInfo(String pt, String roomId, String nickName, String roomName, Integer hotNum) {
        this.pt = pt;
        this.roomId = roomId;
        this.nickName = nickName;
        this.roomName = roomName;
        this.hotNum = hotNum;
    }

    @Override
    public String toString() {
        return "new RoomInfo["
                + "pt = " + pt
                + ", roomId = " + roomId
                + ", roomName = " + roomName
                + ", nickName = " + nickName
                + ", hotNum = " + hotNum
                + "]";
    }

    /** @return the platform type */
    public String getPt() {
        return pt;
    }

    /** @return the room number */
    public String getRoomId() {
        return roomId;
    }

    /** @return the streamer nickname */
    public String getNickName() {
        return nickName;
    }

    /** @return the room title */
    public String getRoomName() {
        return roomName;
    }

    /**
     * Returns the popularity value.
     *
     * <p>The backing field is a nullable {@code Integer}; unboxing it directly
     * would throw {@code NullPointerException} on a bean whose popularity was
     * never set, so an unset value is reported as {@code 0}.
     *
     * @return the popularity value, or {@code 0} when it has not been set
     */
    public int getHotNum() {
        return hotNum == null ? 0 : hotNum;
    }

    /** @param pt the platform type */
    public void setPt(String pt) {
        this.pt = pt;
    }

    /** @param roomId the room number */
    public void setRoomId(String roomId) {
        this.roomId = roomId;
    }

    /** @param nickName the streamer nickname */
    public void setNickName(String nickName) {
        this.nickName = nickName;
    }

    /** @param roomName the room title */
    public void setRoomName(String roomName) {
        this.roomName = roomName;
    }

    /** @param hotNum the popularity value */
    public void setHotNum(int hotNum) {
        this.hotNum = hotNum;
    }
}
src/main/java/com/zhiwei/live/roominfo/BilibiliRoomInfoCrawler.java
0 → 100644
View file @
5bbec004
package
com
.
zhiwei
.
live
.
roominfo
;
import
java.io.IOException
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.live.bean.RoomInfo
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Crawler for bilibili live-room information.
 *
 * @author qq859
 */
public class BilibiliRoomInfoCrawler {

    private static final HttpBoot httpBoot = new HttpBoot();
    private static final Logger logger = LogManager.getLogger(BilibiliRoomInfoCrawler.class);
    private static final String PT = "B站";

    /** Text that immediately precedes the JSON state object embedded in the room page. */
    private static final String STATE_MARKER = "window.__NEPTUNE_IS_MY_WAIFU__=";
    /** End of the script element that holds the state object. */
    private static final String SCRIPT_END = "</script>";
    /** Log line used whenever the page does not yield room information. */
    private static final String NO_ROOM_INFO_MSG = "此次采集页面中不包含房间信息字段, 此次页面信息为:{}";

    /**
     * Fetches the room page for the given id and extracts the room information
     * from the JSON state embedded in it.
     *
     * @param roomId room id appended to the live.bilibili.com URL
     * @return the room information, or {@code null} when the response is blank
     *         or the expected state / data objects are not present
     * @throws Exception propagated from the HTTP call or from JSON parsing
     */
    public static RoomInfo getRoomInfoByRoomId(String roomId) throws Exception {
        String url = "https://live.bilibili.com/" + roomId;
        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_PROXY).body().string();
        if (StringUtils.isBlank(htmlBody)) {
            return null;
        }

        // Locate the state with the same marker that is used for the presence
        // check, so a page carrying the marker can never index past the split.
        int markerAt = htmlBody.indexOf(STATE_MARKER);
        if (markerAt < 0) {
            logger.info(NO_ROOM_INFO_MSG, htmlBody);
            return null;
        }
        String state = htmlBody.substring(markerAt + STATE_MARKER.length());
        int scriptEnd = state.indexOf(SCRIPT_END);
        if (scriptEnd >= 0) {
            state = state.substring(0, scriptEnd);
        }

        // Clean the extracted text into a form the JSON parser accepts:
        // decode unicode escapes, strip HTML tags, drop backslashes and turn
        // single quotes into double quotes.
        state = ZhiWeiTools.decodeUnicode(state);
        state = ZhiWeiTools.delHTMLTag(state);
        state = state.replace("\\", "").replace("'", "\"");

        JSONObject root = JSONObject.parseObject(state);
        JSONObject baseInfo = root == null ? null : root.getJSONObject("baseInfoRes");
        JSONObject data = baseInfo == null ? null : baseInfo.getJSONObject("data");
        if (data == null) {
            logger.info(NO_ROOM_INFO_MSG, htmlBody);
            return null;
        }

        Integer online = data.getIntValue("online");
        String title = data.getString("title");
        Integer realRoomId = data.getInteger("room_id");
        String realRoomIdText = realRoomId != null ? realRoomId.toString() : null;
        // Without a room id there is nothing meaningful to ask the anchor API for.
        String anchorName = realRoomId != null ? fetchAnchorName(realRoomId) : null;

        // RoomInfo's constructor takes the streamer nickname before the room title.
        return new RoomInfo(PT, realRoomIdText, anchorName, title, online);
    }

    /**
     * Looks up the streamer name of a room through the anchor-in-room API.
     *
     * <p>NOTE(review): unlike the page request, this call does not pass
     * {@code ProxyHolder.NAT_PROXY}; kept as in the original - confirm it is intentional.
     *
     * @param realRoomId room id read from the page state
     * @return the {@code uname} value, or {@code null} when the response is
     *         blank or lacks the {@code data.info} object
     * @throws Exception propagated from the HTTP call or from JSON parsing
     */
    private static String fetchAnchorName(Integer realRoomId) throws Exception {
        String roomUrl = "https://api.live.bilibili.com/live_user/v1/UserInfo/get_anchor_in_room?roomid=" + realRoomId;
        String roomBody = httpBoot.syncCall(RequestUtils.wrapGet(roomUrl)).body().string();
        if (StringUtils.isBlank(roomBody)) {
            return null;
        }
        JSONObject parsed = JSONObject.parseObject(roomBody);
        JSONObject roomData = parsed == null ? null : parsed.getJSONObject("data");
        JSONObject info = roomData == null ? null : roomData.getJSONObject("info");
        return info == null ? null : info.getString("uname");
    }
}
src/main/java/com/zhiwei/live/roominfo/DouYuRoomInfoCrawler.java
0 → 100644
View file @
5bbec004
package
com
.
zhiwei
.
live
.
roominfo
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.live.bean.RoomInfo
;
/**
 * Crawler for Douyu live-room information.
 *
 * @author qq859
 */
public class DouYuRoomInfoCrawler {

    private static final HttpBoot httpBoot = new HttpBoot();
    private static final Logger logger = LogManager.getLogger(DouYuRoomInfoCrawler.class);
    private static final String PT = "斗鱼";

    /** Log line used whenever the response does not yield room information. */
    private static final String NO_ROOM_INFO_MSG = "此次采集页面中不包含房间信息字段, 此次页面信息为:{}";

    /**
     * Queries the Douyu room API for the given id and maps the {@code data}
     * object of the response onto a {@link RoomInfo}.
     *
     * @param roomId room id appended to the API URL; also stored in the result
     * @return the room information, or {@code null} when the response is blank
     *         or its {@code data} entry is missing or not a JSON object
     * @throws Exception propagated from the HTTP call or from JSON parsing
     */
    public static RoomInfo getRoomInfoByRoomId(String roomId) throws Exception {
        String url = "http://open.douyucdn.cn/api/RoomApi/room/" + roomId;
        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_PROXY).body().string();
        if (StringUtils.isBlank(htmlBody)) {
            logger.info(NO_ROOM_INFO_MSG, htmlBody);
            return null;
        }

        JSONObject root = JSONObject.parseObject(htmlBody);
        Object rawData = root == null ? null : root.get("data");
        // Only a nested object can carry the room fields; anything else
        // (absent entry, plain string, ...) is treated as "no room info".
        if (!(rawData instanceof JSONObject)) {
            logger.info(NO_ROOM_INFO_MSG, htmlBody);
            return null;
        }
        JSONObject data = (JSONObject) rawData;

        String roomName = data.getString("room_name");
        String ownerName = data.getString("owner_name");
        Integer heat = data.getInteger("hn");

        // RoomInfo's constructor takes the streamer nickname before the room title.
        return new RoomInfo(PT, roomId, ownerName, roomName, heat);
    }
}
src/main/java/com/zhiwei/live/roominfo/HuYaRoomInfoCrawler.java
0 → 100644
View file @
5bbec004
package
com
.
zhiwei
.
live
.
roominfo
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.live.bean.RoomInfo
;
/**
 * Crawler for Huya live-room information, read from the room's HTML page.
 */
public class HuYaRoomInfoCrawler {

    private static final HttpBoot httpBoot = new HttpBoot();
    private static final Logger logger = LogManager.getLogger(HuYaRoomInfoCrawler.class);
    private static final String PT = "虎牙";

    /** Log line used whenever the page does not yield room information. */
    private static final String NO_ROOM_INFO_MSG = "此次采集页面中不包含房间信息字段, 此次页面信息为:{}";

    /**
     * Fetches the room page for the given id and reads the room fields from
     * its HTML elements.
     *
     * <p>The room id stored in the result is the one shown on the page
     * ({@code span.host-rid}), not the {@code roomId} argument.
     *
     * @param roomId room id appended to the huya.com URL
     * @return the room information, or {@code null} when the response is blank
     *         or the page has no live-count element text
     * @throws Exception propagated from the HTTP call; a
     *         {@code NumberFormatException} is raised when the live count is
     *         present but is not a plain integer
     */
    public static RoomInfo getRoomInfoByRoomId(String roomId) throws Exception {
        String url = "http://www.huya.com/" + roomId;
        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_PROXY).body().string();
        if (StringUtils.isBlank(htmlBody)) {
            logger.info(NO_ROOM_INFO_MSG, htmlBody);
            return null;
        }

        Document document = Jsoup.parse(htmlBody);

        // The count is rendered with thousands separators; strip them before parsing.
        String liveCountText = document.select("em#live-count").text().replace(",", "");
        if (StringUtils.isBlank(liveCountText)) {
            // No count element text: report "no room info" like the blank-page
            // case instead of failing on Integer.valueOf("").
            logger.info(NO_ROOM_INFO_MSG, htmlBody);
            return null;
        }
        Integer liveCount = Integer.valueOf(liveCountText);

        String roomName = document.select("h1#J_roomTitle").text();
        String username = document.select("h3.host-name").text();
        String pageRoomId = document.select("span.host-rid").text();

        // RoomInfo's constructor takes the streamer nickname before the room title.
        return new RoomInfo(PT, pageRoomId, username, roomName, liveCount);
    }
}
src/main/java/com/zhiwei/live/roominfo/PandamTVRoomInfoCrawler.java
0 → 100644
View file @
5bbec004
package
com
.
zhiwei
.
live
.
roominfo
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.live.bean.RoomInfo
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Crawler for Panda TV live-room information.
 *
 * @author qq859
 */
public class PandamTVRoomInfoCrawler {

    private static final HttpBoot httpBoot = new HttpBoot();
    private static final Logger logger = LogManager.getLogger(PandamTVRoomInfoCrawler.class);
    private static final String PT = "熊猫TV";

    /** Marker preceding the room JSON on pages of streams broadcast from a PC. */
    private static final String PC_MARKER = "window._config_roominfo = ";
    /** Marker preceding the room JSON on pages of streams broadcast from a phone. */
    private static final String MOBILE_MARKER = "window.HOSTINFO=";
    /** Log line used whenever the page does not yield room information. */
    private static final String NO_ROOM_INFO_MSG = "此次采集页面中不包含房间信息字段, 此次页面信息为:{}";

    /**
     * Fetches the room page for the given id and extracts the room information
     * from whichever embedded JSON object the page carries.
     *
     * @param roomId room id appended to the panda.tv URL; also stored in the result
     * @return the room information, or {@code null} when the response is blank,
     *         neither marker is present, or the expected JSON objects are missing
     * @throws Exception propagated from the HTTP call or from JSON parsing
     */
    public static RoomInfo getRoomInfoByRoomId(String roomId) throws Exception {
        String url = "https://www.panda.tv/" + roomId;
        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_PROXY).body().string();
        if (StringUtils.isBlank(htmlBody)) {
            return null;
        }

        RoomInfo result = null;
        if (htmlBody.contains(PC_MARKER)) {
            result = parsePcPage(roomId, htmlBody);
        } else if (htmlBody.contains(MOBILE_MARKER)) {
            result = parseMobilePage(roomId, htmlBody);
        }
        if (result == null) {
            logger.info(NO_ROOM_INFO_MSG, htmlBody);
        }
        return result;
    }

    /**
     * Parses the page layout used for PC broadcasts.
     *
     * @return the room information, or {@code null} when a required JSON object is missing
     */
    private static RoomInfo parsePcPage(String roomId, String htmlBody) {
        // The object ends at the first "} };"; the terminator's braces belong
        // to the object, so they are appended back without the semicolon.
        String text = sliceBetween(htmlBody, PC_MARKER, "} };") + "} }";
        // After the common clean-up the "param" value is still wrapped in
        // quotes; unwrap it so it parses as a nested object.
        text = normalize(text).replace("\"param\":\"", "\"param\":").replace("}\",", "},");

        JSONObject json = JSONObject.parseObject(text);
        if (json == null) {
            return null;
        }
        JSONObject callbackParam = json.getJSONObject("callbackParam");
        JSONObject param = callbackParam == null ? null : callbackParam.getJSONObject("param");
        JSONObject roomInfo = json.getJSONObject("roominfo");
        JSONObject hostInfo = json.getJSONObject("hostinfo");
        if (param == null || roomInfo == null || hostInfo == null) {
            return null;
        }

        Integer personNum = param.getIntValue("person_num");
        String roomName = roomInfo.getString("name");
        String username = hostInfo.getString("name");
        // RoomInfo's constructor takes the streamer nickname before the room title.
        return new RoomInfo(PT, roomId, username, roomName, personNum);
    }

    /**
     * Parses the page layout used for mobile broadcasts.
     *
     * @return the room information, or {@code null} when a required JSON object is missing
     */
    private static RoomInfo parseMobilePage(String roomId, String htmlBody) {
        String text = normalize(sliceBetween(htmlBody, MOBILE_MARKER, ";</script>"));

        JSONObject json = JSONObject.parseObject(text);
        if (json == null) {
            return null;
        }
        JSONObject roomInfo = json.getJSONObject("roominfo");
        JSONObject hostInfo = json.getJSONObject("hostinfo");
        if (roomInfo == null || hostInfo == null) {
            return null;
        }

        Integer personNum = roomInfo.getIntValue("personnum");
        String roomName = roomInfo.getString("name");
        String username = hostInfo.getString("nickName");
        // RoomInfo's constructor takes the streamer nickname before the room title.
        return new RoomInfo(PT, roomId, username, roomName, personNum);
    }

    /**
     * Returns the text after the first {@code startMarker} up to (not
     * including) the next {@code endMarker}, or to the end of the text when
     * no end marker follows. The caller guarantees {@code startMarker} is present.
     */
    private static String sliceBetween(String text, String startMarker, String endMarker) {
        String tail = text.substring(text.indexOf(startMarker) + startMarker.length());
        int end = tail.indexOf(endMarker);
        return end < 0 ? tail : tail.substring(0, end);
    }

    /**
     * Applies the clean-up shared by both layouts: decode unicode escapes,
     * strip HTML tags, drop backslashes and turn single quotes into double quotes.
     */
    private static String normalize(String raw) {
        String cleaned = ZhiWeiTools.decodeUnicode(raw);
        cleaned = ZhiWeiTools.delHTMLTag(cleaned);
        return cleaned.replace("\\", "").replace("'", "\"");
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment