Commit 2a8c7a4d by zhiwei

斗鱼、虎牙、熊猫TV数据采集

parent 7c521742
package com.zhiwei.live.bean;
public class RoomInfo {
// Platform type.
String pt;
// Room number.
String roomId;
// Streamer nickname.
String nickName;
// Room title.
String roomName;
// Popularity ("hot") value of the live room.
Integer hotNum;
// Subscriber count.
Integer fans;

/** Creates an instance with every field left unset (null). */
public RoomInfo() {
}

/**
 * Creates an instance without a subscriber count; {@code fans} is left null.
 * Kept so call sites written against the five-argument form still resolve.
 *
 * @param pt       platform type
 * @param roomId   room number
 * @param nickName streamer nickname
 * @param roomName room title
 * @param hotNum   popularity value of the live room
 */
public RoomInfo(String pt, String roomId, String nickName, String roomName, Integer hotNum) {
    this(pt, roomId, nickName, roomName, hotNum, null);
}

/**
 * Creates a fully populated instance.
 *
 * @param pt       platform type
 * @param roomId   room number
 * @param nickName streamer nickname
 * @param roomName room title
 * @param hotNum   popularity value of the live room
 * @param fans     subscriber count
 */
public RoomInfo(String pt, String roomId, String nickName, String roomName, Integer hotNum, Integer fans) {
    this.pt = pt;
    this.roomId = roomId;
    this.nickName = nickName;
    this.roomName = roomName;
    this.hotNum = hotNum;
    this.fans = fans;
}
/**
 * Renders all fields, including the subscriber count, as a single line.
 *
 * @return a string of the form {@code new RoomInfo[pt = ..., ..., fans = ...]}
 */
@Override
public String toString() {
    // Single return only: a second return after this one would be unreachable
    // and rejected by the compiler.
    return "new RoomInfo[" + "pt = " + pt + ", roomId = " + roomId + ", roomName = " + roomName + ", nickName = "
            + nickName + ", hotNum = " + hotNum + ", fans = " + fans + "]";
}
/**
 * @return the subscriber count currently stored in {@code fans}; may be null if never set
 */
public Integer getFans() {
return fans;
}
/**
 * Sets the popularity value of the live room.
 *
 * @param hotNum new popularity value; stored as given, null included
 */
public void setHotNum(Integer hotNum) {
this.hotNum = hotNum;
}
/**
 * Sets the subscriber count.
 *
 * @param fans new subscriber count; stored as given, null included
 */
public void setFans(Integer fans) {
this.fans = fans;
}
public String getPt() {
......@@ -75,7 +83,5 @@ public class RoomInfo {
/**
 * Primitive overload of the popularity setter; the value is boxed into the
 * {@code Integer} field.
 *
 * @param hotNum new popularity value
 */
public void setHotNum(int hotNum) {
this.hotNum = hotNum;
}
}
......@@ -17,6 +17,8 @@ public class BilibiliMessage {
String content; //弹幕内容
String room_id; //房间id
/**
 * Builds a message by handing the given JSON to {@code constructJson}.
 *
 * @param json the JSON object to populate this message from
 * @throws Exception whatever {@code constructJson} throws while reading the JSON
 */
public BilibiliMessage(JSONObject json) throws Exception {
constructJson(json);
}
......@@ -26,6 +28,7 @@ public class BilibiliMessage {
private void constructJson(JSONObject json) throws Exception{
try {
System.out.println(json);
JSONArray jsonArray = json.getJSONArray("info");
messageType = json.getString("cmd");
user_id = jsonArray.getJSONArray(2).getString(0);
......@@ -43,11 +46,12 @@ public class BilibiliMessage {
/**
 * Renders all message fields, including the room id, as a single line.
 *
 * @return a string of the form {@code new BilibiliMessage[user_id = ..., ..., room_id = ...]}
 */
@Override
public String toString() {
    // user_id is emitted exactly once; a second copy of that segment would
    // run straight on from the first with no separator.
    return "new BilibiliMessage["
            + "user_id = " + user_id
            + ", nickName = " + nickName
            + ", messageType = " + messageType
            + ", time = " + time
            + ", content = " + content
            + ", room_id = " + room_id
            + "]";
}
......@@ -91,5 +95,13 @@ public class BilibiliMessage {
this.content = content;
}
/**
 * @return the room id stored on this message; may be null if never set
 */
public String getRoom_id() {
return room_id;
}
/**
 * Sets the room id this message belongs to.
 *
 * @param room_id the room id to store
 */
public void setRoom_id(String room_id) {
this.room_id = room_id;
}
}
......@@ -71,6 +71,8 @@ public class BilibiliMessageHandler extends ChannelInboundHandlerAdapter{
while(matcher.find()) {
JSONObject dataJson = JSONObject.parseObject(matcher.group());
BilibiliMessage bilibiliMessage = new BilibiliMessage(dataJson);
bilibiliMessage.setRoom_id(roomId);
dataCallBack.onData(bilibiliMessage);
// System.out.println(bilibiliMessage);
}
......
......@@ -42,7 +42,7 @@ public class DouYuMessage {
@Override
public String toString() {
return "new BilibiliMessage["
return "new DouYuMessage["
+ "user_id = " + user_id
+ ", nickName = " + nickName
+ ", time = " + time
......
......@@ -45,17 +45,18 @@ public class BilibiliRoomInfoCrawler {
Integer person_num = data.getIntValue("online");
String roomname = data.getString("title");
Integer room_id = data.getInteger("room_id");
Integer fans = data.getInteger("attention");
String roomId = room_id!=null?room_id.toString():null;
String username = null;
//通过房间id获取用户信息
roomUrl = "https://api.live.bilibili.com/live_user/v1/UserInfo/get_anchor_in_room?roomid="+room_id;
roomUrl = "https://api.live.bilibili.com/live_user/v1/UserInfo/get_anchor_in_room?roomid="+roomId;
String roomBody = httpBoot.syncCall(RequestUtils.wrapGet(roomUrl)).body().string();
if(!StringUtils.isBlank(roomBody)) {
JSONObject roomData = JSONObject.parseObject(roomBody).getJSONObject("data");
username = roomData.getJSONObject("info").getString("uname");
}
return new RoomInfo(PT, roomId, roomname, username, person_num);
return new RoomInfo(PT, roomId, roomname, username, person_num, fans);
}else {
logger.info("此次采集页面中不包含房间信息字段, 此次页面信息为:{}", htmlBody);
return null;
......@@ -67,7 +68,7 @@ public class BilibiliRoomInfoCrawler {
public static RoomInfo getRoomInfoByRoomUrlProxy(String roomUrl) throws Exception{
public static RoomInfo getRoomInfoProxyByRoomUrl(String roomUrl) throws Exception{
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(roomUrl), ProxyHolder.NAT_PROXY).body().string();
if(!StringUtils.isBlank(htmlBody)) {
//判断页面中是否包含房间信息
......@@ -85,7 +86,7 @@ public class BilibiliRoomInfoCrawler {
String roomname = data.getString("title");
Integer room_id = data.getInteger("room_id");
String roomId = room_id!=null?room_id.toString():null;
Integer fans = data.getInteger("attention");
String username = null;
//通过房间id获取用户信息
roomUrl = "https://api.live.bilibili.com/live_user/v1/UserInfo/get_anchor_in_room?roomid="+room_id;
......@@ -94,7 +95,7 @@ public class BilibiliRoomInfoCrawler {
JSONObject roomData = JSONObject.parseObject(roomBody).getJSONObject("data");
username = roomData.getJSONObject("info").getString("uname");
}
return new RoomInfo(PT, roomId, roomname, username, person_num);
return new RoomInfo(PT, roomId, roomname, username, person_num, fans);
}else {
logger.info("此次采集页面中不包含房间信息字段, 此次页面信息为:{}", htmlBody);
return null;
......
......@@ -32,6 +32,7 @@ public class DouYuRoomInfoCrawler {
String roomBody = httpBoot.syncCall(RequestUtils.wrapGet(roomUrl)).body().string();
if(!StringUtils.isBlank(roomBody) && roomBody.contains("ROOM.room_id =")) {
String roomId = roomBody.split("ROOM\\.room_id = ")[1].split("; ")[0].trim();
//获取房间信息
String url = "http://open.douyucdn.cn/api/RoomApi/room/" + roomId;
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string();
if(!StringUtils.isBlank(htmlBody)) {
......@@ -40,7 +41,15 @@ public class DouYuRoomInfoCrawler {
String user_name = data.getString("owner_name");
Integer hn = data.getInteger("hn");
int online = data.getInteger("online");
return new RoomInfo(PT, roomId, room_name, user_name , hn);
Integer fans = 0;
//获取用户信息
String userUrl = "https://www.douyu.com/swf_api/h5room/" + roomId;
String userBody = httpBoot.syncCall(RequestUtils.wrapGet(userUrl)).body().string();
if(!StringUtils.isBlank(userBody)) {
JSONObject userData = JSONObject.parseObject(userBody).getJSONObject("data");
fans = Integer.valueOf(userData.getString("fans"));
}
return new RoomInfo(PT, roomId, room_name, user_name , hn, fans);
}else {
logger.info("此次采集页面中不包含房间信息字段, 此次页面信息为:{}", htmlBody);
return null;
......@@ -71,7 +80,15 @@ public class DouYuRoomInfoCrawler {
String user_name = data.getString("owner_name");
Integer hn = data.getInteger("hn");
int online = data.getInteger("online");
return new RoomInfo(PT, roomId, room_name, user_name , hn);
Integer fans = 0;
//获取用户信息
String userUrl = "https://www.douyu.com/swf_api/h5room/" + roomId;
String userBody = httpBoot.syncCall(RequestUtils.wrapGet(userUrl)).body().string();
if(!StringUtils.isBlank(userBody)) {
JSONObject userData = JSONObject.parseObject(userBody).getJSONObject("data");
fans = Integer.valueOf(userData.getString("fans"));
}
return new RoomInfo(PT, roomId, room_name, user_name , hn, fans);
}else {
logger.info("此次采集页面中不包含房间信息字段, 此次页面信息为:{}", htmlBody);
return null;
......
......@@ -32,8 +32,14 @@ public class HuYaRoomInfoCrawler {
Integer liveCount = Integer.valueOf(document.select("em#live-count").text().replaceAll(",", ""));
String username = document.select("h3.host-name").text();
String activityCount = document.select("div#activityCount").text();
Integer fans = 0;
try {
fans = Integer.valueOf(activityCount.replaceAll(",", ""));
} catch (Exception e) {
fans = 0;
}
String room_id = document.select("span.host-rid").text();
return new RoomInfo(PT, room_id, roomName, username, liveCount);
return new RoomInfo(PT, room_id, roomName, username, liveCount, fans);
} else {
logger.info("此次采集页面中不包含房间信息字段, 此次页面信息为:{}", htmlBody);
return null;
......@@ -58,7 +64,13 @@ public class HuYaRoomInfoCrawler {
String username = document.select("h3.host-name").text();
String activityCount = document.select("div#activityCount").text();
String room_id = document.select("span.host-rid").text();
return new RoomInfo(PT, room_id, roomName, username, liveCount);
Integer fans = 0;
try {
fans = Integer.valueOf(activityCount.replaceAll(",", ""));
} catch (Exception e) {
fans = 0;
}
return new RoomInfo(PT, room_id, roomName, username, liveCount, fans);
} else {
logger.info("此次采集页面中不包含房间信息字段, 此次页面信息为:{}", htmlBody);
return null;
......
......@@ -47,8 +47,9 @@ public class PandamTVRoomInfoCrawler {
String roomname = roominfo.getString("name");
JSONObject hostinfo = json.getJSONObject("hostinfo");
String username = hostinfo.getString("name");
return new RoomInfo(PT, roomId, roomname, username, person_num);
int fans = 0;
return new RoomInfo(PT, roomId, roomname, username, person_num, fans);
}
//判断页面中是否包含房间信息,此为使用手机端直播
else if(htmlBody.contains("window.HOSTINFO=")){
......@@ -65,8 +66,8 @@ public class PandamTVRoomInfoCrawler {
String roomname = roominfo.getString("name");
JSONObject hostinfo = json.getJSONObject("hostinfo");
String username = hostinfo.getString("nickName");
return new RoomInfo(PT, roomId, roomname, username, person_num);
int fans = 0;
return new RoomInfo(PT, roomId, roomname, username, person_num, fans);
}else {
logger.info("此次采集页面中不包含房间信息字段, 此次页面信息为:{}", htmlBody);
return null;
......@@ -103,7 +104,8 @@ public class PandamTVRoomInfoCrawler {
String roomname = roominfo.getString("name");
JSONObject hostinfo = json.getJSONObject("hostinfo");
String username = hostinfo.getString("name");
return new RoomInfo(PT, roomId, roomname, username, person_num);
int fans = 0;
return new RoomInfo(PT, roomId, roomname, username, person_num, fans);
}
//判断页面中是否包含房间信息,此为使用手机端直播
else if(htmlBody.contains("window.HOSTINFO=")){
......@@ -120,7 +122,8 @@ public class PandamTVRoomInfoCrawler {
String roomname = roominfo.getString("name");
JSONObject hostinfo = json.getJSONObject("hostinfo");
String username = hostinfo.getString("nickName");
return new RoomInfo(PT, roomId, roomname, username, person_num);
int fans = 0;
return new RoomInfo(PT, roomId, roomname, username, person_num, fans);
}else {
logger.info("此次采集页面中不包含房间信息字段, 此次页面信息为:{}", htmlBody);
return null;
......
package com.zhiwei.live.test.roomInfo;
import org.junit.jupiter.api.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.live.bean.RoomInfo;
import com.zhiwei.live.roominfo.BilibiliRoomInfoCrawler;
import com.zhiwei.live.roominfo.DouYuRoomInfoCrawler;
import com.zhiwei.live.roominfo.HuYaRoomInfoCrawler;
import com.zhiwei.live.roominfo.PandamTVRoomInfoCrawler;
/**
 * Smoke tests that call each platform's room-info crawler against a fixed room
 * URL and print the result.
 *
 * <p>Exceptions from the crawlers are allowed to propagate so that a failing
 * crawl fails the test instead of being printed and silently passed.
 */
public class RoomInfoCrawlerTest {
    // Registry address and group passed to ProxyFactory.init in the static initializer.
    private static final String REGISTRY = "zookeeper://192.168.0.36:2181";
    private static final String GROUP = "local";

    static {
        ProxyFactory.init(REGISTRY, GROUP, GroupType.PROVIDER);
    }

    /** Crawls a Bilibili live room and prints the resulting room info. */
    @Test
    public void getBilibiliRoomInfoByRoomUrl() throws Exception {
        String roomUrl = "https://live.bilibili.com/483";
        RoomInfo roomInfo = BilibiliRoomInfoCrawler.getRoomInfoByRoomUrl(roomUrl);
        System.out.println("房间信息:::" + roomInfo);
    }

    /** Crawls a DouYu live room and prints the resulting room info. */
    @Test
    public void getDouyuRoomInfoByRoomUrl() throws Exception {
        String roomUrl = "https://www.douyu.com/4372875";
        RoomInfo roomInfo = DouYuRoomInfoCrawler.getRoomInfoByRoomUrl(roomUrl);
        System.out.println("房间信息:::" + roomInfo);
    }

    /** Crawls a HuYa live room and prints the resulting room info. */
    @Test
    public void getHuYaRoomInfoByRoomUrl() throws Exception {
        String roomUrl = "https://www.huya.com/baishaling";
        RoomInfo roomInfo = HuYaRoomInfoCrawler.getRoomInfoByRoomUrl(roomUrl);
        System.out.println("房间信息:::" + roomInfo);
    }

    /** Crawls a Panda TV live room and prints the resulting room info. */
    @Test
    public void getPandamTVRoomInfoByRoomUrl() throws Exception {
        String roomUrl = "https://www.panda.tv/337852";
        RoomInfo roomInfo = PandamTVRoomInfoCrawler.getRoomInfoByRoomUrl(roomUrl);
        System.out.println("房间信息:::" + roomInfo);
    }
}
//package com.zhiwei.live.test.roomInfo;
//
//import org.junit.jupiter.api.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.live.bean.RoomInfo;
//import com.zhiwei.live.roominfo.BilibiliRoomInfoCrawler;
//import com.zhiwei.live.roominfo.DouYuRoomInfoCrawler;
//import com.zhiwei.live.roominfo.HuYaRoomInfoCrawler;
//import com.zhiwei.live.roominfo.PandamTVRoomInfoCrawler;
//
//public class RoomInfoCrawlerTest {
// private static final String registry = "zookeeper://192.168.0.36:2181";
// private static final String group = "local";
//
// static {
// ProxyFactory.init(registry, group, GroupType.PROVIDER);
// }
//
// @Test
// public void getBilibiliRoomInfoByRoomUrl() {
// String roomUrl = "https://live.bilibili.com/483";
// try {
// RoomInfo roomInfo = BilibiliRoomInfoCrawler.getRoomInfoByRoomUrl(roomUrl);
// System.out.println("房间信息:::"+ roomInfo);
// } catch (Exception e) {
// e.printStackTrace();
// }
// }
//
//
// @Test
// public void getDouyuRoomInfoByRoomUrl() {
// String roomUrl = "https://www.douyu.com/4372875";
// try {
// RoomInfo roomInfo = DouYuRoomInfoCrawler.getRoomInfoByRoomUrl(roomUrl);
// System.out.println("房间信息:::"+ roomInfo);
// } catch (Exception e) {
// e.printStackTrace();
// }
// }
//
//
// @Test
// public void getHuYaRoomInfoByRoomUrl() {
// String roomUrl = "https://www.huya.com/baishaling";
// try {
// RoomInfo roomInfo = HuYaRoomInfoCrawler.getRoomInfoByRoomUrl(roomUrl);
// System.out.println("房间信息:::"+ roomInfo);
// } catch (Exception e) {
// e.printStackTrace();
// }
// }
//
// @Test
// public void getPandamTVRoomInfoByRoomUrl() {
// String roomUrl = "https://www.panda.tv/337852";
// try {
// RoomInfo roomInfo = PandamTVRoomInfoCrawler.getRoomInfoByRoomUrl(roomUrl);
// System.out.println("房间信息:::"+ roomInfo);
// } catch (Exception e) {
// e.printStackTrace();
// }
// }
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment