Commit f09aa1c9 by yangchen

增加大鱼的相关采集

parent 4c650e8d
......@@ -7,6 +7,8 @@ import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject;
public class HeadGet {
/**
......@@ -336,17 +338,180 @@ public class HeadGet {
return headerMap;
}
public static void main(String[] args) {
String url = "http://www.miaopai.com/show/H99oVYnsv47ejBqK8TMZXA__.htm";
String cookie = "kg_udid=9E907CB26A8E3CC24F416CB5CF360E9F; sessionId=f771a3317da8040ace111d192f5e32df; udid=59B30CA793DBDCA2D400F41C8B3DDA78; aliyungf_tc=AQAAAEVz1U5hFwoA6tbnc5AZYZww6PRM";
Map<String, String> headerMap = HeadGet.getMiaoPaiByURlHeaderMap(null);
String result = HttpClient.executeHttpRequestGet(url, headerMap);
Document doc = Jsoup.parse(result);
String s = doc.select("body > div.box885 > div.contentLeft.contentLeft_detail > div.videoList.video_detail > div.videoIntr > div > div.personalAbout > div.personalData > p.personalDataN > a").text();
System.out.println(s);
/**
 * @Description Builds the request headers used when fetching iQiyi video
 *              information by keyword (Host: so.iqiyi.com).
 * @param cookie value for the Cookie header; when null, no Cookie header is added
 * @return a newly created, mutable map of header names to header values
 */
public static Map<String,String> getAiqiyiBywordHeaderMap(String cookie) {
    final Map<String,String> headers = new HashMap<String, String>();
    headers.put("Host", "so.iqiyi.com");
    headers.put("Connection", "keep-alive");
    headers.put("Accept-Language", "zh-CN,zh;q=0.9");
    headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
    // The cookie is optional: callers pass null for anonymous requests.
    if (cookie == null) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
/**
 * @Description Builds the request headers used when fetching iQiyi video
 *              information from a video link (Host: www.iqiyi.com).
 * @param cookie value for the Cookie header; when null, no Cookie header is added
 * @return a newly created, mutable map of header names to header values
 */
public static Map<String,String> getAiqiyiHeaderMap(String cookie) {
    final Map<String,String> headers = new HashMap<String, String>();
    headers.put("Host", "www.iqiyi.com");
    headers.put("Connection", "keep-alive");
    headers.put("Accept-Language", "zh-CN,zh;q=0.9");
    headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
    // The cookie is optional: callers pass null for anonymous requests.
    if (cookie == null) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
/**
 * @Description Builds the request headers used when fetching the iQiyi play
 *              count (Host: cache.video.iqiyi.com).
 * @param cookie value for the Cookie header; when null, no Cookie header is added
 * @return a newly created, mutable map of header names to header values
 */
public static Map<String,String> getAiqiyiForCountHeaderMap(String cookie) {
    final Map<String,String> headers = new HashMap<String, String>();
    headers.put("Host", "cache.video.iqiyi.com");
    headers.put("Connection", "keep-alive");
    headers.put("Accept-Language", "zh-CN,zh;q=0.9");
    headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
    // The cookie is optional: callers pass null for anonymous requests.
    if (cookie == null) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
/**
 * @Description Builds the request headers used for Pear Video requests
 *              (Host: www.pearvideo.com).
 * @param cookie value for the Cookie header; when null, no Cookie header is added
 * @return a newly created, mutable map of header names to header values
 */
public static Map<String,String> getPearVideoByWordHeaderMap(String cookie) {
    final Map<String,String> headers = new HashMap<String, String>();
    headers.put("Host", "www.pearvideo.com");
    headers.put("Connection", "keep-alive");
    headers.put("Accept-Language", "zh-CN,zh;q=0.9");
    headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
    // The cookie is optional: callers pass null for anonymous requests.
    if (cookie == null) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
/**
 * @Description Builds the request headers used for Xigua Video requests
 *              (Host: www.ixigua.com).
 * @param cookie value for the Cookie header; when null, no Cookie header is added
 * @return a newly created, mutable map of header names to header values
 */
public static Map<String,String> getXiguaByWordHeaderMap(String cookie) {
    final Map<String,String> headers = new HashMap<String, String>();
    headers.put("Host", "www.ixigua.com");
    headers.put("Connection", "keep-alive");
    headers.put("Accept-Language", "zh-CN,zh;q=0.9");
    headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
    // The cookie is optional: callers pass null for anonymous requests.
    if (cookie == null) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
/**
 * @Description Builds the request headers used for Youku video requests
 *              (Host: www.soku.com).
 * @param cookie value for the Cookie header; when null, no Cookie header is added
 * @return a newly created, mutable map of header names to header values
 */
public static Map<String,String> getSoKuByWordHeaderMap(String cookie) {
    final Map<String,String> headers = new HashMap<String, String>();
    headers.put("Host", "www.soku.com");
    headers.put("Connection", "keep-alive");
    headers.put("Accept-Language", "zh-CN,zh;q=0.9");
    headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
    // The cookie is optional: callers pass null for anonymous requests.
    if (cookie == null) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
/**
 * @Description Builds the request headers for app.jike.ruguoapp.com with a JSON
 *              Content-Type and Content-Length fixed to "39".
 *              NOTE(review): presumably meant for a comment request whose body is
 *              exactly 39 bytes long - confirm against the caller.
 * @param cookie value for the Cookie header; when null, no Cookie header is added
 * @return a newly created, mutable map of header names to header values
 */
public static Map<String,String> getJikeComment39HeaderMap(String cookie) {
    final Map<String,String> headers = new HashMap<String, String>();
    headers.put("Host", "app.jike.ruguoapp.com");
    headers.put("Accept-Language", "zh-cn");
    headers.put("Accept", "*/*");
    headers.put("User-Agent", "%E5%8D%B3%E5%88%BB/989 CFNetwork/811.5.4 Darwin/16.7.0");
    headers.put("App-BuildNo", "989");
    headers.put("App-Version", "3.9.1");
    headers.put("Content-Type", "application/json");
    headers.put("Manufacturer", "Apple");
    headers.put("Content-Length", "39");
    headers.put("Connection", "keep-alive");
    headers.put("OS-Version", "Version 10.3.3 (Build 14G60)");
    // The cookie is optional: callers pass null for anonymous requests.
    if (cookie == null) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
/**
 * @Description Builds the request headers for app.jike.ruguoapp.com with
 *              Content-Length fixed to "94". No Content-Type header is set
 *              (that line was commented out in the original code).
 *              NOTE(review): presumably meant for a comment request whose body is
 *              exactly 94 bytes long - confirm against the caller.
 * @param cookie value for the Cookie header; when null, no Cookie header is added
 * @return a newly created, mutable map of header names to header values
 */
public static Map<String,String> getJikeComment94HeaderMap(String cookie) {
    final Map<String,String> headers = new HashMap<String, String>();
    headers.put("Host", "app.jike.ruguoapp.com");
    headers.put("Accept-Language", "zh-cn");
    headers.put("Accept", "*/*");
    headers.put("User-Agent", "%E5%8D%B3%E5%88%BB/989 CFNetwork/811.5.4 Darwin/16.7.0");
    headers.put("App-BuildNo", "989");
    headers.put("App-Version", "3.9.1");
    headers.put("Manufacturer", "Apple");
    headers.put("Content-Length", "94");
    headers.put("Connection", "keep-alive");
    headers.put("OS-Version", "Version 10.3.3 (Build 14G60)");
    // The cookie is optional: callers pass null for anonymous requests.
    if (cookie == null) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
/**
 * @Description Builds the parameter map for a Jike comment request.
 *              The map always contains "targetId"; when a time is given it also
 *              contains "loadMoreKey", a JSON object holding that time under
 *              "createdAt".
 * @param targetId value stored under the "targetId" key
 * @param time value stored as "createdAt" inside "loadMoreKey"; when null,
 *             no "loadMoreKey" entry is added
 * @return a newly created, mutable parameter map
 */
public static Map<String,Object> getJikeCommentParamMap(String targetId,String time) {
    final Map<String,Object> params = new HashMap<String,Object>();
    if (time != null) {
        // The load-more key is only built when a time is supplied.
        final JSONObject loadMoreKey = new JSONObject();
        loadMoreKey.put("createdAt", time);
        params.put("loadMoreKey", loadMoreKey);
    }
    params.put("targetId", targetId);
    return params;
}
/**
 * Manual smoke test: posts one comment-list request to the Jike API using the
 * 94-byte header set and prints the raw response followed by its length.
 *
 * NOTE(review): the cookie below is a hard-coded session value checked into
 * source - consider moving it out of the code.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String sessionCookie = "jike:config:searchPlaceholderLastInfo=1514465731446#0; jike:sess=eyJfdWlkIjoiNWE0NGRmMTlmOWM4NWYwMDExODJhMjkwIiwiX3Nlc3Npb25Ub2tlbiI6InQ5cExKaEpiTFdVeDFsbUxKZW9vMUlKMEsifQ==; jike:sess.sig=HBuRKsTsMIIR9aMDUdkNV_mGH1E";
    Map<String, String> headers = HeadGet.getJikeComment94HeaderMap(sessionCookie);
    Map<String,Object> params = HeadGet.getJikeCommentParamMap("5a449a3d580d23001148412e","2017-12-28T10:17:50.601Z");
    String endpoint = "https://app.jike.ruguoapp.com/1.0/messageComments/listPrimary";
    String response = HttpClient.executeHttpRequestPost(endpoint, headers, params);
    System.out.println(response);
    System.out.println(response.length());
}
}
......@@ -74,7 +74,7 @@ public class Dayu {
dataList.addAll(lists);
while(true) {
lists.clear();
ZhiWeiTools.sleep(5000);
ZhiWeiTools.sleep(3000);
System.out.println(url+"&hotValue="+i);
result = HttpClient.executeHttpRequestGet(url+"&hotValue="+i, headerMap);
lists = dayuCommentAnalysis.getDayuCommentData(result,articleId);
......@@ -83,6 +83,7 @@ public class Dayu {
}
dataList.addAll(lists);
i -= 10;
System.out.println("=============已获取到的评论数"+dataList.size());
}
return dataList;
} catch (Exception e) {
......
......@@ -45,7 +45,6 @@ public class DayuCommentAnalysis {
int i = data.getInteger("reply_cnt");
dataMap.put("replay_count", i);
if(i > 0) {
// System.out.println(dataMap.toString());
dataList.addAll(getReplayData(id,articleId));
}
dataList.add(dataMap);
......@@ -71,7 +70,7 @@ public class DayuCommentAnalysis {
List<Map<String,Object>> data = new ArrayList<Map<String,Object>>();
List<String> timeList = new ArrayList<String>();
while(true) {
ZhiWeiTools.sleep(3000);
ZhiWeiTools.sleep(2000);
long time = analysisReplayData(result,data);
if(timeList.contains(String.valueOf(time))){
break;
......@@ -82,7 +81,7 @@ public class DayuCommentAnalysis {
}
result = HttpClient.executeHttpRequestGet(url+time, headerMap);
}
System.out.println("====================="+data.size());
System.out.println("=====================评论下回复获取数=="+data.size());
return data;
}
......
......@@ -18,7 +18,7 @@ public class DayuCommentExample {
//14180961224021425316 这个为此参数
PoiExcelUtil poi = PoiExcelUtil.getInstance();
Map<String,Object> map = poi.importExcel("D://crawlerdata/UC评论采集.xlsx", 0);
Map<String,Object> map = poi.importExcel("D://crawlerdata/UC评论采集-1.xlsx", 0);
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
List<String> urlList = new ArrayList<String>();
......@@ -26,7 +26,12 @@ public class DayuCommentExample {
String url = "";
try {
url = map1.get("url")+"";
String articleId = url.split("aid=")[1].split("&")[0];
String articleId = "";
if(url.contains("aid")) {
articleId = url.split("aid=")[1].split("&")[0];
}else {
articleId = url;
}
List<Map<String,Object>> dataList = Dayu.getDayuCommentData(articleId);
if(dataList.size() <= 0) {
urlList.add(url);
......
......@@ -35,7 +35,7 @@ public class YidainzixunByWordExample {
headList.add("time");
headList.add("url");
System.out.println(listAll.size());
poi.exportExcel("D://crawlerdata/一点资讯-美.xlsx", "asd", headList, listAll);
poi.exportExcel("D://crawlerdata/一点资讯-美.xlsx", "asd", headList, listAll);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment