Commit f09aa1c9 by yangchen

增加大鱼的相关采集

parent 4c650e8d
...@@ -7,6 +7,8 @@ import java.util.Map; ...@@ -7,6 +7,8 @@ import java.util.Map;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject;
public class HeadGet { public class HeadGet {
/** /**
...@@ -336,17 +338,180 @@ public class HeadGet { ...@@ -336,17 +338,180 @@ public class HeadGet {
return headerMap; return headerMap;
} }
/**
 * Builds the HTTP request headers used when looking up iQiyi video
 * information by keyword (Host {@code so.iqiyi.com}).
 *
 * @param cookie value for the {@code Cookie} header; when {@code null} no
 *               {@code Cookie} header is added
 * @return a new mutable map of header name to header value
 */
public static Map<String,String> getAiqiyiBywordHeaderMap(String cookie) {
    Map<String, String> headers = new HashMap<>();
    headers.put("Host", "so.iqiyi.com");
    headers.put("Connection", "keep-alive");
    headers.put("Accept-Language", "zh-CN,zh;q=0.9");
    headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
    if (cookie == null) {
        // Anonymous request: leave the Cookie header out entirely.
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
/**
 * Builds the HTTP request headers used when fetching iQiyi video
 * information from a video link (Host {@code www.iqiyi.com}).
 *
 * @param cookie value for the {@code Cookie} header; when {@code null} no
 *               {@code Cookie} header is added
 * @return a new mutable map of header name to header value
 */
public static Map<String,String> getAiqiyiHeaderMap(String cookie) {
    final String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";
    final String userAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36";

    Map<String, String> result = new HashMap<>();
    result.put("Host", "www.iqiyi.com");
    result.put("Connection", "keep-alive");
    result.put("Accept-Language", "zh-CN,zh;q=0.9");
    result.put("Accept", accept);
    result.put("User-Agent", userAgent);
    // The Cookie header is optional and only sent when the caller supplies one.
    if (null != cookie) {
        result.put("Cookie", cookie);
    }
    return result;
}
/**
 * Builds the HTTP request headers used when fetching iQiyi play counts
 * (Host {@code cache.video.iqiyi.com}).
 *
 * @param cookie value for the {@code Cookie} header; when {@code null} no
 *               {@code Cookie} header is added
 * @return a new mutable map of header name to header value
 */
public static Map<String,String> getAiqiyiForCountHeaderMap(String cookie) {
    Map<String, String> requestHeaders = new HashMap<>();
    requestHeaders.put("Host", "cache.video.iqiyi.com");
    requestHeaders.put("Connection", "keep-alive");
    requestHeaders.put("Accept-Language", "zh-CN,zh;q=0.9");
    requestHeaders.put(
            "Accept",
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    requestHeaders.put(
            "User-Agent",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
    if (cookie == null) {
        // No cookie supplied: return the anonymous header set.
        return requestHeaders;
    }
    requestHeaders.put("Cookie", cookie);
    return requestHeaders;
}
/**
 * Builds the HTTP request headers for Pear Video requests
 * (Host {@code www.pearvideo.com}).
 *
 * @param cookie value for the {@code Cookie} header; when {@code null} no
 *               {@code Cookie} header is added
 * @return a new mutable map of header name to header value
 */
public static Map<String,String> getPearVideoByWordHeaderMap(String cookie) {
    final boolean hasCookie = cookie != null;

    Map<String, String> headers = new HashMap<>();
    headers.put("Host", "www.pearvideo.com");
    headers.put("Connection", "keep-alive");
    headers.put("Accept-Language", "zh-CN,zh;q=0.9");
    headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
    if (hasCookie) {
        // Only attach a Cookie header when the caller provided one.
        headers.put("Cookie", cookie);
    }
    return headers;
}
/**
 * Builds the HTTP request headers for Xigua Video requests
 * (Host {@code www.ixigua.com}).
 *
 * @param cookie value for the {@code Cookie} header; when {@code null} no
 *               {@code Cookie} header is added
 * @return a new mutable map of header name to header value
 */
public static Map<String,String> getXiguaByWordHeaderMap(String cookie) {
    final String host = "www.ixigua.com";
    final String acceptedTypes = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";
    final String browserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36";

    Map<String, String> out = new HashMap<>();
    out.put("Host", host);
    out.put("Connection", "keep-alive");
    out.put("Accept-Language", "zh-CN,zh;q=0.9");
    out.put("Accept", acceptedTypes);
    out.put("User-Agent", browserAgent);
    if (cookie == null) {
        // Nothing to add for anonymous requests.
        return out;
    }
    out.put("Cookie", cookie);
    return out;
}
/**
 * Builds the HTTP request headers for Youku video requests
 * (Host {@code www.soku.com}).
 *
 * @param cookie value for the {@code Cookie} header; when {@code null} no
 *               {@code Cookie} header is added
 * @return a new mutable map of header name to header value
 */
public static Map<String,String> getSoKuByWordHeaderMap(String cookie) {
    Map<String, String> sokuHeaders = new HashMap<>();

    // Fixed headers sent with every request.
    sokuHeaders.put("Host", "www.soku.com");
    sokuHeaders.put("Connection", "keep-alive");
    sokuHeaders.put("Accept-Language", "zh-CN,zh;q=0.9");
    sokuHeaders.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    sokuHeaders.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");

    // Optional header: present only when a cookie string was passed in.
    if (!(cookie == null)) {
        sokuHeaders.put("Cookie", cookie);
    }
    return sokuHeaders;
}
/**
 * Builds the HTTP request headers for a request to
 * {@code app.jike.ruguoapp.com} that declares a JSON body with a fixed
 * {@code Content-Length} of 39.
 *
 * @param cookie value for the {@code Cookie} header; when {@code null} no
 *               {@code Cookie} header is added
 * @return a new mutable map of header name to header value
 */
public static Map<String,String> getJikeComment39HeaderMap(String cookie) {
    Map<String, String> headers = new HashMap<>();
    headers.put("Host", "app.jike.ruguoapp.com");
    headers.put("Accept-Language", "zh-cn");
    headers.put("Accept", "*/*");
    headers.put("User-Agent", "%E5%8D%B3%E5%88%BB/989 CFNetwork/811.5.4 Darwin/16.7.0");
    headers.put("App-BuildNo", "989");
    headers.put("App-Version", "3.9.1");
    headers.put("Content-Type", "application/json");
    headers.put("Manufacturer", "Apple");
    // NOTE(review): the length is hard-coded, so the request body presumably
    // has to be exactly 39 bytes - confirm against the caller.
    headers.put("Content-Length", "39");
    headers.put("Connection", "keep-alive");
    headers.put("OS-Version", "Version 10.3.3 (Build 14G60)");
    if (cookie == null) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
/**
 * Builds the HTTP request headers for a request to
 * {@code app.jike.ruguoapp.com} that declares a fixed
 * {@code Content-Length} of 94. No {@code Content-Type} header is set.
 *
 * @param cookie value for the {@code Cookie} header; when {@code null} no
 *               {@code Cookie} header is added
 * @return a new mutable map of header name to header value
 */
public static Map<String,String> getJikeComment94HeaderMap(String cookie) {
    final String jikeUserAgent = "%E5%8D%B3%E5%88%BB/989 CFNetwork/811.5.4 Darwin/16.7.0";
    final String osVersion = "Version 10.3.3 (Build 14G60)";

    Map<String, String> result = new HashMap<>();
    result.put("Host", "app.jike.ruguoapp.com");
    result.put("Accept-Language", "zh-cn");
    result.put("Accept", "*/*");
    result.put("User-Agent", jikeUserAgent);
    result.put("App-BuildNo", "989");
    result.put("App-Version", "3.9.1");
    // NOTE(review): Content-Type ("application/json") was deliberately left
    // disabled in the original code; confirm whether that is still intended.
    result.put("Manufacturer", "Apple");
    // NOTE(review): the length is hard-coded, so the request body presumably
    // has to be exactly 94 bytes - confirm against the caller.
    result.put("Content-Length", "94");
    result.put("Connection", "keep-alive");
    result.put("OS-Version", osVersion);
    if (null != cookie) {
        result.put("Cookie", cookie);
    }
    return result;
}
/**
 * Builds the parameter map for a Jike comment request.
 *
 * @param targetId value stored under the {@code "targetId"} key (stored
 *                 as-is, including {@code null})
 * @param time     when non-null, stored as {@code "createdAt"} inside a
 *                 JSON object placed under the {@code "loadMoreKey"} key;
 *                 when {@code null} no {@code "loadMoreKey"} entry is added
 * @return a new mutable map holding the parameters
 */
public static Map<String,Object> getJikeCommentParamMap(String targetId,String time) {
    Map<String, Object> params = new HashMap<>();
    if (time != null) {
        // The paging key is only needed when a timestamp was supplied.
        JSONObject loadMoreKey = new JSONObject();
        loadMoreKey.put("createdAt", time);
        params.put("loadMoreKey", loadMoreKey);
    }
    params.put("targetId", targetId);
    return params;
}
public static void main(String[] args) { public static void main(String[] args) {
String url = "http://www.miaopai.com/show/H99oVYnsv47ejBqK8TMZXA__.htm"; String url = "https://app.jike.ruguoapp.com/1.0/messageComments/listPrimary";
String cookie = "kg_udid=9E907CB26A8E3CC24F416CB5CF360E9F; sessionId=f771a3317da8040ace111d192f5e32df; udid=59B30CA793DBDCA2D400F41C8B3DDA78; aliyungf_tc=AQAAAEVz1U5hFwoA6tbnc5AZYZww6PRM"; String cookie = "jike:config:searchPlaceholderLastInfo=1514465731446#0; jike:sess=eyJfdWlkIjoiNWE0NGRmMTlmOWM4NWYwMDExODJhMjkwIiwiX3Nlc3Npb25Ub2tlbiI6InQ5cExKaEpiTFdVeDFsbUxKZW9vMUlKMEsifQ==; jike:sess.sig=HBuRKsTsMIIR9aMDUdkNV_mGH1E";
Map<String, String> headerMap = HeadGet.getMiaoPaiByURlHeaderMap(null); Map<String, String> headerMap = HeadGet.getJikeComment94HeaderMap(cookie);
String result = HttpClient.executeHttpRequestGet(url, headerMap); Map<String,Object> paramMap = HeadGet.getJikeCommentParamMap("5a449a3d580d23001148412e","2017-12-28T10:17:50.601Z");
Document doc = Jsoup.parse(result); String result = HttpClient.executeHttpRequestPost(url, headerMap, paramMap);
String s = doc.select("body > div.box885 > div.contentLeft.contentLeft_detail > div.videoList.video_detail > div.videoIntr > div > div.personalAbout > div.personalData > p.personalDataN > a").text(); System.out.println(result);
System.out.println(s); System.out.println(result.length());
// System.out.println(result);
// System.out.println(result.length());
} }
} }
...@@ -74,7 +74,7 @@ public class Dayu { ...@@ -74,7 +74,7 @@ public class Dayu {
dataList.addAll(lists); dataList.addAll(lists);
while(true) { while(true) {
lists.clear(); lists.clear();
ZhiWeiTools.sleep(5000); ZhiWeiTools.sleep(3000);
System.out.println(url+"&hotValue="+i); System.out.println(url+"&hotValue="+i);
result = HttpClient.executeHttpRequestGet(url+"&hotValue="+i, headerMap); result = HttpClient.executeHttpRequestGet(url+"&hotValue="+i, headerMap);
lists = dayuCommentAnalysis.getDayuCommentData(result,articleId); lists = dayuCommentAnalysis.getDayuCommentData(result,articleId);
...@@ -83,6 +83,7 @@ public class Dayu { ...@@ -83,6 +83,7 @@ public class Dayu {
} }
dataList.addAll(lists); dataList.addAll(lists);
i -= 10; i -= 10;
System.out.println("=============已获取到的评论数"+dataList.size());
} }
return dataList; return dataList;
} catch (Exception e) { } catch (Exception e) {
......
...@@ -45,7 +45,6 @@ public class DayuCommentAnalysis { ...@@ -45,7 +45,6 @@ public class DayuCommentAnalysis {
int i = data.getInteger("reply_cnt"); int i = data.getInteger("reply_cnt");
dataMap.put("replay_count", i); dataMap.put("replay_count", i);
if(i > 0) { if(i > 0) {
// System.out.println(dataMap.toString());
dataList.addAll(getReplayData(id,articleId)); dataList.addAll(getReplayData(id,articleId));
} }
dataList.add(dataMap); dataList.add(dataMap);
...@@ -71,7 +70,7 @@ public class DayuCommentAnalysis { ...@@ -71,7 +70,7 @@ public class DayuCommentAnalysis {
List<Map<String,Object>> data = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> data = new ArrayList<Map<String,Object>>();
List<String> timeList = new ArrayList<String>(); List<String> timeList = new ArrayList<String>();
while(true) { while(true) {
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(2000);
long time = analysisReplayData(result,data); long time = analysisReplayData(result,data);
if(timeList.contains(String.valueOf(time))){ if(timeList.contains(String.valueOf(time))){
break; break;
...@@ -82,7 +81,7 @@ public class DayuCommentAnalysis { ...@@ -82,7 +81,7 @@ public class DayuCommentAnalysis {
} }
result = HttpClient.executeHttpRequestGet(url+time, headerMap); result = HttpClient.executeHttpRequestGet(url+time, headerMap);
} }
System.out.println("====================="+data.size()); System.out.println("=====================评论下回复获取数=="+data.size());
return data; return data;
} }
......
...@@ -18,7 +18,7 @@ public class DayuCommentExample { ...@@ -18,7 +18,7 @@ public class DayuCommentExample {
//14180961224021425316 这个为此参数 //14180961224021425316 这个为此参数
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
Map<String,Object> map = poi.importExcel("D://crawlerdata/UC评论采集.xlsx", 0); Map<String,Object> map = poi.importExcel("D://crawlerdata/UC评论采集-1.xlsx", 0);
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body"); List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
List<String> urlList = new ArrayList<String>(); List<String> urlList = new ArrayList<String>();
...@@ -26,7 +26,12 @@ public class DayuCommentExample { ...@@ -26,7 +26,12 @@ public class DayuCommentExample {
String url = ""; String url = "";
try { try {
url = map1.get("url")+""; url = map1.get("url")+"";
String articleId = url.split("aid=")[1].split("&")[0]; String articleId = "";
if(url.contains("aid")) {
articleId = url.split("aid=")[1].split("&")[0];
}else {
articleId = url;
}
List<Map<String,Object>> dataList = Dayu.getDayuCommentData(articleId); List<Map<String,Object>> dataList = Dayu.getDayuCommentData(articleId);
if(dataList.size() <= 0) { if(dataList.size() <= 0) {
urlList.add(url); urlList.add(url);
......
...@@ -35,7 +35,7 @@ public class YidainzixunByWordExample { ...@@ -35,7 +35,7 @@ public class YidainzixunByWordExample {
headList.add("time"); headList.add("time");
headList.add("url"); headList.add("url");
System.out.println(listAll.size()); System.out.println(listAll.size());
poi.exportExcel("D://crawlerdata/一点资讯-美.xlsx", "asd", headList, listAll); poi.exportExcel("D://crawlerdata/一点资讯-美.xlsx", "asd", headList, listAll);
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment