Commit a205f946 by yangchen

增加相关自媒体

parent 9c9ec722
package com.zhiwei.parse;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.AiqiyiByWordAnalysis;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Keyword-driven collector for iQiyi (爱奇艺) video search results.
 */
public class Aiqiyi {
    private static final Logger logger = LoggerFactory.getLogger(Aiqiyi.class);
    private static AiqiyiByWordAnalysis aiqiyiByWordAnalysis = new AiqiyiByWordAnalysis();

    /**
     * Collects iQiyi videos for a keyword.
     * <p>
     * Walks search result pages 1 through 20 (the URL pins the category segment to
     * "生活"), extracts the video links of every page and fetches each linked page
     * through {@code AiqiyiByWordAnalysis.getAiqiyiData}, pausing two seconds before
     * each detail request. The running total is printed to standard output after
     * every result page.
     *
     * @param word search keyword; it is URL-encoded as UTF-8 before use
     * @return the collected records; an empty list (never {@code null}) when the
     *         keyword cannot be encoded
     */
    public static List<Map<String,Object>> getAiqiyiByWordData(String word) {
        Map<String,String> searchHeaders = HeadGet.getAiqiyiBywordHeaderMap(null);
        Map<String,String> detailHeaders = HeadGet.getAiqiyiHeaderMap(null);
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();

        // The encoded keyword does not change between pages, so encode it once.
        String encodedWord;
        try {
            encodedWord = URLEncoder.encode(word, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            logger.error("获取爱奇艺数据出错", e);
            return dataList;
        }

        for (int page = 1; page <= 20; page++) {
            String url = "http://so.iqiyi.com/so/q_" + encodedWord + "_ctg_%E7%94%9F%E6%B4%BB_t_0_page_" + page + "_p_1_qc_0_rd__site__m_11_bitrate_?af=true";
            String result = HttpClient.executeHttpRequestGet(url, searchHeaders);
            List<String> urlList = aiqiyiByWordAnalysis.getAiqiyiUrlList(result);
            for (String videoUrl : urlList) {
                // Throttle detail requests.
                ZhiWeiTools.sleep(2000);
                Map<String,Object> map = aiqiyiByWordAnalysis.getAiqiyiData(videoUrl, detailHeaders);
                if (map != null) {
                    dataList.add(map);
                }
            }
            System.out.println("==============" + dataList.size());
        }
        return dataList;
    }
}
package com.zhiwei.parse;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.PearVideoByWordAnalysis;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Keyword-driven collector for Pear Video (梨视频) search results.
 */
public class PearVideo {
    private static final Logger logger = LoggerFactory.getLogger(PearVideo.class);
    private static PearVideoByWordAnalysis pearVideoByWordAnalysis = new PearVideoByWordAnalysis();

    /** Highest {@code start} offset requested from the search endpoint. */
    private static final int MAX_START = 9000;
    /** Number of entries the offset advances per request. */
    private static final int PAGE_STEP = 10;
    /** Paging stops after this many consecutive pages that yield no entries. */
    private static final int MAX_CONSECUTIVE_EMPTY_PAGES = 3;

    /**
     * Collects Pear Video search results for a keyword.
     * <p>
     * Requests {@code search_loading.jsp} with {@code start} = 0, 10, 20, ... up to
     * 9000, waiting four seconds after every page that produced entries or that is
     * followed by another request. Paging ends early once three pages in a row
     * produce no entries (unparsable or empty), instead of issuing all remaining
     * requests against an exhausted result set. Progress is printed to standard
     * output.
     *
     * @param word search keyword; it is URL-encoded as UTF-8 before use
     * @return the collected records; an empty list (never {@code null}) when nothing
     *         was collected or the keyword cannot be encoded
     */
    public static List<Map<String,Object>> getPearVideoData(String word) {
        Map<String,String> headerMap = HeadGet.getPearVideoByWordHeaderMap(null);
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();

        // The encoded keyword does not change between pages, so encode it once.
        String encodedWord;
        try {
            encodedWord = URLEncoder.encode(word, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            logger.error("获取数据出错", e);
            return dataList;
        }

        int emptyPages = 0;
        for (int start = 0; start <= MAX_START; start += PAGE_STEP) {
            String url = "http://www.pearvideo.com/search_loading.jsp?start=" + start + "&k=" + encodedWord;
            String result = HttpClient.executeHttpRequestGet(url, headerMap);
            List<Map<String,Object>> page = pearVideoByWordAnalysis.getPearVideoData(result);
            if (page != null && page.size() > 0) {
                dataList.addAll(page);
                emptyPages = 0;
            } else {
                emptyPages++;
            }
            System.out.println(start + "==========" + dataList.size());
            if (emptyPages >= MAX_CONSECUTIVE_EMPTY_PAGES) {
                // Several data-less pages in a row: treat the result set as exhausted.
                break;
            }
            ZhiWeiTools.sleep(4000);
        }
        return dataList;
    }
}
package com.zhiwei.parse;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.SoKuByWordAnalysis;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Keyword-driven collector for Soku (搜库) video search results.
 */
public class Soku {
    private static final Logger logger = LoggerFactory.getLogger(Soku.class);
    private static SoKuByWordAnalysis soKuByWordAnalysis = new SoKuByWordAnalysis();

    /**
     * Collects Soku search results for a keyword within one category.
     * <p>
     * Requests result pages 1 through 13 of the {@code search_video_ajax} endpoint,
     * parses each page with {@code SoKuByWordAnalysis.getSoKuData} and waits five
     * seconds after every page. Progress is printed to standard output.
     *
     * @param word search keyword; it is URL-encoded as UTF-8 before use
     * @param type value placed verbatim into the {@code cateid_} segment of the URL
     * @return the collected records; empty when nothing was collected or the keyword
     *         cannot be encoded
     */
    public static List<Map<String,Object>> getSoKuByWordData(String word,String type) {
        Map<String,String> headerMap = HeadGet.getSoKuByWordHeaderMap(null);
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();

        // Everything except the page number is constant, so build the prefix once.
        String baseUrl;
        try {
            baseUrl = "http://www.soku.com/search_video_ajax/q_" + URLEncoder.encode(word, "UTF-8") + "_orderby_3_cateid_" + type + "_limitdate_365?site=14&_lg=20&page=";
        } catch (UnsupportedEncodingException e) {
            logger.error("获取优酷视频出错", e);
            return dataList;
        }

        for (int page = 1; page < 14; page++) {
            String result = HttpClient.executeHttpRequestGet(baseUrl + page, headerMap);
            List<Map<String,Object>> lists = soKuByWordAnalysis.getSoKuData(result);
            if (lists != null && lists.size() > 0) {
                dataList.addAll(lists);
            }
            ZhiWeiTools.sleep(5000);
            System.out.println(word + "===" + type + "==========已获取到=" + dataList.size());
        }
        return dataList;
    }
}
package com.zhiwei.parse;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.WangyiCommentAnalysis;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Collector for NetEase (网易) comment threads.
 */
public class Wangyi {
    private static final Logger logger = LoggerFactory.getLogger(Wangyi.class);
    private static WangyiCommentAnalysis wangyiCommentAnalysis = new WangyiCommentAnalysis();

    /**
     * Collects the comments of one thread.
     * <p>
     * Requests the {@code newList} endpoint 30 comments at a time, advancing the
     * offset by 30 and waiting four seconds after each page, until a page yields no
     * new comments. De-duplication happens inside
     * {@code WangyiCommentAnalysis.getWangyiCommentData}, which is handed the list of
     * comment ids seen so far. Each requested URL and the running total are printed
     * to standard output.
     *
     * @param id thread id inserted into the request URL
     * @return the comments collected so far; on any exception the error is logged and
     *         the partial result is returned
     */
    public static List<Map<String,Object>> getWangyiCommentData(String id) {
        Map<String,String> headerMap = HeadGet.getWangyiCommentHeaderMap(null);
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        int offset = 0;
        // Ids of comments already collected; shared with the parser across pages.
        List<String> idList = new ArrayList<String>();
        try {
            while (true) {
                String url = "http://comment.dy.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/" + id + "/comments/newList?offset=" + offset + "&limit=30";
                String result = HttpClient.executeHttpRequestGet(url, headerMap);
                System.out.println(url);
                List<Map<String,Object>> lists = wangyiCommentAnalysis.getWangyiCommentData(result, idList);
                if (lists == null || lists.size() < 1) {
                    break;
                }
                dataList.addAll(lists);
                offset += 30;
                ZhiWeiTools.sleep(4000);
                System.out.println("==================已采集到的数据=" + dataList.size());
            }
            return dataList;
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is logged.
            logger.error("获取网易评论出错", e);
            return dataList;
        }
    }

    /**
     * Reads the comment count of one thread from the {@code tcount} field of the
     * thread resource.
     *
     * @param id thread id inserted into the request URL
     * @return the value of {@code tcount}, or 0 when the response is empty or does
     *         not contain that field
     */
    public static int getWangyiCommentCount(String id) {
        String url = "http://comment.dy.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/" + id;
        Map<String,String> headerMap = HeadGet.getWangyiCommentHeaderMap(null);
        String result = HttpClient.executeHttpRequestGet(url, headerMap);
        JSONObject json = JSONObject.parseObject(result);
        // Avoid a NullPointerException from unboxing when the field is missing.
        Integer count = json == null ? null : json.getInteger("tcount");
        return count == null ? 0 : count.intValue();
    }
}
package com.zhiwei.parse;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.XiguaAccountAnalysis;
import com.zhiwei.parse.analysis.XiguaByWordAnalysis;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Collector for Xigua Video (西瓜视频): keyword search and per-account history.
 */
public class XiGua {
    private static final Logger logger = LoggerFactory.getLogger(XiGua.class);
    private static XiguaByWordAnalysis xiguaByWordAnalysis = new XiguaByWordAnalysis();
    private static XiguaAccountAnalysis xiguaAccountAnalysis = new XiguaAccountAnalysis();

    /**
     * Collects videos matching a keyword.
     * <p>
     * Requests {@code search_content} 20 results at a time and waits four seconds
     * after every request. A request that yields no records is repeated at the same
     * offset; paging stops at the fourth consecutive request without records. Each
     * requested URL is printed to standard output.
     *
     * @param word search keyword; it is URL-encoded as UTF-8 before use
     * @return the collected records; empty when nothing was collected or the keyword
     *         cannot be encoded
     */
    public static List<Map<String,Object>> getXiguaVideoByWordData(String word) {
        Map<String,String> headerMap = HeadGet.getXiguaByWordHeaderMap(null);
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();

        // Everything except the offset is constant, so build the prefix once.
        String baseUrl;
        try {
            baseUrl = "https://www.ixigua.com/search_content/?format=json&autoload=true&count=20&keyword=" + URLEncoder.encode(word, "UTF-8") + "&cur_tab=1&offset=";
        } catch (UnsupportedEncodingException e) {
            logger.error("获取西瓜视频数据出错", e);
            return dataList;
        }

        int offset = 0;
        int emptyStreak = 0;
        while (true) {
            System.out.println(baseUrl + offset);
            String result = HttpClient.executeHttpRequestGet(baseUrl + offset, headerMap);
            List<Map<String,Object>> lists = xiguaByWordAnalysis.getXiguaData(result);
            boolean hasData = lists != null && lists.size() > 0;
            if (hasData) {
                dataList.addAll(lists);
                emptyStreak = 0;
            }
            ZhiWeiTools.sleep(4000);
            if (!hasData) {
                emptyStreak++;
                if (emptyStreak > 3) {
                    break;
                }
                // Retry the same offset.
                continue;
            }
            offset += 20;
        }
        return dataList;
    }

    /**
     * Collects the videos published by one account after a given time.
     * <p>
     * Pages through {@code /c/user/article/} using the {@code max_behot_time} cursor
     * returned in the {@code next} object of every response, waiting five seconds
     * between pages. Paging stops when a page yields no records newer than
     * {@code startTime}, or when the cursor does not advance. The running total is
     * printed to standard output.
     *
     * @param url account page URL; the user id is taken from it by
     *            {@code XiguaAccountAnalysis.getUid}
     * @param startTime lower time bound, parsed by {@code TimeParse.stringFormartDate}
     * @return the records collected so far; on any exception inside the paging loop
     *         the error is logged and the partial result is returned
     */
    public static List<Map<String,Object>> getXiguaAccountData(String url,String startTime) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        Map<String,String> headerMap = HeadGet.getXiguaByWordHeaderMap(null);
        String cursor = "0";
        long startMillis = TimeParse.stringFormartDate(startTime).getTime();
        try {
            // The user id does not change between pages, so extract it once.
            String uid = xiguaAccountAnalysis.getUid(url);
            while (true) {
                String pageUrl = "https://www.ixigua.com/c/user/article/?user_id=" + uid + "&max_behot_time=" + cursor + "&count=20";
                String result = HttpClient.executeHttpRequestGet(pageUrl, headerMap);
                JSONObject json = JSONObject.parseObject(result);
                String nextCursor = json.getJSONObject("next").getString("max_behot_time");
                List<Map<String,Object>> list = xiguaAccountAnalysis.getXiguaAccountData(json, startMillis);
                if (list == null || list.size() < 1) {
                    break;
                }
                dataList.addAll(list);
                System.out.println("==============已采集到数据=" + dataList.size());
                if (nextCursor == null || nextCursor.equals(cursor)) {
                    // The same request would be repeated forever; stop here.
                    break;
                }
                cursor = nextCursor;
                ZhiWeiTools.sleep(5000);
            }
            return dataList;
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is logged.
            logger.error("获取西瓜视频用户历史视频出错", e);
            return dataList;
        }
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Parsers for iQiyi (爱奇艺) search result pages and video pages.
 */
public class AiqiyiByWordAnalysis {
    private static final Logger logger = LoggerFactory.getLogger(AiqiyiByWordAnalysis.class);

    /**
     * Extracts the video links from a search result page.
     *
     * @param result HTML of the search result page
     * @return the {@code href} values of {@code a.figure-180101} inside each
     *         {@code ul.mod_result_list li.list_item} that are longer than one
     *         character; on a parse failure the error is logged and the links found
     *         so far are returned
     */
    public List<String> getAiqiyiUrlList(String result) {
        List<String> urlList = new ArrayList<String>();
        try {
            Document doc = Jsoup.parse(result);
            Elements items = doc.select("ul.mod_result_list").select("li.list_item");
            for (Element item : items) {
                String url = item.select("a.figure-180101").attr("href");
                if (url != null && url.length() > 1) {
                    urlList.add(url);
                }
            }
            return urlList;
        } catch (Exception e) {
            logger.error("解析出错", e);
            return urlList;
        }
    }

    /**
     * Downloads one video page and extracts its metadata.
     * <p>
     * The record holds {@code time}, {@code source}, {@code content}, {@code url},
     * {@code count} (from {@link #getVideo_count(String)}) and {@code title}.
     * NOTE(review): only pages whose publish-time text contains "2017" are kept;
     * this year is hard-coded.
     *
     * @param url video page URL
     * @param headerMap request headers for the page download
     * @return {@code null} when the publish-time text does not contain "2017";
     *         otherwise the record. If extraction fails part-way the error is logged
     *         and the record is returned with the fields gathered up to that point
     */
    public Map<String,Object> getAiqiyiData(String url,Map<String,String> headerMap) {
        Map<String,Object> dataMap = new HashMap<String,Object>();
        try {
            String result = HttpClient.executeHttpRequestGet(url, headerMap);
            Document doc = Jsoup.parse(result);
            String time = doc.select("#widget-vshort-ptime").text();
            if (!time.contains("2017")) {
                return null;
            }
            dataMap.put("time", time.split("发布时间:")[1]);
            String source = doc.select("#widget-vshort-un-inner").attr("title");
            dataMap.put("source", source);
            String content = doc.select("#widget-vshort-lesswrap").text();
            dataMap.put("content", content);
            dataMap.put("url", url);
            String title = doc.select("#widget-videotitle").attr("title");
            // The tvId is read from the raw page text, between " tvId: " and the next comma.
            String id = result.split(" tvId: ")[1].split(",")[0];
            ZhiWeiTools.sleep(2000);
            int count = getVideo_count(id);
            dataMap.put("count", count);
            dataMap.put("title", title);
            System.out.println(dataMap.toString());
            return dataMap;
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is logged.
            logger.error("解析出错", e);
            return dataMap;
        }
    }

    /**
     * Fetches the number reported by {@code cache.video.iqiyi.com/jp/pc/<id>/}.
     * <p>
     * The value is taken from the response text between the first {@code ':'} and
     * the following {@code '}'}.
     *
     * @param id video id (tvId)
     * @return the parsed number, or 0 when the request or the parsing fails (the
     *         failure is logged)
     */
    public int getVideo_count(String id) {
        try {
            String url = "http://cache.video.iqiyi.com/jp/pc/" + id + "/";
            Map<String,String> headerMap = HeadGet.getAiqiyiForCountHeaderMap(null);
            String result = HttpClient.executeHttpRequestGet(url, headerMap);
            // Trim so that surrounding whitespace does not turn a valid number into 0.
            String count = result.split(":")[1].split("\\}")[0].trim();
            return Integer.valueOf(count);
        } catch (Exception e) {
            logger.warn("获取爱奇艺播放量出错 id=" + id, e);
            return 0;
        }
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Parsers for Dayu (大鱼) keyword-search responses and article pages.
 */
public class DayuByWordAnalysis {
    private static final Logger logger = LoggerFactory.getLogger(DayuByWordAnalysis.class);

    /** Captures the value assigned to {@code xissJsonData} in an article page. */
    private static final Pattern XISS_JSON_PATTERN = Pattern.compile("xissJsonData = (.*);");

    /**
     * Converts a search response into records.
     * <p>
     * Reads {@code data.iflowItems} and, for every item, stores {@code title} and
     * {@code source} (markup stripped), {@code url} (from {@code zzd_url}),
     * {@code time} (from {@code publish_time}), {@code id} and {@code content}
     * (downloaded via {@link #getContent(String)}). Every record is printed to
     * standard output.
     *
     * @param result JSON text of the search response
     * @return the records parsed so far; on any exception the error is logged and the
     *         partial result is returned
     */
    public List<Map<String,Object>> getDayuByWordData(String result) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            JSONObject json = JSONObject.parseObject(result);
            JSONArray items = json.getJSONObject("data").getJSONArray("iflowItems");
            for (int i = 0; i < items.size(); i++) {
                Map<String,Object> map = new HashMap<String,Object>();
                JSONObject data = items.getJSONObject(i);
                map.put("title", data.getString("title").replaceAll("<.*?>", ""));
                String url = data.getString("zzd_url");
                map.put("url", url);
                map.put("time", TimeParse.dateFormartString(new Date(data.getLong("publish_time")), "yyyy-MM-dd HH:mm:ss"));
                map.put("id", data.getString("id"));
                map.put("source", data.getString("source_name").replaceAll("<.*?>", ""));
                map.put("content", getContent(url));
                System.out.println(map.toString());
                dataList.add(map);
            }
            return dataList;
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is logged.
            logger.error("解析出错", e);
            return dataList;
        }
    }

    /**
     * Downloads an article page and extracts its text.
     * <p>
     * Waits two seconds, fetches the page, locates the {@code xissJsonData = ...;}
     * assignment, parses the assigned value as JSON and returns its {@code content}
     * field with markup stripped.
     *
     * @param url article URL
     * @return the article text, or {@code null} when the page could not be fetched,
     *         contains no {@code xissJsonData} assignment, or cannot be parsed
     */
    public String getContent(String url) {
        ZhiWeiTools.sleep(2000);
        Map<String,String> headerMap = HeadGet.getDayuCommentHeaderMap(null);
        String result = HttpClient.executeHttpRequestGet(url, headerMap);
        if (result == null) {
            // Nothing to match against; do not let one failed download abort the caller's loop.
            return null;
        }
        try {
            Matcher matcher = XISS_JSON_PATTERN.matcher(result);
            if (matcher.find()) {
                JSONObject json = JSONObject.parseObject(matcher.group(1));
                return json.getString("content").replaceAll("<.*?>", "");
            }
            return null;
        } catch (Exception e) {
            logger.error("解析文本出错", e);
            System.out.println(result);
            return null;
        }
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Parser for Pear Video (梨视频) search result fragments.
 */
public class PearVideoByWordAnalysis {
    private static final Logger logger = LoggerFactory.getLogger(PearVideoByWordAnalysis.class);

    /**
     * Converts a search result fragment into records.
     * <p>
     * Every {@code li.result-list} element becomes one record with {@code title},
     * {@code time} (the text after "发表于"), {@code content}, {@code like},
     * {@code url} (the link's {@code href} appended to
     * {@code http://www.pearvideo.com/}) and {@code source}. An entry whose
     * publish-time text lacks the "发表于" marker keeps the raw text as its
     * {@code time} rather than discarding the whole page.
     *
     * @param result HTML returned by the search endpoint
     * @return the parsed records; on a parse failure the error is logged and the
     *         records parsed so far (possibly none) are returned, matching the other
     *         parsers in this package
     */
    public List<Map<String,Object>> getPearVideoData(String result) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            Document doc = Jsoup.parse(result);
            Elements entries = doc.select("li.result-list");
            for (Element entry : entries) {
                Map<String,Object> map = new HashMap<String,Object>();
                String title = entry.select("div.list-right > a > h2").text();
                map.put("title", title);
                String time = entry.select("div.list-right > a > div.publish-time").text();
                String[] timeParts = time.split("发表于");
                map.put("time", timeParts.length > 1 ? timeParts[1] : time);
                String content = entry.select("div.list-right > a > div.cont").text();
                map.put("content", content);
                String like = entry.select("span.like-num").text();
                map.put("like", like);
                String url = entry.select("div.list-right > a").attr("href");
                map.put("url", "http://www.pearvideo.com/" + url);
                String source = entry.select("div.list-right > div > a").text();
                map.put("source", source);
                dataList.add(map);
            }
            return dataList;
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is logged.
            logger.error("解析出错", e);
            return dataList;
        }
    }
}
package com.zhiwei.parse.analysis;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Placeholder for the QQ Kuaibao (QQKB) keyword-search parser.
 * It currently declares only a logger; no parsing method exists yet.
 */
public class QQKBByWordAnalysis {

    private static Logger logger =
            LoggerFactory.getLogger(QQKBByWordAnalysis.class);

    // TODO: unfinished stub left by the author:
    // public List<Map<String,Object>> get
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Parsers and helpers for QQ Kuaibao (QQKB) comment responses.
 */
public class QQKBCommentAnalysis {
    private static final Logger logger = LoggerFactory.getLogger(QQKBCommentAnalysis.class);

    private static final String TIME_FORMAT = "yyyy-MM-dd HH:mm:ss";

    /**
     * Builds the POST parameters for a follow-up comment request.
     * <p>
     * Takes the first object of the last entry in {@code comments.hot}, reads its
     * {@code coral_score} and {@code reply_id}, and passes them with the other
     * arguments to {@code HeadGet.getQQKBCommentParamMap2}.
     *
     * @param result JSON text of the current comment response
     * @param page page number forwarded to the parameter builder
     * @param comment_id comment id forwarded to the parameter builder
     * @param article_id article id forwarded to the parameter builder
     * @return the parameter map, or {@code null} when the response cannot be used
     *         (the failure is logged)
     */
    public Map<String, Object> getParamMap(String result,int page, String comment_id, String article_id) {
        try {
            JSONObject json = JSONObject.parseObject(result);
            JSONArray hot = json.getJSONObject("comments").getJSONArray("hot");
            JSONObject last = hot.getJSONArray(hot.size() - 1).getJSONObject(0);
            String coralScore = last.getString("coral_score");
            String replyId = last.getString("reply_id");
            return HeadGet.getQQKBCommentParamMap2(comment_id, page, coralScore, article_id, replyId);
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is logged.
            logger.error("构造post请求信息失败", e);
            return null;
        }
    }

    /**
     * Converts a comment response into records.
     * <p>
     * For every entry of {@code comments.hot} the first object becomes a record with
     * {@code content}, {@code time}, {@code name}, {@code like}, {@code reply_id} and
     * {@code reply_num} (0 when absent). When the comment carries a
     * {@code reply_num} value, its replies are downloaded through
     * {@link #getReplyCommentData} and added to the result before the comment
     * itself. Every record is printed to standard output.
     *
     * @param result JSON text of the comment response
     * @param cookie cookie used for the reply requests
     * @param comment_id comment id used for the reply requests
     * @param article_id article id used for the reply requests
     * @return the records parsed so far; on any exception the error is logged and the
     *         partial result is returned
     */
    public List<Map<String,Object>> getCommentData(String result,String cookie,String comment_id, String article_id) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            JSONObject json = JSONObject.parseObject(result);
            JSONArray hot = json.getJSONObject("comments").getJSONArray("hot");
            for (int i = 0; i < hot.size(); i++) {
                JSONObject data = hot.getJSONArray(i).getJSONObject(0);
                Map<String,Object> map = new HashMap<String,Object>();
                fillBasicFields(map, data);
                String replyId = data.getString("reply_id");
                int replyNum = 0;
                // Read the field itself: a substring test on the serialised object can match
                // unrelated text and then fail with a NullPointerException on unboxing.
                Integer rawReplyNum = data.getInteger("reply_num");
                if (rawReplyNum != null) {
                    replyNum = rawReplyNum.intValue();
                    List<Map<String,Object>> replies = getReplyCommentData(cookie, replyId, comment_id, article_id);
                    if (replies != null && replies.size() > 0) {
                        dataList.addAll(replies);
                    }
                }
                map.put("reply_id", replyId);
                map.put("reply_num", replyNum);
                System.out.println(map.toString());
                dataList.add(map);
            }
            return dataList;
        } catch (Exception e) {
            logger.error("解析数据出错", e);
            return dataList;
        }
    }

    /**
     * Converts one reply object into a record with {@code content}, {@code time},
     * {@code name}, {@code like} and {@code reply_id}, and prints it to standard
     * output.
     *
     * @param data JSON object of the reply
     * @return the record, or {@code null} when a field cannot be read (the failure is
     *         logged)
     */
    public Map<String,Object> getOneReplyComment(JSONObject data) {
        Map<String,Object> map = new HashMap<String,Object>();
        try {
            fillBasicFields(map, data);
            map.put("reply_id", data.getString("reply_id"));
            System.out.println(map.toString());
            return map;
        } catch (Exception e) {
            logger.error("获取单个回复评论出错", e);
            return null;
        }
    }

    /**
     * Downloads all replies to one comment.
     * <p>
     * Posts to {@code getQQNewsOrigReplyComment} with a three-second pause before
     * every request. The last object of each {@code comments.reply_list} entry
     * becomes a record, and the {@code reply_id} of the final reply on a page is the
     * cursor for the next request. Paging stops when {@code reply_list} is missing
     * or empty, or when the cursor does not advance.
     *
     * @param cookie cookie placed into the request headers
     * @param reply_id id of the comment whose replies are requested
     * @param comment_id comment id forwarded to the parameter builder
     * @param article_id article id forwarded to the parameter builder
     * @return the replies collected so far; on any exception the error is logged and
     *         the partial result is returned
     */
    public List<Map<String,Object>> getReplyCommentData(String cookie,String reply_id,String comment_id, String article_id) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        Map<String,String> headerMap = HeadGet.getQQKBCommentHeaderMap(cookie);
        try {
            String cursor = "";
            Map<String,Object> paramMap = HeadGet.getQQKBCommentReplyParamMap(null, comment_id, article_id, reply_id);
            while (true) {
                ZhiWeiTools.sleep(3000);
                String result = HttpClient.executeHttpRequestPost("http://r.cnews.qq.com/getQQNewsOrigReplyComment", headerMap, paramMap);
                JSONObject comments = JSONObject.parseObject(result).getJSONObject("comments");
                if (comments.getString("reply_list") == null) {
                    break;
                }
                JSONArray replyList = comments.getJSONArray("reply_list");
                if (replyList.size() < 1) {
                    break;
                }
                String lastReplyId = cursor;
                for (int i = 0; i < replyList.size(); i++) {
                    JSONArray chain = replyList.getJSONArray(i);
                    JSONObject data = chain.getJSONObject(chain.size() - 1);
                    Map<String,Object> map = getOneReplyComment(data);
                    if (map != null) {
                        dataList.add(map);
                    }
                    lastReplyId = data.getString("reply_id");
                }
                if (lastReplyId == null || lastReplyId.equals(cursor)) {
                    // The next request would repeat an earlier one; stop instead of looping forever.
                    break;
                }
                cursor = lastReplyId;
                paramMap.clear();
                paramMap = HeadGet.getQQKBCommentReplyParamMap(cursor, comment_id, article_id, reply_id);
            }
            return dataList;
        } catch (Exception e) {
            logger.error("获取评论回复出错", e);
            return dataList;
        }
    }

    /** Copies the fields shared by comments and replies into {@code map}. */
    private void fillBasicFields(Map<String,Object> map, JSONObject data) {
        map.put("content", data.getString("reply_content"));
        // tipstime is multiplied by 1000 before being turned into a Date.
        map.put("time", TimeParse.dateFormartString(new Date(Long.valueOf(data.getString("tipstime")) * 1000L), TIME_FORMAT));
        map.put("name", data.getString("nick"));
        Integer agreeCount = data.getInteger("agree_count");
        map.put("like", agreeCount == null ? 0 : agreeCount.intValue());
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.timeParse.TimeUtil;
/**
 * Parser for Soku (搜库) search result fragments.
 */
public class SoKuByWordAnalysis {
    private static final Logger logger = LoggerFactory.getLogger(SoKuByWordAnalysis.class);

    private static final long SECOND_MS = 1000L;
    private static final long MINUTE_MS = 60L * SECOND_MS;
    private static final long HOUR_MS = 60L * MINUTE_MS;
    private static final long DAY_MS = 24L * HOUR_MS;
    /** A month is approximated as 30 days. */
    private static final long MONTH_MS = 30L * DAY_MS;

    /**
     * Converts a search result fragment into records.
     * <p>
     * Every {@code div.v} element becomes one record with {@code url},
     * {@code title}, {@code source}, {@code play_count} (an {@code Integer}) and
     * {@code time}. An entry that cannot be parsed is logged and skipped so that it
     * does not discard the remaining entries of the page.
     *
     * @param result HTML returned by the search endpoint
     * @return the parsed records; empty when the fragment itself cannot be parsed
     */
    public List<Map<String,Object>> getSoKuData(String result) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            Document doc = Jsoup.parse(result);
            Elements entries = doc.select("div.v");
            for (Element entry : entries) {
                try {
                    dataList.add(parseEntry(entry));
                } catch (Exception e) {
                    logger.error("解析出错", e);
                }
            }
            return dataList;
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is logged.
            logger.error("解析出错", e);
            return dataList;
        }
    }

    /** Builds the record for one {@code div.v} element. */
    private Map<String,Object> parseEntry(Element entry) {
        Map<String,Object> map = new HashMap<String,Object>();
        Elements meta = entry.select("div.va").select("div.v-meta-entry").select("div.v-meta-data");
        String title = entry.select("div.v-thumb").select("img").attr("alt");
        String url = entry.select("div.va").select("div.v-meta-title").select("a").attr("href");
        String source = meta.select("span.username").text();
        String playCount = meta.select("span.pub").text().replaceAll(",", "");
        String time = meta.select("span.r").text();
        time = TimeParse.dateFormartString(stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");
        map.put("url", url);
        map.put("title", title);
        map.put("source", source);
        map.put("play_count", Integer.valueOf(playCount.trim()));
        map.put("time", time);
        return map;
    }

    /**
     * Converts a relative or numeric time text into a {@link Date}.
     * <p>
     * Checked in this order: text containing "天" (days), "小时" (hours), "分"
     * (minutes) or "秒" (seconds) is read as that many units before the current
     * time; text accepted by {@code TimeUtil.isNum} is used directly as epoch
     * milliseconds; text containing "月" is read as that many 30-day months before
     * the current time.
     *
     * @param time the time text
     * @return the resulting date, or {@code null} when {@code time} is {@code null}
     *         or matches none of the forms above
     * @throws NumberFormatException when the text before the unit is not a number
     */
    public Date stringFormartDate(String time){
        if (time == null) {
            return null;
        }
        long now = new Date().getTime();
        Long timelong = null;
        if (time.contains("天")) {
            timelong = Long.valueOf(now - amountBefore(time, "天") * DAY_MS);
        } else if (time.contains("小时")) {
            timelong = Long.valueOf(now - amountBefore(time, "小时") * HOUR_MS);
        } else if (time.contains("分")) {
            timelong = Long.valueOf(now - amountBefore(time, "分") * MINUTE_MS);
        } else if (time.contains("秒")) {
            timelong = Long.valueOf(now - amountBefore(time, "秒") * SECOND_MS);
        } else if (TimeUtil.isNum(time)) {
            timelong = Long.valueOf(time);
        } else if (time.contains("月")) {
            timelong = Long.valueOf(now - amountBefore(time, "月") * MONTH_MS);
        }
        return timelong == null ? null : new Date(timelong.longValue());
    }

    /** Parses the number that precedes {@code unit}, ignoring surrounding whitespace. */
    private static long amountBefore(String text, String unit) {
        return Long.parseLong(text.split(unit)[0].trim());
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
/**
 * Parser for NetEase (网易) comment list responses.
 */
public class WangyiCommentAnalysis {
    private static final Logger logger = LoggerFactory.getLogger(WangyiCommentAnalysis.class);

    /**
     * Converts a comment list response into records, skipping comments already seen.
     * <p>
     * Every value of the {@code comments} object becomes one record with {@code id}
     * (from {@code postId}), {@code content} (markup stripped), {@code time},
     * {@code name} ("火星网友" when the comment has no user nickname), {@code like}
     * and {@code unlike}.
     *
     * @param result JSON text of the response
     * @param idList ids of comments collected earlier; comments whose id is in the
     *               list are skipped, and the id of every new comment is appended
     * @return the new records; empty when the response has no {@code comments}
     *         object; on any exception the error is logged and the records parsed so
     *         far are returned
     */
    public List<Map<String,Object>> getWangyiCommentData(String result,List<String> idList) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            JSONObject json = JSONObject.parseObject(result);
            JSONObject comments = json == null ? null : json.getJSONObject("comments");
            if (comments == null) {
                // Nothing to parse; this is not an error worth a stack trace.
                return dataList;
            }
            for (Map.Entry<String, Object> entry : comments.entrySet()) {
                JSONObject data = comments.getJSONObject(entry.getKey());
                String id = data.getString("postId");
                if (idList.contains(id)) {
                    continue;
                }
                idList.add(id);
                Map<String,Object> map = new HashMap<String,Object>();
                map.put("id", id);
                map.put("content", data.getString("content").replaceAll("<.*?>", ""));
                map.put("time", data.getString("createTime"));
                JSONObject user = data.getJSONObject("user");
                String nickname = user == null ? null : user.getString("nickname");
                map.put("name", nickname == null ? "火星网友" : nickname);
                map.put("like", data.getInteger("vote"));
                map.put("unlike", data.getInteger("against"));
                dataList.add(map);
            }
            return dataList;
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is logged.
            logger.error("解析出错", e);
            return dataList;
        }
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
/**
 * Parser for Xigua Video (西瓜视频) account article lists.
 */
public class XiguaAccountAnalysis {
    // Bound to this class; it was previously created for XiguaByWordAnalysis.
    private static final Logger logger = LoggerFactory.getLogger(XiguaAccountAnalysis.class);

    /**
     * Extracts the user id from an account URL: the text between {@code "user/"}
     * and the next {@code '/'}.
     *
     * @param url account page URL
     * @return the user id
     * @throws ArrayIndexOutOfBoundsException when the URL has nothing after
     *         {@code "user/"}
     */
    public String getUid(String url) {
        return url.split("user/")[1].split("/")[0];
    }

    /**
     * Converts one page of an account's article list into records.
     * <p>
     * Items of the {@code data} array whose {@code behot_time} (multiplied by 1000)
     * is not later than {@code startTime} are skipped. Each remaining item becomes a
     * record with {@code time}, {@code title}, {@code url}, {@code content},
     * {@code comments_count}, {@code source} and {@code video_watch_count}.
     *
     * @param json parsed response
     * @param startTime lower bound in epoch milliseconds (exclusive)
     * @return the records; empty when the response has no {@code data} array; on any
     *         exception the error is logged and the records parsed so far are
     *         returned
     */
    public List<Map<String,Object>> getXiguaAccountData(JSONObject json,long startTime) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            JSONArray items = json == null ? null : json.getJSONArray("data");
            if (items == null) {
                return dataList;
            }
            for (int i = 0; i < items.size(); i++) {
                JSONObject data = items.getJSONObject(i);
                long time = data.getLong("behot_time") * 1000L;
                if (time <= startTime) {
                    continue;
                }
                Map<String,Object> map = new HashMap<String,Object>();
                map.put("time", TimeParse.dateFormartString(new Date(time), "yyyy-MM-dd HH:mm:ss"));
                map.put("title", data.getString("title"));
                map.put("url", data.getString("display_url"));
                map.put("content", data.getString("abstract"));
                map.put("comments_count", data.getInteger("comments_count"));
                map.put("source", data.getString("source"));
                map.put("video_watch_count", data.getInteger("video_watch_count"));
                dataList.add(map);
            }
            return dataList;
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is logged.
            logger.error("解析出错", e);
            return dataList;
        }
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
/**
 * Parser for Xigua Video (西瓜视频) keyword-search responses.
 */
public class XiguaByWordAnalysis {
    private static final Logger logger = LoggerFactory.getLogger(XiguaByWordAnalysis.class);

    /**
     * Converts a search response into records.
     * <p>
     * Every item of the {@code data} array becomes one record with {@code title},
     * {@code content} (from {@code abstract}), {@code time} (from
     * {@code create_time}, multiplied by 1000), {@code url}, {@code play_count},
     * {@code comment_count}, {@code source}, {@code like} and {@code unlike}.
     *
     * @param result JSON text of the search response
     * @return the records; empty when the response is empty or has no {@code data}
     *         array; on any exception the error is logged and the records parsed so
     *         far are returned
     */
    public List<Map<String,Object>> getXiguaData(String result) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            JSONObject json = JSONObject.parseObject(result);
            JSONArray items = json == null ? null : json.getJSONArray("data");
            if (items == null) {
                // An empty page is expected at the end of paging; no error log needed.
                return dataList;
            }
            for (int i = 0; i < items.size(); i++) {
                Map<String,Object> map = new HashMap<String,Object>();
                JSONObject data = items.getJSONObject(i);
                map.put("title", data.getString("title"));
                map.put("content", data.getString("abstract"));
                map.put("time", TimeParse.dateFormartString(new Date(data.getLong("create_time") * 1000L), "yyyy-MM-dd HH:mm:ss"));
                map.put("url", data.getString("display_url"));
                map.put("play_count", data.getInteger("video_watch_count"));
                map.put("comment_count", data.getString("comment_count"));
                map.put("source", data.getString("source"));
                map.put("like", data.getInteger("digg_count"));
                map.put("unlike", data.getInteger("bury_count"));
                dataList.add(map);
            }
            return dataList;
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is logged.
            logger.error("解析西瓜视频信息出错", e);
            return dataList;
        }
    }
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Aiqiyi;
/**
 * Example run: collects iQiyi videos for several keywords and exports them to an
 * Excel workbook.
 */
public class AiqiyiByWordExample {

    /**
     * Queries each keyword in turn, merges the non-empty results and writes them to
     * the workbook with one column per record field.
     */
    @Test
    public void aiqiyiByWordTest() {
        String[] keywords = "美食,味道,菜".split(",");
        List<Map<String,Object>> rows = new ArrayList<Map<String,Object>>();
        for (int k = 0; k < keywords.length; k++) {
            List<Map<String,Object>> found = Aiqiyi.getAiqiyiByWordData(keywords[k]);
            if (found == null || found.size() < 1) {
                continue;
            }
            rows.addAll(found);
        }

        // Column order of the exported sheet.
        String[] columnNames = { "count", "time", "source", "content", "url", "title" };
        List<String> columns = new ArrayList<String>();
        for (String columnName : columnNames) {
            columns.add(columnName);
        }

        PoiExcelUtil exporter = PoiExcelUtil.getInstance();
        exporter.exportExcel("D://crawlerdata/爱奇艺关键词采集.xlsx", "数据", columns, rows);
    }
}
package com.zhiwei.crawler;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.parse.Dayu;
/**
 * Example run: searches Dayu for one keyword and prints how many records came back.
 */
public class DayuByWordExample {

    /** Runs the keyword search for "京东" and prints the size of the result list. */
    @Test
    public void dayuByWordTest() {
        List<Map<String,Object>> found = Dayu.getDayuByWordData("京东");
        int total = found.size();
        System.out.println(total);
    }
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.PearVideo;
/**
 * Example run: collects Pear Video search results for one keyword and exports them
 * to an Excel workbook.
 */
public class PearVideoByWordExample {

    /**
     * Searches for "美食" and writes the records to the workbook with one column per
     * record field. A {@code null} result is exported as an empty sheet instead of
     * being handed to the exporter as {@code null}.
     */
    @Test
    public void pearVideoByWordTest() {
        String word = "美食";
        List<Map<String,Object>> bodyList = PearVideo.getPearVideoData(word);
        if (bodyList == null) {
            // Same null guard the other examples apply to collector results.
            bodyList = new ArrayList<Map<String,Object>>();
        }
        List<String> headList = new ArrayList<String>();
        headList.add("time");
        headList.add("title");
        headList.add("content");
        headList.add("url");
        headList.add("like");
        headList.add("source");
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        poi.exportExcel("D://crawlerdata/梨视频关键词采集.xlsx", "梨视频采集结果", headList, bodyList);
    }
}
package com.zhiwei.crawler;
import org.junit.Test;
import com.zhiwei.parse.QQKB;
/**
 * Example for the QQ Kuaibao keyword search. The search call is disabled, so the
 * test currently does nothing.
 */
public class QQKBByWordExample {

    /**
     * Placeholder for the keyword-search example.
     * <p>
     * The unused locals were removed; one of them was a hard-coded session cookie
     * carrying access and refresh tokens, which must not live in source control.
     * When the search is re-enabled, supply the cookie from outside the source
     * (system property or environment variable).
     */
    @Test
    public void qqkbByWordTest() {
        // Disabled by the author: the next result page cannot be located.
        // String word = "麦当劳";
        // QQKB.getQQKBByWordData(word, cookie);
    }
}
package com.zhiwei.crawler;
import org.junit.Test;
import com.zhiwei.parse.QQKB;
/**
 * Example run: prints the comment count of one QQ Kuaibao article.
 */
public class QQKBCommentCountExample {

    /**
     * Requests the comment count for a fixed comment/article id pair and prints it.
     * <p>
     * The session cookie is read from the {@code qqkb.cookie} system property or,
     * failing that, the {@code QQKB_COOKIE} environment variable. It used to be a
     * literal in this file, which leaked account tokens into source control.
     *
     * @throws IllegalStateException when no cookie is configured
     */
    @Test
    public void qqkbCommentCountTest() {
        String cookie = System.getProperty("qqkb.cookie", System.getenv("QQKB_COOKIE"));
        if (cookie == null || cookie.length() == 0) {
            throw new IllegalStateException(
                    "QQKB cookie not configured: set -Dqqkb.cookie=... or the QQKB_COOKIE environment variable");
        }
        String comment_id = "2334642420";
        String article_id = "20180103A09WKN00";
        int i = QQKB.getCommentCount(cookie, comment_id, article_id);
        System.out.println(i);
    }
}
package com.zhiwei.crawler;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.parse.QQKB;
/**
 * Example run: collects the comments of one QQ Kuaibao article and prints how many
 * were found.
 */
public class QQKBCommentExample {

    /**
     * Collects the comments for a fixed comment/article id pair and prints the size
     * of the result (0 when the collector returns {@code null}).
     * <p>
     * The session cookie is read from the {@code qqkb.cookie} system property or,
     * failing that, the {@code QQKB_COOKIE} environment variable. It used to be a
     * literal in this file, which leaked account tokens into source control.
     *
     * @throws IllegalStateException when no cookie is configured
     */
    @Test
    public void qqkbCommentTest() {
        String cookie = System.getProperty("qqkb.cookie", System.getenv("QQKB_COOKIE"));
        if (cookie == null || cookie.length() == 0) {
            throw new IllegalStateException(
                    "QQKB cookie not configured: set -Dqqkb.cookie=... or the QQKB_COOKIE environment variable");
        }
        String comment_id = "2334642420";
        String article_id = "20180103A09WKN00";
        List<Map<String,Object>> dataList = QQKB.getQQKBCommentData(cookie, comment_id, article_id);
        System.out.println(dataList == null ? 0 : dataList.size());
    }
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Soku;
/**
 * Manual example that searches Soku for several keywords and exports the hits to Excel.
 */
public class SoKuByWordExample {

    /**
     * Queries {@code Soku.getSoKuByWordData} for every keyword/type pair, gathers all
     * non-empty results into one list and writes them to a spreadsheet.
     */
    @Test
    public void sokuByWordTest() {
        final String[] keywords = "美食,味道,吃,试吃,美味,好吃".split(",");
        // Type codes are passed to Soku unchanged; their meaning is not visible in this class.
        final String[] typeCodes = "174,103,176".split(",");

        final List<Map<String, Object>> rows = new ArrayList<Map<String, Object>>();
        for (int k = 0; k < keywords.length; k++) {
            for (int t = 0; t < typeCodes.length; t++) {
                final List<Map<String, Object>> hits =
                        Soku.getSoKuByWordData(keywords[k], typeCodes[t]);
                if (hits == null || hits.isEmpty()) {
                    continue;
                }
                rows.addAll(hits);
            }
        }

        final PoiExcelUtil excel = PoiExcelUtil.getInstance();
        final List<String> columns = headers("title", "time", "play_count", "url", "source");
        excel.exportExcel("D://crawlerdata/优酷采集.xlsx", "优酷数据", columns, rows);
    }

    /** Builds a mutable list holding the given column names in order. */
    private static List<String> headers(String... names) {
        final List<String> result = new ArrayList<String>();
        for (int i = 0; i < names.length; i++) {
            result.add(names[i]);
        }
        return result;
    }
}
package com.zhiwei.crawler;
import org.junit.Test;
import com.zhiwei.parse.Wangyi;
/**
 * Manual example that asks {@code Wangyi} for the comment count of one item.
 */
public class WangyiCommentCountExample {

    /**
     * Calls {@code Wangyi.getWangyiCommentCount} with a hard-coded id and prints the result.
     */
    @Test
    public void wangyiCommentCountTest() {
        final String targetId = "D77CENT50001875P";
        final int commentCount = Wangyi.getWangyiCommentCount(targetId);
        System.out.println(commentCount);
    }
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Wangyi;
/**
 * Manual example that fetches the comments for one id through {@code Wangyi}
 * and exports them to Excel.
 */
public class WangyiCommentExample {

    // If this fails, the data may contain duplicates; treat the id as authoritative.
    /**
     * Fetches the comment rows, prints how many were returned, then writes them to a
     * spreadsheet with a fixed set of columns.
     */
    @Test
    public void wangyiCommentTest() {
        final List<Map<String, Object>> comments = Wangyi.getWangyiCommentData("D77CENT50001875P");
        System.out.println(comments.size());

        final String[] columnNames = {"content", "id", "time", "name", "like", "unlike"};
        final List<String> columns = new ArrayList<String>();
        for (String columnName : columnNames) {
            columns.add(columnName);
        }

        PoiExcelUtil.getInstance()
                .exportExcel("D:\\crawlerdata\\网易评论采集测试.xlsx", "asd", columns, comments);
    }
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.XiGua;
/**
 * Manual example that reads account home-page URLs from an Excel sheet, collects data
 * for each account through {@code XiGua.getXiguaAccountData}, and exports the combined rows.
 */
public class XiguaAccountExample {

    /**
     * Imports sheet 0 of the workbook, looks up the "主页" column of every row, fetches
     * the account data for each usable URL and exports everything that was collected.
     *
     * <p>The export is written to the same path the input was read from.
     */
    @Test
    public void xiguaAccountTest() {
        String path = "D:\\crawlerdata\\西瓜视频采集12.28.xlsx";
        // Passed to XiGua unchanged; presumably the earliest time to collect from (confirm against XiGua).
        String startTime = "2017-01-01 00:00:00";

        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
        Map<String, Object> map = poi.importExcel(path, 0);

        // The "body" entry is stored as Object; this code relies on it being a list of row maps.
        @SuppressWarnings("unchecked")
        List<Map<String, Object>> lists = (List<Map<String, Object>>) map.get("body");

        for (Map<String, Object> row : lists) {
            // String.valueOf never returns null; a missing cell becomes "null" (length 4),
            // which the length check below filters out.
            String url = String.valueOf(row.get("主页"));
            if (url.length() > 5) {
                List<Map<String, Object>> accountRows = XiGua.getXiguaAccountData(url, startTime);
                // Check the fetched result itself, not the input sheet rows.
                if (accountRows != null && !accountRows.isEmpty()) {
                    bodyList.addAll(accountRows);
                }
            }
        }

        List<String> headList = new ArrayList<String>();
        headList.add("title");
        headList.add("comments_count");
        headList.add("time");
        headList.add("content");
        headList.add("url");
        headList.add("video_watch_count");
        headList.add("source");
        poi.exportExcel(path, "数据采集结果", headList, bodyList);
    }
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.XiGua;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Manual example that searches XiGua videos for several keywords and exports the hits to Excel.
 */
public class XiguaByWordExample {

    /**
     * Queries {@code XiGua.getXiguaVideoByWordData} once per keyword, pausing between
     * keywords and printing the running total, then writes all rows to a spreadsheet.
     */
    @Test
    public void XiguaByWordTest() {
        final List<Map<String, Object>> collected = new ArrayList<Map<String, Object>>();

        for (String keyword : "美食,味道,吃,试吃,美味,好吃".split(",")) {
            final List<Map<String, Object>> hits = XiGua.getXiguaVideoByWordData(keyword);
            if (hits != null && !hits.isEmpty()) {
                collected.addAll(hits);
            }
            // Pause between keywords before reporting the running total.
            ZhiWeiTools.sleep(5000);
            System.out.println("============总数" + collected.size());
        }

        final PoiExcelUtil excel = PoiExcelUtil.getInstance();
        final String[] columnNames = {
            "title", "time", "content", "like", "unlike",
            "play_count", "source", "comment_count", "url"
        };
        final List<String> columns = new ArrayList<String>();
        for (int i = 0; i < columnNames.length; i++) {
            columns.add(columnNames[i]);
        }
        excel.exportExcel("D://crawlerdata/西瓜美食-1.xlsx", "西瓜好吃不", columns, collected);
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment