Commit b3d545a3 by yangchen

b站关键词 和 搜狐等视频采集

parent 05c92686
...@@ -27,25 +27,25 @@ public class Aiqiyi { ...@@ -27,25 +27,25 @@ public class Aiqiyi {
*/ */
public static List<Map<String,Object>> getAiqiyiByWordData(String word,Proxy proxy) { public static List<Map<String,Object>> getAiqiyiByWordData(String word,Proxy proxy) {
Map<String,String> headerMap = HeadGet.getAiqiyiBywordHeaderMap(null); Map<String,String> headerMap = HeadGet.getAiqiyiBywordHeaderMap(null);
Map<String,String> headerMap1 = HeadGet.getAiqiyiHeaderMap(null);
List<Map<String,Object>> dataList = new ArrayList<>(); List<Map<String,Object>> dataList = new ArrayList<>();
try { try {
for(int i = 1;i <= 20;i++) { for(int i = 1;i <= 20;i++) {
String url = "http://so.iqiyi.com/so/q_"+URLEncoder.encode(word, "UTF-8")+"_ctg_%E7%94%9F%E6%B4%BB_t_0_page_"+i+"_p_1_qc_0_rd__site__m_11_bitrate_?af=true"; int count = dataList.size();
String url = "https://so.iqiyi.com/so/q_"+URLEncoder.encode(word, "UTF-8")+"_ctg__t_0_page_"+i+"_p_1_qc_0_rd__site__m_4_bitrate_";
System.out.println(url);
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap); String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
List<String> urlList = aiqiyiByWordAnalysis.getAiqiyiUrlList(result); List<Map<String,Object>> map = aiqiyiByWordAnalysis.getAiqiyiData(result);
for(String newurl : urlList) {
ZhiWeiTools.sleep(2000);
Map<String,Object> map = aiqiyiByWordAnalysis.getAiqiyiData(newurl, headerMap1,proxy);
if(map != null) { if(map != null) {
dataList.add(map); dataList.addAll(map);
} }
if(count == dataList.size()) {
break;
} }
System.out.println("=============="+dataList.size()); System.out.println("=============="+dataList.size());
ZhiWeiTools.sleep(2000);
} }
return dataList; return dataList;
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {
e.printStackTrace();
logger.info("采集数据出错:{}",e.getMessage()); logger.info("采集数据出错:{}",e.getMessage());
return dataList; return dataList;
} }
......
...@@ -13,10 +13,8 @@ import org.slf4j.LoggerFactory; ...@@ -13,10 +13,8 @@ import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpRequestBuilder; import com.zhiwei.crawler.core.HttpRequestBuilder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.analysis.BilibilikeyWordAnalysis; import com.zhiwei.parse.analysis.BilibilikeyWordAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.util.WordReadFile;
import okhttp3.Headers; import okhttp3.Headers;
import okhttp3.Request; import okhttp3.Request;
...@@ -34,7 +32,6 @@ public class BiliBili { ...@@ -34,7 +32,6 @@ public class BiliBili {
Headers header = Headers.of("cookie",cookie,"Referer","https://www.bilibili.com/","Host","search.bilibili.com"); Headers header = Headers.of("cookie",cookie,"Referer","https://www.bilibili.com/","Host","search.bilibili.com");
Request request = HttpRequestBuilder.newGetRequest(url, header); Request request = HttpRequestBuilder.newGetRequest(url, header);
String result = httpBoot.syncCall(request, proxy).body().string(); String result = httpBoot.syncCall(request, proxy).body().string();
// System.out.println(result);
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(3000);
Map<String,Object> map = BilibilikeyWordAnalysis.getData(result); Map<String,Object> map = BilibilikeyWordAnalysis.getData(result);
boolean more = (boolean) map.get("more"); boolean more = (boolean) map.get("more");
...@@ -70,28 +67,7 @@ public class BiliBili { ...@@ -70,28 +67,7 @@ public class BiliBili {
return Collections.emptyList(); return Collections.emptyList();
} }
public static void main(String[] args) {
List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274";
for(String word : wordList) {
List<Map<String,Object>> dataList = BiliBili.getData(word, null,cookie);
if(dataList != null) {
System.out.println(word + " ----- " + dataList.size());
bodyList.addAll(dataList);
}
}
List<String> headlist = new ArrayList<String>();
headlist.add("submitcount");
headlist.add("playcount");
headlist.add("time");
headlist.add("source");
headlist.add("title");
headlist.add("url");
PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D://crawlerdata//bilibili关键词采集数据-txh.xlsx", "B站数据", headlist, bodyList);
}
} }
package com.zhiwei.parse.analysis; package com.zhiwei.parse.analysis;
import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
...@@ -15,82 +15,52 @@ import org.slf4j.LoggerFactory; ...@@ -15,82 +15,52 @@ import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response; import okhttp3.Response;
public class AiqiyiByWordAnalysis { public class AiqiyiByWordAnalysis {
private static Logger logger = LoggerFactory.getLogger(AiqiyiByWordAnalysis.class); private static Logger logger = LoggerFactory.getLogger(AiqiyiByWordAnalysis.class);
private static HttpBoot httpBoot = new HttpBoot();
/** public List<Map<String,Object>> getAiqiyiData(String result) {
* List<Map<String,Object>> dataMap = new ArrayList<>();
* @Description 解析出所有有用链接
* @param result
* @return
*/
public List<String> getAiqiyiUrlList(String result) {
List<String> urlList = new ArrayList<String>();
try { try {
Document doc = Jsoup.parse(result); Document doc = Jsoup.parse(result);
Elements elements = doc.select("ul.mod_result_list").select("li.list_item"); Elements elements = doc.select("li.list_item");
for(Element element : elements) { for (Element element : elements) {
String url = element.select("a.figure-180101").attr("href"); Map<String, Object> map = new HashMap<>();
if(url != null && url.length() > 1) { String title = element.select("li").attr("data-widget-searchlist-tvname");
urlList.add(url); String time = element.select("em.result_info_desc").text().split(" ")[0];
String uurl = element.select("h3.result_title > a").attr("href");
map.put("time", TimeParse.stringFormartDate(time));
map.put("url", uurl);
map.put("title", title);
System.out.println(map.toString());
dataMap.add(map);
} }
}
return urlList;
} catch (Exception e) {
e.printStackTrace();
return urlList;
}
}
public Map<String,Object> getAiqiyiData(String url,Map<String,String> headerMap,Proxy proxy) {
Map<String,Object> dataMap = new HashMap<>();
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){
String result = response.body().string();
Document doc = Jsoup.parse(result);
String time = doc.select("#widget-vshort-ptime").text();
if(!time.contains("2017")) {
return null;
}
dataMap.put("time", time.split("发布时间:")[1]);
String source = doc.select("#widget-vshort-un-inner").attr("title");
dataMap.put("source", source);
String content = doc.select("#widget-vshort-lesswrap").text();
dataMap.put("content", content);
dataMap.put("url", url);
String title = doc.select("#widget-videotitle").attr("title");
String id = result.split(" tvId: ")[1].split(",")[0];
ZhiWeiTools.sleep(2000);
int count = getVideo_count(id,proxy);
dataMap.put("count", count);
dataMap.put("title", title);
System.out.println(dataMap.toString());
return dataMap; return dataMap;
} catch (Exception e) { } catch (Exception e) {
logger.error("解析出错 {}",e); logger.error("解析出错 {}",e);
return dataMap; return Collections.emptyList();
} }
} }
public int getVideo_count(String id,Proxy proxy) { // public String getSource(String url,ProxyHolder proxy) {
try { // Map<String,String> headerMap = HeadGet.getAiqiyiForCountHeaderMap(null);
String url = "http://cache.video.iqiyi.com/jp/pc/"+id+"/"; // System.out.println(url);
Map<String,String> headerMap = HeadGet.getAiqiyiForCountHeaderMap(null); // try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap); // String result = response.body().string();
String count = result.split(":")[1].split("\\}")[0]; //// System.out.println(result);
return Integer.valueOf(count); // return "aiqiyi";
} catch (Exception e) { // } catch (Exception e) {
return 0; // e.printStackTrace();
} // return "";
} // }
// }
......
package com.zhiwei.parse.shipin;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
/**
 * Keyword-search crawler for Tencent Video (v.qq.com).
 *
 * <p>Walks the search result pages for a keyword, opens every hit's detail page and
 * collects its title, publish time, uploader name, play count and URL.
 *
 * @ClassName QQTV
 * @Description Tencent Video crawling
 * @author byte-zbs
 * @Date 2019-01-30 15:01:47
 * @version 1.0.0
 */
public class QQTV {

    private static final Logger logger = LoggerFactory.getLogger(QQTV.class);

    /** Consecutive failures tolerated on one result page before paging is abandoned. */
    private static final int MAX_PAGE_FAILURES = 3;

    private static HttpBoot httpBoot = new HttpBoot();

    /**
     * Collects the videos found for {@code word} whose publish time is not before {@code time}.
     *
     * <p>Result pages are requested one after another (page number appended to the
     * {@code cur=} parameter). Paging stops at the first page that contributes no new
     * record, or after {@value #MAX_PAGE_FAILURES} consecutive failures on the same page.
     *
     * @param word  search keyword; it is URL-encoded as UTF-8 before being sent
     * @param time  lower bound for the publish time, compared as a plain string against the
     *              {@code video_checkup_time} value of each video; must not be {@code null}.
     *              NOTE(review): the comparison is lexicographic, so both sides presumably
     *              use the same sortable format (e.g. {@code yyyy-MM-dd HH:mm:ss}) - confirm.
     * @param proxy proxy used for the search result pages (detail pages always go through
     *              {@code ProxyFactory.getNatProxy()})
     * @return the collected records, possibly empty, never {@code null}
     */
    public static List<Map<String,Object>> getData(String word,String time,ProxyHolder proxy) {
        List<Map<String,Object>> dataList = new ArrayList<>();
        try {
            String url = "https://v.qq.com/x/search/?ses=qid%3DdKzxiFfC7NqpC6z2jq4m-KGeQjb_Th556Yrz24cQaZo1MUTw2PK4XA%26last_query%3D%E7%BE%8E%E5%9B%A2%26tabid_list%3D0%7C1%7C5%7C13%7C11%7C7%7C2%7C3%7C4%7C6%7C12%7C21%7C14%7C17%7C8%7C15%7C20%26tabname_list%3D%E5%85%A8%E9%83%A8%7C%E7%94%B5%E5%BD%B1%7C%E9%9F%B3%E4%B9%90%7C%E8%B4%A2%E7%BB%8F%7C%E6%96%B0%E9%97%BB%7C%E5%85%B6%E4%BB%96%7C%E7%94%B5%E8%A7%86%E5%89%A7%7C%E7%BB%BC%E8%89%BA%7C%E5%8A%A8%E6%BC%AB%7C%E7%BA%AA%E5%BD%95%E7%89%87%7C%E5%A8%B1%E4%B9%90%7C%E6%B1%BD%E8%BD%A6%7C%E4%BD%93%E8%82%B2%7C%E6%B8%B8%E6%88%8F%7C%E5%8E%9F%E5%88%9B%7C%E6%95%99%E8%82%B2%7C%E6%AF%8D%E5%A9%B4%26resolution_tabid_list%3D0%7C1%7C2%7C3%7C4%7C5%26resolution_tabname_list%3D%E5%85%A8%E9%83%A8%7C%E6%A0%87%E6%B8%85%7C%E9%AB%98%E6%B8%85%7C%E8%B6%85%E6%B8%85%7C%E8%93%9D%E5%85%89%7CVR&q="+URLEncoder.encode(word, "UTF-8")+"&stag=4&filter=sort%3D1%26pubfilter%3D0%26duration%3D0%26tabid%3D0%26resolution%3D0&cur=";
            int page = 1;
            int failures = 0;
            while(true) {
                int count = dataList.size();
                try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url+page), proxy)){
                    String result = response.body().string();
                    Document doc = Jsoup.parse(result);
                    Elements elements = doc.select("div._quickopen");
                    logger.info(" 关键词 {} 量 {} 页 数 {} 此页量 {} ",word,dataList.size(),page,elements.size());
                    for(Element element : elements) {
                        String nurl = element.select("h2.result_title").select("a").attr("href");
                        if(nurl.isEmpty()) {
                            // No link in this result card: nothing to fetch.
                            continue;
                        }
                        Map<String,Object> map = getUrlData(nurl, ProxyFactory.getNatProxy());
                        // getUrlData answers with an empty map when the detail page could not be
                        // read; such a placeholder must not end up in the result list.
                        if(Objects.nonNull(map) && !map.isEmpty()
                                && time.compareTo(String.valueOf(map.get("time"))) < 1) {
                            System.out.println(map.toString());
                            dataList.add(map);
                        }
                        ZhiWeiTools.sleep(50);
                    }
                    failures = 0;
                    page++;
                    if(count == dataList.size()) {
                        // The page added nothing: treat it as the end of the useful results.
                        break;
                    }
                } catch (Exception e) {
                    logger.error(" 数据采集出错 {} ",url+page,e);
                    failures++;
                    // The page number is not advanced on failure, so without this bound a
                    // persistent error would retry the same page forever.
                    if(failures >= MAX_PAGE_FAILURES) {
                        break;
                    }
                }
            }
        } catch (Exception e) {
            logger.error(" 数据采集出错 {} ",word,e);
        }
        return dataList;
    }

    /**
     * Downloads one video detail page and extracts its metadata.
     *
     * <p>The uploader name is cut out of the {@code user_name} span and the remaining
     * fields are read from the JSON assigned to {@code var VIDEO_INFO} in the page.
     * The request is attempted at most twice.
     *
     * @param url   detail page URL
     * @param proxy proxy used for the request
     * @return a map with the keys {@code playCount}, {@code title}, {@code time},
     *         {@code source} and {@code url}; an empty map when both attempts failed
     */
    private static Map<String,Object> getUrlData(String url,Proxy proxy) {
        for(int attempt = 1;attempt < 3;attempt++) {
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
                String result = response.body().string();
                String source = result.split("\\<span class=\"user_name\"\\>")[1].split("\\</span\\>")[0];
                result = result.split("var VIDEO_INFO =")[1].split("\\</script\\>")[0];
                JSONObject json = JSONObject.parseObject(result);
                Map<String,Object> map = new HashMap<>();
                map.put("playCount", json.getInteger("view_all_count"));
                map.put("title", json.getString("title"));
                map.put("time", json.getString("video_checkup_time"));
                map.put("source", source);
                map.put("url", url);
                return map;
            } catch (Exception e) {
                logger.error(" 数据采集出错 {} ",url,e);
            }
        }
        return Collections.emptyMap();
    }
}
package com.zhiwei.parse.shipin;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
/**
 * Keyword-search crawler for Sohu TV (so.tv.sohu.com).
 */
public class SohuTV {

    private static final Logger logger = LoggerFactory.getLogger(SohuTV.class);

    private static HttpBoot httpBoot = new HttpBoot();

    /**
     * Collects the search results for {@code word} from the Sohu TV search pages.
     *
     * <p>Pages are requested one after another (page number appended to the {@code p=}
     * parameter) with a two second pause in between. Paging stops at the first page that
     * contributes no record, including a page whose request or parsing failed.
     *
     * @param word   search keyword; it is URL-encoded as UTF-8 before being sent
     * @param cookie value sent as the {@code cookie} request header
     * @param proxy  proxy used for the requests
     * @return one map per result with the keys {@code title}, {@code source}, {@code time},
     *         {@code url} and {@code playCount}; possibly empty, never {@code null}
     */
    public static List<Map<String,Object>> sohuTVData(String word,String cookie,Proxy proxy) {
        List<Map<String,Object>> dataList = new ArrayList<>();
        try {
            String url = "https://so.tv.sohu.com/mts?wd="+URLEncoder.encode(word, "utf-8")+"&c=0&v=0&length=0&limit=0&site=0&o=3&st=0&suged=&p=";
            int page = 1;
            Map<String,Object> headers = new HashMap<>();
            headers.put("cookie", cookie);
            while(true) {
                int count = dataList.size();
                try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url+page,headers), proxy)){
                    String result = response.body().string();
                    Document document = Jsoup.parse(result);
                    Elements elements = document.select("body > div.wrap.cfix > div.ssList.area").select("li");
                    for(Element element : elements){
                        String nurl = element.select("strong.lt-title").select("a").attr("href");
                        if(nurl.isEmpty()) {
                            // A list item without a title link is not a video hit; storing it
                            // would add a junk row and keep the paging loop alive.
                            continue;
                        }
                        Map<String,Object> map = new HashMap<>();
                        String title = element.select("strong.lt-title").select("a").text();
                        String userName = element.select(" p > a.name").text();
                        String time = element.select(" p > a.tcount").text();
                        String amountOfPlay = element.select("div > a > span.acount").text();
                        map.put("title",title);
                        map.put("source",userName);
                        map.put("time",TimeParse.stringFormartDate(time));
                        map.put("url",toAbsoluteUrl(nurl));
                        map.put("playCount",amountOfPlay);
                        dataList.add(map);
                    }
                } catch (Exception e) {
                    logger.error("获取数据出错",e);
                }
                logger.info(" 采集关键词 {} 采集到底 {} 页 ,采集到 {} 条 ",word,page,dataList.size());
                page++;
                ZhiWeiTools.sleep(2000);
                if(count == dataList.size()) {
                    break;
                }
            }
        } catch (Exception e) {
            logger.error("采集出错 {}",word,e);
        }
        return dataList;
    }

    /**
     * Turns the {@code href} of a search hit into an absolute https URL.
     *
     * <p>Links that already carry a scheme are returned unchanged, protocol-relative links
     * ({@code //host/path}) only get the scheme prepended, and anything else is prefixed
     * with {@code https://}.
     */
    private static String toAbsoluteUrl(String href) {
        if(href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        if(href.startsWith("//")) {
            return "https:" + href;
        }
        return "https://" + href;
    }

    /**
     * Manual smoke run for the keyword "美团".
     *
     * <p>NOTE(review): the cookie below is a hard-coded browser session value; consider
     * passing it in from outside the source tree.
     */
    public static void main(String[] args) {
        String cookie = "SUV=1901101134139015; IPLOC=CN3301; gidinf=x099980109ee0f08567b42835000336ade2ef3762611; fuid=15474616189304048886; newpuid=15474616191372936893; beans_mz_userid=UBThg01XRPg8; pmai=dad35c1c318bdd22; ifoxinstalled=false; beans_freq=1; beans_dmp=%7B%22admaster%22%3A1547461620%2C%22shunfei%22%3A1547461620%2C%22reachmax%22%3A1548816807%2C%22lingji%22%3A1547461620%2C%22yoyi%22%3A1547461620%2C%22ipinyou%22%3A1547461620%2C%22ipinyou_admaster%22%3A1547461620%2C%22miaozhen%22%3A1548816807%2C%22diantong%22%3A1547461620%2C%22huayang%22%3A1547461620%7D; beans_dmp_done=1; reqtype=pc; sokey=%5B%7B%22key%22%3A%22%E7%BE%8E%E5%9B%A2%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%20%E4%BA%8C%E5%99%81%E8%8B%B1%22%7D%5D; t=1548817812321";
        SohuTV.sohuTVData("美团",cookie, null);
    }
}
//package com.zhiwei.crawler; package com.zhiwei.crawler;
//
//import java.util.ArrayList; import java.util.ArrayList;
//import java.util.List; import java.util.List;
//import java.util.Map; import java.util.Map;
//
//import org.junit.Test; import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil; import com.zhiwei.common.config.GroupType;
//import com.zhiwei.parse.Aiqiyi; import com.zhiwei.crawler.proxy.ProxyFactory;
// import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//public class AiqiyiByWordExample { import com.zhiwei.parse.Aiqiyi;
// import com.zhiwei.util.WordReadFile;
//
// @Test public class AiqiyiByWordExample {
// public void aiqiyiByWordTest() {
// String word = "美食,味道,菜";
// String[] words = word.split(","); @Test
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); public void aiqiyiByWordTest() {
// for(String w : words) { ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// List<Map<String,Object>> dataList = Aiqiyi.getAiqiyiByWordData(w,null); List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
// if(dataList != null && dataList.size() >= 1) { List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// bodyList.addAll(dataList); for(String w : wordList) {
// } List<Map<String,Object>> dataList = Aiqiyi.getAiqiyiByWordData(w,null);
// } if(dataList != null && dataList.size() >= 1) {
// List<String> headList = new ArrayList<String>(); bodyList.addAll(dataList);
// headList.add("count"); }
// headList.add("time"); }
// headList.add("source"); List<String> headList = new ArrayList<String>();
// headList.add("content"); headList.add("count");
// headList.add("url"); headList.add("time");
// headList.add("title"); headList.add("source");
// PoiExcelUtil poi = PoiExcelUtil.getInstance(); headList.add("content");
// poi.exportExcel("D://crawlerdata/爱奇艺关键词采集.xlsx", "数据", headList, bodyList); headList.add("url");
// headList.add("title");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata/爱奇艺关键词采集.xlsx", "数据", headList, bodyList);
// }
//
//
// }
//}
}
...@@ -11,8 +11,8 @@ public class MaimaiBywordExample { ...@@ -11,8 +11,8 @@ public class MaimaiBywordExample {
public static void main(String[] args) { public static void main(String[] args) {
String word = "美团|某团|MT|大众点评|新美大|美团点评"; String word = "美团|某团|MT|大众点评|新美大|美团点评";
String cookie = "guid=GxsfBBgZGwQYGx4EGBkeVhsfGB4aHBpWHBkEHRkfBUNYS0xLeQoSEwQSHR8ZBBoEGx0FT0dFWEJpCgNFQUlPbQpPQUNGCgZmZ35iYQIKHBkEHRkfBV5DYUhPfU9GWlprCgMeHH1lfQoRGQQcCn5kClldRU5EQ30CChoEHwVLRkZDUEVn; token=\"7IGuqjEwgJ2gXX5PZ0UYSxvn81Aws6v5OFrwpSErsbctlSd1e/7+AzYEMMMeeFJJ8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; _buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiOGtDSnF6VG5QcFk0R3ZmVFB4MThIMW1ZIiwiX2V4cGlyZSI6MTU0ODMwODU0MTMyNCwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=cnQ0i1LwYxhjO3_BvQ4Coh0f9PQ"; String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=3oatshv55and4kwcz9gdpie7qdpj27yt; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHxwdGRMcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1548984672861; token=\"Ap1u6QzIdn8FCrohEAEPI86n9mNSKk1qJWlauQ8KeSbn7fDKTu6bN2Yv6B9V19nO8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoibVVlSlRTUW1NdVdUTUUtRjV0SjBZbExtIiwibWlkNDU2ODc2MCI6ZmFsc2UsInN0YXR1cyI6dHJ1ZSwiX2V4cGlyZSI6MTU0OTA3MTEzOTA2NywiX21heEFnZSI6ODY0MDAwMDB9; session.sig=UOz44C2rF-uJFxFvSwHyII5aJxM";
String time = "2019-01-17 00:00:00"; String time = "2019-01-24 00:00:00";
String[] words = word.split("\\|"); String[] words = word.split("\\|");
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for(String w : words) { for(String w : words) {
...@@ -32,7 +32,7 @@ public class MaimaiBywordExample { ...@@ -32,7 +32,7 @@ public class MaimaiBywordExample {
headList.add("comment_count"); headList.add("comment_count");
headList.add("spreads"); headList.add("spreads");
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团-0123.xlsx", "脉脉关键词", headList, bodyList); poi.exportExcel("D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团-0201.xlsx", "脉脉关键词", headList, bodyList);
} }
} }
package com.zhiwei.keyword; //package com.zhiwei.keyword;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.testng.annotations.Test; //import org.testng.annotations.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Gftai; //import com.zhiwei.parse.Gftai;
//
public class GftaiTest { //public class GftaiTest {
@Test // @Test
public void f() { // public void f() {
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
String words = "美团 催收|美团 借款|美团 还钱|三快 借钱|三快 生活费|三快 借款|美团 征信"; // String words = "花呗|借呗|京东白条|京东金条|京东金融";
String[] ws = words.split("\\|"); // String[] ws = words.split("\\|");
List<Map<String,Object>> bodyList = new ArrayList<>(); // List<Map<String,Object>> bodyList = new ArrayList<>();
for(String word : ws) { // for(String word : ws) {
List<Map<String,Object>> list = Gftai.getData(word, null); // List<Map<String,Object>> list = Gftai.getData(word, null);
bodyList.addAll(list); // bodyList.addAll(list);
System.out.println(word + " --------- " + bodyList.size()); // System.out.println(word + " --------- " + bodyList.size());
} // }
List<String> headList = new ArrayList<>(); // List<String> headList = new ArrayList<>();
headList.add("title"); // headList.add("title");
headList.add("time"); // headList.add("time");
headList.add("content"); // headList.add("content");
headList.add("source"); // headList.add("source");
headList.add("url"); // headList.add("url");
//
poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\国富泰信用-美团-2.xlsx", "数据", headList, bodyList); // poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\国富泰信用-美团-3.xlsx", "数据", headList, bodyList);
} // }
} //}
package com.zhiwei.keyword; //package com.zhiwei.keyword;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.testng.annotations.Test; //import org.testng.annotations.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Gftai; //import com.zhiwei.parse.Gftai;
import com.zhiwei.parse.KuaiTousu; //import com.zhiwei.parse.KuaiTousu;
//
public class KuaiTousuTest { //public class KuaiTousuTest {
@Test // @Test
public void f() { // public void f() {
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
String words = "美团 催收|美团 借款|美团 还钱|三快 借钱|三快 生活费|三快 借款|美团 征信"; // String words = "花呗|借呗|京东白条|京东金条|京东金融";
String[] ws = words.split("\\|"); // String[] ws = words.split("\\|");
List<Map<String,Object>> bodyList = new ArrayList<>(); // List<Map<String,Object>> bodyList = new ArrayList<>();
for(String word : ws) { // for(String word : ws) {
List<Map<String,Object>> list = KuaiTousu.getData(word, null); // List<Map<String,Object>> list = KuaiTousu.getData(word, null);
bodyList.addAll(list); // bodyList.addAll(list);
System.out.println(word + " --------- " + bodyList.size()); // System.out.println(word + " --------- " + bodyList.size());
} // }
List<String> headList = new ArrayList<>(); // List<String> headList = new ArrayList<>();
headList.add("title"); // headList.add("title");
headList.add("time"); // headList.add("time");
headList.add("content"); // headList.add("content");
headList.add("source"); // headList.add("source");
headList.add("url"); // headList.add("url");
//
poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\新浪广东快投诉-美团-2.xlsx", "数据", headList, bodyList); // poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\新浪广东快投诉-美团-3.xlsx", "数据", headList, bodyList);
//
//
//
//
} // }
} //}
package com.zhiwei.keyword; //package com.zhiwei.keyword;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.testng.annotations.Test; //import org.testng.annotations.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.KuaiTousu; //import com.zhiwei.parse.KuaiTousu;
import com.zhiwei.parse.SinaTousu; //import com.zhiwei.parse.SinaTousu;
//
public class SinaTousuTest { //public class SinaTousuTest {
//
@Test // @Test
public void getSinaTousuData() { // public void getSinaTousuData() {
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
String words = "美团 催收|美团 借款|美团 还钱|三快 借钱|三快 生活费|三快 借款|美团 征信"; // String words = "花呗|借呗|京东白条|京东金条|京东金融";
String[] ws = words.split("\\|"); // String[] ws = words.split("\\|");
List<Map<String,Object>> bodyList = new ArrayList<>(); // List<Map<String,Object>> bodyList = new ArrayList<>();
for(String word : ws) { // for(String word : ws) {
List<Map<String,Object>> list = SinaTousu.getSinaTousuData(word, null, "2018-07-01 00:00:00"); // List<Map<String,Object>> list = SinaTousu.getSinaTousuData(word, null, "2018-07-01 00:00:00");
bodyList.addAll(list); // bodyList.addAll(list);
System.out.println(word + " --------- " + bodyList.size()); // System.out.println(word + " --------- " + bodyList.size());
} // }
List<String> headList = new ArrayList<>(); // List<String> headList = new ArrayList<>();
headList.add("title"); // headList.add("title");
headList.add("time"); // headList.add("time");
headList.add("content"); // headList.add("content");
headList.add("source"); // headList.add("source");
headList.add("url"); // headList.add("url");
//
poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\黑猫投诉-美团-2.xlsx", "数据", headList, bodyList); // poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\黑猫投诉-美团-3.xlsx", "数据", headList, bodyList);
//
//
//
} // }
} //}
//package com.zhiwei.keyword; package com.zhiwei.keyword;
//
//import java.util.ArrayList; import java.util.ArrayList;
//import java.util.List; import java.util.List;
//import java.util.Map; import java.util.Map;
//
//import org.testng.annotations.Test; import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType; import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory; import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil; import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Youku; import com.zhiwei.parse.Youku;
//
//public class YoukuKeyWordTest { public class YoukuKeyWordTest {
// @Test @Test
// public void f() { public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER); GroupType.PROVIDER);
// String word = "帮宝适 二噁英," + String word = "帮宝适 二噁英," +
// "帮宝适 二恶英," + "帮宝适 二恶英," +
// "帮宝适 有毒," + "帮宝适 甲醛," +
// "帮宝适 剧毒," + "帮宝适 荧光," +
// "帮宝适 致癌," + "帮宝适 致癌," +
// "宝洁 二噁英," + "帮宝适 有毒," +
// "宝洁 二恶英," + "帮宝适 超标," +
// "宝洁 有毒," + "帮宝适 防腐剂," +
// "宝洁 剧毒," + "帮宝适 起诉," +
// "宝洁 致癌," + "帮宝适 伤害," +
// "纸尿裤 二噁英," + "帮宝适 气味," +
// "纸尿裤 二恶英," + "帮宝适 异味," +
// "纸尿裤 有毒," + "帮宝适 起坨," +
// "纸尿裤 剧毒," + "帮宝适 异物," +
// "纸尿裤 致癌"; "帮宝适 漏," +
// List<Map<String,Object>> bodyList = new ArrayList<>(); "帮宝适 刺鼻," +
// String[] words = word.split(","); "帮宝适 勒," +
// for(String w : words) { "帮宝适 脱皮," +
// System.out.println(w); "帮宝适 划伤," +
// bodyList.addAll(Youku.getDataList(w)); "绿帮 二噁英," +
// } "绿帮 二恶英," +
// List<String> headList = new ArrayList<>(); "绿帮 甲醛," +
// headList.add("title"); "绿帮 荧光," +
// headList.add("time"); "绿帮 致癌," +
// headList.add("url"); "绿帮 有毒," +
// headList.add("uper"); "绿帮 超标," +
// PoiExcelUtil poi = PoiExcelUtil.getInstance(); "绿帮 起诉," +
// poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\优酷数据-txh-0121.xlsx", "数据", headList, bodyList); "绿帮 气味," +
// "绿帮 异味," +
// } "绿帮 异物," +
//} "绿帮 漏," +
"绿帮 刺鼻," +
"绿帮 勒," +
"绿帮 脱皮";
List<Map<String,Object>> bodyList = new ArrayList<>();
String[] words = word.split(",");
for(String w : words) {
System.out.println(w);
bodyList.addAll(Youku.getDataList(w));
}
List<String> headList = new ArrayList<>();
headList.add("title");
headList.add("time");
headList.add("url");
headList.add("uper");
PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\优酷数据-txh-0121.xlsx", "数据", headList, bodyList);
}
}
package com.zhiwei.shipin;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.BiliBili;
import com.zhiwei.util.WordReadFile;
/**
 * Manual export run: crawls Bilibili for every keyword in the keyword file and writes
 * the combined results to an Excel workbook.
 */
public class BilibiliTest {

    /**
     * Reads the keywords, queries {@code BiliBili.getData} once per keyword with a fixed
     * cookie and no proxy, then exports all collected rows.
     */
    @Test
    public void f() {
        List<String> keywords = WordReadFile.getWords("D://crawlerdata//关键词.txt");
        String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274";
        List<Map<String, Object>> rows = new ArrayList<>();
        for (String keyword : keywords) {
            List<Map<String, Object>> hits = BiliBili.getData(keyword, null, cookie);
            if (hits == null) {
                // Nothing usable came back for this keyword.
                continue;
            }
            System.out.println(keyword + " ----- " + hits.size());
            rows.addAll(hits);
        }
        List<String> columns = columnNames();
        PoiExcelUtil exporter = PoiExcelUtil.getInstance();
        exporter.exportExcel("D://crawlerdata//bilibili关键词采集数据-txh-0130.xlsx", "B站数据", columns, rows);
    }

    /** Column keys of the exported sheet, in output order. */
    private static List<String> columnNames() {
        List<String> columns = new ArrayList<>();
        for (String key : new String[] {"submitcount", "playcount", "time", "source", "title", "url"}) {
            columns.add(key);
        }
        return columns;
    }
}
package com.zhiwei.shipin;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.shipin.QQTV;
import com.zhiwei.parse.shipin.SohuTV;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.util.WordReadFile;
/**
 * Manual export run: crawls Tencent Video for every keyword in the keyword file and
 * writes the combined results to an Excel workbook.
 */
public class QQTVTest {

    /**
     * Initialises the proxy factory, queries {@code QQTV.getData} once per keyword with a
     * fixed lower time bound, pausing one second between keywords, then exports all rows.
     */
    @Test
    public void f() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
        String time = "2018-07-01 00:00:00";
        List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
        List<Map<String, Object>> bodyList = new ArrayList<>();
        for (String word : wordList) {
            List<Map<String, Object>> dataList = QQTV.getData(word,time, ProxyHolder.NAT_PROXY);
            if (dataList != null) {
                System.out.println(word + " ----- " + dataList.size());
                bodyList.addAll(dataList);
            }
            ZhiWeiTools.sleep(1000);
        }
        List<String> headlist = new ArrayList<>();
        headlist.add("playCount");
        headlist.add("time");
        headlist.add("source");
        headlist.add("title");
        headlist.add("url");
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        // The sheet used to be labelled "B站数据" (Bilibili data), copied from the Bilibili
        // export; this workbook holds Tencent Video rows.
        poi.exportExcel("D://crawlerdata//腾讯视频关键词采集数据-txh-0130.xlsx", "腾讯视频数据", headlist, bodyList);
    }
}
package com.zhiwei.shipin;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.BiliBili;
import com.zhiwei.parse.shipin.SohuTV;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.util.WordReadFile;
/**
 * Manual export run: crawls Sohu TV for every keyword in the keyword file and writes
 * the combined results to an Excel workbook.
 */
public class SohuTVTest {

    /**
     * Queries {@code SohuTV.sohuTVData} once per keyword with a fixed cookie and no proxy,
     * pausing one second between keywords, then exports all collected rows.
     */
    @Test
    public void f() {
        List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
        List<Map<String, Object>> bodyList = new ArrayList<>();
        String cookie = "SUV=1901101134139015; IPLOC=CN3301; gidinf=x099980109ee0f08567b42835000336ade2ef3762611; fuid=15474616189304048886; newpuid=15474616191372936893; beans_mz_userid=UBThg01XRPg8; pmai=dad35c1c318bdd22; ifoxinstalled=false; beans_freq=1; beans_dmp=%7B%22admaster%22%3A1547461620%2C%22shunfei%22%3A1547461620%2C%22reachmax%22%3A1548816807%2C%22lingji%22%3A1547461620%2C%22yoyi%22%3A1547461620%2C%22ipinyou%22%3A1547461620%2C%22ipinyou_admaster%22%3A1547461620%2C%22miaozhen%22%3A1548816807%2C%22diantong%22%3A1547461620%2C%22huayang%22%3A1547461620%7D; beans_dmp_done=1; reqtype=pc; sokey=%5B%7B%22key%22%3A%22%E7%BE%8E%E5%9B%A2%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%20%E4%BA%8C%E5%99%81%E8%8B%B1%22%7D%5D; t=1548817812321";
        for (String word : wordList) {
            List<Map<String, Object>> dataList = SohuTV.sohuTVData(word, cookie, null);
            if (dataList != null) {
                System.out.println(word + " ----- " + dataList.size());
                bodyList.addAll(dataList);
            }
            ZhiWeiTools.sleep(1000);
        }
        List<String> headlist = new ArrayList<>();
        headlist.add("playCount");
        headlist.add("time");
        headlist.add("source");
        headlist.add("title");
        headlist.add("url");
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        // The sheet used to be labelled "B站数据" (Bilibili data), copied from the Bilibili
        // export; this workbook holds Sohu TV rows.
        poi.exportExcel("D://crawlerdata//搜狐视频关键词采集数据-txh-0130.xlsx", "搜狐视频数据", headlist, bodyList);
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment