Commit 3e350f8b by yangchen

修改部分代理使用方式 并升级版本

parent 096c4f21
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>articlenewscrawler</artifactId>
<version>0.1.6-SNAPSHOT</version>
<version>0.1.7-SNAPSHOT</version>
<name>articlenewscrawler</name>
<description>采集凤凰,一点资讯,搜狐历史文章和文章评论</description>
......
......@@ -14,6 +14,7 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
......@@ -21,8 +22,6 @@ import com.zhiwei.parse.analysis.BaijiaAccountAnalysis;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
public class Baijia {
private static Logger logger = LoggerFactory.getLogger(Baijia.class);
private static BaijiaAccountAnalysis baijiaAccountAnalysis = new BaijiaAccountAnalysis();
......@@ -70,17 +69,18 @@ public class Baijia {
/**
*
* @Description 获取百家号第三种方法
* @param app_id
* @param appId
* @param startTime
* @param proxy
* @return
*/
@SuppressWarnings("unchecked")
public static List<Map<String,Object>> getBaijiaAccountByBaiduData(String app_id,String name,String startTime,String cookie,Proxy proxy) {
public static List<Map<String,Object>> getBaijiaAccountByBaiduData(String appId, String name, String startTime,
String cookie, ProxyHolder proxy) {
Map<String,String> headerMap = HeaderTool.getCommonHead();
List<Map<String,Object>> dataList = new ArrayList<>();
headerMap.put("cookie",cookie);
String uk = getUkData(app_id,proxy,cookie);
String uk = getUkData(appId,proxy,cookie);
if(Objects.isNull(uk)) {
return Collections.emptyList();
}
......@@ -90,12 +90,11 @@ public class Baijia {
for(int i = 1;i < 3;i++) {
try {
String url = "https://author.baidu.com/list?type=article&tab=2&uk="+uk+"&ctime="+ctime+"&num=50";
Request request = RequestUtils.wrapGet(url, headerMap);
String result = httpBoot.syncCall(request, proxy).body().string();
String result = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
Map<String,Object> dMap = baijiaAccountAnalysis.getBaijiaAccountData3(result,name, startTime);
List<Map<String,Object>> dList = (List<Map<String, Object>>) dMap.get("data");
dataList.addAll(dList);
logger.info("{} 数据采集结果 {}",app_id, dataList.size());
logger.info("{} 数据采集结果 {}",appId, dataList.size());
if(!(boolean) dMap.get("more")) {
f = false;
}
......@@ -111,9 +110,9 @@ public class Baijia {
return dataList;
}
private static String getUkData(String app_id,Proxy proxy,String cookie) {
private static String getUkData(String appId,ProxyHolder proxy,String cookie) {
String url = "https://author.baidu.com/profile?context={\"from\":0,\"app_id\":\""
+app_id+"\"}&cmdType=&pagelets[]=root&reqID=0&ispeed=1#";
+appId+"\"}&cmdType=&pagelets[]=root&reqID=0&ispeed=1#";
Map<String,Object> headers = new HashMap<>();
headers.put("Host", "author.baidu.com");
headers.put("cookie", cookie);
......
......@@ -2,7 +2,6 @@ package com.zhiwei.parse;
import static java.util.Objects.nonNull;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
......@@ -16,6 +15,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.parse.analysis.DoubanCommentAnalysis;
import com.zhiwei.tools.tools.URLCodeUtil;
......@@ -36,7 +36,7 @@ public class Douban {
* @param stime
* @return
*/
public static List<Map<String,Object>> doubanTopicGetByWord(String word,Proxy proxy,String cookie,String stime) {
public static List<Map<String,Object>> doubanTopicGetByWord(String word,ProxyHolder proxy,String cookie,String stime) {
int page = 0;
int count = 20;
boolean more = true;
......@@ -105,7 +105,7 @@ public class Douban {
* @param cookie
* @return
*/
public static List<Map<String,Object>> getDoubanComment(String url,Proxy proxy,String cookie) {
public static List<Map<String,Object>> getDoubanComment(String url,ProxyHolder proxy,String cookie) {
if(url.contains("#")) {
url = url.split("#")[0];
}
......
......@@ -44,7 +44,7 @@ public class Maimai {
* @return
*/
@SuppressWarnings("unchecked")
public static List<Map<String,Object>> getData(String key,String cookie,String time,Proxy proxy) {
public static List<Map<String,Object>> getData(String key,String cookie,String time,ProxyHolder proxy) {
Map<String,String> headerMap = HeadGet.getMaimaiKeywordHeaderMap(cookie);
List<Map<String,Object>> dataList = new ArrayList<>();
boolean f = true;
......@@ -82,7 +82,7 @@ public class Maimai {
* @return
*/
@SuppressWarnings("unchecked")
public static List<Map<String,Object>> getDataByNoName(String key,String cookie,String time,Proxy proxy) {
public static List<Map<String,Object>> getDataByNoName(String key,String cookie,String time,ProxyHolder proxy) {
Map<String,String> headerMap = HeadGet.getMaimaiKeywordHeaderMap(cookie);
List<Map<String,Object>> dataList = new ArrayList<>();
boolean f = true;
......@@ -94,7 +94,7 @@ public class Maimai {
Map<String,Object> map = maimaiBywordAnalysis.getDataByNoName(result, time,key);
f = (boolean) map.get("hasMore");
List<Map<String,Object>> daList = (List<Map<String, Object>>) map.get("data");
if(daList != null && daList.size() > 0) {
if(daList != null && !daList.isEmpty()) {
dataList.addAll(daList);
url = "https://maimai.cn/search/gossips?query="+URLEncoder.encode(key, "utf-8")+"&limit=20&offset="+i+"highlight=true&sortby=time&jsononly=1";
i+=20;
......
......@@ -12,7 +12,6 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.bean.QQkbUser;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
......@@ -32,7 +31,7 @@ public class QQKB {
* @param cookie
* @return
*/
public static List<Map<String,Object>> getQQAccountData(String child,String cookie,Proxy proxy) {
public static List<Map<String,Object>> getQQAccountData(String child,String cookie,ProxyHolder proxy) {
String url = "http://r.cnews.qq.com/getSubNewsIndex";
Map<String,String> headerMap = HeadGet.getQQAccountHeaderMap(cookie);
Map<String,Object> paramMap = HeadGet.getQQAccountOneParamMap(child);
......
package com.zhiwei.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
......@@ -31,7 +30,7 @@ public class TXNews {
public static boolean txNewshasMoreData = true;
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public static List<Map<String,Object>> getData(String word,String devid,Proxy proxy) {
public static List<Map<String,Object>> getData(String word,String devid,ProxyHolder proxy) {
List<Map<String,Object>> dataList = new ArrayList<>();
Map<String,String> headerMap = HeadGet.getTxNewspage1HeaderMap(null);
Map<String,Object> paramMap = HeadGet.getTxNewspage1ParamMap(word);
......
......@@ -36,7 +36,7 @@ public class Wangyi {
* @param id
* @return
*/
public static List<Map<String,Object>> getWangyiCommentData(String id,Proxy proxy) {
public static List<Map<String,Object>> getWangyiCommentData(String id,ProxyHolder proxy) {
Map<String,String> headerMap = HeadGet.getWangyiCommentHeaderMap(null);
List<Map<String,Object>> dataList = new ArrayList<>();
int i = 0;
......
package com.zhiwei.parse.analysis;
import static java.util.Objects.isNull;
import static java.util.Objects.nonNull;
import java.util.ArrayList;
......
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Dayu;
public class DayuAccountExample {

    /**
     * Manual example: loads rows from an Excel workbook, fetches Dayu account data for
     * each row through {@code Dayu.getDayuAccountData} and writes the result back into
     * the same workbook via {@code PoiExcelUtil.exportExcel}.
     *
     * <p>Requires the ZooKeeper endpoint and the local Windows path hard-coded below.
     * Reference endpoint noted by the original author:
     * https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
     */
    @Test
    public void dayuAccountTest() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

        // NOTE(review): declared but never passed to the crawler call below.
        String startTime = "2017-01-01 00:00:00";
        String workbookPath = "D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx";

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        Map<String,Object> imported = excel.importExcel(workbookPath, 0);
        // The "body" entry is treated as the list of row maps.
        @SuppressWarnings("unchecked")
        List<Map<String,Object>> rows = (List<Map<String, Object>>) imported.get("body");

        // Exported columns; content_id, origin_id and xss_item_id were disabled by the author.
        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "content", "source", "url"}) {
            columns.add(column);
        }

        for (Map<String,Object> row : rows) {
            String mid = String.valueOf(row.get("mid"));
            String name = String.valueOf(row.get("name"));
            // NOTE(review): the values read from the row are immediately replaced by a
            // fixed account, so every row triggers the same request - confirm intent.
            mid = "7b345070c4124574b9cbcab8c4a1aeb8";
            name = "国魂";
            if (mid.isEmpty() && name.isEmpty()) {
                continue;
            }
            List<Map<String,Object>> articles = Dayu.getDayuAccountData(mid, name, null, null);
            excel.exportExcel(workbookPath, name, columns, articles);
        }
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Dayu;
//
//public class DayuAccountExample {
//
//
// @Test
// public void dayuAccountTest() {
// //https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
//
//// String mid = "0a8b2360fd8b4ded971cd324a56d32f0";
//// String name = "大鱼海棠雨";
// String startTime = "2017-01-01 00:00:00";
// String path = "D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> lists = (List<Map<String, Object>>) map.get("body");
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
//// headList.add("content_id");
//// headList.add("origin_id");
//// headList.add("xss_item_id");
// for(Map<String,Object> data : lists) {
// String mid = data.get("mid")+"";
// String name = data.get("name")+"";
// mid = "7b345070c4124574b9cbcab8c4a1aeb8";
// name = "国魂";
// if(mid.length() < 1 && name.length() < 1) {
// continue;
// }
// List<Map<String,Object>> dataList = Dayu.getDayuAccountData(mid,name,null,null);
// poi.exportExcel(path, name, headList, dataList);
// }
//
//
// }
//
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Yidianzixun;
import com.zhiwei.util.WordReadFile;
public class YidainzixunByWordExample {

    /**
     * Manual example: reads keywords from a local text file, queries
     * {@code Yidianzixun.getYidianzixunDataByWord} for each of them (second argument
     * left {@code null}) and exports the merged rows to an Excel workbook.
     *
     * <p>Progress is printed to standard output: the size of each per-keyword result,
     * the running total after each keyword, and the final total before export.
     */
    @Test
    public void yidianzixunByWordTest() {
        List<String> keywords = WordReadFile.getWords("D://crawlerdata/关键词.txt");
        List<Map<String,Object>> merged = new ArrayList<Map<String,Object>>();

        for (String keyword : keywords) {
            List<Map<String,Object>> hits = Yidianzixun.getYidianzixunDataByWord(keyword, null);
            System.out.println(hits.size());
            merged.addAll(hits);
            System.out.println(merged.size());
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();

        // Column keys handed to the exporter, in output order.
        String[] columnKeys = {
            "docid", "title", "comment_count", "summary",
            "source", "wm_copyright", "time", "url"
        };
        List<String> columns = new ArrayList<String>();
        for (String key : columnKeys) {
            columns.add(key);
        }

        System.out.println(merged.size());
        excel.exportExcel("D://crawlerdata/一点资讯-软博会.xlsx", "asd", columns, merged);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Yidianzixun;
//import com.zhiwei.util.WordReadFile;
//
//public class YidainzixunByWordExample {
//
// @Test
// public void yidianzixunByWordTest() {
//
// List<String> wordList = WordReadFile.getWords("D://crawlerdata/关键词.txt");
// List<Map<String,Object>> listAll = new ArrayList<Map<String,Object>>();
// for(String word : wordList) {
// List<Map<String,Object>> dataList = Yidianzixun.getYidianzixunDataByWord(word,null);
// System.out.println(dataList.size());
// listAll.addAll(dataList);
// System.out.println(listAll.size());
// }
//
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<String> headList = new ArrayList<String>();
// headList.add("docid");
// headList.add("title");
// headList.add("comment_count");
// headList.add("summary");
// headList.add("source");
// headList.add("wm_copyright");
// headList.add("time");
// headList.add("url");
// System.out.println(listAll.size());
// poi.exportExcel("D://crawlerdata/一点资讯-软博会.xlsx", "asd", headList, listAll);
//
//
// }
//
//
//}
package com.zhiwei.hsitory;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Yidianzixun;
public class YidianzixunAccountExample {

    /**
     * Manual example: fetches the articles of one Yidianzixun channel through
     * {@code Yidianzixun.getYidianzixunAccountData} using
     * {@code ProxyHolder.NAT_HEAVY_PROXY} and exports them to an Excel workbook.
     *
     * <p>Requires the ZooKeeper endpoint and the local output path hard-coded below.
     */
    @Test
    public void yidianzixunAccountTest() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

        final String channelId = "m190159";
        final String since = "2007-01-01 00:00:00";
        List<Map<String,Object>> articles =
                Yidianzixun.getYidianzixunAccountData(channelId, since, ProxyHolder.NAT_HEAVY_PROXY, null);

        PoiExcelUtil excel = PoiExcelUtil.getInstance();

        // Column keys handed to the exporter, in output order.
        List<String> columns = new ArrayList<String>();
        for (String key : new String[] {
                "title", "time", "comment_count", "ctype", "source", "url", "summary"}) {
            columns.add(key);
        }

        excel.exportExcel(
                "D://crawlerdata//历史文章采集/一点资讯-新华社中国新三板.xlsx",
                "新华社中国新三板",
                columns,
                articles);
    }
}
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Yidianzixun;
//
//
//public class YidianzixunAccountExample {
//
// @Test
// public void yidianzixunAccountTest() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
// String channelid = "m190159";
// String startTime = "2007-01-01 00:00:00";
// List<Map<String,Object>> dataList = Yidianzixun.getYidianzixunAccountData(channelid, startTime,ProxyHolder.NAT_HEAVY_PROXY,null);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("comment_count");
// headList.add("ctype");
// headList.add("source");
// headList.add("url");
// headList.add("summary");
// poi.exportExcel("D://crawlerdata//历史文章采集/一点资讯-新华社中国新三板.xlsx", "新华社中国新三板", headList, dataList);
// }
//
//
//}
package com.zhiwei.keyword;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Xueqiu;
public class XueqiuKeyWord {

    /**
     * Manual example: splits a {@code |}-separated keyword list, calls
     * {@code Xueqiu.getData} for every keyword (third argument left {@code null}) and
     * exports all collected rows into one Excel workbook.
     *
     * <p>NOTE(review): a logged-in session cookie is embedded in the source below;
     * consider loading it from local configuration instead of committing it.
     */
    @Test
    public void f() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);

        String keywordSpec = "软博会|软件博览会";
        String endTime = "2018-01-01 00:00:00";
        String cookie = "aliyungf_tc=AQAAADi3G3I7GAEAgtgMPF/W3FeFauXk; device_id=3221abfd50f9f8be2abca9182c338c9e; Hm_lvt_1db88642e346389874251b5a1eded6e3=1561962809; s=d911zeymzo; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=b2a8d68c56272cca3b71e1bcaf04b744b16e9497; xq_a_token.sig=hj7OrhYid1GuAaF-AmLYrPuyFDk; xqat=b2a8d68c56272cca3b71e1bcaf04b744b16e9497; xqat.sig=gker8dcZJ2Ez4s58eIGPa5zCd0w; xq_r_token=b0ad7b059411acac8c9e2f0a40336804bb60d047; xq_r_token.sig=DwNSLIigE8xtwQKLygagcM28Dd0; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=1273068356; u.sig=UyqQW-br8hcsOyT6anFmS_SFpCs; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1561963129";

        // The pipe is escaped because String.split takes a regular expression.
        String[] keywords = keywordSpec.split("\\|");

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        List<Map<String,Object>> collected = new ArrayList<Map<String,Object>>();
        for (int idx = 0; idx < keywords.length; idx++) {
            String keyword = keywords[idx];
            System.out.println(keyword);
            List<Map<String, Object>> hits = Xueqiu.getData(keyword, endTime, null, cookie);
            System.out.println(keyword + " ---- " + hits.size());
            collected.addAll(hits);
        }

        // Column keys handed to the exporter, in output order.
        List<String> columns = new ArrayList<String>();
        for (String key : new String[] {
                "title", "time", "content", "uper", "url", "likeCount", "replyCount"}) {
            columns.add(key);
        }

        excel.exportExcel("D:\\crawlerdata\\自媒体\\雪球网页采集-软博会-relevance.xlsx", "马化腾", columns, collected);
    }
}
//package com.zhiwei.keyword;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Xueqiu;
//
//public class XueqiuKeyWord {
//
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// String word = "软博会|软件博览会";
// String endTime = "2018-01-01 00:00:00";
// String cookie = "aliyungf_tc=AQAAADi3G3I7GAEAgtgMPF/W3FeFauXk; device_id=3221abfd50f9f8be2abca9182c338c9e; Hm_lvt_1db88642e346389874251b5a1eded6e3=1561962809; s=d911zeymzo; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=b2a8d68c56272cca3b71e1bcaf04b744b16e9497; xq_a_token.sig=hj7OrhYid1GuAaF-AmLYrPuyFDk; xqat=b2a8d68c56272cca3b71e1bcaf04b744b16e9497; xqat.sig=gker8dcZJ2Ez4s58eIGPa5zCd0w; xq_r_token=b0ad7b059411acac8c9e2f0a40336804bb60d047; xq_r_token.sig=DwNSLIigE8xtwQKLygagcM28Dd0; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=1273068356; u.sig=UyqQW-br8hcsOyT6anFmS_SFpCs; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1561963129";
//
//
//
// String[] words = word.split("\\|");
//
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(String w : words) {
// System.out.println(w);
//
// List<Map<String, Object>> dataList = Xueqiu.getData(w, endTime, null, cookie);
// System.out.println(w + " ---- " + dataList.size());
// bodyList.addAll(dataList);
// }
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("uper");
// headList.add("url");
// headList.add("likeCount");
// headList.add("replyCount");
// poi.exportExcel("D:\\crawlerdata\\自媒体\\雪球网页采集-软博会-relevance.xlsx", "马化腾", headList, bodyList);
//
// }
//}
package com.zhiwei.shipin;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Aiqiyi;
import com.zhiwei.util.WordReadFile;
public class AiqiyiTest {

    /**
     * Manual example: reads keywords from a local text file, calls
     * {@code Aiqiyi.getAiqiyiByWordData} for each keyword with
     * {@code ProxyHolder.NAT_HEAVY_PROXY}, and exports every non-empty result into one
     * Excel workbook.
     */
    @Test
    public void aiqiyiTest() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);

        List<String> keywords = WordReadFile.getWords("D://crawlerdata//关键词.txt");
        List<Map<String,Object>> collected = new ArrayList<Map<String,Object>>();
        for (String keyword : keywords) {
            List<Map<String,Object>> hits = Aiqiyi.getAiqiyiByWordData(keyword, ProxyHolder.NAT_HEAVY_PROXY);
            // Null or empty results contribute nothing.
            if (hits == null || hits.isEmpty()) {
                continue;
            }
            collected.addAll(hits);
        }

        // Column keys handed to the exporter, in output order.
        List<String> columns = new ArrayList<String>();
        for (String key : new String[] {"time", "source", "content", "url", "title", "word"}) {
            columns.add(key);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D://crawlerdata//视频/爱奇艺关键词采集-毓婷-0716.xlsx", "数据", columns, collected);
    }
}
//package com.zhiwei.shipin;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Aiqiyi;
//import com.zhiwei.util.WordReadFile;
//
//public class AiqiyiTest {
// @Test
// public void aiqiyiTest() {
//
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(String w : wordList) {
// List<Map<String,Object>> dataList = Aiqiyi.getAiqiyiByWordData(w,ProxyHolder.NAT_HEAVY_PROXY);
// if(dataList != null && dataList.size() >= 1) {
// bodyList.addAll(dataList);
// }
// }
// List<String> headList = new ArrayList<String>();
// headList.add("time");
// headList.add("source");
// headList.add("content");
// headList.add("url");
// headList.add("title");
// headList.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频/爱奇艺关键词采集-毓婷-0716.xlsx", "数据", headList, bodyList);
//
//
//
//
// }
//
//}
package com.zhiwei.shipin;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.BiliBili;
import com.zhiwei.util.WordReadFile;
public class BilibiliTest {

    /**
     * Manual example: reads keywords from a local text file, calls
     * {@code BiliBili.getData} for each keyword (second argument left {@code null}) and
     * exports all non-null results into one Excel workbook. The keyword and its hit
     * count are printed to standard output for every non-null result.
     *
     * <p>NOTE(review): a browser cookie is embedded in the source below; consider
     * loading it from local configuration instead of committing it.
     */
    @Test
    public void f() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

        List<String> keywords = WordReadFile.getWords("D://crawlerdata//关键词-1.txt");
        List<Map<String, Object>> collected = new ArrayList<>();
        String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274";

        for (String keyword : keywords) {
            List<Map<String, Object>> hits =
                    BiliBili.getData(keyword, null, "2019-07-18 00:00:00", cookie);
            if (hits == null) {
                continue;
            }
            System.out.println(keyword + " ----- " + hits.size());
            collected.addAll(hits);
        }

        // Column keys handed to the exporter, in output order.
        String[] columnKeys = {
            "submitcount", "playcount", "time", "source", "title", "url", "word"
        };
        List<String> columns = new ArrayList<>();
        for (String key : columnKeys) {
            columns.add(key);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D://crawlerdata//视频//bilibili关键词采集数据-吃鸡否-0722.xlsx", "B站数据", columns, collected);
    }
}
//package com.zhiwei.shipin;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.BiliBili;
//import com.zhiwei.util.WordReadFile;
//
//public class BilibiliTest {
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词-1.txt");
// List<Map<String, Object>> bodyList = new ArrayList<>();
// String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274";
// for (String word : wordList) {
// List<Map<String, Object>> dataList = BiliBili.getData(word, null, "2019-07-18 00:00:00",
// cookie);
// if (dataList != null) {
// System.out.println(word + " ----- " + dataList.size());
// bodyList.addAll(dataList);
// }
// }
// List<String> headlist = new ArrayList<>();
// headlist.add("submitcount");
// headlist.add("playcount");
// headlist.add("time");
// headlist.add("source");
// headlist.add("title");
// headlist.add("url");
// headlist.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频//bilibili关键词采集数据-吃鸡否-0722.xlsx", "B站数据", headlist, bodyList);
//
// }
//}
package com.zhiwei.shipin;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.shipin.QQTV;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.util.WordReadFile;
public class QQTVTest {

    /**
     * Manual example: reads keywords from a local text file, calls
     * {@code QQTV.getData} for each keyword with {@code ProxyHolder.NAT_HEAVY_PROXY},
     * and exports all non-null results into one Excel workbook.
     *
     * <p>{@code ZhiWeiTools.sleep(1000)} is invoked after every keyword, whether or not
     * the keyword produced a result.
     */
    @Test
    public void f() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);

        final String time = "2019-04-11 00:00:00";
        List<String> keywords = WordReadFile.getWords("D://crawlerdata//关键词.txt");
        List<Map<String, Object>> collected = new ArrayList<>();

        for (String keyword : keywords) {
            List<Map<String, Object>> hits = QQTV.getData(keyword, time, ProxyHolder.NAT_HEAVY_PROXY);
            boolean gotResult = hits != null;
            if (gotResult) {
                System.out.println(keyword + " ----- " + hits.size());
                collected.addAll(hits);
            }
            ZhiWeiTools.sleep(1000);
        }

        // Column keys handed to the exporter, in output order.
        List<String> columns = new ArrayList<>();
        for (String key : new String[] {"playCount", "time", "source", "title", "url", "word"}) {
            columns.add(key);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D://crawlerdata//视频//腾讯视频关键词采集数据-毓婷-0716.xlsx", "腾讯视频数据", columns, collected);
    }
}
//package com.zhiwei.shipin;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.shipin.QQTV;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//import com.zhiwei.util.WordReadFile;
//
//public class QQTVTest {
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
// String time = "2019-04-11 00:00:00";
// List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
// List<Map<String, Object>> bodyList = new ArrayList<>();
// for (String word : wordList) {
// List<Map<String, Object>> dataList = QQTV.getData(word,time, ProxyHolder.NAT_HEAVY_PROXY);
// if (dataList != null) {
// System.out.println(word + " ----- " + dataList.size());
// bodyList.addAll(dataList);
// }
// ZhiWeiTools.sleep(1000);
// }
// List<String> headlist = new ArrayList<>();
// headlist.add("playCount");
// headlist.add("time");
// headlist.add("source");
// headlist.add("title");
// headlist.add("url");
// headlist.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频//腾讯视频关键词采集数据-毓婷-0716.xlsx", "腾讯视频数据", headlist, bodyList);
//
//
//
//
// }
//}
package com.zhiwei.shipin;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.shipin.SohuTV;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.util.WordReadFile;
public class SohuTVTest {

    /**
     * Manual example: reads keywords from a local text file, calls
     * {@code SohuTV.sohuTVData} for each keyword (third argument left {@code null}) and
     * exports all non-null results into one Excel workbook.
     *
     * <p>{@code ZhiWeiTools.sleep(1000)} is invoked after every keyword, whether or not
     * the keyword produced a result.
     *
     * <p>NOTE(review): a browser cookie is embedded in the source below; consider
     * loading it from local configuration instead of committing it.
     */
    @Test
    public void f() {
        List<String> keywords = WordReadFile.getWords("D://crawlerdata//关键词.txt");
        List<Map<String, Object>> collected = new ArrayList<>();
        String cookie = "SUV=1901101134139015; IPLOC=CN3301; gidinf=x099980109ee0f08567b42835000336ade2ef3762611; fuid=15474616189304048886; newpuid=15474616191372936893; beans_mz_userid=UBThg01XRPg8; pmai=dad35c1c318bdd22; ifoxinstalled=false; beans_freq=1; beans_dmp=%7B%22admaster%22%3A1547461620%2C%22shunfei%22%3A1547461620%2C%22reachmax%22%3A1548816807%2C%22lingji%22%3A1547461620%2C%22yoyi%22%3A1547461620%2C%22ipinyou%22%3A1547461620%2C%22ipinyou_admaster%22%3A1547461620%2C%22miaozhen%22%3A1548816807%2C%22diantong%22%3A1547461620%2C%22huayang%22%3A1547461620%7D; beans_dmp_done=1; reqtype=pc; sokey=%5B%7B%22key%22%3A%22%E7%BE%8E%E5%9B%A2%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%20%E4%BA%8C%E5%99%81%E8%8B%B1%22%7D%5D; t=1548817812321";

        for (String keyword : keywords) {
            List<Map<String, Object>> hits = SohuTV.sohuTVData(keyword, cookie, null);
            boolean gotResult = hits != null;
            if (gotResult) {
                System.out.println(keyword + " ----- " + hits.size());
                collected.addAll(hits);
            }
            ZhiWeiTools.sleep(1000);
        }

        // Column keys handed to the exporter, in output order.
        List<String> columns = new ArrayList<>();
        for (String key : new String[] {"playCount", "time", "source", "title", "url", "word"}) {
            columns.add(key);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D://crawlerdata//视频//搜狐视频关键词采集数据-毓婷-0716.xlsx", "搜狐数据", columns, collected);
    }
}
//package com.zhiwei.shipin;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.shipin.SohuTV;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//import com.zhiwei.util.WordReadFile;
//
//public class SohuTVTest {
// @Test
// public void f() {
// List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
// List<Map<String, Object>> bodyList = new ArrayList<>();
// String cookie = "SUV=1901101134139015; IPLOC=CN3301; gidinf=x099980109ee0f08567b42835000336ade2ef3762611; fuid=15474616189304048886; newpuid=15474616191372936893; beans_mz_userid=UBThg01XRPg8; pmai=dad35c1c318bdd22; ifoxinstalled=false; beans_freq=1; beans_dmp=%7B%22admaster%22%3A1547461620%2C%22shunfei%22%3A1547461620%2C%22reachmax%22%3A1548816807%2C%22lingji%22%3A1547461620%2C%22yoyi%22%3A1547461620%2C%22ipinyou%22%3A1547461620%2C%22ipinyou_admaster%22%3A1547461620%2C%22miaozhen%22%3A1548816807%2C%22diantong%22%3A1547461620%2C%22huayang%22%3A1547461620%7D; beans_dmp_done=1; reqtype=pc; sokey=%5B%7B%22key%22%3A%22%E7%BE%8E%E5%9B%A2%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%20%E4%BA%8C%E5%99%81%E8%8B%B1%22%7D%5D; t=1548817812321";
// for (String word : wordList) {
// List<Map<String, Object>> dataList = SohuTV.sohuTVData(word, cookie, null);
// if (dataList != null) {
// System.out.println(word + " ----- " + dataList.size());
// bodyList.addAll(dataList);
// }
// ZhiWeiTools.sleep(1000);
// }
// List<String> headlist = new ArrayList<>();
// headlist.add("playCount");
// headlist.add("time");
// headlist.add("source");
// headlist.add("title");
// headlist.add("url");
// headlist.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频//搜狐视频关键词采集数据-毓婷-0716.xlsx", "搜狐数据", headlist, bodyList);
//
// }
//}
package com.zhiwei.shipin;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Youku;
import com.zhiwei.util.WordReadFile;
public class YoukuKeyWordTest {

    /**
     * Manual example: reads keywords from a local text file, prints each keyword,
     * collects the rows returned by {@code Youku.getDataList} for it, and exports the
     * merged rows into one Excel workbook.
     */
    @Test
    public void f() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

        List<String> keywords = WordReadFile.getWords("D://crawlerdata//关键词.txt");
        List<Map<String,Object>> collected = new ArrayList<>();
        for (String keyword : keywords) {
            System.out.println(keyword);
            List<Map<String,Object>> hits = Youku.getDataList(keyword);
            collected.addAll(hits);
        }

        // Column keys handed to the exporter, in output order.
        List<String> columns = new ArrayList<>();
        for (String key : new String[] {"title", "time", "url", "uper", "word"}) {
            columns.add(key);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D://crawlerdata//视频//优酷数据-毓婷-0716.xlsx", "数据", columns, collected);
    }
}
//package com.zhiwei.shipin;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Youku;
//import com.zhiwei.util.WordReadFile;
//
//public class YoukuKeyWordTest {
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// List<String> words = WordReadFile.getWords("D://crawlerdata//关键词.txt");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(String w : words) {
// System.out.println(w);
// bodyList.addAll(Youku.getDataList(w));
// }
// List<String> headList = new ArrayList<>();
// headList.add("title");
// headList.add("time");
// headList.add("url");
// headList.add("uper");
// headList.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频//优酷数据-毓婷-0716.xlsx", "数据", headList, bodyList);
//
// }
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment