Commit 9234d24c by yangchen

更新

parent cb5516a0
......@@ -3,42 +3,27 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>articlenewscrawler</artifactId>
<version>0.1.3-SNAPSHOT</version>
<version>0.1.6-SNAPSHOT</version>
<name>articlenewscrawler</name>
<description>采集凤凰,一点资讯,搜狐历史文章和文章评论</description>
<dependencies>
<dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>
<version>6.14.3</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.29</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>excelpoi</artifactId>
<version>0.0.1-SNAPSHOT</version>
<version>0.0.3-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.2-SNAPSHOT</version>
<version>0.1.3-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.3.0-RELEASE</version>
<version>0.3.6-RELEASE</version>
<scope>provided</scope>
</dependency>
</dependency>
</dependencies>
<!-- 打包管理 -->
......
......@@ -113,14 +113,14 @@ public class HeadGet {
* @throws IOException
*/
public static Map<String,String> getFenghuangAccountHeaderMap(String cookie) {
Map<String,String> headerMap = new HashMap<String, String>();
Map<String,String> headerMap = new HashMap<>();
headerMap.put("User-Agent",
"IfengNews/6.1.8 (iPhone; iOS 11.2.1; Scale/2.00)");
"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36");
headerMap.put("Accept",
"*/*");
headerMap.put("Accept-Language", "zh-cn");
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
headerMap.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
headerMap.put("Connection", "keep-alive");
headerMap.put("Host", "api.3g.ifeng.com");
headerMap.put("Host", "shankapi.ifeng.com");
if(cookie != null) {
headerMap.put("Cookie", cookie);
}
......
......@@ -16,7 +16,7 @@ import okhttp3.Response;
public class HttpClient {
private static Logger logger = LoggerFactory.getLogger(HttpClient.class);
private static HttpBoot httpBoot = new HttpBoot(false,2);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/**
*
......@@ -44,15 +44,27 @@ public class HttpClient {
* @throws IOException
*/
public static String executeHttpRequestGet(String url,ProxyHolder proxy,Map<String, String> headerMap) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){
for(int i = 0;i < 3;i++) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){
return response.body().string();
} catch (Exception e) {
logger.error("httpClient 获取数据出现问题:{}", e);
}
}
return null;
}
/**
 * Sends a POST request through the shared {@code httpBoot} client and returns the
 * response body as text.
 *
 * @param url       target URL
 * @param proxy     proxy handed to {@code httpBoot.syncCall}
 * @param headerMap request headers passed to {@code RequestUtils.wrapPost}
 * @param paramMap  request parameters passed to {@code RequestUtils.wrapPost}
 * @return the response body, or {@code null} when any exception occurs
 */
public static String executeHttpRequestPost(String url,Proxy proxy,Map<String, String> headerMap,Map<String, Object> paramMap) {
    String body;
    try (Response resp = httpBoot.syncCall(RequestUtils.wrapPost(url, headerMap, paramMap), proxy)) {
        body = resp.body().string();
    } catch (Exception ex) {
        // Failures are logged and reported to the caller as a null body.
        logger.error("httpClient 获取数据出现问题:{}", ex);
        body = null;
    }
    return body;
}
public static String executeHttpRequestPost(String url,Proxy proxy,Map<String, String> headerMap,Map<String, Object> paramMap) {
public static String executeHttpRequestPost(String url,ProxyHolder proxy,Map<String, String> headerMap,Map<String, Object> paramMap) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(url, headerMap, paramMap), proxy)){
return response.body().string();
} catch (Exception e) {
......
......@@ -20,7 +20,7 @@ public class Aika {
private static Logger logger = LoggerFactory.getLogger(Aika.class);
private static AikaCommentAnalysis aikaCommentAnalysis = new AikaCommentAnalysis();
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
@SuppressWarnings("unchecked")
public static List<Map<String,Object>> getAikaComment(String url,ProxyHolder proxy) {
......@@ -46,6 +46,7 @@ public class Aika {
page++;
} catch (Exception e) {
logger.error("爱卡汽车 评论采集出错 {}", e);
break;
}
}
......
......@@ -23,7 +23,7 @@ import okhttp3.Response;
public class Aiqiyi {
private static Logger logger = LoggerFactory.getLogger(Aiqiyi.class);
private static AiqiyiByWordAnalysis aiqiyiByWordAnalysis = new AiqiyiByWordAnalysis();
private static HttpBoot httpBoot = new HttpBoot(false, 2);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/**
*
......
......@@ -2,8 +2,11 @@ package com.zhiwei.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -23,7 +26,7 @@ import okhttp3.Request;
public class Baijia {
private static Logger logger = LoggerFactory.getLogger(Baijia.class);
private static BaijiaAccountAnalysis baijiaAccountAnalysis = new BaijiaAccountAnalysis();
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/**
*
......@@ -77,27 +80,29 @@ public class Baijia {
Map<String,String> headerMap = HeaderTool.getCommonHead();
List<Map<String,Object>> dataList = new ArrayList<>();
headerMap.put("cookie",cookie);
String uk = getUkData(app_id,proxy,cookie);
if(Objects.isNull(uk)) {
return Collections.emptyList();
}
boolean f = true;
int n = 0;
String ctime = "";
while(f) {
for(int i = 1;i < 3;i++) {
try {
String url = "https://author.baidu.com/list?type=article&context={%22offset%22:%22-1_"+n+"%22,%22app_id%22:%22"+app_id+"%22,%22pageSize%22:20}";
System.out.println(url);
String url = "https://author.baidu.com/list?type=article&tab=2&uk="+uk+"&ctime="+ctime+"&num=50";
Request request = RequestUtils.wrapGet(url, headerMap);
String result = httpBoot.syncCall(request, proxy, false).body().string();
String result = httpBoot.syncCall(request, proxy).body().string();
Map<String,Object> dMap = baijiaAccountAnalysis.getBaijiaAccountData3(result,name, startTime);
List<Map<String,Object>> dList = (List<Map<String, Object>>) dMap.get("data");
dataList.addAll(dList);
logger.info("{} 数据采集结果 {}",name, dataList.size());
logger.info("{} 数据采集结果 {}",app_id, dataList.size());
if(!(boolean) dMap.get("more")) {
f = false;
}
ctime = String.valueOf(dMap.get("ctime"));
ZhiWeiTools.sleep(3000);
n += 20;
break;
} catch (Exception e) {
e.printStackTrace();
ZhiWeiTools.sleep(3000);
}
}
......@@ -106,6 +111,22 @@ public class Baijia {
return dataList;
}
/**
 * Requests the author profile page for the given app id and extracts the
 * {@code uk} value embedded in the response text.
 *
 * <p>The request is attempted up to three times. Any exception (request failure,
 * or the {@code uk} marker not being present in the response) is logged and
 * triggers the next attempt.
 *
 * @param app_id account app id inserted into the profile URL
 * @param proxy  proxy handed to {@code httpBoot.syncCall}
 * @param cookie value sent as the {@code cookie} request header
 * @return the extracted {@code uk} value, or {@code null} if all attempts fail
 */
private static String getUkData(String app_id,Proxy proxy,String cookie) {
    String url = "https://author.baidu.com/profile?context={\"from\":0,\"app_id\":\""
            +app_id+"\"}&cmdType=&pagelets[]=root&reqID=0&ispeed=1#";
    Map<String,Object> headers = new HashMap<>();
    headers.put("Host", "author.baidu.com");
    headers.put("cookie", cookie);
    for(int i = 0; i < 3;i++) {
        try {
            String result = httpBoot.syncCall(RequestUtils.wrapGet(url,headers), proxy).body().string();
            // The value sits between the escaped markers uk\":\" and \", in the payload.
            return result.split("uk\\\\\":\\\\\"")[1].split("\\\\\",")[0];
        } catch (Exception e) {
            // Pass the exception so the cause (network error vs. missing marker) is visible.
            logger.error("百家号uk 获取失败", e);
        }
    }
    return null;
}
/**
*
* @Description 百家号历史文章采集
......@@ -114,7 +135,7 @@ public class Baijia {
* @return
*/
public static List<Map<String,Object>> getBaijiaAccountData(String app_id,String startTime,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
List<Map<String,Object>> dataList = new ArrayList<>();
int i = 0;
Map<String,String> headerMap = HeadGet.getBaijiaAccountHeaderMap(null);
try {
......
......@@ -12,28 +12,28 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpRequestBuilder;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.parse.analysis.BilibilikeyWordAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Headers;
import okhttp3.Request;
public class BiliBili {
private static final Logger logger = LoggerFactory.getLogger(BiliBili.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).useCookieJar(true).build();
@SuppressWarnings("unchecked")
public static List<Map<String,Object>> getData(String word,Proxy proxy,String cookie) {
public static List<Map<String,Object>> getData(String word,Proxy proxy,String endTime,String cookie) {
List<Map<String,Object>> bodyList = new ArrayList<>();
try {
//
String url = "https://search.bilibili.com/all?keyword="+URLEncoder.encode(word, "utf-8")+"&order=pubdate&duration=0&tids_1=0";
Headers header = Headers.of("cookie",cookie,"Referer","https://www.bilibili.com/","Host","search.bilibili.com");
Request request = HttpRequestBuilder.newGetRequest(url, header);
String result = httpBoot.syncCall(request, proxy).body().string();
ZhiWeiTools.sleep(3000);
Map<String,Object> map = BilibilikeyWordAnalysis.getData(result,word);
String result = httpBoot.syncCall(RequestUtils.wrapGet(url, header), ProxyHolder.NAT_HEAVY_PROXY).body().string();
ZhiWeiTools.sleep(100);
Map<String,Object> map = BilibilikeyWordAnalysis.getData(result,word,endTime);
boolean more = (boolean) map.get("more");
List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("data");
if(dataList != null) {
......@@ -43,27 +43,23 @@ public class BiliBili {
while(more) {
map.clear();
String ur = url + "&page=" + n;
System.out.println(ur);
request = HttpRequestBuilder.newGetRequest(ur, header);
String result2 = httpBoot.syncCall(request, proxy).body().string();
map = BilibilikeyWordAnalysis.getData(result2,word);
String result2 = httpBoot.syncCall(RequestUtils.wrapGet(ur, header), ProxyHolder.NAT_HEAVY_PROXY).body().string();
map = BilibilikeyWordAnalysis.getData(result2,word,endTime);
List<Map<String,Object>> dataList2 = (List<Map<String, Object>>) map.get("data");
if(dataList2 != null) {
bodyList.addAll(dataList2);
}
System.out.println(n + "页,数据总量为 -- " + bodyList.size() );
logger.info("word {} , {} 页,数据总量为 -- {}",word,n, bodyList.size());
more = (boolean) map.get("more");
n++;
ZhiWeiTools.sleep(3000);
ZhiWeiTools.sleep(100);
}
return bodyList;
} catch (UnsupportedEncodingException e) {
logger.error("e ",e);
logger.error("e {}",e);
} catch (Exception e) {
logger.error("e ",e);
logger.error("e {}",e);
}
return Collections.emptyList();
}
......
......@@ -25,7 +25,7 @@ import okhttp3.Response;
public class Chejia {
private static final Logger logger = LoggerFactory.getLogger(Chejia.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/**
*
......
......@@ -11,6 +11,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.DayuAccountAnalysis;
......@@ -30,26 +31,23 @@ public class Dayu {
* @param mid
* @return
*/
public static List<Map<String,Object>> getDayuAccountData(String mid,String name,String startTime,Proxy proxy) {
public static List<Map<String,Object>> getDayuAccountData(String mid,String name,String startTime,ProxyHolder proxy) {
int i = 1;
Map<String,String> headerMap = HeadGet.getDayuAccountHeaderMap(null);
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
List<Map<String,Object>> dataList = new ArrayList<>();
try {
while(true) {
String url = "http://ff.dayu.com/contents/author/"+mid+"?biz_id=1002&_size=50&_page="+i+"&_order_type=published_at&status=1&_fetch=1";
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
System.out.println(url);
List<Map<String,Object>> lists = dayuAccountAnalysis.getDayuAccountData(result,name,startTime);
if(lists == null) {
break;
}
if(lists.size() < 1) {
if(lists == null || lists.isEmpty()) {
break;
}
dataList.addAll(lists);
System.out.println("================解析第"+i+"页====此时有数据=="+dataList.size());
i++;
ZhiWeiTools.sleep(7000);
ZhiWeiTools.sleep(100);
}
return dataList;
} catch (Exception e) {
......
......@@ -25,7 +25,7 @@ public class Douban {
// Logger bound to this class (previously bound to java.lang.Double by mistake).
private static final Logger logger = LoggerFactory.getLogger(Douban.class);
private static DoubanCommentAnalysis doubanCommentAnalysis = new DoubanCommentAnalysis();
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/**
*
......
......@@ -19,6 +19,7 @@ import com.zhiwei.parse.analysis.FenghuangCommentAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
public class Fenghuang {
private static Logger logger = LoggerFactory.getLogger(Fenghuang.class);
private static FenghuangAccountAnalysis fenghuangAccountAnalysis = new FenghuangAccountAnalysis();
private static FenghuangCommentAnalysis fenghuangCommentAnalysis = new FenghuangCommentAnalysis();
......@@ -31,7 +32,7 @@ public class Fenghuang {
* @param startTime 可不传 格式(2017-12-09 17:53:02)
* @return
*/
public static List<Map<String,Object>> getFenghuangAccountData(String id,String startTime,Proxy proxy) {
public static List<Map<String,Object>> getFenghuangAccountData(String id,String startTime,ProxyHolder proxy) {
List<Map<String,Object>> dataList = new ArrayList<>();
int i = 1;
boolean f = true;
......@@ -39,17 +40,17 @@ public class Fenghuang {
try {
for(int j = 0;j< 3;j++){
f = true;
String url = "http://api.3g.ifeng.com/api_wemedia_index?followid=weMedia_"+id+"&page="+i+"&pagesize=20&tag=article&uid=fe659b7e510444c28a31f88dee7a2747";
String url = "https://shankapi.ifeng.com/winter/feng/author/getFengAuthorListData/"+id+"/doc/"+i+"/getFengAuthorListData";
List<Map<String,Object>> list = fenghuangAccountAnalysis.getArticleData(url, startTime,proxy);
if(list != null && !list.isEmpty()) {
dataList.addAll(list);
logger.info("====================采集第 {} 页===共获取数据== {}",i,dataList.size());
logger.info("采集第 {} 页,.共获取数据{}",i,dataList.size());
i++;
ZhiWeiTools.sleep(2000);
ZhiWeiTools.sleep(100);
break;
}
f = false;
ZhiWeiTools.sleep(2000);
ZhiWeiTools.sleep(100);
}
} catch (Exception e) {
logger.error("程序出错 {}",e);
......
......@@ -18,7 +18,7 @@ public class Gftai {
private static final Logger logger = LoggerFactory.getLogger(Gftai.class);
private static GftaiAnalysis gftaiAnalysis = new GftaiAnalysis();
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public static List<Map<String,Object>> getData(String word,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<>();
......
......@@ -19,7 +19,7 @@ public class KuaiTousu {
private static Logger logger = LoggerFactory.getLogger(KuaiTousu.class);
private static KuaiTousuAnalysis kuaiTousuAnalysis = new KuaiTousuAnalysis();
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public static List<Map<String,Object>> getData(String word,Proxy proxy) {
int page = 1;
......
......@@ -30,7 +30,7 @@ import okhttp3.Response;
public class Maimai {
private static Logger logger = LoggerFactory.getLogger(Maimai.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static MaimaiBywordAnalysis maimaiBywordAnalysis = new MaimaiBywordAnalysis();
......
......@@ -22,7 +22,7 @@ public class Pcauto {
private static Logger logger = LoggerFactory.getLogger(Pcauto.class);
private static PcautoCommentAnalysis pcautoCommentAnalysis = new PcautoCommentAnalysis();
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
@SuppressWarnings("unchecked")
public static List<Map<String, Object>> getPcAutoComment(String url,ProxyHolder proxy) {
......
......@@ -13,6 +13,7 @@ import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.bean.QQkbUser;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.QQKBAccountAnalysis;
......@@ -120,7 +121,7 @@ public class QQKB {
while(true) {
try {
String result = HttpClient.executeHttpRequestPost("http://r.cnews.qq.com/getQQNewsComment",ProxyFactory.getNatProxy(), headerMap, paramMap);
String result = HttpClient.executeHttpRequestPost("http://r.cnews.qq.com/getQQNewsComment",ProxyHolder.NAT_HEAVY_PROXY, headerMap, paramMap);
paramMap.clear();
List<Map<String,Object>> lists = qqkbCommentAnalysis.getCommentData(result,null,comment_id, article_id,proxy);
if(lists == null || lists.size() < 1) {
......@@ -148,7 +149,7 @@ public class QQKB {
String cookie = "luin=o0497332654;%20lskey=00030000d63ffaf7eba88c86106eac5f2910d45515222334b91c75a66b449c990c2be43cd202ba39b35bef60;%20uin=o0497332654;%20skey=MH3wukytS4;%20sigA2=7AB4D8DEDF73E313801FD348FD77EC3B05C06DBC4D9DA669B20CA04A8D6B80F300A69567FBD11A7B799E419BB796F22D47D3AE5FA95E708A0ABC66161061131B0B21A0031AA0807C;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";
Map<String,String> headerMap = HeadGet.getQQkbUserHeaderMap(cookie);
Map<String,Object> paramMap = HeadGet.getQQkbUserParamMap(name);
String result = HttpClient.executeHttpRequestPost(url, null, headerMap, paramMap);
String result = HttpClient.executeHttpRequestPost(url, ProxyHolder.NAT_HEAVY_PROXY, headerMap, paramMap);
JSONObject json = JSONObject.parseObject(result);
JSONObject json1 = json.getJSONObject("new_list");
JSONObject json2 = json1.getJSONArray("data").getJSONObject(0);
......
......@@ -24,7 +24,7 @@ public class QQNews {
private static final Logger logger = LoggerFactory.getLogger(QQNews.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/**
* .
......
......@@ -17,7 +17,7 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public class QicheHome {
private static Logger logger = LoggerFactory.getLogger(QicheHome.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static QicheHomeKwyWordAnalysis qicheHomeKwyWordAnalysis = new QicheHomeKwyWordAnalysis();
......
......@@ -24,7 +24,7 @@ public class SinaKeji {
private static Logger logger = LoggerFactory.getLogger(SinaKeji.class);
private static SinaKejiCommentAnalysis sinaKejiCommentAnalysis = new SinaKejiCommentAnalysis();
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/**
* https://tech.sina.com.cn/it/2018-07-17/doc-ihfkffam0632649.shtml
......
......@@ -21,7 +21,7 @@ public class SinaTousu {
private static final Logger logger = LoggerFactory.getLogger(SinaTousu.class);
private static SinaTousuAnalysis sinaTousuAnalysis = new SinaTousuAnalysis();
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public static List<Map<String,Object>> getSinaTousuData(String word,ProxyHolder proxy,String time) {
List<Map<String,Object>> bodyList = new ArrayList<>();
......
......@@ -2,14 +2,11 @@ package com.zhiwei.parse;
import static java.util.Objects.nonNull;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -73,36 +70,28 @@ public class Souhu {
* @param isCulling 是否采集精选
* @return
*/
public static List<Map<String,Object>> getSouHuAccountData(String xpt,String startTime,boolean isCulling,Proxy proxy) {
public static List<Map<String,Object>> getSouHuAccountData(String id,String name,String startTime,boolean isCulling,ProxyHolder proxy) {
int i = 1;
String name = getName(xpt,proxy);
ZhiWeiTools.sleep(2000);
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
Map<String,String> headerMap = HeadGet.getSouhuAccountHeaderMap(null);
ZhiWeiTools.sleep(200);
List<Map<String,Object>> dataList = new ArrayList<>();
boolean f = true;
int j = 0;
while(f) {
try {
String url = "http://mp.sohu.com/apiV2/profile/newsListAjax?xpt="+xpt+"&pageNumber="+i+"&pageSize=10";
String result = null;
String url = "http://v2.sohu.com/author-page-api/author-articles/pc/"+id+"?pNo="+i;
if(isCulling) {
url = url + "&categoryId=-1";
}
try {
result = HttpClient.executeHttpRequestGet(url,proxy,headerMap);
} catch (Exception e) {
e.printStackTrace();
url = url + "&columnId=-1";
}
result = result.replaceAll("\\\\", "");
result = result.substring(1, result.length()-1);
String result = HttpClient.executeHttpRequestGet(url,proxy,null);
JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArray = json.getJSONArray("data");
JSONArray jsonArray = json.getJSONObject("data").getJSONArray("pcArticleVOS");
List<Map<String,Object>> dataList1 = souhuAccountAnalysis.analysisData(jsonArray,name);
if(jsonArray.size() < 1) {
if(jsonArray.isEmpty()) {
break;
}
if(startTime == null) {
j = 0;
dataList.addAll(dataList1);
}
//判断时间
......@@ -113,40 +102,26 @@ public class Souhu {
f = false;
break;
}
j = 0;
dataList.add(map);
}
}
logger.info("=============获取到的数据数目{}",dataList.size());
i++;
ZhiWeiTools.sleep(3000);
ZhiWeiTools.sleep(300);
} catch (Exception e) {
ZhiWeiTools.sleep(3000);
logger.error("出错了",e.getMessage());
ZhiWeiTools.sleep(300);
logger.error("出错了 {}",e);
j++;
if(j > 5) {
f = false;
}
continue;
}
}
return dataList;
}
private static String getName(String xpt,Proxy proxy) {
Map<String,String> headerMap = HeadGet.getSouhuAccountHeaderMap(null);
try {
String result = HttpClient.executeHttpRequestGet("http://mp.sohu.com/profile?xpt="+xpt, proxy, headerMap);
Document doc = Jsoup.parse(result);
String name = doc.select("p#ff").text();
System.out.println(name);
return name;
} catch (Exception e) {
return null;
}
}
/**
*
* @Description 传入搜狐文章链接和cookie 可获取此文章所有评论
......@@ -161,7 +136,7 @@ public class Souhu {
try {
while(true) {
String newurl = souhuCommentAnalysis.getSouhuURL(url,proxy) + "&page_no=" + j;
String result = HttpClient.executeHttpRequestGet(newurl,ProxyFactory.getNatProxy(),headerMap);
String result = HttpClient.executeHttpRequestGet(newurl,ProxyHolder.NAT_HEAVY_PROXY,headerMap);
System.out.println(newurl);
JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONObject("jsonObject").getJSONArray("comments");
......
......@@ -19,6 +19,7 @@ import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.TXNewsByWordAnalysis;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
......@@ -28,7 +29,7 @@ public class TXNews {
private static Logger logger = LoggerFactory.getLogger(TXNews.class);
private static TXNewsByWordAnalysis txNewsByWordAnalysis = new TXNewsByWordAnalysis();
public static boolean txNewshasMoreData = true;
public static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public static List<Map<String,Object>> getData(String word,String devid,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<>();
......@@ -120,5 +121,47 @@ public class TXNews {
return -1;
}
/**
 * Pages through the {@code mediaArticles} endpoint for the given media id and
 * collects one map per article with the keys {@code title}, {@code content},
 * {@code time}, {@code source} and {@code url}.
 *
 * <p>Paging stops when a page holds fewer than 10 items, when an article's time is
 * not later than {@code endTime}, or after more than three failed requests. A
 * failed request is retried on the same page.
 *
 * @param mid     media account id inserted into the request URL
 * @param endTime lower time bound, compared as a string against times formatted as
 *                {@code yyyy-MM-dd HH:mm:ss}; ignored when {@code null} or shorter
 *                than two characters
 * @param proxy   proxy handed to {@code httpBoot.syncCall}
 * @return the articles collected so far (never {@code null})
 */
public static List<Map<String,Object>> getTxNewsHistory(String mid,String endTime,ProxyHolder proxy) {
    List<Map<String,Object>> dataList = new ArrayList<>();
    // Loop-invariant: whether a usable time bound was supplied.
    boolean hasEndTime = endTime != null && endTime.length() > 1;
    int page = 0;
    int errorNum = 0;
    while(true) {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet("https://pacaio.match.qq.com/om/mediaArticles?mid="+mid+"&num=30&page="+page), proxy)){
            String result = response.body().string();
            JSONObject json = JSONObject.parseObject(result);
            JSONArray jsonArray = json.getJSONArray("data");
            for(int i = 0,j = jsonArray.size();i < j;i++) {
                JSONObject data = jsonArray.getJSONObject(i);
                // "timestamp" is multiplied by 1000 before building the Date, i.e. treated as seconds.
                String time = TimeParse.dateFormartString(new Date(data.getLong("timestamp")*1000L), "yyyy-MM-dd HH:mm:ss");
                if(hasEndTime && time.compareTo(endTime) <= 0) {
                    logger.info("超时时间采集范围 跳出采集");
                    return dataList;
                }
                Map<String,Object> map = new HashMap<>();
                map.put("title", data.getString("title"));
                map.put("content", data.getString("abstract"));
                map.put("time", time);
                map.put("source", data.getString("source"));
                map.put("url", data.getString("vurl"));
                dataList.add(map);
            }
            logger.info("mid = {} , cralwer count = {}",mid,dataList.size() );
            page++;
            if(jsonArray.size() < 10) {
                break;
            }
        } catch (Exception e) {
            // Failures are errors, not informational events; the same page is retried.
            logger.error("采集数据出错 {}",e);
            errorNum++;
            if(errorNum > 3) {
                break;
            }
        }
    }
    return dataList;
}
}
......@@ -21,7 +21,7 @@ public class TechTx {
private static Logger logger = LoggerFactory.getLogger(TechTx.class);
private static TechTxCommentAnalysis techTxCommentAnalysis = new TechTxCommentAnalysis();
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
@SuppressWarnings("unchecked")
public static List<Map<String,Object>> getTechTxComment(String url,ProxyHolder proxy) {
......
......@@ -2,6 +2,7 @@ package com.zhiwei.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
......@@ -9,18 +10,24 @@ import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.WangyiCommentAnalysis;
import com.zhiwei.parse.analysis.WangyiHistoryAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
public class Wangyi {
private static Logger logger = LoggerFactory.getLogger(Wangyi.class);
private static WangyiCommentAnalysis wangyiCommentAnalysis = new WangyiCommentAnalysis();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static WangyiHistoryAnalysis wangyiHistoryAnalysis = new WangyiHistoryAnalysis();
/**
......@@ -74,24 +81,31 @@ public class Wangyi {
}
}
/**
*
* @Description 网易网页版数据
* @param url
* @param proxy
* @param endTime
* @return
*/
public static List<Map<String,Object>> getHistoryData(String url,Proxy proxy,String endTime) {
Map<String,String> headerMap = HeadGet.getWangyiHistoryHeaderMap(null);
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
List<Map<String,Object>> bodyList = new ArrayList<>();
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
String wemediaid = result.split("data-wemediaid=\"")[1].split("\"")[0];
String source = Jsoup.parse(result).select("body > div.colum_wrap.fl > div > div.colum_des > div.normal > div.colum_info > h4").text();
boolean f = true;
url = "http://dy.163.com/v2/article/list.do?wemediaId="+wemediaid+"&size=20&pageNo=";
url = "http://dy.163.com/v2/article/list.do?wemediaId="+wemediaid+"&size=10&pageNo=";
int i = 1;
ZhiWeiTools.sleep(1000);
int j = 0;
while(f) {
try {
result = "";
result = HttpClient.executeHttpRequestGet(url+i,proxy, headerMap);
JSONObject json = JSONObject.parseObject(result);
List<Map<String,Object>> dataList = wangyiHistoryAnalysis.getData(result,proxy, endTime,source);
if(dataList == null || dataList.size() < 1) {
if(dataList == null || dataList.isEmpty()) {
break;
}
bodyList.addAll(dataList);
......@@ -109,10 +123,58 @@ public class Wangyi {
if(j > 5) {
f = false;
}
continue;
}
}
return bodyList;
}
/**
 * Pages through the subscribe-list endpoint for the given account id and collects
 * one map per entry with the keys {@code title}, {@code content}, {@code time},
 * {@code source} and {@code url}.
 *
 * <p>The offset in the URL advances by 20 per page. Paging stops when a page holds
 * fewer than 10 items, when an entry's {@code ptime} is not later than
 * {@code endTime}, or after more than three failed requests. A failed request is
 * retried at the same offset.
 *
 * @param id      account id inserted into the request URL
 * @param proxy   proxy handed to {@code httpBoot.syncCall}
 * @param endTime lower time bound, compared as a string against {@code ptime};
 *                ignored when {@code null} or shorter than two characters
 * @return the entries collected so far (never {@code null})
 */
public static List<Map<String,Object>> getWangyiClientHistory(String id,ProxyHolder proxy,String endTime) {
    List<Map<String,Object>> dataList = new ArrayList<>();
    // Loop-invariant: whether a usable time bound was supplied.
    boolean hasEndTime = endTime != null && endTime.length() > 1;
    int page = 0;
    int errorNum = 0;
    while(true) {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet("https://c.m.163.com/nc/subscribe/list/"+id+"/all/"+page+"-20.html"), proxy)){
            String result = response.body().string();
            JSONObject json = JSONObject.parseObject(result);
            JSONArray jsonArray = json.getJSONArray("tab_list");
            for(int i = 0,j = jsonArray.size();i < j;i++) {
                JSONObject data = jsonArray.getJSONObject(i);
                String time = data.getString("ptime");
                if(hasEndTime && time.compareTo(endTime) <= 0) {
                    logger.info("超时时间采集范围 跳出采集");
                    return dataList;
                }
                Map<String,Object> map = new HashMap<>();
                map.put("title", data.getString("title"));
                map.put("content", data.getString("aheadBody"));
                map.put("time", time);
                map.put("source", data.getString("source"));
                // Video entries are addressed by skipID, everything else by postid.
                if("video".equals(data.getString("skipType"))) {
                    map.put("url", "https://c.m.163.com/news/v/" + data.getString("skipID") + ".html");
                }else {
                    map.put("url", "https://c.m.163.com/news/a/" + data.getString("postid") + ".html");
                }
                dataList.add(map);
            }
            logger.info("id = {} , cralwer count = {}",id,dataList.size() );
            page += 20;
            if(jsonArray.size() < 10) {
                break;
            }
        } catch (Exception e) {
            // Failures are errors, not informational events; the same offset is retried.
            logger.error("采集数据出错 {}",e);
            errorNum++;
            if(errorNum > 3) {
                break;
            }
        }
    }
    return dataList;
}
}
......@@ -26,12 +26,12 @@ import okhttp3.Response;
public class Xueqiu {
private static Logger logger = LoggerFactory.getLogger(Xueqiu.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static XueqiuKeyWordAnalysis xueqiuKeyWordAnalysis = new XueqiuKeyWordAnalysis();
/**
*
* @Description 关键词采集历史文章
* @Description 关键词采集文章
* @param word
* @param endTime
* @param proxy
......@@ -53,13 +53,16 @@ public class Xueqiu {
Request request = RequestUtils.wrapGet(url, headers);
String result = httpBoot.syncCall(request, proxy).body().string();
List<Map<String,Object>> list = xueqiuKeyWordAnalysis.getData(result, endTime);
ZhiWeiTools.sleep(3000);
if(list.size() < 1) {
if(list.isEmpty()) {
i++;
}else {
int count = JSONObject.parseObject(result).getIntValue("maxPage");
bodyList.addAll(list);
logger.info("采集到第{} 页 , 一共采集到 {} 数据",page,bodyList.size());
page++;
if(count < page) {
break;
}
}
} catch (Exception e) {
e.printStackTrace();
......@@ -98,16 +101,17 @@ public class Xueqiu {
/**
*
* @Description (TODO这里用一句话描述这个方法的作用)
* @Description 雪球历史文章采集
* @return
*/
public List<Map<String,Object>> getXueqiuAccountData(String userId,String cookie,Proxy proxy) {
public static List<Map<String,Object>> getXueqiuAccountData(String userId,String cookie,Proxy proxy) {
Map<String,Object> headers = new HashMap<>();
headers.put("cookie", cookie);
List<Map<String,Object>> bodyList = new ArrayList<>();
int page = 1;
int errorCount = 1;
while(true) {
int page = 1;
String url = "https://xueqiu.com/v4/statuses/user_timeline.json?page=" + page + "&user_id=6687544095&type=0";
String url = "https://xueqiu.com/v4/statuses/user_timeline.json?page=" + page + "&user_id="+userId+"&type=0";
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headers), proxy)){
String result = response.body().string();
JSONObject json = JSONObject.parseObject(result);
......@@ -121,26 +125,30 @@ public class Xueqiu {
Date date = TimeParse.stringFormartDate(timeBefore);
Map<String, Object> map = new HashMap<>();
map.put("name", ob.getJSONObject("user").getString("screen_name"));//statuses user screen_name
map.put("source", ob.getJSONObject("user").getString("screen_name"));//statuses user screen_name
map.put("time", date);//statuses timeBefore
map.put("source", ob.getString("source"));//statuses source
map.put("content", ob.getString("description").replaceAll("<.*?>", ""));//statuses description
map.put("title", ob.getString("rawTitle"));
map.put("repostCount", ob.getString("retweet_count"));//statuses retweet_count
map.put("commentCount", ob.getString("reply_count"));//statuses reply_count
map.put("likeCount", ob.getString("like_count"));//statuses like_count
map.put("url", "https://xueqiu.coms" + ob.getString("target"));
map.put("url", "https://xueqiu.com" + ob.getString("target"));
bodyList.add(map);
}
int maxPage = json.getInteger("maxPage");
page++;
logger.info("userId = {} , crawler count = {} ,page = {} , maxPage = {}",userId,bodyList.size(),page,maxPage);
if(page > maxPage) {
break;
}
} catch (Exception e) {
logger.error("采集解析出错 {}",e);
break;
errorCount++;
if(errorCount > 3) {
break;
}
}
ZhiWeiTools.sleep(2000);
}
return bodyList;
}
......
package com.zhiwei.parse;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
/**
 * Crawler for CCTV (央视网) video listings.
 *
 * @ClassName Yangshi
 * @Description CCTV crawling
 * @author byte-zbs
 * @Date 2019-07-04 18:08:12
 * @version 1.0.0
 */
public class Yangshi {

    private static final Logger logger = LoggerFactory.getLogger(Yangshi.class);

    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();

    /**
     * Entry point for the crawl. Currently a stub: it issues no request and
     * always returns an empty list.
     *
     * @return an empty, immutable list
     */
    public static List<Map<String,Object>> getData() {
        return Collections.emptyList();
    }

    /**
     * Converts a JSON response whose top-level {@code "list"} array holds video
     * entries into one map per entry.
     *
     * <p>Parsing happens inside the {@code try} so that malformed JSON, a
     * {@code null} payload, or a missing {@code "list"} array yields the rows
     * collected so far (possibly none) instead of an exception.
     *
     * @param result raw JSON response text
     * @return one map per entry, keyed by the Chinese column titles used below;
     *         never {@code null}
     */
    private static List<Map<String,Object>> analysisData(String result) {
        List<Map<String,Object>> bodyList = new ArrayList<>();
        try {
            JSONObject root = JSONObject.parseObject(result);
            JSONArray jsonArray = root == null ? null : root.getJSONArray("list");
            if (jsonArray == null) {
                return bodyList;
            }
            for (int i = 0; i < jsonArray.size(); i++) {
                JSONObject ob = jsonArray.getJSONObject(i);
                String allTitle = ob.getString("all_title");    // video title
                String urllink = ob.getString("urllink");       // link
                String channel = ob.getString("channel");       // source channel
                String uploadtime = ob.getString("uploadtime"); // upload time
                String durations = ob.getString("durations");   // duration
                Map<String, Object> map = new HashMap<>();
                map.put("视频标题", allTitle);
                map.put("链接", urllink);
                map.put("频道来源", channel);
                map.put("时间", uploadtime);
                map.put("时长", durations+" s");
                logger.debug("parsed CCTV video item {}", map);
                bodyList.add(map);
            }
        } catch (Exception e) {
            // Best effort: keep whatever was parsed before the failure.
            logger.error("failed to parse CCTV video list response", e);
        }
        return bodyList;
    }
}
......@@ -23,7 +23,7 @@ import okhttp3.Response;
public class Yiche {
private static final Logger logger = LoggerFactory.getLogger(Yiche.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/**
*
......
......@@ -33,7 +33,7 @@ public class Yidianzixun {
private static YidianzixunCommentAnalysis yidianzixunCommentAnalysis = new YidianzixunCommentAnalysis();
private static YidianzixunByWordAnalysis yidianzixunByWordAnalysis = new YidianzixunByWordAnalysis();
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/**
*
......@@ -42,19 +42,19 @@ public class Yidianzixun {
* @param startTime
* @return
*/
public static List<Map<String,Object>> getYidianzixunAccountData(String channelid,String startTime,Proxy proxy,String cookie) {
public static List<Map<String,Object>> getYidianzixunAccountData(String channelid,String startTime,ProxyHolder proxy,String cookie) {
Map<String,String> headerMap = HeadGet.getYidianzixunAccountHeaderMap(cookie,"http://www.yidianzixun.com/channel/"+channelid);
List<Map<String,Object>> dataList = new ArrayList<>();
int j = 0;
boolean f = true;
try {
while(f) {
String url = "http://www.yidianzixun.com/"+getSpt(channelid, j, j+10);
String url = "http://www.yidianzixun.com"+getSpt(channelid, j, j+10);
System.out.println(url);
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
System.out.println(result);
JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONArray("result");
if(jsonArry.size() == 0) {
if(jsonArry.isEmpty()) {
break;
}
for(int i = 0;i < jsonArry.size();i++) {
......@@ -70,13 +70,12 @@ public class Yidianzixun {
dataList.add(map);
}
}
System.out.println("================================" + dataList.size());
ZhiWeiTools.sleep(3000);
logger.info("channelid = {} , crawler size = {}",channelid,dataList.size());
ZhiWeiTools.sleep(100);
j = dataList.size();
}
} catch (Exception e) {
logger.error("数据获取出错",e.getMessage());
e.printStackTrace();
logger.error("数据获取出错 {}",e);
}
return dataList;
}
......
......@@ -24,7 +24,7 @@ import okhttp3.Response;
public class Youku {
private static final Logger logger = LoggerFactory.getLogger(Youku.class);
private static HttpBoot httpBoot = new HttpBoot(false,2);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public static List<Map<String,Object>> getDataList(String word) {
String aaid = "9cae49f0e031664b00d8f9c108e586ab";
......@@ -33,7 +33,7 @@ public class Youku {
String url = "https://so.youku.com/search_video/q_"+URLCodeUtil.getURLEncode(word, "UTF-8")+"?spm=a2h0k.11417342.filter.dnew&orderfield=createtime&aaid="+aaid+"&pg="+i;
System.out.println(url);
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyFactory.getNatProxy())){
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY)){
String result = response.body().string();
String jsondata = result.split("bigview.view\\(")[1].split("\\)\\</script\\>")[0];
JSONObject json = JSONObject.parseObject(jsondata);
......@@ -45,7 +45,7 @@ public class Youku {
String title = element.select("div.mod-main > div.mod-header > h2 > a").text();
String surl = element.select("div.mod-main > div.mod-header > h2 > a").attr("href");
String time = element.select("div.mod-main > div.mod-info > p").text();
if(time.contains("上传时间:")) {
if(time.contains("上传时间:") && surl.contains("v.youku.com")) {
map.put("title", title);
map.put("url", "https:"+surl);
map.put("time", time.replaceAll("上传时间:", "").split(" ")[0]);
......
......@@ -24,7 +24,7 @@ import okhttp3.Response;
public class BaijiaAccountAnalysis {
private static Logger logger = LoggerFactory.getLogger(BaijiaAccountAnalysis.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public Map<String,Object> getBaijiaAccount2Data(JSONObject data) {
Map<String,Object> map = new HashMap<String,Object>();
......@@ -57,12 +57,13 @@ public class BaijiaAccountAnalysis {
boolean more = false;
try {
JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONObject("data").getJSONArray("items");
if(json.getJSONObject("data") != null && json.getJSONObject("data").getBoolean("has_more") != null) {
if(json.getJSONObject("data").getBoolean("has_more")) {
more = true;
}
JSONArray jsonArry = json.getJSONObject("data").getJSONArray("list");
if(json.getJSONObject("data").getBoolean("has_more") != null &&
json.getJSONObject("data").getBoolean("has_more") ) {
more = true;
rmap.put("ctime", json.getJSONObject("data").getString("ctime"));
}
// String name = json.getJSONObject("data").getJSONObject("author").getString("display_name");
for(int i = 0;i < jsonArry.size();i++) {
Map<String,Object> map = new HashMap<>();
JSONObject data = jsonArry.getJSONObject(i);
......@@ -77,10 +78,7 @@ public class BaijiaAccountAnalysis {
}
}
map.put("title", data.getString("title"));
String url = data.getString("url");
if(url == null) {
url = "https://baijia.baidu.com/s?old_id=" + id;
}
String url = "http://baijiahao.baidu.com/s?id=" + id;
map.put("content", ZhiWeiTools.delHTMLTag(getContent3(data)));
map.put("read_amount", data.getString("read_amount")==null?0:data.getString("read_amount"));
map.put("app_id", data.getString("app_id"));
......
......@@ -5,6 +5,7 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
......@@ -13,7 +14,7 @@ import org.jsoup.select.Elements;
public class BilibilikeyWordAnalysis {
public static Map<String,Object> getData(String result,String word) {
public static Map<String,Object> getData(String result,String word,String endTime) {
try {
Document doc = Jsoup.parse(result);
boolean more = false;
......@@ -28,10 +29,9 @@ public class BilibilikeyWordAnalysis {
String source = null;
String submitcount = null;
Elements elements = doc.select("ul.video-contain.clearfix").select("li");
System.out.println(elements.size() + " --- " + more);
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
List<Map<String,Object>> dataList = new ArrayList<>();
for(Element element : elements) {
Map<String,Object> map = new HashMap<String,Object>();
Map<String,Object> map = new HashMap<>();
title = element.select("a").attr("title");
url = element.select("a").attr("href");
playcount = element.select("div.tags").select("span.watch-num").text();
......@@ -45,6 +45,9 @@ public class BilibilikeyWordAnalysis {
map.put("source", source);
map.put("submitcount", submitcount);
map.put("word", word);
if(Objects.nonNull(endTime) && endTime.compareTo(time) > -1) {
more = false;
}
dataList.add(map);
}
Map<String,Object> rmap = new HashMap<>();
......
......@@ -49,7 +49,7 @@ public class DayuAccountAnalysis {
* @return
*/
private Map<String,Object> getOneData(JSONObject data,String name,String startTime) {
Map<String,Object> map = new HashMap<String, Object>();
Map<String,Object> map = new HashMap<>();
try {
String time = data.getString("published_at").replace("T", " ").split("\\.")[0];
if(startTime != null && startTime.length() > 1) {
......
......@@ -24,7 +24,7 @@ import okhttp3.Response;
public class DayuByWordAnalysis {
private static Logger logger = LoggerFactory.getLogger(DayuByWordAnalysis.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public List<Map<String,Object>> getDayuByWordData(String result,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
......
package com.zhiwei.parse.analysis;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
......@@ -12,16 +11,15 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
public class FenghuangAccountAnalysis {
private static Logger logger = LoggerFactory.getLogger(FenghuangAccountAnalysis.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/**
*
......@@ -29,70 +27,61 @@ public class FenghuangAccountAnalysis {
* @param result
* @return
*/
public List<Map<String,Object>> getArticleData(String url,String startTime,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
public List<Map<String,Object>> getArticleData(String url,String startTime,ProxyHolder proxy) {
List<Map<String,Object>> dataList = new ArrayList<>();
try {
Map<String,String> headerMap = HeadGet.getFenghuangAccountHeaderMap(null);
JSONArray jsonArry = null;
for(int i = 0;i < 3;i++) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){
String result = response.body().string();
JSONObject json = JSONObject.parseObject(result);
jsonArry = json.getJSONObject("data").getJSONObject("feeds").getJSONArray("list");
if(jsonArry == null || jsonArry.size() < 1) {
Map<String,String> headerMap = HeadGet.getFenghuangAccountHeaderMap(null);
for(int i = 0;i < 3;i++) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){
String result = response.body().string();
System.out.println(result);
JSONObject json = JSONObject.parseObject(result.replace("getFengAuthorListData(", "").replace("]})", "]}"));
JSONArray jsonArry = json.getJSONArray("data");
for(int j = 0;j < jsonArry.size();j++) {
try {
JSONObject data = jsonArry.getJSONObject(j);
Map<String,Object> map = new HashMap<>();
map.put("title", data.getString("title"));
String time = data.getString("newsTime");
map.put("time", data.getString("newsTime"));
map.put("url", "https:" + data.getString("url"));
map.put("id", data.getString("commentUrl"));
if(time.compareTo(startTime) >= 0) {
dataList.add(map);
}
} catch (Exception e) {
logger.error(" exception {}",e);
}
}
break;
} catch (Exception e) {
e.printStackTrace();
continue;
}
} catch (Exception e) {
continue;
}
}
if(jsonArry == null || jsonArry.size() < 1) {
return dataList;
}
for(int i = 0;i < jsonArry.size();i++) {
try {
JSONObject data = jsonArry.getJSONObject(i);
String articleurl = data.getString("id");
String articleResult = HttpClient.executeHttpRequestGet(articleurl,proxy, headerMap);
Map<String,Object> dataMap = getArticle(articleResult);
ZhiWeiTools.sleep(1000);
if(dataMap != null) {
String time = (String)dataMap.get("time");
if(time.compareTo(startTime) >= 0) {
dataList.add(dataMap);
continue;
}
}
} catch (Exception e) {
e.printStackTrace();
continue;
}
}
return dataList;
} catch (Exception e1) {
e1.printStackTrace();
return dataList;
}
}
/**
 * Extracts the fields of a single article from its detail-API response.
 *
 * <p>The response is expected to carry the article under a top-level
 * {@code "body"} object. JSON parsing is done inside the {@code try} so that
 * an unparsable payload or a missing {@code "body"} is reported through the
 * same {@code null} result as any other extraction failure.
 *
 * @param articleResult raw JSON text of the article-detail response
 * @return a map with {@code title}, {@code time}, {@code text}, {@code source},
 *         {@code url} and {@code id}; {@code null} when the response cannot be parsed
 */
private static Map<String,Object> getArticle(String articleResult) {
    try {
        JSONObject json = JSONObject.parseObject(articleResult).getJSONObject("body");
        Map<String,Object> map = new HashMap<>();
        map.put("title", json.getString("title"));
        // cTime uses '/' as the date separator; the stored value uses '-'.
        String time = json.getString("cTime").replace("/", "-");
        map.put("time", time);
        // Strip markup tags from the article body.
        map.put("text", json.getString("text").replaceAll("<.*?>", ""));
        map.put("source", json.getString("source"));
        map.put("url", json.getString("shareurl"));
        map.put("id", json.getString("aid"));
        return map;
    } catch (Exception e) {
        // Pass the exception as the throwable argument so its stack trace is logged.
        logger.error("解析具体文章的时候出错", e);
        return null;
    }
}
// private static Map<String,Object> getArticle(String articleResult) {
// try {
// Map<String,Object> map = new HashMap<>();
// JSONObject json = JSONObject.parseObject(articleResult).getJSONObject("body");
// map.put("title", json.getString("title"));
// String time = json.getString("cTime").replaceAll("/", "-");
// map.put("time", time);
// map.put("text", json.getString("text").replaceAll("<.*?>", ""));
// map.put("source", json.getString("source"));
// map.put("url", "https://share.iclient.ifeng.com/news/shareNews?aid=sub_" + json.getString("aid"));
// map.put("id", json.getString("aid"));
// return map;
// } catch (Exception e) {
// logger.error("解析具体文章的时候出错 {}",e);
// return null;
// }
// }
......
......@@ -23,7 +23,7 @@ import okhttp3.Response;
public class FenghuangCommentAnalysis {
private static Logger logger = LoggerFactory.getLogger(FenghuangCommentAnalysis.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public Map<String,Object> getFenghuangCommentCount(String url,ProxyHolder proxy) {
Map<String,Object> map = new HashMap<>();
......
package com.zhiwei.parse.analysis;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
......@@ -26,7 +25,7 @@ public class SouhuAccountAnalysis {
* @return
*/
public List<Map<String,Object>> analysisData(JSONArray jsonArray,String name) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
List<Map<String,Object>> dataList = new ArrayList<>();
for(int i = 0;i < jsonArray.size();i++) {
JSONObject data = jsonArray.getJSONObject(i);
Map<String,Object> map = parseHtmlByAccount(data,name);
......@@ -46,19 +45,15 @@ public class SouhuAccountAnalysis {
* @return
*/
private static Map<String,Object> parseHtmlByAccount(JSONObject data,String name) {
Map<String,Object> map = new HashMap<String, Object>();
Map<String,Object> map = new HashMap<>();
try {
String title = data.getString("title");
map.put("title", URLDecoder.decode(title, "UTF-8"));
map.put("title", data.getString("title"));
map.put("source", name);
String content = data.getString("brief");
map.put("content", URLDecoder.decode(content,"UTF-8"));
map.put("content", data.getString("brief"));
map.put("newsPv", data.getString("newsPv"));
map.put("url", data.getString("url"));
long timelong = Long.valueOf(data.getString("postTime"));
map.put("time", new Date(timelong));
map.put("comment", data.getString("commentsCnt"));
JSONArray jsonArry = data.getJSONArray("tags");
map.put("url", data.getString("link"));
map.put("time", new Date(data.getLong("publicTime")));
JSONArray jsonArry = data.getJSONArray("tagDetails");
String tags = "";
for(int i = 0;i < jsonArry.size();i++) {
JSONObject ob = jsonArry.getJSONObject(i);
......@@ -68,10 +63,9 @@ public class SouhuAccountAnalysis {
tags = tags.substring(0,tags.length()-1);
}
map.put("tags", tags);
map.put("newsid", data.getString("newsid"));
map.put("newsid", data.getString("id"));
} catch (Exception e) {
logger.error("搜狐历史文章解析出错了",e.getMessage());
System.out.println(data.toString());
logger.error("搜狐历史文章解析出错了 {}",e.getMessage());
return null;
}
......
......@@ -20,7 +20,7 @@ import okhttp3.Response;
public class SouhuCommentAnalysis {
private static Logger logger = LoggerFactory.getLogger(SouhuCommentAnalysis.class);
private HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/**
*
......
package com.zhiwei.parse.shipin;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
......@@ -18,7 +17,6 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
......@@ -36,7 +34,7 @@ import okhttp3.Response;
public class QQTV {
private static final Logger logger = LoggerFactory.getLogger(QQTV.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public static List<Map<String,Object>> getData(String word,String time,ProxyHolder proxy) {
List<Map<String,Object>> dataList = new ArrayList<>();
......@@ -52,8 +50,8 @@ public class QQTV {
logger.info(" 关键词 {} 量 {} 页 数 {} 此页量 {} ",word,dataList.size(),page,elements.size());
for(Element element : elements) {
String nurl = element.select("h2.result_title").select("a").attr("href");
Map<String,Object> map = getUrlData(nurl, ProxyFactory.getNatProxy());
if(Objects.nonNull(map) && time.compareTo(String.valueOf(map.get("time"))) < 1) {
Map<String,Object> map = getUrlData(nurl, ProxyHolder.NAT_HEAVY_PROXY);
if(Objects.nonNull(map) && !map.isEmpty() && time.compareTo(String.valueOf(map.get("time"))) < 1) {
map.put("word", word);
dataList.add(map);
}
......@@ -61,6 +59,9 @@ public class QQTV {
}
page++;
if(count != dataList.size()) {
if(page > 20) {
break;
}
continue;
}
......@@ -76,24 +77,26 @@ public class QQTV {
return dataList;
}
private static Map<String,Object> getUrlData(String url,Proxy proxy) {
for(int i = 1;i < 3;i++) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
String result = response.body().string();
String source = result.split("\\<span class=\"user_name\"\\>")[1].split("\\</span\\>")[0];
result = result.split("var VIDEO_INFO =")[1].split("\\</script\\>")[0];
JSONObject json = JSONObject.parseObject(result);
Map<String,Object> map = new HashMap<>();
map.put("playCount", json.getInteger("view_all_count"));
map.put("title", json.getString("title"));
map.put("time", json.getString("video_checkup_time"));
map.put("source", source);
map.put("url", url);
return map;
} catch (Exception e) {
e.printStackTrace();
}
/**
 * Downloads a video page and pulls its metadata out of the embedded
 * {@code VIDEO_INFO} script variable and the {@code user_name} span.
 *
 * @param url   page address; only addresses containing {@code v.qq.com} are fetched
 * @param proxy proxy passed through to the HTTP call
 * @return {@code null} when {@code url} is not a {@code v.qq.com} address;
 *         an empty map when fetching or parsing fails; otherwise a map with
 *         {@code playCount}, {@code title}, {@code time}, {@code source} and {@code url}
 */
private static Map<String,Object> getUrlData(String url,ProxyHolder proxy) {
    boolean isVideoPage = url.contains("v.qq.com");
    if (!isVideoPage) {
        return null;
    }
    System.out.println(url);
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
        String html = response.body().string();
        // Uploader name: text between the user_name span's opening tag and the next closing span.
        String[] afterUserNameTag = html.split("\\<span class=\"user_name\"\\>");
        String uploader = afterUserNameTag[1].split("\\</span\\>")[0];
        // Metadata: the literal assigned to VIDEO_INFO, up to the end of its script element.
        String[] afterVideoInfo = html.split("var VIDEO_INFO =");
        String videoInfoJson = afterVideoInfo[1].split("\\</script\\>")[0];
        JSONObject videoInfo = JSONObject.parseObject(videoInfoJson);
        Map<String,Object> fields = new HashMap<>();
        fields.put("playCount", videoInfo.getInteger("view_all_count"));
        fields.put("title", videoInfo.getString("title"));
        fields.put("time", videoInfo.getString("video_checkup_time"));
        fields.put("source", uploader);
        fields.put("url", url);
        return fields;
    } catch (Exception e) {
        e.printStackTrace();
        return Collections.emptyMap();
    }
}
......
......@@ -24,7 +24,7 @@ import okhttp3.Response;
public class SohuTV {
private static final Logger logger = LoggerFactory.getLogger(SohuTV.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public static List<Map<String,Object>> sohuTVData(String word,String cookie,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<>();
......
//package com.zhiwei.Comment;
//
//import org.testng.annotations.Test;
//import org.junit.Test;
//
//import com.zhiwei.parse.Aika;
//import com.zhiwei.tools.timeparse.TimeExtraction;
//import com.zhiwei.tools.timeparse.TimeParse;
//
//public class AikaComment {
// @Test
// public void f() {
// String url = "http://newcar.xcar.com.cn/201809/news_2021765_1.html";
// String url = "http://info.xcar.com.cn/201906/news_2039730_1.html";
//
// Aika.getAikaComment(url, null);
//
......
package com.zhiwei.Comment;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Aiqiyi;
/**
 * Manual test that loads rows from a local Excel workbook, looks up the iQiyi
 * hot count for the link in each row, and writes the rows back to the same
 * file with an extra {@code count} column.
 *
 * <p>Depends on a workbook at a fixed local path and on the proxy registry
 * address configured below.
 */
public class AiqiyiHotCountTest {

    @Test
    public void f() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        String path = "C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\爱奇艺.xlsx";
        Map<String,Object> workbook = poi.importExcel(path, 0);
        List<Map<String,Object>> rows = (List<Map<String, Object>>) workbook.get("body");
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
        List<String> headers = (List<String>) workbook.get("head");
        headers.add("count");
        for (Map<String,Object> row : rows) {
            // The "链接" column holds the video link.
            String url = String.valueOf(row.get("链接"));
            int hotCount = Aiqiyi.aiqiyiHotCount(url, ProxyHolder.NAT_PROXY);
            System.out.println(url + " -- " + hotCount);
            row.put("count", hotCount);
        }
        poi.exportExcel(path, "data", headers, rows);
    }
}
//package com.zhiwei.Comment;
//
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Aiqiyi;
//
//public class AiqiyiHotCountTest {
// @Test
// public void f() {
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String path = "C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\爱奇艺.xlsx";
// Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("body");
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<String> headList = (List<String>) map.get("head");
// headList.add("count");
// dataList.forEach(m -> {
// String url = String.valueOf(m.get("链接"));
//
// int i = Aiqiyi.aiqiyiHotCount(url, ProxyHolder.NAT_PROXY);
// System.out.println(url + " -- " + i);
// m.put("count", i);
// });
// poi.exportExcel(path, "data", headList, dataList);
// }
//}
......@@ -4,7 +4,7 @@
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
......@@ -18,27 +18,28 @@
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
// Map<String, Object> map = poi
// .importExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", 0);
// List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
// String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550814253444; token=\"rhItcea5qkO6WCSnVcczW/NRVLLCTsq3kQbpUCGAwQ0ceLunVJRjT5rgoFVYrIBA8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiVjJuNHdCVDBncVNacTRxVllGM29jRUVwIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUwOTAyMTY3MDQ5LCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=zbs4cHtzTcHWvjtkpjAZmoqLXsQ";
// List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
// List<String> headList = (List<String>) map.get("head");
// for (Map<String, Object> map1 : list) {
// String url = map1.get("地址") + "";
// Map<String,Object> map3 = Maimai.getMaiaiCount(url,null, ProxyHolder.NAT_PROXY);
//// Map<String, Object> map = poi
//// .importExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", 0);
//// List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
//// List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
//// List<String> headList = (List<String>) map.get("head");
//// for (Map<String, Object> map1 : list) {
//// String url = map1.get("地址") + "";
// String cookie = "_buuid=0668b664-13b3-4bd0-aa37-99d747432e85; guid=HBoEGxgEGBscBBsZGlYHGBsZHRsYExwZHFYcGQQdGR8FQ1hLTEt5ChITBBIdHxkEGgQbHQVPR0VYQmkKA0VBSU9tCk9BQ0YKBmZnfmJhAgocGQQdGR8FXkNhSE99T0ZaWmsKAx4cfWV9ChEZBBwKfmQKWV1FTkRDfQIKGgQfBUtGRkNQRWc=; token=\"ou+mv1hjxjm0uOOTss1vgck9+h6OCS/lYQUeFnJgSK70FHprmw6GmjBGwk2qPQH88CKuzcDfAvoCmBm7+jVysA==\"; uid=\"A8ELjewCDRgHnZ5bX0Vy0/Airs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMjI3NjU0NTI0Iiwic2VjcmV0IjoiV0wyZmEtZDZxbkx2TEkzZHF2dTN4UG5SIiwiX2V4cGlyZSI6MTU2MDU5Mzg4Mjc5NCwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=ujhqvC3wPAn-WlCPXfB6C5ZJIgY";
// String url = "https://maimai.cn/web/gossip_detail?src=app&webid=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlZ2lkIjoiMjU5OTg4YmE4YzI3MTFlOTllMjEyNDZlOTZiNDgwODgiLCJ1IjoxNzY2NDQ3NjUsImlkIjoyMjIwMDM4NH0.gBdp3i99L0xUKLglaVIJf07OrrPk6yoG9HAo-3Yftqk";
// Map<String,Object> map3 = Maimai.getMaiaiCount(url,cookie, ProxyHolder.NAT_HEAVY_PROXY);
// System.out.println(map3.toString());
// System.out.println(url);
// map1.putAll(map3);
// ZhiWeiTools.sleep(500);
// System.out.println("--------------------------");
// }
// headList.add("like");
// headList.add("spreads");
// headList.add("cmts");
// poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", "评论采集", headList,
// list);
//// map1.putAll(map3);
//// ZhiWeiTools.sleep(500);
//// System.out.println("--------------------------");
//// }
//// headList.add("like");
//// headList.add("spreads");
//// headList.add("cmts");
//// poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", "评论采集", headList,
//// list);
// }
//}
package com.zhiwei.Comment;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Aiqiyi;
import com.zhiwei.parse.Youku;
/**
 * Manual test that loads rows from a local Excel workbook, looks up the Youku
 * hot count for the link in each row, and writes the rows back to the same
 * file with an extra {@code count} column.
 *
 * <p>Depends on a workbook at a fixed local path and on the proxy registry
 * address configured below.
 */
public class YoukuHotCountTest {

    @Test
    public void f() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        String path = "C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\优酷.xlsx";
        Map<String,Object> workbook = poi.importExcel(path, 0);
        List<Map<String,Object>> rows = (List<Map<String, Object>>) workbook.get("body");
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
        List<String> headers = (List<String>) workbook.get("head");
        headers.add("count");
        for (Map<String,Object> row : rows) {
            // The "链接" column holds the video link.
            String url = String.valueOf(row.get("链接"));
            int hotCount = Youku.getYoukuHotCount(url, ProxyHolder.NAT_PROXY);
            System.out.println(url + " -- " + hotCount);
            row.put("count", hotCount);
        }
        poi.exportExcel(path, "data", headers, rows);
    }
}
//package com.zhiwei.Comment;
//
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Youku;
//
//public class YoukuHotCountTest {
// @Test
// public void f() {
//
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String path = "C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\视频奶粉.xlsx";
// Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("body");
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<String> headList = (List<String>) map.get("head");
// headList.add("count");
// dataList.forEach(m -> {
// String url = String.valueOf(m.get("url"));
//
// int i = Youku.getYoukuHotCount(url, ProxyHolder.NAT_PROXY);
// System.out.println(url + " -- " + i);
// m.put("count", i);
// });
// poi.exportExcel(path, "data", headList, dataList);
//
//
// }
//}
//package com.zhiwei;
//
//import java.io.IOException;
//import java.util.HashMap;
//import java.util.Map;
//
//import java.util.HashMap;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.crawler.core.HttpBoot;
//import com.zhiwei.crawler.core.RequestUtils;
//
//public class TestHttpBoot {
// @Test
// public void f() {
// HttpBoot httpBoot = new HttpBoot();
// String url = "https://www.toutiao.com/c/user/following/?user_id=1034006366&count=20&_signature=wp5wPBAVmXlosTC8Fobui8KecC";
// Map<String,Object> headers = new HashMap<>();
// headers.put("referer", "https://www.qctt.cn/news/349056");
// headers.put("cookie", "PHPSESSID=3rd6bvonb4g15t1fp777mjums0; Hm_lvt_70af9ea91e7adc8195f6d49511b9a2f1=1542253722; open_ad=1; Hm_lpvt_70af9ea91e7adc8195f6d49511b9a2f1=1542271394; vcode=sqmm; XSRF-TOKEN=eyJpdiI6IlFTNzkyYWNcLzB2SUwyN2dcL1hhUlpsZz09IiwidmFsdWUiOiJRSUpycjZJNGx3d1hUWkpOQUl1R2psSStuVU0yYW8xT1YxXC9QOFY1NjdyRXNrMWpFVE1kSm9IQ1o5Nm5keXlMTEFnZXdCOHVvWDg0U2picTE1cjZzMkE9PSIsIm1hYyI6IjZlYzk5NDI3ODEzMzA3ZTJjNDc3M2ZjMjBlNDJhZjc2YjU2ODFmYmY3YWRlMzdlMzM1NTBlNWMxNDk3MjFiZDEifQ%3D%3D; laravel_session=eyJpdiI6InJQMnByeFlIbXVhaUVVVVBLK1wvaXlRPT0iLCJ2YWx1ZSI6IlhUOUtIS2ZQZ0ZKNFh1RDVQYjBjSVZkVkpQZTdYRDNpa1wvV0o5QlJPbk8xZE0rQ3dZdnFMdjcya011ejVkdWEwUk1Qa29Zb2Y3OU0yUGkrWDF4Wk5adz09IiwibWFjIjoiZGJiYjlkNWZhNmJhMDFiMjkxYTAyMmUwZTEyMWVmZTQ0NmJiZDQ2ZGU3ZjNjNmUzNTIwZGI0NTc4NDJlZjNiMCJ9");
// headers.put("origin", "https://www.qctt.cn");
// Map<String,Object> params = new HashMap<>();
// params.put("id", "349056");
// params.put("page", "3");
// params.put("_token", "EJ58V0qilRw7P77czp0U6iO9QW2IOS1ZGiBk4wH1");
// try {
// String result = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string();
// System.out.println(result);
//
// } catch (IOException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
//
//
// }
//}
package com.zhiwei;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.Response;
/**
 * Manual smoke check for {@link HttpBoot}: performs one synchronous GET and prints the body.
 */
public class TestHttpBoot {

    /**
     * Builds an {@code HttpBoot} with {@code retryTimes(2)} and {@code followSslRedirects(false)},
     * requests a fixed Youku page and writes the response body to standard output.
     * Any exception is reported through its stack trace.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final String target = "http://v.youku.com/v_show/id_XMzg1ODAwOTcwOA==.html";
        HttpBoot client = new HttpBoot.Builder()
                .retryTimes(2)
                .followSslRedirects(false)
                .build();
        // try-with-resources closes the response once the body has been read.
        try (Response reply = client.syncCall(RequestUtils.wrapGet(target))) {
            String payload = reply.body().string();
            System.out.println(payload);
        } catch (Exception failure) {
            failure.printStackTrace();
        }
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Dayu;
//
//public class DayuAccountExample {
//
//
// @Test
// public void dayuAccountTest() {
// //https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
//
//
//// String mid = "d7300311c1504d24a229c3da345785c6";
//// String name = "大鱼海棠雨";
// String startTime = "2017-01-01 00:00:00";
// String path = "D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> lists = (List<Map<String, Object>>) map.get("body");
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
//// headList.add("content_id");
//// headList.add("origin_id");
//// headList.add("xss_item_id");
// for(Map<String,Object> data : lists) {
// String mid = data.get("mid")+"";
// String name = data.get("name")+"";
// if(mid.length() < 1 && name.length() < 1) {
// continue;
// }
// List<Map<String,Object>> dataList = Dayu.getDayuAccountData(mid,name,null,null);
// poi.exportExcel(path, name, headList, dataList);
// }
//
//
// }
//
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Dayu;
/**
 * Manual example: crawls the article history of a Dayu account and stores it in an Excel workbook.
 */
public class DayuAccountExample {

    /**
     * Initialises the proxy factory, loads the "body" rows of the workbook via
     * {@code importExcel(path, 0)} and, once per row, calls {@code Dayu.getDayuAccountData}
     * and exports the result to the same workbook in a sheet named after the account.
     */
    @Test
    public void dayuAccountTest() {
        // See: https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

        // Declared but not passed to the crawler below.
        String startTime = "2017-01-01 00:00:00";
        final String workbookPath = "D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx";

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        Map<String,Object> imported = excel.importExcel(workbookPath, 0);
        List<Map<String,Object>> rows = (List<Map<String, Object>>) imported.get("body");

        // Exported columns (content_id, origin_id and xss_item_id are intentionally not exported).
        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "content", "source", "url"}) {
            columns.add(column);
        }

        for (Map<String,Object> row : rows) {
            String accountId = "" + row.get("mid");
            String accountName = "" + row.get("name");
            // NOTE(review): the values read from the row are replaced by fixed ones, so every
            // row crawls the same account — looks like a debugging leftover; confirm intent.
            accountId = "7b345070c4124574b9cbcab8c4a1aeb8";
            accountName = "国魂";
            boolean nothingToCrawl = accountId.isEmpty() && accountName.isEmpty();
            if (!nothingToCrawl) {
                List<Map<String,Object>> articles = Dayu.getDayuAccountData(accountId, accountName, null, null);
                excel.exportExcel(workbookPath, accountName, columns, articles);
            }
        }
    }
}
package com.zhiwei.crawler;
import org.junit.Test;
import com.zhiwei.parse.Dayu;
/**
 * Manual example: prints the comment count of a single Dayu article.
 */
public class DayuCommentCountExample {

    /**
     * Calls {@code Dayu.getDayuCommentCount} for a fixed article id (second argument
     * {@code null}) and prints the returned number.
     */
    @Test
    public void dayuCommentCountTest() {
        final String targetArticle = "6987993456991247474";
        final int commentTotal = Dayu.getDayuCommentCount(targetArticle, null);
        System.out.println(commentTotal);
    }
}
//package com.zhiwei.crawler;
//
//import org.junit.Test;
//
//import com.zhiwei.parse.Dayu;
//
//public class DayuCommentCountExample {
//
// @Test
// public void dayuCommentCountTest() {
// String articleId = "6987993456991247474";
//
// int i = Dayu.getDayuCommentCount(articleId,null);
// System.out.println(i);
// }
//
//
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Dayu;
/**
 * Manual example: collects the comments of Dayu/UC articles listed in an Excel workbook.
 */
public class DayuCommentExample {

    /**
     * Reads the "body" rows of the input workbook, fetches the comments of each article with
     * {@code Dayu.getDayuCommentData} and exports all collected comments to a second workbook.
     * URLs that produced no comments (null or empty result) are printed before the export.
     *
     * <p>The article id is available from a previous history crawl; for other articles it is
     * the numeric path segment of the comment URL, e.g.
     * {@code http://m.uczzd.cn/iflow/api/v2/cmt/article/14180961224021425316/comments/byhot}.
     */
    @Test
    public void getDayuCommentTest() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//UC评论采集-1.xlsx", 0);
        List<Map<String,Object>> list = (List<Map<String,Object>>) map.get("body");
        List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
        List<String> urlList = new ArrayList<String>();
        for (Map<String,Object> map1 : list) {
            String url = "";
            try {
                url = map1.get("url") + "";
                // NOTE(review): the row value is overwritten with a fixed article id, so every
                // row fetches the same article — looks like a debugging leftover; confirm intent.
                url = "16848608935470442496";
                // Accept either a full URL carrying an "aid" query parameter or a bare article id.
                String articleId = url.contains("aid")
                        ? url.split("aid=")[1].split("&")[0]
                        : url;
                List<Map<String,Object>> dataList = Dayu.getDayuCommentData(articleId, null);
                // Check for null BEFORE touching the list: previously size() was called first,
                // so a null result raised an NPE and the URL was never recorded as empty.
                if (dataList == null || dataList.isEmpty()) {
                    urlList.add(url);
                    continue;
                }
                bodyList.addAll(dataList);
            } catch (Exception e) {
                // Best effort: report the failing URL and carry on with the next row.
                System.out.println(url);
                e.printStackTrace();
            }
        }
        List<String> headList = new ArrayList<String>();
        headList.add("nickname");
        headList.add("content");
        headList.add("id");
        headList.add("url");
        headList.add("like");
        headList.add("time");
        headList.add("replay_count");
        // List the URLs for which no comments were collected.
        for (String s : urlList) {
            System.out.println(s);
        }
        poi.exportExcel("D://crawlerdata/UC评论采集.xlsx", "评论", headList, bodyList);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Dayu;
//
//public class DayuCommentExample {
//
// @Test
// public void getDayuCommentTest() {
// //若已获取历史文章 哪里有这个字段 其他文章的
// //http://m.uczzd.cn/iflow/api/v2/cmt/article/14180961224021425316/comments/byhot
// //14180961224021425316 这个为此参数
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
// Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//UC评论采集-1.xlsx", 0);
// List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// List<String> urlList = new ArrayList<String>();
// for(Map<String,Object> map1 : list) {
// String url = "";
// try {
// url = map1.get("url")+"";
// String articleId = "";
// url = "16848608935470442496";
// if(url.contains("aid")) {
// articleId = url.split("aid=")[1].split("&")[0];
// }else {
// articleId = url;
// }
// List<Map<String,Object>> dataList = Dayu.getDayuCommentData(articleId,null);
// if(dataList.size() <= 0) {
// urlList.add(url);
// }
// if(dataList != null) {
// bodyList.addAll(dataList);
// }
// } catch (Exception e) {
// System.out.println(url);
// e.printStackTrace();
// continue;
// }
// }
// List<String> headList = new ArrayList<String>();
// headList.add("nickname");
// headList.add("content");
// headList.add("id");
// headList.add("url");
// headList.add("like");
// headList.add("time");
// headList.add("replay_count");
// for(String s : urlList) {
// System.out.println(s);
// }
// poi.exportExcel("D://crawlerdata/UC评论采集.xlsx", "评论", headList, bodyList);
//
// }
//
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang;
/**
 * Manual example: exports the article history of Fenghuang (ifeng) accounts to Excel.
 */
public class FenghuangAccountExample {

    /**
     * For every id in the comma-separated {@code id} string, fetches the account's articles
     * with {@code Fenghuang.getFenghuangAccountData} and writes them to a sheet named after
     * the id. A failing account is reported and skipped; the remaining ids are still processed.
     */
    @Test
    public void fenghuangAccountTest() {
        // Slow: roughly one second per article.
        // See: https://api.3g.ifeng.com/client_search_subscribe?k=号外财经
        String id = "6452";
        String startTime = "2010-05-01 00:00:00"; // may be empty
        // Loop-invariant setup, created once instead of once per account.
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<String> headList = new ArrayList<String>();
        headList.add("title");
        headList.add("time");
        headList.add("text");
        headList.add("source");
        headList.add("url");
        headList.add("id");
        for (String accountId : id.split(",")) {
            try {
                List<Map<String,Object>> dataList = Fenghuang.getFenghuangAccountData(accountId, startTime, null);
                poi.exportExcel("D://crawlerdata/凤凰-6452.xlsx", accountId, headList, dataList);
            } catch (Exception e) {
                // Previously swallowed silently; report which account failed and why,
                // then continue with the next id.
                System.out.println(accountId);
                e.printStackTrace();
            }
        }
    }
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang;
import com.zhiwei.parse.Yidianzixun;
import com.zhiwei.util.WordReadFile;
/**
 * Manual example: searches Fenghuang (ifeng) for each keyword of a word file and exports the hits.
 */
public class FenghuangByWordExample {

    /**
     * Loads the keywords via {@code WordReadFile.getWords}, queries
     * {@code Fenghuang.getFenghuangByWord} for each of them, accumulates all non-empty
     * results and exports them to a single sheet. A failing keyword is reported and skipped.
     */
    @Test
    public void fenghuangByWordTest() {
        List<String> wordList = WordReadFile.getWords("D://crawlerdata/关键词.txt");
        List<Map<String,Object>> listAll = new ArrayList<Map<String,Object>>();
        for (String word : wordList) {
            try {
                List<Map<String,Object>> dataList = Fenghuang.getFenghuangByWord(word, null);
                // Null-safe size: the progress line used to dereference dataList even when
                // it was null, raising an NPE that the empty catch block then hid.
                int fetched = (dataList == null) ? 0 : dataList.size();
                if (fetched > 0) {
                    listAll.addAll(dataList);
                }
                System.out.println(fetched + "===========" + listAll.size());
            } catch (Exception e) {
                // Previously swallowed silently; report the keyword and the cause.
                System.out.println(word);
                e.printStackTrace();
            }
        }
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<String> headList = new ArrayList<String>();
        headList.add("title");
        headList.add("content");
        headList.add("source");
        headList.add("time");
        headList.add("url");
        System.out.println(listAll.size());
        poi.exportExcel("D://crawlerdata/凤凰-美林.xlsx", "asd", headList, listAll);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Fenghuang;
//import com.zhiwei.parse.Yidianzixun;
//import com.zhiwei.util.WordReadFile;
//
//public class FenghuangByWordExample {
//
// @Test
// public void fenghuangByWordTest() {
// List<String> wordList = WordReadFile.getWords("D://crawlerdata/关键词.txt");
// List<Map<String,Object>> listAll = new ArrayList<Map<String,Object>>();
// for(String word : wordList) {
// try {
// List<Map<String,Object>> dataList = Fenghuang.getFenghuangByWord(word,null);
// if(dataList != null && dataList.size() > 0) {
// listAll.addAll(dataList);
// }
// System.out.println(dataList.size()+"==========="+listAll.size());
// } catch (Exception e) {
// continue;
// }
// }
//
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("content");
// headList.add("source");
// headList.add("time");
// headList.add("url");
// System.out.println(listAll.size());
// poi.exportExcel("D://crawlerdata/凤凰-美林.xlsx", "asd", headList, listAll);
// }
//
//
//
//}
package com.zhiwei.crawler;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.parse.Fenghuang;
/**
 * Manual example: prints the comment-count information of one Fenghuang (ifeng) article.
 */
public class FenghuangCommentCountExample {

    /**
     * Passes a fixed article URL to {@code Fenghuang.getFenghuangCommentCount} (second
     * argument {@code null}) and prints the string form of the returned map.
     */
    @Test
    public void fenghuangCommentCountTest() {
        // Other sample URLs:
        //   http://news.ifeng.com/a/20161229/50492484_0.shtml
        //   http://wemedia.ifeng.com/4096977/wemedia.shtml
        final String articleUrl = "http://tech.ifeng.com/a/20181113/45222352_0.shtml";
        Map<String,Object> counts = Fenghuang.getFenghuangCommentCount(articleUrl, null);
        String rendered = counts.toString();
        System.out.println(rendered);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.parse.Fenghuang;
//
//
//public class FenghuangCommentCountExample {
//
// @Test
// public void fenghuangCommentCountTest() {
// String url = "http://tech.ifeng.com/a/20181113/45222352_0.shtml";
// //http://news.ifeng.com/a/20161229/50492484_0.shtml
// //http://wemedia.ifeng.com/4096977/wemedia.shtml
// Map<String,Object> map = Fenghuang.getFenghuangCommentCount(url,null);
// System.out.println(map.toString());
//
// }
//
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Manual example: collects the comments of Fenghuang (ifeng) articles listed in a workbook.
 */
public class FenghuangCommentExample {

    /**
     * Reads the "body" rows of the workbook, fetches the comments of each row's URL with
     * {@code Fenghuang.getFenghuangCommentData2}, tags every comment with its source URL
     * ("from_url") and exports everything to the sheet "评论采集" of the same workbook path.
     * URLs without comments are printed before the export.
     */
    @Test
    public void fenghuangCommentTest() {
        final PoiExcelUtil excel = PoiExcelUtil.getInstance();
        Map<String,Object> workbook = excel.importExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", 0);
        List<Map<String,Object>> rows = (List<Map<String,Object>>) workbook.get("body");

        List<Map<String,Object>> comments = new ArrayList<Map<String,Object>>();
        List<String> emptyUrls = new ArrayList<String>();

        for (Map<String,Object> row : rows) {
            String articleUrl = "";
            try {
                articleUrl = "" + row.get("url");
                System.out.println(articleUrl);
                List<Map<String,Object>> fetched = Fenghuang.getFenghuangCommentData2(articleUrl, null);
                if (fetched == null) {
                    emptyUrls.add(articleUrl);
                } else {
                    if (fetched.isEmpty()) {
                        emptyUrls.add(articleUrl);
                    }
                    for (Map<String,Object> comment : fetched) {
                        comment.put("from_url", articleUrl);
                        comments.add(comment);
                    }
                }
            } catch (Exception failure) {
                // Report the failing URL and move on; the pause below is skipped in this case.
                System.out.println(articleUrl);
                failure.printStackTrace();
                continue;
            }
            // Pause between requests.
            ZhiWeiTools.sleep(1000);
        }

        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"nickname", "content", "id", "like", "from", "time", "from_url"}) {
            columns.add(column);
        }

        // List the URLs for which nothing was collected.
        for (int i = 0; i < emptyUrls.size(); i++) {
            System.out.println(emptyUrls.get(i));
        }
        excel.exportExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", "评论采集", columns, comments);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Fenghuang;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class FenghuangCommentExample {
//
// @Test
// public void fenghuangCommentTest() {
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
// Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", 0);
// List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// List<String> urlList = new ArrayList<String>();
// for(Map<String,Object> map1 : list) {
// String url = "";
// try {
// url = map1.get("url")+"";
// System.out.println(url);
// List<Map<String,Object>> dataList = Fenghuang.getFenghuangCommentData2(url,null);
// if(dataList == null || dataList.size() <= 0) {
// urlList.add(url);
// }
// if(dataList != null) {
// for(Map<String,Object> m : dataList) {
// m.put("from_url", url);
// bodyList.add(m);
// }
// }
// } catch (Exception e) {
// System.out.println(url);
// e.printStackTrace();
// continue;
// }
// ZhiWeiTools.sleep(1000);
// }
// List<String> headList = new ArrayList<String>();
// headList.add("nickname");
// headList.add("content");
// headList.add("id");
// headList.add("like");
// headList.add("from");
// headList.add("time");
// headList.add("from_url");
// for(String s : urlList) {
// System.out.println(s);
// }
// poi.exportExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", "评论采集", headList, bodyList);
//
// }
//
//
//}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.Arrays;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Maimai;
//
//public class MaimaiBywordExample {
//
// public static void main(String[] args) {
// String word = "美团|某团|MT|大众点评|新美大|美团点评";
// String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550814253444; token=\"G8eNNNylPoi3oIPLUr/d/RDaMgtnpZCskxT7wu1pRRrkiy3J8G7StHgTx9DQBq4O8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiVjJuNHdCVDBncVNacTRxVllGM29jRUVwIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUwOTAwNjY1Njg4LCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=b_tga85tZskxsgKX8YIM_JKByi0";
// String time = "2019-02-15 00:00:00";
// String[] words = word.split("\\|");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(String w : words) {
// //实名动态
//// List<Map<String,Object>> c = Maimai.getData(w, cookie, time, null);
// //职言交流
// List<Map<String,Object>> c2 = Maimai.getDataByNoName(w, cookie, time, null);
//// bodyList.addAll(c);
// bodyList.addAll(c2);
// }
// List<String> headList = Arrays.asList("time","url","text","name","like","comment_count","spreads","word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团-0222.xlsx", "脉脉关键词", headList, bodyList);
// }
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Maimai;
/**
 * Manual example: searches Maimai for a set of keywords and exports the posts to Excel.
 */
public class MaimaiBywordExample {

    /**
     * Splits the pipe-separated keyword string, queries {@code Maimai.getDataByNoName}
     * for every keyword and exports the accumulated posts to one sheet.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String word = "美团|某团|MT|大众点评|新美大|美团点评";
        // NOTE(review): a session cookie is committed in source; consider loading it from
        // local configuration instead.
        String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=8d1sx8i4gj0ocmtyc86x2yj0467ymayv; token=\"wl8U6GizDpoS6uzZ1ug93sJjfBucfB7IOoDxDVWOy+g7egJdXL/riMlMlHuQj+gM8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiLVctRlpDLXg3N1h4ZEhkeEs0Qi1NR0VDIiwibWlkNDU2ODc2MCI6ZmFsc2UsInN0YXR1cyI6dHJ1ZSwiX2V4cGlyZSI6MTU1NzEyNDAxMzA0NSwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=NZ2D9ZQU_Wlx6JGAFap4Znviz6k";
        String time = "2019-02-15 00:00:00";
        String[] words = word.split("\\|");
        List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
        for (String w : words) {
            // Real-name feed (disabled):
            //   List<Map<String,Object>> c = Maimai.getData(w, cookie, time, null);
            //   bodyList.addAll(c);
            // Anonymous workplace-gossip feed:
            List<Map<String,Object>> c2 = Maimai.getDataByNoName(w, cookie, time, null);
            // Guard against a null result, as the sibling examples do: addAll(null) would
            // throw and abort the run, discarding everything collected so far.
            if (c2 != null) {
                bodyList.addAll(c2);
            }
        }
        List<String> headList = Arrays.asList("time","url","text","name","like","comment_count","spreads","word");
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        poi.exportExcel("D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团-0222.xlsx", "脉脉关键词", headList, bodyList);
    }
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Meipai;
/**
 * Manual example: searches Meipai for several keywords and exports the videos to Excel.
 */
public class MeipaiByWordExample {

    /**
     * Queries {@code Meipai.getMeipaiByWordData} for each comma-separated keyword,
     * merges all non-null results and exports them to one sheet.
     */
    @Test
    public void meipaiByWordTest() {
        final String keywordCsv = "美食,吃,菜";
        String[] keywords = keywordCsv.split(",");

        List<Map<String,Object>> videos = new ArrayList<Map<String,Object>>();
        for (int i = 0; i < keywords.length; i++) {
            List<Map<String,Object>> hits = Meipai.getMeipaiByWordData(keywords[i], null);
            if (hits == null) {
                continue;
            }
            videos.addAll(hits);
        }

        List<String> columns = new ArrayList<String>();
        String[] columnNames = {
            "time", "video_count", "content", "url", "like", "comment_count", "source", "source_url"
        };
        for (String columnName : columnNames) {
            columns.add(columnName);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D://crawlerdata/美拍关键词采集.xlsx", "美拍数据", columns, videos);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Meipai;
//
//public class MeipaiByWordExample {
//
// @Test
// public void meipaiByWordTest() {
// String word = "美食,吃,菜";
// String[] words = word.split(",");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(String w : words) {
// List<Map<String,Object>> dataList = Meipai.getMeipaiByWordData(w,null);
// if(dataList != null) {
// bodyList.addAll(dataList);
// }
// }
// List<String> headList = new ArrayList<String>();
// headList.add("time");
// headList.add("video_count");
// headList.add("content");
// headList.add("url");
// headList.add("like");
// headList.add("comment_count");
// headList.add("source");
// headList.add("source_url");
//
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
// poi.exportExcel("D://crawlerdata/美拍关键词采集.xlsx", "美拍数据", headList, bodyList);
//
// }
//
//
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Miaopai;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Manual example: fetches Miaopai video details for the URLs listed in a workbook.
 */
public class MiaopaiByUrlExample {

    /**
     * Reads the "body" rows of the workbook, calls {@code Miaopai.getMiaopaiDataByURL}
     * once per distinct URL (with a pause before each request) and exports the non-null
     * results to the sheet "asd" of the same workbook path.
     */
    @Test
    public void miaopaiByUrlTest() {
        final PoiExcelUtil excel = PoiExcelUtil.getInstance();
        Map<String,Object> workbook = excel.importExcel("D://crawlerdata/秒拍美食.xlsx", 0);
        List<Map<String,Object>> rows = (List<Map<String,Object>>) workbook.get("body");

        List<Map<String,Object>> videos = new ArrayList<Map<String,Object>>();
        List<String> seenUrls = new ArrayList<String>();

        for (Map<String,Object> row : rows) {
            String videoUrl = "";
            try {
                videoUrl = "" + row.get("url");
                // Each URL is requested only once.
                if (!seenUrls.contains(videoUrl)) {
                    seenUrls.add(videoUrl);
                    ZhiWeiTools.sleep(5000);
                    System.out.println(videoUrl);
                    Map<String,Object> details = Miaopai.getMiaopaiDataByURL(videoUrl, null);
                    if (details != null) {
                        videos.add(details);
                    }
                }
            } catch (Exception failure) {
                // Report the failing URL and continue with the next row.
                System.out.println(videoUrl);
                failure.printStackTrace();
            }
        }

        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"time", "source", "title", "url", "video_count"}) {
            columns.add(column);
        }
        excel.exportExcel("D://crawlerdata/秒拍美食.xlsx", "asd", columns, videos);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Miaopai;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class MiaopaiByUrlExample {
//
// @Test
// public void miaopaiByUrlTest() {
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
// Map<String,Object> map = poi.importExcel("D://crawlerdata/秒拍美食.xlsx", 0);
// List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// List<String> urlList = new ArrayList<String>();
// for(Map<String,Object> map1 : list) {
// String url = "";
// try {
// url = map1.get("url")+"";
// if(urlList.contains(url)) {
// continue;
// }
// urlList.add(url);
// ZhiWeiTools.sleep(5000);
// System.out.println(url);
// Map<String,Object> dataMap = Miaopai.getMiaopaiDataByURL(url,null);
// if(dataMap != null) {
// bodyList.add(dataMap);
// }
// } catch (Exception e) {
// System.out.println(url);
// e.printStackTrace();
// continue;
// }
// }
// List<String> headList = new ArrayList<String>();
// headList.add("time");
// headList.add("source");
// headList.add("title");
// headList.add("url");
// headList.add("video_count");
// poi.exportExcel("D://crawlerdata/秒拍美食.xlsx", "asd", headList, bodyList);
//
// }
//
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.PearVideo;
/**
 * Manual example: searches Pear Video for a keyword and exports the results to Excel.
 */
public class PearVideoByWordExample {

    /**
     * Passes a fixed keyword to {@code PearVideo.getPearVideoData} and exports the
     * returned rows to one sheet.
     */
    @Test
    public void pearVideoByWordTest() {
        final String keyword = "大宝 甲醛";
        List<Map<String,Object>> videos = PearVideo.getPearVideoData(keyword, null);

        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"time", "title", "content", "url", "like", "source"}) {
            columns.add(column);
        }

        PoiExcelUtil.getInstance()
                .exportExcel("D://crawlerdata/梨视频关键词采集.xlsx", "梨视频采集结果", columns, videos);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.PearVideo;
//
//public class PearVideoByWordExample {
//
// @Test
// public void pearVideoByWordTest() {
// String word = "大宝 甲醛";
//
// List<Map<String,Object>> bodyList = PearVideo.getPearVideoData(word,null);
// List<String> headList = new ArrayList<String>();
// headList.add("time");
// headList.add("title");
// headList.add("content");
// headList.add("url");
// headList.add("like");
// headList.add("source");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata/梨视频关键词采集.xlsx", "梨视频采集结果", headList, bodyList);
//
//
// }
//
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.QQKB;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Manual example: exports the article history of Tiantian Kuaibao (QQKB) accounts to Excel.
 */
public class QQAccountExample {

    /**
     * Reads the account rows from the input workbook, calls {@code QQKB.getQQAccountData}
     * once per row, tags every returned article with the row's nickname and account link,
     * and exports all articles to one sheet. A pause follows each row.
     */
    @Test
    public void qqAccountTest() {
        final PoiExcelUtil excel = PoiExcelUtil.getInstance();
        Map<String,Object> workbook = excel.importExcel("D://crawlerdata//自媒体/天天快报历史文章采集.xlsx", 0);
        List<Map<String,Object>> accounts = (List<Map<String, Object>>) workbook.get("body");
        final String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=000300006218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";

        List<Map<String,Object>> articles = new ArrayList<Map<String,Object>>();
        for (Map<String,Object> account : accounts) {
            String accountLink = "" + account.get("帐号链接");
            // Print whatever follows the first '=' of the account link.
            String[] linkParts = accountLink.split("=");
            System.out.println(linkParts[1]);

            // NOTE(review): the account id is a fixed literal rather than the value parsed
            // from the link above — looks like a debugging leftover; confirm intent.
            List<Map<String,Object>> history = QQKB.getQQAccountData("5001789", cookie, null);
            if (history != null) {
                for (Map<String,Object> article : history) {
                    article.put("name", account.get("呢称"));
                    article.put("主页地址", account.get("帐号链接"));
                    articles.add(article);
                }
            }
            System.out.println("采集到的历史文章数总和=============" + articles.size());
            ZhiWeiTools.sleep(5000);
        }
        System.out.println(accounts.size());

        List<String> columns = new ArrayList<String>();
        String[] columnNames = {"name", "主页地址", "title", "time", "content", "url", "commentid"};
        for (String columnName : columnNames) {
            columns.add(columnName);
        }
        excel.exportExcel("D://crawlerdata//自媒体/天天快报采集-科技编年史.xlsx", "asd", columns, articles);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.QQKB;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class QQAccountExample {
//
// @Test
// public void qqAccountTest() {
//
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// Map<String,Object> dataMap = poi.importExcel("D://crawlerdata//自媒体/天天快报历史文章采集.xlsx", 0);
// List<Map<String,Object>> dataList = (List<Map<String, Object>>) dataMap.get("body");
// String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=000300006218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(Map<String,Object> map : dataList) {
// String child = map.get("帐号链接")+"";
//// System.out.println(child.split("chlid=")[1]);
// System.out.println(child.split("=")[1]);
//
// List<Map<String,Object>> lists = QQKB.getQQAccountData("5456950", cookie,null);
// if(lists != null) {
// for(Map<String,Object> map1 : lists) {
// map1.put("name", map.get("呢称"));
// map1.put("主页地址", map.get("帐号链接"));
// bodyList.add(map1);
// }
// }
// System.out.println("采集到的历史文章数总和============="+bodyList.size());
// ZhiWeiTools.sleep(5000);
// }
// System.out.println(dataList.size());
// List<String> headList = new ArrayList<String>();
// headList.add("name");
// headList.add("主页地址");
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("url");
// headList.add("commentid");
// poi.exportExcel("D://crawlerdata//自媒体/天天快报采集-科技编年史.xlsx", "asd", headList, bodyList);
// }
//
//
//}
package com.zhiwei.crawler;
import org.junit.Test;
import com.zhiwei.parse.QQKB;
/**
 * Placeholder example for keyword search on Tiantian Kuaibao (QQKB); the crawl call is disabled.
 */
public class QQKBByWordExample {

    /**
     * Only prepares the keyword and cookie; no request is made.
     */
    @Test
    public void qqkbByWordTest() {
        final String keyword = "麦当劳";
        final String sessionCookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=00030000a7aa9ad0c4636295c0f484bf54820005db056be0651296f399e092bc56e4910bbedaf413ced3fb8a;%20uin=o0497332654;%20skey=MSF4MCe62n;%20sigA2=71400747BF93A51F5F103AC6180C723E940637B87127C104AA325F5709FD378BEB014D649D6B653031F6B2E0962C0F8D9C06807A1CE509983C1C70B641606FEA84B90777926863E4;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwsY93PCkKQ3oQ4nAAU9-VNf0eKb4SlfVLb-YW7bRaEL2jS3XDtJGO8m9DoDrFZ4bFyv96Pb5ZfvVIyehq-jQ65o;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";
        // Disabled because the next page cannot be located:
        //   QQKB.getQQKBByWordData(keyword, sessionCookie);
    }
}
//package com.zhiwei.crawler;
//
//import org.junit.Test;
//
//import com.zhiwei.parse.QQKB;
//
//public class QQKBByWordExample {
//
// @Test
// public void qqkbByWordTest() {
// String word = "麦当劳";
// String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=00030000a7aa9ad0c4636295c0f484bf54820005db056be0651296f399e092bc56e4910bbedaf413ced3fb8a;%20uin=o0497332654;%20skey=MSF4MCe62n;%20sigA2=71400747BF93A51F5F103AC6180C723E940637B87127C104AA325F5709FD378BEB014D649D6B653031F6B2E0962C0F8D9C06807A1CE509983C1C70B641606FEA84B90777926863E4;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwsY93PCkKQ3oQ4nAAU9-VNf0eKb4SlfVLb-YW7bRaEL2jS3XDtJGO8m9DoDrFZ4bFyv96Pb5ZfvVIyehq-jQ65o;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";
// //无法找到下一页
//// QQKB.getQQKBByWordData(word, cookie);
//
// }
//
//
//
//}
package com.zhiwei.crawler;
import org.junit.Test;
import com.zhiwei.parse.QQKB;
/**
 * Manual example: prints the comment count of one Tiantian Kuaibao (QQKB) article.
 */
public class QQKBCommentCountExample {

    /**
     * Calls {@code QQKB.getCommentCount} with a fixed article URL (second argument
     * {@code null}) and prints the returned number.
     */
    @Test
    public void qqkbCommentCountTest() {
        // Declared but not used by the call below.
        String cookie = "";
        final String articleUrl = "https://kuaibao.qq.com/s/20190305A16P6L00";
        final int commentTotal = QQKB.getCommentCount(articleUrl, null);
        System.out.println(commentTotal);
    }
}
//package com.zhiwei.crawler;
//
//import org.junit.Test;
//
//import com.zhiwei.parse.QQKB;
//
//public class QQKBCommentCountExample {
//
//
// @Test
// public void qqkbCommentCountTest() {
// String cookie = "";
// String url = "https://kuaibao.qq.com/s/20190305A16P6L00";
//
// int i = QQKB.getCommentCount(url,null);
// System.out.println(i);
//
// }
//
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.QQKB;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Manual example that collects comments for every article URL listed in a spreadsheet
 * and writes the collected rows back to an Excel file.
 */
public class QQKBCommentExample {

    // Works for both Tiantian Kuaibao and Tencent News articles; no cookie is required.
    // Sample article URLs:
    //   https://kuaibao.qq.com/s/20181122A11WQB00
    //   https://kuaibao.qq.com/s/20180423A1PI7400?refer=kb_news
    //   https://kuaibao.qq.com/s/20180423A0L60800?refer=kb_news

    /**
     * Reads the "地址" (address) column of the first sheet, fetches the comments of each
     * address through {@code QQKB.getQQKBCommentData} and exports everything that was returned.
     */
    @Test
    public void qqkbCommentTest() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
                GroupType.PROVIDER);
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体/快报评论采集.xlsx", 0);
        @SuppressWarnings("unchecked")
        List<Map<String,Object>> list = (List<Map<String,Object>>) map.get("body");
        List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
        for (Map<String,Object> row : list) {
            Object address = row.get("地址");
            if (address == null) {
                // A row without an address cell used to throw a NullPointerException
                // and abort the whole run; skip it instead.
                continue;
            }
            String u = address.toString();
            System.out.println(u);
            // Pause before each request.
            ZhiWeiTools.sleep(2000);
            List<Map<String,Object>> dataList = QQKB.getQQKBCommentData(u, null);
            if (dataList != null) {
                bodyList.addAll(dataList);
            }
        }
        List<String> headList = new ArrayList<String>();
        headList.add("reply_id");  // comment id
        headList.add("like");      // number of likes
        headList.add("name");      // nickname
        headList.add("reply_num"); // number of replies
        headList.add("time");      // time
        headList.add("content");   // comment text
        System.out.println(bodyList.size());
        poi.exportExcel("D:\\crawlerdata\\自媒体\\快报评论采集.xlsx", "sada", headList, bodyList);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.QQKB;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class QQKBCommentExample {
//
// //天天快报与腾讯新闻都可用 不用cookie
// @Test
// public void qqkbCommentTest() {
// String url = "https://kuaibao.qq.com/s/20181122A11WQB00";
// //https://kuaibao.qq.com/s/20180423A1PI7400?refer=kb_news
//// https://kuaibao.qq.com/s/20180423A0L60800?refer=kb_news
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体/快报评论采集.xlsx", 0);
// List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(Map<String,Object> m : list) {
// String u = m.get("地址").toString();
// System.out.println(u);
// ZhiWeiTools.sleep(2000);
// List<Map<String,Object>> dataList = QQKB.getQQKBCommentData(u,null);
// if(dataList!= null) {
// bodyList.addAll(dataList);
// }
// }
// List<String> headList = new ArrayList<String>();
// headList.add("reply_id"); //id
// headList.add("like"); //点赞数
// headList.add("name"); //呢称
// headList.add("reply_num"); //回复数
// headList.add("time"); //时间
// headList.add("content"); //内容
// System.out.println(bodyList.size());
// poi.exportExcel("D:\\crawlerdata\\自媒体\\快报评论采集.xlsx", "sada", headList, bodyList);
//
// }
//
//
//}
package com.zhiwei.crawler;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
/**
 * Manual example that downloads the first page of comments of a Sina news article
 * and prints every comment as a map.
 *
 * @author hero
 * @date 2017-08-10 18:08:41
 */
public class SinaCommentListTest {

    /** Separator between channel and news id in the value built by {@link #getCommentId(String)}. */
    private static final String ID_SEPARATOR = "=====";

    /**
     * Resolves channel and news id of the article at {@code url}, requests page 1 of its
     * comment list and prints each comment.
     *
     * <p>The ids are resolved with a single call to {@link #getCommentId(String)} (it used to be
     * called twice, downloading the article page twice). When the ids cannot be resolved the
     * method prints the same marker line that is used for an empty comment response and returns,
     * instead of failing with a NullPointerException / ArrayIndexOutOfBoundsException.
     *
     * @param url address of the news article page
     */
    public static void sinaCommentListTest(String url) {
        Map<String,String> headerMap = HeaderTool.getCommonHead();
        String commentId = getCommentId(url);
        String[] idParts = commentId == null ? new String[0] : commentId.split(ID_SEPARATOR);
        if (idParts.length < 2) {
            // Either the page could not be fetched or channel/newsid could not be parsed.
            System.out.println("--------------");
            return;
        }
        String channel = idParts[0];
        String newsId = idParts[1];
        int page = 1;
        try {
            String comment_url = "http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel="+channel+"&newsid="+newsId+"&group=0&compress=0&ie=gbk&oe=gbk&page="+page+"&page_size=20&jsvar=loader_1525576000752_30189682";
            System.out.println("commenturl========"+comment_url);
            String html = HttpClientTemplateOK.get(comment_url, null, headerMap);
            if (html != null) {
                // Keep only what follows the first '=' so that the remainder can be parsed as JSON.
                html = html.substring(html.indexOf("=", 0) + 1, html.length());
                System.out.println(html);
                JSONObject data = JSONObject.parseObject(html).getJSONObject("result");
                JSONArray jsonArray = data.getJSONArray("cmntlist");
                for (int a = 0; a < jsonArray.size(); a++) {
                    Map<String,Object> doc = new HashMap<String, Object>();
                    JSONObject json = jsonArray.getJSONObject(a);
                    doc.put("_id", json.getString("mid"));
                    doc.put("content", json.getString("content"));
                    doc.put("area", json.getString("area"));
                    doc.put("nick", json.getString("nick"));
                    doc.put("time", json.getString("time"));
                    doc.put("agree", json.getInteger("agree"));
                    doc.put("against", json.getInteger("against"));
                    doc.put("vote", json.getInteger("vote"));
                    doc.put("fromUrl", url);
                    System.out.println("doc==========="+doc);
                }
            } else {
                System.out.println("--------------");
            }
        } catch (Exception e) {
            // Covers request failures as well as a response without "result"/"cmntlist".
            e.printStackTrace();
        }
    }

    /**
     * Downloads the article page and extracts the values following {@code newsid: '} and
     * {@code channel: '} from its source.
     *
     * @param url address of the news article page
     * @return {@code channel + "=====" + newsid} when both values were found; {@code null} when
     *         the page could not be fetched (IOException) or does not contain "newsid"; if
     *         parsing fails after the news id was already extracted, the bare news id
     */
    public static String getCommentId(String url) {
        String newsid = null;
        Map<String,String> headerMap = HeaderTool.getCommonHead();
        System.out.println(url);
        try {
            String html = HttpClientTemplateOK.get(url, null, headerMap);
            if (html != null && html.contains("newsid")) {
                newsid = html.split("newsid: '")[1].split("',")[0];
                String channel = html.split("channel: '")[1].split("',")[0];
                System.out.println(channel+"============"+newsid);
                return channel + ID_SEPARATOR + newsid;
            }
        } catch (IOException e) {
            return null;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return newsid;
    }
}
//package com.zhiwei.crawler;
//
//import java.io.IOException;
//import java.util.ArrayList;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//
//import com.alibaba.fastjson.JSONArray;
//import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.tools.httpclient.HeaderTool;
//import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//
///**
// * @ClassName: SinaCommentListTest
// * @Description: TODO(新浪新闻评论抓取)
// * @author hero
// * @date 2017年8月10日 下午6:08:41
// */
//public class SinaCommentListTest {
//
//
// public static void sinaCommentListTest(String url) {
// Map<String,String> headerMap = HeaderTool.getCommonHead();
// String newsId = getCommentId(url).split("=====")[1];
// String channel = getCommentId(url).split("=====")[0];
// int page = 1;
// try {
// String comment_url = "http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel="+channel+"&newsid="+newsId+"&group=0&compress=0&ie=gbk&oe=gbk&page="+page+"&page_size=20&jsvar=loader_1525576000752_30189682";
// System.out.println("commenturl========"+comment_url);
// String html = HttpClientTemplateOK.get(comment_url, null, headerMap);
// if(html!=null){
// html = html.substring(html.indexOf("=",0)+1,html.length());
// System.out.println(html);
// JSONObject data = JSONObject.parseObject(html).getJSONObject("result");
// JSONArray jsonArray = data.getJSONArray("cmntlist");
// for(int a = 0;a<jsonArray.size();a++){
// Map<String,Object> doc = new HashMap<String, Object>();
// JSONObject json = jsonArray.getJSONObject(a);
// doc.put("_id", json.getString("mid"));
// doc.put("content", json.getString("content"));
// doc.put("area", json.getString("area"));
// doc.put("nick", json.getString("nick"));
// doc.put("time", json.getString("time"));
// doc.put("agree", json.getInteger("agree"));
// doc.put("against", json.getInteger("against"));
// doc.put("vote", json.getInteger("vote"));
// doc.put("fromUrl", url);
// System.out.println("doc==========="+doc);
//
// }
// }else{
// System.out.println("--------------");
// }
//
// } catch (Exception e) {
// e.printStackTrace();
// }
// }
//
//
//
// public static String getCommentId(String url){
// String newsid = null;
// String channel = null;
// Map<String,String> headerMap = HeaderTool.getCommonHead();
// System.out.println(url);
// try {
// String html = HttpClientTemplateOK.get(url, null, headerMap);
// if(html!=null && html.contains("newsid")){
// newsid = html.split("newsid: '")[1].split("',")[0];
// channel = html.split("channel: '")[1].split("',")[0];
// System.out.println(channel+"============"+newsid);
// return channel+"====="+newsid;
// }
// } catch (IOException e) {
// return null;
// } catch (Exception e) {
// e.printStackTrace();
// }
// return newsid;
// }
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Soku;
/**
 * Manual example: queries {@code Soku.getSoKuByWordData} for every keyword / type-code
 * combination and exports all returned rows to one Excel sheet.
 */
public class SoKuByWordExample {

    /** Runs the keyword x type search and writes the combined result to disk. */
    @Test
    public void sokuByWordTest() {
        String[] keywords = "美食,味道,吃,试吃,美味,好吃".split(",");
        String[] typeCodes = "174,103,176".split(",");

        List<Map<String,Object>> collected = new ArrayList<>();
        for (int k = 0; k < keywords.length; k++) {
            for (int t = 0; t < typeCodes.length; t++) {
                List<Map<String,Object>> hits = Soku.getSoKuByWordData(keywords[k], typeCodes[t], null);
                if (hits == null || hits.isEmpty()) {
                    continue;
                }
                collected.addAll(hits);
            }
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        // Column order of the exported sheet.
        List<String> columns = new ArrayList<>();
        for (String column : new String[] {"title", "time", "play_count", "url", "source"}) {
            columns.add(column);
        }
        excel.exportExcel("D://crawlerdata/优酷采集.xlsx", "优酷数据", columns, collected);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Soku;
//
//public class SoKuByWordExample {
//
// @Test
// public void sokuByWordTest() {
// String word = "美食,味道,吃,试吃,美味,好吃";
// String type = "174,103,176";
// String[] words = word.split(",");
// String[] types = type.split(",");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(String w : words ) {
// for(String t : types) {
// List<Map<String,Object>> list = Soku.getSoKuByWordData(w, t,null);
// if(list != null && list.size() > 0) {
// bodyList.addAll(list);
// }
// }
// }
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("play_count");
// headList.add("url");
// headList.add("source");
// poi.exportExcel("D://crawlerdata/优酷采集.xlsx", "优酷数据", headList, bodyList);
//
// }
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Souhu;
/**
 * Manual example: fetches the articles returned by {@code Souhu.getSouHuAccountData}
 * for one account and exports them to Excel.
 */
public class SouhuAccountExample {

    // Reference URL: http://search.sohu.com/?keyword=%E8%99%8E%E5%97%85&source=article&queryType=edit&ie=utf8

    /** Prints how many rows were returned and writes them to the target workbook. */
    @Test
    public void souhuAccountTest() {
        List<Map<String,Object>> articles = Souhu.getSouHuAccountData(
                "c29odXptdHNmbjZ0cnRAc29odS5jb20=",
                "2018-05-01 00:00:00",
                false,
                null);
        System.out.println(articles.size());

        // Column order of the exported sheet.
        String[] columnNames = {
            "title", "time", "content", "url", "comment", "tags", "newsid", "source", "newsPv"
        };
        List<String> columns = new ArrayList<String>();
        for (String columnName : columnNames) {
            columns.add(columnName);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D:\\crawlerdata\\搜狐号历史文章-乔.xlsx", "乔", columns, articles);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Souhu;
//
//
//public class SouhuAccountExample {
//
// //http://search.sohu.com/?keyword=%E8%99%8E%E5%97%85&source=article&queryType=edit&ie=utf8
//
// @Test
// public void souhuAccountTest() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
// List<Map<String,Object>> lists = Souhu.getSouHuAccountData("99938933","浅黑科技","2018-05-01 00:00:00",false,null);
// System.out.println(lists.size());
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("url");
// headList.add("comment");
// headList.add("tags");
// headList.add("newsid");
// headList.add("source");
// headList.add("newsPv");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D:\\crawlerdata\\搜狐号历史文章-乔.xlsx", "乔", headList, lists);
// }
//
//}
package com.zhiwei.crawler;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Souhu;
/**
 * Manual example that enriches every spreadsheet row with the comment count and the read
 * count of the article referenced in its "url" column.
 */
public class SouhuCommentCountExample {

    /**
     * Reads the first sheet, queries {@code Souhu.getSouhuCommentCount} and
     * {@code Souhu.getSohuReadNum} for each row's URL, stores the results under the keys
     * "count" and "redNum" and exports the rows to "sheet2".
     *
     * <p>Changes compared with the previous version: the hard-coded article URL that overwrote
     * every row's own URL inside the loop (so all rows received the same numbers) is gone, and
     * "redNum" is now part of the exported header so the read count is actually written out.
     */
    @SuppressWarnings("unchecked")
    @Test
    public void souhuCommentCountTest() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
                GroupType.PROVIDER);
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", 0);
        List<Map<String,Object>> list = (List<Map<String,Object>>) map.get("body");
        List<String> headList = (List<String>) map.get("head");
        for (Map<String,Object> row : list) {
            String url = "";
            try {
                url = row.get("url") + "";
                System.out.println(url);
                int commentCount = Souhu.getSouhuCommentCount(url, ProxyHolder.NAT_PROXY);
                int readCount = Souhu.getSohuReadNum(url, ProxyHolder.NAT_PROXY);
                row.put("count", commentCount);
                row.put("redNum", readCount);
                System.out.println(row.toString());
            } catch (Exception e) {
                // Best effort: report the failing URL and carry on with the next row.
                System.out.println(url);
                e.printStackTrace();
            }
        }
        headList.add("count");
        headList.add("redNum");
        poi.exportExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", "sheet2", headList, list);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Souhu;
//
//public class SouhuCommentCountExample {
//
//
// @SuppressWarnings("unchecked")
// @Test
// public void souhuCommentCountTest() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", 0);
// List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
// List<String> headList = (List<String>) map.get("head");
// for(Map<String,Object> map1 : list) {
// String url = "";
// try {
// url = map1.get("url")+"";
// System.out.println(url);
// url = "http://m.sohu.com/a/299389309_114988";
// int i = Souhu.getSouhuCommentCount(url,ProxyHolder.NAT_PROXY);
// int j = Souhu.getSohuReadNum(url, ProxyHolder.NAT_PROXY);
// map1.put("count", i);
// map1.put("redNum", j);
// System.out.println(map1.toString());
// } catch (Exception e) {
// System.out.println(url);
// e.printStackTrace();
// continue;
// }
// }
// headList.add("count");
// poi.exportExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", "sheet2", headList, list);
// }
//
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang;
import com.zhiwei.parse.Souhu;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Manual example that collects the comments of every article URL listed in a spreadsheet
 * and exports them to a separate sheet.
 */
public class SouhuCommentExample {

    /**
     * Reads the "url" column of the first sheet, fetches the comments of each URL through
     * {@code Souhu.getSouhuCommentData}, prints the URLs that yielded nothing and exports
     * all collected comments.
     *
     * <p>The result is now checked for {@code null} before its size is read. Previously a
     * {@code null} result caused a NullPointerException (caught and logged), and the URL was
     * not recorded among the URLs without comments.
     */
    @Test
    public void souhuCommentTest() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
                GroupType.PROVIDER);
        Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", 0);
        @SuppressWarnings("unchecked")
        List<Map<String,Object>> list = (List<Map<String,Object>>) map.get("body");
        List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
        // URLs for which no comment data came back.
        List<String> urlList = new ArrayList<String>();
        for (Map<String,Object> row : list) {
            String url = "";
            try {
                url = row.get("url") + "";
                System.out.println(url);
                List<Map<String,Object>> dataList = Souhu.getSouhuCommentData(url, null);
                ZhiWeiTools.sleep(100);
                if (dataList == null || dataList.isEmpty()) {
                    urlList.add(url);
                } else {
                    bodyList.addAll(dataList);
                }
            } catch (Exception e) {
                // Best effort: report the failing URL and carry on with the next row.
                System.out.println(url);
                e.printStackTrace();
            }
        }
        List<String> headList = new ArrayList<String>();
        headList.add("nickname");
        headList.add("content");
        headList.add("user_id");
        headList.add("loaction"); // key spelled as in the original; presumably matches the data maps - verify before renaming
        headList.add("support_count");
        headList.add("comment_id");
        headList.add("reply_id");
        headList.add("time");
        for (String s : urlList) {
            System.out.println(s);
        }
        poi.exportExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", "搜狐评论", headList, bodyList);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Fenghuang;
//import com.zhiwei.parse.Souhu;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class SouhuCommentExample {
//
// @Test
// public void souhuCommentTest() {
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", 0);
// List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// List<String> urlList = new ArrayList<String>();
// for(Map<String,Object> map1 : list) {
// String url = "";
// try {
// url = map1.get("url")+"";
// System.out.println(url);
// List<Map<String,Object>> dataList = Souhu.getSouhuCommentData(url,null);
// if(dataList.size() <= 0) {
// urlList.add(url);
// }
// ZhiWeiTools.sleep(100);
// if(dataList != null) {
// bodyList.addAll(dataList);
// }
// } catch (Exception e) {
// System.out.println(url);
// e.printStackTrace();
// continue;
// }
// }
// List<String> headList = new ArrayList<String>();
// headList.add("nickname");
// headList.add("content");
// headList.add("user_id");
// headList.add("loaction");
// headList.add("support_count");
// headList.add("comment_id");
// headList.add("reply_id");
// headList.add("time");
// for(String s : urlList) {
// System.out.println(s);
// }
// poi.exportExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", "搜狐评论", headList, bodyList);
// }
//
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.TXNews;
/**
 * Manual example: passes one keyword and one device id to {@code TXNews.getData}
 * and exports the returned rows to Excel.
 */
public class TXNewsByWordExample {

    /**
     * Entry point of the example.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final String keyword = "唐嫣";
        final String deviceId = "6D33F35F-880D-42A6-A23F-881BEC6960EC";
        List<Map<String,Object>> rows = TXNews.getData(keyword, deviceId, null);

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        // Column order of the exported sheet.
        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "content", "url", "id", "source"}) {
            columns.add(column);
        }
        excel.exportExcel("D://crawlerdata/腾讯新闻-唐嫣-1.xlsx", "腾讯新闻数据", columns, rows);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.TXNews;
//
//public class TXNewsByWordExample {
//
// public static void main(String[] args) {
// String word = "唐嫣";
// String devid = "6D33F35F-880D-42A6-A23F-881BEC6960EC";
// List<Map<String,Object>> dataList = TXNews.getData(word,devid,null);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("url");
// headList.add("id");
// headList.add("source");
// poi.exportExcel("D://crawlerdata/腾讯新闻-唐嫣-1.xlsx", "腾讯新闻数据", headList, dataList);
// }
//
//}
package com.zhiwei.crawler;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.junit.Test;
/**
 * Scratch class: prints one slash-separated segment of a fixed article URL.
 */
public class Test1 {

    /**
     * Splits the URL on "/" and prints the element at index 4, which for this URL
     * is the trailing article identifier.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String articleUrl = "https://view.inews.qq.com/a/NEW2018021000440002";
        String[] segments = articleUrl.split("/");
        String fifthSegment = segments[4];
        System.out.println(fifthSegment);
    }
}
//package com.zhiwei.crawler;
//
//import java.io.UnsupportedEncodingException;
//import java.net.URLEncoder;
//import java.util.regex.Matcher;
//import java.util.regex.Pattern;
//
//import org.junit.Test;
//
//public class Test1 {
//
//
// public static void main(String[] args) {
// String time = "https://view.inews.qq.com/a/NEW2018021000440002";
//
// System.out.println(time.split("/")[4]);
//
// }
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Wangyi;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Manual example that prints the comment count of every article URL listed in a spreadsheet.
 */
public class WangyiCommentCountExample {

    /**
     * Reads the "链接" (link) column of the first sheet, derives the article id from the last
     * path segment of each link and prints the value returned by
     * {@code Wangyi.getWangyiCommentCount}.
     *
     * <p>Changes compared with the previous version: the hard-coded article URL that overwrote
     * every link inside the loop is gone, and the file-extension separator is now matched
     * literally ({@code "\\.ht"}) - the unescaped dot in {@code ".ht"} was a regex wildcard.
     */
    @Test
    public void wangyiCommentCountTest() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        String path = "D:\\crawlerdata\\自媒体\\网易评论采集-1.xlsx";
        Map<String,Object> map = poi.importExcel(path, 0);
        @SuppressWarnings("unchecked")
        List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
        List<String> urlList = new ArrayList<String>();
        for (Map<String,Object> row : list) {
            String url = row.get("链接") + "";
            urlList.add(url);
        }
        // NOTE(review): nothing is ever added to bodyList, so the export below writes only
        // the header row - confirm whether the counts were meant to be exported.
        List<Map<String,Object>> bodyList = new ArrayList<>();
        for (String url : urlList) {
            String[] segments = url.split("/");
            // Last path segment without its ".htm"/".html" suffix.
            String id = segments[segments.length - 1].split("\\.ht")[0];
            System.out.println(id);
            int commentCount = Wangyi.getWangyiCommentCount(id, null);
            System.out.println(commentCount);
            ZhiWeiTools.sleep(3000);
        }
        List<String> headList = new ArrayList<String>();
        headList.add("content");
        headList.add("id");
        headList.add("time");
        headList.add("name");
        headList.add("like");
        headList.add("unlike");
        headList.add("from_url");
        poi.exportExcel(path, "评论数据", headList, bodyList);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Wangyi;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class WangyiCommentCountExample {
//
// @Test
// public void wangyiCommentCountTest() {
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String path = "D:\\crawlerdata\\自媒体\\网易评论采集-1.xlsx";
// Map<String,Object> map = poi.importExcel(path, 0);
//
// List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
// List<String> urlList = new ArrayList<String>();
// for(Map<String,Object> u : list) {
// String url = u.get("链接")+"";
// urlList.add(url);
// }
//
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(String url : urlList) {
// url = "https://3g.163.com/all/article/E9GAO0PK051188EC.html";
// String id = url.split("/")[url.split("/").length-1].split(".ht")[0];
// System.out.println(id);
// int lists = Wangyi.getWangyiCommentCount(id, null);
// System.out.println(lists);
// ZhiWeiTools.sleep(3000);
// }
// List<String> headList = new ArrayList<String>();
// headList.add("content");
// headList.add("id");
// headList.add("time");
// headList.add("name");
// headList.add("like");
// headList.add("unlike");
// headList.add("from_url");
//
// poi.exportExcel(path, "评论数据", headList, bodyList);
//
// }
//
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Wangyi;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Manual example that collects the comments of every article URL listed in a spreadsheet
 * and exports them, tagged with their source URL.
 */
public class WangyiCommentExample {

    // If something goes wrong the data may contain duplicates; treat the id as authoritative.

    /**
     * Reads the "链接" (link) column of the first sheet, derives the article id from the last
     * path segment of each link, fetches its comments through
     * {@code Wangyi.getWangyiCommentData} and exports all of them.
     *
     * <p>Changes compared with the previous version: the result size is no longer read before
     * the {@code null} check (a {@code null} result threw a NullPointerException and aborted the
     * run; it is now reported as 0), and the file-extension separator is matched literally
     * ({@code "\\.ht"}) instead of with a regex wildcard dot.
     */
    @Test
    public void wangyiCommentTest() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        String path = "D:\\crawlerdata\\自媒体\\网易评论采集-1.xlsx";
        Map<String,Object> map = poi.importExcel(path, 0);
        @SuppressWarnings("unchecked")
        List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
        List<String> urlList = new ArrayList<String>();
        for (Map<String,Object> row : list) {
            String url = row.get("链接") + "";
            urlList.add(url);
        }
        List<Map<String,Object>> bodyList = new ArrayList<>();
        for (String url : urlList) {
            String[] segments = url.split("/");
            // Last path segment without its ".htm"/".html" suffix.
            String id = segments[segments.length - 1].split("\\.ht")[0];
            System.out.println(id);
            List<Map<String,Object>> comments = Wangyi.getWangyiCommentData(id, null);
            int commentCount = comments == null ? 0 : comments.size();
            System.out.println(url+"====="+commentCount);
            if (comments != null) {
                for (Map<String,Object> comment : comments) {
                    // Remember which article the comment belongs to.
                    comment.put("from_url", url);
                    bodyList.add(comment);
                }
            }
            ZhiWeiTools.sleep(3000);
        }
        List<String> headList = new ArrayList<String>();
        headList.add("content");
        headList.add("id");
        headList.add("time");
        headList.add("name");
        headList.add("like");
        headList.add("unlike");
        headList.add("from_url");
        poi.exportExcel(path, "评论数据", headList, bodyList);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Wangyi;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class WangyiCommentExample {
//
// //若出错 可能数据有重复 以id为准
// @Test
// public void wangyiCommentTest() {
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String path = "D:\\crawlerdata\\自媒体\\网易评论采集-1.xlsx";
// Map<String,Object> map = poi.importExcel(path, 0);
//
// List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
// List<String> urlList = new ArrayList<String>();
// for(Map<String,Object> u : list) {
// String url = u.get("链接")+"";
// urlList.add(url);
// }
//
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(String url : urlList) {
// String id = url.split("/")[url.split("/").length-1].split(".ht")[0];
// System.out.println(id);
// List<Map<String,Object>> lists = Wangyi.getWangyiCommentData(id,null);
// System.out.println(url+"====="+lists.size());
// if(lists != null) {
// for(Map<String,Object> m : lists) {
// m.put("from_url", url);
// bodyList.add(m);
// }
// }
// ZhiWeiTools.sleep(3000);
// }
// List<String> headList = new ArrayList<String>();
// headList.add("content");
// headList.add("id");
// headList.add("time");
// headList.add("name");
// headList.add("like");
// headList.add("unlike");
// headList.add("from_url");
//
// poi.exportExcel(path, "评论数据", headList, bodyList);
//
// }
//
//
//
//
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Wangyi;
/**
 * Manual example: passes one article URL and a start time to {@code Wangyi.getHistoryData}
 * and exports the returned rows to Excel.
 */
public class WangyiHistoryExample {

    /**
     * Entry point of the example.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final String articleUrl = "http://dy.163.com/v2/article/detail/DPLAOP1605198CJN.html";
        final String startTime = "2018-05-01 00:00:00";
        List<Map<String,Object>> rows = Wangyi.getHistoryData(articleUrl, null, startTime);

        // Column order of the exported sheet.
        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "content", "source", "url"}) {
            columns.add(column);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D://crawlerdata//自媒体/网易-财联社.xlsx", "财联社", columns, rows);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Wangyi;
//
//public class WangyiHistoryExample {
//
// public static void main(String[] args) {
//
// String url = "http://dy.163.com/v2/article/detail/EBR9PF6J0512MLBG.html";
//
// List<Map<String,Object>> list = Wangyi.getHistoryData(url, null, "2018-05-01 00:00:00");
//
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//自媒体/网易-财联社.xlsx", "财联社", headList, list);
//
// }
//
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Xiaomi;
/**
 * Manual example: queries {@code Xiaomi.getXiaomiByWordData} once per search phrase
 * and exports all returned rows to one Excel sheet.
 */
public class XiaomiShequByWordExample {

    /**
     * Entry point of the example.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Comma-separated search phrases; each phrase is queried on its own.
        String[] phrases =
                "小米 电饭煲 锅体开裂,小米 电饭煲 开裂,小米 电饭煲 裂缝,小米 电饭煲 烫伤,小米 电饭煲 变形".split(",");

        List<Map<String,Object>> collected = new ArrayList<Map<String,Object>>();
        for (int i = 0; i < phrases.length; i++) {
            List<Map<String,Object>> hits = Xiaomi.getXiaomiByWordData(phrases[i], null);
            if (hits == null || hits.isEmpty()) {
                continue;
            }
            collected.addAll(hits);
        }

        // Column order of the exported sheet.
        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "source", "url", "content"}) {
            columns.add(column);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D:\\crawlerdata\\小米-关键词电饭煲相关.xlsx", "小米社区采集", columns, collected);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Xiaomi;
//
//public class XiaomiShequByWordExample {
//
// public static void main(String[] args) {
// String word = "小米 电饭煲 锅体开裂,小米 电饭煲 开裂,小米 电饭煲 裂缝,小米 电饭煲 烫伤,小米 电饭煲 变形";
// //
// String[] words = word.split(",");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(String w : words) {
// List<Map<String,Object>> dataList = Xiaomi.getXiaomiByWordData(w,null);
// if(dataList != null && dataList.size() > 0) {
// bodyList.addAll(dataList);
// }
// }
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("source");
// headList.add("url");
// headList.add("content");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
// poi.exportExcel("D:\\crawlerdata\\小米-关键词电饭煲相关.xlsx", "小米社区采集", headList, bodyList);
//
// }
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.XiGua;
/**
 * Manual example that collects the videos of every account home page listed in a
 * spreadsheet and exports them to a result sheet of the same workbook path.
 */
public class XiguaAccountExample {

    /**
     * Reads the "主页" (home page) column of the first sheet, calls
     * {@code XiGua.getXiguaAccountData} for each usable URL with a fixed start time and
     * exports everything that was returned.
     *
     * <p>Fix: the emptiness check now inspects the list returned for the current account
     * ({@code lists1}); it previously tested the size of the spreadsheet row list by mistake.
     */
    @Test
    public void xiguaAccountTest() {
        String path = "D:\\crawlerdata\\西瓜视频采集12.28.xlsx";
        String startTime = "2017-01-01 00:00:00";
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
        Map<String,Object> map = poi.importExcel(path, 0);
        @SuppressWarnings("unchecked")
        List<Map<String,Object>> lists = (List<Map<String, Object>>) map.get("body");
        for (Map<String,Object> row : lists) {
            // Concatenation never yields null; a missing cell becomes the 4-character
            // string "null", which the length check below filters out.
            String url = row.get("主页") + "";
            if (url.length() > 5) {
                List<Map<String,Object>> lists1 = XiGua.getXiguaAccountData(url, startTime, null);
                if (lists1 != null && !lists1.isEmpty()) {
                    bodyList.addAll(lists1);
                }
            }
        }
        List<String> headList = new ArrayList<String>();
        headList.add("title");
        headList.add("comments_count");
        headList.add("time");
        headList.add("content");
        headList.add("url");
        headList.add("video_watch_count");
        headList.add("source");
        poi.exportExcel(path, "数据采集结果", headList, bodyList);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.XiGua;
//
//public class XiguaAccountExample {
//
// @Test
// public void xiguaAccountTest() {
// String path = "D:\\crawlerdata\\西瓜视频采集12.28.xlsx";
// String startTime = "2017-01-01 00:00:00";
// //2017-01-01 00:00:00
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> lists = (List<Map<String, Object>>) map.get("body");
// for(Map<String,Object> map1 : lists ) {
// String url = map1.get("主页")+"";
// if(url != null && url.length() > 5) {
// List<Map<String,Object>> lists1 = XiGua.getXiguaAccountData(url,startTime,null);
// if(lists1 != null && lists.size() > 0) {
// bodyList.addAll(lists1);
// }
// }
// }
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("comments_count");
// headList.add("time");
// headList.add("content");
// headList.add("url");
// headList.add("video_watch_count");
// headList.add("source");
// poi.exportExcel(path, "数据采集结果", headList, bodyList);
// }
//
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.XiGua;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Example that runs a Xigua keyword search for a fixed set of words and exports the
 * merged results to a single Excel sheet.
 */
public class XiguaByWordExample {
    /**
     * Queries each comma-separated keyword in turn, calling {@code ZhiWeiTools.sleep}
     * after every request, then writes all collected rows to the output workbook.
     */
    @Test
    public void XiguaByWordTest() {
        final String[] keywords = "美食,味道,吃,试吃,美味,好吃".split(",");
        final List<Map<String,Object>> rows = new ArrayList<Map<String,Object>>();
        for (int i = 0; i < keywords.length; i++) {
            List<Map<String,Object>> hits = XiGua.getXiguaVideoByWordData(keywords[i], null);
            if (hits != null && !hits.isEmpty()) {
                rows.addAll(hits);
            }
            // Throttle between keywords (argument is presumably milliseconds — confirm).
            ZhiWeiTools.sleep(5000);
            System.out.println("============总数" + rows.size());
        }

        // Column keys, in output order.
        final String[] columns = {
            "title", "time", "content", "like", "unlike",
            "play_count", "source", "comment_count", "url"
        };
        List<String> header = new ArrayList<String>();
        for (String column : columns) {
            header.add(column);
        }

        PoiExcelUtil.getInstance()
                .exportExcel("D://crawlerdata/西瓜美食-1.xlsx", "西瓜好吃不", header, rows);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.XiGua;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class XiguaByWordExample {
//
//
// @Test
// public void XiguaByWordTest() {
// String word = "美食,味道,吃,试吃,美味,好吃";
// String[] words = word.split(",");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(String w : words) {
// List<Map<String,Object>> list = XiGua.getXiguaVideoByWordData(w,null);
// if(list != null && list.size() > 0) {
// bodyList.addAll(list);
// }
// ZhiWeiTools.sleep(5000);
// System.out.println("============总数" + bodyList.size());
// }
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("like");
// headList.add("unlike");
// headList.add("play_count");
// headList.add("source");
// headList.add("comment_count");
// headList.add("url");
//
// poi.exportExcel("D://crawlerdata/西瓜美食-1.xlsx", "西瓜好吃不", headList, bodyList);
//
// }
//
//
//
//}
......@@ -35,7 +35,7 @@ public class YidainzixunByWordExample {
headList.add("time");
headList.add("url");
System.out.println(listAll.size());
poi.exportExcel("D://crawlerdata/一点资讯-美食.xlsx", "asd", headList, listAll);
poi.exportExcel("D://crawlerdata/一点资讯-软博会.xlsx", "asd", headList, listAll);
}
......
package com.zhiwei.crawler;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.parse.Yidianzixun;
/**
 * Example that fetches the comments of one Yidianzixun article and prints them.
 */
public class YidianzixunCommentExample {
    /**
     * Prints the number of comments returned for the article, followed by one line
     * per comment map.
     */
    @Test
    public void yidianzixunCommentTest() {
        List<Map<String,Object>> comments = Yidianzixun.getYidianzixunCommentData(
                "http://www.yidianzixun.com/article/0ILHigvv", null);
        final int total = comments.size();
        System.out.println(total);
        for (int i = 0; i < total; i++) {
            System.out.println(comments.get(i).toString());
        }
    }
}
//package com.zhiwei.crawler;
//
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.parse.Yidianzixun;
//
//public class YidianzixunCommentExample {
//
// @Test
// public void yidianzixunCommentTest() {
// String url = "http://www.yidianzixun.com/article/0ILHigvv";
// List<Map<String,Object>> lists = Yidianzixun.getYidianzixunCommentData(url,null);
// System.out.println(lists.size());
// for(Map<String,Object> map : lists) {
// System.out.println(map.toString());
// }
// }
//
//
//}
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Baijia;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class BaijiaAccountExample {
//
// @Test
// public void test3() {
// String path = "D://crawlerdata//自媒体/百家号采集.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String startTime = "2018-05-01 00:00:00";
// Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(Map<String,Object> m : list) {
// try {
// String app_id = m.get("id").toString();
// app_id = "1594158489045754";
// String cookie = "__cfduid=d847baca85b97d1967b3da02ebb345b831535524251; BAIDUID=C0F0F81EF770C5219AB9C178654135EC:FG=1; PSTM=1536376257; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; locale=zh; delPer=0; H_PS_PSSID=1447_21117_20930; PSINO=5";
// List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id, startTime,cookie, null);
// if(lists != null) {
// bodyList.addAll(lists);
// }
// break;
// } catch (Exception e) {
// }
// }
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("source");
// headList.add("url");
// headList.add("content");
// headList.add("read_amount");
// poi.exportExcel("D://crawlerdata//历史文章采集/百家号-lxj-2.xlsx", "娱乐资本论", headList, bodyList);
// }
//
//}
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Fenghuang;
//
//public class FenghuangAccountExample {
//
// @Test
// public void fenghuangAccountTest() {
// //所用时间长 1s1篇文章吧
// //https://api.3g.ifeng.com/client_search_subscribe?k=号外财经
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// String id = "1165210";
// String[] ids = id.split(",");
// String startTime = "2010-05-01 00:00:00"; //可为空
// for(int i = 0;i < ids.length;i++) {
// try {
// List<Map<String,Object>> dataList = Fenghuang.getFenghuangAccountData(ids[i], startTime,ProxyHolder.NAT_HEAVY_PROXY);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
// headList.add("id");
// poi.exportExcel("D://crawlerdata//历史文章采集/凤凰-三言财经.xlsx", ids[i], headList, dataList);
// } catch (Exception e) {
// continue;
// }
// }
// }
//
//}
package com.zhiwei.hsitory;
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Souhu;
//
//
//public class SouhuAccountExample {
//
// //http://search.sohu.com/?keyword=%E8%99%8E%E5%97%85&source=article&queryType=edit&ie=utf8
//
// @Test
// public void souhuAccountTest() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
// List<Map<String,Object>> lists = Souhu.getSouHuAccountData("99938933","浅黑科技","2018-05-01 00:00:00",false,null);
// System.out.println(lists.size());
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("url");
// headList.add("comment");
// headList.add("tags");
// headList.add("newsid");
// headList.add("source");
// headList.add("newsPv");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D:\\crawlerdata\\搜狐号历史文章-乔.xlsx", "乔", headList, lists);
// }
//
//}
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.TXNews;
//
//public class TxNewsHostoryExample {
//
// public static void main(String[] args) {
//
//
// String url = "6839743";
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<Map<String,Object>> list = TXNews.getTxNewsHistory(url, null,ProxyHolder.NAT_PROXY);
//
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//历史文章采集/腾讯网-三言财经-right.xlsx", "财联社", headList, list);
//
//
// }
//
//}
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Wangyi;
//
//public class WangyiHistoryExample {
//
// public static void main(String[] args) {
//
// String url = "T1520579168852";
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<Map<String,Object>> list = Wangyi.getWangyiClientHistory(url, ProxyHolder.NAT_PROXY, "2019-01-01 00:00:00");
//
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//历史文章采集/网易-三言财经.xlsx", "财联社", headList, list);
//
// }
//
//
//}
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Xueqiu;
//
//public class XueqiuHostoryExample {
//
// public static void main(String[] args) {
//
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
// String cookie = "_ga=GA1.2.2045733994.1547169202; device_id=5a986a59915983c3e2ef8074f80112ec; s=e618lxk3qw; __utmz=1.1547185990.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=1.2045733994.1547169202.1548122251.1553047746.3; aliyungf_tc=AQAAAJHA7Vrq7AYAgtgMPALb3ZCQP9o+; _gid=GA1.2.334283760.1554779038; Hm_lvt_1db88642e346389874251b5a1eded6e3=1553046552,1553046993,1553150890,1554779038; _gat=1; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=fed387c342aedea5c7883d1062ae6faf167975d8; xq_a_token.sig=j47ktDdYWr1FOgeL74U6yMCPhOY; xqat=fed387c342aedea5c7883d1062ae6faf167975d8; xqat.sig=oZPD4-6V_GPw-KsnR04L7vxf5oM; xq_r_token=6ffffd472dc300e2f89195a77b8e7064da45d78d; xq_r_token.sig=TPd7Y11kYPcQeOgzXVDApbRQauQ; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=5878436335; u.sig=j_g6RZ9GzzrgOfIsGHi9O9M1wvc; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1554791719";
// String userId = "7441422641";
//
// List<Map<String,Object>> dataList = Xueqiu.getXueqiuAccountData(userId, cookie, null);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("source");
// headList.add("content");
// headList.add("repostCount");
// headList.add("commentCount");
// headList.add("likeCount");
// headList.add("url");
// poi.exportExcel("D://crawlerdata//历史文章采集/雪球-三言财经.xlsx", "三言财经", headList, dataList);
//
// }
//
//}
package com.zhiwei.crawler;
package com.zhiwei.hsitory;
import java.util.ArrayList;
import java.util.List;
......@@ -6,6 +6,9 @@ import java.util.Map;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Yidianzixun;
......@@ -14,10 +17,10 @@ public class YidianzixunAccountExample {
@Test
public void yidianzixunAccountTest() {
String channelid = "m23315";
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
String channelid = "m190159";
String startTime = "2007-01-01 00:00:00";
String cookie = "wuid=90742539356820; wuid_createAt=2019-01-10 11:45:41; UM_distinctid=16835dd9ba11cb-0ef8d17063d93f-671b197c-1fa400-16835dd9ba2243; JSESSIONID=174b8df350cb5400283abedf2c26076357b0b7af0581024f2e39e90532b4edc9; weather_auth=2; DID=node82eee6d174caf2d4; Hm_lvt_15fafbae2b9b11d280c79eff3b840e45=1551686450,1551686458; CNZZDATA1255169715=931563543-1547087800-%7C1551761063; captcha=s%3A6e56492ffceaf88d9f131fa79435464a.TLAhZ1cfwj0vBTjKTO9Qf5qc6QLuipitrEMZjiqm8BM; Hm_lpvt_15fafbae2b9b11d280c79eff3b840e45=1551764582; cn_1255169715_dplus=%7B%22distinct_id%22%3A%20%2216835dd9ba11cb-0ef8d17063d93f-671b197c-1fa400-16835dd9ba2243%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201547544080%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201547544080%7D%2C%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201551765057%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201551765057%7D";
List<Map<String,Object>> dataList = Yidianzixun.getYidianzixunAccountData(channelid, startTime,null,cookie);
List<Map<String,Object>> dataList = Yidianzixun.getYidianzixunAccountData(channelid, startTime,ProxyHolder.NAT_HEAVY_PROXY,null);
PoiExcelUtil poi = PoiExcelUtil.getInstance();
List<String> headList = new ArrayList<String>();
headList.add("title");
......@@ -27,7 +30,7 @@ public class YidianzixunAccountExample {
headList.add("source");
headList.add("url");
headList.add("summary");
poi.exportExcel("D://crawlerdata/一点资讯-m23315.xlsx", "虎嗅", headList, dataList);
poi.exportExcel("D://crawlerdata//历史文章采集/一点资讯-新华社中国新三板.xlsx", "新华社中国新三板", headList, dataList);
}
......
//package com.zhiwei.keyword;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Xueqiu;
//
//public class XueqiuKeyWord {
// @Test
// public void f() {
//// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// String word = "腾讯 股价|腾讯 市值|腾讯 估值|腾讯 投资|腾讯 财报";
// String endTime = "2018-01-01 00:00:00";
// String cookie = "_ga=GA1.2.590296572.1538275545; device_id=812aac42a8874c32ca97893a7b270684; s=ff12lgxbt1; __utma=1.590296572.1538275545.1538280279.1538280279.1; __utmz=1.1538280279.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); aliyungf_tc=AQAAAPtocWy01ggA6tbncwYt4OIDsaiG; xq_a_token=088c6ad5e275496d7c91b8b5b2ecb929bee15772; xq_a_token.sig=NcpAm7FRsAVoMGRMOLrLRveBT7U; xq_r_token=08131eb9a4f33b43b2340fd782f3776c9823d74a; xq_r_token.sig=AZ7au6ICJtTgQJVPybkOJs_Fr54; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539599526,1539913171,1539913178,1541157360; _gid=GA1.2.1817290343.1541157360; u=721541157360383; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1541212289";
//
//
//
// String[] words = word.split("\\|");
//
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(String w : words) {
// System.out.println(w);
//
// List<Map<String, Object>> dataList = Xueqiu.getData(w, endTime, null, cookie);
// System.out.println(w + " ---- " + dataList.size());
// bodyList.addAll(dataList);
// }
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("uper");
// headList.add("url");
// headList.add("likeCount");
// headList.add("replyCount");
// poi.exportExcel("D:\\crawlerdata\\自媒体\\雪球网页采集-腾讯-relevance.xlsx", "马化腾", headList, bodyList);
//
// }
//}
package com.zhiwei.keyword;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Xueqiu;
/**
 * Example that searches Xueqiu for a set of keywords through {@code Xueqiu.getData}
 * and exports the merged results to an Excel workbook.
 */
public class XueqiuKeyWord {
    /**
     * Initialises the proxy factory, queries each {@code |}-separated keyword, and
     * writes every collected row to one sheet. A keyword whose query yields no result
     * object is logged and skipped so that rows already collected are still exported.
     */
    @Test
    public void f() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
        String word = "软博会|软件博览会";
        // Passed straight to Xueqiu.getData; presumably a time bound for the search — confirm.
        String endTime = "2018-01-01 00:00:00";
        // NOTE(review): this is a logged-in session cookie committed to source control;
        // consider loading it from local configuration instead.
        String cookie = "aliyungf_tc=AQAAADi3G3I7GAEAgtgMPF/W3FeFauXk; device_id=3221abfd50f9f8be2abca9182c338c9e; Hm_lvt_1db88642e346389874251b5a1eded6e3=1561962809; s=d911zeymzo; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=b2a8d68c56272cca3b71e1bcaf04b744b16e9497; xq_a_token.sig=hj7OrhYid1GuAaF-AmLYrPuyFDk; xqat=b2a8d68c56272cca3b71e1bcaf04b744b16e9497; xqat.sig=gker8dcZJ2Ez4s58eIGPa5zCd0w; xq_r_token=b0ad7b059411acac8c9e2f0a40336804bb60d047; xq_r_token.sig=DwNSLIigE8xtwQKLygagcM28Dd0; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=1273068356; u.sig=UyqQW-br8hcsOyT6anFmS_SFpCs; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1561963129";
        // The separator is a regex, so the literal pipe must be escaped.
        String[] words = word.split("\\|");
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
        for (String w : words) {
            System.out.println(w);
            List<Map<String, Object>> dataList = Xueqiu.getData(w, endTime, null, cookie);
            // Other examples in this project null-check crawler results; without this guard
            // a single null would throw in size()/addAll and lose the rows gathered so far.
            if (dataList == null) {
                System.out.println(w + " ---- null");
                continue;
            }
            System.out.println(w + " ---- " + dataList.size());
            bodyList.addAll(dataList);
        }
        List<String> headList = new ArrayList<String>();
        headList.add("title");
        headList.add("time");
        headList.add("content");
        headList.add("uper");
        headList.add("url");
        headList.add("likeCount");
        headList.add("replyCount");
        poi.exportExcel("D:\\crawlerdata\\自媒体\\雪球网页采集-软博会-relevance.xlsx", "马化腾", headList, bodyList);
    }
}
......@@ -4,7 +4,7 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
......@@ -21,7 +21,7 @@ public class AiqiyiTest {
List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for(String w : wordList) {
List<Map<String,Object>> dataList = Aiqiyi.getAiqiyiByWordData(w,ProxyHolder.NAT_PROXY);
List<Map<String,Object>> dataList = Aiqiyi.getAiqiyiByWordData(w,ProxyHolder.NAT_HEAVY_PROXY);
if(dataList != null && dataList.size() >= 1) {
bodyList.addAll(dataList);
}
......@@ -34,7 +34,7 @@ public class AiqiyiTest {
headList.add("title");
headList.add("word");
PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D://crawlerdata/爱奇艺关键词采集-txh-0320.xlsx", "数据", headList, bodyList);
poi.exportExcel("D://crawlerdata//视频/爱奇艺关键词采集-毓婷-0716.xlsx", "数据", headList, bodyList);
......
......@@ -4,8 +4,10 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.BiliBili;
import com.zhiwei.util.WordReadFile;
......@@ -13,11 +15,12 @@ import com.zhiwei.util.WordReadFile;
public class BilibiliTest {
@Test
public void f() {
List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词-1.txt");
List<Map<String, Object>> bodyList = new ArrayList<>();
String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274";
for (String word : wordList) {
List<Map<String, Object>> dataList = BiliBili.getData(word, null,
List<Map<String, Object>> dataList = BiliBili.getData(word, null, "2019-07-18 00:00:00",
cookie);
if (dataList != null) {
System.out.println(word + " ----- " + dataList.size());
......@@ -33,7 +36,7 @@ public class BilibiliTest {
headlist.add("url");
headlist.add("word");
PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D://crawlerdata//bilibili关键词采集数据-txh-0320.xlsx", "B站数据", headlist, bodyList);
poi.exportExcel("D://crawlerdata//视频//bilibili关键词采集数据-吃鸡否-0722.xlsx", "B站数据", headlist, bodyList);
}
}
......@@ -4,7 +4,7 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
......@@ -18,11 +18,11 @@ public class QQTVTest {
@Test
public void f() {
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
String time = "2018-01-01 00:00:00";
String time = "2019-04-11 00:00:00";
List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
List<Map<String, Object>> bodyList = new ArrayList<>();
for (String word : wordList) {
List<Map<String, Object>> dataList = QQTV.getData(word,time, ProxyHolder.NAT_PROXY);
List<Map<String, Object>> dataList = QQTV.getData(word,time, ProxyHolder.NAT_HEAVY_PROXY);
if (dataList != null) {
System.out.println(word + " ----- " + dataList.size());
bodyList.addAll(dataList);
......@@ -37,7 +37,7 @@ public class QQTVTest {
headlist.add("url");
headlist.add("word");
PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D://crawlerdata//腾讯视频关键词采集数据-txh-0320.xlsx", "腾讯视频数据", headlist, bodyList);
poi.exportExcel("D://crawlerdata//视频//腾讯视频关键词采集数据-毓婷-0716.xlsx", "腾讯视频数据", headlist, bodyList);
......
......@@ -4,7 +4,7 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.shipin.SohuTV;
......@@ -33,7 +33,7 @@ public class SohuTVTest {
headlist.add("url");
headlist.add("word");
PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D://crawlerdata//搜狐视频关键词采集数据-txh-0320.xlsx", "搜狐数据", headlist, bodyList);
poi.exportExcel("D://crawlerdata//视频//搜狐视频关键词采集数据-毓婷-0716.xlsx", "搜狐数据", headlist, bodyList);
}
}
......@@ -4,7 +4,7 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
......@@ -30,7 +30,7 @@ public class YoukuKeyWordTest {
headList.add("uper");
headList.add("word");
PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D://crawlerdata//优酷数据-txh-0320.xlsx", "数据", headList, bodyList);
poi.exportExcel("D://crawlerdata//视频//优酷数据-毓婷-0716.xlsx", "数据", headList, bodyList);
}
}
//package com.zhiwei.user;
//
//import java.util.ArrayList;
//import java.util.Arrays;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Maimai;
//
//public class MaimaiTest {
// @Test
// public void maimaiUserCrawler() {
// String path = "D:\\crawlerdata\\脉脉用户.xlsx";
// String word = "美团|美团网|大众点评|美团点评|摩拜|猫眼|榛果|三快科技|三快在线";
// String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550629286782; token=\"OCY36EFdeYzGytlQFyKRdM0DcXNdViYI02kT4QbUMpaSk/CqMXrqBOx8EFo5/fQU8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"q1bNxxk8WW3MzjbCfKr/hfAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTc2NjQ0NzY1Iiwic2VjcmV0IjoiLXFsV2c2Ym9feEJqOWxQbWdWTjcwWWg3Iiwic3RhdHVzIjp0cnVlLCJtaWQ0NTY4NzYwIjpmYWxzZSwiX2V4cGlyZSI6MTU1MDcxNTc2NzgwMSwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=lVCTA7DLvo1K_r_bTjbQOH13Alc";
// String[] words = word.split("\\|");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(String w : words) {
// bodyList.addAll(Maimai.getUserList(w, cookie, null));
// }
// List<String> headList = Arrays.asList("id","name","gender","url","rank","compos","city");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel(path, "result", headList, bodyList);
// }
//}
package com.zhiwei.user;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Maimai;
/**
 * Command-line example that searches Maimai users for a set of keywords and exports
 * the merged user rows to an Excel workbook.
 */
public class MaimaiTest {
    /**
     * Runs {@code Maimai.getUserList} once per {@code |}-separated keyword and writes
     * all rows to the "result" sheet.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final String outputPath = "D:\\crawlerdata\\用户采集\\脉脉用户.xlsx";
        final String sessionCookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; guid=HBoEGxgEGBscBBsZGlYHGBseHxoYGhIZHFYcGQQdGR8FQ1hLTEt5ChITBBIdHxkEGgQbHQVPR0VYQmkKA0VBSU9tCk9BQ0YKBmZnfmJhAgocGQQdGR8FXkNhSE99T0ZaWmsKAx4cfWV9ChEZBBwKfmQKWV1FTkRDfQIKGgQfBUtGRkNQRWc=; seid=s1553309971270; token=\"iUifMkpE9YKuFpz0yEj+jiWpUqM6IXvEvwWKzdd/jK8YgrWsT1/Ku7k9bkIRRYvG8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoidzdPUkhMelktVS1iN1Nsb3VxLXZQV2JvIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUzMzk2Mzk0MzczLCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=zGIN7VMizkYf1v48nLqTGAG1k8U";
        // The separator is a regex, so the literal pipe must be escaped.
        final String[] keywords = "巨量引擎|巨量 引擎|巨 量 引 擎|巨 量 引擎|巨量引 擎".split("\\|");

        List<Map<String,Object>> userRows = new ArrayList<>();
        for (int i = 0; i < keywords.length; i++) {
            userRows.addAll(Maimai.getUserList(keywords[i], sessionCookie, null));
        }

        List<String> columns = Arrays.asList("id","name","gender","url","rank","compos","city");
        PoiExcelUtil.getInstance().exportExcel(outputPath, "result", columns, userRows);
    }
}
package com.zhiwei.user;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.bean.QQKandianUser;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.QQKandian;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Example that reads channel names from an Excel workbook, looks each one up through
 * {@code QQKandian.getUser}, and exports the matching users to the same workbook path.
 */
public class QQkandianExample {
    /**
     * Looks up every value in the "渠道" column, logging the number of users found (or
     * a null marker) per channel, then writes one row per user to the output sheet.
     */
    @Test
    public void f() {
        QQKandian crawler = new QQKandian();
        final String workbook = "D:\\crawlerdata\\用户采集\\qq看点用户.xlsx";
        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        Map<String,Object> sheet = excel.importExcel(workbook, 0);
        List<Map<String,Object>> inputRows = (List<Map<String, Object>>) sheet.get("body");

        // Phase 1: collect users for every channel listed in the sheet.
        List<QQKandianUser> found = new ArrayList<QQKandianUser>();
        for (int i = 0; i < inputRows.size(); i++) {
            String channel = String.valueOf(inputRows.get(i).get("渠道"));
            System.out.println(channel);
            List<QQKandianUser> users = crawler.getUser(channel, null);
            if (users == null) {
                System.out.println( channel + "--- null");
            } else {
                System.out.println(users.size());
                found.addAll(users);
            }
            // Throttle between lookups (argument is presumably milliseconds — confirm).
            ZhiWeiTools.sleep(3000);
        }

        // Phase 2: build the header and flatten each user into a row map.
        List<String> header = new ArrayList<String>();
        for (String column : new String[] {"name", "url", "verity", "desc"}) {
            header.add(column);
        }
        List<Map<String,Object>> outputRows = new ArrayList<Map<String,Object>>();
        for (QQKandianUser user : found) {
            outputRows.add(toRow(user));
        }
        excel.exportExcel(workbook, "数据完成后", header, outputRows);
    }

    /** Copies the exported fields of one user into a row map keyed by column name. */
    private static Map<String,Object> toRow(QQKandianUser user) {
        Map<String,Object> row = new HashMap<String,Object>();
        row.put("name", user.getName());
        row.put("url", user.getUrl());
        row.put("verity", user.isVerify());
        row.put("desc", user.getDesc());
        return row;
    }
}
//package com.zhiwei.user;
//
//import java.util.ArrayList;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.bean.QQKandianUser;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.QQKandian;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class QQkandianExample {
//
// @Test
// public void f() {
// QQKandian qqKandian = new QQKandian();
// String path = "D:\\crawlerdata\\用户采集\\qq看点用户.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// Map<String,Object> map = poi.importExcel(path, 0);
//
// List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("body");
// List<QQKandianUser> allList = new ArrayList<QQKandianUser>();
// for(Map<String,Object> m : dataList) {
// String name = m.get("渠道")+"";
// System.out.println(name);
// List<QQKandianUser> qqKandianUsers = qqKandian.getUser(name, null);
// if(qqKandianUsers != null) {
// System.out.println(qqKandianUsers.size());
// allList.addAll(qqKandianUsers);
// }else {
// System.out.println( name + "--- null");
// }
// ZhiWeiTools.sleep(3000);
// }
// List<String> headList = new ArrayList<String>();
// headList.add("name");
// headList.add("url");
// headList.add("verity");
// headList.add("desc");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(QQKandianUser qqKandianUser : allList) {
// Map<String,Object> m = new HashMap<String,Object>();
// m.put("name", qqKandianUser.getName());
// m.put("url", qqKandianUser.getUrl());
// m.put("verity", qqKandianUser.isVerify());
// m.put("desc", qqKandianUser.getDesc());
// bodyList.add(m);
// }
// poi.exportExcel(path, "数据完成后", headList, bodyList);
// }
//
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment