Commit cb5516a0 by yangchen

采集修改

parent 2a35dd02
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>articlenewscrawler</artifactId> <artifactId>articlenewscrawler</artifactId>
<version>0.0.9-SNAPSHOT</version> <version>0.1.3-SNAPSHOT</version>
<name>articlenewscrawler</name> <name>articlenewscrawler</name>
<description>采集凤凰,一点资讯,搜狐历时文章和文章评论</description> <description>采集凤凰,一点资讯,搜狐历时文章和文章评论</description>
...@@ -31,12 +31,12 @@ ...@@ -31,12 +31,12 @@
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.1.1-SNAPSHOT</version> <version>0.1.2-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.1.1-RELEASE</version> <version>0.3.0-RELEASE</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
</dependencies> </dependencies>
......
package com.zhiwei.httpclient; package com.zhiwei.httpclient;
import java.io.IOException; import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
...@@ -278,7 +280,11 @@ public class HeadGet { ...@@ -278,7 +280,11 @@ public class HeadGet {
headerMap.put("Connection", "keep-alive"); headerMap.put("Connection", "keep-alive");
headerMap.put("Accept", "*/*"); headerMap.put("Accept", "*/*");
headerMap.put("Accept-Language", "zh-Hans-CN;q=1"); headerMap.put("Accept-Language", "zh-Hans-CN;q=1");
headerMap.put("User-Agent", "天天快报 4.6.0 qnreading (iPhone8,1; iOS 10.3.3; zh_CN; 4.6.0.81)"); try {
headerMap.put("User-Agent", URLEncoder.encode("天天快报 4.6.0 qnreading (iPhone8,1; iOS 10.3.3; zh_CN; 4.6.0.81)","utf-8"));
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
if(cookie != null) { if(cookie != null) {
headerMap.put("Cookie", cookie); headerMap.put("Cookie", cookie);
......
...@@ -8,15 +8,15 @@ import org.slf4j.Logger; ...@@ -8,15 +8,15 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.Response; import okhttp3.Response;
public class HttpClient { public class HttpClient {
private static Logger logger = LoggerFactory.getLogger(HttpClient.class); private static Logger logger = LoggerFactory.getLogger(HttpClient.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot(false,2);
/** /**
* *
......
...@@ -11,8 +11,8 @@ import org.slf4j.Logger; ...@@ -11,8 +11,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.parse.analysis.AikaCommentAnalysis; import com.zhiwei.parse.analysis.AikaCommentAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
......
package com.zhiwei.parse; package com.zhiwei.parse;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
...@@ -10,14 +9,21 @@ import java.util.Map; ...@@ -10,14 +9,21 @@ import java.util.Map;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient; import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.AiqiyiByWordAnalysis; import com.zhiwei.parse.analysis.AiqiyiByWordAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
public class Aiqiyi { public class Aiqiyi {
private static Logger logger = LoggerFactory.getLogger(Aiqiyi.class); private static Logger logger = LoggerFactory.getLogger(Aiqiyi.class);
private static AiqiyiByWordAnalysis aiqiyiByWordAnalysis = new AiqiyiByWordAnalysis(); private static AiqiyiByWordAnalysis aiqiyiByWordAnalysis = new AiqiyiByWordAnalysis();
private static HttpBoot httpBoot = new HttpBoot(false, 2);
/** /**
* *
...@@ -25,16 +31,16 @@ public class Aiqiyi { ...@@ -25,16 +31,16 @@ public class Aiqiyi {
* @param word * @param word
* @return * @return
*/ */
public static List<Map<String,Object>> getAiqiyiByWordData(String word,Proxy proxy) { public static List<Map<String,Object>> getAiqiyiByWordData(String word,ProxyHolder proxy) {
Map<String,String> headerMap = HeadGet.getAiqiyiBywordHeaderMap(null); Map<String,String> headerMap = HeadGet.getAiqiyiBywordHeaderMap(null);
List<Map<String,Object>> dataList = new ArrayList<>(); List<Map<String,Object>> dataList = new ArrayList<>();
try { try {
for(int i = 1;i <= 5;i++) { for(int i = 1;i <= 20;i++) {
int count = dataList.size(); int count = dataList.size();
String url = "https://so.iqiyi.com/so/q_"+URLEncoder.encode(word, "UTF-8")+"_ctg__t_0_page_"+i+"_p_1_qc_0_rd__site__m_4_bitrate_"; String url = "https://so.iqiyi.com/so/q_"+URLEncoder.encode(word, "UTF-8")+"_ctg__t_0_page_"+i+"_p_1_qc_0_rd__site__m_4_bitrate_";
System.out.println(url); System.out.println(url);
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap); String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
List<Map<String,Object>> map = aiqiyiByWordAnalysis.getAiqiyiData(result); List<Map<String,Object>> map = aiqiyiByWordAnalysis.getAiqiyiData(result,word);
if(map != null) { if(map != null) {
dataList.addAll(map); dataList.addAll(map);
} }
...@@ -42,7 +48,7 @@ public class Aiqiyi { ...@@ -42,7 +48,7 @@ public class Aiqiyi {
break; break;
} }
System.out.println("=============="+dataList.size()); System.out.println("=============="+dataList.size());
ZhiWeiTools.sleep(2000); ZhiWeiTools.sleep(200);
} }
return dataList; return dataList;
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {
...@@ -51,5 +57,24 @@ public class Aiqiyi { ...@@ -51,5 +57,24 @@ public class Aiqiyi {
} }
} }
/**
 * Fetches the "hot" (popularity) value of an iQiyi video.
 *
 * <p>The page at {@code url} is downloaded and the text between the first
 * {@code tvId":} marker and the following comma is taken as the video id.
 * That id is appended to the {@code hotplaytimes} endpoint, and the
 * {@code hot} field of the first element of the {@code data} array in the
 * JSON reply is returned.
 *
 * @param url   URL of the video page to inspect
 * @param proxy proxy holder used for the page request; the follow-up
 *              hot-count request is issued without a proxy
 * @return the hot value, or {@code -1} if any step fails (request error,
 *         missing marker, unexpected JSON)
 */
public static int aiqiyiHotCount(String url,ProxyHolder proxy) {
    try (Response pageResponse = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
        String page = pageResponse.body().string();
        String id = page.split("tvId\":")[1].split(",")[0];
        String hotUrl = "https://pcw-api.iqiyi.com/video/video/hotplaytimes/" + id;
        // Manage the second response explicitly as well, so it is released even
        // when something fails before its body has been fully consumed.
        try (Response hotResponse = httpBoot.syncCall(RequestUtils.wrapGet(hotUrl))) {
            String hotJson = hotResponse.body().string();
            return JSONObject.parseObject(hotJson).getJSONArray("data").getJSONObject(0).getInteger("hot");
        }
    } catch (Exception e) {
        // The placeholder is filled with the page URL; the trailing throwable
        // is logged with its stack trace.
        logger.error(" 爱奇艺 热度采集出错 {} ", url, e);
        return -1;
    }
}
} }
...@@ -11,7 +11,7 @@ import org.slf4j.LoggerFactory; ...@@ -11,7 +11,7 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient; import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.BaijiaAccountAnalysis; import com.zhiwei.parse.analysis.BaijiaAccountAnalysis;
......
...@@ -33,7 +33,7 @@ public class BiliBili { ...@@ -33,7 +33,7 @@ public class BiliBili {
Request request = HttpRequestBuilder.newGetRequest(url, header); Request request = HttpRequestBuilder.newGetRequest(url, header);
String result = httpBoot.syncCall(request, proxy).body().string(); String result = httpBoot.syncCall(request, proxy).body().string();
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(3000);
Map<String,Object> map = BilibilikeyWordAnalysis.getData(result); Map<String,Object> map = BilibilikeyWordAnalysis.getData(result,word);
boolean more = (boolean) map.get("more"); boolean more = (boolean) map.get("more");
List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("data"); List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("data");
if(dataList != null) { if(dataList != null) {
...@@ -46,7 +46,7 @@ public class BiliBili { ...@@ -46,7 +46,7 @@ public class BiliBili {
System.out.println(ur); System.out.println(ur);
request = HttpRequestBuilder.newGetRequest(ur, header); request = HttpRequestBuilder.newGetRequest(ur, header);
String result2 = httpBoot.syncCall(request, proxy).body().string(); String result2 = httpBoot.syncCall(request, proxy).body().string();
map = BilibilikeyWordAnalysis.getData(result2); map = BilibilikeyWordAnalysis.getData(result2,word);
List<Map<String,Object>> dataList2 = (List<Map<String, Object>>) map.get("data"); List<Map<String,Object>> dataList2 = (List<Map<String, Object>>) map.get("data");
if(dataList2 != null) { if(dataList2 != null) {
bodyList.addAll(dataList2); bodyList.addAll(dataList2);
......
...@@ -15,8 +15,8 @@ import org.slf4j.LoggerFactory; ...@@ -15,8 +15,8 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
......
...@@ -16,7 +16,7 @@ import org.slf4j.Logger; ...@@ -16,7 +16,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.parse.analysis.DoubanCommentAnalysis; import com.zhiwei.parse.analysis.DoubanCommentAnalysis;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
......
...@@ -10,7 +10,7 @@ import org.slf4j.Logger; ...@@ -10,7 +10,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.parse.analysis.GftaiAnalysis; import com.zhiwei.parse.analysis.GftaiAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
......
...@@ -11,7 +11,7 @@ import org.slf4j.Logger; ...@@ -11,7 +11,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.parse.analysis.KuaiTousuAnalysis; import com.zhiwei.parse.analysis.KuaiTousuAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
......
...@@ -17,8 +17,8 @@ import org.slf4j.LoggerFactory; ...@@ -17,8 +17,8 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient; import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.MaimaiBywordAnalysis; import com.zhiwei.parse.analysis.MaimaiBywordAnalysis;
......
...@@ -13,8 +13,8 @@ import org.slf4j.LoggerFactory; ...@@ -13,8 +13,8 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.parse.analysis.PcautoCommentAnalysis; import com.zhiwei.parse.analysis.PcautoCommentAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
......
...@@ -2,6 +2,7 @@ package com.zhiwei.parse; ...@@ -2,6 +2,7 @@ package com.zhiwei.parse;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
...@@ -34,10 +35,10 @@ public class QQKB { ...@@ -34,10 +35,10 @@ public class QQKB {
String url = "http://r.cnews.qq.com/getSubNewsIndex"; String url = "http://r.cnews.qq.com/getSubNewsIndex";
Map<String,String> headerMap = HeadGet.getQQAccountHeaderMap(cookie); Map<String,String> headerMap = HeadGet.getQQAccountHeaderMap(cookie);
Map<String,Object> paramMap = HeadGet.getQQAccountOneParamMap(child); Map<String,Object> paramMap = HeadGet.getQQAccountOneParamMap(child);
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<>();
try { try {
String result = ""; String result = "";
List<String> idsList = new ArrayList<String>(); List<String> idsList = new ArrayList<>();
for(int i = 0;i < 3;i++) { for(int i = 0;i < 3;i++) {
result = HttpClient.executeHttpRequestPost(url,proxy, headerMap, paramMap); result = HttpClient.executeHttpRequestPost(url,proxy, headerMap, paramMap);
idsList = qqAccountAnalysis.getQQAllIds(result); idsList = qqAccountAnalysis.getQQAllIds(result);
...@@ -45,6 +46,9 @@ public class QQKB { ...@@ -45,6 +46,9 @@ public class QQKB {
break; break;
} }
} }
if(idsList.isEmpty()) {
return Collections.emptyList();
}
System.out.println("此帐号可采集的历史文章数==============="+idsList.size()); System.out.println("此帐号可采集的历史文章数==============="+idsList.size());
url = "http://r.cnews.qq.com/getSubNewsListItems"; url = "http://r.cnews.qq.com/getSubNewsListItems";
String ids = ""; String ids = "";
...@@ -106,7 +110,7 @@ public class QQKB { ...@@ -106,7 +110,7 @@ public class QQKB {
* @return * @return
*/ */
public static List<Map<String,Object>> getQQKBCommentData(String url,Proxy proxy) { public static List<Map<String,Object>> getQQKBCommentData(String url,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<>();
String comment_id = getCid(url,proxy); String comment_id = getCid(url,proxy);
String article_id = url.split("/")[4].split("\\?")[0]; String article_id = url.split("/")[4].split("\\?")[0];
Map<String,String> headerMap = HeadGet.getQQKBCommentHeaderMap(null); Map<String,String> headerMap = HeadGet.getQQKBCommentHeaderMap(null);
...@@ -133,7 +137,7 @@ public class QQKB { ...@@ -133,7 +137,7 @@ public class QQKB {
} }
return dataList; return dataList;
} catch (Exception e) { } catch (Exception e) {
logger.error("解析天天快报评论出错",e.getMessage()); logger.error("解析天天快报评论出错 {}",e);
return dataList; return dataList;
} }
} }
...@@ -208,9 +212,10 @@ public class QQKB { ...@@ -208,9 +212,10 @@ public class QQKB {
* @param article_id * @param article_id
* @return * @return
*/ */
public static int getCommentCount(String cookie,String url,Proxy proxy) { public static int getCommentCount(String url,Proxy proxy) {
String comment_id = getCid(url,proxy); String comment_id = getCid(url,proxy);
String article_id = url.split("/")[4]; String article_id = url.split("/")[4];
String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=0003000049dd058f533cbebb240223ede63b864224f7eebe0f4aeca6a623572bb290a5800741d191a5768bb0;%20uin=o0497332654;%20skey=MIZmc2Oel3;%20sigA2=4282ABA809551D3534C72F999EE8F2A75219ED9452DEF04E4CBCE6B680C2C893C3E1BA617F5E0F387E558888B2ABEDFE87A4A25B16F9066C1154B2BC7A1133CA7B356AB9D3BA26ED;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwgGT4n96Oq-jHALnMUe8UzpoJghQDouvfSSWdh-JOdgAm3jRJUPbux6fcIPghoNxo24xdED8ennAANksJuHiwdw;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";
Map<String,String> headerMap = HeadGet.getQQKBCommentHeaderMap(cookie); Map<String,String> headerMap = HeadGet.getQQKBCommentHeaderMap(cookie);
try { try {
Map<String,Object> paramMap = HeadGet.getQQKBCommentParamMap(comment_id, article_id); Map<String,Object> paramMap = HeadGet.getQQKBCommentParamMap(comment_id, article_id);
...@@ -218,7 +223,7 @@ public class QQKB { ...@@ -218,7 +223,7 @@ public class QQKB {
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
return json.getJSONObject("comments").getInteger("count"); return json.getJSONObject("comments").getInteger("count");
} catch (Exception e) { } catch (Exception e) {
logger.error("解析天天快报评论出错",e.getMessage()); logger.error("解析天天快报评论出错 {}",e);
return 0; return 0;
} }
} }
......
package com.zhiwei.parse;
import java.util.Objects;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.Response;
/**
 * Collectors for Tencent news (new.qq.com) pages.
 *
 * <p>Created 2019-03-06 13:54:26.
 *
 * @author byte-zbs
 * @version 1.0.0
 */
public class QQNews {

    private static final Logger logger = LoggerFactory.getLogger(QQNews.class);

    private static final HttpBoot httpBoot = new HttpBoot();

    /**
     * Gets the comment count of a Tencent news article
     * (e.g. https://new.qq.com/cmsn/20190305/TEC2019030500050000).
     *
     * <p>The article id is first resolved to a comment id ({@code cid}); the
     * count is then read from the {@code data.commentnum} field returned by
     * the {@code coral.qq.com} comment-number endpoint.
     *
     * @param id    article id, e.g. {@code TEC2019030500050000}
     * @param proxy proxy holder used for both requests
     * @return the comment count, or {@code -1} if the cid cannot be resolved
     *         or the count request/parsing fails
     */
    public static int getQQNewsCommentCount(String id,ProxyHolder proxy) {
        String cid = getCid(id, proxy);
        if (Objects.isNull(cid)) {
            return -1;
        }
        String url = "https://coral.qq.com/article/" + cid + "/commentnum";
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            String result = response.body().string();
            JSONObject json = JSONObject.parseObject(result);
            return Integer.parseInt(json.getJSONObject("data").getString("commentnum"));
        } catch (Exception e) {
            // Fill the placeholder with the request URL; the throwable goes last
            // so its stack trace is logged.
            logger.error("腾讯网评论采集出错 {}", url, e);
            return -1;
        }
    }

    /**
     * Resolves the comment id ({@code cid}) of an article.
     *
     * @param id    article id
     * @param proxy proxy holder used for the request
     * @return the {@code cid} field of the JSON reply, or {@code null} when
     *         the request or parsing fails (or the field is absent)
     */
    private static String getCid(String id, ProxyHolder proxy) {
        String url = "https://openapi.inews.qq.com/getQQNewsNormalContent?id=" + id + "&refer=mobilewwwqqcom";
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            String result = response.body().string();
            return JSONObject.parseObject(result).getString("cid");
        } catch (Exception e) {
            logger.error("获取文章cid失败 {}", id, e);
            return null;
        }
    }
}
...@@ -9,8 +9,8 @@ import org.slf4j.Logger; ...@@ -9,8 +9,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.parse.analysis.QicheHomeKwyWordAnalysis; import com.zhiwei.parse.analysis.QicheHomeKwyWordAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
......
...@@ -11,12 +11,15 @@ import java.util.Map; ...@@ -11,12 +11,15 @@ import java.util.Map;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.parse.analysis.SinaKejiCommentAnalysis; import com.zhiwei.parse.analysis.SinaKejiCommentAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
public class SinaKeji { public class SinaKeji {
private static Logger logger = LoggerFactory.getLogger(SinaKeji.class); private static Logger logger = LoggerFactory.getLogger(SinaKeji.class);
...@@ -78,5 +81,27 @@ public class SinaKeji { ...@@ -78,5 +81,27 @@ public class SinaKeji {
return null; return null;
} }
/**
 * Gets the comment count of a Sina Tech article.
 *
 * <p>The article page is downloaded and the comment-API address is cut out
 * of it: either the text following {@code getcomments:'} up to {@code ',},
 * or, failing that, the text following {@code getcomments":"} up to the next
 * double quote. The extracted address is prefixed with {@code https:}
 * (presumably it is protocol-relative; verify against a live page) and
 * requested; the result is read from {@code data.cmnt.total}.
 *
 * @param url   article page URL
 * @param proxy proxy holder used for both requests
 * @return the total comment count, or {@code -1} if any step fails
 */
public static int getCommentCount(String url,ProxyHolder proxy) {
    try (Response pageResponse = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
        String page = pageResponse.body().string();
        // Keep the extracted address in its own variable so the original
        // article URL is still available for error reporting.
        String commentPath;
        if (page.contains("getcomments:'")) {
            commentPath = page.split("getcomments:'")[1].split("',")[0];
        } else {
            commentPath = page.split("getcomments\":\"")[1].split("\"")[0];
        }
        // Manage the second response explicitly as well, so it is always released.
        try (Response commentResponse = httpBoot.syncCall(RequestUtils.wrapGet("https:" + commentPath), proxy)) {
            String commentJson = commentResponse.body().string();
            return JSONObject.parseObject(commentJson).getJSONObject("data").getJSONObject("cmnt").getInteger("total");
        }
    } catch (Exception e) {
        logger.error("新浪 文章获取评论数失败 {}", url, e);
    }
    return -1;
}
} }
...@@ -2,7 +2,6 @@ package com.zhiwei.parse; ...@@ -2,7 +2,6 @@ package com.zhiwei.parse;
import java.io.IOException; import java.io.IOException;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
...@@ -12,7 +11,8 @@ import org.slf4j.Logger; ...@@ -12,7 +11,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.parse.analysis.SinaTousuAnalysis; import com.zhiwei.parse.analysis.SinaTousuAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
...@@ -23,7 +23,7 @@ public class SinaTousu { ...@@ -23,7 +23,7 @@ public class SinaTousu {
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot();
public static List<Map<String,Object>> getSinaTousuData(String word,Proxy proxy,String time) { public static List<Map<String,Object>> getSinaTousuData(String word,ProxyHolder proxy,String time) {
List<Map<String,Object>> bodyList = new ArrayList<>(); List<Map<String,Object>> bodyList = new ArrayList<>();
int page = 1; int page = 1;
int count = 1; int count = 1;
...@@ -43,7 +43,7 @@ public class SinaTousu { ...@@ -43,7 +43,7 @@ public class SinaTousu {
logger.info("黑猫投诉 关键词采集 第{}页 ,一共采集到数据 {} ",page,bodyList.size()); logger.info("黑猫投诉 关键词采集 第{}页 ,一共采集到数据 {} ",page,bodyList.size());
page++; page++;
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(100);
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {
count++; count++;
logger.error("UnsupportedEncodingException {}",e); logger.error("UnsupportedEncodingException {}",e);
......
package com.zhiwei.parse; package com.zhiwei.parse;
import static java.util.Objects.nonNull;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
...@@ -21,7 +23,6 @@ import com.zhiwei.parse.analysis.SouhuAccountAnalysis; ...@@ -21,7 +23,6 @@ import com.zhiwei.parse.analysis.SouhuAccountAnalysis;
import com.zhiwei.parse.analysis.SouhuCommentAnalysis; import com.zhiwei.parse.analysis.SouhuCommentAnalysis;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import static java.util.Objects.nonNull;
public class Souhu { public class Souhu {
private static Logger logger = LoggerFactory.getLogger(Souhu.class); private static Logger logger = LoggerFactory.getLogger(Souhu.class);
...@@ -39,9 +40,7 @@ public class Souhu { ...@@ -39,9 +40,7 @@ public class Souhu {
try { try {
String newurl = souhuCommentAnalysis.getSouhuURL(url,proxy); String newurl = souhuCommentAnalysis.getSouhuURL(url,proxy);
if(nonNull(newurl)) { if(nonNull(newurl)) {
int i; return souhuCommentAnalysis.getSouhuCommentCount(newurl,proxy);
i = souhuCommentAnalysis.getSouhuCommentCount(newurl,proxy);
return i;
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("搜狐获取评论数出错了 {}",e); logger.error("搜狐获取评论数出错了 {}",e);
...@@ -49,6 +48,21 @@ public class Souhu { ...@@ -49,6 +48,21 @@ public class Souhu {
return -1; return -1;
} }
/**
 * Gets the read count of a Sohu account article.
 *
 * <p>Delegates to {@code souhuCommentAnalysis.getReadNum(url, proxy)} and
 * converts any exception into the {@code -1} sentinel.
 *
 * @param url   article URL
 * @param proxy proxy holder passed through to the analysis helper
 * @return the read count, or {@code -1} if the helper throws
 */
public static int getSohuReadNum(String url,ProxyHolder proxy) {
    try {
        return souhuCommentAnalysis.getReadNum(url, proxy);
    } catch (Exception e) {
        // Fill the placeholder with the failing URL; the throwable goes last so
        // its stack trace is logged.
        logger.error("搜狐获取阅读数出错 {}", url, e);
    }
    return -1;
}
/** /**
* *
......
...@@ -2,25 +2,36 @@ package com.zhiwei.parse; ...@@ -2,25 +2,36 @@ package com.zhiwei.parse;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient; import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.TXNewsByWordAnalysis; import com.zhiwei.parse.analysis.TXNewsByWordAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
public class TXNews { public class TXNews {
private static Logger logger = LoggerFactory.getLogger(TXNews.class); private static Logger logger = LoggerFactory.getLogger(TXNews.class);
private static TXNewsByWordAnalysis txNewsByWordAnalysis = new TXNewsByWordAnalysis(); private static TXNewsByWordAnalysis txNewsByWordAnalysis = new TXNewsByWordAnalysis();
public static boolean txNewshasMoreData = true; public static boolean txNewshasMoreData = true;
public static HttpBoot httpBoot = new HttpBoot();
public static List<Map<String,Object>> getData(String word,String devid,Proxy proxy) { public static List<Map<String,Object>> getData(String word,String devid,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<>();
Map<String,String> headerMap = HeadGet.getTxNewspage1HeaderMap(null); Map<String,String> headerMap = HeadGet.getTxNewspage1HeaderMap(null);
Map<String,Object> paramMap = HeadGet.getTxNewspage1ParamMap(word); Map<String,Object> paramMap = HeadGet.getTxNewspage1ParamMap(word);
// b3dd1e7d-9d3c-4e75-bf3e-3a76f326ee34 // b3dd1e7d-9d3c-4e75-bf3e-3a76f326ee34
...@@ -43,15 +54,71 @@ public class TXNews { ...@@ -43,15 +54,71 @@ public class TXNews {
logger.info("采集到数据======={}" ,dataList.size()); logger.info("采集到数据======={}" ,dataList.size());
count = 0; count = 0;
} catch (Exception e) { } catch (Exception e) {
if(count > 2) {
count++; count++;
if(count > 2) {
break;
}
}
}
return dataList;
}
/**
 * Collects the comments published by one Tencent News user.
 *
 * <p>Repeatedly posts {@code coral_uin}, {@code coral_uid} and the last seen
 * {@code reply_id} to {@code https://r.inews.qq.com/getMyComments}. For each
 * entry of {@code comments.new} the last element of the inner array is mapped
 * to a result row with the keys {@code name}, {@code replayUrl},
 * {@code content}, {@code time}, {@code replayNum} and {@code agreeNum}.
 * Paging stops when the reply's {@code bnext} field is 0 or when the failure
 * counter exceeds 3.
 *
 * @param coralUin value sent as {@code coral_uin}
 * @param coralUid value sent as {@code coral_uid}
 * @param proxy    proxy holder used for every request
 * @return the rows collected so far (possibly empty)
 */
public static List<Map<String,Object>> getTxNewsComments(String coralUin,String coralUid,ProxyHolder proxy) {
// Paging cursor: the reply_id of the most recently parsed comment (empty for the first page).
String replayId = "";
// Number of failed requests since the last successfully parsed comment.
int tryCount = 0;
List<Map<String,Object>> dataList = new ArrayList<>();
while(true) {
String content = StringUtils.join("coral_uin=", coralUin, "&coral_uid=", coralUid,"&reply_id=",replayId);
//eca55388bbbb596e632bca03a2378efe94b83142fd046f1f70 876579532
System.out.println(content);
try (Response response = httpBoot.syncCall(RequestUtils.wrapPost("https://r.inews.qq.com/getMyComments", "application/json", content), proxy)){
JSONObject json = JSONObject.parseObject(response.body().string());
JSONArray jsonArray = json.getJSONObject("comments").getJSONArray("new");
for(int i = 0;i < jsonArray.size();i++) {
JSONArray dataJson = jsonArray.getJSONArray(i);
// Only the last element of each inner array is used.
JSONObject data = dataJson.getJSONObject(dataJson.size()-1);
Map<String,Object> replaymap = new HashMap<>();
replaymap.put("name", data.getString("nick"));
replaymap.put("replayUrl", data.getString("url"));
replaymap.put("content", data.getString("reply_content"));
// pub_time gets "000" appended before being parsed as epoch milliseconds
// (presumably it is in seconds; confirm against the API).
replaymap.put("time", new Date(Long.parseLong(data.getString("pub_time")+"000")));
replaymap.put("replayNum", data.getInteger("reply_num"));
replaymap.put("agreeNum", data.getInteger("agree_count"));
replayId = data.getString("reply_id");
dataList.add(replaymap);
tryCount = 0;
}
logger.info(" 采集到 {} 条 采集uid为 {}",dataList.size(),coralUid);
// bnext == 0 ends the paging loop.
if(json.getInteger("bnext") == 0) {
break; break;
}
continue; } catch (Exception e) {
logger.error("腾讯新闻采集有部分出错 {} ",e);
tryCount++;
}
if(tryCount > 3) {
break;
}
}
return dataList; return dataList;
}
/**
 * Gets the comment count of a Tencent News client article
 * (e.g. https://view.inews.qq.com/a/20190305A0D0MR00).
 *
 * <p>The page is downloaded and the text between the first
 * {@code comment_count":} marker and the following {@code }} is parsed as an
 * integer.
 *
 * @param url   article URL
 * @param proxy proxy holder used for the request
 * @return the comment count, or {@code -1} if the request fails, the marker
 *         is missing, or the value is not an integer
 */
public static int getTxNewsCommentCount(String url,ProxyHolder proxy) {
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
        String page = response.body().string();
        String rawCount = page.split("comment_count\":")[1].split("}")[0];
        // Integer.parseInt rejects surrounding whitespace, so strip it first.
        return Integer.parseInt(rawCount.trim());
    } catch (Exception e) {
        // Fill the placeholder with the failing URL; the throwable goes last so
        // its stack trace is logged.
        logger.error(" 腾讯 新闻 评论数获取失败 {}", url, e);
    }
    return -1;
}
} }
...@@ -12,8 +12,8 @@ import org.slf4j.Logger; ...@@ -12,8 +12,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.parse.analysis.TechTxCommentAnalysis; import com.zhiwei.parse.analysis.TechTxCommentAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
...@@ -70,4 +70,5 @@ public class TechTx { ...@@ -70,4 +70,5 @@ public class TechTx {
return null; return null;
} }
} }
package com.zhiwei.parse; package com.zhiwei.parse;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.Proxy; import java.net.Proxy;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
...@@ -13,10 +12,12 @@ import java.util.Map; ...@@ -13,10 +12,12 @@ import java.util.Map;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.parse.analysis.XueqiuKeyWordAnalysis; import com.zhiwei.parse.analysis.XueqiuKeyWordAnalysis;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request; import okhttp3.Request;
...@@ -28,6 +29,15 @@ public class Xueqiu { ...@@ -28,6 +29,15 @@ public class Xueqiu {
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot();
private static XueqiuKeyWordAnalysis xueqiuKeyWordAnalysis = new XueqiuKeyWordAnalysis(); private static XueqiuKeyWordAnalysis xueqiuKeyWordAnalysis = new XueqiuKeyWordAnalysis();
/**
*
* @Description 关键词采集历史文章
* @param word
* @param endTime
* @param proxy
* @param cookie
* @return
*/
public static List<Map<String,Object>> getData(String word,String endTime,Proxy proxy,String cookie) { public static List<Map<String,Object>> getData(String word,String endTime,Proxy proxy,String cookie) {
List<Map<String,Object>> bodyList = new ArrayList<>(); List<Map<String,Object>> bodyList = new ArrayList<>();
int i = 0; int i = 0;
...@@ -51,10 +61,7 @@ public class Xueqiu { ...@@ -51,10 +61,7 @@ public class Xueqiu {
logger.info("采集到第{} 页 , 一共采集到 {} 数据",page,bodyList.size()); logger.info("采集到第{} 页 , 一共采集到 {} 数据",page,bodyList.size());
page++; page++;
} }
} catch (UnsupportedEncodingException e) { } catch (Exception e) {
e.printStackTrace();
i++;
} catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
i++; i++;
} }
...@@ -65,6 +72,13 @@ public class Xueqiu { ...@@ -65,6 +72,13 @@ public class Xueqiu {
return bodyList; return bodyList;
} }
/**
*
* @Description 雪球获取点赞评论数
* @param url
* @param proxy
* @return
*/
public static Map<String,Object> getUrlData(String url,Proxy proxy) { public static Map<String,Object> getUrlData(String url,Proxy proxy) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){ try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
String result = response.body().string(); String result = response.body().string();
...@@ -79,9 +93,56 @@ public class Xueqiu { ...@@ -79,9 +93,56 @@ public class Xueqiu {
} catch (Exception e) { } catch (Exception e) {
logger.error(" 雪球 数据转评赞获取失败 exception {} url = {}",e,url); logger.error(" 雪球 数据转评赞获取失败 exception {} url = {}",e,url);
} }
return Collections.emptyMap(); return Collections.emptyMap();
} }
/**
 * Collects the statuses of one Xueqiu account by paging through the
 * {@code /v4/statuses/user_timeline.json} endpoint.
 *
 * <p>Paging starts at page 1 and continues until the page number exceeds
 * the {@code maxPage} value reported by the response. Any request or
 * parsing error is logged and ends the collection; statuses gathered up to
 * that point are still returned.
 *
 * @param userId Xueqiu user id whose timeline is requested
 * @param cookie value sent as the {@code cookie} request header
 * @param proxy  proxy handed to the HTTP call
 * @return one map per status with the keys {@code name}, {@code time},
 *         {@code source}, {@code content}, {@code repostCount},
 *         {@code commentCount}, {@code likeCount} and {@code url};
 *         never {@code null}
 */
public List<Map<String,Object>> getXueqiuAccountData(String userId,String cookie,Proxy proxy) {
    Map<String,Object> headers = new HashMap<>();
    headers.put("cookie", cookie);
    List<Map<String,Object>> bodyList = new ArrayList<>();
    // The page counter must live outside the loop; re-declaring it per
    // iteration would request page 1 forever.
    int page = 1;
    while(true) {
        String url = "https://xueqiu.com/v4/statuses/user_timeline.json?page=" + page
                + "&user_id=" + userId + "&type=0";
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headers), proxy)){
            String result = response.body().string();
            JSONObject json = JSONObject.parseObject(result);
            JSONArray jsonArray = json.getJSONArray("statuses");
            for (int i = 0; i < jsonArray.size(); i++) {
                JSONObject ob = jsonArray.getJSONObject(i);
                Date date = TimeParse.stringFormartDate(ob.getString("timeBefore"));
                Map<String, Object> map = new HashMap<>();
                map.put("name", ob.getJSONObject("user").getString("screen_name"));
                map.put("time", date);
                map.put("source", ob.getString("source"));
                // Strip HTML tags from the description.
                map.put("content", ob.getString("description").replaceAll("<.*?>", ""));
                map.put("repostCount", ob.getString("retweet_count"));
                map.put("commentCount", ob.getString("reply_count"));
                map.put("likeCount", ob.getString("like_count"));
                // "target" is appended directly to the site root.
                map.put("url", "https://xueqiu.com" + ob.getString("target"));
                bodyList.add(map);
            }
            int maxPage = json.getInteger("maxPage");
            page++;
            if(page > maxPage) {
                break;
            }
        } catch (Exception e) {
            logger.error("采集解析出错 {}",e);
            break;
        }
    }
    return bodyList;
}
} }
...@@ -14,8 +14,8 @@ import org.slf4j.LoggerFactory; ...@@ -14,8 +14,8 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response; import okhttp3.Response;
......
...@@ -6,11 +6,18 @@ import java.util.ArrayList; ...@@ -6,11 +6,18 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import javax.script.Invocable;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient; import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.YidianzixunAccountAnalysis; import com.zhiwei.parse.analysis.YidianzixunAccountAnalysis;
...@@ -18,12 +25,16 @@ import com.zhiwei.parse.analysis.YidianzixunByWordAnalysis; ...@@ -18,12 +25,16 @@ import com.zhiwei.parse.analysis.YidianzixunByWordAnalysis;
import com.zhiwei.parse.analysis.YidianzixunCommentAnalysis; import com.zhiwei.parse.analysis.YidianzixunCommentAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
public class Yidianzixun { public class Yidianzixun {
private static Logger logger = LoggerFactory.getLogger(Yidianzixun.class); private static Logger logger = LoggerFactory.getLogger(Yidianzixun.class);
private static YidianzixunAccountAnalysis yidianzixunAccountAnalysis = new YidianzixunAccountAnalysis(); private static YidianzixunAccountAnalysis yidianzixunAccountAnalysis = new YidianzixunAccountAnalysis();
private static YidianzixunCommentAnalysis yidianzixunCommentAnalysis = new YidianzixunCommentAnalysis(); private static YidianzixunCommentAnalysis yidianzixunCommentAnalysis = new YidianzixunCommentAnalysis();
private static YidianzixunByWordAnalysis yidianzixunByWordAnalysis = new YidianzixunByWordAnalysis(); private static YidianzixunByWordAnalysis yidianzixunByWordAnalysis = new YidianzixunByWordAnalysis();
private static HttpBoot httpBoot = new HttpBoot();
/** /**
* *
* @Description (获取一点资讯历时文章) * @Description (获取一点资讯历时文章)
...@@ -33,13 +44,14 @@ public class Yidianzixun { ...@@ -33,13 +44,14 @@ public class Yidianzixun {
*/ */
public static List<Map<String,Object>> getYidianzixunAccountData(String channelid,String startTime,Proxy proxy,String cookie) { public static List<Map<String,Object>> getYidianzixunAccountData(String channelid,String startTime,Proxy proxy,String cookie) {
Map<String,String> headerMap = HeadGet.getYidianzixunAccountHeaderMap(cookie,"http://www.yidianzixun.com/channel/"+channelid); Map<String,String> headerMap = HeadGet.getYidianzixunAccountHeaderMap(cookie,"http://www.yidianzixun.com/channel/"+channelid);
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<>();
int j = 0; int j = 0;
boolean f = true; boolean f = true;
try { try {
while(f) { while(f) {
String url = "http://www.yidianzixun.com/home/q/news_list_for_channel?channel_id="+channelid+"&cstart="+j+"&cend="+(j+10); String url = "http://www.yidianzixun.com/"+getSpt(channelid, j, j+10);
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap); String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
System.out.println(result);
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONArray("result"); JSONArray jsonArry = json.getJSONArray("result");
if(jsonArry.size() == 0) { if(jsonArry.size() == 0) {
...@@ -47,6 +59,7 @@ public class Yidianzixun { ...@@ -47,6 +59,7 @@ public class Yidianzixun {
} }
for(int i = 0;i < jsonArry.size();i++) { for(int i = 0;i < jsonArry.size();i++) {
Map<String,Object> map = yidianzixunAccountAnalysis.parseJsonByAccount(jsonArry.getJSONObject(i)); Map<String,Object> map = yidianzixunAccountAnalysis.parseJsonByAccount(jsonArry.getJSONObject(i));
if(!map.isEmpty()) {
if(startTime != null) { if(startTime != null) {
String time = map.get("time")+""; String time = map.get("time")+"";
if(startTime.compareTo(time) > 0) { if(startTime.compareTo(time) > 0) {
...@@ -56,9 +69,10 @@ public class Yidianzixun { ...@@ -56,9 +69,10 @@ public class Yidianzixun {
} }
dataList.add(map); dataList.add(map);
} }
}
System.out.println("================================" + dataList.size()); System.out.println("================================" + dataList.size());
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(3000);
j += 10; j = dataList.size();
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("数据获取出错",e.getMessage()); logger.error("数据获取出错",e.getMessage());
...@@ -67,6 +81,28 @@ public class Yidianzixun { ...@@ -67,6 +81,28 @@ public class Yidianzixun {
return dataList; return dataList;
} }
/**
 * Builds the channel news-list path and appends the {@code _spt} token.
 *
 * <p>The token is the string {@code "sptoken" + channel_id + cstart + cend}
 * with every UTF-16 code unit XOR-ed with 10, then percent-encoded with the
 * same unreserved set as JavaScript's {@code encodeURIComponent}
 * (letters, digits and {@code - _ . ! ~ * ' ( )}). This is computed in plain
 * Java, so no script engine has to be available at runtime.
 *
 * @param channel_id channel identifier placed in the query and in the token
 * @param cstart     start offset placed in the query and in the token
 * @param cend       end offset placed in the query and in the token
 * @return the path (starting with {@code /home/q/news_list_for_channel})
 *         including the {@code _spt} parameter; never {@code null}
 */
private static String getSpt(String channel_id, int cstart, int cend) {
    String path = "/home/q/news_list_for_channel?channel_id=" + channel_id
            + "&cstart=" + cstart + "&cend=" + cend
            + "&infinite=true&refresh=1&__from__=pc&multi=5";

    // Token seed, masked one code unit at a time.
    String seed = "sptoken" + channel_id + cstart + cend;
    StringBuilder masked = new StringBuilder(seed.length());
    for (int i = 0; i < seed.length(); i++) {
        masked.append((char) (seed.charAt(i) ^ 10));
    }

    StringBuilder signed = new StringBuilder(path);
    signed.append(path.indexOf('?') >= 0 ? "&_spt=" : "?_spt=");
    // Percent-encode the UTF-8 bytes, leaving encodeURIComponent's unreserved set as is.
    for (byte raw : masked.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8)) {
        int c = raw & 0xFF;
        boolean unreserved = (c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9')
                || "-_.!~*'()".indexOf(c) >= 0;
        if (unreserved) {
            signed.append((char) c);
        } else {
            signed.append('%')
                    .append(Character.toUpperCase(Character.forDigit(c >> 4, 16)))
                    .append(Character.toUpperCase(Character.forDigit(c & 0xF, 16)));
        }
    }
    return signed.toString();
}
/** /**
* *
...@@ -140,10 +176,27 @@ public class Yidianzixun { ...@@ -140,10 +176,27 @@ public class Yidianzixun {
} }
return dataList; return dataList;
} catch (Exception e) { } catch (Exception e) {
logger.error("获取一点资讯数据失败",e.getMessage()); logger.error("获取一点资讯数据失败 {}",e);
e.printStackTrace(); }
return dataList; return dataList;
} }
/**
*
* @Description 一点资讯评论数获取 (http://www.yidianzixun.com/article/0LQaOacC)
* @param id 0LQaOacC
* @param proxy
* @return
*/
public static int getYidianzixunCommentCount(String id,ProxyHolder proxy) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet("http://www.yidianzixun.com/home/q/getcomments?&docid=" + id + "&s=&count=30&last_comment_id=&appid=web_yidian"), proxy)){
String result = response.body().string();
JSONObject json = JSONObject.parseObject(result);
return json.getInteger("total");
} catch (Exception e) {
logger.error(" 一点资讯 评论数解析出错 {}",e);
}
return -1;
} }
} }
...@@ -14,8 +14,9 @@ import org.slf4j.LoggerFactory; ...@@ -14,8 +14,9 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyFactory; import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
import okhttp3.Response; import okhttp3.Response;
...@@ -23,7 +24,7 @@ import okhttp3.Response; ...@@ -23,7 +24,7 @@ import okhttp3.Response;
public class Youku { public class Youku {
private static final Logger logger = LoggerFactory.getLogger(Youku.class); private static final Logger logger = LoggerFactory.getLogger(Youku.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot(false,2);
public static List<Map<String,Object>> getDataList(String word) { public static List<Map<String,Object>> getDataList(String word) {
String aaid = "9cae49f0e031664b00d8f9c108e586ab"; String aaid = "9cae49f0e031664b00d8f9c108e586ab";
...@@ -49,6 +50,7 @@ public class Youku { ...@@ -49,6 +50,7 @@ public class Youku {
map.put("url", "https:"+surl); map.put("url", "https:"+surl);
map.put("time", time.replaceAll("上传时间:", "").split(" ")[0]); map.put("time", time.replaceAll("上传时间:", "").split(" ")[0]);
map.put("uper",time.replace(time.split("上传者:")[0], "")); map.put("uper",time.replace(time.split("上传者:")[0], ""));
map.put("word", word);
list.add(map); list.add(map);
} }
} }
...@@ -61,5 +63,29 @@ public class Youku { ...@@ -61,5 +63,29 @@ public class Youku {
return list; return list;
} }
/**
 * Reads the "heat" figure shown for a Youku video.
 *
 * <p>The page's {@code meta[name='title']} content is used as the video
 * title. The first {@code div.item.item-cover} entry under
 * {@code div#listitem_page1} whose HTML contains that title supplies the
 * value, taken from its {@code div.status > span} text.
 *
 * @param url   address of the Youku page to download
 * @param proxy proxy handed to the HTTP call
 * @return the parsed heat value, or {@code -1} when no entry matches or the
 *         request or the parsing fails (failures are logged)
 */
public static int getYoukuHotCount(String url,ProxyHolder proxy) {
    try (Response pageResponse = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
        Document page = Jsoup.parse(pageResponse.body().string());
        String videoTitle = page.select("meta[name='title']").attr("content");
        Elements listItems = page.select("div#listitem_page1").select("div.item.item-cover");
        for (Element item : listItems) {
            if (!item.toString().contains(videoTitle)) {
                continue;
            }
            String statusText = item.select("div.status > span").text();
            String heat = statusText.replace("热度 ", "");
            return Integer.parseInt(heat);
        }
    } catch (Exception e) {
        logger.error("优酷热度采集出错{}",e);
    }
    return -1;
}
} }
...@@ -18,8 +18,7 @@ import com.zhiwei.tools.timeparse.TimeParse; ...@@ -18,8 +18,7 @@ import com.zhiwei.tools.timeparse.TimeParse;
public class AiqiyiByWordAnalysis { public class AiqiyiByWordAnalysis {
private static Logger logger = LoggerFactory.getLogger(AiqiyiByWordAnalysis.class); private static Logger logger = LoggerFactory.getLogger(AiqiyiByWordAnalysis.class);
public List<Map<String,Object>> getAiqiyiData(String result,String word) {
public List<Map<String,Object>> getAiqiyiData(String result) {
List<Map<String,Object>> dataMap = new ArrayList<>(); List<Map<String,Object>> dataMap = new ArrayList<>();
try { try {
Document doc = Jsoup.parse(result); Document doc = Jsoup.parse(result);
...@@ -28,11 +27,14 @@ public class AiqiyiByWordAnalysis { ...@@ -28,11 +27,14 @@ public class AiqiyiByWordAnalysis {
Map<String, Object> map = new HashMap<>(); Map<String, Object> map = new HashMap<>();
String title = element.select("li").attr("data-widget-searchlist-tvname"); String title = element.select("li").attr("data-widget-searchlist-tvname");
String time = element.select("em.result_info_desc").text().split(" ")[0]; String time = element.select("em.result_info_desc").text().split(" ")[0];
if(element.select("label.result_info_lbl").text().contains("上传者")) {
map.put("source", element.select("a.result_info_link").text());
}
String uurl = element.select("h3.result_title > a").attr("href"); String uurl = element.select("h3.result_title > a").attr("href");
map.put("time", TimeParse.stringFormartDate(time)); map.put("time", TimeParse.stringFormartDate(time));
map.put("url", uurl); map.put("url", uurl);
map.put("title", title); map.put("title", title);
System.out.println(map.toString()); map.put("word", word);
dataMap.add(map); dataMap.add(map);
} }
return dataMap; return dataMap;
...@@ -42,7 +44,6 @@ public class AiqiyiByWordAnalysis { ...@@ -42,7 +44,6 @@ public class AiqiyiByWordAnalysis {
} }
} }
// public String getSource(String url,ProxyHolder proxy) { // public String getSource(String url,ProxyHolder proxy) {
// Map<String,String> headerMap = HeadGet.getAiqiyiForCountHeaderMap(null); // Map<String,String> headerMap = HeadGet.getAiqiyiForCountHeaderMap(null);
// System.out.println(url); // System.out.println(url);
......
...@@ -15,7 +15,7 @@ import org.slf4j.LoggerFactory; ...@@ -15,7 +15,7 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
......
package com.zhiwei.parse.analysis; package com.zhiwei.parse.analysis;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
...@@ -12,7 +13,7 @@ import org.jsoup.select.Elements; ...@@ -12,7 +13,7 @@ import org.jsoup.select.Elements;
public class BilibilikeyWordAnalysis { public class BilibilikeyWordAnalysis {
public static Map<String,Object> getData(String result) { public static Map<String,Object> getData(String result,String word) {
try { try {
Document doc = Jsoup.parse(result); Document doc = Jsoup.parse(result);
boolean more = false; boolean more = false;
...@@ -43,17 +44,17 @@ public class BilibilikeyWordAnalysis { ...@@ -43,17 +44,17 @@ public class BilibilikeyWordAnalysis {
map.put("time", time); map.put("time", time);
map.put("source", source); map.put("source", source);
map.put("submitcount", submitcount); map.put("submitcount", submitcount);
map.put("word", word);
dataList.add(map); dataList.add(map);
System.out.println(map.toString());
} }
Map<String,Object> rmap = new HashMap<String,Object>(); Map<String,Object> rmap = new HashMap<>();
rmap.put("more", more); rmap.put("more", more);
rmap.put("data", dataList); rmap.put("data", dataList);
return rmap; return rmap;
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
} }
return null; return Collections.emptyMap();
} }
} }
...@@ -15,7 +15,7 @@ import org.slf4j.LoggerFactory; ...@@ -15,7 +15,7 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
......
...@@ -12,7 +12,7 @@ import org.slf4j.LoggerFactory; ...@@ -12,7 +12,7 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient; import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
......
...@@ -13,8 +13,8 @@ import org.slf4j.LoggerFactory; ...@@ -13,8 +13,8 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
......
package com.zhiwei.parse.analysis; package com.zhiwei.parse.analysis;
import static java.util.Objects.nonNull;
import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import static java.util.Objects.nonNull;
import java.util.ArrayList;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
......
package com.zhiwei.parse.analysis; package com.zhiwei.parse.analysis;
import static java.util.Objects.nonNull;
import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import static java.util.Objects.nonNull;
import java.util.ArrayList;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
......
package com.zhiwei.parse.analysis; package com.zhiwei.parse.analysis;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
...@@ -23,11 +25,11 @@ public class QQKBAccountAnalysis { ...@@ -23,11 +25,11 @@ public class QQKBAccountAnalysis {
public List<Map<String,Object>> analysisQQAccountData(String result) { public List<Map<String,Object>> analysisQQAccountData(String result) {
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONArray("newslist"); JSONArray jsonArry = json.getJSONArray("newslist");
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<>();
try { try {
for(int i = 0;i < jsonArry.size();i++) { for(int i = 0;i < jsonArry.size();i++) {
JSONObject data = jsonArry.getJSONObject(i); JSONObject data = jsonArry.getJSONObject(i);
Map<String,Object> map = new HashMap<String,Object>(); Map<String,Object> map = new HashMap<>();
map.put("url", data.getString("url_comment")); map.put("url", data.getString("url_comment"));
map.put("time", data.getString("time")); map.put("time", data.getString("time"));
map.put("title", data.getString("title")); map.put("title", data.getString("title"));
...@@ -38,8 +40,8 @@ public class QQKBAccountAnalysis { ...@@ -38,8 +40,8 @@ public class QQKBAccountAnalysis {
dataList.add(map); dataList.add(map);
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("解析出错",e.getMessage()); logger.error("解析出错 {}",e);
return null; return Collections.emptyList();
} }
return dataList; return dataList;
} }
...@@ -51,18 +53,20 @@ public class QQKBAccountAnalysis { ...@@ -51,18 +53,20 @@ public class QQKBAccountAnalysis {
* @return * @return
*/ */
public List<String> getQQAllIds(String result) { public List<String> getQQAllIds(String result) {
List<String> list = new ArrayList<String>(); List<String> list = new ArrayList<>();
try { try {
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONArray("ids"); JSONArray jsonArry = json.getJSONArray("ids");
if(Objects.nonNull(jsonArry) ) {
for(int i = 0;i < jsonArry.size();i++) { for(int i = 0;i < jsonArry.size();i++) {
JSONObject data = jsonArry.getJSONObject(i); JSONObject data = jsonArry.getJSONObject(i);
list.add(data.getString("id")); list.add(data.getString("id"));
} }
}
return list; return list;
} catch (Exception e) { } catch (Exception e) {
logger.error("获取企鹅号所有id出错",e.getMessage()); logger.error("获取企鹅号所有id出错 {}",e);
return null; return Collections.emptyList();
} }
} }
......
package com.zhiwei.parse.analysis; package com.zhiwei.parse.analysis;
import static java.util.Objects.nonNull; import static java.util.Objects.nonNull;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.Date; import java.util.Date;
...@@ -28,9 +29,9 @@ public class SinaTousuAnalysis { ...@@ -28,9 +29,9 @@ public class SinaTousuAnalysis {
for(int i = 0;i < jsonArray.size() ;i++) { for(int i = 0;i < jsonArray.size() ;i++) {
JSONObject data = jsonArray.getJSONObject(i); JSONObject data = jsonArray.getJSONObject(i);
String ctime = TimeParse.dateFormartString(new Date(data.getJSONObject("main").getLong("timestamp")*1000L), "yyyy-MM-dd HH:mm:ss"); String ctime = TimeParse.dateFormartString(new Date(data.getJSONObject("main").getLong("timestamp")*1000L), "yyyy-MM-dd HH:mm:ss");
if(!nonNull(time) || ctime.compareTo(time) <= 0) { // if(nonNull(time) || ctime.compareTo(time) <= 0) {
continue; // continue;
} // }
Map<String,Object> map = new HashMap<>(); Map<String,Object> map = new HashMap<>();
map.put("title", data.getJSONObject("main").getString("title").replaceAll("<.*?>", "")); map.put("title", data.getJSONObject("main").getString("title").replaceAll("<.*?>", ""));
map.put("url", "https:" + data.getJSONObject("main").getString("url")); map.put("url", "https:" + data.getJSONObject("main").getString("url"));
......
...@@ -3,14 +3,15 @@ package com.zhiwei.parse.analysis; ...@@ -3,14 +3,15 @@ package com.zhiwei.parse.analysis;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Objects;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient; import com.zhiwei.httpclient.HttpClient;
...@@ -30,24 +31,58 @@ public class SouhuCommentAnalysis { ...@@ -30,24 +31,58 @@ public class SouhuCommentAnalysis {
public String getSouhuURL(String url,ProxyHolder proxy) { public String getSouhuURL(String url,ProxyHolder proxy) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){ try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
String result = response.body().string(); String result = response.body().string();
String source_id = result.split("news_id: \"")[1].split("\",")[0]; String sourceId = getNewsId(result);
String topic_id = result.split("media_id: \"")[1].split("\",")[0]; String topicId = getTopicId(result);
return "http://apiv2.sohu.com/api/comment/list?page_size=10&topic_id="+topic_id+"&source_id=mp_"+source_id; if(Objects.nonNull(topicId) && Objects.nonNull(sourceId)) {
return "http://apiv2.sohu.com/api/comment/list?page_size=10&topic_id="+topicId+"&source_id=mp_"+sourceId;
}
} catch (Exception e) { } catch (Exception e) {
logger.error("Exception {} ",e); logger.error("Exception {} ",e);
} }
return null; return null;
} }
/**
 * Extracts the {@code media_id} value from a Sohu article page.
 *
 * <p>Two page layouts are handled, selected by which news-id marker the
 * page contains: pages with {@code news_id} use a double-quoted
 * {@code media_id}, pages with {@code newsId} use a single-quoted one.
 * When both markers are present the second layout wins.
 *
 * @param result body of the article page
 * @return the extracted id, or {@code null} when neither marker is present
 *         or the extraction fails (the failure is logged)
 */
private String getTopicId(String result) {
    String mediaId = null;
    try {
        if (result.contains("news_id")) {
            String tail = result.split("media_id: \"")[1];
            mediaId = tail.split("\",")[0];
        }
        if (result.contains("newsId")) {
            String tail = result.split("media_id: '")[1];
            mediaId = tail.split("',")[0];
        }
    } catch (Exception e) {
        logger.error("获取topicID出错");
        // A failure in either branch discards any value found so far.
        mediaId = null;
    }
    return mediaId;
}
/**
 * Extracts the news id from a Sohu article page.
 *
 * <p>Two page layouts are handled: a double-quoted {@code news_id: "..."}
 * entry and a single-quoted {@code newsId : '...'} entry. When both markers
 * are present the second layout wins.
 *
 * @param result body of the article page
 * @return the extracted id, or {@code null} when neither marker is present
 *         or the extraction fails (the failure is logged)
 */
private String getNewsId(String result) {
    String newsId = null;
    try {
        if (result.contains("news_id")) {
            String tail = result.split("news_id: \"")[1];
            newsId = tail.split("\",")[0];
        }
        if (result.contains("newsId")) {
            String tail = result.split("newsId : '")[1];
            newsId = tail.split("',")[0];
        }
    } catch (Exception e) {
        logger.error("获取sourceId出错");
        // A failure in either branch discards any value found so far.
        newsId = null;
    }
    return newsId;
}
public int getSouhuCommentCount(String url,ProxyHolder proxy) { public int getSouhuCommentCount(String url,ProxyHolder proxy) {
Map<String,String> headerMap = HeadGet.getSouhuCommentHeaderMap(null); Map<String,String> headerMap = HeadGet.getSouhuCommentHeaderMap(null);
int i;
try { try {
System.out.println(url);
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap); String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
i = json.getJSONObject("jsonObject").getInteger("cmt_sum"); if(json.getInteger("code") == 500) {
return i; return 0;
}
return json.getJSONObject("jsonObject").getInteger("cmt_sum");
} catch (Exception e) { } catch (Exception e) {
logger.error("获取搜狐评论数信息出错 {}",e); logger.error("获取搜狐评论数信息出错 {}",e);
return -1; return -1;
...@@ -75,13 +110,24 @@ public class SouhuCommentAnalysis { ...@@ -75,13 +110,24 @@ public class SouhuCommentAnalysis {
map.put("comment_id", data.getString("comment_id")); map.put("comment_id", data.getString("comment_id"));
map.put("reply_id", data.getString("reply_id")); map.put("reply_id", data.getString("reply_id"));
} catch (Exception e) { } catch (Exception e) {
System.out.println(data.toString()); logger.error("解析出错 {}",e);
System.out.println(map.toString());
logger.error("解析出错",e.getMessage());
} }
return map; return map;
} }
/**
 * Looks up the read count of a Sohu article.
 *
 * <p>The article page is downloaded first to extract its news id. That id
 * is then passed to the {@code public-api/articles/pv} endpoint, whose JSON
 * response is expected to map the id to the count.
 *
 * @param url   address of the Sohu article page
 * @param proxy proxy handed to both HTTP calls
 * @return the count reported for the article, or {@code -1} when the news
 *         id cannot be extracted, the response holds no value for it, or a
 *         request or parsing step fails (failures are logged)
 */
public int getReadNum(String url, ProxyHolder proxy) {
    try (Response pageResponse = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
        String sourceId = getNewsId(pageResponse.body().string());
        if (sourceId == null) {
            // Without an id the second request would ask for "articleIds=null".
            logger.error("Sohu news id not found, url = {}", url);
            return -1;
        }
        String pvUrl = "http://v2.sohu.com/public-api/articles/pv?articleIds=" + sourceId;
        // The second response is a resource as well and must be closed.
        try (Response pvResponse = httpBoot.syncCall(RequestUtils.wrapGet(pvUrl), proxy)) {
            Integer readNum = JSONObject.parseObject(pvResponse.body().string()).getInteger(sourceId);
            if (readNum == null) {
                logger.error("Sohu read count missing for article {}, url = {}", sourceId, url);
                return -1;
            }
            return readNum;
        }
    } catch (Exception e) {
        logger.error("Exception {} ",e);
    }
    return -1;
}
......
package com.zhiwei.parse.analysis; package com.zhiwei.parse.analysis;
import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
...@@ -19,19 +20,21 @@ public class YidianzixunAccountAnalysis { ...@@ -19,19 +20,21 @@ public class YidianzixunAccountAnalysis {
* @return * @return
*/ */
public Map<String,Object> parseJsonByAccount(JSONObject data) { public Map<String,Object> parseJsonByAccount(JSONObject data) {
Map<String,Object> map = new HashMap<String, Object>(); Map<String,Object> map = new HashMap<>();
try { try {
if(data.containsKey("url")) {
map.put("title", data.getString("title")); map.put("title", data.getString("title"));
map.put("time", data.getString("date")); map.put("time", data.getString("date"));
map.put("comment_count", data.getString("comment_count")==null?0:data.getString("comment_count")); map.put("comment_count", data.getString("comment_count")==null?0:data.getString("comment_count"));
map.put("ctype", data.getString("ctype")); map.put("ctype", data.getString("ctype"));
map.put("source", data.getString("source")); map.put("source", data.getString("source"));
map.put("url", data.getString("url")); map.put("url", "http://www.yidianzixun.com/article/" + data.getString("docid"));
map.put("summary", data.getString("summary")); map.put("summary", data.getString("summary"));
}else {
return Collections.emptyMap();
}
} catch (Exception e) { } catch (Exception e) {
System.out.println(data.toString()); logger.error("解析此条出错",e);
System.out.println(map.toString());
logger.error("解析此条出错",e.getMessage());
} }
return map; return map;
} }
......
...@@ -18,9 +18,9 @@ import org.slf4j.LoggerFactory; ...@@ -18,9 +18,9 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyFactory; import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response; import okhttp3.Response;
...@@ -54,7 +54,7 @@ public class QQTV { ...@@ -54,7 +54,7 @@ public class QQTV {
String nurl = element.select("h2.result_title").select("a").attr("href"); String nurl = element.select("h2.result_title").select("a").attr("href");
Map<String,Object> map = getUrlData(nurl, ProxyFactory.getNatProxy()); Map<String,Object> map = getUrlData(nurl, ProxyFactory.getNatProxy());
if(Objects.nonNull(map) && time.compareTo(String.valueOf(map.get("time"))) < 1) { if(Objects.nonNull(map) && time.compareTo(String.valueOf(map.get("time"))) < 1) {
// System.out.println(map.toString()); map.put("word", word);
dataList.add(map); dataList.add(map);
} }
ZhiWeiTools.sleep(50); ZhiWeiTools.sleep(50);
......
...@@ -15,7 +15,7 @@ import org.slf4j.Logger; ...@@ -15,7 +15,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
...@@ -52,6 +52,7 @@ public class SohuTV { ...@@ -52,6 +52,7 @@ public class SohuTV {
map.put("time",TimeParse.stringFormartDate(time)); map.put("time",TimeParse.stringFormartDate(time));
map.put("url","https://" + nurl); map.put("url","https://" + nurl);
map.put("playCount",amountOfPlay); map.put("playCount",amountOfPlay);
map.put("word", word);
dataList.add(map); dataList.add(map);
} }
} catch (Exception e) { } catch (Exception e) {
......
package com.zhiwei.Comment;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Aiqiyi;
/**
 * Manual, environment-dependent test: reads rows from a local Excel workbook, asks
 * {@code Aiqiyi.aiqiyiHotCount} for a count for each row's "链接" URL, and writes the
 * rows back to the same workbook with an extra "count" column.
 *
 * NOTE(review): depends on a hard-coded Windows path and a ZooKeeper address on the
 * internal network, so it only runs on the author's machine.
 */
public class AiqiyiHotCountTest {

    @Test
    public void f() {
        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        String path = "C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\爱奇艺.xlsx";

        // Sheet 0 is imported as a map holding a "head" list and a "body" list.
        Map<String,Object> sheet = excel.importExcel(path, 0);
        List<Map<String,Object>> rows = (List<Map<String, Object>>) sheet.get("body");

        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

        List<String> columns = (List<String>) sheet.get("head");
        columns.add("count");

        for (Map<String,Object> row : rows) {
            Object link = row.get("链接");
            String url = String.valueOf(link);
            int hotCount = Aiqiyi.aiqiyiHotCount(url, ProxyHolder.NAT_PROXY);
            System.out.println(url + " -- " + hotCount);
            row.put("count", hotCount);
        }

        // Overwrites the input workbook with the enriched rows.
        excel.exportExcel(path, "data", columns, rows);
    }
}
package com.zhiwei.Comment; //package com.zhiwei.Comment;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.testng.annotations.Test; //import org.testng.annotations.Test;
//
import com.zhiwei.common.config.GroupType; //import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory; //import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder; //import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Maimai; //import com.zhiwei.parse.Maimai;
import com.zhiwei.tools.tools.ZhiWeiTools; //import com.zhiwei.tools.tools.ZhiWeiTools;
//
public class MaimaiCommentCountTest { //public class MaimaiCommentCountTest {
@Test // @Test
public void f() { // public void f() {
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", // ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
GroupType.PROVIDER); // GroupType.PROVIDER);
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
Map<String, Object> map = poi // Map<String, Object> map = poi
.importExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", 0); // .importExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", 0);
List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body"); // List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550814253444; token=\"rhItcea5qkO6WCSnVcczW/NRVLLCTsq3kQbpUCGAwQ0ceLunVJRjT5rgoFVYrIBA8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiVjJuNHdCVDBncVNacTRxVllGM29jRUVwIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUwOTAyMTY3MDQ5LCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=zbs4cHtzTcHWvjtkpjAZmoqLXsQ"; // String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550814253444; token=\"rhItcea5qkO6WCSnVcczW/NRVLLCTsq3kQbpUCGAwQ0ceLunVJRjT5rgoFVYrIBA8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiVjJuNHdCVDBncVNacTRxVllGM29jRUVwIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUwOTAyMTY3MDQ5LCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=zbs4cHtzTcHWvjtkpjAZmoqLXsQ";
List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>(); // List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
List<String> headList = (List<String>) map.get("head"); // List<String> headList = (List<String>) map.get("head");
for (Map<String, Object> map1 : list) { // for (Map<String, Object> map1 : list) {
String url = map1.get("地址") + ""; // String url = map1.get("地址") + "";
Map<String,Object> map3 = Maimai.getMaiaiCount(url,null, ProxyHolder.NAT_PROXY); // Map<String,Object> map3 = Maimai.getMaiaiCount(url,null, ProxyHolder.NAT_PROXY);
System.out.println(map3.toString()); // System.out.println(map3.toString());
System.out.println(url); // System.out.println(url);
map1.putAll(map3); // map1.putAll(map3);
ZhiWeiTools.sleep(500); // ZhiWeiTools.sleep(500);
System.out.println("--------------------------"); // System.out.println("--------------------------");
} // }
headList.add("like"); // headList.add("like");
headList.add("spreads"); // headList.add("spreads");
headList.add("cmts"); // headList.add("cmts");
poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", "评论采集", headList, // poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", "评论采集", headList,
list); // list);
} // }
} //}
//package com.zhiwei.Comment;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.parse.QQNews;
//
//public class QQNewCommentCountTest {
// @Test
// public void qqNewCommentCountTest() {
// String id = "TEC2019030500050000";
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// System.out.println(ProxyFactory.getNatProxy());
// int i = QQNews.getQQNewsCommentCount(id, ProxyHolder.NAT_PROXY);
// System.out.println(i);
// }
//}
//package com.zhiwei.Comment;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.parse.SinaKeji;
//
//public class SinkeCommentCountTest {
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// String url = "https://k.sina.cn/article_6972257940_19f94369400100fyxr.html?cre=tianyi&mod=nfin&loc=10&r=0&rfunc=24&tj=cxvertical_nfin&tr=12&fromsinago=1&http=fromhttp";
//
// int i = SinaKeji.getCommentCount(url, ProxyHolder.NAT_PROXY);
// System.out.println(i);
// }
//}
//package com.zhiwei.Comment;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.parse.TXNews;
//
//public class TxNewsCommentCountTest {
// @Test
// public void txNewsCommentCountTest() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local", GroupType.PROVIDER);
// String url = "https://view.inews.qq.com/a/20190207A0480R00";
//
// System.out.println(TXNews.getTxNewsCommentCount(url, ProxyHolder.NAT_PROXY));
// }
//}
//package com.zhiwei.Comment;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.parse.Yidianzixun;
//
//public class YidianzixunCommentCountTest {
// @Test
// public void yidianzixunCommentCountTest() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// String id = "0LQaOacC";
// int i = Yidianzixun.getYidianzixunCommentCount(id, ProxyHolder.NAT_PROXY);
// System.out.println(i);
// }
//}
package com.zhiwei.Comment;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Aiqiyi;
import com.zhiwei.parse.Youku;
/**
 * Manual, environment-dependent test: reads rows from a local Excel workbook, asks
 * {@code Youku.getYoukuHotCount} for a count for each row's "链接" URL, and writes the
 * rows back to the same workbook with an extra "count" column.
 *
 * NOTE(review): depends on a hard-coded Windows path and a ZooKeeper address on the
 * internal network, so it only runs on the author's machine.
 */
public class YoukuHotCountTest {

    @Test
    public void f() {
        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        String path = "C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\优酷.xlsx";

        // Sheet 0 is imported as a map holding a "head" list and a "body" list.
        Map<String,Object> sheet = excel.importExcel(path, 0);
        List<Map<String,Object>> rows = (List<Map<String, Object>>) sheet.get("body");

        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

        List<String> columns = (List<String>) sheet.get("head");
        columns.add("count");

        for (Map<String,Object> row : rows) {
            Object link = row.get("链接");
            String url = String.valueOf(link);
            int hotCount = Youku.getYoukuHotCount(url, ProxyHolder.NAT_PROXY);
            System.out.println(url + " -- " + hotCount);
            row.put("count", hotCount);
        }

        // Overwrites the input workbook with the enriched rows.
        excel.exportExcel(path, "data", columns, rows);
    }
}
package com.zhiwei; //package com.zhiwei;
//
import java.io.IOException; //import java.io.IOException;
import java.util.HashMap; //import java.util.HashMap;
import java.util.Map; //import java.util.Map;
//
import java.util.HashMap; //import java.util.HashMap;
//
import org.testng.annotations.Test; //import org.testng.annotations.Test;
//
import com.zhiwei.crawler.core.HttpBoot; //import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; //import com.zhiwei.crawler.core.RequestUtils;
//
public class TestHttpBoot { //public class TestHttpBoot {
@Test // @Test
public void f() { // public void f() {
HttpBoot httpBoot = new HttpBoot(); // HttpBoot httpBoot = new HttpBoot();
String url = "https://www.toutiao.com/c/user/following/?user_id=1034006366&count=20&_signature=wp5wPBAVmXlosTC8Fobui8KecC"; // String url = "https://www.toutiao.com/c/user/following/?user_id=1034006366&count=20&_signature=wp5wPBAVmXlosTC8Fobui8KecC";
Map<String,Object> headers = new HashMap<>(); // Map<String,Object> headers = new HashMap<>();
headers.put("referer", "https://www.qctt.cn/news/349056"); // headers.put("referer", "https://www.qctt.cn/news/349056");
headers.put("cookie", "PHPSESSID=3rd6bvonb4g15t1fp777mjums0; Hm_lvt_70af9ea91e7adc8195f6d49511b9a2f1=1542253722; open_ad=1; Hm_lpvt_70af9ea91e7adc8195f6d49511b9a2f1=1542271394; vcode=sqmm; XSRF-TOKEN=eyJpdiI6IlFTNzkyYWNcLzB2SUwyN2dcL1hhUlpsZz09IiwidmFsdWUiOiJRSUpycjZJNGx3d1hUWkpOQUl1R2psSStuVU0yYW8xT1YxXC9QOFY1NjdyRXNrMWpFVE1kSm9IQ1o5Nm5keXlMTEFnZXdCOHVvWDg0U2picTE1cjZzMkE9PSIsIm1hYyI6IjZlYzk5NDI3ODEzMzA3ZTJjNDc3M2ZjMjBlNDJhZjc2YjU2ODFmYmY3YWRlMzdlMzM1NTBlNWMxNDk3MjFiZDEifQ%3D%3D; laravel_session=eyJpdiI6InJQMnByeFlIbXVhaUVVVVBLK1wvaXlRPT0iLCJ2YWx1ZSI6IlhUOUtIS2ZQZ0ZKNFh1RDVQYjBjSVZkVkpQZTdYRDNpa1wvV0o5QlJPbk8xZE0rQ3dZdnFMdjcya011ejVkdWEwUk1Qa29Zb2Y3OU0yUGkrWDF4Wk5adz09IiwibWFjIjoiZGJiYjlkNWZhNmJhMDFiMjkxYTAyMmUwZTEyMWVmZTQ0NmJiZDQ2ZGU3ZjNjNmUzNTIwZGI0NTc4NDJlZjNiMCJ9"); // headers.put("cookie", "PHPSESSID=3rd6bvonb4g15t1fp777mjums0; Hm_lvt_70af9ea91e7adc8195f6d49511b9a2f1=1542253722; open_ad=1; Hm_lpvt_70af9ea91e7adc8195f6d49511b9a2f1=1542271394; vcode=sqmm; XSRF-TOKEN=eyJpdiI6IlFTNzkyYWNcLzB2SUwyN2dcL1hhUlpsZz09IiwidmFsdWUiOiJRSUpycjZJNGx3d1hUWkpOQUl1R2psSStuVU0yYW8xT1YxXC9QOFY1NjdyRXNrMWpFVE1kSm9IQ1o5Nm5keXlMTEFnZXdCOHVvWDg0U2picTE1cjZzMkE9PSIsIm1hYyI6IjZlYzk5NDI3ODEzMzA3ZTJjNDc3M2ZjMjBlNDJhZjc2YjU2ODFmYmY3YWRlMzdlMzM1NTBlNWMxNDk3MjFiZDEifQ%3D%3D; laravel_session=eyJpdiI6InJQMnByeFlIbXVhaUVVVVBLK1wvaXlRPT0iLCJ2YWx1ZSI6IlhUOUtIS2ZQZ0ZKNFh1RDVQYjBjSVZkVkpQZTdYRDNpa1wvV0o5QlJPbk8xZE0rQ3dZdnFMdjcya011ejVkdWEwUk1Qa29Zb2Y3OU0yUGkrWDF4Wk5adz09IiwibWFjIjoiZGJiYjlkNWZhNmJhMDFiMjkxYTAyMmUwZTEyMWVmZTQ0NmJiZDQ2ZGU3ZjNjNmUzNTIwZGI0NTc4NDJlZjNiMCJ9");
headers.put("origin", "https://www.qctt.cn"); // headers.put("origin", "https://www.qctt.cn");
Map<String,Object> params = new HashMap<>(); // Map<String,Object> params = new HashMap<>();
params.put("id", "349056"); // params.put("id", "349056");
params.put("page", "3"); // params.put("page", "3");
params.put("_token", "EJ58V0qilRw7P77czp0U6iO9QW2IOS1ZGiBk4wH1"); // params.put("_token", "EJ58V0qilRw7P77czp0U6iO9QW2IOS1ZGiBk4wH1");
try { // try {
String result = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string(); // String result = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string();
System.out.println(result); // System.out.println(result);
//
} catch (IOException e) { // } catch (IOException e) {
// TODO Auto-generated catch block // // TODO Auto-generated catch block
e.printStackTrace(); // e.printStackTrace();
} // }
//
//
} // }
} //}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Aiqiyi;
//import com.zhiwei.util.WordReadFile;
//
//public class AiqiyiByWordExample {
//
//
// @Test
// public void aiqiyiByWordTest() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(String w : wordList) {
// List<Map<String,Object>> dataList = Aiqiyi.getAiqiyiByWordData(w,null);
// if(dataList != null && dataList.size() >= 1) {
// bodyList.addAll(dataList);
// }
// }
// List<String> headList = new ArrayList<String>();
// headList.add("count");
// headList.add("time");
// headList.add("source");
// headList.add("content");
// headList.add("url");
// headList.add("title");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata/爱奇艺关键词采集.xlsx", "数据", headList, bodyList);
//
//
//
// }
//
//
//
//}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.Arrays; //import java.util.Arrays;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Maimai; //import com.zhiwei.parse.Maimai;
//
public class MaimaiBywordExample { //public class MaimaiBywordExample {
//
public static void main(String[] args) { // public static void main(String[] args) {
String word = "美团|某团|MT|大众点评|新美大|美团点评"; // String word = "美团|某团|MT|大众点评|新美大|美团点评";
String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550814253444; token=\"G8eNNNylPoi3oIPLUr/d/RDaMgtnpZCskxT7wu1pRRrkiy3J8G7StHgTx9DQBq4O8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiVjJuNHdCVDBncVNacTRxVllGM29jRUVwIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUwOTAwNjY1Njg4LCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=b_tga85tZskxsgKX8YIM_JKByi0"; // String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550814253444; token=\"G8eNNNylPoi3oIPLUr/d/RDaMgtnpZCskxT7wu1pRRrkiy3J8G7StHgTx9DQBq4O8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiVjJuNHdCVDBncVNacTRxVllGM29jRUVwIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUwOTAwNjY1Njg4LCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=b_tga85tZskxsgKX8YIM_JKByi0";
String time = "2019-02-15 00:00:00"; // String time = "2019-02-15 00:00:00";
String[] words = word.split("\\|"); // String[] words = word.split("\\|");
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); // List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for(String w : words) { // for(String w : words) {
//实名动态 // //实名动态
// List<Map<String,Object>> c = Maimai.getData(w, cookie, time, null); //// List<Map<String,Object>> c = Maimai.getData(w, cookie, time, null);
//职言交流 // //职言交流
List<Map<String,Object>> c2 = Maimai.getDataByNoName(w, cookie, time, null); // List<Map<String,Object>> c2 = Maimai.getDataByNoName(w, cookie, time, null);
// bodyList.addAll(c); //// bodyList.addAll(c);
bodyList.addAll(c2); // bodyList.addAll(c2);
} // }
List<String> headList = Arrays.asList("time","url","text","name","like","comment_count","spreads","word"); // List<String> headList = Arrays.asList("time","url","text","name","like","comment_count","spreads","word");
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团-0222.xlsx", "脉脉关键词", headList, bodyList); // poi.exportExcel("D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团-0222.xlsx", "脉脉关键词", headList, bodyList);
} // }
//
} //}
...@@ -24,7 +24,8 @@ public class QQAccountExample { ...@@ -24,7 +24,8 @@ public class QQAccountExample {
String child = map.get("帐号链接")+""; String child = map.get("帐号链接")+"";
// System.out.println(child.split("chlid=")[1]); // System.out.println(child.split("chlid=")[1]);
System.out.println(child.split("=")[1]); System.out.println(child.split("=")[1]);
List<Map<String,Object>> lists = QQKB.getQQAccountData(child.split("=")[1], cookie,null);
List<Map<String,Object>> lists = QQKB.getQQAccountData("5001789", cookie,null);
if(lists != null) { if(lists != null) {
for(Map<String,Object> map1 : lists) { for(Map<String,Object> map1 : lists) {
map1.put("name", map.get("呢称")); map1.put("name", map.get("呢称"));
......
...@@ -9,10 +9,10 @@ public class QQKBCommentCountExample { ...@@ -9,10 +9,10 @@ public class QQKBCommentCountExample {
@Test @Test
public void qqkbCommentCountTest() { public void qqkbCommentCountTest() {
String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=0003000049dd058f533cbebb240223ede63b864224f7eebe0f4aeca6a623572bb290a5800741d191a5768bb0;%20uin=o0497332654;%20skey=MIZmc2Oel3;%20sigA2=4282ABA809551D3534C72F999EE8F2A75219ED9452DEF04E4CBCE6B680C2C893C3E1BA617F5E0F387E558888B2ABEDFE87A4A25B16F9066C1154B2BC7A1133CA7B356AB9D3BA26ED;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwgGT4n96Oq-jHALnMUe8UzpoJghQDouvfSSWdh-JOdgAm3jRJUPbux6fcIPghoNxo24xdED8ennAANksJuHiwdw;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"; String cookie = "";
String url = "https://tech.sina.cn/i/gn/2018-04-26/detail-ifztkpin4282154.d.html?pos=18"; String url = "https://kuaibao.qq.com/s/20190305A16P6L00";
int i = QQKB.getCommentCount(cookie, url,null); int i = QQKB.getCommentCount(url,null);
System.out.println(i); System.out.println(i);
} }
......
...@@ -28,8 +28,11 @@ public class SouhuCommentCountExample { ...@@ -28,8 +28,11 @@ public class SouhuCommentCountExample {
try { try {
url = map1.get("url")+""; url = map1.get("url")+"";
System.out.println(url); System.out.println(url);
url = "http://m.sohu.com/a/299389309_114988";
int i = Souhu.getSouhuCommentCount(url,ProxyHolder.NAT_PROXY); int i = Souhu.getSouhuCommentCount(url,ProxyHolder.NAT_PROXY);
int j = Souhu.getSohuReadNum(url, ProxyHolder.NAT_PROXY);
map1.put("count", i); map1.put("count", i);
map1.put("redNum", j);
System.out.println(map1.toString()); System.out.println(map1.toString());
} catch (Exception e) { } catch (Exception e) {
System.out.println(url); System.out.println(url);
......
...@@ -27,9 +27,11 @@ public class WangyiCommentCountExample { ...@@ -27,9 +27,11 @@ public class WangyiCommentCountExample {
List<Map<String,Object>> bodyList = new ArrayList<>(); List<Map<String,Object>> bodyList = new ArrayList<>();
for(String url : urlList) { for(String url : urlList) {
url = "https://3g.163.com/all/article/E9GAO0PK051188EC.html";
String id = url.split("/")[url.split("/").length-1].split(".ht")[0]; String id = url.split("/")[url.split("/").length-1].split(".ht")[0];
System.out.println(id); System.out.println(id);
int lists = Wangyi.getWangyiCommentCount(id, null); int lists = Wangyi.getWangyiCommentCount(id, null);
System.out.println(lists);
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(3000);
} }
List<String> headList = new ArrayList<String>(); List<String> headList = new ArrayList<String>();
......
...@@ -14,9 +14,9 @@ public class YidianzixunAccountExample { ...@@ -14,9 +14,9 @@ public class YidianzixunAccountExample {
@Test @Test
public void yidianzixunAccountTest() { public void yidianzixunAccountTest() {
String channelid = "m133695"; String channelid = "m23315";
String startTime = "2017-01-01 00:00:00"; String startTime = "2007-01-01 00:00:00";
String cookie = "wuid=257912989774746; wuid_createAt=2018-04-21 12:26:54; UM_distinctid=162e674783dc4e-030ed894a4953b-4446042d-1fa400-162e674783e34a; JSESSIONID=8ee0cee7a49e812492917a669074974b9a004e7b28ed41bc99e96793df734961; weather_auth=2; Hm_lvt_15fafbae2b9b11d280c79eff3b840e45=1527148836,1527213305,1527752112; CNZZDATA1255169715=542587606-1524284730-null%7C1527749514; sptoken=Ug%3B99%3C3%3FU%3AU%3B%3AU48261efeced332cc9f20413132c69381bcc921bb210c93b90058b318eec23117; captcha=s%3A7c9d6bca395d270e3a4774968531f470.e1IzHNmf94UVpZlGYHYmDUnUk6sA1s7sPYj7RA932lo; Hm_lpvt_15fafbae2b9b11d280c79eff3b840e45=1527752125; cn_1255169715_dplus=%7B%22distinct_id%22%3A%20%22162e674783dc4e-030ed894a4953b-4446042d-1fa400-162e674783e34a%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201527752148%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201527752148%7D%7D"; String cookie = "wuid=90742539356820; wuid_createAt=2019-01-10 11:45:41; UM_distinctid=16835dd9ba11cb-0ef8d17063d93f-671b197c-1fa400-16835dd9ba2243; JSESSIONID=174b8df350cb5400283abedf2c26076357b0b7af0581024f2e39e90532b4edc9; weather_auth=2; DID=node82eee6d174caf2d4; Hm_lvt_15fafbae2b9b11d280c79eff3b840e45=1551686450,1551686458; CNZZDATA1255169715=931563543-1547087800-%7C1551761063; captcha=s%3A6e56492ffceaf88d9f131fa79435464a.TLAhZ1cfwj0vBTjKTO9Qf5qc6QLuipitrEMZjiqm8BM; Hm_lpvt_15fafbae2b9b11d280c79eff3b840e45=1551764582; cn_1255169715_dplus=%7B%22distinct_id%22%3A%20%2216835dd9ba11cb-0ef8d17063d93f-671b197c-1fa400-16835dd9ba2243%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201547544080%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201547544080%7D%2C%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201551765057%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201551765057%7D";
List<Map<String,Object>> dataList = Yidianzixun.getYidianzixunAccountData(channelid, startTime,null,cookie); List<Map<String,Object>> dataList = Yidianzixun.getYidianzixunAccountData(channelid, startTime,null,cookie);
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
List<String> headList = new ArrayList<String>(); List<String> headList = new ArrayList<String>();
...@@ -27,7 +27,7 @@ public class YidianzixunAccountExample { ...@@ -27,7 +27,7 @@ public class YidianzixunAccountExample {
headList.add("source"); headList.add("source");
headList.add("url"); headList.add("url");
headList.add("summary"); headList.add("summary");
poi.exportExcel("D://crawlerdata/一点资讯-虎嗅.xlsx", "虎嗅", headList, dataList); poi.exportExcel("D://crawlerdata/一点资讯-m23315.xlsx", "虎嗅", headList, dataList);
} }
......
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.Arrays;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.TXNews;
//
//public class TxNewsTest {
// @Test
// public void txNewsTest() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
//
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// Map<String,Object> map = poi.importExcel("C:\\Users\\byte-zbs\\Desktop\\腾讯.xlsx", 0);
// List<Map<String,Object>> lists = (List<Map<String, Object>>) map.get("body");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// lists.forEach(m -> {
// String url = String.valueOf(m.get("url"));
// System.out.println(url + "start");
// url = url.split("\\?")[0];
// String coralUin = url.split("/")[5];
// String coralUid = url.split("/")[4];
// List<Map<String,Object>> dataList = TXNews.getTxNewsComments(coralUin, coralUid, ProxyHolder.NAT_PROXY);
// bodyList.addAll(dataList);
// System.out.println(url + " end " + dataList.size());
// });
//
// List<String> headList = Arrays.asList("name","replayUrl","content","time","replayNum","agreeNum");
// poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\腾讯.xlsx", "result", headList, bodyList);
// }
//}
...@@ -6,6 +6,9 @@ ...@@ -6,6 +6,9 @@
// //
//import org.testng.annotations.Test; //import org.testng.annotations.Test;
// //
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.KuaiTousu; //import com.zhiwei.parse.KuaiTousu;
//import com.zhiwei.parse.SinaTousu; //import com.zhiwei.parse.SinaTousu;
...@@ -14,12 +17,13 @@ ...@@ -14,12 +17,13 @@
// //
// @Test // @Test
// public void getSinaTousuData() { // public void getSinaTousuData() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String words = "花呗|借呗|京东白条|京东金条|京东金融"; // String words = "美团|三快";
// String[] ws = words.split("\\|"); // String[] ws = words.split("\\|");
// List<Map<String,Object>> bodyList = new ArrayList<>(); // List<Map<String,Object>> bodyList = new ArrayList<>();
// for(String word : ws) { // for(String word : ws) {
// List<Map<String,Object>> list = SinaTousu.getSinaTousuData(word, null, "2018-07-01 00:00:00"); // List<Map<String,Object>> list = SinaTousu.getSinaTousuData(word, ProxyHolder.NAT_PROXY, null);
// bodyList.addAll(list); // bodyList.addAll(list);
// System.out.println(word + " --------- " + bodyList.size()); // System.out.println(word + " --------- " + bodyList.size());
// } // }
...@@ -30,7 +34,7 @@ ...@@ -30,7 +34,7 @@
// headList.add("source"); // headList.add("source");
// headList.add("url"); // headList.add("url");
// //
// poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\黑猫投诉-美团-3.xlsx", "数据", headList, bodyList); // poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\黑猫投诉-美团-2.xlsx", "数据", headList, bodyList);
// //
// //
// //
......
package com.zhiwei.shipin;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Aiqiyi;
import com.zhiwei.util.WordReadFile;
/**
 * Manual, environment-dependent test: reads keywords from a local text file, collects
 * the rows {@code Aiqiyi.getAiqiyiByWordData} returns for each keyword, and exports
 * all of them to one Excel workbook.
 *
 * NOTE(review): depends on hard-coded local paths and a ZooKeeper address on the
 * internal network, so it only runs on the author's machine.
 */
public class AiqiyiTest {

    @Test
    public void aiqiyiTest() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);

        List<String> keywords = WordReadFile.getWords("D://crawlerdata//关键词.txt");
        List<Map<String,Object>> collected = new ArrayList<Map<String,Object>>();
        for (String keyword : keywords) {
            List<Map<String,Object>> hits = Aiqiyi.getAiqiyiByWordData(keyword, ProxyHolder.NAT_PROXY);
            // Keywords with no result (null or empty list) contribute nothing.
            if (hits == null || hits.isEmpty()) {
                continue;
            }
            collected.addAll(hits);
        }

        // Column order of the exported sheet.
        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"time", "source", "content", "url", "title", "word"}) {
            columns.add(column);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D://crawlerdata/爱奇艺关键词采集-txh-0320.xlsx", "数据", columns, collected);
    }
}
...@@ -13,7 +13,7 @@ import com.zhiwei.util.WordReadFile; ...@@ -13,7 +13,7 @@ import com.zhiwei.util.WordReadFile;
public class BilibiliTest { public class BilibiliTest {
@Test @Test
public void f() { public void f() {
List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词-2.txt"); List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
List<Map<String, Object>> bodyList = new ArrayList<>(); List<Map<String, Object>> bodyList = new ArrayList<>();
String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274"; String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274";
for (String word : wordList) { for (String word : wordList) {
...@@ -31,8 +31,9 @@ public class BilibiliTest { ...@@ -31,8 +31,9 @@ public class BilibiliTest {
headlist.add("source"); headlist.add("source");
headlist.add("title"); headlist.add("title");
headlist.add("url"); headlist.add("url");
headlist.add("word");
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D://crawlerdata//bilibili关键词采集数据-txh-0219-农药.xlsx", "B站数据", headlist, bodyList); poi.exportExcel("D://crawlerdata//bilibili关键词采集数据-txh-0320.xlsx", "B站数据", headlist, bodyList);
} }
} }
...@@ -18,7 +18,7 @@ public class QQTVTest { ...@@ -18,7 +18,7 @@ public class QQTVTest {
@Test @Test
public void f() { public void f() {
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER); ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
String time = "1970-07-01 00:00:00"; String time = "2018-01-01 00:00:00";
List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt"); List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
List<Map<String, Object>> bodyList = new ArrayList<>(); List<Map<String, Object>> bodyList = new ArrayList<>();
for (String word : wordList) { for (String word : wordList) {
...@@ -35,8 +35,9 @@ public class QQTVTest { ...@@ -35,8 +35,9 @@ public class QQTVTest {
headlist.add("source"); headlist.add("source");
headlist.add("title"); headlist.add("title");
headlist.add("url"); headlist.add("url");
headlist.add("word");
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D://crawlerdata//腾讯视频关键词采集数据-txh-0130.xlsx", "B站数据", headlist, bodyList); poi.exportExcel("D://crawlerdata//腾讯视频关键词采集数据-txh-0320.xlsx", "腾讯视频数据", headlist, bodyList);
......
//package com.zhiwei.shipin; package com.zhiwei.shipin;
//
//import java.util.ArrayList; import java.util.ArrayList;
//import java.util.List; import java.util.List;
//import java.util.Map; import java.util.Map;
//
//import org.testng.annotations.Test; import org.testng.annotations.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil; import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.shipin.SohuTV; import com.zhiwei.parse.shipin.SohuTV;
//import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
//import com.zhiwei.util.WordReadFile; import com.zhiwei.util.WordReadFile;
//
//public class SohuTVTest { public class SohuTVTest {
// @Test @Test
// public void f() { public void f() {
// List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt"); List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
// List<Map<String, Object>> bodyList = new ArrayList<>(); List<Map<String, Object>> bodyList = new ArrayList<>();
// String cookie = "SUV=1901101134139015; IPLOC=CN3301; gidinf=x099980109ee0f08567b42835000336ade2ef3762611; fuid=15474616189304048886; newpuid=15474616191372936893; beans_mz_userid=UBThg01XRPg8; pmai=dad35c1c318bdd22; ifoxinstalled=false; beans_freq=1; beans_dmp=%7B%22admaster%22%3A1547461620%2C%22shunfei%22%3A1547461620%2C%22reachmax%22%3A1548816807%2C%22lingji%22%3A1547461620%2C%22yoyi%22%3A1547461620%2C%22ipinyou%22%3A1547461620%2C%22ipinyou_admaster%22%3A1547461620%2C%22miaozhen%22%3A1548816807%2C%22diantong%22%3A1547461620%2C%22huayang%22%3A1547461620%7D; beans_dmp_done=1; reqtype=pc; sokey=%5B%7B%22key%22%3A%22%E7%BE%8E%E5%9B%A2%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%20%E4%BA%8C%E5%99%81%E8%8B%B1%22%7D%5D; t=1548817812321"; String cookie = "SUV=1901101134139015; IPLOC=CN3301; gidinf=x099980109ee0f08567b42835000336ade2ef3762611; fuid=15474616189304048886; newpuid=15474616191372936893; beans_mz_userid=UBThg01XRPg8; pmai=dad35c1c318bdd22; ifoxinstalled=false; beans_freq=1; beans_dmp=%7B%22admaster%22%3A1547461620%2C%22shunfei%22%3A1547461620%2C%22reachmax%22%3A1548816807%2C%22lingji%22%3A1547461620%2C%22yoyi%22%3A1547461620%2C%22ipinyou%22%3A1547461620%2C%22ipinyou_admaster%22%3A1547461620%2C%22miaozhen%22%3A1548816807%2C%22diantong%22%3A1547461620%2C%22huayang%22%3A1547461620%7D; beans_dmp_done=1; reqtype=pc; sokey=%5B%7B%22key%22%3A%22%E7%BE%8E%E5%9B%A2%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%20%E4%BA%8C%E5%99%81%E8%8B%B1%22%7D%5D; t=1548817812321";
// for (String word : wordList) { for (String word : wordList) {
// List<Map<String, Object>> dataList = SohuTV.sohuTVData(word, cookie, null); List<Map<String, Object>> dataList = SohuTV.sohuTVData(word, cookie, null);
// if (dataList != null) { if (dataList != null) {
// System.out.println(word + " ----- " + dataList.size()); System.out.println(word + " ----- " + dataList.size());
// bodyList.addAll(dataList); bodyList.addAll(dataList);
// } }
// ZhiWeiTools.sleep(1000); ZhiWeiTools.sleep(1000);
// } }
// List<String> headlist = new ArrayList<>(); List<String> headlist = new ArrayList<>();
// headlist.add("playCount"); headlist.add("playCount");
// headlist.add("time"); headlist.add("time");
// headlist.add("source"); headlist.add("source");
// headlist.add("title"); headlist.add("title");
// headlist.add("url"); headlist.add("url");
// PoiExcelUtil poi = PoiExcelUtil.getInstance(); headlist.add("word");
// poi.exportExcel("D://crawlerdata//搜狐视频关键词采集数据-txh-0219.xlsx", "B站数据", headlist, bodyList); PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//搜狐视频关键词采集数据-txh-0320.xlsx", "搜狐数据", headlist, bodyList);
// }
//} }
}
//package com.zhiwei.shipin; package com.zhiwei.shipin;
//
//import java.util.ArrayList; import java.util.ArrayList;
//import java.util.List; import java.util.List;
//import java.util.Map; import java.util.Map;
//
//import org.testng.annotations.Test; import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType; import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory; import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil; import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Youku; import com.zhiwei.parse.Youku;
//import com.zhiwei.util.WordReadFile; import com.zhiwei.util.WordReadFile;
//
//public class YoukuKeyWordTest { public class YoukuKeyWordTest {
// @Test @Test
// public void f() { public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER); GroupType.PROVIDER);
// List<String> words = WordReadFile.getWords("D://crawlerdata//关键词.txt"); List<String> words = WordReadFile.getWords("D://crawlerdata//关键词.txt");
// List<Map<String,Object>> bodyList = new ArrayList<>(); List<Map<String,Object>> bodyList = new ArrayList<>();
// for(String w : words) { for(String w : words) {
// System.out.println(w); System.out.println(w);
// bodyList.addAll(Youku.getDataList(w)); bodyList.addAll(Youku.getDataList(w));
// } }
// List<String> headList = new ArrayList<>(); List<String> headList = new ArrayList<>();
// headList.add("title"); headList.add("title");
// headList.add("time"); headList.add("time");
// headList.add("url"); headList.add("url");
// headList.add("uper"); headList.add("uper");
// PoiExcelUtil poi = PoiExcelUtil.getInstance(); headList.add("word");
// poi.exportExcel("D://crawlerdata//优酷数据-txh-0219.xlsx", "数据", headList, bodyList); PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//优酷数据-txh-0320.xlsx", "数据", headList, bodyList);
// }
//} }
}
//package com.zhiwei.user;
//
//import java.util.ArrayList;
//import java.util.Arrays;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Maimai;
//
//public class MaimaiTest {
// @Test
// public void maimaiUserCrawler() {
// String path = "D:\\crawlerdata\\脉脉用户.xlsx";
// String word = "美团|美团网|大众点评|美团点评|摩拜|猫眼|榛果|三快科技|三快在线";
// String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550629286782; token=\"OCY36EFdeYzGytlQFyKRdM0DcXNdViYI02kT4QbUMpaSk/CqMXrqBOx8EFo5/fQU8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"q1bNxxk8WW3MzjbCfKr/hfAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTc2NjQ0NzY1Iiwic2VjcmV0IjoiLXFsV2c2Ym9feEJqOWxQbWdWTjcwWWg3Iiwic3RhdHVzIjp0cnVlLCJtaWQ0NTY4NzYwIjpmYWxzZSwiX2V4cGlyZSI6MTU1MDcxNTc2NzgwMSwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=lVCTA7DLvo1K_r_bTjbQOH13Alc";
// String[] words = word.split("\\|");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(String w : words) {
// bodyList.addAll(Maimai.getUserList(w, cookie, null));
// }
// List<String> headList = Arrays.asList("id","name","gender","url","rank","compos","city");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel(path, "result", headList, bodyList);
// }
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment