Commit 6018f0b3 by yangchen

提交修改

parent 3e350f8b
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>articlenewscrawler</artifactId> <artifactId>articlenewscrawler</artifactId>
<version>0.1.7-SNAPSHOT</version> <version>0.2.2-SNAPSHOT</version>
<name>articlenewscrawler</name> <name>articlenewscrawler</name>
<description>采集凤凰,一点资讯,搜狐历时文章和文章评论</description> <description>采集凤凰,一点资讯,搜狐历时文章和文章评论</description>
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.3.6-RELEASE</version> <version>0.5.5.6-SNAPSHOT</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
</dependencies> </dependencies>
......
...@@ -67,7 +67,7 @@ public class HeadGet { ...@@ -67,7 +67,7 @@ public class HeadGet {
public static Map<String,String> getYidianzixunAccountHeaderMap(String cookie,String referer) { public static Map<String,String> getYidianzixunAccountHeaderMap(String cookie,String referer) {
Map<String, String> headerMap = new HashMap<String, String>(); Map<String, String> headerMap = new HashMap<String, String>();
headerMap.put("User-Agent", headerMap.put("User-Agent",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"); "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1");
headerMap.put("Accept", headerMap.put("Accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"); "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
headerMap.put("Accept-Language", "zh-CN,zh;q=0.9"); headerMap.put("Accept-Language", "zh-CN,zh;q=0.9");
...@@ -254,14 +254,13 @@ public class HeadGet { ...@@ -254,14 +254,13 @@ public class HeadGet {
* @throws IOException * @throws IOException
*/ */
public static Map<String,String> getDayuCommentHeaderMap(String cookie) { public static Map<String,String> getDayuCommentHeaderMap(String cookie) {
Map<String,String> headerMap = new HashMap<String, String>(); Map<String,String> headerMap = new HashMap<>();
headerMap.put("User-Agent", headerMap.put("User-Agent",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"); "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
headerMap.put("Accept", headerMap.put("Accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"); "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
headerMap.put("Accept-Language", "zh-CN,zh;q=0.9"); headerMap.put("Accept-Language", "zh-CN,zh;q=0.9");
headerMap.put("Connection", "keep-alive"); headerMap.put("Connection", "keep-alive");
headerMap.put("Host", "m.uczzd.cn");
if(cookie != null) { if(cookie != null) {
headerMap.put("Cookie", cookie); headerMap.put("Cookie", cookie);
} }
...@@ -293,13 +292,13 @@ public class HeadGet { ...@@ -293,13 +292,13 @@ public class HeadGet {
} }
public static Map<String,Object> getQQAccountOneParamMap(String chlid) { public static Map<String,Object> getQQAccountOneParamMap(String chlid) {
Map<String,Object> paramMap = new HashMap<String,Object>(); Map<String,Object> paramMap = new HashMap<>();
paramMap.put("chlid", chlid); paramMap.put("chlid", chlid);
return paramMap; return paramMap;
} }
public static Map<String,Object> getQQAccountOtherParamMap(String ids) { public static Map<String,Object> getQQAccountOtherParamMap(String ids) {
Map<String,Object> paramMap = new HashMap<String,Object>(); Map<String,Object> paramMap = new HashMap<>();
paramMap.put("ids", ids); paramMap.put("ids", ids);
return paramMap; return paramMap;
} }
......
...@@ -3,6 +3,7 @@ package com.zhiwei.httpclient; ...@@ -3,6 +3,7 @@ package com.zhiwei.httpclient;
import java.io.IOException; import java.io.IOException;
import java.net.Proxy; import java.net.Proxy;
import java.util.Map; import java.util.Map;
import java.util.Objects;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
...@@ -11,12 +12,14 @@ import com.zhiwei.crawler.core.HttpBoot; ...@@ -11,12 +12,14 @@ import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.FormBody;
import okhttp3.Headers;
import okhttp3.Response; import okhttp3.Response;
public class HttpClient { public class HttpClient {
private static Logger logger = LoggerFactory.getLogger(HttpClient.class); private static Logger logger = LoggerFactory.getLogger(HttpClient.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(false).build();
/** /**
* *
...@@ -43,6 +46,25 @@ public class HttpClient { ...@@ -43,6 +46,25 @@ public class HttpClient {
* @return * @return
* @throws IOException * @throws IOException
*/ */
public static String executeHttpRequestGet(String url,ProxyHolder proxy) {
    // Plain GET without custom headers. Makes up to three attempts and returns the
    // body text of the first one that succeeds; returns null when every attempt fails.
    for (int attempt = 1; attempt <= 3; attempt++) {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            return response.body().string();
        } catch (Exception e) {
            // The throwable is passed as the trailing argument (no placeholder for it)
            // so SLF4J records the stack trace; the placeholders carry request context.
            logger.error("httpClient 获取数据出现问题: url={} attempt={}", url, attempt, e);
        }
    }
    return null;
}
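/**
 * Sends a GET request to {@code url} with the supplied request headers, making up to
 * three attempts.
 *
 * @param url target URL
 * @param proxy proxy holder handed to the underlying HTTP client
 * @param headerMap request headers to send with the GET
 * @return the result of the first successful attempt, or {@code null} when no attempt succeeds
 */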
public static String executeHttpRequestGet(String url,ProxyHolder proxy,Map<String, String> headerMap) { public static String executeHttpRequestGet(String url,ProxyHolder proxy,Map<String, String> headerMap) {
for(int i = 0;i < 3;i++) { for(int i = 0;i < 3;i++) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){ try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){
...@@ -54,8 +76,37 @@ public class HttpClient { ...@@ -54,8 +76,37 @@ public class HttpClient {
return null; return null;
} }
public static String executeHttpRequestPost(String url,Proxy proxy,Map<String, String> headerMap,Map<String, Object> paramMap) { /**
try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(url, headerMap, paramMap), proxy)){ *
* @Description (TODO这里用一句话描述这个方法的作用)
* @param url
* @param cookie
* @return
* @throws IOException
*/
public static String executeHttpRequestGet(String url,ProxyHolder proxy,Headers header) {
    // GET with pre-built OkHttp headers. Makes up to three attempts and returns the
    // body text of the first one that succeeds; returns null when every attempt fails.
    for (int attempt = 1; attempt <= 3; attempt++) {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, header), proxy)) {
            return response.body().string();
        } catch (Exception e) {
            // The throwable is passed as the trailing argument (no placeholder for it)
            // so SLF4J records the stack trace; the placeholders carry request context.
            logger.error("httpClient 获取数据出现问题: url={} attempt={}", url, attempt, e);
        }
    }
    return null;
}
public static String executeHttpRequestPost(String url,Proxy proxy,Map<String, String> headerMap,Map<String, Object> params) {
FormBody body = null;
if (Objects.nonNull(params) && !params.isEmpty()) {
FormBody.Builder builder = new FormBody.Builder();
params.forEach((lt, rt) -> {
if (Objects.nonNull(lt)) {
builder.add(String.valueOf(lt), Objects.isNull(rt) ? "" : String.valueOf(rt));
}
});
body = builder.build();
}
try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(url, headerMap, body), proxy)){
return response.body().string(); return response.body().string();
} catch (Exception e) { } catch (Exception e) {
logger.error("httpClient 获取数据出现问题:{}", e); logger.error("httpClient 获取数据出现问题:{}", e);
...@@ -64,8 +115,18 @@ public class HttpClient { ...@@ -64,8 +115,18 @@ public class HttpClient {
} }
public static String executeHttpRequestPost(String url,ProxyHolder proxy,Map<String, String> headerMap,Map<String, Object> paramMap) { public static String executeHttpRequestPost(String url,ProxyHolder proxy,Map<String, String> headerMap,Map<String, Object> params) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(url, headerMap, paramMap), proxy)){ FormBody body = null;
if (Objects.nonNull(params) && !params.isEmpty()) {
FormBody.Builder builder = new FormBody.Builder();
params.forEach((lt, rt) -> {
if (Objects.nonNull(lt)) {
builder.add(String.valueOf(lt), Objects.isNull(rt) ? "" : String.valueOf(rt));
}
});
body = builder.build();
}
try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(url, headerMap, body), proxy)){
return response.body().string(); return response.body().string();
} catch (Exception e) { } catch (Exception e) {
logger.error("httpClient 获取数据出现问题:{}", e); logger.error("httpClient 获取数据出现问题:{}", e);
......
...@@ -89,20 +89,21 @@ public class Baijia { ...@@ -89,20 +89,21 @@ public class Baijia {
while(f) { while(f) {
for(int i = 1;i < 3;i++) { for(int i = 1;i < 3;i++) {
try { try {
String url = "https://author.baidu.com/list?type=article&tab=2&uk="+uk+"&ctime="+ctime+"&num=50"; String url = "https://mbd.baidu.com/webpage?tab=article&num=10&uk="+uk+"&ctime="+ctime+"&type=newhome&action=dynamic&format=json";
String result = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); String result = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
Map<String,Object> dMap = baijiaAccountAnalysis.getBaijiaAccountData3(result,name, startTime); Map<String,Object> dMap = baijiaAccountAnalysis.getBaijiaAccountData3(result,name, startTime);
List<Map<String,Object>> dList = (List<Map<String, Object>>) dMap.get("data"); List<Map<String,Object>> dList = (List<Map<String, Object>>) dMap.get("data");
dataList.addAll(dList); if(Objects.nonNull(dList))
dataList.addAll(dList);
logger.info("{} 数据采集结果 {}",appId, dataList.size()); logger.info("{} 数据采集结果 {}",appId, dataList.size());
if(!(boolean) dMap.get("more")) { if(!(boolean) dMap.get("more")) {
f = false; f = false;
} }
ctime = String.valueOf(dMap.get("ctime")); ctime = String.valueOf(dMap.get("ctime"));
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(1000);
break; break;
} catch (Exception e) { } catch (Exception e) {
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(2000);
} }
} }
} }
...@@ -111,15 +112,16 @@ public class Baijia { ...@@ -111,15 +112,16 @@ public class Baijia {
} }
private static String getUkData(String appId,ProxyHolder proxy,String cookie) { private static String getUkData(String appId,ProxyHolder proxy,String cookie) {
String url = "https://author.baidu.com/profile?context={\"from\":0,\"app_id\":\"" // String url = "https://author.baidu.com/profile?context={\"from\":0,\"app_id\":\""
+appId+"\"}&cmdType=&pagelets[]=root&reqID=0&ispeed=1#"; // +appId+"\"}&cmdType=&pagelets[]=root&reqID=0&ispeed=1#";
String url = "https://author.baidu.com/home/" + appId;
Map<String,Object> headers = new HashMap<>(); Map<String,Object> headers = new HashMap<>();
headers.put("Host", "author.baidu.com"); headers.put("Host", "author.baidu.com");
headers.put("cookie", cookie); headers.put("cookie", cookie);
for(int i = 0; i < 3;i++) { for(int i = 0; i < 3;i++) {
try { try {
String result = httpBoot.syncCall(RequestUtils.wrapGet(url,headers), proxy).body().string(); String result = httpBoot.syncCall(RequestUtils.wrapGet(url,headers), proxy).body().string();
return result.split("uk\\\\\":\\\\\"")[1].split("\\\\\",")[0]; return result.split("uk\":\"")[1].split("\",")[0];
} catch (Exception e) { } catch (Exception e) {
logger.error("百家号uk 获取失败"); logger.error("百家号uk 获取失败");
} }
......
...@@ -4,7 +4,6 @@ import java.io.UnsupportedEncodingException; ...@@ -4,7 +4,6 @@ import java.io.UnsupportedEncodingException;
import java.net.Proxy; import java.net.Proxy;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
...@@ -13,7 +12,7 @@ import org.slf4j.LoggerFactory; ...@@ -13,7 +12,7 @@ import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.BilibilikeyWordAnalysis; import com.zhiwei.parse.analysis.BilibilikeyWordAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
...@@ -22,16 +21,16 @@ import okhttp3.Headers; ...@@ -22,16 +21,16 @@ import okhttp3.Headers;
public class BiliBili { public class BiliBili {
private static final Logger logger = LoggerFactory.getLogger(BiliBili.class); private static final Logger logger = LoggerFactory.getLogger(BiliBili.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).useCookieJar(true).build();
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<Map<String,Object>> getData(String word,Proxy proxy,String endTime,String cookie) { public static List<Map<String,Object>> getData(String word,Proxy proxy,String endTime,String cookie) {
List<Map<String,Object>> bodyList = new ArrayList<>(); List<Map<String,Object>> bodyList = new ArrayList<>();
try { try {
// //
String url = "https://search.bilibili.com/all?keyword="+URLEncoder.encode(word, "utf-8")+"&order=pubdate&duration=0&tids_1=0"; String url = "https://search.bilibili.com/all?keyword="+URLEncoder.encode(word, "utf-8")+"&single_column=1&order=stow&duration=0&tids_1=0";
System.out.println(url);
Headers header = Headers.of("cookie",cookie,"Referer","https://www.bilibili.com/","Host","search.bilibili.com"); Headers header = Headers.of("cookie",cookie,"Referer","https://www.bilibili.com/","Host","search.bilibili.com");
String result = httpBoot.syncCall(RequestUtils.wrapGet(url, header), ProxyHolder.NAT_HEAVY_PROXY).body().string(); String result = HttpClient.executeHttpRequestGet(url, ProxyHolder.NAT_HEAVY_PROXY, header);
ZhiWeiTools.sleep(100); ZhiWeiTools.sleep(100);
Map<String,Object> map = BilibilikeyWordAnalysis.getData(result,word,endTime); Map<String,Object> map = BilibilikeyWordAnalysis.getData(result,word,endTime);
boolean more = (boolean) map.get("more"); boolean more = (boolean) map.get("more");
...@@ -43,7 +42,7 @@ public class BiliBili { ...@@ -43,7 +42,7 @@ public class BiliBili {
while(more) { while(more) {
map.clear(); map.clear();
String ur = url + "&page=" + n; String ur = url + "&page=" + n;
String result2 = httpBoot.syncCall(RequestUtils.wrapGet(ur, header), ProxyHolder.NAT_HEAVY_PROXY).body().string(); String result2 = HttpClient.executeHttpRequestGet(ur, ProxyHolder.NAT_HEAVY_PROXY, header);
map = BilibilikeyWordAnalysis.getData(result2,word,endTime); map = BilibilikeyWordAnalysis.getData(result2,word,endTime);
List<Map<String,Object>> dataList2 = (List<Map<String, Object>>) map.get("data"); List<Map<String,Object>> dataList2 = (List<Map<String, Object>>) map.get("data");
if(dataList2 != null) { if(dataList2 != null) {
...@@ -60,7 +59,7 @@ public class BiliBili { ...@@ -60,7 +59,7 @@ public class BiliBili {
} catch (Exception e) { } catch (Exception e) {
logger.error("e {}",e); logger.error("e {}",e);
} }
return Collections.emptyList(); return bodyList;
} }
......
...@@ -102,7 +102,7 @@ public class Dayu { ...@@ -102,7 +102,7 @@ public class Dayu {
* @param articleId * @param articleId
* @return * @return
*/ */
public static int getDayuCommentCount(String articleId,Proxy proxy) { public static int getDayuCommentCount(String articleId,ProxyHolder proxy) {
String url = "http://m.uczzd.cn/iflow/api/v2/cmt/article/"+articleId+"/comments/byhot"; String url = "http://m.uczzd.cn/iflow/api/v2/cmt/article/"+articleId+"/comments/byhot";
Map<String,String> headerMap = HeadGet.getDayuCommentHeaderMap(null); Map<String,String> headerMap = HeadGet.getDayuCommentHeaderMap(null);
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap); String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
...@@ -110,6 +110,39 @@ public class Dayu { ...@@ -110,6 +110,39 @@ public class Dayu {
return json.getJSONObject("data").getInteger("comment_cnt"); return json.getJSONObject("data").getInteger("comment_cnt");
} }
/**
 * Fetches the read count of a Dayu article.
 *
 * <p>The article id is the text following {@code wm_aid=} (when the URL contains
 * {@code !wm_aid=}) or following {@code wm_cid=}. The count is the sum of the
 * {@code click1} and {@code click2} counters found under {@code data._incrs} in the
 * ff.dayu.com contents response.
 *
 * @param url article URL carrying a {@code wm_aid} or {@code wm_cid} marker
 * @param proxy proxy holder passed to {@code HttpClient.executeHttpRequestGet}
 * @return {@code click1 + click2}, or {@code -1} when the URL has neither marker,
 *         the request yields no body, or the response cannot be parsed
 */
public static int getDayuReadCount(String url,ProxyHolder proxy) {
    try {
        // The two URL flavours differ only in the path segment before the article id.
        String contentPath;
        if (url.contains("!wm_aid=")) {
            contentPath = "origin/" + url.split("wm_aid=")[1];
        } else if (url.contains("wm_cid=")) {
            contentPath = url.split("wm_cid=")[1];
        } else {
            return -1;
        }
        String eUrl = "https://ff.dayu.com/contents/" + contentPath
                + "?biz_id=1002&_fetch_author=1&_incr_fields=click1,click2,click3,click_total,play,like";
        Map<String,String> headerMap = HeadGet.getDayuCommentHeaderMap(null);
        String result = HttpClient.executeHttpRequestGet(eUrl, proxy, headerMap);
        if (result == null) {
            // The HTTP helper returns null after exhausting its retries.
            return -1;
        }
        JSONObject incrs = JSONObject.parseObject(result).getJSONObject("data").getJSONObject("_incrs");
        return incrs.getIntValue("click2") + incrs.getIntValue("click1");
    } catch (Exception e) {
        // NOTE(review): kept as in the original; switch to the class logger if one exists.
        e.printStackTrace();
    }
    return -1;
}
/** /**
* *
* @Description 大鱼号依据关键词采集 * @Description 大鱼号依据关键词采集
......
...@@ -38,7 +38,7 @@ public class Douban { ...@@ -38,7 +38,7 @@ public class Douban {
*/ */
public static List<Map<String,Object>> doubanTopicGetByWord(String word,ProxyHolder proxy,String cookie,String stime) { public static List<Map<String,Object>> doubanTopicGetByWord(String word,ProxyHolder proxy,String cookie,String stime) {
int page = 0; int page = 0;
int count = 20; int count = 50;
boolean more = true; boolean more = true;
Map<String,String> headerMap = new HashMap<>(); Map<String,String> headerMap = new HashMap<>();
headerMap.put("Host", "www.douban.com"); headerMap.put("Host", "www.douban.com");
...@@ -77,10 +77,9 @@ public class Douban { ...@@ -77,10 +77,9 @@ public class Douban {
map.put("time", time); map.put("time", time);
map.put("reply_count", replyCount); map.put("reply_count", replyCount);
bodyList.add(map); bodyList.add(map);
// System.out.println(map.toString());
} }
} }
if(bodyList.size() - cou < 30){ if(bodyList.size() - cou < 10 || page > 500){
more = false; more = false;
} }
logger.info("采集到 第 {} 页 ,一共采集到 {} 条 是否有下一页 {}",page,bodyList.size(),more); logger.info("采集到 第 {} 页 ,一共采集到 {} 条 是否有下一页 {}",page,bodyList.size(),more);
......
...@@ -16,7 +16,9 @@ import org.slf4j.LoggerFactory; ...@@ -16,7 +16,9 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
...@@ -139,6 +141,44 @@ public class Maimai { ...@@ -139,6 +141,44 @@ public class Maimai {
return Collections.emptyMap(); return Collections.emptyMap();
} }
/**
 * Manual smoke test: initialises the proxy factory and prints the counters fetched
 * for one hard-coded Maimai feed URL.
 *
 * @param args unused
 */
public static void main(String[] args) {
    ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER, 10000002L);
    Map<String,Object> counts = getMaiaiCount2(
            "https://maimai.cn/web/feed_detail?fid=1353566056&efid=QTa45Y1e-oQzyn1dZ5ozlQ",
            ProxyHolder.NAT_HEAVY_PROXY);
    System.out.println(counts);
}
/**
 * Fetches like / share / comment counters and author details for a Maimai feed page,
 * e.g. https://maimai.cn/web/feed_detail?fid=1304191535&efid=0CQbJXhoYLXdC87NFIkRMA
 *
 * <p>The page embeds its data as a string passed to {@code JSON.parse("...")}; that string
 * is cut out of the HTML, unicode-decoded and parsed, and the values are read from
 * {@code data.feed}.
 *
 * @param url feed detail page URL
 * @param proxy proxy holder used for the request
 * @return map with keys {@code like}, {@code spreads}, {@code cmts}, {@code gid},
 *         {@code title}, {@code author}, {@code userId} and {@code company}; an empty
 *         map when both attempts fail
 */
public static Map<String,Object> getMaiaiCount2(String url,ProxyHolder proxy) {
    // Loop bounds are kept from the original: two attempts in total.
    for (int attempt = 1; attempt < 3; attempt++) {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            String html = response.body().string();
            // Extract the argument of JSON.parse("...") that precedes the closing script tag.
            String payload = html.split("JSON.parse\\(\"")[1].split("\"\\);\\</script\\>")[0];
            JSONObject feed = JSONObject.parseObject(ZhiWeiTools.decodeUnicode(payload))
                    .getJSONObject("data").getJSONObject("feed");
            JSONObject main = feed.getJSONObject("main");
            JSONObject user = main.getJSONObject("u");

            Map<String,Object> map = new HashMap<>();
            map.put("like", feed.getJSONObject("likes").getInteger("n"));
            map.put("spreads", feed.getJSONObject("spreads").getInteger("n"));
            map.put("cmts", feed.getJSONObject("comments").getInteger("n"));
            map.put("gid", feed.getLong("id"));
            map.put("title", main.getString("text"));
            map.put("author", user.getString("name"));
            map.put("userId", user.getString("mmid"));
            map.put("company", user.getString("career_str"));
            return map;
        } catch (Exception e) {
            // Throwable goes last without a placeholder so the stack trace is logged.
            logger.error(" 脉脉 转评攒 获取失败 url={}", url, e);
        }
    }
    return Collections.emptyMap();
}
/** /**
* //https://maimai.cn/web/gossip_detail?encode_id=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MTk2MzEyNjYsImlhdCI6MTU0ODI5NzI5NX0.N6SPmcf-fyitLNomzY-a8BEY31eseYnvG7RTUQ3jxYY * //https://maimai.cn/web/gossip_detail?encode_id=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MTk2MzEyNjYsImlhdCI6MTU0ODI5NzI5NX0.N6SPmcf-fyitLNomzY-a8BEY31eseYnvG7RTUQ3jxYY
* @Description 获取脉脉转评赞 * @Description 获取脉脉转评赞
......
...@@ -60,8 +60,8 @@ public class QQKB { ...@@ -60,8 +60,8 @@ public class QQKB {
try { try {
for(int j = 1; j < 3;j++) { for(int j = 1; j < 3;j++) {
ids = ids.substring(0,ids.length()-1); ids = ids.substring(0,ids.length()-1);
System.out.println(ids); logger.info("data {}",ids);
ZhiWeiTools.sleep(7000); ZhiWeiTools.sleep(1000);
paramMap.clear(); paramMap.clear();
paramMap = HeadGet.getQQAccountOtherParamMap(ids); paramMap = HeadGet.getQQAccountOtherParamMap(ids);
result = HttpClient.executeHttpRequestPost(url,proxy, headerMap, paramMap); result = HttpClient.executeHttpRequestPost(url,proxy, headerMap, paramMap);
...@@ -76,7 +76,6 @@ public class QQKB { ...@@ -76,7 +76,6 @@ public class QQKB {
} catch (Exception e) { } catch (Exception e) {
ids = ""; ids = "";
paramMap.clear(); paramMap.clear();
continue;
} }
} }
} }
......
...@@ -34,15 +34,15 @@ public class SinaKeji { ...@@ -34,15 +34,15 @@ public class SinaKeji {
* @return * @return
*/ */
public static List<Map<String, Object>> getSinaKejiComment(String url,ProxyHolder proxy) { public static List<Map<String, Object>> getSinaKejiComment(String url,ProxyHolder proxy) {
String commentId = getCommentId(url, proxy); String comUrl = getCommentId(url, proxy);
if(nonNull(commentId)) { if(nonNull(comUrl)) {
List<Map<String,Object>> dataList = new ArrayList<>(); List<Map<String,Object>> dataList = new ArrayList<>();
int page = 1; int page = 1;
int count = 1; int count = 1;
while(true) { while(true) {
try { try {
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(1000);
String newUrl = "http://comment.sina.com.cn/page/info?version=1&format=json&channel=kj&newsid="+commentId+"&compress=0&ie=utf-8&oe=utf-8&page_size=20&t_size=3&h_size=3&thread=1&page="+page; String newUrl = comUrl+page;
String result = httpBoot.syncCall(RequestUtils.wrapGet(newUrl), proxy).body().string(); String result = httpBoot.syncCall(RequestUtils.wrapGet(newUrl), proxy).body().string();
List<Map<String,Object>> list = sinaKejiCommentAnalysis.getSinaCommet(result); List<Map<String,Object>> list = sinaKejiCommentAnalysis.getSinaCommet(result);
dataList.addAll(list); dataList.addAll(list);
...@@ -63,16 +63,27 @@ public class SinaKeji { ...@@ -63,16 +63,27 @@ public class SinaKeji {
return Collections.emptyList(); return Collections.emptyList();
} }
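/**
 * Resolves the Sina comment-API URL for an article page.
 *
 * @param url article page URL
 * @param proxy proxy holder used for the request
 * @return comment-list URL ending in {@code &page=} so the caller can append the page
 *         number; presumably {@code null} when it cannot be resolved (the caller
 *         null-checks the result)
 */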
private static String getCommentId(String url,ProxyHolder proxy) { private static String getCommentId(String url,ProxyHolder proxy) {
String commentId = null;
for(int i = 0;i < 3;i++) { for(int i = 0;i < 3;i++) {
try { try {
String result = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy).body().string(); String result = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy).body().string();
//list?channel=
if(result.contains("newsid:")) { if(result.contains("newsid:")) {
commentId = result.split("newsid: '")[1].split("'")[0]; String commentId = result.split("newsid: '")[1].split("'")[0];
if(nonNull(commentId)) { String channel = result.split("channel: '")[1].split("'")[0];
return commentId; if(nonNull(commentId) && nonNull(channel)) {
return "http://comment.sina.com.cn/page/info?version=1&format=json&channel="+channel+"&newsid="+commentId+"&compress=0&ie=utf-8&oe=utf-8&page_size=20&t_size=3&h_size=3&thread=1&page=";
} }
}else if(result.contains("__cmntId")){
String key = result.split("__cmntId\":\"")[1].split("\",")[0];
return "http://comment.sina.com.cn/page/info?version=1&format=json&channel="+key.split(":")[0]+"&newsid="+key.split(":")[1]+"&compress=0&ie=utf-8&oe=utf-8&page_size=20&t_size=3&h_size=3&thread=1&page=";
} }
} catch (IOException e) { } catch (IOException e) {
logger.error("获取 文章评论 id 失败"); logger.error("获取 文章评论 id 失败");
......
...@@ -29,7 +29,7 @@ public class SinaTousu { ...@@ -29,7 +29,7 @@ public class SinaTousu {
int count = 1; int count = 1;
while(true) { while(true) {
try { try {
if(count > 3) { if(count > 3 || page > 200) {
break; break;
} }
String url = "https://tousu.sina.com.cn/api/index/s?keywords="+URLEncoder.encode(word, "utf-8")+"&page_size=100&page="; String url = "https://tousu.sina.com.cn/api/index/s?keywords="+URLEncoder.encode(word, "utf-8")+"&page_size=100&page=";
......
package com.zhiwei.parse; package com.zhiwei.parse;
import static java.util.Objects.nonNull;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
...@@ -12,7 +10,6 @@ import org.slf4j.LoggerFactory; ...@@ -12,7 +10,6 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient; import com.zhiwei.httpclient.HttpClient;
...@@ -35,10 +32,7 @@ public class Souhu { ...@@ -35,10 +32,7 @@ public class Souhu {
*/ */
public static int getSouhuCommentCount(String url,ProxyHolder proxy) { public static int getSouhuCommentCount(String url,ProxyHolder proxy) {
try { try {
String newurl = souhuCommentAnalysis.getSouhuURL(url,proxy); return souhuCommentAnalysis.getSouhuCommentCount(url,proxy);
if(nonNull(newurl)) {
return souhuCommentAnalysis.getSouhuCommentCount(newurl,proxy);
}
} catch (Exception e) { } catch (Exception e) {
logger.error("搜狐获取评论数出错了 {}",e); logger.error("搜狐获取评论数出错了 {}",e);
} }
...@@ -83,7 +77,7 @@ public class Souhu { ...@@ -83,7 +77,7 @@ public class Souhu {
if(isCulling) { if(isCulling) {
url = url + "&columnId=-1"; url = url + "&columnId=-1";
} }
String result = HttpClient.executeHttpRequestGet(url,proxy,null); String result = HttpClient.executeHttpRequestGet(url,proxy);
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArray = json.getJSONObject("data").getJSONArray("pcArticleVOS"); JSONArray jsonArray = json.getJSONObject("data").getJSONArray("pcArticleVOS");
List<Map<String,Object>> dataList1 = souhuAccountAnalysis.analysisData(jsonArray,name); List<Map<String,Object>> dataList1 = souhuAccountAnalysis.analysisData(jsonArray,name);
......
...@@ -21,6 +21,8 @@ import com.zhiwei.parse.analysis.TXNewsByWordAnalysis; ...@@ -21,6 +21,8 @@ import com.zhiwei.parse.analysis.TXNewsByWordAnalysis;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.MediaType;
import okhttp3.RequestBody;
import okhttp3.Response; import okhttp3.Response;
public class TXNews { public class TXNews {
...@@ -71,7 +73,7 @@ public class TXNews { ...@@ -71,7 +73,7 @@ public class TXNews {
String content = StringUtils.join("coral_uin=", coralUin, "&coral_uid=", coralUid,"&reply_id=",replayId); String content = StringUtils.join("coral_uin=", coralUin, "&coral_uid=", coralUid,"&reply_id=",replayId);
//eca55388bbbb596e632bca03a2378efe94b83142fd046f1f70 876579532 //eca55388bbbb596e632bca03a2378efe94b83142fd046f1f70 876579532
System.out.println(content); System.out.println(content);
try (Response response = httpBoot.syncCall(RequestUtils.wrapPost("https://r.inews.qq.com/getMyComments", "application/json", content), proxy)){ try (Response response = httpBoot.syncCall(RequestUtils.wrapPost("https://r.inews.qq.com/getMyComments", RequestBody.create(MediaType.get("application/json"), content)), proxy)){
JSONObject json = JSONObject.parseObject(response.body().string()); JSONObject json = JSONObject.parseObject(response.body().string());
JSONArray jsonArray = json.getJSONObject("comments").getJSONArray("new"); JSONArray jsonArray = json.getJSONObject("comments").getJSONArray("new");
for(int i = 0;i < jsonArray.size();i++) { for(int i = 0;i < jsonArray.size();i++) {
......
...@@ -2,6 +2,7 @@ package com.zhiwei.parse; ...@@ -2,6 +2,7 @@ package com.zhiwei.parse;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
...@@ -27,7 +28,7 @@ public class Wangyi { ...@@ -27,7 +28,7 @@ public class Wangyi {
private static Logger logger = LoggerFactory.getLogger(Wangyi.class); private static Logger logger = LoggerFactory.getLogger(Wangyi.class);
private static WangyiCommentAnalysis wangyiCommentAnalysis = new WangyiCommentAnalysis(); private static WangyiCommentAnalysis wangyiCommentAnalysis = new WangyiCommentAnalysis();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(false).build();
private static WangyiHistoryAnalysis wangyiHistoryAnalysis = new WangyiHistoryAnalysis(); private static WangyiHistoryAnalysis wangyiHistoryAnalysis = new WangyiHistoryAnalysis();
/** /**
...@@ -81,6 +82,21 @@ public class Wangyi { ...@@ -81,6 +82,21 @@ public class Wangyi {
} }
} }
/**
 * Scrapes the comment count and like count from a NetEase article page.
 *
 * <p>Despite the method name, the values returned are the page's {@code data-count}
 * attribute (stored as {@code commentCount}) and {@code data-like} attribute
 * (stored as {@code likes}); both are kept as the raw attribute strings.
 *
 * @param url article page URL
 * @param proxy proxy holder used for the request
 * @return map with keys {@code commentCount} and {@code likes}, or an empty map when
 *         the request fails or the expected attributes are not present
 */
public static Map<String, Object> getReadAndLikeCount(String url, ProxyHolder proxy) {
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
        String html = response.body().string();
        String commentCount = html.split("data-count=\"")[1].split("\" data-hidead")[0];
        String likeCount = html.split("data-like=\"")[1].split("\"><")[0];
        Map<String, Object> counts = new HashMap<>();
        counts.put("commentCount", commentCount);
        counts.put("likes", likeCount);
        return counts;
    } catch (Exception e) {
        // Route the failure through the class logger instead of stderr.
        logger.error("Wangyi getReadAndLikeCount failed, url={}", url, e);
    }
    return Collections.emptyMap();
}
/** /**
* *
* @Description 网易网页版数据 * @Description 网易网页版数据
......
...@@ -22,15 +22,13 @@ public class AiqiyiByWordAnalysis { ...@@ -22,15 +22,13 @@ public class AiqiyiByWordAnalysis {
List<Map<String,Object>> dataMap = new ArrayList<>(); List<Map<String,Object>> dataMap = new ArrayList<>();
try { try {
Document doc = Jsoup.parse(result); Document doc = Jsoup.parse(result);
Elements elements = doc.select("li.list_item"); Elements elements = doc.select("div.layout-main > div");
for (Element element : elements) { for (Element element : elements) {
Map<String, Object> map = new HashMap<>(); Map<String, Object> map = new HashMap<>();
String title = element.select("li").attr("data-widget-searchlist-tvname"); String title = element.select("a.main-tit").attr("title");
String time = element.select("em.result_info_desc").text().split(" ")[0]; String time = element.select("span.info-des").text().split(" ")[0];
if(element.select("label.result_info_lbl").text().contains("上传者")) { String uurl = element.select("a.main-tit").attr("href");
map.put("source", element.select("a.result_info_link").text()); map.put("source", element.select("a.uploader-name").text());
}
String uurl = element.select("h3.result_title > a").attr("href");
map.put("time", TimeParse.stringFormartDate(time)); map.put("time", TimeParse.stringFormartDate(time));
map.put("url", uurl); map.put("url", uurl);
map.put("title", title); map.put("title", title);
......
...@@ -58,24 +58,22 @@ public class BaijiaAccountAnalysis { ...@@ -58,24 +58,22 @@ public class BaijiaAccountAnalysis {
try { try {
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONObject("data").getJSONArray("list"); JSONArray jsonArry = json.getJSONObject("data").getJSONArray("list");
if(json.getJSONObject("data").getBoolean("has_more") != null && if(json.getJSONObject("data").getBoolean("hasMore") != null &&
json.getJSONObject("data").getBoolean("has_more") ) { json.getJSONObject("data").getBoolean("hasMore") ) {
more = true; more = true;
rmap.put("ctime", json.getJSONObject("data").getString("ctime")); rmap.put("ctime", json.getJSONObject("data").getJSONObject("query").getString("ctime"));
} }
// String name = json.getJSONObject("data").getJSONObject("author").getString("display_name"); // String name = json.getJSONObject("data").getJSONObject("author").getString("display_name");
for(int i = 0;i < jsonArry.size();i++) { for(int i = 0;i < jsonArry.size();i++) {
Map<String,Object> map = new HashMap<>(); Map<String,Object> map = new HashMap<>();
JSONObject data = jsonArry.getJSONObject(i); JSONObject data = jsonArry.getJSONObject(i).getJSONObject("itemData");
String id = data.getString("article_id"); String id = data.getString("article_id");
int t = data.getInteger("updated_at"); int t = data.getInteger("updated_at");
String time = TimeParse.dateFormartString(new Date(t*1000L), "yyyy-MM-dd HH:mm:ss"); String time = TimeParse.dateFormartString(new Date(t*1000L), "yyyy-MM-dd HH:mm:ss");
System.out.println(time); System.out.println(time);
if(startTime != null && startTime.length() > 1) { if(startTime != null && startTime.length() > 1 && time.compareTo(startTime) < 1) {
if(time.compareTo(startTime) < 1) { more = false;
more = false; continue;
continue;
}
} }
map.put("title", data.getString("title")); map.put("title", data.getString("title"));
String url = "http://baijiahao.baidu.com/s?id=" + id; String url = "http://baijiahao.baidu.com/s?id=" + id;
......
...@@ -18,7 +18,7 @@ public class BilibilikeyWordAnalysis { ...@@ -18,7 +18,7 @@ public class BilibilikeyWordAnalysis {
try { try {
Document doc = Jsoup.parse(result); Document doc = Jsoup.parse(result);
boolean more = false; boolean more = false;
if(doc.select("#server-search-app > div.contain > div.body-contain > div > div.page-wrap > div > ul > li.page-item.next > button").text().contains("下一页")) { if(doc.select("#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.next > button").text().contains("下一页")) {
more = true; more = true;
} }
...@@ -28,7 +28,7 @@ public class BilibilikeyWordAnalysis { ...@@ -28,7 +28,7 @@ public class BilibilikeyWordAnalysis {
String playcount = null; String playcount = null;
String source = null; String source = null;
String submitcount = null; String submitcount = null;
Elements elements = doc.select("ul.video-contain.clearfix").select("li"); Elements elements = doc.select("ul.video-list.clearfix").select("li");
List<Map<String,Object>> dataList = new ArrayList<>(); List<Map<String,Object>> dataList = new ArrayList<>();
for(Element element : elements) { for(Element element : elements) {
Map<String,Object> map = new HashMap<>(); Map<String,Object> map = new HashMap<>();
......
...@@ -77,9 +77,10 @@ public class SouhuCommentAnalysis { ...@@ -77,9 +77,10 @@ public class SouhuCommentAnalysis {
public int getSouhuCommentCount(String url,ProxyHolder proxy) { public int getSouhuCommentCount(String url,ProxyHolder proxy) {
Map<String,String> headerMap = HeadGet.getSouhuCommentHeaderMap(null); Map<String,String> headerMap = HeadGet.getSouhuCommentHeaderMap(null);
try { try {
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap); String id = getUrlId(url);
String result = HttpClient.executeHttpRequestGet("https://apiv2.sohu.com/api/comment/list?callback=&page_size=10&topic_id=&page_no=1&source_id=mp_"+id,proxy, headerMap);
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
if(json.getInteger("code") == 500) { if(Objects.nonNull(json.get("code")) && json.getInteger("code") == 500) {
return 0; return 0;
} }
return json.getJSONObject("jsonObject").getInteger("cmt_sum"); return json.getJSONObject("jsonObject").getInteger("cmt_sum");
...@@ -116,21 +117,28 @@ public class SouhuCommentAnalysis { ...@@ -116,21 +117,28 @@ public class SouhuCommentAnalysis {
} }
public int getReadNum(String url, ProxyHolder proxy) { public int getReadNum(String url, ProxyHolder proxy) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){ String id = getUrlId(url);
String result = response.body().string(); try (Response response = httpBoot.syncCall(RequestUtils.wrapGet("http://v2.sohu.com/public-api/articles/"+id+"/pv"), proxy)){
String sourceId = getNewsId(result); return Integer.parseInt(response.body().string());
url = "http://v2.sohu.com/public-api/articles/pv?articleIds=" + sourceId;
result = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy).body().string();
return JSONObject.parseObject(result).getInteger(sourceId);
} catch (Exception e) { } catch (Exception e) {
logger.error("Exception {} ",e); logger.error("Exception {} ",e);
} }
return -1; return -1;
} }
/**
 * Extracts the Sohu article id from an article URL.
 * <p>
 * The id is the text that follows the first {@code /a/} marker in the URL,
 * up to (not including) the next underscore; for example
 * {@code .../a/123456_789} yields {@code 123456}.
 *
 * @param url Sohu article URL; may be {@code null}
 * @return the extracted article id, or {@code null} (after logging an error
 *         that includes the URL) when the URL is {@code null}, has nothing
 *         after a {@code /a/} marker, or has no id segment
 */
private String getUrlId(String url) {
	if (url != null) {
		// String.split drops trailing empty strings, so a URL without "/a/"
		// (or ending in it) yields fewer than two parts.
		String[] pathParts = url.split("/a/");
		if (pathParts.length > 1) {
			String[] idParts = pathParts[1].split("_");
			// A segment made only of underscores splits into an empty array.
			if (idParts.length > 0) {
				return idParts[0];
			}
		}
	}
	// Include the offending URL so parse failures can be diagnosed from the log.
	logger.error("搜狐链接解析失败: {}", url);
	return null;
}
} }
...@@ -27,9 +27,9 @@ ...@@ -27,9 +27,9 @@
//// List<String> headList = (List<String>) map.get("head"); //// List<String> headList = (List<String>) map.get("head");
//// for (Map<String, Object> map1 : list) { //// for (Map<String, Object> map1 : list) {
//// String url = map1.get("地址") + ""; //// String url = map1.get("地址") + "";
// String cookie = "_buuid=0668b664-13b3-4bd0-aa37-99d747432e85; guid=HBoEGxgEGBscBBsZGlYHGBsZHRsYExwZHFYcGQQdGR8FQ1hLTEt5ChITBBIdHxkEGgQbHQVPR0VYQmkKA0VBSU9tCk9BQ0YKBmZnfmJhAgocGQQdGR8FXkNhSE99T0ZaWmsKAx4cfWV9ChEZBBwKfmQKWV1FTkRDfQIKGgQfBUtGRkNQRWc=; token=\"ou+mv1hjxjm0uOOTss1vgck9+h6OCS/lYQUeFnJgSK70FHprmw6GmjBGwk2qPQH88CKuzcDfAvoCmBm7+jVysA==\"; uid=\"A8ELjewCDRgHnZ5bX0Vy0/Airs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMjI3NjU0NTI0Iiwic2VjcmV0IjoiV0wyZmEtZDZxbkx2TEkzZHF2dTN4UG5SIiwiX2V4cGlyZSI6MTU2MDU5Mzg4Mjc5NCwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=ujhqvC3wPAn-WlCPXfB6C5ZJIgY"; // String cookie = "";
// String url = "https://maimai.cn/web/gossip_detail?src=app&webid=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlZ2lkIjoiMjU5OTg4YmE4YzI3MTFlOTllMjEyNDZlOTZiNDgwODgiLCJ1IjoxNzY2NDQ3NjUsImlkIjoyMjIwMDM4NH0.gBdp3i99L0xUKLglaVIJf07OrrPk6yoG9HAo-3Yftqk"; // String url = "https://maimai.cn/web/gossip_detail?src=app&webid=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlZ2lkIjoiMjU5OTg4YmE4YzI3MTFlOTllMjEyNDZlOTZiNDgwODgiLCJ1IjoxNzY2NDQ3NjUsImlkIjoyMjIwMDM4NH0.gBdp3i99L0xUKLglaVIJf07OrrPk6yoG9HAo-3Yftqk";
// Map<String,Object> map3 = Maimai.getMaiaiCount(url,cookie, ProxyHolder.NAT_HEAVY_PROXY); // List<Map<String, Object>> map3 = Maimai.getMaimaiCommentList(url, cookie, ProxyHolder.NAT_HEAVY_PROXY);
// System.out.println(map3.toString()); // System.out.println(map3.toString());
// System.out.println(url); // System.out.println(url);
//// map1.putAll(map3); //// map1.putAll(map3);
......
//package com.zhiwei.Comment; //package com.zhiwei.Comment;
// //
//import org.testng.annotations.Test; //import org.junit.Test;
// //
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.parse.SinaKeji; //import com.zhiwei.parse.SinaKeji;
// //
//public class SinaKejiComment { //public class SinaKejiComment {
//
// @Test // @Test
// public void f() { // public void f() {
// String url = "https://tech.sina.com.cn/it/2018-07-17/doc-ihfkffam0632649.shtml"; // ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// String url = "https://tech.sina.com.cn/d/v/2019-08-19/doc-ihytcitn0207512.shtml";
// //
// SinaKeji.getSinaKejiComment(url, null); // SinaKeji.getSinaKejiComment(url, ProxyHolder.NAT_HEAVY_PROXY);
// //
// } // }
//
//} //}
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
//// System.out.println(child.split("chlid=")[1]); //// System.out.println(child.split("chlid=")[1]);
// System.out.println(child.split("=")[1]); // System.out.println(child.split("=")[1]);
// //
// List<Map<String,Object>> lists = QQKB.getQQAccountData("5456950", cookie,null); // List<Map<String,Object>> lists = QQKB.getQQAccountData("5060059", cookie,null);
// if(lists != null) { // if(lists != null) {
// for(Map<String,Object> map1 : lists) { // for(Map<String,Object> map1 : lists) {
// map1.put("name", map.get("呢称")); // map1.put("name", map.get("呢称"));
......
...@@ -6,14 +6,17 @@ ...@@ -6,14 +6,17 @@
// //
//import org.junit.Test; //import org.junit.Test;
// //
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Baijia; //import com.zhiwei.parse.Baijia;
//import com.zhiwei.tools.tools.ZhiWeiTools;
// //
//public class BaijiaAccountExample { //public class BaijiaAccountExample {
// //
// @Test // @Test
// public void test3() { // public void test3() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// String path = "D://crawlerdata//自媒体/百家号采集.xlsx"; // String path = "D://crawlerdata//自媒体/百家号采集.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String startTime = "2018-05-01 00:00:00"; // String startTime = "2018-05-01 00:00:00";
...@@ -23,9 +26,10 @@ ...@@ -23,9 +26,10 @@
// for(Map<String,Object> m : list) { // for(Map<String,Object> m : list) {
// try { // try {
// String app_id = m.get("id").toString(); // String app_id = m.get("id").toString();
// app_id = "1594158489045754"; // app_id = "1602674438508810";
// String cookie = "__cfduid=d847baca85b97d1967b3da02ebb345b831535524251; BAIDUID=C0F0F81EF770C5219AB9C178654135EC:FG=1; PSTM=1536376257; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; locale=zh; delPer=0; H_PS_PSSID=1447_21117_20930; PSINO=5"; // String cookie = "BAIDUID=7D453C932433A93F7AD1F3B8ABC8B0E1:FG=1; BIDUPSID=7D453C932433A93F7AD1F3B8ABC8B0E1; PSTM=1570766401; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDSFRCVID=eH-OJeCmH6VwoRJwCdmehrB7leKK0gOTHllvCh8hmwLadLIVJeC6EG0Ptf8g0KubFTPRogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tJkD_I_hJKt3qn7I5KToh4Athxob2bbXHDo-LIvHWT6cOR5JhfA-3R-e046f3-3L5CbH5D3s5lvvhb3O3M7ShbKdMa732RbrKCnraxQF5l8-sq0x0bOte-bQypoa0q3TLDOMahkM5h7xOKQoQlPK5JkgMx6MqpQJQeQ-5KQN3KJmfbL9bT3YjjISKx-_J5LJJxK; H_PS_PSSID=1442_21103_29567_29699_29220_22158; delPer=0; PSINO=5; ZD_ENTRY=baidu; yjs_js_security_passport=9687699d4b0965c0be1e6e312fc59ff5cf3d03a2_1571106914_js; Hmery-Time=1215393878";
// List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id, startTime,cookie, null); // System.out.println(app_id);
// List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id,"聚富财经", startTime,cookie, ProxyHolder.NAT_HEAVY_PROXY);
// if(lists != null) { // if(lists != null) {
// bodyList.addAll(lists); // bodyList.addAll(lists);
// } // }
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
// @Test // @Test
// public void aiqiyiTest() { // public void aiqiyiTest() {
// //
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER); // ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER,10000002);
// List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt"); // List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); // List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(String w : wordList) { // for(String w : wordList) {
...@@ -34,7 +34,7 @@ ...@@ -34,7 +34,7 @@
// headList.add("title"); // headList.add("title");
// headList.add("word"); // headList.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频/爱奇艺关键词采集-毓婷-0716.xlsx", "数据", headList, bodyList); // poi.exportExcel("D://crawlerdata//视频/爱奇艺关键词采集-精装修.xlsx", "数据", headList, bodyList);
// //
// //
// //
......
//package com.zhiwei.shipin; package com.zhiwei.shipin;
//
//import java.util.ArrayList; import java.util.ArrayList;
//import java.util.List; import java.util.List;
//import java.util.Map; import java.util.Map;
//
//import org.junit.Test; import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType; import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory; import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil; import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.BiliBili; import com.zhiwei.parse.BiliBili;
//import com.zhiwei.util.WordReadFile; import com.zhiwei.util.WordReadFile;
//
//public class BilibiliTest { public class BilibiliTest {
// @Test @Test
// public void f() { public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER); ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER , 10000002L);
// List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词-1.txt"); List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
// List<Map<String, Object>> bodyList = new ArrayList<>(); List<Map<String, Object>> bodyList = new ArrayList<>();
// String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274"; String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274";
// for (String word : wordList) { for (String word : wordList) {
// List<Map<String, Object>> dataList = BiliBili.getData(word, null, "2019-07-18 00:00:00", List<Map<String, Object>> dataList = BiliBili.getData(word, null, "2001-01-14 00:00:00",
// cookie); cookie);
// if (dataList != null) { if (dataList != null) {
// System.out.println(word + " ----- " + dataList.size()); System.out.println(word + " ----- " + dataList.size());
// bodyList.addAll(dataList); bodyList.addAll(dataList);
// } }
// } }
// List<String> headlist = new ArrayList<>(); List<String> headlist = new ArrayList<>();
// headlist.add("submitcount"); headlist.add("submitcount");
// headlist.add("playcount"); headlist.add("playcount");
// headlist.add("time"); headlist.add("time");
// headlist.add("source"); headlist.add("source");
// headlist.add("title"); headlist.add("title");
// headlist.add("url"); headlist.add("url");
// headlist.add("word"); headlist.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频//bilibili关键词采集数据-吃鸡否-0722.xlsx", "B站数据", headlist, bodyList); poi.exportExcel("D://crawlerdata//视频//bilibili关键词采集数据-封神神话-收藏.xlsx", "B站数据", headlist, bodyList);
//
// } }
//} }
...@@ -17,8 +17,8 @@ ...@@ -17,8 +17,8 @@
//public class QQTVTest { //public class QQTVTest {
// @Test // @Test
// public void f() { // public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER); // ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER, 10000002);
// String time = "2019-04-11 00:00:00"; // String time = "2019-01-11 00:00:00";
// List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt"); // List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
// List<Map<String, Object>> bodyList = new ArrayList<>(); // List<Map<String, Object>> bodyList = new ArrayList<>();
// for (String word : wordList) { // for (String word : wordList) {
...@@ -37,7 +37,7 @@ ...@@ -37,7 +37,7 @@
// headlist.add("url"); // headlist.add("url");
// headlist.add("word"); // headlist.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频//腾讯视频关键词采集数据-毓婷-0716.xlsx", "腾讯视频数据", headlist, bodyList); // poi.exportExcel("D://crawlerdata//视频//腾讯视频关键词采集数据-精装修.xlsx", "腾讯视频数据", headlist, bodyList);
// //
// //
// //
......
...@@ -33,7 +33,7 @@ ...@@ -33,7 +33,7 @@
// headlist.add("url"); // headlist.add("url");
// headlist.add("word"); // headlist.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频//搜狐视频关键词采集数据-毓婷-0716.xlsx", "搜狐数据", headlist, bodyList); // poi.exportExcel("D://crawlerdata//视频//搜狐视频关键词采集数据-华为-0812.xlsx", "搜狐数据", headlist, bodyList);
// //
// } // }
//} //}
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
// headList.add("uper"); // headList.add("uper");
// headList.add("word"); // headList.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频//优酷数据-毓婷-0716.xlsx", "数据", headList, bodyList); // poi.exportExcel("D://crawlerdata//视频//优酷数据-华为-0812.xlsx", "数据", headList, bodyList);
// //
// } // }
//} //}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment