Commit 6018f0b3 by yangchen

提交修改

parent 3e350f8b
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>articlenewscrawler</artifactId>
<version>0.1.7-SNAPSHOT</version>
<version>0.2.2-SNAPSHOT</version>
<name>articlenewscrawler</name>
<description>采集凤凰,一点资讯,搜狐历时文章和文章评论</description>
......@@ -21,7 +21,7 @@
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.3.6-RELEASE</version>
<version>0.5.5.6-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
</dependencies>
......
......@@ -67,7 +67,7 @@ public class HeadGet {
public static Map<String,String> getYidianzixunAccountHeaderMap(String cookie,String referer) {
Map<String, String> headerMap = new HashMap<String, String>();
headerMap.put("User-Agent",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
"Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1");
headerMap.put("Accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
headerMap.put("Accept-Language", "zh-CN,zh;q=0.9");
......@@ -254,14 +254,13 @@ public class HeadGet {
* @throws IOException
*/
public static Map<String,String> getDayuCommentHeaderMap(String cookie) {
Map<String,String> headerMap = new HashMap<String, String>();
Map<String,String> headerMap = new HashMap<>();
headerMap.put("User-Agent",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
headerMap.put("Accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
headerMap.put("Accept-Language", "zh-CN,zh;q=0.9");
headerMap.put("Connection", "keep-alive");
headerMap.put("Host", "m.uczzd.cn");
if(cookie != null) {
headerMap.put("Cookie", cookie);
}
......@@ -293,13 +292,13 @@ public class HeadGet {
}
public static Map<String,Object> getQQAccountOneParamMap(String chlid) {
Map<String,Object> paramMap = new HashMap<String,Object>();
Map<String,Object> paramMap = new HashMap<>();
paramMap.put("chlid", chlid);
return paramMap;
}
public static Map<String,Object> getQQAccountOtherParamMap(String ids) {
Map<String,Object> paramMap = new HashMap<String,Object>();
Map<String,Object> paramMap = new HashMap<>();
paramMap.put("ids", ids);
return paramMap;
}
......
......@@ -3,6 +3,7 @@ package com.zhiwei.httpclient;
import java.io.IOException;
import java.net.Proxy;
import java.util.Map;
import java.util.Objects;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -11,12 +12,14 @@ import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.FormBody;
import okhttp3.Headers;
import okhttp3.Response;
public class HttpClient {
private static Logger logger = LoggerFactory.getLogger(HttpClient.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(false).build();
/**
*
......@@ -43,6 +46,25 @@ public class HttpClient {
* @return
* @throws IOException
*/
// Sends a plain GET request for url through the given proxy and returns the body text.
// Up to three attempts are made; a failed attempt is logged and retried immediately.
// Returns null when every attempt fails.
public static String executeHttpRequestGet(String url,ProxyHolder proxy) {
    for (int attempt = 1; attempt <= 3; attempt++) {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            return response.body().string();
        } catch (Exception e) {
            // The placeholder is filled with the URL and the exception is passed as the
            // trailing argument, so SLF4J logs the stack trace instead of a literal "{}".
            logger.error("httpClient 获取数据出现问题:{}", url, e);
        }
    }
    return null;
}
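/**
 * Sends a GET request for {@code url} with the supplied request headers.
 *
 * @param url       address to request
 * @param proxy     proxy holder the request is sent through
 * @param headerMap request headers, header name to header value
 * @return the response text, or {@code null} when no attempt succeeds
 */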
public static String executeHttpRequestGet(String url,ProxyHolder proxy,Map<String, String> headerMap) {
for(int i = 0;i < 3;i++) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){
......@@ -54,8 +76,37 @@ public class HttpClient {
return null;
}
public static String executeHttpRequestPost(String url,Proxy proxy,Map<String, String> headerMap,Map<String, Object> paramMap) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(url, headerMap, paramMap), proxy)){
/**
 * Sends a GET request for {@code url} with the given OkHttp headers and returns the body text.
 * Up to three attempts are made; a failed attempt is logged and retried immediately.
 *
 * @param url    address to request
 * @param proxy  proxy holder the request is sent through
 * @param header request headers attached to the GET request
 * @return the response body as text, or {@code null} when every attempt fails
 */
public static String executeHttpRequestGet(String url,ProxyHolder proxy,Headers header) {
    for (int attempt = 1; attempt <= 3; attempt++) {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, header), proxy)) {
            return response.body().string();
        } catch (Exception e) {
            // The placeholder is filled with the URL and the exception is passed as the
            // trailing argument, so SLF4J logs the stack trace instead of a literal "{}".
            logger.error("httpClient 获取数据出现问题:{}", url, e);
        }
    }
    return null;
}
public static String executeHttpRequestPost(String url,Proxy proxy,Map<String, String> headerMap,Map<String, Object> params) {
FormBody body = null;
if (Objects.nonNull(params) && !params.isEmpty()) {
FormBody.Builder builder = new FormBody.Builder();
params.forEach((lt, rt) -> {
if (Objects.nonNull(lt)) {
builder.add(String.valueOf(lt), Objects.isNull(rt) ? "" : String.valueOf(rt));
}
});
body = builder.build();
}
try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(url, headerMap, body), proxy)){
return response.body().string();
} catch (Exception e) {
logger.error("httpClient 获取数据出现问题:{}", e);
......@@ -64,8 +115,18 @@ public class HttpClient {
}
public static String executeHttpRequestPost(String url,ProxyHolder proxy,Map<String, String> headerMap,Map<String, Object> paramMap) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(url, headerMap, paramMap), proxy)){
public static String executeHttpRequestPost(String url,ProxyHolder proxy,Map<String, String> headerMap,Map<String, Object> params) {
FormBody body = null;
if (Objects.nonNull(params) && !params.isEmpty()) {
FormBody.Builder builder = new FormBody.Builder();
params.forEach((lt, rt) -> {
if (Objects.nonNull(lt)) {
builder.add(String.valueOf(lt), Objects.isNull(rt) ? "" : String.valueOf(rt));
}
});
body = builder.build();
}
try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(url, headerMap, body), proxy)){
return response.body().string();
} catch (Exception e) {
logger.error("httpClient 获取数据出现问题:{}", e);
......
......@@ -89,20 +89,21 @@ public class Baijia {
while(f) {
for(int i = 1;i < 3;i++) {
try {
String url = "https://author.baidu.com/list?type=article&tab=2&uk="+uk+"&ctime="+ctime+"&num=50";
String url = "https://mbd.baidu.com/webpage?tab=article&num=10&uk="+uk+"&ctime="+ctime+"&type=newhome&action=dynamic&format=json";
String result = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
Map<String,Object> dMap = baijiaAccountAnalysis.getBaijiaAccountData3(result,name, startTime);
List<Map<String,Object>> dList = (List<Map<String, Object>>) dMap.get("data");
dataList.addAll(dList);
if(Objects.nonNull(dList))
dataList.addAll(dList);
logger.info("{} 数据采集结果 {}",appId, dataList.size());
if(!(boolean) dMap.get("more")) {
f = false;
}
ctime = String.valueOf(dMap.get("ctime"));
ZhiWeiTools.sleep(3000);
ZhiWeiTools.sleep(1000);
break;
} catch (Exception e) {
ZhiWeiTools.sleep(3000);
ZhiWeiTools.sleep(2000);
}
}
}
......@@ -111,15 +112,16 @@ public class Baijia {
}
private static String getUkData(String appId,ProxyHolder proxy,String cookie) {
String url = "https://author.baidu.com/profile?context={\"from\":0,\"app_id\":\""
+appId+"\"}&cmdType=&pagelets[]=root&reqID=0&ispeed=1#";
// String url = "https://author.baidu.com/profile?context={\"from\":0,\"app_id\":\""
// +appId+"\"}&cmdType=&pagelets[]=root&reqID=0&ispeed=1#";
String url = "https://author.baidu.com/home/" + appId;
Map<String,Object> headers = new HashMap<>();
headers.put("Host", "author.baidu.com");
headers.put("cookie", cookie);
for(int i = 0; i < 3;i++) {
try {
String result = httpBoot.syncCall(RequestUtils.wrapGet(url,headers), proxy).body().string();
return result.split("uk\\\\\":\\\\\"")[1].split("\\\\\",")[0];
return result.split("uk\":\"")[1].split("\",")[0];
} catch (Exception e) {
logger.error("百家号uk 获取失败");
}
......
......@@ -4,7 +4,6 @@ import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
......@@ -13,7 +12,7 @@ import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.BilibilikeyWordAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
......@@ -22,16 +21,16 @@ import okhttp3.Headers;
public class BiliBili {
private static final Logger logger = LoggerFactory.getLogger(BiliBili.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).useCookieJar(true).build();
@SuppressWarnings("unchecked")
public static List<Map<String,Object>> getData(String word,Proxy proxy,String endTime,String cookie) {
List<Map<String,Object>> bodyList = new ArrayList<>();
try {
//
String url = "https://search.bilibili.com/all?keyword="+URLEncoder.encode(word, "utf-8")+"&order=pubdate&duration=0&tids_1=0";
String url = "https://search.bilibili.com/all?keyword="+URLEncoder.encode(word, "utf-8")+"&single_column=1&order=stow&duration=0&tids_1=0";
System.out.println(url);
Headers header = Headers.of("cookie",cookie,"Referer","https://www.bilibili.com/","Host","search.bilibili.com");
String result = httpBoot.syncCall(RequestUtils.wrapGet(url, header), ProxyHolder.NAT_HEAVY_PROXY).body().string();
String result = HttpClient.executeHttpRequestGet(url, ProxyHolder.NAT_HEAVY_PROXY, header);
ZhiWeiTools.sleep(100);
Map<String,Object> map = BilibilikeyWordAnalysis.getData(result,word,endTime);
boolean more = (boolean) map.get("more");
......@@ -43,7 +42,7 @@ public class BiliBili {
while(more) {
map.clear();
String ur = url + "&page=" + n;
String result2 = httpBoot.syncCall(RequestUtils.wrapGet(ur, header), ProxyHolder.NAT_HEAVY_PROXY).body().string();
String result2 = HttpClient.executeHttpRequestGet(ur, ProxyHolder.NAT_HEAVY_PROXY, header);
map = BilibilikeyWordAnalysis.getData(result2,word,endTime);
List<Map<String,Object>> dataList2 = (List<Map<String, Object>>) map.get("data");
if(dataList2 != null) {
......@@ -60,7 +59,7 @@ public class BiliBili {
} catch (Exception e) {
logger.error("e {}",e);
}
return Collections.emptyList();
return bodyList;
}
......
......@@ -102,7 +102,7 @@ public class Dayu {
* @param articleId
* @return
*/
public static int getDayuCommentCount(String articleId,Proxy proxy) {
public static int getDayuCommentCount(String articleId,ProxyHolder proxy) {
String url = "http://m.uczzd.cn/iflow/api/v2/cmt/article/"+articleId+"/comments/byhot";
Map<String,String> headerMap = HeadGet.getDayuCommentHeaderMap(null);
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
......@@ -110,6 +110,39 @@ public class Dayu {
return json.getJSONObject("data").getInteger("comment_cnt");
}
/**
 * Looks up the read count of a Dayu article (sum of its {@code click1} and {@code click2} counters).
 *
 * <p>The article id is whatever follows {@code wm_aid=} (for URLs containing {@code !wm_aid=})
 * or {@code wm_cid=} in the given URL; the two forms are queried on different
 * {@code ff.dayu.com} paths.
 *
 * @param url   article URL carrying a {@code wm_aid} or {@code wm_cid} marker
 * @param proxy proxy holder the lookup request is sent through
 * @return {@code click2 + click1} from the response, or {@code -1} when the URL has neither
 *         marker or the lookup fails for any reason
 */
public static int getDayuReadCount(String url,ProxyHolder proxy) {
    try {
        String query = "?biz_id=1002&_fetch_author=1&_incr_fields=click1,click2,click3,click_total,play,like";
        String eUrl;
        if (url.contains("!wm_aid=")) {
            eUrl = "https://ff.dayu.com/contents/origin/" + url.split("wm_aid=")[1] + query;
        } else if (url.contains("wm_cid=")) {
            eUrl = "https://ff.dayu.com/contents/" + url.split("wm_cid=")[1] + query;
        } else {
            // Neither marker present: nothing to look up.
            return -1;
        }
        Map<String,String> headerMap = HeadGet.getDayuCommentHeaderMap(null);
        String result = HttpClient.executeHttpRequestGet(eUrl, proxy, headerMap);
        JSONObject incrs = JSONObject.parseObject(result).getJSONObject("data").getJSONObject("_incrs");
        return incrs.getIntValue("click2") + incrs.getIntValue("click1");
    } catch (Exception e) {
        e.printStackTrace();
    }
    return -1;
}
/**
*
* @Description 大鱼号依据关键词采集
......
......@@ -38,7 +38,7 @@ public class Douban {
*/
public static List<Map<String,Object>> doubanTopicGetByWord(String word,ProxyHolder proxy,String cookie,String stime) {
int page = 0;
int count = 20;
int count = 50;
boolean more = true;
Map<String,String> headerMap = new HashMap<>();
headerMap.put("Host", "www.douban.com");
......@@ -77,10 +77,9 @@ public class Douban {
map.put("time", time);
map.put("reply_count", replyCount);
bodyList.add(map);
// System.out.println(map.toString());
}
}
if(bodyList.size() - cou < 30){
if(bodyList.size() - cou < 10 || page > 500){
more = false;
}
logger.info("采集到 第 {} 页 ,一共采集到 {} 条 是否有下一页 {}",page,bodyList.size(),more);
......
......@@ -16,7 +16,9 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet;
......@@ -139,6 +141,44 @@ public class Maimai {
return Collections.emptyMap();
}
// Manual smoke test: initialises the proxy factory and prints the counters of one sample feed post.
public static void main(String[] args) {
    ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER, 10000002L);
    String feedUrl = "https://maimai.cn/web/feed_detail?fid=1353566056&efid=QTa45Y1e-oQzyn1dZ5ozlQ";
    Map<String,Object> counters = getMaiaiCount2(feedUrl, ProxyHolder.NAT_HEAVY_PROXY);
    System.out.println(counters);
}
/**
 * Fetches the like / share / comment counters of a Maimai feed post, together with
 * basic information about the post and its author.
 *
 * <p>Example URL: https://maimai.cn/web/feed_detail?fid=1304191535&efid=0CQbJXhoYLXdC87NFIkRMA
 *
 * <p>The page embeds its data as the argument of a {@code JSON.parse("...")} call; that
 * argument is cut out, unicode-decoded and parsed. Two attempts are made.
 *
 * @param url   feed detail page URL
 * @param proxy proxy holder the request is sent through
 * @return a map with the keys {@code like}, {@code spreads}, {@code cmts}, {@code gid},
 *         {@code title}, {@code author}, {@code userId} and {@code company}; an empty
 *         map when both attempts fail
 */
public static Map<String,Object> getMaiaiCount2(String url,ProxyHolder proxy) {
    for (int attempt = 1; attempt < 3; attempt++) {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            String page = response.body().string();
            // Cut out the string literal handed to JSON.parse(...) in the page script.
            String payload = page.split("JSON.parse\\(\"")[1].split("\"\\);\\</script\\>")[0];
            payload = ZhiWeiTools.decodeUnicode(payload);

            JSONObject feed = JSONObject.parseObject(payload).getJSONObject("data").getJSONObject("feed");
            JSONObject main = feed.getJSONObject("main");
            JSONObject user = main.getJSONObject("u");

            Map<String,Object> map = new HashMap<>();
            map.put("like", feed.getJSONObject("likes").getInteger("n"));
            map.put("spreads", feed.getJSONObject("spreads").getInteger("n"));
            map.put("cmts", feed.getJSONObject("comments").getInteger("n"));
            map.put("gid", feed.getLong("id"));
            map.put("title", main.getString("text"));
            map.put("author", user.getString("name"));
            map.put("userId", user.getString("mmid"));
            map.put("company", user.getString("career_str"));
            return map;
        } catch (Exception e) {
            // The placeholder is filled with the URL and the exception is passed as the
            // trailing argument, so SLF4J logs the stack trace instead of a literal "{}".
            logger.error(" 脉脉 转评攒 获取失败 {}", url, e);
        }
    }
    return Collections.emptyMap();
}
/**
* //https://maimai.cn/web/gossip_detail?encode_id=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MTk2MzEyNjYsImlhdCI6MTU0ODI5NzI5NX0.N6SPmcf-fyitLNomzY-a8BEY31eseYnvG7RTUQ3jxYY
* @Description 获取脉脉转评赞
......
......@@ -60,8 +60,8 @@ public class QQKB {
try {
for(int j = 1; j < 3;j++) {
ids = ids.substring(0,ids.length()-1);
System.out.println(ids);
ZhiWeiTools.sleep(7000);
logger.info("data {}",ids);
ZhiWeiTools.sleep(1000);
paramMap.clear();
paramMap = HeadGet.getQQAccountOtherParamMap(ids);
result = HttpClient.executeHttpRequestPost(url,proxy, headerMap, paramMap);
......@@ -76,7 +76,6 @@ public class QQKB {
} catch (Exception e) {
ids = "";
paramMap.clear();
continue;
}
}
}
......
package com.zhiwei.parse;
import java.io.IOException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.bean.HistortyBean;
import com.zhiwei.bean.QQKandianUser;
import com.zhiwei.crawler.core.HttpClientBuilder;
import com.zhiwei.crawler.core.HttpRequestBuilder;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Headers;
import okhttp3.OkHttpClient;
import okhttp3.Request;
/**
 * Crawler for QQ Kandian: account search by name, article history of an account,
 * and article search by keyword.
 *
 * <p>NOTE(review): the request cookies below are hard-coded session values; they
 * presumably expire and should be confirmed or supplied by the caller.
 */
public class QQKandian {

    /** Finds the {@code data-timestamp="..."} attribute of an article page; group 1 is its value. */
    private static final Pattern TIMESTAMP_PATTERN = Pattern.compile("data-timestamp=\"(.*?)\"");

    /** Finds the author attribute of an article page (matched without its first letter, as before); group 1 is its value. */
    private static final Pattern AUTHOR_PATTERN = Pattern.compile("ata-author=\"(.*?)\"");

    /**
     * Searches accounts by name and keeps the entries whose name equals {@code name} exactly.
     *
     * @param name  account name to search for
     * @param proxy proxy the request is sent through
     * @return the matching accounts (possibly empty); {@code null} when {@code name} is
     *         null or empty, or when the request or parsing fails
     */
    public List<QQKandianUser> getUser(String name,Proxy proxy) {
        if (name == null || name.length() == 0) {
            return null;
        }
        List<QQKandianUser> dataList = new ArrayList<>();
        OkHttpClient okhttp = HttpClientBuilder.newInstance();
        Map<String,String> map = new HashMap<>();
        map.put("Host", "sou.qq.com");
        map.put("Referer", "https://sou.qq.com/kandian/kd.html?_bid=3216&_wv=3&_wwv=1293&_wvSb=0&hotword=%E7%9F%A5%E5%90%8D%E5%A4%A7V%E7%AB%A0%E6%96%87%E6%B6%89%E6%80%A7%E4%BE%B5");
        map.put("Cookie", "skey=MUzU7gdtRz; uin=o0497332654; RK=rNiJH0RBav; pgv_pvid=8990378504; pt2gguin=o0497332654; ptcz=062d936df33011f468637ee72be262a020a8df79977df7e7bde9c105b2b2ddf6");
        try {
            // Example: https://sou.qq.com/cgi-bin/kandian/tab_search?key_word=%E9%98%BF%E9%87%8C&business=64&page_size=20&cookie=&Group_masks=1003&bkn=1215238072
            String url = "https://sou.qq.com/cgi-bin/kandian/tab_search?key_word="+URLEncoder.encode(name, "utf-8")+"&business=64&page_size=20&cookie=&Group_masks=1003&bkn=1215238072";
            Request request = HttpRequestBuilder.newGetRequest(url, Headers.of(map));
            okhttp = okhttp.newBuilder().proxy(proxy).build();
            String result = okhttp.newCall(request).execute().body().string();
            System.out.println(result);
            JSONObject json = JSONObject.parseObject(result);
            // Only the first item group of the response is inspected.
            JSONObject firstGroup = json.getJSONObject("result").getJSONArray("item_groups").getJSONObject(0);
            JSONArray jsonArray = firstGroup.getJSONArray("result_items");
            for (int i = 0; i < jsonArray.size(); i++) {
                JSONObject data = jsonArray.getJSONObject(i);
                if (!name.equals(data.getString("name"))) {
                    continue;
                }
                QQKandianUser kandianUser = new QQKandianUser();
                String layout = data.getString("layout_content");
                if (layout != null) {
                    // layout_content is itself a JSON document stored as a string.
                    JSONObject layoutJson = JSONObject.parseObject(layout);
                    kandianUser.setVerify(layoutJson.getBoolean("verify"));
                    kandianUser.setDesc(layoutJson.getString("secondLineText"));
                }
                System.out.println(data.toString());
                kandianUser.setId(data.getString("result_id"));
                kandianUser.setName(data.getString("name"));
                kandianUser.setUrl(data.getString("jmp_url"));
                dataList.add(kandianUser);
            }
            return dataList;
        } catch (Exception e) {
            // Report the failure the same way the other methods of this class do
            // instead of dropping it silently.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Pages through the article list of an account and downloads every listed article.
     *
     * <p>Paging stops at the first page that is empty or cannot be fetched or parsed; the
     * articles collected up to that point are returned.
     *
     * @param uid   account id placed in the {@code uin} query parameter
     * @param proxy proxy the list requests are sent through
     * @return the collected articles (possibly empty); {@code null} when the client cannot
     *         be prepared
     */
    public List<HistortyBean> getHistoryData(String uid,Proxy proxy) {
        String url = "https://kandian.qq.com/cgi-bin/social/getHomePage?uin="+uid+"&pageNo=1&pageSize=10&pageCookies=&is715=1&isInQQ=1&g_tk=1066845421&bkn=1066845421&_="+new Date().getTime();
        List<HistortyBean> dataList = new ArrayList<HistortyBean>();
        OkHttpClient okhttp = HttpClientBuilder.newInstance();
        Map<String,String> map = new HashMap<String,String>();
        map.put("Host", "kandian.qq.com");
        map.put("Referer", "https://kandian.qq.com/mqq/vue/main?_wv=10145&_bid=2378&adfrom=search&x5PreFetch=1&accountId=MjY2MTY0MjM4Ng%3D%3D");
        map.put("Cookie", "skey=MQmBo5A1N7; uin=o0497332654; pgv_pvid=8990378504");
        try {
            okhttp = okhttp.newBuilder().proxy(proxy).build();
            while (true) {
                try {
                    Request request = HttpRequestBuilder.newGetRequest(url, Headers.of(map));
                    String result = okhttp.newCall(request).execute().body().string();
                    JSONObject json = JSONObject.parseObject(result).getJSONObject("result");
                    JSONArray jsonArray = json.getJSONArray("articleinfos");
                    if (jsonArray.isEmpty()) {
                        // An empty page means there is nothing further to collect; without
                        // this check the loop would keep requesting pages forever.
                        break;
                    }
                    for (int i = 0; i < jsonArray.size(); i++) {
                        String articleUrl = jsonArray.getJSONObject(i).getString("articleurl");
                        HistortyBean history = getOnhistoryData(articleUrl);
                        if (history != null) {
                            dataList.add(history);
                        }
                        ZhiWeiTools.sleep(1500);
                    }
                    // Build the next page URL: bump pageNo and carry over the paging cookie.
                    String pageCookies = json.getString("pageCookies");
                    String pacs = request.url().queryParameter("pageCookies");
                    int pageno = Integer.parseInt(request.url().queryParameter("pageNo"));
                    url = request.url().toString().replace("pageNo="+pageno, "pageNo="+(pageno+1)).replace("&pageCookies="+pacs, "&pageCookies="+pageCookies);
                    ZhiWeiTools.sleep(5000);
                } catch (Exception e) {
                    // Any failure while reading a page ends the paging (best effort).
                    break;
                }
            }
            return dataList;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Downloads one article page and extracts its time, source, title and content.
     *
     * @param url article page URL
     * @return the parsed article, or {@code null} when the page has no usable timestamp
     *         or author attribute or the download fails
     */
    private static HistortyBean getOnhistoryData(String url) {
        OkHttpClient okhttp = HttpClientBuilder.newInstance();
        Request request = HttpRequestBuilder.newGetRequest(url, Headers.of("Host","post.mp.qq.com"));
        try {
            HistortyBean histortyBean = new HistortyBean();
            String result = okhttp.newCall(request).execute().body().string();
            Date date = getTime(result);
            String source = getSource(result);
            if (date != null && source != null) {
                Document doc = Jsoup.parse(result);
                String content = doc.select("div#main-content").select("section").text();
                String title = doc.select("meta[itemprop=name]").attr("content");
                histortyBean.setSource(source);
                histortyBean.setTime(date);
                histortyBean.setTitle(title);
                histortyBean.setContent(content);
                histortyBean.setUrl(url);
                return histortyBean;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Searches articles by keyword and pages through the results until a page adds
     * no new article URL.
     *
     * <p>Groups named 视频, 小视频, 相关搜索 and 话题 are skipped; for every other group only
     * its first result item is used. Articles are de-duplicated by URL without query string.
     *
     * @param word  keyword to search for
     * @param proxy proxy the requests are sent through
     * @return the collected articles (possibly empty); {@code null} when the search
     *         cannot be set up
     */
    public List<HistortyBean> getDataByword(String word,Proxy proxy) {
        List<HistortyBean> dataList = new ArrayList<HistortyBean>();
        OkHttpClient okhttp = HttpClientBuilder.newInstance();
        Map<String,String> map = new HashMap<String,String>();
        map.put("Host", "sou.qq.com");
        map.put("Referer", "https://sou.qq.com/kandian/kd.html?_bid=3216&_wv=3&_wwv=1293&_wvSb=0&hotword=%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4%E9%82%93%E4%BC%A6%E7%94%B5%E6%A2%AF%E5%90%BB");
        map.put("Cookie", "skey=MU7zbaRXu8; uin=o0497332654; RK=rNiJH0RBav; pgv_pvid=8990378504; pt2gguin=o0497332654; ptcz=062d936df33011f468637ee72be262a020a8df79977df7e7bde9c105b2b2ddf6");
        try {
            // Example: https://sou.qq.com/cgi-bin/kandian/unite_search?key_word=%E5%94%90%E5%AB%A3&business=64&page_size=20&cookie=&bkn=2000031506
            String url = "https://sou.qq.com/cgi-bin/kandian/unite_search?key_word="+URLEncoder.encode(word, "utf-8")+"&business=64&page_size=20&cookie=&bkn=2000031506";
            // The proxied client is the same for every page, so it is built once.
            okhttp = okhttp.newBuilder().proxy(proxy).build();
            List<String> urlList = new ArrayList<String>();
            while (true) {
                try {
                    Request request = HttpRequestBuilder.newGetRequest(url, Headers.of(map));
                    String result = okhttp.newCall(request).execute().body().string();
                    JSONObject json = JSONObject.parseObject(result).getJSONObject("result");
                    JSONArray jsonArray = json.getJSONArray("item_groups");
                    int count = urlList.size();
                    for (int i = 0; i < jsonArray.size(); i++) {
                        JSONObject data = jsonArray.getJSONObject(i);
                        String type = data.getString("group_name");
                        if ("视频".equals(type) || "小视频".equals(type) || "相关搜索".equals(type) || "话题".equals(type)) {
                            continue;
                        }
                        JSONObject da = data.getJSONArray("result_items").getJSONObject(0);
                        String title = da.getString("name");
                        String ur = da.getString("jmp_url");
                        String baseUrl = ur.split("\\?")[0];
                        if (urlList.contains(baseUrl)) {
                            continue;
                        }
                        urlList.add(baseUrl);
                        // The extension field is itself a JSON document stored as a string.
                        JSONObject obj = JSONObject.parseObject(da.getString("extension"));
                        String time = obj.getString("create_time");
                        String content = obj.getString("content");
                        if (content == null) {
                            content = obj.getString("brief");
                        }
                        String source = obj.getString("from");
                        HistortyBean histortyBean = new HistortyBean();
                        System.out.println(title + " -- " + baseUrl);
                        // "000" is appended before parsing — presumably seconds to milliseconds; TODO confirm.
                        histortyBean.setTime(TimeParse.stringFormartDate(time+"000"));
                        histortyBean.setContent(content);
                        histortyBean.setTitle(title);
                        histortyBean.setSource(source);
                        histortyBean.setUrl(ur);
                        dataList.add(histortyBean);
                    }
                    if (count == urlList.size()) {
                        // This page contributed no new URL: stop paging.
                        break;
                    }
                    String pageCookies = json.getString("cookie");
                    String pacs = request.url().queryParameter("cookie");
                    url = request.url().toString().replace("&cookie="+pacs, "&cookie="+pageCookies);
                    ZhiWeiTools.sleep(3000);
                } catch (Exception e) {
                    // Any failure while reading a page ends the paging (best effort).
                    break;
                }
            }
            return dataList;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Extracts the publication time from an article page.
     *
     * @param result article page HTML
     * @return the parsed time of the first {@code data-timestamp} attribute, or
     *         {@code null} when the attribute is missing or empty
     */
    private static Date getTime(String result) {
        Matcher ma = TIMESTAMP_PATTERN.matcher(result);
        if (!ma.find()) {
            return null;
        }
        String t = ma.group(1);
        if (t.isEmpty()) {
            // An empty attribute used to raise ArrayIndexOutOfBoundsException, which
            // aborted the caller's whole pagination; treat it as "no timestamp".
            return null;
        }
        return TimeParse.stringFormartDate(t+"000");
    }

    /**
     * Extracts the source (author) from an article page.
     *
     * @param result article page HTML
     * @return the value of the first author attribute (may be empty), or {@code null}
     *         when the attribute is missing
     */
    private static String getSource(String result) {
        Matcher ma = AUTHOR_PATTERN.matcher(result);
        // The capture group is read directly; re-splitting the match failed on empty values.
        return ma.find() ? ma.group(1) : null;
    }
}
//package com.zhiwei.parse;
//
//import java.io.IOException;
//import java.net.Proxy;
//import java.net.URLEncoder;
//import java.util.ArrayList;
//import java.util.Date;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//import java.util.regex.Matcher;
//import java.util.regex.Pattern;
//
//import org.jsoup.Jsoup;
//import org.jsoup.nodes.Document;
//
//import com.alibaba.fastjson.JSONArray;
//import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.bean.HistortyBean;
//import com.zhiwei.bean.QQKandianUser;
//import com.zhiwei.crawler.core.HttpClientBuilder;
//import com.zhiwei.crawler.core.HttpRequestBuilder;
//import com.zhiwei.tools.timeparse.TimeParse;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//import okhttp3.Headers;
//import okhttp3.OkHttpClient;
//import okhttp3.Request;
//
//public class QQKandian {
//
// public List<QQKandianUser> getUser(String name,Proxy proxy) {
// if(name != null && name.length() > 0) {
// List<QQKandianUser> dataList = new ArrayList<>();
// OkHttpClient okhttp = HttpClientBuilder.newInstance();
// Map<String,String> map = new HashMap<>();
// map.put("Host", "sou.qq.com");
// map.put("Referer", "https://sou.qq.com/kandian/kd.html?_bid=3216&_wv=3&_wwv=1293&_wvSb=0&hotword=%E7%9F%A5%E5%90%8D%E5%A4%A7V%E7%AB%A0%E6%96%87%E6%B6%89%E6%80%A7%E4%BE%B5");
// map.put("Cookie", "skey=MUzU7gdtRz; uin=o0497332654; RK=rNiJH0RBav; pgv_pvid=8990378504; pt2gguin=o0497332654; ptcz=062d936df33011f468637ee72be262a020a8df79977df7e7bde9c105b2b2ddf6");
// try {
// //https://sou.qq.com/cgi-bin/kandian/tab_search?key_word=%E9%98%BF%E9%87%8C&business=64&page_size=20&cookie=&Group_masks=1003&bkn=1215238072
// String url = "https://sou.qq.com/cgi-bin/kandian/tab_search?key_word="+URLEncoder.encode(name, "utf-8")+"&business=64&page_size=20&cookie=&Group_masks=1003&bkn=1215238072";
// Request request = HttpRequestBuilder.newGetRequest(url, Headers.of(map));
// okhttp = okhttp.newBuilder().proxy(proxy).build();
// String result = okhttp.newCall(request).execute().body().string();
// System.out.println(result);
// JSONObject json = JSONObject.parseObject(result);
// JSONObject json2 = json.getJSONObject("result").getJSONArray("item_groups").getJSONObject(0);
// JSONArray jsonArray = json2.getJSONArray("result_items");
// for(int i = 0;i < jsonArray.size();i++) {
// JSONObject data = jsonArray.getJSONObject(i);
// if(name.equals(data.getString("name"))) {
// QQKandianUser kandianUser = new QQKandianUser();
// if(data.getString("layout_content")!=null) {
// String m = data.getString("layout_content");
// JSONObject m1 = JSONObject.parseObject(m);
// kandianUser.setVerify(m1.getBoolean("verify"));
// kandianUser.setDesc(m1.getString("secondLineText"));
// }
// String nam = data.getString("name");
// String ur = data.getString("jmp_url");
// String id = data.getString("result_id");
// System.out.println(data.toString());
// kandianUser.setId(id);
// kandianUser.setName(nam);
// kandianUser.setUrl(ur);
// dataList.add(kandianUser);
// }
// }
// return dataList;
// } catch (Exception e) {
// return null;
// }
// }
// return null;
// }
//
// public List<HistortyBean> getHistoryData(String uid,Proxy proxy) {
// String url = "https://kandian.qq.com/cgi-bin/social/getHomePage?uin="+uid+"&pageNo=1&pageSize=10&pageCookies=&is715=1&isInQQ=1&g_tk=1066845421&bkn=1066845421&_="+new Date().getTime();
// List<HistortyBean> dataList = new ArrayList<HistortyBean>();
// OkHttpClient okhttp = HttpClientBuilder.newInstance();
// Map<String,String> map = new HashMap<String,String>();
// map.put("Host", "kandian.qq.com");
// map.put("Referer", "https://kandian.qq.com/mqq/vue/main?_wv=10145&_bid=2378&adfrom=search&x5PreFetch=1&accountId=MjY2MTY0MjM4Ng%3D%3D");
// map.put("Cookie", "skey=MQmBo5A1N7; uin=o0497332654; pgv_pvid=8990378504");
// try {
// okhttp = okhttp.newBuilder().proxy(proxy).build();
// while(true) {
// try {
// Request request = HttpRequestBuilder.newGetRequest(url, Headers.of(map));
// String result = okhttp.newCall(request).execute().body().string();
// JSONObject json = JSONObject.parseObject(result).getJSONObject("result");
// JSONArray jsonArray = json.getJSONArray("articleinfos");
// for(int i = 0;i < jsonArray.size();i++) {
// JSONObject data = jsonArray.getJSONObject(i);
// String ur = data.getString("articleurl");
// HistortyBean history = getOnhistoryData(ur);
// if(history != null) {
// dataList.add(history);
// }
// ZhiWeiTools.sleep(1500);
// }
// String pageCookies = json.getString("pageCookies");
// String pacs = request.url().queryParameter("pageCookies");
// int pageno = Integer.valueOf(request.url().queryParameter("pageNo"));
// url = request.url().toString().replace("pageNo="+pageno, "pageNo="+(pageno+1)).replace("&pageCookies="+pacs, "&pageCookies="+pageCookies);
// ZhiWeiTools.sleep(5000);
// } catch (Exception e) {
// break;
// }
// }
// return dataList;
// } catch (Exception e) {
// e.printStackTrace();
// }
//
// return null;
// }
//
//
// private static HistortyBean getOnhistoryData(String url) {
// OkHttpClient okhttp = HttpClientBuilder.newInstance();
// Request request = HttpRequestBuilder.newGetRequest(url, Headers.of("Host","post.mp.qq.com"));
// try {
// HistortyBean histortyBean = new HistortyBean();
// String result = okhttp.newCall(request).execute().body().string();
// Date date = getTime(result);
// String source = getSource(result);
// if(date != null && source != null) {
//
// Document doc = Jsoup.parse(result);
// String content = doc.select("div#main-content").select("section").text();
// String title = doc.select("meta[itemprop=name]").attr("content");
//
// histortyBean.setSource(source);
// histortyBean.setTime(date);
// histortyBean.setTitle(title);
// histortyBean.setContent(content);
// histortyBean.setUrl(url);
// return histortyBean;
// }
// } catch (IOException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
// return null;
// }
//
// public List<HistortyBean> getDataByword(String word,Proxy proxy) {
// List<HistortyBean> dataList = new ArrayList<HistortyBean>();
// OkHttpClient okhttp = HttpClientBuilder.newInstance();
// Map<String,String> map = new HashMap<String,String>();
// map.put("Host", "sou.qq.com");
// map.put("Referer", "https://sou.qq.com/kandian/kd.html?_bid=3216&_wv=3&_wwv=1293&_wvSb=0&hotword=%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4%E9%82%93%E4%BC%A6%E7%94%B5%E6%A2%AF%E5%90%BB");
// map.put("Cookie", "skey=MU7zbaRXu8; uin=o0497332654; RK=rNiJH0RBav; pgv_pvid=8990378504; pt2gguin=o0497332654; ptcz=062d936df33011f468637ee72be262a020a8df79977df7e7bde9c105b2b2ddf6");
// try {
// String url = "https://sou.qq.com/cgi-bin/kandian/unite_search?key_word="+URLEncoder.encode(word, "utf-8")+"&business=64&page_size=20&cookie=&bkn=2000031506";
// //https://sou.qq.com/cgi-bin/kandian/unite_search?key_word=%E5%94%90%E5%AB%A3&business=64&page_size=20&cookie=&bkn=2000031506
// List<String> urlList = new ArrayList<String>();
// int count = 0;
// while(true) {
// try {
// okhttp = okhttp.newBuilder().proxy(proxy).build();
// Request request = HttpRequestBuilder.newGetRequest(url, Headers.of(map));
// String result = okhttp.newCall(request).execute().body().string();
// JSONObject json = JSONObject.parseObject(result).getJSONObject("result");
// JSONArray jsonArray = json.getJSONArray("item_groups");
// count = urlList.size();
// for(int i = 0;i < jsonArray.size();i++) {
// JSONObject data = jsonArray.getJSONObject(i);
// String type = data.getString("group_name");
// if("视频".equals(type) || "小视频".equals(type) || "相关搜索".equals(type) || "话题".equals(type)) {
//
// }else {
// JSONObject da = data.getJSONArray("result_items").getJSONObject(0);
// String title = da.getString("name");
// String ur = da.getString("jmp_url");
// if(!urlList.contains(ur.split("\\?")[0])) {
// urlList.add(ur.split("\\?")[0]);
// String extension = da.getString("extension");
// JSONObject obj = JSONObject.parseObject(extension);
// String time = obj.getString("create_time");
// String content = obj.getString("content");
// if(content == null) {
// content = obj.getString("brief");
// }
// String source = obj.getString("from");
// HistortyBean histortyBean = new HistortyBean();
// System.out.println(title + " -- " + ur.split("\\?")[0]);
// histortyBean.setTime(TimeParse.stringFormartDate(time+"000"));
// histortyBean.setContent(content);
// histortyBean.setTitle(title);
// histortyBean.setSource(source);
// histortyBean.setUrl(ur);
// dataList.add(histortyBean);
// }
// }
// }
// if(count == urlList.size()) {
// break;
// }
// String pageCookies = json.getString("cookie");
// String pacs = request.url().queryParameter("cookie");
// url = request.url().toString().replace("&cookie="+pacs, "&cookie="+pageCookies);
// ZhiWeiTools.sleep(3000);
// } catch (Exception e) {
// break;
// }
// }
// return dataList;
// } catch (Exception e) {
// e.printStackTrace();
// }
//
// return null;
// }
//
// /**
// *
// * @Description 获取时间
// * @param result
// * @return
// */
// private static Date getTime(String result) {
// Pattern pa = Pattern.compile("data-timestamp=\"(.*?)\"");
// Matcher ma = pa.matcher(result);
// while(ma.find()) {
// String t = ma.group(0);
// t = t.split("ata-timestamp=\"")[1].split("\"")[0];
// return TimeParse.stringFormartDate(t+"000");
// }
// return null;
// }
//
// /**
// *
// * @Description 获取来源
// * @param result
// * @return
// */
// private static String getSource(String result) {
// Pattern pa = Pattern.compile("ata-author=\"(.*?)\"");
// Matcher ma = pa.matcher(result);
// while(ma.find()) {
// String t = ma.group(0);
// t = t.split("ata-author=\"")[1].split("\"")[0];
// return t;
// }
// return null;
// }
//
//}
......@@ -34,15 +34,15 @@ public class SinaKeji {
* @return
*/
public static List<Map<String, Object>> getSinaKejiComment(String url,ProxyHolder proxy) {
String commentId = getCommentId(url, proxy);
if(nonNull(commentId)) {
String comUrl = getCommentId(url, proxy);
if(nonNull(comUrl)) {
List<Map<String,Object>> dataList = new ArrayList<>();
int page = 1;
int count = 1;
while(true) {
try {
ZhiWeiTools.sleep(3000);
String newUrl = "http://comment.sina.com.cn/page/info?version=1&format=json&channel=kj&newsid="+commentId+"&compress=0&ie=utf-8&oe=utf-8&page_size=20&t_size=3&h_size=3&thread=1&page="+page;
ZhiWeiTools.sleep(1000);
String newUrl = comUrl+page;
String result = httpBoot.syncCall(RequestUtils.wrapGet(newUrl), proxy).body().string();
List<Map<String,Object>> list = sinaKejiCommentAnalysis.getSinaCommet(result);
dataList.addAll(list);
......@@ -63,16 +63,27 @@ public class SinaKeji {
return Collections.emptyList();
}
/**
** 获取新浪评论链接
* @param url
* @param proxy
* @return
* @return String
*/
private static String getCommentId(String url,ProxyHolder proxy) {
String commentId = null;
for(int i = 0;i < 3;i++) {
try {
String result = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy).body().string();
//list?channel=
if(result.contains("newsid:")) {
commentId = result.split("newsid: '")[1].split("'")[0];
if(nonNull(commentId)) {
return commentId;
String commentId = result.split("newsid: '")[1].split("'")[0];
String channel = result.split("channel: '")[1].split("'")[0];
if(nonNull(commentId) && nonNull(channel)) {
return "http://comment.sina.com.cn/page/info?version=1&format=json&channel="+channel+"&newsid="+commentId+"&compress=0&ie=utf-8&oe=utf-8&page_size=20&t_size=3&h_size=3&thread=1&page=";
}
}else if(result.contains("__cmntId")){
String key = result.split("__cmntId\":\"")[1].split("\",")[0];
return "http://comment.sina.com.cn/page/info?version=1&format=json&channel="+key.split(":")[0]+"&newsid="+key.split(":")[1]+"&compress=0&ie=utf-8&oe=utf-8&page_size=20&t_size=3&h_size=3&thread=1&page=";
}
} catch (IOException e) {
logger.error("获取 文章评论 id 失败");
......
......@@ -29,7 +29,7 @@ public class SinaTousu {
int count = 1;
while(true) {
try {
if(count > 3) {
if(count > 3 || page > 200) {
break;
}
String url = "https://tousu.sina.com.cn/api/index/s?keywords="+URLEncoder.encode(word, "utf-8")+"&page_size=100&page=";
......
package com.zhiwei.parse;
import static java.util.Objects.nonNull;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
......@@ -12,7 +10,6 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
......@@ -35,10 +32,7 @@ public class Souhu {
*/
public static int getSouhuCommentCount(String url,ProxyHolder proxy) {
try {
String newurl = souhuCommentAnalysis.getSouhuURL(url,proxy);
if(nonNull(newurl)) {
return souhuCommentAnalysis.getSouhuCommentCount(newurl,proxy);
}
return souhuCommentAnalysis.getSouhuCommentCount(url,proxy);
} catch (Exception e) {
logger.error("搜狐获取评论数出错了 {}",e);
}
......@@ -83,7 +77,7 @@ public class Souhu {
if(isCulling) {
url = url + "&columnId=-1";
}
String result = HttpClient.executeHttpRequestGet(url,proxy,null);
String result = HttpClient.executeHttpRequestGet(url,proxy);
JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArray = json.getJSONObject("data").getJSONArray("pcArticleVOS");
List<Map<String,Object>> dataList1 = souhuAccountAnalysis.analysisData(jsonArray,name);
......
......@@ -21,6 +21,8 @@ import com.zhiwei.parse.analysis.TXNewsByWordAnalysis;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.MediaType;
import okhttp3.RequestBody;
import okhttp3.Response;
public class TXNews {
......@@ -71,7 +73,7 @@ public class TXNews {
String content = StringUtils.join("coral_uin=", coralUin, "&coral_uid=", coralUid,"&reply_id=",replayId);
//eca55388bbbb596e632bca03a2378efe94b83142fd046f1f70 876579532
System.out.println(content);
try (Response response = httpBoot.syncCall(RequestUtils.wrapPost("https://r.inews.qq.com/getMyComments", "application/json", content), proxy)){
try (Response response = httpBoot.syncCall(RequestUtils.wrapPost("https://r.inews.qq.com/getMyComments", RequestBody.create(MediaType.get("application/json"), content)), proxy)){
JSONObject json = JSONObject.parseObject(response.body().string());
JSONArray jsonArray = json.getJSONObject("comments").getJSONArray("new");
for(int i = 0;i < jsonArray.size();i++) {
......
......@@ -2,6 +2,7 @@ package com.zhiwei.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
......@@ -27,7 +28,7 @@ public class Wangyi {
private static Logger logger = LoggerFactory.getLogger(Wangyi.class);
private static WangyiCommentAnalysis wangyiCommentAnalysis = new WangyiCommentAnalysis();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(false).build();
private static WangyiHistoryAnalysis wangyiHistoryAnalysis = new WangyiHistoryAnalysis();
/**
......@@ -81,6 +82,21 @@ public class Wangyi {
}
}
/**
 * Downloads the page at {@code url} and scrapes two counters out of its raw HTML.
 *
 * <p>The text between {@code data-count="} and {@code " data-hidead} is stored under the key
 * {@code "commentCount"}; the text between {@code data-like="} and {@code "><} is stored under
 * {@code "likes"}. Both values are kept as the raw strings found in the markup.
 *
 * @param url   address of the page to fetch
 * @param proxy proxy handed to the HTTP client for this request
 * @return a map holding the two keys above, or an empty immutable map when the page could not
 *         be fetched or the expected markers are missing
 */
public static Map<String, Object> getReadAndLikeCount(String url, ProxyHolder proxy) {
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
        // NOTE(review): httpBoot is built with throwException(false), so a failed call
        // presumably comes back as null instead of throwing — confirm against HttpBoot.
        if (response == null || response.body() == null) {
            logger.error("No response while fetching counters from {}", url);
            return Collections.emptyMap();
        }
        String result = response.body().string();
        // A missing marker makes the [1] lookup throw; the catch below turns that into
        // the empty-map result.
        String cCount = result.split("data-count=\"")[1].split("\" data-hidead")[0];
        String lCount = result.split("data-like=\"")[1].split("\"><")[0];
        Map<String, Object> rMap = new HashMap<>();
        rMap.put("commentCount", cCount);
        rMap.put("likes", lCount);
        return rMap;
    } catch (Exception e) {
        // Route the failure through the class logger together with the URL that caused it.
        logger.error("Failed to read counters from {}", url, e);
    }
    return Collections.emptyMap();
}
/**
*
* @Description 网易网页版数据
......
......@@ -22,15 +22,13 @@ public class AiqiyiByWordAnalysis {
List<Map<String,Object>> dataMap = new ArrayList<>();
try {
Document doc = Jsoup.parse(result);
Elements elements = doc.select("li.list_item");
Elements elements = doc.select("div.layout-main > div");
for (Element element : elements) {
Map<String, Object> map = new HashMap<>();
String title = element.select("li").attr("data-widget-searchlist-tvname");
String time = element.select("em.result_info_desc").text().split(" ")[0];
if(element.select("label.result_info_lbl").text().contains("上传者")) {
map.put("source", element.select("a.result_info_link").text());
}
String uurl = element.select("h3.result_title > a").attr("href");
String title = element.select("a.main-tit").attr("title");
String time = element.select("span.info-des").text().split(" ")[0];
String uurl = element.select("a.main-tit").attr("href");
map.put("source", element.select("a.uploader-name").text());
map.put("time", TimeParse.stringFormartDate(time));
map.put("url", uurl);
map.put("title", title);
......
......@@ -58,24 +58,22 @@ public class BaijiaAccountAnalysis {
try {
JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONObject("data").getJSONArray("list");
if(json.getJSONObject("data").getBoolean("has_more") != null &&
json.getJSONObject("data").getBoolean("has_more") ) {
if(json.getJSONObject("data").getBoolean("hasMore") != null &&
json.getJSONObject("data").getBoolean("hasMore") ) {
more = true;
rmap.put("ctime", json.getJSONObject("data").getString("ctime"));
rmap.put("ctime", json.getJSONObject("data").getJSONObject("query").getString("ctime"));
}
// String name = json.getJSONObject("data").getJSONObject("author").getString("display_name");
for(int i = 0;i < jsonArry.size();i++) {
Map<String,Object> map = new HashMap<>();
JSONObject data = jsonArry.getJSONObject(i);
JSONObject data = jsonArry.getJSONObject(i).getJSONObject("itemData");
String id = data.getString("article_id");
int t = data.getInteger("updated_at");
String time = TimeParse.dateFormartString(new Date(t*1000L), "yyyy-MM-dd HH:mm:ss");
System.out.println(time);
if(startTime != null && startTime.length() > 1) {
if(time.compareTo(startTime) < 1) {
more = false;
continue;
}
if(startTime != null && startTime.length() > 1 && time.compareTo(startTime) < 1) {
more = false;
continue;
}
map.put("title", data.getString("title"));
String url = "http://baijiahao.baidu.com/s?id=" + id;
......
......@@ -18,7 +18,7 @@ public class BilibilikeyWordAnalysis {
try {
Document doc = Jsoup.parse(result);
boolean more = false;
if(doc.select("#server-search-app > div.contain > div.body-contain > div > div.page-wrap > div > ul > li.page-item.next > button").text().contains("下一页")) {
if(doc.select("#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.next > button").text().contains("下一页")) {
more = true;
}
......@@ -28,7 +28,7 @@ public class BilibilikeyWordAnalysis {
String playcount = null;
String source = null;
String submitcount = null;
Elements elements = doc.select("ul.video-contain.clearfix").select("li");
Elements elements = doc.select("ul.video-list.clearfix").select("li");
List<Map<String,Object>> dataList = new ArrayList<>();
for(Element element : elements) {
Map<String,Object> map = new HashMap<>();
......
......@@ -77,9 +77,10 @@ public class SouhuCommentAnalysis {
public int getSouhuCommentCount(String url,ProxyHolder proxy) {
Map<String,String> headerMap = HeadGet.getSouhuCommentHeaderMap(null);
try {
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
String id = getUrlId(url);
String result = HttpClient.executeHttpRequestGet("https://apiv2.sohu.com/api/comment/list?callback=&page_size=10&topic_id=&page_no=1&source_id=mp_"+id,proxy, headerMap);
JSONObject json = JSONObject.parseObject(result);
if(json.getInteger("code") == 500) {
if(Objects.nonNull(json.get("code")) && json.getInteger("code") == 500) {
return 0;
}
return json.getJSONObject("jsonObject").getInteger("cmt_sum");
......@@ -116,21 +117,28 @@ public class SouhuCommentAnalysis {
}
public int getReadNum(String url, ProxyHolder proxy) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
String result = response.body().string();
String sourceId = getNewsId(result);
url = "http://v2.sohu.com/public-api/articles/pv?articleIds=" + sourceId;
result = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy).body().string();
return JSONObject.parseObject(result).getInteger(sourceId);
String id = getUrlId(url);
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet("http://v2.sohu.com/public-api/articles/"+id+"/pv"), proxy)){
return Integer.parseInt(response.body().string());
} catch (Exception e) {
logger.error("Exception {} ",e);
}
return -1;
}
/**
 * Extracts the article id from a Sohu article link.
 *
 * <p>The id is the text that follows the first {@code /a/} segment, cut off at the first
 * underscore.
 *
 * @param url article link to parse; may be {@code null}
 * @return the extracted id, or {@code null} when the link does not contain the expected parts
 */
private String getUrlId(String url) {
    if (url != null) {
        String[] pathParts = url.split("/a/");
        // split() drops trailing empty strings, so a link that ends with "/a/" has no second part
        if (pathParts.length > 1) {
            String[] idParts = pathParts[1].split("_");
            if (idParts.length > 0) {
                return idParts[0];
            }
        }
    }
    // Include the offending link so the failure can be traced from the log.
    logger.error("搜狐链接解析失败 {}", url);
    return null;
}
}
......@@ -27,9 +27,9 @@
//// List<String> headList = (List<String>) map.get("head");
//// for (Map<String, Object> map1 : list) {
//// String url = map1.get("地址") + "";
// String cookie = "_buuid=0668b664-13b3-4bd0-aa37-99d747432e85; guid=HBoEGxgEGBscBBsZGlYHGBsZHRsYExwZHFYcGQQdGR8FQ1hLTEt5ChITBBIdHxkEGgQbHQVPR0VYQmkKA0VBSU9tCk9BQ0YKBmZnfmJhAgocGQQdGR8FXkNhSE99T0ZaWmsKAx4cfWV9ChEZBBwKfmQKWV1FTkRDfQIKGgQfBUtGRkNQRWc=; token=\"ou+mv1hjxjm0uOOTss1vgck9+h6OCS/lYQUeFnJgSK70FHprmw6GmjBGwk2qPQH88CKuzcDfAvoCmBm7+jVysA==\"; uid=\"A8ELjewCDRgHnZ5bX0Vy0/Airs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMjI3NjU0NTI0Iiwic2VjcmV0IjoiV0wyZmEtZDZxbkx2TEkzZHF2dTN4UG5SIiwiX2V4cGlyZSI6MTU2MDU5Mzg4Mjc5NCwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=ujhqvC3wPAn-WlCPXfB6C5ZJIgY";
// String cookie = "";
// String url = "https://maimai.cn/web/gossip_detail?src=app&webid=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlZ2lkIjoiMjU5OTg4YmE4YzI3MTFlOTllMjEyNDZlOTZiNDgwODgiLCJ1IjoxNzY2NDQ3NjUsImlkIjoyMjIwMDM4NH0.gBdp3i99L0xUKLglaVIJf07OrrPk6yoG9HAo-3Yftqk";
// Map<String,Object> map3 = Maimai.getMaiaiCount(url,cookie, ProxyHolder.NAT_HEAVY_PROXY);
// List<Map<String, Object>> map3 = Maimai.getMaimaiCommentList(url, cookie, ProxyHolder.NAT_HEAVY_PROXY);
// System.out.println(map3.toString());
// System.out.println(url);
//// map1.putAll(map3);
......
//package com.zhiwei.Comment;
//
//import org.testng.annotations.Test;
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.parse.SinaKeji;
//
//public class SinaKejiComment {
//
// @Test
// public void f() {
// String url = "https://tech.sina.com.cn/it/2018-07-17/doc-ihfkffam0632649.shtml";
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// String url = "https://tech.sina.com.cn/d/v/2019-08-19/doc-ihytcitn0207512.shtml";
//
// SinaKeji.getSinaKejiComment(url, null);
// SinaKeji.getSinaKejiComment(url, ProxyHolder.NAT_HEAVY_PROXY);
//
// }
//
//}
......@@ -25,7 +25,7 @@
//// System.out.println(child.split("chlid=")[1]);
// System.out.println(child.split("=")[1]);
//
// List<Map<String,Object>> lists = QQKB.getQQAccountData("5456950", cookie,null);
// List<Map<String,Object>> lists = QQKB.getQQAccountData("5060059", cookie,null);
// if(lists != null) {
// for(Map<String,Object> map1 : lists) {
// map1.put("name", map.get("呢称"));
......
......@@ -6,14 +6,17 @@
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Baijia;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class BaijiaAccountExample {
//
// @Test
// public void test3() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// String path = "D://crawlerdata//自媒体/百家号采集.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String startTime = "2018-05-01 00:00:00";
......@@ -23,9 +26,10 @@
// for(Map<String,Object> m : list) {
// try {
// String app_id = m.get("id").toString();
// app_id = "1594158489045754";
// String cookie = "__cfduid=d847baca85b97d1967b3da02ebb345b831535524251; BAIDUID=C0F0F81EF770C5219AB9C178654135EC:FG=1; PSTM=1536376257; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; locale=zh; delPer=0; H_PS_PSSID=1447_21117_20930; PSINO=5";
// List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id, startTime,cookie, null);
// app_id = "1602674438508810";
// String cookie = "BAIDUID=7D453C932433A93F7AD1F3B8ABC8B0E1:FG=1; BIDUPSID=7D453C932433A93F7AD1F3B8ABC8B0E1; PSTM=1570766401; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDSFRCVID=eH-OJeCmH6VwoRJwCdmehrB7leKK0gOTHllvCh8hmwLadLIVJeC6EG0Ptf8g0KubFTPRogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tJkD_I_hJKt3qn7I5KToh4Athxob2bbXHDo-LIvHWT6cOR5JhfA-3R-e046f3-3L5CbH5D3s5lvvhb3O3M7ShbKdMa732RbrKCnraxQF5l8-sq0x0bOte-bQypoa0q3TLDOMahkM5h7xOKQoQlPK5JkgMx6MqpQJQeQ-5KQN3KJmfbL9bT3YjjISKx-_J5LJJxK; H_PS_PSSID=1442_21103_29567_29699_29220_22158; delPer=0; PSINO=5; ZD_ENTRY=baidu; yjs_js_security_passport=9687699d4b0965c0be1e6e312fc59ff5cf3d03a2_1571106914_js; Hmery-Time=1215393878";
// System.out.println(app_id);
// List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id,"聚富财经", startTime,cookie, ProxyHolder.NAT_HEAVY_PROXY);
// if(lists != null) {
// bodyList.addAll(lists);
// }
......
......@@ -17,7 +17,7 @@
// @Test
// public void aiqiyiTest() {
//
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER,10000002);
// List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(String w : wordList) {
......@@ -34,7 +34,7 @@
// headList.add("title");
// headList.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频/爱奇艺关键词采集-毓婷-0716.xlsx", "数据", headList, bodyList);
// poi.exportExcel("D://crawlerdata//视频/爱奇艺关键词采集-精装修.xlsx", "数据", headList, bodyList);
//
//
//
......
//package com.zhiwei.shipin;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.BiliBili;
//import com.zhiwei.util.WordReadFile;
//
//public class BilibiliTest {
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词-1.txt");
// List<Map<String, Object>> bodyList = new ArrayList<>();
// String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274";
// for (String word : wordList) {
// List<Map<String, Object>> dataList = BiliBili.getData(word, null, "2019-07-18 00:00:00",
// cookie);
// if (dataList != null) {
// System.out.println(word + " ----- " + dataList.size());
// bodyList.addAll(dataList);
// }
// }
// List<String> headlist = new ArrayList<>();
// headlist.add("submitcount");
// headlist.add("playcount");
// headlist.add("time");
// headlist.add("source");
// headlist.add("title");
// headlist.add("url");
// headlist.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频//bilibili关键词采集数据-吃鸡否-0722.xlsx", "B站数据", headlist, bodyList);
//
// }
//}
package com.zhiwei.shipin;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.BiliBili;
import com.zhiwei.util.WordReadFile;
/**
 * Manual crawl run: reads keywords from a local file, queries {@code BiliBili.getData} for each
 * of them and writes all returned rows into one Excel workbook.
 */
public class BilibiliTest {

    /** Registry address handed to {@code ProxyFactory.init}. */
    private static final String REGISTRY_ADDRESS =
            "zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181";

    /** File the keywords are read from. */
    private static final String KEYWORD_FILE = "D://crawlerdata//关键词.txt";

    /** Workbook the collected rows are exported to. */
    private static final String OUTPUT_FILE = "D://crawlerdata//视频//bilibili关键词采集数据-封神神话-收藏.xlsx";

    // NOTE(review): a browser cookie is hard-coded here — consider loading it from local config.
    private static final String COOKIE = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274";

    /** Header names passed to the Excel export, in order. */
    private static final String[] HEADERS = {
        "submitcount", "playcount", "time", "source", "title", "url", "word"
    };

    @Test
    public void f() {
        ProxyFactory.init(REGISTRY_ADDRESS, "local", GroupType.PROVIDER, 10000002L);

        List<Map<String, Object>> rows = new ArrayList<>();
        for (String keyword : WordReadFile.getWords(KEYWORD_FILE)) {
            List<Map<String, Object>> hits = BiliBili.getData(keyword, null, "2001-01-14 00:00:00",
                    COOKIE);
            // A null result is skipped; anything else is reported and collected.
            if (hits == null) {
                continue;
            }
            System.out.println(keyword + " ----- " + hits.size());
            rows.addAll(hits);
        }

        List<String> headers = new ArrayList<>();
        for (String header : HEADERS) {
            headers.add(header);
        }

        PoiExcelUtil.getInstance().exportExcel(OUTPUT_FILE, "B站数据", headers, rows);
    }
}
......@@ -17,8 +17,8 @@
//public class QQTVTest {
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
// String time = "2019-04-11 00:00:00";
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER, 10000002);
// String time = "2019-01-11 00:00:00";
// List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
// List<Map<String, Object>> bodyList = new ArrayList<>();
// for (String word : wordList) {
......@@ -37,7 +37,7 @@
// headlist.add("url");
// headlist.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频//腾讯视频关键词采集数据-毓婷-0716.xlsx", "腾讯视频数据", headlist, bodyList);
// poi.exportExcel("D://crawlerdata//视频//腾讯视频关键词采集数据-精装修.xlsx", "腾讯视频数据", headlist, bodyList);
//
//
//
......
......@@ -33,7 +33,7 @@
// headlist.add("url");
// headlist.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频//搜狐视频关键词采集数据-毓婷-0716.xlsx", "搜狐数据", headlist, bodyList);
// poi.exportExcel("D://crawlerdata//视频//搜狐视频关键词采集数据-华为-0812.xlsx", "搜狐数据", headlist, bodyList);
//
// }
//}
......@@ -30,7 +30,7 @@
// headList.add("uper");
// headList.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频//优酷数据-毓婷-0716.xlsx", "数据", headList, bodyList);
// poi.exportExcel("D://crawlerdata//视频//优酷数据-华为-0812.xlsx", "数据", headList, bodyList);
//
// }
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment