Commit d979d793 by yangchen

脉脉 评论采集 和部分视频采集

parent 1116d3c5
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>articlenewscrawler</artifactId>
<version>0.0.4-SNAPSHOT</version>
<version>0.0.8-SNAPSHOT</version>
<name>articlenewscrawler</name>
<description>采集凤凰,一点资讯,搜狐历时文章和文章评论</description>
......
package com.zhiwei.httpclient;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.SocketAddress;
import java.net.URLEncoder;
import java.net.Proxy.Type;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.nodes.Document;
import com.zhiwei.tools.httpclient.HeaderTool;
public class HeadGet {
/**
......@@ -409,12 +401,10 @@ public class HeadGet {
* @return
*/
public static Map<String,String> getPearVideoByWordHeaderMap(String cookie) {
Map<String,String> headerMap = new HashMap<String, String>();
headerMap.put("Host", "www.pearvideo.com");
headerMap.put("Connection", "keep-alive");
headerMap.put("Accept-Language", "zh-CN,zh;q=0.9");
headerMap.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
Map<String,String> headerMap = new HashMap<>();
headerMap.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
headerMap.put("Accept", "text/html, */*; q=0.01");
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");
if(cookie != null) {
headerMap.put("Cookie", cookie);
}
......@@ -492,8 +482,8 @@ public class HeadGet {
*/
public static Map<String,String> getQQKBCommentHeaderMap(String cookie) {
Map<String,String> headerMap = new HashMap<String, String>();
headerMap.put("User-Agent",
"天天快报 4.6.2 qnreading (iPhone8,1; iOS 11.2.1; zh_CN; 4.6.2.89)");
// headerMap.put("User-Agent",
// "天天快报 4.6.2 qnreading (iPhone8,1; iOS 11.2.1; zh_CN; 4.6.2.89)");
headerMap.put("Accept",
"*/*");
headerMap.put("Accept-Language", "zh-Hans-CN;q=1");
......@@ -514,7 +504,7 @@ public class HeadGet {
* @return
*/
public static Map<String,Object> getQQKBCommentParamMap(String comment_id,String article_id){
Map<String,Object> param = new HashMap<String,Object>();
Map<String,Object> param = new HashMap<>();
param.put("chlid", "daily_timeline");
param.put("comment_id", comment_id);
param.put("page", 1);
......@@ -944,15 +934,5 @@ public class HeadGet {
return paramMap;
}
public static void main(String[] args) throws UnsupportedEncodingException {
String url = "http://180.186.38.200/rest/n/feed/profile2";
System.out.println(url);
String cookie = "";
Map<String,String> headerMap = HeaderTool.getCommonHead();
Map<String,Object> paramMap = HeadGet.getKuaishouParamMap();
String result = HttpClient.executeHttpRequestPost(url, null, headerMap, paramMap);
System.out.println(result);
System.out.println(result.length());
}
}
......@@ -7,11 +7,16 @@ import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import okhttp3.Response;
public class HttpClient {
private static Logger logger = LoggerFactory.getLogger(HttpClient.class);
private static HttpBoot httpBoot = new HttpBoot();
/**
*
* @Description (TODO这里用一句话描述这个方法的作用)
......@@ -21,22 +26,20 @@ public class HttpClient {
* @throws IOException
*/
public static String executeHttpRequestGet(String url,Proxy proxy,Map<String, String> headerMap) {
try {
String result = HttpClientTemplateOK.get(url, proxy, headerMap);
return result;
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){
return response.body().string();
} catch (Exception e) {
logger.error("httpClient 获取数据出现问题:{}", e.getMessage());
logger.error("httpClient 获取数据出现问题:{}", e);
return null;
}
}
public static String executeHttpRequestPost(String url,Proxy proxy,Map<String, String> headerMap,Map<String, Object> paramMap) {
try {
String result = HttpClientTemplateOK.post(url, proxy, headerMap, paramMap);
return result;
try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(url, headerMap, paramMap), proxy)){
return response.body().string();
} catch (Exception e) {
logger.error("httpClient 获取数据出现问题:{}", e.getMessage());
logger.error("httpClient 获取数据出现问题:{}", e);
return null;
}
......
......@@ -28,7 +28,7 @@ public class Aiqiyi {
public static List<Map<String,Object>> getAiqiyiByWordData(String word,Proxy proxy) {
Map<String,String> headerMap = HeadGet.getAiqiyiBywordHeaderMap(null);
Map<String,String> headerMap1 = HeadGet.getAiqiyiHeaderMap(null);
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
List<Map<String,Object>> dataList = new ArrayList<>();
try {
for(int i = 1;i <= 20;i++) {
String url = "http://so.iqiyi.com/so/q_"+URLEncoder.encode(word, "UTF-8")+"_ctg_%E7%94%9F%E6%B4%BB_t_0_page_"+i+"_p_1_qc_0_rd__site__m_11_bitrate_?af=true";
......
package com.zhiwei.parse;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
......@@ -23,7 +23,7 @@ import okhttp3.Request;
public class BiliBili {
private static Logger logger = LoggerFactory.getLogger(BiliBili.class);
private static final Logger logger = LoggerFactory.getLogger(BiliBili.class);
private static HttpBoot httpBoot = new HttpBoot();
@SuppressWarnings("unchecked")
......@@ -46,6 +46,7 @@ public class BiliBili {
while(more) {
map.clear();
String ur = url + "&page=" + n;
System.out.println(ur);
request = HttpRequestBuilder.newGetRequest(ur, header);
String result2 = httpBoot.syncCall(request, proxy).body().string();
map = BilibilikeyWordAnalysis.getData(result2);
......@@ -60,13 +61,13 @@ public class BiliBili {
}
return bodyList;
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
logger.error("e ",e);
} catch (Exception e) {
logger.error("e ",e);
}
return null;
return Collections.emptyList();
}
public static void main(String[] args) {
......@@ -88,7 +89,7 @@ public class BiliBili {
headlist.add("title");
headlist.add("url");
PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D://crawlerdata//bilibili关键词采集数据-竹鼠.xlsx", "B站数据", headlist, bodyList);
poi.exportExcel("D://crawlerdata//bilibili关键词采集数据-txh.xlsx", "B站数据", headlist, bodyList);
}
......
package com.zhiwei.parse;
import static java.util.Objects.nonNull;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
/**
 * Crawler for Autohome "Chejia" article comments.
 *
 * <p>Every public method first loads the article page to discover the comment
 * object id, then queries the reply API at {@code reply.autohome.com.cn}.
 */
public class Chejia {

    private static final Logger logger = LoggerFactory.getLogger(Chejia.class);

    private static final HttpBoot httpBoot = new HttpBoot();

    /** Page size requested from the reply API; also drives the paging stop condition. */
    private static final int PAGE_SIZE = 50;

    /** Pause between two consecutive page requests, in milliseconds. */
    private static final int PAGE_INTERVAL_MILLIS = 2000;

    /**
     * Returns the number of comments the reply API reports for an article.
     *
     * @param url   article page URL
     * @param proxy proxy handed to the HTTP client
     * @return the {@code commentcount} value of the API response, or {@code -1} when the
     *         comment URL cannot be resolved, the request fails, or the field is absent
     */
    public static int getChejiaCommentCount(String url, Proxy proxy) {
        String commentUrl = getCommentUrl(url, proxy);
        if (!nonNull(commentUrl)) {
            return -1;
        }
        logger.debug("chejia comment url {}", commentUrl);
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(commentUrl), proxy)) {
            JSONObject json = JSONObject.parseObject(response.body().string());
            Integer count = json.getInteger("commentcount");
            // A missing field is reported as "unknown" instead of failing on auto-unboxing.
            return nonNull(count) ? count : -1;
        } catch (Exception e) {
            logger.error("error fetching chejia comment count, url = {}", commentUrl, e);
            return -1;
        }
    }

    /**
     * Collects the comments of an article, page by page.
     *
     * <p>Paging stops when {@code page * PAGE_SIZE} exceeds the reported total, when the
     * response lacks {@code commentlist} or {@code commentcount}, or on the first failed
     * request. Comments gathered before the stop are still returned.
     *
     * @param url   article page URL
     * @param proxy proxy handed to the HTTP client
     * @return one map per comment with the keys {@code source}, {@code time},
     *         {@code content}, {@code like} and {@code id}; an empty list when the
     *         comment URL cannot be resolved
     */
    public static List<Map<String, Object>> getChejiaComment(String url, Proxy proxy) {
        String commentUrl = getCommentUrl(url, proxy);
        if (!nonNull(commentUrl)) {
            return Collections.emptyList();
        }
        List<Map<String, Object>> bodyList = new ArrayList<>();
        for (int page = 1; ; page++) {
            String pageUrl = commentUrl + "&page=" + page;
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl), proxy)) {
                JSONObject json = JSONObject.parseObject(response.body().string());
                JSONArray comments = json.getJSONArray("commentlist");
                if (comments == null) {
                    logger.warn("chejia response has no commentlist, url = {}", pageUrl);
                    break;
                }
                for (int i = 0; i < comments.size(); i++) {
                    bodyList.add(toCommentMap(comments.getJSONObject(i)));
                }
                Integer total = json.getInteger("commentcount");
                if (total == null) {
                    logger.warn("chejia response has no commentcount, url = {}", pageUrl);
                    break;
                }
                logger.info(" 一共采集 了 {} 条 采集到 {} 页 一共有 {} 条", bodyList.size(), page, total);
                if (page * PAGE_SIZE > total) {
                    break;
                }
            } catch (Exception e) {
                logger.error("error fetching chejia comments, url = {}", pageUrl, e);
                break;
            }
            // Only throttle when another page is going to be requested.
            ZhiWeiTools.sleep(PAGE_INTERVAL_MILLIS);
        }
        return bodyList;
    }

    /** Converts one entry of {@code commentlist} into the flat map handed to callers. */
    private static Map<String, Object> toCommentMap(JSONObject data) {
        Map<String, Object> map = new HashMap<>();
        map.put("source", data.getString("RMemberName"));
        // The date is taken from between "/Date(" and the first '+', and read as epoch milliseconds.
        String time = data.getString("RReplyDate");
        time = time.split("/Date\\(")[1].split("\\+")[0];
        map.put("time", TimeParse.dateFormartString(new Date(Long.parseLong(time)), "yyyy-MM-dd HH:mm:ss"));
        map.put("content", data.getString("RContent"));
        map.put("like", data.get("RUp"));
        map.put("id", data.getString("ReplyId"));
        return map;
    }

    /**
     * Loads the article page and builds the reply API URL from the value that follows
     * {@code pvTrack.object = } in the page source.
     *
     * @return the reply API URL without a page parameter, or {@code null} when the page
     *         cannot be fetched or the marker is missing
     */
    private static String getCommentUrl(String url, Proxy proxy) {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            String objectID = response.body().string().split("pvTrack.object = ")[1].split(";")[0].replace("\"", "");
            return "https://reply.autohome.com.cn/api/comments/show.json?appid=21&count=" + PAGE_SIZE + "&id=" + objectID;
        } catch (Exception e) {
            logger.error("error resolving chejia comment url, url = {}", url, e);
        }
        return null;
    }
}
......@@ -25,6 +25,7 @@ public class Douyin {
* @param url
* @return
*/
@SuppressWarnings("unchecked")
public static List<Map<String,Object>> getDouyinHotData(String url,Proxy proxy) {
String iid = url.split("iid=")[1].split("&")[0];
String ch_id = url.split("challenge/")[1].split("\\?")[0];
......
......@@ -3,7 +3,6 @@ package com.zhiwei.parse;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
......
package com.zhiwei.parse;
import static com.alibaba.fastjson.JSON.toJavaObject;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.MaimaiBywordAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
public class Maimai {
private static Logger logger = LoggerFactory.getLogger(Maimai.class);
private static HttpBoot httpBoot = new HttpBoot();
private static MaimaiBywordAnalysis maimaiBywordAnalysis = new MaimaiBywordAnalysis();
/**
*
* @Description 实名动态
* @param key
* @param cookie
* @param time
* @param proxy
* @return
*/
@SuppressWarnings("unchecked")
public static List<Map<String,Object>> getData(String key,String cookie,String time,Proxy proxy) {
Map<String,String> headerMap = HeadGet.getMaimaiKeywordHeaderMap(cookie);
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
List<Map<String,Object>> dataList = new ArrayList<>();
boolean f = true;
try {
String url = "https://maimai.cn/search/feeds?query="+URLEncoder.encode(key, "utf-8")+"&limit=20&offset=0&highlight=true&sortby=time&jsononly=1";
......@@ -32,11 +54,11 @@ public class Maimai {
Map<String,Object> map = maimaiBywordAnalysis.getData(result, time);
f = (boolean) map.get("hasMore");
List<Map<String,Object>> daList = (List<Map<String, Object>>) map.get("data");
if(daList != null && daList.size() > 0) {
if(daList != null && !daList.isEmpty()) {
dataList.addAll(daList);
url = "https://maimai.cn/search/feeds?query="+URLEncoder.encode(key, "utf-8")+"&limit=20&offset="+i+"&highlight=true&sortby=time&jsononly=1";
i+=20;
logger.info("{}==采集到的数据量=="+dataList.size(),key);
logger.info("{} ==采集到的数据量== {}",dataList.size(),key);
ZhiWeiTools.sleep(2000);
}else {
break;
......@@ -48,9 +70,19 @@ public class Maimai {
return dataList;
}
/**
*
* @Description 职言交流
* @param key
* @param cookie
* @param time
* @param proxy
* @return
*/
@SuppressWarnings("unchecked")
public static List<Map<String,Object>> getDataByNoName(String key,String cookie,String time,Proxy proxy) {
Map<String,String> headerMap = HeadGet.getMaimaiKeywordHeaderMap(cookie);
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
List<Map<String,Object>> dataList = new ArrayList<>();
boolean f = true;
try {
String url = "https://maimai.cn/search/gossips?query="+URLEncoder.encode(key, "utf-8")+"&limit=20&offset=0&highlight=true&sortby=time&jsononly=1";
......@@ -64,7 +96,7 @@ public class Maimai {
dataList.addAll(daList);
url = "https://maimai.cn/search/gossips?query="+URLEncoder.encode(key, "utf-8")+"&limit=20&offset="+i+"highlight=true&sortby=time&jsononly=1";
i+=20;
logger.info("{}==采集到的数据量=="+dataList.size(),key);
logger.info("{} ==采集到的数据量== {} ",dataList.size(),key);
ZhiWeiTools.sleep(2000);
}else {
break;
......@@ -76,4 +108,80 @@ public class Maimai {
return dataList;
}
/**
 * Reads the counters and basic fields of a Maimai gossip from its detail page.
 *
 * <p>Example page:
 * https://maimai.cn/web/gossip_detail?encode_id=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MTk2MzEyNjYsImlhdCI6MTU0ODI5NzI5NX0.N6SPmcf-fyitLNomzY-a8BEY31eseYnvG7RTUQ3jxYY
 *
 * <p>The page embeds its data as an escaped string passed to {@code JSON.parse("...")};
 * that string is cut out, unicode-decoded and parsed, and the {@code data.gossip}
 * object is read.
 *
 * @param url   gossip detail page URL
 * @param proxy proxy holder handed to the HTTP client
 * @return a map with the keys {@code like}, {@code spreads}, {@code cmts}, {@code gid},
 *         {@code title} and {@code author}; an empty immutable map when fetching or
 *         parsing fails
 */
public static Map<String,Object> getMaiaiCount(String url,ProxyHolder proxy) {
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
        String page = response.body().string();
        // Keep only the payload between JSON.parse(" and ");</script>
        String afterMarker = page.split("JSON.parse\\(\"")[1];
        String escapedJson = afterMarker.split("\"\\);\\</script\\>")[0];
        String decodedJson = ZhiWeiTools.decodeUnicode(escapedJson);
        JSONObject gossip = JSONObject.parseObject(decodedJson)
                .getJSONObject("data")
                .getJSONObject("gossip");
        Map<String,Object> counters = new HashMap<>();
        counters.put("like", gossip.getInteger("likes"));
        counters.put("spreads", gossip.getInteger("spreads"));
        counters.put("cmts", gossip.getInteger("cmts"));
        counters.put("gid", gossip.getLong("id"));
        counters.put("title", gossip.getString("text"));
        counters.put("author", gossip.getString("author"));
        return counters;
    } catch (Exception e) {
        logger.error(" 脉脉 转评攒 获取失败 {}",e);
        return Collections.emptyMap();
    }
}
/**
 * Collects the comments of a Maimai gossip.
 *
 * <p>Example page:
 * https://maimai.cn/web/gossip_detail?encode_id=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MTk2MzEyNjYsImlhdCI6MTU0ODI5NzI5NX0.N6SPmcf-fyitLNomzY-a8BEY31eseYnvG7RTUQ3jxYY
 *
 * <p>The gossip id is taken from {@link #getMaiaiCount(String, ProxyHolder)}; comments
 * are then paged through {@code /sdk/web/gossip/getcmts} until a page has no comments,
 * the response reports {@code more == 0}, the body is empty, or a request fails.
 * A failure ends the crawl rather than retrying the same page, which previously
 * looped forever.
 *
 * @param url   gossip detail page URL
 * @param proxy proxy holder handed to the HTTP client
 * @return one map per comment holding the raw comment fields plus {@code fromUrl} and
 *         the entries returned by {@code getMaiaiCount}; an empty list when the gossip
 *         id cannot be resolved
 */
@SuppressWarnings("unchecked")
public static List<Map<String,Object>> getMaimaiCommentList(String url,ProxyHolder proxy) {
    List<Map<String,Object>> dataList = new ArrayList<>();
    Map<String,Object> mmid = getMaiaiCount(url, proxy);
    // getMaiaiCount signals failure with an empty map, so check for the id itself.
    if (mmid == null || mmid.get("gid") == null) {
        logger.warn("脉脉 gid 获取失败, url = {}", url);
        return dataList;
    }
    String gid = String.valueOf(mmid.get("gid"));
    int page = 0;
    boolean more = true;
    while (more) {
        String link = "https://maimai.cn/sdk/web/gossip/getcmts?gid="+gid+"&page="+page+"&count=50&hotcmts_limit_count=100";
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(link), proxy)) {
            String htmlBody = response.body().string();
            if (htmlBody == null || htmlBody.isEmpty()) {
                // Nothing to parse; asking for the same page again would never terminate.
                break;
            }
            JSONObject dataJson = JSONObject.parseObject(htmlBody);
            JSONArray commentJson = dataJson.getJSONArray("comments");
            if (commentJson == null || commentJson.isEmpty()) {
                more = false;
            } else {
                for (int i = 0; i < commentJson.size(); i++) {
                    JSONObject json = commentJson.getJSONObject(i);
                    Map<String,Object> dataMap = toJavaObject(json, Map.class);
                    dataMap.put("fromUrl", url);
                    dataMap.putAll(mmid);
                    dataList.add(dataMap);
                }
                page++;
            }
            if (dataJson.getIntValue("more") == 0) {
                more = false;
            }
        } catch (Exception e) {
            logger.error("数据采集出错 url = {}", link, e);
            // Stop here and return what was collected so far.
            more = false;
        }
    }
    return dataList;
}
}
package com.zhiwei.parse;
import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.PearVideoByWordAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
......@@ -26,24 +26,31 @@ public class PearVideo {
* @return
*/
public static List<Map<String,Object>> getPearVideoData(String word,Proxy proxy) {
Map<String,String> headerMap = HeadGet.getPearVideoByWordHeaderMap(null);
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
Map<String,String> headerMap = new HashMap<>();
headerMap.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
headerMap.put("Accept", "text/html, */*; q=0.01");
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");
headerMap.put(":authority", "www.pearvideo.com");
List<Map<String,Object>> dataList = new ArrayList<>();
try {
headerMap.put("referer", "https://www.pearvideo.com/search.jsp?start=0&k="+URLEncoder.encode(word, "UTF-8"));
for(int i = 0; i <= 9000;i+=10) {
String url = "http://www.pearvideo.com/search_loading.jsp?start="+i+"&k="+URLEncoder.encode(word, "UTF-8");
String url = "https://www.pearvideo.com/search_loading.jsp?start="+i+"&k="+URLEncoder.encode(word, "UTF-8") + "&sort=first_publish_time";
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
List<Map<String,Object>> dataList1 = pearVideoByWordAnalysis.getPearVideoData(result);
if(dataList1 != null && dataList1.size() > 0) {
if(dataList1 != null && !dataList1.isEmpty()) {
dataList.addAll(dataList1);
}
System.out.println(i+"=========="+dataList.size());
ZhiWeiTools.sleep(4000);
}
return dataList;
} catch (UnsupportedEncodingException e) {
logger.error("获取数据出错",e.getMessage());
e.printStackTrace();
return null;
} catch (Exception e) {
logger.error("获取数据出错 {}",e);
return Collections.emptyList();
}
}
......
......@@ -11,16 +11,21 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.bean.QQkbUser;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.QQKBAccountAnalysis;
import com.zhiwei.parse.analysis.QQKBCommentAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
public class QQKB {
private static Logger logger = LoggerFactory.getLogger(QQKB.class);
private static QQKBAccountAnalysis qqAccountAnalysis = new QQKBAccountAnalysis();
private static QQKBCommentAnalysis qqkbCommentAnalysis = new QQKBCommentAnalysis();
private static HttpBoot httpBoot = new HttpBoot();
/**
*
......@@ -113,8 +118,9 @@ public class QQKB {
Map<String,Object> paramMap = HeadGet.getQQKBCommentParamMap(comment_id, article_id);
int i = 1;
while(true) {
try {
String result = HttpClient.executeHttpRequestPost("http://r.cnews.qq.com/getQQNewsComment",proxy, headerMap, paramMap);
String result = HttpClient.executeHttpRequestPost("http://r.cnews.qq.com/getQQNewsComment",ProxyFactory.getNatProxy(), headerMap, paramMap);
paramMap.clear();
List<Map<String,Object>> lists = qqkbCommentAnalysis.getCommentData(result,null,comment_id, article_id,proxy);
if(lists == null || lists.size() < 1) {
......@@ -124,7 +130,10 @@ public class QQKB {
paramMap = qqkbCommentAnalysis.getParamMap(result,i,comment_id,article_id);
i++;
ZhiWeiTools.sleep(5000);
ZhiWeiTools.sleep(300);
} catch (Exception e) {
e.printStackTrace();
}
}
return dataList;
} catch (Exception e) {
......
......@@ -13,8 +13,6 @@ import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
......@@ -31,7 +29,6 @@ import okhttp3.Request;
public class QQKandian {
private static Logger logger = LoggerFactory.getLogger(QQKandian.class);
public List<QQKandianUser> getUser(String name,Proxy proxy) {
if(name != null && name.length() > 0) {
......
......@@ -3,7 +3,6 @@ package com.zhiwei.parse;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
......
......@@ -5,7 +5,6 @@ import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
......
......@@ -68,6 +68,7 @@ public class SouBao {
poi.exportExcel("D:\\crawlerdata\\搜报网-EA 品牌 关键词-06.11-06.12.xlsx", "sa", headList, bodyList);
}
@SuppressWarnings("unchecked")
public static Map<String,String> getdata() {
Map<String,String> map = new HashMap<String,String>();
PoiExcelUtil poi = PoiExcelUtil.getInstance();
......
......@@ -13,6 +13,7 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.SouhuAccountAnalysis;
......@@ -144,8 +145,8 @@ public class Souhu {
int j = 1;
try {
while(true) {
String newurl = souhuCommentAnalysis.getSouhuURL(url,proxy) + "&page_no=" + j;
String result = HttpClient.executeHttpRequestGet(newurl,proxy,headerMap);
String newurl = souhuCommentAnalysis.getSouhuURL(url,ProxyFactory.getNatProxy()) + "&page_no=" + j;
String result = HttpClient.executeHttpRequestGet(newurl,ProxyFactory.getNatProxy(),headerMap);
System.out.println(newurl);
JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONObject("jsonObject").getJSONArray("comments");
......@@ -158,7 +159,7 @@ public class Souhu {
dataList.add(map);
}
j++;
ZhiWeiTools.sleep(3000);
ZhiWeiTools.sleep(300);
}
} catch (Exception e) {
......
......@@ -7,9 +7,6 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.ToutiaoKeyWordAnalysis;
......@@ -17,7 +14,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public class Toutiao {
private static Logger logger = LoggerFactory.getLogger(Toutiao.class);
private static ToutiaoKeyWordAnalysis toutiaoKeyWordAnalysis = new ToutiaoKeyWordAnalysis();
......
......@@ -5,6 +5,8 @@ import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
......@@ -12,13 +14,14 @@ import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.parse.analysis.XueqiuKeyWordAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
import okhttp3.Response;
public class Xueqiu {
......@@ -60,8 +63,26 @@ public class Xueqiu {
break;
}
}
return bodyList;
}
/**
 * Reads the like / repost / comment counters of a Xueqiu post page.
 *
 * <p>The counters are taken from the JSON assigned to {@code window.SNOWMAN_STATUS}
 * in the page source, i.e. the text between that assignment and
 * {@code window.SNOWMAN_TARGET}, up to its last {@code ;}.
 *
 * @param url   post page URL
 * @param proxy proxy handed to the HTTP client
 * @return a map with the keys {@code like}, {@code repostCount} and
 *         {@code commentCount}; an empty immutable map when fetching or parsing fails
 */
public static Map<String,Object> getUrlData(String url,Proxy proxy) {
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
        String result = response.body().string();
        String jsondata = result.split("window.SNOWMAN_STATUS = ")[1].split("window.SNOWMAN_TARGET")[0];
        int end = jsondata.lastIndexOf(';');
        if (end < 0) {
            // Without the terminating ';' the JSON boundary is unknown.
            logger.warn(" 雪球 数据转评赞获取失败 url = {}", url);
            return Collections.emptyMap();
        }
        JSONObject json = JSONObject.parseObject(jsondata.substring(0, end));
        Map<String,Object> map = new HashMap<>();
        map.put("like", json.getInteger("like_count"));
        map.put("repostCount", json.getInteger("retweet_count"));
        map.put("commentCount", json.getInteger("reply_count"));
        return map;
    } catch (Exception e) {
        // The throwable goes last so SLF4J logs the stack trace instead of only toString().
        logger.error(" 雪球 数据转评赞获取失败 url = {}", url, e);
    }
    return Collections.emptyMap();
}
}
......@@ -86,6 +86,7 @@ public class Yiche {
ZhiWeiTools.sleep(2000);
page++;
}
return bodyList;
}
return Collections.emptyList();
......
package com.zhiwei.parse;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.tools.tools.URLCodeUtil;
import okhttp3.Response;
/**
 * Keyword search crawler for Youku video results.
 */
public class Youku {

    private static final Logger logger = LoggerFactory.getLogger(Youku.class);

    private static final HttpBoot httpBoot = new HttpBoot();

    /** Value sent as the {@code aaid} query parameter of every search request. */
    private static final String AAID = "9cae49f0e031664b00d8f9c108e586ab";

    /** Number of result pages requested per keyword. */
    private static final int MAX_PAGE = 20;

    /** Selector of the title link inside one result module. */
    private static final String TITLE_LINK_SELECTOR = "div.mod-main > div.mod-header > h2 > a";

    /** Selector of the info paragraph inside one result module. */
    private static final String INFO_SELECTOR = "div.mod-main > div.mod-info > p";

    /**
     * Searches Youku for {@code word} and collects the results of pages 1 to
     * {@value #MAX_PAGE}.
     *
     * <p>Each page embeds its result HTML as the {@code html} field of a JSON object
     * passed to {@code bigview.view(...)}. Only result modules whose info text contains
     * {@code 上传时间:} are kept. A page that fails to load or parse is logged and
     * skipped; the remaining pages are still requested.
     *
     * @param word search keyword
     * @return one map per kept result with the keys {@code title}, {@code url},
     *         {@code time} and {@code uper}
     */
    public static List<Map<String,Object>> getDataList(String word) {
        List<Map<String,Object>> list = new ArrayList<>();
        // Everything except the page number is the same for all requests.
        String baseUrl = "https://so.youku.com/search_video/q_"+URLCodeUtil.getURLEncode(word, "UTF-8")+"?spm=a2h0k.11417342.filter.dnew&orderfield=createtime&aaid="+AAID+"&pg=";
        for (int i = 1; i <= MAX_PAGE; i++) {
            String url = baseUrl + i;
            logger.debug("youku search url {}", url);
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyFactory.getNatProxy())) {
                String result = response.body().string();
                String jsondata = result.split("bigview.view\\(")[1].split("\\)\\</script\\>")[0];
                JSONObject json = JSONObject.parseObject(jsondata);
                Document doc = Jsoup.parse(json.getString("html"));
                Elements elements = doc.select("div.sk-result-list").select("div.sk-mod");
                for (Element element : elements) {
                    String time = element.select(INFO_SELECTOR).text();
                    if (!time.contains("上传时间:")) {
                        continue;
                    }
                    Elements titleLink = element.select(TITLE_LINK_SELECTOR);
                    Map<String,Object> map = new HashMap<>();
                    map.put("title", titleLink.text());
                    map.put("url", "https:"+titleLink.attr("href"));
                    map.put("time", time.replaceAll("上传时间:", "").split(" ")[0]);
                    // Removes whatever precedes "上传者:" and keeps the text from that label onwards.
                    map.put("uper",time.replace(time.split("上传者:")[0], ""));
                    list.add(map);
                }
                logger.info(" i = {} dataSize = {} ",i,list.size());
            } catch (Exception e) {
                logger.error(" Exception url = {} ", url, e);
            }
        }
        return list;
    }
}
......@@ -13,13 +13,19 @@ import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
public class AiqiyiByWordAnalysis {
private static Logger logger = LoggerFactory.getLogger(AiqiyiByWordAnalysis.class);
private static HttpBoot httpBoot = new HttpBoot();
/**
*
* @Description 解析出所有有用链接
......@@ -45,9 +51,9 @@ public class AiqiyiByWordAnalysis {
}
public Map<String,Object> getAiqiyiData(String url,Map<String,String> headerMap,Proxy proxy) {
Map<String,Object> dataMap = new HashMap<String,Object>();
try {
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
Map<String,Object> dataMap = new HashMap<>();
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){
String result = response.body().string();
Document doc = Jsoup.parse(result);
String time = doc.select("#widget-vshort-ptime").text();
if(!time.contains("2017")) {
......@@ -68,7 +74,7 @@ public class AiqiyiByWordAnalysis {
System.out.println(dataMap.toString());
return dataMap;
} catch (Exception e) {
logger.error("解析出错",e.getMessage());
logger.error("解析出错 {}",e);
return dataMap;
}
}
......
......@@ -14,13 +14,17 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
public class BaijiaAccountAnalysis {
private static Logger logger = LoggerFactory.getLogger(BaijiaAccountAnalysis.class);
private static HttpBoot httpBoot = new HttpBoot();
public Map<String,Object> getBaijiaAccount2Data(JSONObject data) {
Map<String,Object> map = new HashMap<String,Object>();
......@@ -159,8 +163,8 @@ public class BaijiaAccountAnalysis {
public String getBaijiaContent(String url,Proxy proxy) {
ZhiWeiTools.sleep(2000);
Map<String,String> headerMap = HeadGet.getBaijiaAccountHeaderMap(null);
try {
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){
String result = response.body().string();
Document document = Jsoup.parse(result);
return document.select("section.news-content").text();
} catch (Exception e) {
......
......@@ -14,13 +14,17 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
public class DayuByWordAnalysis {
private static Logger logger = LoggerFactory.getLogger(DayuByWordAnalysis.class);
private static HttpBoot httpBoot = new HttpBoot();
public List<Map<String,Object>> getDayuByWordData(String result,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
......@@ -28,7 +32,7 @@ public class DayuByWordAnalysis {
JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONObject("data").getJSONArray("iflowItems");
for(int i = 0;i < jsonArry.size();i++) {
Map<String,Object> map = new HashMap<String,Object>();
Map<String,Object> map = new HashMap<>();
JSONObject data = jsonArry.getJSONObject(i);
map.put("title", data.getString("title").replaceAll("<.*?>", ""));
String url = data.getString("zzd_url");
......@@ -42,7 +46,7 @@ public class DayuByWordAnalysis {
}
return dataList;
} catch (Exception e) {
logger.error("解析出错",e.getMessage());
logger.error("解析出错 {}",e);
return dataList;
}
......@@ -51,22 +55,19 @@ public class DayuByWordAnalysis {
public String getContent(String url,Proxy proxy) {
ZhiWeiTools.sleep(2000);
Map<String,String> headerMap = HeadGet.getDayuCommentHeaderMap(null);
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){
String result = response.body().string();
Pattern pat = Pattern.compile("xissJsonData = (.*);");
Matcher matcher = pat.matcher(result);
try {
if(matcher.find()) {
String s = matcher.group(0);
JSONObject json = JSONObject.parseObject(s.substring(15, s.length() - 1));
String content = json.getString("content").replaceAll("<.*?>", "");
return content;
return json.getString("content").replaceAll("<.*?>", "");
}
return null;
} catch (Exception e) {
logger.error("解析文本出错",e.getMessage());
System.out.println(result);
return null;
e.printStackTrace();
}
return null;
}
}
......@@ -10,12 +10,8 @@ import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
public class DayuCommentAnalysis {
private static Logger logger = LoggerFactory.getLogger(DayuCommentAnalysis.class);
......@@ -33,7 +29,7 @@ public class DayuCommentAnalysis {
JSONObject json = JSONObject.parseObject(result).getJSONObject("data").getJSONObject("comments_map");
Map<String,Object> map = (Map<String,Object>)json;
for(Map.Entry<String, Object> entry : map.entrySet() ) {
Map<String,Object> dataMap = new HashMap<String, Object>();
Map<String,Object> dataMap = new HashMap<>();
JSONObject data = JSONObject.parseObject(entry.getValue().toString());
dataMap.put("content", data.getString("content"));
dataMap.put("nickname", data.getJSONObject("user").getString("nickname"));
......@@ -45,78 +41,78 @@ public class DayuCommentAnalysis {
dataMap.put("time", TimeParse.dateFormartString(new Date(time), "yyyy-MM-dd HH:mm:ss"));
int i = data.getInteger("reply_cnt");
dataMap.put("replay_count", i);
if(i > 0) {
dataList.addAll(getReplayData(id,articleId,proxy));
}
// if(i > 0) {
// dataList.addAll(getReplayData(id,articleId,proxy));
// }
dataList.add(dataMap);
}
return dataList;
} catch (Exception e) {
logger.error("解析出错",e.getMessage());
logger.error("解析出错 {}",e);
return dataList;
}
}
/**
 * Collects the replies posted under a single Dayu comment.
 *
 * <p>The first request uses {@code ts=-1}. Every page is handed to
 * {@code analysisReplayData}, which appends the parsed replies to the result list and
 * returns the value used as the {@code ts} parameter of the next request. Paging stops
 * when that value is 0 or has already been returned by an earlier page.
 *
 * @param id        identifier of the comment, inserted into the request path
 * @param articleId identifier of the article, sent as the {@code articleId} query parameter
 * @param proxy     proxy handed to every HTTP request
 * @return all replies gathered across the requested pages
 */
private List<Map<String,Object>> getReplayData(String id,String articleId,Proxy proxy) {
    Map<String,String> headers = HeadGet.getDayuCommentHeaderMap(null);
    String endpoint = "http://m.uczzd.cn/iflow/api/v2/cmt/detail/"+id+"/comments?articleId="+articleId+"&count=10&ts=";
    String page = HttpClient.executeHttpRequestGet(endpoint+"-1",proxy, headers);
    List<Map<String,Object>> replies = new ArrayList<>();
    // Values already returned by analysisReplayData; a repeat means paging made no progress.
    List<String> seenCursors = new ArrayList<>();
    boolean morePages = true;
    while (morePages) {
        // Throttle before each page is processed (presumably milliseconds — confirm in ZhiWeiTools).
        ZhiWeiTools.sleep(2000);
        long cursor = analysisReplayData(page, replies);
        String cursorKey = String.valueOf(cursor);
        if (seenCursors.contains(cursorKey)) {
            morePages = false;
        } else {
            seenCursors.add(cursorKey);
            if (cursor == 0) {
                morePages = false;
            } else {
                page = HttpClient.executeHttpRequestGet(endpoint+cursor,proxy, headers);
            }
        }
    }
    System.out.println("=====================评论下回复获取数=="+replies.size());
    return replies;
}
// /**
// *
// * @Description 解析
// * @param id
// * @param articleId
// * @return
// */
// private List<Map<String,Object>> getReplayData(String id,String articleId,Proxy proxy) {
// Map<String,String> headerMap = HeadGet.getDayuCommentHeaderMap(null);
// String url = "http://m.uczzd.cn/iflow/api/v2/cmt/detail/"+id+"/comments?articleId="+articleId+"&count=10&ts=";
// String result = HttpClient.executeHttpRequestGet(url+"-1",proxy, headerMap);
// List<Map<String,Object>> data = new ArrayList<Map<String,Object>>();
// List<String> timeList = new ArrayList<String>();
// while(true) {
// ZhiWeiTools.sleep(2000);
// long time = analysisReplayData(result,data);
// if(timeList.contains(String.valueOf(time))){
// break;
// }
// timeList.add(String.valueOf(time));
// if(time == 0) {
// break;
// }
// result = HttpClient.executeHttpRequestGet(url+time,proxy, headerMap);
// }
// System.out.println("=====================评论下回复获取数=="+data.size());
// return data;
// }
/**
 * Parses one page of comment replies and appends them to {@code dataList}.
 *
 * <p>Each element of {@code data.replies} in the JSON becomes one map with the keys
 * {@code content}, {@code nickname}, {@code like}, {@code id}, {@code url},
 * {@code time} and {@code replay_count}.
 *
 * @param result   raw JSON text of a reply page
 * @param dataList list that receives one map per parsed reply; replies added before a
 *                 parsing failure stay in the list
 * @return the {@code timeShow} value of the last reply on the page, or 0 when the page
 *         contains no replies or parsing fails
 */
private long analysisReplayData(String result,List<Map<String,Object>> dataList) {
    long time = 0;
    try {
        JSONObject json = JSONObject.parseObject(result);
        JSONArray jsonArry = json.getJSONObject("data").getJSONArray("replies");
        for(int i = 0; i < jsonArry.size();i++) {
            Map<String,Object> map = new HashMap<>();
            JSONObject data = jsonArry.getJSONObject(i);
            map.put("content", data.getString("content"));
            map.put("nickname", data.getString("nickname"));
            map.put("like", data.getString("up_cnt"));
            map.put("id", data.getString("commentId"));
            map.put("url", data.getString("shareUrl"));
            time = data.getLong("timeShow");
            map.put("time", TimeParse.dateFormartString(new Date(time), "yyyy-MM-dd HH:mm:ss"));
            map.put("replay_count", data.getInteger("replyCnt"));
            dataList.add(map);
        }
        return time;
    } catch (Exception e) {
        // Pass the exception itself: the format string has no placeholder, so an extra
        // e.getMessage() argument would be discarded and the failure cause lost.
        logger.error("获取大鱼号评论出错--回复的", e);
        return 0;
    }
}
// /**
// *
// * @Description 解析
// * @param result
// * @param dataList
// * @return
// */
// private long analysisReplayData(String result,List<Map<String,Object>> dataList) {
// long time = 0;
// try {
// JSONObject json = JSONObject.parseObject(result);
// JSONArray jsonArry = json.getJSONObject("data").getJSONArray("replies");
// for(int i = 0; i < jsonArry.size();i++) {
// Map<String,Object> map = new HashMap<String, Object>();
// JSONObject data = jsonArry.getJSONObject(i);
// map.put("content", data.getString("content"));
// map.put("nickname", data.getString("nickname"));
// map.put("like", data.getString("up_cnt"));
// map.put("id", data.getString("commentId"));
// map.put("url", data.getString("shareUrl"));
// time = data.getLong("timeShow");
// map.put("time", TimeParse.dateFormartString(new Date(time), "yyyy-MM-dd HH:mm:ss"));
// map.put("replay_count", data.getInteger("replyCnt"));
// dataList.add(map);
// }
// return time;
// } catch (Exception e) {
// logger.error("获取大鱼号评论出错--回复的",e.getMessage());
// return 0;
// }
// }
......
......@@ -10,8 +10,6 @@ import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.plaf.synth.SynthSpinnerUI;
import org.apache.commons.lang3.math.NumberUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
......
......@@ -6,16 +6,12 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.timeparse.TimeParse;
public class DouyinHotDataAnalysis {
private static Logger logger = LoggerFactory.getLogger(DouyinHotDataAnalysis.class);
public Map<String,Object> getData(String result) {
try {
......
......@@ -11,12 +11,17 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
public class FenghuangAccountAnalysis {
private static Logger logger = LoggerFactory.getLogger(FenghuangAccountAnalysis.class);
private static HttpBoot httpBoot = new HttpBoot();
/**
*
......@@ -31,8 +36,8 @@ public class FenghuangAccountAnalysis {
Map<String,String> headerMap = HeadGet.getFenghuangAccountHeaderMap(null);
JSONArray jsonArry = null;
for(int i = 0;i < 3;i++) {
try {
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){
String result = response.body().string();
JSONObject json = JSONObject.parseObject(result);
jsonArry = json.getJSONObject("data").getJSONObject("feeds").getJSONArray("list");
if(jsonArry == null || jsonArry.size() < 1) {
......@@ -83,7 +88,7 @@ public class FenghuangAccountAnalysis {
map.put("url", json.getString("shareurl"));
map.put("id", json.getString("aid"));
} catch (Exception e) {
logger.error("解析具体文章的时候出错",e.getMessage());
logger.error("解析具体文章的时候出错 {}",e);
return null;
}
return map;
......
......@@ -13,17 +13,22 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.tools.timeparse.TimeParse;
import okhttp3.Response;
public class FenghuangCommentAnalysis {
private static Logger logger = LoggerFactory.getLogger(FenghuangCommentAnalysis.class);
private static HttpBoot httpBoot = new HttpBoot();
public Map<String,Object> getFenghuangCommentCount(String url,Proxy proxy) {
Map<String,Object> map = new HashMap<>();
try {
String result = HttpClient.executeHttpRequestGet(url,proxy, null);
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
String result = response.body().string();
JSONObject json = JSONObject.parseObject(result);
map.put("real_count", json.getInteger("count"));
map.put("comment_num", json.getInteger("join_count"));
......@@ -44,8 +49,8 @@ public class FenghuangCommentAnalysis {
public String getdocUrl(String url,Proxy proxy) {
String docUrl = null;
for(int i = 0;i < 3;i++) {
try {
String result = HttpClient.executeHttpRequestGet(url,proxy, null);
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
String result = response.body().string();
if(result.contains("commentUrl\": \"")) {
docUrl = result.split("commentUrl\": \"")[1].split("\",")[0];
break;
......@@ -76,24 +81,18 @@ public class FenghuangCommentAnalysis {
*/
public List<Map<String,Object>> getData(String url,Proxy proxy) {
Map<String,String> headerMap = HeadGet.getFenghuangCommentHeaderMap(null);
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
String result;
try {
result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
} catch (Exception e) {
logger.error("链接获取信息失败",e.getMessage());
return null;
}
List<Map<String,Object>> dataList = new ArrayList<>();
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy)){
String result = response.body().string();
JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONArray("data");
try {
for(int i = 0;i < jsonArry.size(); i ++) {
Map<String,Object> map = getcommentData(jsonArry.getJSONObject(i));
dataList.add(map);
}
} catch (Exception e) {
logger.error("获取信息出错",e.getMessage());
return null;
logger.error("链接获取信息失败",e);
return Collections.emptyList();
}
return dataList;
......@@ -109,22 +108,16 @@ public class FenghuangCommentAnalysis {
*/
public List<Map<String,Object>> getData2(String url,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<>();
String result;
try {
result = HttpClient.executeHttpRequestGet(url,proxy, null);
} catch (Exception e) {
logger.error("链接获取信息失败 {}",e);
return Collections.emptyList();
}
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
String result = response.body().string();
JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONArray("comments");
try {
for(int i = 0;i < jsonArry.size(); i ++) {
Map<String,Object> map = getcommentData2(jsonArry.getJSONObject(i));
dataList.add(map);
}
} catch (Exception e) {
logger.error("获取信息出错 {}",e);
logger.error("链接获取信息失败 {}",e);
return Collections.emptyList();
}
return dataList;
......@@ -154,7 +147,7 @@ public class FenghuangCommentAnalysis {
* @return
*/
private Map<String,Object> getcommentData(JSONObject json) {
Map<String,Object> map = new HashMap<String, Object>();
Map<String,Object> map = new HashMap<>();
try {
JSONObject data = json.getJSONObject("data");
map.put("nickname", json.getString("nickname"));
......@@ -169,7 +162,7 @@ public class FenghuangCommentAnalysis {
long time = data.getLong("add_time") * 1000;
map.put("time", TimeParse.dateFormartString(new Date(time), "yyyy-MM-dd HH:mm:ss"));
} catch (Exception e) {
logger.error("具体解析一条数据出错",e.getMessage());
logger.error("具体解析一条数据出错 {}",e);
return null;
}
return map;
......
......@@ -5,15 +5,11 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
public class MaimaiBywordAnalysis {
private static Logger logger = LoggerFactory.getLogger(MaimaiBywordAnalysis.class);
public Map<String,Object> getData(String result,String time) {
Map<String,Object> map1 = new HashMap<String,Object>();
......@@ -38,7 +34,7 @@ public class MaimaiBywordAnalysis {
map.put("like", data.getJSONObject("feed").getInteger("likes"));
map.put("comment_count", data.getJSONObject("feed").getInteger("total_cnt"));
map.put("spreads", data.getJSONObject("feed").getInteger("spreads")); //传播数
System.out.println(map.toString());
// System.out.println(map.toString());
dataList.add(map);
}
map1.put("data", dataList);
......@@ -69,7 +65,7 @@ public class MaimaiBywordAnalysis {
map.put("like", data.getJSONObject("gossip").getInteger("likes"));
map.put("comment_count", data.getJSONObject("gossip").getInteger("total_cnt"));
map.put("spreads", data.getJSONObject("gossip").getInteger("search_order")); //传播数
System.out.println(map.toString());
// System.out.println(map.toString());
dataList.add(map);
}
map1.put("data", dataList);
......
......@@ -19,7 +19,6 @@ import com.zhiwei.util.TimeUtil;
public class MeipaiByWordAnalysis {
private static Logger logger = LoggerFactory.getLogger(MeipaiByWordAnalysis.class);
/**
*
* @Description 解析此页
......
......@@ -20,7 +20,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public class QQKBCommentAnalysis {
private static Logger logger = LoggerFactory.getLogger(QQKBCommentAnalysis.class);
/**
*
* @Description 获取post信息
......@@ -37,10 +36,9 @@ public class QQKBCommentAnalysis {
JSONObject data = jsonArry.getJSONArray(jsonArry.size()-1).getJSONObject(0);
String coral_scorem = data.getString("coral_score");
String reply_id = data.getString("reply_id");
Map<String,Object> paMap = HeadGet.getQQKBCommentParamMap2(comment_id, page, coral_scorem, article_id, reply_id);
return paMap;
return HeadGet.getQQKBCommentParamMap2(comment_id, page, coral_scorem, article_id, reply_id);
} catch (Exception e) {
logger.error("构造post请求信息失败",e.getMessage());
logger.error("构造post请求信息失败 {}",e);
return null;
}
}
......@@ -52,13 +50,13 @@ public class QQKBCommentAnalysis {
* @return
*/
public List<Map<String,Object>> getCommentData(String result,String cookie,String comment_id, String article_id,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
List<Map<String,Object>> dataList = new ArrayList<>();
try {
JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONObject("comments").getJSONArray("hot");
for(int i = 0; i < jsonArry.size() ;i++) {
JSONObject data = jsonArry.getJSONArray(i).getJSONObject(0);
Map<String,Object> map = new HashMap<String,Object>();
Map<String,Object> map = new HashMap<>();
map.put("content", data.getString("reply_content"));
map.put("time", TimeParse.dateFormartString(new Date(Long.valueOf(data.getString("tipstime")) * 1000L), "yyyy-MM-dd HH:mm:ss"));
map.put("name", data.getString("nick"));
......@@ -66,12 +64,11 @@ public class QQKBCommentAnalysis {
int replay_num = 0;
String reply_id = data.getString("reply_id");
if(data.toString().contains("reply_num")) {
replay_num = data.getInteger("reply_num");
List<Map<String,Object>> lists = getReplyCommentData(cookie,reply_id,comment_id, article_id,proxy);
if(lists != null && lists.size() > 0) {
dataList.addAll(lists);
}
map.put("reply_num", replay_num);
// replay_num = data.getInteger("reply_num");
// List<Map<String,Object>> lists = getReplyCommentData(cookie,reply_id,comment_id, article_id,proxy);
// if(lists != null && lists.size() > 0) {
// dataList.addAll(lists);
// }
}
map.put("reply_id", reply_id);
map.put("reply_num", replay_num);
......@@ -80,7 +77,7 @@ public class QQKBCommentAnalysis {
}
return dataList;
} catch (Exception e) {
logger.error("解析数据出错",e.getMessage());
logger.error("解析数据出错 {}",e);
return dataList;
}
......@@ -93,7 +90,7 @@ public class QQKBCommentAnalysis {
* @return
*/
public Map<String,Object> getOneReplyComment(JSONObject data) {
Map<String,Object> map = new HashMap<String,Object>();
Map<String,Object> map = new HashMap<>();
try {
map.put("content", data.getString("reply_content"));
map.put("time", TimeParse.dateFormartString(new Date(Long.valueOf(data.getString("tipstime")) * 1000L), "yyyy-MM-dd HH:mm:ss"));
......@@ -103,13 +100,13 @@ public class QQKBCommentAnalysis {
System.out.println(map.toString());
return map;
} catch (Exception e) {
logger.error("获取单个回复评论出错",e.getMessage());
logger.error("获取单个回复评论出错 {}",e);
return null;
}
}
public List<Map<String,Object>> getReplyCommentData(String cookie,String reply_id,String comment_id, String article_id,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
List<Map<String,Object>> dataList = new ArrayList<>();
Map<String,String> headerMap = HeadGet.getQQKBCommentHeaderMap(cookie);
try {
String old_reply_id = "";
......
......@@ -6,16 +6,12 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.timeparse.TimeParse;
public class QicheHomeKwyWordAnalysis {
private static Logger logger = LoggerFactory.getLogger(QicheHomeKwyWordAnalysis.class);
public List<Map<String,Object>> getData(String result) {
try {
......
......@@ -21,15 +21,16 @@ public class WangyiHistoryAnalysis {
private static Logger logger = LoggerFactory.getLogger(WangyiHistoryAnalysis.class);
public List<Map<String,Object>> getData(String result,Proxy proxy,String endTime,String source) {
JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONObject("data").getJSONArray("list");
Map<String, String> headerMap = HeadGet.getWangyiHistoryHeaderMap(null);
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
List<Map<String,Object>> dataList = new ArrayList<>();
for(int i = 0;i < jsonArry.size();i++) {
try {
JSONObject data = jsonArry.getJSONObject(i);
Map<String,Object> map = new HashMap<String,Object>();
Map<String,Object> map = new HashMap<>();
map.put("title", data.getString("title"));
if(endTime != null && endTime.length() > 1) {
if(data.getString("ptime").compareTo(endTime) <= 0) {
......
......@@ -4,7 +4,7 @@
log4j.appender.stdout.layout.ConversionPattern=<%d>[%5p] %c - %m%n
log4j.appender.ROLLING_FILE=org.apache.log4j.DailyRollingFileAppender
log4j.appender.ROLLING_FILE.Threshold=stdout
log4j.appender.ROLLING_FILE.File=./Log/wechatcrawler.log
log4j.appender.ROLLING_FILE.File=./Log/artivleData.log
log4j.appender.ROLLING_FILE.Append=true
log4j.appender.ROLLING_FILE.layout=org.apache.log4j.PatternLayout
log4j.appender.ROLLING_FILE.layout.ConversionPattern=<%d>[%5p] %c - %m%n
\ No newline at end of file
//package com.zhiwei.Comment;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Chejia;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class ChejiaCommentCountTest {
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
// Map<String, Object> map = poi
// .importExcel("D://crawlerdata//自媒体/车家号.xlsx", 0);
// List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
// List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
// List<String> headList = (List<String>) map.get("head");
// for (Map<String, Object> map1 : list) {
// String url = map1.get("地址") + "";
//// url = "https://chejiahao.autohome.com.cn/info/3073188#reply";
// System.out.println(url);
// Chejia.getChejiaComment(url, ProxyFactory.getNatProxy());
//// int i = Chejia.getChejiaCommentCount(url, ProxyFactory.getNatProxy());
//// System.out.println(i);
//// map1.put("count", i);
// ZhiWeiTools.sleep(100);
// }
// headList.add("count");
// poi.exportExcel("D://crawlerdata//自媒体/车家号.xlsx", "评论采集", headList,
// list);
//
// }
//}
//package com.zhiwei.Comment;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Maimai;
//import com.zhiwei.parse.Yiche;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class MaimaiCommentCountTest {
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
// Map<String, Object> map = poi
// .importExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉#美团 裁员#汇总截至12月20日10点30分.xlsx(1).xlsx", 0);
// List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
// List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
// List<String> headList = (List<String>) map.get("head");
// for (Map<String, Object> map1 : list) {
// String url = map1.get("地址") + "";
// Map<String,Object> map3 = Maimai.getMaiaiCount(url, ProxyFactory.getNatProxy());
// map1.putAll(map3);
// ZhiWeiTools.sleep(100);
// }
// headList.add("like");
// headList.add("spreads");
// headList.add("cmts");
// poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉#美团 裁员#汇总截至12月20日10点30分.xlsx(1).xlsx", "评论采集", headList,
// list);
// }
//}
//package com.zhiwei.Comment;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//import java.util.Objects;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Xueqiu;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class XueqiuCommentCountTest {
// @Test
// public void f() {
//
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
// Map<String, Object> map = poi
// .importExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\雪球-腾讯.xlsx", 0);
// List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
// List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
// List<String> headList = (List<String>) map.get("head");
// for (Map<String, Object> map1 : list) {
// for(int i = 1;i < 5;i++) {
// String url = map1.get("地址") + "";
// Map<String,Object> map3 = Xueqiu.getUrlData(url, ProxyFactory.getNatProxy());
// ZhiWeiTools.sleep(100);
// if(Objects.nonNull(map3)) {
// System.out.println(map3.toString());
// map1.putAll(map3);
// break;
// }
// }
// }
// headList.add("like");
// headList.add("repostCount");
// headList.add("commentCount");
// poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\雪球-腾讯.xlsx", "评论数采集", headList,
// list);
//
// }
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Aiqiyi;
/**
 * Manual example: fetches keyword results through {@code Aiqiyi.getAiqiyiByWordData}
 * and writes them to an Excel workbook.
 */
public class AiqiyiByWordExample {

    /**
     * Queries each keyword in turn (passing {@code null} as the second argument),
     * merges the non-empty results and exports them to the "数据" sheet.
     */
    @Test
    public void aiqiyiByWordTest() {
        List<Map<String,Object>> rows = new ArrayList<>();
        for (String keyword : "美食,味道,菜".split(",")) {
            List<Map<String,Object>> hits = Aiqiyi.getAiqiyiByWordData(keyword, null);
            // Keywords that produced nothing (null or empty) contribute no rows.
            if (hits == null || hits.isEmpty()) {
                continue;
            }
            rows.addAll(hits);
        }

        // Column keys of the exported sheet, in output order.
        List<String> columns = new ArrayList<>();
        for (String column : new String[] {"count", "time", "source", "content", "url", "title"}) {
            columns.add(column);
        }

        PoiExcelUtil.getInstance().exportExcel("D://crawlerdata/爱奇艺关键词采集.xlsx", "数据", columns, rows);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Aiqiyi;
//
//public class AiqiyiByWordExample {
//
//
// @Test
// public void aiqiyiByWordTest() {
// String word = "美食,味道,菜";
// String[] words = word.split(",");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(String w : words) {
// List<Map<String,Object>> dataList = Aiqiyi.getAiqiyiByWordData(w,null);
// if(dataList != null && dataList.size() >= 1) {
// bodyList.addAll(dataList);
// }
// }
// List<String> headList = new ArrayList<String>();
// headList.add("count");
// headList.add("time");
// headList.add("source");
// headList.add("content");
// headList.add("url");
// headList.add("title");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata/爱奇艺关键词采集.xlsx", "数据", headList, bodyList);
//
//
//
// }
//
//
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Baijia;
import com.zhiwei.tools.tools.ZhiWeiTools;
public class BaijiaAccountExample {
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Baijia;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class BaijiaAccountExample {
//
//// @Test
// public void baijiaAccountTest() {
// String app_id = "1536766276004443";
// String startTime = "2015-01-01 00:00:00";
// //2017-11-30 17:48:17
// List<Map<String,Object>> lists = Baijia.getBaijiaAccountData(app_id,startTime,null);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("read_amount");
// headList.add("app_id");
// headList.add("source");
// headList.add("url");
// headList.add("content");
// poi.exportExcel("D://crawlerdata/百家号-马继华.xlsx", "马继华", headList, lists);
// }
//
//// @Test
// public void baijiaAccount2Test() {
// String app_id = "b_1548519002063358";
// String startTime = "2018-01-01 00:00:00";
// //2017-11-30 17:48:17
// List<String> idList = new ArrayList<>();
// idList.add("b_1548519002063358");
// idList.add("b_1536766292852334");
// idList.add("b_1536766781763274");
// idList.add("b_1536766200338498");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(String id : idList) {
// ZhiWeiTools.sleep(5000);
// List<Map<String,Object>> lists = Baijia.getBaijiaAccount2Data(id,startTime,null);
// bodyList.addAll(lists);
// }
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("source");
// headList.add("url");
// headList.add("content");
// poi.exportExcel("D://crawlerdata//自媒体/百家号-all.xlsx", "科学的fan", headList, bodyList);
// }
//
// @Test
/**
 * Manual example: fetches the articles of one hard-coded Baijia account through
 * {@code Baijia.getBaijiaAccountData} and exports them to the "马继华" sheet.
 */
public void baijiaAccountTest() {
    String accountId = "1536766276004443";
    String since = "2015-01-01 00:00:00";
    //2017-11-30 17:48:17
    List<Map<String,Object>> articles = Baijia.getBaijiaAccountData(accountId, since, null);

    // Column keys of the exported sheet, in output order.
    List<String> columns = new ArrayList<>();
    for (String column : new String[] {"title", "time", "read_amount", "app_id", "source", "url", "content"}) {
        columns.add(column);
    }

    PoiExcelUtil.getInstance().exportExcel("D://crawlerdata/百家号-马继华.xlsx", "马继华", columns, articles);
}
// @Test
/**
 * Manual example: fetches the articles of several hard-coded Baijia accounts through
 * {@code Baijia.getBaijiaAccount2Data} and exports all of them to a single sheet.
 *
 * <p>{@code ZhiWeiTools.sleep(5000)} is called before every account request.
 */
public void baijiaAccount2Test() {
    String startTime = "2018-01-01 00:00:00";
    //2017-11-30 17:48:17
    List<String> idList = new ArrayList<>();
    idList.add("b_1548519002063358");
    idList.add("b_1536766292852334");
    idList.add("b_1536766781763274");
    idList.add("b_1536766200338498");
    List<Map<String,Object>> bodyList = new ArrayList<>();
    for(String id : idList) {
        ZhiWeiTools.sleep(5000);
        List<Map<String,Object>> lists = Baijia.getBaijiaAccount2Data(id,startTime,null);
        // addAll(null) throws NullPointerException; skip an account without a result
        // so the remaining accounts are still collected and exported.
        if(lists != null) {
            bodyList.addAll(lists);
        }
    }
    PoiExcelUtil poi = PoiExcelUtil.getInstance();
    List<String> headList = new ArrayList<>();
    headList.add("title");
    headList.add("time");
    headList.add("source");
    headList.add("url");
    headList.add("content");
    poi.exportExcel("D://crawlerdata//自媒体/百家号-all.xlsx", "科学的fan", headList, bodyList);
}
/**
 * Reads account rows from the "body" entry of an imported workbook, fetches the
 * articles for each row through {@code Baijia.getBaijiaAccountByBaiduData} and
 * exports everything collected to a single sheet.
 *
 * <p>A row that fails is reported on standard error and skipped, so one bad row does
 * not abort the export.
 */
@Test
public void test3() {
    String path = "D://crawlerdata//自媒体/百家号采集.xlsx";
    PoiExcelUtil poi = PoiExcelUtil.getInstance();
    String startTime = "2018-05-01 00:00:00";
    // The same cookie is sent for every row, so it is defined once outside the loop.
    String cookie = "__cfduid=d847baca85b97d1967b3da02ebb345b831535524251; BAIDUID=C0F0F81EF770C5219AB9C178654135EC:FG=1; PSTM=1536376257; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; locale=zh; delPer=0; H_PS_PSSID=1447_21117_20930; PSINO=5";
    Map<String,Object> map = poi.importExcel(path, 0);
    List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
    List<Map<String,Object>> bodyList = new ArrayList<>();
    for(Map<String,Object> m : list) {
        try {
            String app_id = m.get("id").toString();
            // NOTE(review): this override discards the id read from the row, so every row
            // requests the same account — looks like a debugging leftover; confirm before removing.
            app_id = "1594158489045754";
            String name = m.get("name").toString();
            List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id,name, startTime,cookie, null);
            if(lists != null) {
                bodyList.addAll(lists);
            }
        } catch (Exception e) {
            // Best effort per row: report the failure instead of swallowing it silently,
            // then continue with the next row.
            System.err.println("Failed to collect Baijia articles for row " + m + ": " + e);
        }
    }
    List<String> headList = new ArrayList<>();
    headList.add("title");
    headList.add("time");
    headList.add("source");
    headList.add("url");
    headList.add("content");
    headList.add("read_amount");
    poi.exportExcel("D://crawlerdata//自媒体/百家号-lxj-2.xlsx", "娱乐资本论", headList, bodyList);
}
}
// public void test3() {
// String path = "D://crawlerdata//自媒体/百家号采集.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String startTime = "2018-05-01 00:00:00";
// Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(Map<String,Object> m : list) {
// try {
// String app_id = m.get("id").toString();
// app_id = "1594158489045754";
// String name = m.get("name").toString();
// String cookie = "__cfduid=d847baca85b97d1967b3da02ebb345b831535524251; BAIDUID=C0F0F81EF770C5219AB9C178654135EC:FG=1; PSTM=1536376257; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; locale=zh; delPer=0; H_PS_PSSID=1447_21117_20930; PSINO=5";
// List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id,name, startTime,cookie, null);
// if(lists != null) {
// bodyList.addAll(lists);
// }
// } catch (Exception e) {
// }
// }
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("source");
// headList.add("url");
// headList.add("content");
// headList.add("read_amount");
// poi.exportExcel("D://crawlerdata//自媒体/百家号-lxj-2.xlsx", "娱乐资本论", headList, bodyList);
// }
//
//}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Dayu;
/**
 * Manual example: reads account rows from an Excel workbook, fetches the articles of
 * each account through {@code Dayu.getDayuAccountData} and writes them back into the
 * same workbook, one sheet per account name.
 */
public class DayuAccountExample {

    /**
     * Processes every row of the workbook's "body" entry; rows whose {@code mid} and
     * {@code name} are both empty strings are skipped.
     */
    @Test
    public void dayuAccountTest() {
        // Subscription list endpoint noted by the original author:
        // https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
        String workbook = "D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx";
        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        Map<String,Object> imported = excel.importExcel(workbook, 0);
        List<Map<String,Object>> accounts = (List<Map<String, Object>>) imported.get("body");

        // Column keys of each exported sheet, in output order.
        List<String> columns = new ArrayList<>();
        for (String column : new String[] {"title", "time", "content", "source", "url"}) {
            columns.add(column);
        }

        for (Map<String,Object> account : accounts) {
            // String.valueOf turns a missing cell into the text "null", so such a row
            // is not treated as blank by the check below.
            String mid = String.valueOf(account.get("mid"));
            String name = String.valueOf(account.get("name"));
            boolean blankRow = mid.isEmpty() && name.isEmpty();
            if (!blankRow) {
                List<Map<String,Object>> articles = Dayu.getDayuAccountData(mid, name, null, null);
                excel.exportExcel(workbook, name, columns, articles);
            }
        }
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Dayu;
//
//public class DayuAccountExample {
//
//
// @Test
// public void dayuAccountTest() {
// //https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
//
//
//// String mid = "d7300311c1504d24a229c3da345785c6";
//// String name = "大鱼海棠雨";
// String startTime = "2017-01-01 00:00:00";
// String path = "D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> lists = (List<Map<String, Object>>) map.get("body");
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
//// headList.add("content_id");
//// headList.add("origin_id");
//// headList.add("xss_item_id");
// for(Map<String,Object> data : lists) {
// String mid = data.get("mid")+"";
// String name = data.get("name")+"";
// if(mid.length() < 1 && name.length() < 1) {
// continue;
// }
// List<Map<String,Object>> dataList = Dayu.getDayuAccountData(mid,name,null,null);
// poi.exportExcel(path, name, headList, dataList);
// }
//
//
// }
//
//
//}
package com.zhiwei.crawler;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.parse.Dayu;
/**
 * Manual JUnit harness for the Dayu keyword search.
 */
public class DayuByWordExample {

    /**
     * Runs {@link Dayu#getDayuByWordData} for a fixed keyword and prints how many
     * records came back. The second argument is passed as {@code null}
     * (presumably "no proxy"; confirm against the Dayu parser).
     */
    @Test
    public void dayuByWordTest() {
        final String keyword = "11";
        final List<Map<String, Object>> results = Dayu.getDayuByWordData(keyword, null);
        final int resultCount = results.size();
        System.out.println(resultCount);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.parse.Dayu;
//
//public class DayuByWordExample {
//
//
// @Test
// public void dayuByWordTest() {
// String word = "11";
//
// List<Map<String,Object>> dataList = Dayu.getDayuByWordData(word,null);
//
// System.out.println(dataList.size());
//
//
// }
//
//
//}
......@@ -10,15 +10,18 @@ import com.zhiwei.parse.Maimai;
public class MaimaiBywordExample {
public static void main(String[] args) {
String word = "美团 晋升";
String cookie = "sessionid=y87knknqrc3fi6xto2zv0s4kugmleepk; guid=GxsfBBgZGwQYGx4EGBkeVgcYGx4fHhwcGhgbVhwZBB0ZHwVDWEtMS3kKGhobBB0THhkEGgQTHAVPR0VYQmkKA0VBSU9tCk9BQ0YKBmZnfmJhAgocGQQdGR8FXkNhSE99T0ZaWmsKAx4cfWV9ChEZBBwKfmQKWV1FTkRDfQIKGgQfBUtGRkNQRWc=; seid=s1539933372113; token=\"ZTjnEij9jsL4ZCdnKF2CaUAwcJHgcem/zHvAbXp3MXdY+uSPva8scjbe2zHl2gE98CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiSFVMLVhKb2g5TkJGNHRJanljUW5Qa1V5IiwiX2V4cGlyZSI6MTU0MDAxOTc5MTUwNSwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=dJmy52LHX-stqroAbm66u2zJaZA";
String time = "2018-10-15 00:00:00";
String word = "美团|某团|MT|大众点评|新美大|美团点评";
String cookie = "guid=GxsfBBgZGwQYGx4EGBkeVhsfGB4aHBpWHBkEHRkfBUNYS0xLeQoSEwQSHR8ZBBoEGx0FT0dFWEJpCgNFQUlPbQpPQUNGCgZmZ35iYQIKHBkEHRkfBV5DYUhPfU9GWlprCgMeHH1lfQoRGQQcCn5kClldRU5EQ30CChoEHwVLRkZDUEVn; token=\"7IGuqjEwgJ2gXX5PZ0UYSxvn81Aws6v5OFrwpSErsbctlSd1e/7+AzYEMMMeeFJJ8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; _buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiOGtDSnF6VG5QcFk0R3ZmVFB4MThIMW1ZIiwiX2V4cGlyZSI6MTU0ODMwODU0MTMyNCwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=cnQ0i1LwYxhjO3_BvQ4Coh0f9PQ";
String time = "2019-01-17 00:00:00";
String[] words = word.split("\\|");
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for(String w : words) {
List<Map<String,Object>> c = Maimai.getData(w, cookie, time, null);
// List<Map<String,Object>> c = Maimai.getDataByNoName(w, cookie, time, null);
bodyList.addAll(c);
//实名动态
// List<Map<String,Object>> c = Maimai.getData(w, cookie, time, null);
//职言交流
List<Map<String,Object>> c2 = Maimai.getDataByNoName(w, cookie, time, null);
// bodyList.addAll(c);
bodyList.addAll(c2);
}
List<String> headList = new ArrayList<String>();
headList.add("time");
......@@ -29,7 +32,7 @@ public class MaimaiBywordExample {
headList.add("comment_count");
headList.add("spreads");
PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团 晋升-1015.xlsx", "脉脉关键词", headList, bodyList);
poi.exportExcel("D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团-0123.xlsx", "脉脉关键词", headList, bodyList);
}
}
......@@ -13,7 +13,7 @@ public class PearVideoByWordExample {
@Test
public void pearVideoByWordTest() {
String word = "美食";
String word = "大宝 甲醛";
List<Map<String,Object>> bodyList = PearVideo.getPearVideoData(word,null);
List<String> headList = new ArrayList<String>();
......
......@@ -6,6 +6,8 @@ import java.util.Map;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.QQKB;
import com.zhiwei.tools.tools.ZhiWeiTools;
......@@ -18,7 +20,8 @@ public class QQKBCommentExample {
String url = "https://kuaibao.qq.com/s/20181122A11WQB00";
//https://kuaibao.qq.com/s/20180423A1PI7400?refer=kb_news
// https://kuaibao.qq.com/s/20180423A0L60800?refer=kb_news
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
GroupType.PROVIDER);
PoiExcelUtil poi = PoiExcelUtil.getInstance();
Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体/快报评论采集.xlsx", 0);
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
......@@ -40,7 +43,7 @@ public class QQKBCommentExample {
headList.add("time"); //时间
headList.add("content"); //内容
System.out.println(bodyList.size());
poi.exportExcel("D:\\crawlerdata\\自媒体\\快报评论采集-zhj.xlsx", "sada", headList, bodyList);
poi.exportExcel("D:\\crawlerdata\\自媒体\\快报评论采集.xlsx", "sada", headList, bodyList);
}
......
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Souhu;
import com.zhiwei.tools.tools.ZhiWeiTools;
public class SouhuCommentCountExample {
/**
 * Fetches Sohu comment counts: first for one fixed article URL, then for every
 * row of the input workbook, and exports the rows with an added "count" column.
 *
 * <p>The single-URL probe uses its own variable names so that it does not
 * collide with the {@code url} / {@code i} locals of the batch loop.
 */
@Test
public void souhuCommentCountTest() {
    // Single-URL probe, executed without a proxy.
    String sampleUrl = "http://www.sohu.com/a/281414426_133392";
    int sampleCount = Souhu.getSouhuCommentCount(sampleUrl, null);
    System.out.println(sampleCount);

    // Batch run: the proxy factory is initialised before any proxied request.
    ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
            GroupType.PROVIDER);
    PoiExcelUtil poi = PoiExcelUtil.getInstance();
    Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", 0);

    // The imported map is untyped; each cast is confined to its own local.
    @SuppressWarnings("unchecked")
    List<Map<String,Object>> list = (List<Map<String,Object>>) map.get("body");
    @SuppressWarnings("unchecked")
    List<String> headList = (List<String>) map.get("head");

    for (Map<String,Object> map1 : list) {
        // Declared outside the try so the failing URL can still be reported.
        String url = "";
        try {
            url = map1.get("url") + "";
            System.out.println(url);
            int i = Souhu.getSouhuCommentCount(url, ProxyFactory.getNatProxy());
            map1.put("count", i);
            System.out.println(map1.toString());
        } catch (Exception e) {
            // Best effort: report the failing row and carry on with the next one.
            System.out.println(url);
            e.printStackTrace();
        }
    }

    headList.add("count");
    poi.exportExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", "sheet2", headList, list);
}
......
......@@ -6,6 +6,8 @@ import java.util.Map;
import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang;
import com.zhiwei.parse.Souhu;
......@@ -16,7 +18,8 @@ public class SouhuCommentExample {
@Test
public void souhuCommentTest() {
PoiExcelUtil poi = PoiExcelUtil.getInstance();
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
GroupType.PROVIDER);
Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", 0);
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
......@@ -30,7 +33,7 @@ public class SouhuCommentExample {
if(dataList.size() <= 0) {
urlList.add(url);
}
ZhiWeiTools.sleep(2000);
ZhiWeiTools.sleep(100);
if(dataList != null) {
bodyList.addAll(dataList);
}
......
package com.zhiwei.hsitory;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.bean.HistortyBean;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.QQKandian;
/**
 * Manual TestNG harness that fetches the history of one QQ Kandian account and
 * exports it to an Excel workbook.
 */
public class QQkandianHistoryExample {

    /** Column headers in the order they are handed to the exporter. */
    private static final String[] COLUMNS = {"标题", "来源", "链接", "正文", "时间"};

    /**
     * Fetches the history for a fixed uid, converts every bean to a row map and
     * exports all rows together with the column headers.
     */
    @Test
    public void f() {
        final String accountId = "2661642386";

        List<HistortyBean> history = new QQKandian().getHistoryData(accountId, null);
        PoiExcelUtil exporter = PoiExcelUtil.getInstance();

        List<Map<String, Object>> rows = new ArrayList<Map<String, Object>>();
        for (HistortyBean entry : history) {
            rows.add(toRow(entry));
        }

        List<String> headers = new ArrayList<String>();
        for (String column : COLUMNS) {
            headers.add(column);
        }

        exporter.exportExcel("D:\\crawlerdata\\自媒体\\qq看点-数据-2661642386.xlsx", "数据", headers, rows);
    }

    /**
     * Copies the exported fields of one history bean into a row map keyed by
     * the column headers.
     *
     * @param entry history bean to convert
     * @return a new map holding title, time, source, content and url
     */
    private static Map<String, Object> toRow(HistortyBean entry) {
        Map<String, Object> row = new HashMap<String, Object>();
        row.put("标题", entry.getTitle());
        row.put("时间", entry.getTime());
        row.put("来源", entry.getSource());
        row.put("正文", entry.getContent());
        row.put("链接", entry.getUrl());
        return row;
    }
}
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.bean.HistortyBean;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.QQKandian;
//
//public class QQkandianHistoryExample {
// @Test
// public void f() {
// String uid = "2661642386";
//
// QQKandian qqKandian = new QQKandian();
// List<HistortyBean> dataList = qqKandian.getHistoryData(uid, null);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(HistortyBean h : dataList) {
// Map<String, Object> map = new HashMap<String,Object>();
// map.put("标题", h.getTitle());
// map.put("时间", h.getTime());
// map.put("来源", h.getSource());
// map.put("正文", h.getContent());
// map.put("链接", h.getUrl());
// bodyList.add(map);
// }
// List<String> headList = new ArrayList<String>();
// headList.add("标题");
// headList.add("来源");
// headList.add("链接");
// headList.add("正文");
// headList.add("时间");
// poi.exportExcel("D:\\crawlerdata\\自媒体\\qq看点-数据-2661642386.xlsx", "数据", headList, bodyList);
//
//
// }
//}
//package com.zhiwei.keyword;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Gftai;
//
//public class GftaiTest {
// @Test
// public void f() {
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String words = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb";
// String[] ws = words.split("\\|");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(String word : ws) {
// List<Map<String,Object>> list = Gftai.getData(word, null);
// bodyList.addAll(list);
// System.out.println(word + " --------- " + bodyList.size());
// }
// List<String> headList = new ArrayList<>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
//
// poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\国富泰信用.xlsx", "数据", headList, bodyList);
// }
//}
package com.zhiwei.keyword;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Gftai;
/**
 * Manual TestNG harness that runs a list of keywords through {@link Gftai#getData}
 * and exports the combined results to an Excel workbook.
 */
public class GftaiTest {

    /** Column headers in the order they are handed to the exporter. */
    private static final String[] COLUMNS = {"title", "time", "content", "source", "url"};

    /**
     * Collects the results of every '|'-separated keyword, printing the running
     * total after each keyword, then exports everything in one call.
     */
    @Test
    public void f() {
        PoiExcelUtil exporter = PoiExcelUtil.getInstance();
        String keywordSpec = "美团 催收|美团 借款|美团 还钱|三快 借钱|三快 生活费|三快 借款|美团 征信";

        List<Map<String, Object>> rows = new ArrayList<>();
        for (String keyword : keywordSpec.split("\\|")) {
            rows.addAll(Gftai.getData(keyword, null));
            // Progress line: keyword followed by the accumulated row count.
            System.out.println(keyword + " --------- " + rows.size());
        }

        List<String> headers = new ArrayList<>();
        for (String column : COLUMNS) {
            headers.add(column);
        }

        exporter.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\国富泰信用-美团-2.xlsx", "数据", headers, rows);
    }
}
//package com.zhiwei.keyword;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Gftai;
//import com.zhiwei.parse.KuaiTousu;
//
//public class KuaiTousuTest {
// @Test
// public void f() {
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String words = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb";
// String[] ws = words.split("\\|");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(String word : ws) {
// List<Map<String,Object>> list = KuaiTousu.getData(word, null);
// bodyList.addAll(list);
// System.out.println(word + " --------- " + bodyList.size());
// }
// List<String> headList = new ArrayList<>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
//
// poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\新浪广东快投诉.xlsx", "数据", headList, bodyList);
//
//
//
//
// }
//}
package com.zhiwei.keyword;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Gftai;
import com.zhiwei.parse.KuaiTousu;
/**
 * Manual TestNG harness that runs a list of keywords through
 * {@link KuaiTousu#getData} and exports the combined results to an Excel workbook.
 */
public class KuaiTousuTest {

    /**
     * Queries every '|'-separated keyword in turn, accumulates the returned rows,
     * prints the running total per keyword and finally exports all rows.
     */
    @Test
    public void f() {
        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        String joinedKeywords = "美团 催收|美团 借款|美团 还钱|三快 借钱|三快 生活费|三快 借款|美团 征信";
        String[] keywords = joinedKeywords.split("\\|");

        List<Map<String, Object>> collected = new ArrayList<>();
        for (int idx = 0; idx < keywords.length; idx++) {
            String keyword = keywords[idx];
            List<Map<String, Object>> hits = KuaiTousu.getData(keyword, null);
            collected.addAll(hits);
            // Progress line: keyword followed by the accumulated row count.
            System.out.println(keyword + " --------- " + collected.size());
        }

        excel.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\新浪广东快投诉-美团-2.xlsx", "数据", columnNames(), collected);
    }

    /**
     * Builds the exported column list.
     *
     * @return a new mutable list: title, time, content, source, url
     */
    private static List<String> columnNames() {
        List<String> names = new ArrayList<>();
        names.add("title");
        names.add("time");
        names.add("content");
        names.add("source");
        names.add("url");
        return names;
    }
}
//package com.zhiwei.keyword;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.KuaiTousu;
//import com.zhiwei.parse.SinaTousu;
//
//public class SinaTousuTest {
//
// @Test
// public void getSinaTousuData() {
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String words = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb";
// String[] ws = words.split("\\|");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(String word : ws) {
// List<Map<String,Object>> list = SinaTousu.getSinaTousuData(word, null, "2018-01-01 00:00:00");
// bodyList.addAll(list);
// System.out.println(word + " --------- " + bodyList.size());
// }
// List<String> headList = new ArrayList<>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
//
// poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\黑猫投诉.xlsx", "数据", headList, bodyList);
//
//
//
// }
//}
package com.zhiwei.keyword;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.KuaiTousu;
import com.zhiwei.parse.SinaTousu;
/**
 * Manual TestNG harness that runs a list of keywords through
 * {@link SinaTousu#getSinaTousuData} and exports the combined results to an
 * Excel workbook.
 */
public class SinaTousuTest {

    /**
     * Timestamp passed as the third argument of every query (presumably the
     * earliest time to collect from; confirm against SinaTousu).
     */
    private static final String SINCE = "2018-07-01 00:00:00";

    /**
     * Queries every '|'-separated keyword, accumulates the rows while printing
     * the running total, then exports all rows with the fixed column list.
     */
    @Test
    public void getSinaTousuData() {
        PoiExcelUtil workbook = PoiExcelUtil.getInstance();
        String keywordSpec = "美团 催收|美团 借款|美团 还钱|三快 借钱|三快 生活费|三快 借款|美团 征信";

        List<Map<String, Object>> allRows = new ArrayList<>();
        for (String keyword : keywordSpec.split("\\|")) {
            List<Map<String, Object>> found = SinaTousu.getSinaTousuData(keyword, null, SINCE);
            allRows.addAll(found);
            // Progress line: keyword followed by the accumulated row count.
            System.out.println(keyword + " --------- " + allRows.size());
        }

        List<String> columns = new ArrayList<>();
        for (String column : new String[] {"title", "time", "content", "source", "url"}) {
            columns.add(column);
        }

        workbook.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\黑猫投诉-美团-2.xlsx", "数据", columns, allRows);
    }
}
//package com.zhiwei.keyword;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Youku;
//
//public class YoukuKeyWordTest {
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// String word = "帮宝适 二噁英," +
// "帮宝适 二恶英," +
// "帮宝适 有毒," +
// "帮宝适 剧毒," +
// "帮宝适 致癌," +
// "宝洁 二噁英," +
// "宝洁 二恶英," +
// "宝洁 有毒," +
// "宝洁 剧毒," +
// "宝洁 致癌," +
// "纸尿裤 二噁英," +
// "纸尿裤 二恶英," +
// "纸尿裤 有毒," +
// "纸尿裤 剧毒," +
// "纸尿裤 致癌";
// List<Map<String,Object>> bodyList = new ArrayList<>();
// String[] words = word.split(",");
// for(String w : words) {
// System.out.println(w);
// bodyList.addAll(Youku.getDataList(w));
// }
// List<String> headList = new ArrayList<>();
// headList.add("title");
// headList.add("time");
// headList.add("url");
// headList.add("uper");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\优酷数据-txh-0121.xlsx", "数据", headList, bodyList);
//
// }
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment