Commit 89439323 by yangchen

1

parent 132e6350
package com.zhiwei.httpclient;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
......@@ -9,6 +11,7 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject;
import com.sun.net.httpserver.Headers;
public class HeadGet {
......@@ -645,37 +648,152 @@ public class HeadGet {
return headerMap;
}
public static Map<String,String> getQQkuaiCommentHeaderMap(String cookie) {
Map<String,String> headerMap = new HashMap<String, String>();
// public static Map<String,String> getQQkuaiCommentHeaderMap(String cookie) {
// Map<String,String> headerMap = new HashMap<String, String>();
// if(cookie != null) {
// headerMap.put("Cookie", cookie);
// }
// return headerMap;
// }
/**
 * Builds the fixed set of request headers whose Host is d.weibo.com.
 *
 * @param cookie value for the Cookie header; the header is omitted when this is null
 * @return a new mutable map of header names to header values
 */
public static Map<String,String> getweiboHeaderMap(String cookie) {
    final Map<String, String> headers = new HashMap<String, String>();
    headers.put("User-Agent",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
    headers.put("Accept", "*/*");
    headers.put("Accept-Language", "zh-CN,zh;q=0.9");
    headers.put("Connection", "keep-alive");
    headers.put("Content-Type", "application/x-www-form-urlencoded");
    headers.put("Host", "d.weibo.com");
    // Anonymous request: nothing more to add.
    if (cookie == null) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
public static Map<String,String> getweiboHeaderMap(String cookie) {
public static Map<String,String> getTxNewspage1HeaderMap(String cookie) {
Map<String, String> headerMap = new HashMap<String, String>();
headerMap.put("User-Agent",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
"QQNews/5.5.60 (iPhone; iOS 11.2.1; Scale/2.00)");
headerMap.put("Accept","*/*");
headerMap.put("Accept-Language", "zh-CN,zh;q=0.9");
headerMap.put("Accept-Language", "zh-Hans-CN;q=1");
headerMap.put("Connection", "keep-alive");
headerMap.put("Content-Type", "application/x-www-form-urlencoded");
headerMap.put("Host", "d.weibo.com");
headerMap.put("devid", "6d33f35f-880d-42a6-a23f-881bec6960ec");
headerMap.put("Host", "r.inews.qq.com");
headerMap.put("Referer", "http://inews.qq.com/inews/iphone/");
headerMap.put("idft", "60EE914A-6E8E-41FA-BC69-C44D47DDC4A0");
headerMap.put("qn-sig", "7697A692D78C878B70DD2CFE90610113");
headerMap.put("idfa", "FE659B7E-5104-44C2-8A31-F88DEE7A2747");
headerMap.put("appver", "11.2.1_qqnews_5.5.60");
headerMap.put("deviceToken", "<6428c8bd 9d302f00 cf30071d e72da40e 79d4f96b 58838dec f8bdbdae bcaa89a6>");
headerMap.put("qn-rid", "206cdadb83e8");
if(cookie != null) {
headerMap.put("Cookie", cookie);
}
return headerMap;
}
/**
 * Builds the request headers passed to the r.inews.qq.com "searchMore" request
 * (see TXNews.getData), mimicking the QQNews iPhone client.
 *
 * NOTE(review): "Content-Length" is hard-coded to "95" although the request body
 * varies with the search word and page number - confirm how the HTTP client
 * treats this header.
 *
 * @param cookie value for the Cookie header; the header is omitted when this is null
 * @return a new mutable map of header names to header values
 */
public static Map<String,String> getTxNewspage2HeaderMap(String cookie) {
    // Fixed header name/value pairs, inserted in this order.
    final String[][] fixedHeaders = {
        {"User-Agent", "QQNews/5.5.60 (iPhone; iOS 11.2.1; Scale/2.00)"},
        {"Accept", "*/*"},
        {"Accept-Language", "zh-Hans-CN;q=1"},
        {"Connection", "keep-alive"},
        {"Content-Type", "application/x-www-form-urlencoded"},
        {"devid", "6d33f35f-880d-42a6-a23f-881bec6960ec"},
        {"Host", "r.inews.qq.com"},
        {"Referer", "http://inews.qq.com/inews/iphone/"},
        {"idft", "60EE914A-6E8E-41FA-BC69-C44D47DDC4A0"},
        {"qn-sig", "D4BE31B3F37B6F2670094D2C6EF825B7"},
        {"idfa", "FE659B7E-5104-44C2-8A31-F88DEE7A2747"},
        {"appver", "11.2.1_qqnews_5.5.60"},
        {"deviceToken", "<6428c8bd 9d302f00 cf30071d e72da40e 79d4f96b 58838dec f8bdbdae bcaa89a6>"},
        {"qn-rid", "206ce00acce9"},
        {"Content-Length", "95"},
    };
    Map<String, String> headers = new HashMap<String, String>();
    for (String[] pair : fixedHeaders) {
        headers.put(pair[0], pair[1]);
    }
    if (null != cookie) {
        headers.put("Cookie", cookie);
    }
    return headers;
}
/**
 * Builds the form parameters for the first-page search request:
 * {@code c=searchEnterPage} plus the search word under {@code query}.
 *
 * @param word search keyword, stored as-is (may be null)
 * @return a new mutable parameter map
 */
public static Map<String,Object> getTxNewspage1ParamMap(String word) {
    final Map<String, Object> searchParams = new HashMap<String, Object>();
    searchParams.put("c", "searchEnterPage");
    searchParams.put("query", word);
    return searchParams;
}
/**
 * Builds the form parameters for a follow-up ("more results") search request.
 *
 * @param word search keyword, stored under {@code query}
 * @param page page number, stored under {@code page}
 * @return a new mutable parameter map; numeric values are stored as boxed integers
 */
public static Map<String,Object> getTxNewspagemoreParamMap(String word, int page) {
    final Map<String, Object> moreParams = new HashMap<String, Object>();
    moreParams.put("id", "");
    // The "queryid" parameter is currently disabled:
    // moreParams.put("queryid", "2606511522312027");
    moreParams.put("count", 20);
    moreParams.put("query", word);
    moreParams.put("timeline", 0);
    moreParams.put("type", 0);
    moreParams.put("secId", 2);
    moreParams.put("page", page);
    return moreParams;
}
/**
 * Builds request headers for r.inews.qq.com that mimic the QQNews iPhone client,
 * including the extra {@code store} header.
 *
 * NOTE(review): "Content-Length" is hard-coded to "83" regardless of the actual
 * request body - confirm how the HTTP client treats this header.
 *
 * @param cookie value for the Cookie header; the header is omitted when this is null
 * @return a new mutable map of header names to header values
 */
public static Map<String,String> getTXNewsAccountHeaderMap(String cookie) {
    // Fixed header name/value pairs, inserted in this order.
    final String[][] fixedHeaders = {
        {"User-Agent", "QQNews/5.5.60 (iPhone; iOS 11.2.1; Scale/2.00)"},
        {"Accept", "*/*"},
        {"Accept-Language", "zh-Hans-CN;q=1"},
        {"Connection", "keep-alive"},
        {"Content-Type", "application/x-www-form-urlencoded"},
        {"devid", "6d33f35f-880d-42a6-a23f-881bec6960ec"},
        {"Host", "r.inews.qq.com"},
        {"Referer", "http://inews.qq.com/inews/iphone/"},
        {"store", "1"},
        {"idft", "60EE914A-6E8E-41FA-BC69-C44D47DDC4A0"},
        {"qn-sig", "BA8985D7C7CF361FB42F9692F8E86605"},
        {"idfa", "FE659B7E-5104-44C2-8A31-F88DEE7A2747"},
        {"appver", "11.2.1_qqnews_5.5.60"},
        {"deviceToken", "<6428c8bd 9d302f00 cf30071d e72da40e 79d4f96b 58838dec f8bdbdae bcaa89a6>"},
        {"qn-rid", "2073271e7f49"},
        {"Content-Length", "83"},
    };
    Map<String, String> headers = new HashMap<String, String>();
    for (String[] pair : fixedHeaders) {
        headers.put(pair[0], pair[1]);
    }
    if (null != cookie) {
        headers.put("Cookie", cookie);
    }
    return headers;
}
/**
 * Builds the form parameters for an account-page request.
 *
 * @param child account identifier, stored under {@code child}
 * @return a new mutable parameter map containing {@code child}, a fixed {@code uid},
 *         an empty {@code media_openid} and {@code commentBucketId} 0
 */
public static Map<String,Object> getTxNewsAccountpageParamMap(String child) {
    final Map<String, Object> accountParams = new HashMap<String, Object>();
    accountParams.put("child", child);
    accountParams.put("uid", "8506EAF5-3678-4D3E-A9D6-E2A8DCF14F41");
    accountParams.put("media_openid", "");
    accountParams.put("commentBucketId", 0);
    return accountParams;
}
/**
 * Builds the browser-like request headers whose Host is news.baidu.com.
 *
 * @param cookie value for the Cookie header; the header is omitted when this is null
 * @return a new mutable map of header names to header values
 */
public static Map<String,String> getBaijiaAccount2HeaderMap(String cookie) {
    final String userAgent =
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";
    final String accept = "application/json, text/javascript, */*; q=0.01";

    Map<String, String> headers = new HashMap<String, String>();
    headers.put("User-Agent", userAgent);
    headers.put("Accept", accept);
    headers.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
    headers.put("Connection", "keep-alive");
    headers.put("Host", "news.baidu.com");
    if (cookie == null) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
public static void main(String[] args) {
String url = "https://d.weibo.com/1087030002_2975_1003_0?pids=Pl_Core_F4RightUserList__4&page=2&ajaxpagelet=1&__ref=/1087030002_2975_1003_0&_t=FM_151825274677918";
String cookie = "SINAGLOBAL=7701198867685.262.1517207017616; _s_tentry=login.sina.com.cn; Apache=6842405326379.926.1517796423994; ULV=1517796424127:3:1:3:6842405326379.926.1517796423994:1517209523882; ULOGIN_IMG=15177972786361; UOR=,,login.sina.com.cn; YF-Page-G0=23b9d9eac864b0d725a27007679967df; SCF=Ag8PQSV7wMV9Lc8UOZupWW2l6wfI5N2imvtjcwFE3ovIEsRCuG5QaKQhPx4ByaNkpC5LpYocPBPnOJT2NSZMkiU.; SUHB=0C1CJFGk8jNm31; SUB=_2AkMtIj0odcPxrABWn_0WzGPhbYhH-jye91TeAn7uJhMyAxgv7lMFqSVutBF-XFWUFIfrHOaUSPWy_1IBv_YbyS5_; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9WWr5b4iYaaqYk4kfrcubkrT5JpVF02ReoMpSo.XeK.f; login_sid_t=10c8fe00b1833b7414093404448d2330; cross_origin_proto=SSL";
Map<String,String> headerMap = HeadGet.getweiboHeaderMap(null);
String url = "https://news.baidu.com/sn/api/homesubcribe?forum_id=b_1560023960896882&page=1";
String cookie = "BAIDUID=4DB3FA13736131DBC2094C010E6EBCB0:FG=1; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; PSTM=1522304033; BDUSS=zJEdDI0WFBCUE05M3BVTlhSbnozYkpUflZveW9aaGZ3ODBVTC1WRzVwaUxkZlphQVFBQUFBJCQAAAAAAAAAAAEAAADTCNY9Y3k5MDkyMDk5NTEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIvozlqL6M5ac; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BD_CK_SAM=1; BDSVRTM=98; BDSFRCVID=9g8sJeC62rdtQM7AdMI6hrB7leHy_qbTH6aoIgcaD_KjQB22bioFEG0PDU8g0KubMyQBogKKKgOTHIjP; H_BDCLCKID_SF=tJPOoD-bJI83fP36qRj8hPCsqxby26nQB2ceaJ5nJDoAoqOVWR5N-T-_-f7H3jbQ5RRb3CnvQpP-HJ7TyfCWM5_PhMbhhUcHKaufKl0MLpbYbb0xynoD-lFzLfnMBMni52OnapT_LIFaMII6D5DaejPShMr2aK6KaI58LRu8Kb7VbIOgDbbkbfJBD4QqhR5na26b3R3v2PoIMnRvhbQDD4t7yajK2-bmaN6A3lQ8aI3oD45HDTopQT8rKqAOK5OibCrpaC_Eab3vOpvTXpO1ytIreGLjt5LHJnFOVbD8bRrEDnukhtu_-P4DePjK-nJZ5m7mXp0b04TPjljgqj7jKU_mBpJbW60qXKb7BPF5BDOkbC86D6K5jjjM-f8X-PcKaD70LPI8Kb7VbprDXbbkbfJBDxc4-U_jB26b3tbe2PoIMnRNjl5tQU47yajK2-tfK64qXl5CyPOJftjT3-opQT8rQb_OK5Oib4jZ-fo9ab3vOpvTXpO1ytIreGKJtTF8fnuOV-35b5rtHJrwMtJo5DCHbq8sq4-O-2Q-5KL--JbMVqC6LtOYyjKJK4Kf2PQ7MGOD3fbdJJjoOJ3n-fOryPIuLGKH5tcy3eTxoUJgQCnJhhvG-xcB0fDebPRiB-b9QgbABftLK-oj-DLmD60h3e; PSINO=5; locale=zh; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; FP_UID=f9e064a71741aa2e821e58ca2b30c3da; H_PS_PSSID=1433_21104_20882_20927; userId=1524191310247; Hm_lvt_348091a80fe10e213d94a7de762bbd44=1524191312; Hm_lpvt_348091a80fe10e213d94a7de762bbd44=1524191395";
Map<String,String> headerMap = HeadGet.getBaijiaAccount2HeaderMap(null);
// Map<String,Object> paramMap = HeadGet.getTxNewsAccountpageParamMap("1979");
String result = HttpClient.executeHttpRequestGet(url, headerMap);
System.out.println(result);
// System.out.println(result);
System.out.println(result.length());
}
......
package com.zhiwei.parse;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.BaijiaAccountAnalysis;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class Baijia {
......@@ -19,6 +23,45 @@ public class Baijia {
/**
 * Collects the historical articles of a Baijiahao account by paging through the
 * news.baidu.com "homesubcribe" API.
 *
 * Pages are requested until the API reports no more pages, the response cannot
 * be read, or an article older than {@code startTime} is reached.
 *
 * @param app_id value sent as {@code forum_id}; per the original note, this is the
 *        trailing data obtained from a Baidu News share link
 * @param startTime lower time bound, compared lexicographically with each article's
 *        "time" value; when null, no lower bound is applied
 * @return the parsed articles in the order returned by the API (never null)
 */
public static List<Map<String,Object>> getBaijiaAccount2Data(String app_id,String startTime) {
    List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
    Map<String,String> headerMap = HeadGet.getBaijiaAccount2HeaderMap(null);
    final String baseUrl = "https://news.baidu.com/sn/api/homesubcribe?forum_id=" + app_id;
    final boolean hasStartTime = startTime != null;
    String url = baseUrl;
    while (true) {
        String result = HttpClient.executeHttpRequestGet(url, headerMap);
        JSONObject json = result == null ? null : JSONObject.parseObject(result);
        JSONObject body = json == null ? null : json.getJSONObject("data");
        JSONArray news = body == null ? null : body.getJSONArray("news");
        if (news == null) {
            // Empty or unexpected response: stop paging instead of failing with an NPE.
            logger.warn("No news array in response for {}", url);
            break;
        }
        boolean reachedStartTime = false;
        for (int i = 0; i < news.size(); i++) {
            Map<String,Object> m = baijiaAccountAnalysis.getBaijiaAccount2Data(news.getJSONObject(i));
            String time = (String) m.get("time");
            // Once an article is older than the lower bound, stop collecting.
            if (hasStartTime && time != null && startTime.compareTo(time) > 0) {
                reachedStartTime = true;
                break;
            }
            dataList.add(m);
            if (hasStartTime && startTime.length() > 5) {
                logger.info("采集到的时间为:{}", time);
            }
        }
        logger.info("采集到的数据总量:{}", dataList.size());
        if (reachedStartTime || !body.getBooleanValue("hasMore")) {
            break;
        }
        url = baseUrl + "&page=" + (body.getIntValue("page") + 1);
        // Pause between page requests.
        ZhiWeiTools.sleep(2000);
    }
    return dataList;
}
/**
*
* @Description 百家号历史文章采集
* @param app_id
* @param startTime
* @return
......@@ -37,12 +80,12 @@ public class Baijia {
break;
}
i += 20;
ZhiWeiTools.sleep(5000);
ZhiWeiTools.sleep(4000);
dataList.addAll(list);
logger.info(url+i+"=============="+dataList.size());
} catch (Exception e) {
e.printStackTrace();
ZhiWeiTools.sleep(4000);
ZhiWeiTools.sleep(3000);
logger.error("此页解析出错",e.getMessage());
continue;
}
......
......@@ -109,31 +109,31 @@ public class Fenghuang {
int i = 1;
try {
while (true) {
String url = "http://search.ifeng.com/sofeng/search.action?q="+URLEncoder.encode(word, "UTF-8")+"&c=1&p=";
Map<String,String> headerMap = HeadGet.getFenghuangWordHeaderMap(null);
String result = HttpClient.executeHttpRequestGet(url+i, headerMap);
List<Map<String,Object>> lists = fenghuangByWordAnalysis.getFenghuangByWord(result);
if(lists == null || lists.size() < 1) {
break;
}
if(lists != null && lists.size() > 0) {
dataList.addAll(lists);
}
System.out.println(word+"===================以获取的数据==:" + dataList.size());
i++;
if(i == 76) {
break;
try {
String url = "http://search.ifeng.com/sofeng/search.action?q="+URLEncoder.encode(word, "UTF-8")+"&c=1&p=";
Map<String,String> headerMap = HeadGet.getFenghuangWordHeaderMap(null);
String result = HttpClient.executeHttpRequestGet(url+i, headerMap);
List<Map<String,Object>> lists = fenghuangByWordAnalysis.getFenghuangByWord(result);
if(lists == null || lists.size() < 1) {
break;
}
if(lists != null && lists.size() > 0) {
dataList.addAll(lists);
}
System.out.println(word+"===================以获取的数据==:" + dataList.size());
i++;
if(i == 76) {
break;
}
ZhiWeiTools.sleep(4000);
} catch (Exception e) {
continue;
}
ZhiWeiTools.sleep(4000);
}
return dataList;
} catch (UnsupportedEncodingException e) {
logger.error("依据关键词获取凤凰文章出错",e.getMessage());
e.printStackTrace();
return dataList;
} catch (Exception e) {
logger.error("依据关键词获取凤凰文章出错",e.getMessage());
e.printStackTrace();
logger.error("链接获取凤凰信息出错",e.getMessage());
return dataList;
}
}
......
......@@ -59,12 +59,12 @@ public class QQKB {
paramMap = HeadGet.getQQAccountOtherParamMap(ids);
result = HttpClient.executeHttpRequestPost(url, headerMap, paramMap);
List<Map<String,Object>> list = qqAccountAnalysis.analysisQQAccountData(result);
ids = "";
i = 0;
if(list != null) {
dataList.addAll(list);
break;
}
ids = "";
i = 0;
}
} catch (Exception e) {
ids = "";
......
package com.zhiwei.parse;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.TXNewsByWordAnalysis;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Keyword search crawler for the r.inews.qq.com search endpoints.
 */
public class TXNews {
    private static final Logger logger = LoggerFactory.getLogger(TXNews.class);
    private static final TXNewsByWordAnalysis txNewsByWordAnalysis = new TXNewsByWordAnalysis();

    private static final String SEARCH_URL =
            "http://r.inews.qq.com/search?appver=11.2.1_qqnews_5.5.60&devid=6D33F35F-880D-42A6-A23F-881BEC6960EC";
    private static final String SEARCH_MORE_URL =
            "http://r.inews.qq.com/searchMore?appver=11.2.1_qqnews_5.5.60&devid=6D33F35F-880D-42A6-A23F-881BEC6960EC";

    /** Number of consecutive failed page requests after which paging is abandoned. */
    private static final int MAX_CONSECUTIVE_FAILURES = 3;

    /**
     * Paging flag. It is written by {@code TXNewsByWordAnalysis.getData} from the
     * "hasMore" field of each parsed response and read by the paging loop below.
     */
    public static boolean hasMore = true;

    /**
     * Searches for {@code word} and collects every result page.
     *
     * The first page is requested directly; a failure there propagates to the
     * caller. Follow-up pages are requested every 5 seconds while {@link #hasMore}
     * is true. A failing follow-up request is retried, but paging stops after
     * {@value #MAX_CONSECUTIVE_FAILURES} consecutive failures. The counter was
     * previously never incremented, so a persistent failure looped forever.
     *
     * @param word search keyword
     * @return all parsed result rows collected so far (never null)
     */
    public static List<Map<String,Object>> getData(String word) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        Map<String,String> headerMap = HeadGet.getTxNewspage1HeaderMap(null);
        Map<String,Object> paramMap = HeadGet.getTxNewspage1ParamMap(word);
        String result = HttpClient.executeHttpRequestPost(SEARCH_URL, headerMap, paramMap);
        dataList.addAll(txNewsByWordAnalysis.getData(result));

        int page = 2;
        int consecutiveFailures = 0;
        Map<String,String> header2Map = HeadGet.getTxNewspage2HeaderMap(null);
        while (hasMore) {
            try {
                ZhiWeiTools.sleep(5000);
                Map<String,Object> param2Map = HeadGet.getTxNewspagemoreParamMap(word, page);
                String result2 = HttpClient.executeHttpRequestPost(SEARCH_MORE_URL, header2Map, param2Map);
                // The page advances once the request returned, even if parsing fails below,
                // so an unparseable page is skipped rather than requested again.
                page++;
                dataList.addAll(txNewsByWordAnalysis.getData(result2));
                logger.info("采集到数据======={}" ,dataList.size());
                consecutiveFailures = 0;
            } catch (Exception e) {
                consecutiveFailures++;
                logger.warn("Request for page {} failed ({} consecutive failures)", page, consecutiveFailures, e);
                if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
                    break;
                }
            }
        }
        return dataList;
    }
}
......@@ -14,11 +14,29 @@ import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class BaijiaAccountAnalysis {
private static Logger logger = LoggerFactory.getLogger(BaijiaAccountAnalysis.class);
/**
 * Converts one article object from the "homesubcribe" response into a flat row.
 *
 * The row contains "title", "url", "source" (from "site"), "time" (from
 * "pulltime", formatted as yyyy-MM-dd HH:mm:ss) and "content". The content is
 * the concatenation of every segment whose "type" is "text", with anything that
 * looks like a markup tag removed.
 *
 * @param data article JSON object; must not be null
 * @return a new mutable map holding the row
 */
public Map<String,Object> getBaijiaAccount2Data(JSONObject data) {
    Map<String,Object> map = new HashMap<String,Object>();
    map.put("title", data.getString("title"));
    map.put("url", data.getString("url"));
    map.put("source", data.getString("site"));
    map.put("time", TimeParse.dateFormartString(TimeParse.stringFormartDate(data.getString("pulltime")),"yyyy-MM-dd HH:mm:ss"));

    StringBuilder content = new StringBuilder();
    JSONArray segments = data.getJSONArray("content");
    // An article without a "content" array yields empty content instead of an NPE.
    if (segments != null) {
        for (int i = 0; i < segments.size(); i++) {
            JSONObject segment = segments.getJSONObject(i);
            // Constant-first equals tolerates segments without a "type".
            if (segment == null || !"text".equals(segment.getString("type"))) {
                continue;
            }
            String text = segment.getString("data");
            // Skip missing text so the literal "null" never ends up in the content.
            if (text != null) {
                content.append(text);
            }
        }
    }
    map.put("content", content.toString().replaceAll("<.*?>", ""));
    return map;
}
/**
*
......
......@@ -52,7 +52,6 @@ public class DayuAccountAnalysis {
Map<String,Object> map = new HashMap<String, Object>();
try {
String time = data.getString("published_at").replace("T", " ").split("\\.")[0];
System.out.println(time);
if(startTime != null && startTime.length() > 1) {
if(time.compareTo(startTime) < 0) {
return null;
......
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.parse.TXNews;
/**
 * Parser for the JSON returned by the keyword search requests issued by
 * {@code TXNews}.
 */
public class TXNewsByWordAnalysis {
    private static final Logger logger = LoggerFactory.getLogger(TXNewsByWordAnalysis.class);

    /**
     * Parses one search response into flat rows and updates {@code TXNews.hasMore}.
     *
     * {@code TXNews.hasMore} becomes true only when the response's "hasMore" field
     * equals 1; a missing field counts as "no more pages". Only sections whose
     * "secType" is 0 contribute rows. Each row holds "title", "content" (from
     * "abstract"), "time", "source", "id" and "url".
     *
     * @param result raw JSON response text
     * @return the parsed rows, possibly empty (never null)
     * @throws IllegalArgumentException if {@code result} is null or does not parse to
     *         a JSON object; {@code TXNews.hasMore} is left untouched in that case so
     *         the caller's retry handling decides what happens next
     */
    public List<Map<String,Object>> getData(String result) {
        JSONObject json = result == null ? null : JSONObject.parseObject(result);
        if (json == null) {
            throw new IllegalArgumentException("search response is empty or not a JSON object");
        }
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        // getIntValue yields 0 for a missing field, avoiding the unboxing NPE of getInteger(..) == 1.
        TXNews.hasMore = json.getIntValue("hasMore") == 1;

        JSONArray sections = json.getJSONArray("secList");
        if (sections == null) {
            return dataList;
        }
        for (int i = 0; i < sections.size(); i++) {
            JSONObject section = sections.getJSONObject(i);
            if (section == null) {
                continue;
            }
            Integer secType = section.getInteger("secType");
            JSONArray newsList = section.getJSONArray("newsList");
            // Sections with a missing or non-zero type, or without a news list, are ignored.
            if (secType == null || secType.intValue() != 0 || newsList == null) {
                continue;
            }
            for (int j = 0; j < newsList.size(); j++) {
                JSONObject item = newsList.getJSONObject(j);
                try {
                    Map<String,Object> map = new HashMap<String,Object>();
                    map.put("title", item.getString("title"));
                    map.put("content", item.getString("abstract"));
                    map.put("time", item.getString("time"));
                    map.put("source", item.getString("source"));
                    map.put("id", item.getString("id"));
                    map.put("url", item.getString("url"));
                    dataList.add(map);
                } catch (Exception e) {
                    // Best effort: a single bad item must not abort the whole page.
                    logger.error("采集出错:{}",e.getMessage());
                    logger.error("Failed item: {}", item, e);
                }
            }
        }
        return dataList;
    }
}
......@@ -13,7 +13,7 @@ public class BaijiaAccountExample {
@Test
public void baijiaAccountTest() {
String app_id = "1536766731827943";
String app_id = "1536766390576806";
String startTime = "2016-01-01 00:00:00";
//2017-11-30 17:48:17
List<Map<String,Object>> lists = Baijia.getBaijiaAccountData(app_id,startTime);
......@@ -26,7 +26,23 @@ public class BaijiaAccountExample {
headList.add("source");
headList.add("url");
headList.add("content");
poi.exportExcel("D://crawlerdata/百家号-蓝鲸TMT网.xlsx", "蓝鲸TMT网", headList, lists);
poi.exportExcel("D://crawlerdata/百家号-太保.xlsx", "太保", headList, lists);
}
// @Test
/**
 * Manual example: crawls one account through {@code Baijia.getBaijiaAccount2Data}
 * and exports the rows to an Excel workbook.
 */
public void baijiaAccount2Test() {
    // Column keys passed to the exporter, in this order.
    List<String> headList = new ArrayList<String>();
    for (String column : new String[] {"title", "time", "source", "url", "content"}) {
        headList.add(column);
    }

    final String forumId = "b_1536766390576806";
    // Timestamp left as a bare comment in the original, purpose not stated: 2017-11-30 17:48:17
    final String since = "2016-01-01 00:00:00";
    List<Map<String,Object>> rows = Baijia.getBaijiaAccount2Data(forumId, since);

    PoiExcelUtil.getInstance()
            .exportExcel("D://crawlerdata/百家号-俊世太保.xlsx", "俊世太保", headList, rows);
}
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Dayu;
public class DayuAccountExample {
......@@ -15,12 +17,32 @@ public class DayuAccountExample {
//https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
String mid = "d7300311c1504d24a229c3da345785c6";
String name = "大鱼海棠雨";
String startTime = "2017-12-05 22:08:01";
// String mid = "d7300311c1504d24a229c3da345785c6";
// String name = "大鱼海棠雨";
String startTime = "2018-03-16 00:00:00";
String path = "D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx";
PoiExcelUtil poi = PoiExcelUtil.getInstance();
Map<String,Object> map = poi.importExcel(path, 0);
List<Map<String,Object>> lists = (List<Map<String, Object>>) map.get("body");
List<String> headList = new ArrayList<String>();
headList.add("title");
headList.add("time");
headList.add("content");
headList.add("source");
headList.add("url");
// headList.add("content_id");
// headList.add("origin_id");
// headList.add("xss_item_id");
for(Map<String,Object> data : lists) {
String mid = data.get("mid")+"";
String name = data.get("name")+"";
if(mid.length() < 1 && name.length() < 1) {
continue;
}
List<Map<String,Object>> dataList = Dayu.getDayuAccountData(mid,name,null);
poi.exportExcel(path, name, headList, dataList);
}
List<Map<String,Object>> dataList = Dayu.getDayuAccountData(mid,name,null);
System.out.println(dataList.size());
}
......
......@@ -12,7 +12,7 @@ public class DayuByWordExample {
@Test
public void dayuByWordTest() {
String word = "京东";
String word = "沃尔玛";
List<Map<String,Object>> dataList = Dayu.getDayuByWordData(word);
......
......@@ -15,7 +15,7 @@ public class FenghuangAccountExample {
public void fenghuangAccountTest() {
//所用时间长 1s1篇文章吧
//https://api.3g.ifeng.com/client_search_subscribe?k=(凤凰号名称拿id)
String id = "276718";
String id = "724";
String[] ids = id.split(",");
for(int i = 0;i < ids.length;i++) {
try {
......@@ -29,7 +29,7 @@ public class FenghuangAccountExample {
headList.add("source");
headList.add("url");
headList.add("id");
poi.exportExcel("D://crawlerdata/凤凰-另眼看世界.xlsx", ids[i], headList, dataList);
poi.exportExcel("D://crawlerdata/凤凰-电商报.xlsx", ids[i], headList, dataList);
} catch (Exception e) {
continue;
}
......
......@@ -16,14 +16,15 @@ public class QQAccountExample {
public void qqAccountTest() {
PoiExcelUtil poi = PoiExcelUtil.getInstance();
Map<String,Object> dataMap = poi.importExcel("D://crawlerdata/天天快报历史文章采集.xlsx", 0);
Map<String,Object> dataMap = poi.importExcel("D://crawlerdata//自媒体/天天快报历史文章采集.xlsx", 0);
List<Map<String,Object>> dataList = (List<Map<String, Object>>) dataMap.get("body");
String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=000300006218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for(Map<String,Object> map : dataList) {
String child = map.get("帐号链接")+"";
System.out.println(child.split("chlid=")[1]);
List<Map<String,Object>> lists = QQKB.getQQAccountData(child.split("chlid=")[1], cookie);
// System.out.println(child.split("chlid=")[1]);
System.out.println((String)map.get("child"));
List<Map<String,Object>> lists = QQKB.getQQAccountData((String)map.get("child"), cookie);
if(lists != null) {
for(Map<String,Object> map1 : lists) {
map1.put("name", map.get("呢称"));
......@@ -43,7 +44,7 @@ public class QQAccountExample {
headList.add("content");
headList.add("url");
headList.add("commentid");
poi.exportExcel("D://crawlerdata/天天快报采集.xlsx", "asd", headList, bodyList);
poi.exportExcel("D://crawlerdata//自媒体/天天快报采集-科技编年史.xlsx", "asd", headList, bodyList);
}
......
......@@ -16,7 +16,7 @@ public class SouhuAccountExample {
@Test
public void souhuAccountTest() {
List<Map<String,Object>> lists = Souhu.getSouHuAccountData("cHBhZzUyMTNjZjAzZTczYUBzb2h1LmNvbQ==","2017-01-01 00:00:00",false);
List<Map<String,Object>> lists = Souhu.getSouHuAccountData("MjI5MzAyOTMyMEBzaW5hLnNvaHUuY29t","2016-01-01 00:00:00",false);
System.out.println(lists.size());
List<String> headList = new ArrayList<String>();
headList.add("title");
......@@ -28,7 +28,7 @@ public class SouhuAccountExample {
headList.add("newsid");
headList.add("newsPv");
PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D:\\crawlerdata\\搜狐号历史文章-蓝媒汇.xlsx", "蓝媒汇", headList, lists);
poi.exportExcel("D:\\crawlerdata\\搜狐号历史文章-太保乱谈.xlsx", "太保乱谈", headList, lists);
}
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.TXNews;
/**
 * Manual example: runs a keyword search through {@code TXNews.getData} and
 * exports the rows to an Excel workbook.
 */
public class TXNewsByWordExample {

    /**
     * Entry point; the keyword and output path are hard-coded.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Column keys passed to the exporter, in this order.
        List<String> headList = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "content", "url", "id", "source"}) {
            headList.add(column);
        }

        final String keyword = "唐嫣";
        List<Map<String,Object>> rows = TXNews.getData(keyword);

        PoiExcelUtil.getInstance()
                .exportExcel("D://crawlerdata/腾讯新闻-唐嫣.xlsx", "腾讯新闻数据", headList, rows);
    }
}
......@@ -14,12 +14,20 @@ public class WangyiCommentExample {
//若出错 可能数据有重复 以id为准
@Test
public void wangyiCommentTest() {
String url = "http://news.163.com/18/0210/09/DA9B8PVJ000189FH.html";
String id = url.split("/")[6].split(".ht")[0];
List<Map<String,Object>> lists = Wangyi.getWangyiCommentData(id);
System.out.println(lists.size());
List<String> urlList = new ArrayList<String>();
urlList.add("https://c.m.163.com/news/a/DCQ42REV05118O92.html?spss=newsapp");
urlList.add("https://c.m.163.com/news/a/DCPLJ5GB05198R91.html?spss=newsapp");
urlList.add("https://c.m.163.com/news/a/DCRNI7020511CPVM.html?spss=newsapp");
List<Map<String,Object>> bodyList = new ArrayList<>();
for(String url : urlList) {
String id = url.split("a/")[1].split(".ht")[0];
List<Map<String,Object>> lists = Wangyi.getWangyiCommentData(id);
System.out.println(lists.size());
if(lists != null) {
bodyList.addAll(lists);
}
}
List<String> headList = new ArrayList<String>();
headList.add("content");
headList.add("id");
......@@ -29,7 +37,7 @@ public class WangyiCommentExample {
headList.add("unlike");
PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D:\\crawlerdata\\网易评论采集测试.xlsx", "asd", headList, lists);
poi.exportExcel("D:\\crawlerdata\\网易评论采集-3.xlsx", "asd", headList, bodyList);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment