Commit 89439323 by yangchen

1

parent 132e6350
package com.zhiwei.parse; package com.zhiwei.parse;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient; import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.BaijiaAccountAnalysis; import com.zhiwei.parse.analysis.BaijiaAccountAnalysis;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools; import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class Baijia { public class Baijia {
...@@ -19,6 +23,45 @@ public class Baijia { ...@@ -19,6 +23,45 @@ public class Baijia {
/** /**
* *
* @Description 百家号历史文章采集 * @Description 百家号历史文章采集
* @param app_id 百度新闻转发获取后面的数据
* @param startTime
* @return
*/
public static List<Map<String,Object>> getBaijiaAccount2Data(String app_id,String startTime) {
    // Pages through the news.baidu.com "homesubcribe" feed for the given forum id,
    // converts every entry of data.news with baijiaAccountAnalysis, and collects the
    // results until an entry older than startTime is met or the feed has no more pages.
    // startTime is compared lexicographically against the entry's "time" string; a null
    // startTime disables the cut-off (previously it caused a NullPointerException).
    List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
    Map<String,String> headerMap = HeadGet.getBaijiaAccount2HeaderMap(null);
    String baseUrl = "https://news.baidu.com/sn/api/homesubcribe?forum_id="+app_id;
    String url = baseUrl;
    boolean hasStartTime = startTime != null;
    boolean logEachTime = hasStartTime && startTime.length() > 5;
    while(true) {
        String result = HttpClient.executeHttpRequestGet(url, headerMap);
        JSONObject json = JSONObject.parseObject(result);
        JSONObject body = json == null ? null : json.getJSONObject("data");
        JSONArray news = body == null ? null : body.getJSONArray("news");
        if(news == null) {
            // Nothing usable in the response: return what has been collected so far
            // instead of failing with a NullPointerException.
            logger.warn("百家号接口未返回可解析的数据,停止采集:{}",url);
            break;
        }
        boolean reachedStartTime = false;
        for(int i = 0;i < news.size();i++) {
            Map<String,Object> m = baijiaAccountAnalysis.getBaijiaAccount2Data(news.getJSONObject(i));
            String time = (String) m.get("time");
            // Entries without a time cannot be compared, so they are kept.
            if(hasStartTime && time != null && startTime.compareTo(time) > 0) {
                reachedStartTime = true;
                break;
            }
            dataList.add(m);
            if(logEachTime) {
                logger.info("采集到的时间为:{}",time);
            }
        }
        logger.info("采集到的数据总量:{}",dataList.size());
        if(reachedStartTime || !body.getBooleanValue("hasMore")) {
            break;
        }
        // The next page number is derived from the page number echoed by the server.
        url = baseUrl+"&page=" + (body.getIntValue("page")+1);
        // Pause between page requests (presumably milliseconds - confirm against ZhiWeiTools).
        ZhiWeiTools.sleep(2000);
    }
    return dataList;
}
/**
*
* @Description 百家号历史文章采集
* @param app_id * @param app_id
* @param startTime * @param startTime
* @return * @return
...@@ -37,12 +80,12 @@ public class Baijia { ...@@ -37,12 +80,12 @@ public class Baijia {
break; break;
} }
i += 20; i += 20;
ZhiWeiTools.sleep(5000); ZhiWeiTools.sleep(4000);
dataList.addAll(list); dataList.addAll(list);
logger.info(url+i+"=============="+dataList.size()); logger.info(url+i+"=============="+dataList.size());
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
ZhiWeiTools.sleep(4000); ZhiWeiTools.sleep(3000);
logger.error("此页解析出错",e.getMessage()); logger.error("此页解析出错",e.getMessage());
continue; continue;
} }
......
...@@ -109,6 +109,7 @@ public class Fenghuang { ...@@ -109,6 +109,7 @@ public class Fenghuang {
int i = 1; int i = 1;
try { try {
while (true) { while (true) {
try {
String url = "http://search.ifeng.com/sofeng/search.action?q="+URLEncoder.encode(word, "UTF-8")+"&c=1&p="; String url = "http://search.ifeng.com/sofeng/search.action?q="+URLEncoder.encode(word, "UTF-8")+"&c=1&p=";
Map<String,String> headerMap = HeadGet.getFenghuangWordHeaderMap(null); Map<String,String> headerMap = HeadGet.getFenghuangWordHeaderMap(null);
String result = HttpClient.executeHttpRequestGet(url+i, headerMap); String result = HttpClient.executeHttpRequestGet(url+i, headerMap);
...@@ -125,15 +126,14 @@ public class Fenghuang { ...@@ -125,15 +126,14 @@ public class Fenghuang {
break; break;
} }
ZhiWeiTools.sleep(4000); ZhiWeiTools.sleep(4000);
} catch (Exception e) {
continue;
}
} }
return dataList;
} catch (UnsupportedEncodingException e) {
logger.error("依据关键词获取凤凰文章出错",e.getMessage());
e.printStackTrace();
return dataList; return dataList;
} catch (Exception e) { } catch (Exception e) {
logger.error("依据关键词获取凤凰文章出错",e.getMessage());
e.printStackTrace(); e.printStackTrace();
logger.error("链接获取凤凰信息出错",e.getMessage());
return dataList; return dataList;
} }
} }
......
...@@ -59,12 +59,12 @@ public class QQKB { ...@@ -59,12 +59,12 @@ public class QQKB {
paramMap = HeadGet.getQQAccountOtherParamMap(ids); paramMap = HeadGet.getQQAccountOtherParamMap(ids);
result = HttpClient.executeHttpRequestPost(url, headerMap, paramMap); result = HttpClient.executeHttpRequestPost(url, headerMap, paramMap);
List<Map<String,Object>> list = qqAccountAnalysis.analysisQQAccountData(result); List<Map<String,Object>> list = qqAccountAnalysis.analysisQQAccountData(result);
ids = "";
i = 0;
if(list != null) { if(list != null) {
dataList.addAll(list); dataList.addAll(list);
break; break;
} }
ids = "";
i = 0;
} }
} catch (Exception e) { } catch (Exception e) {
ids = ""; ids = "";
......
package com.zhiwei.parse;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.TXNewsByWordAnalysis;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Keyword search crawler for the r.inews.qq.com search endpoints.
 */
public class TXNews {
    private static Logger logger = LoggerFactory.getLogger(TXNews.class);
    private static TXNewsByWordAnalysis txNewsByWordAnalysis = new TXNewsByWordAnalysis();
    private static final String SEARCH_URL = "http://r.inews.qq.com/search?appver=11.2.1_qqnews_5.5.60&devid=6D33F35F-880D-42A6-A23F-881BEC6960EC";
    private static final String SEARCH_MORE_URL = "http://r.inews.qq.com/searchMore?appver=11.2.1_qqnews_5.5.60&devid=6D33F35F-880D-42A6-A23F-881BEC6960EC";
    /** Number of consecutive failed page attempts after which paging is abandoned. */
    private static final int MAX_CONSECUTIVE_FAILURES = 3;

    /**
     * Paging flag. It is written by {@link TXNewsByWordAnalysis#getData(String)} each time a
     * response is parsed and read by {@link #getData(String)} to decide whether to continue.
     */
    public static boolean hasMore = true;

    /**
     * Searches for {@code word} and collects the parsed records of every result page.
     *
     * <p>The first page is requested from the "search" endpoint; following pages come from
     * "searchMore" for as long as {@link #hasMore} stays {@code true}. A page that fails is
     * retried; after {@value #MAX_CONSECUTIVE_FAILURES} consecutive failures paging stops and
     * the records gathered so far are returned.
     *
     * @param word keyword to search for
     * @return the records collected from all pages that could be fetched and parsed
     */
    public static List<Map<String,Object>> getData(String word) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        Map<String,String> headerMap = HeadGet.getTxNewspage1HeaderMap(null);
        Map<String,Object> paramMap = HeadGet.getTxNewspage1ParamMap(word);
        String result = HttpClient.executeHttpRequestPost(SEARCH_URL, headerMap, paramMap);
        dataList.addAll(txNewsByWordAnalysis.getData(result));

        int page = 2;
        int failures = 0;
        Map<String,String> header2Map = HeadGet.getTxNewspage2HeaderMap(null);
        while(hasMore) {
            try {
                ZhiWeiTools.sleep(5000);
                Map<String,Object> param2Map = HeadGet.getTxNewspagemoreParamMap(word, page);
                String result2 = HttpClient.executeHttpRequestPost(SEARCH_MORE_URL, header2Map, param2Map);
                dataList.addAll(txNewsByWordAnalysis.getData(result2));
                // Advance only after the page was parsed, so a failed page is retried
                // rather than skipped.
                page++;
                failures = 0;
                logger.info("采集到数据======={}" ,dataList.size());
            } catch (Exception e) {
                // The counter used to be incremented only inside the "> 2" branch, so it
                // never moved and a persistent failure looped forever.
                failures++;
                logger.error("腾讯新闻第{}页采集出错,连续失败{}次", page, failures, e);
                if(failures >= MAX_CONSECUTIVE_FAILURES) {
                    break;
                }
            }
        }
        return dataList;
    }
}
...@@ -14,11 +14,29 @@ import com.alibaba.fastjson.JSONArray; ...@@ -14,11 +14,29 @@ import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient; import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools; import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class BaijiaAccountAnalysis { public class BaijiaAccountAnalysis {
private static Logger logger = LoggerFactory.getLogger(BaijiaAccountAnalysis.class); private static Logger logger = LoggerFactory.getLogger(BaijiaAccountAnalysis.class);
/**
 * Converts one news entry into a flat record.
 *
 * <p>Copies "title", "url" and "site" (stored as "source"), reformats "pulltime" as
 * {@code yyyy-MM-dd HH:mm:ss} under "time", and joins the "data" of every "content"
 * part whose "type" is "text". Anything matching {@code <.*?>} is removed from the
 * joined text. A missing "content" array, null parts and null "data" values are
 * tolerated and contribute nothing to the text.
 *
 * @param data JSON object describing a single news entry
 * @return map with the keys "title", "url", "source", "time" and "content"
 */
public Map<String,Object> getBaijiaAccount2Data(JSONObject data) {
    Map<String,Object> map = new HashMap<String,Object>();
    map.put("title", data.getString("title"));
    map.put("url", data.getString("url"));
    map.put("source", data.getString("site"));
    map.put("time", TimeParse.dateFormartString(TimeParse.stringFormartDate(data.getString("pulltime")),"yyyy-MM-dd HH:mm:ss"));
    StringBuilder content = new StringBuilder();
    JSONArray parts = data.getJSONArray("content");
    if(parts != null) {
        for(int i = 0;i < parts.size();i++) {
            JSONObject part = parts.getJSONObject(i);
            // Constant-first equals keeps a part without "type" from throwing.
            if(part == null || !"text".equals(part.getString("type"))) {
                continue;
            }
            String text = part.getString("data");
            if(text != null) {
                content.append(text);
            }
        }
    }
    map.put("content", content.toString().replaceAll("<.*?>", ""));
    return map;
}
/** /**
* *
......
...@@ -52,7 +52,6 @@ public class DayuAccountAnalysis { ...@@ -52,7 +52,6 @@ public class DayuAccountAnalysis {
Map<String,Object> map = new HashMap<String, Object>(); Map<String,Object> map = new HashMap<String, Object>();
try { try {
String time = data.getString("published_at").replace("T", " ").split("\\.")[0]; String time = data.getString("published_at").replace("T", " ").split("\\.")[0];
System.out.println(time);
if(startTime != null && startTime.length() > 1) { if(startTime != null && startTime.length() > 1) {
if(time.compareTo(startTime) < 0) { if(time.compareTo(startTime) < 0) {
return null; return null;
......
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.parse.TXNews;
/**
 * Parses responses of the keyword search requests issued by {@link TXNews}.
 */
public class TXNewsByWordAnalysis {
    private static Logger logger = LoggerFactory.getLogger(TXNewsByWordAnalysis.class);

    /**
     * Extracts the news records from one search response.
     *
     * <p>Side effect: {@link TXNews#hasMore} is set to {@code true} when the response's
     * "hasMore" field equals 1 and to {@code false} otherwise (including when the field
     * is absent). Only sections of "secList" whose "secType" is 0 are read; each entry of
     * their "newsList" becomes a map with the keys "title", "content" (taken from
     * "abstract"), "time", "source", "id" and "url". Sections without "secType" or
     * without "newsList" are skipped.
     *
     * @param result raw JSON text of the response
     * @return the parsed records; empty when the response has no usable section
     * @throws IllegalArgumentException if {@code result} does not yield a JSON object
     */
    public List<Map<String,Object>> getData(String result) {
        JSONObject json = JSONObject.parseObject(result);
        if(json == null) {
            // Fail with an explicit message instead of a NullPointerException so the
            // caller's error handling still sees a failed page.
            throw new IllegalArgumentException("腾讯新闻搜索返回内容为空,无法解析");
        }
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        JSONArray sections = json.getJSONArray("secList");
        // getIntValue yields 0 for a missing field, avoiding the unboxing NPE of getInteger.
        TXNews.hasMore = json.getIntValue("hasMore") == 1;
        if(sections == null) {
            return dataList;
        }
        for(int i = 0; i < sections.size();i++) {
            JSONObject section = sections.getJSONObject(i);
            if(section == null) {
                continue;
            }
            Integer secType = section.getInteger("secType");
            if(secType == null || secType.intValue() != 0) {
                continue;
            }
            JSONArray newsList = section.getJSONArray("newsList");
            if(newsList == null) {
                continue;
            }
            for(int j = 0; j < newsList.size();j++) {
                JSONObject news = newsList.getJSONObject(j);
                if(news == null) {
                    continue;
                }
                try {
                    Map<String,Object> map = new HashMap<String,Object>();
                    map.put("title", news.getString("title"));
                    map.put("content", news.getString("abstract"));
                    map.put("time", news.getString("time"));
                    map.put("source", news.getString("source"));
                    map.put("id", news.getString("id"));
                    map.put("url", news.getString("url"));
                    dataList.add(map);
                } catch (Exception e) {
                    // Report the offending entry through the logger rather than stdout.
                    logger.error("采集出错:{},数据:{}",e.getMessage(),news,e);
                }
            }
        }
        return dataList;
    }
}
...@@ -13,7 +13,7 @@ public class BaijiaAccountExample { ...@@ -13,7 +13,7 @@ public class BaijiaAccountExample {
@Test @Test
public void baijiaAccountTest() { public void baijiaAccountTest() {
String app_id = "1536766731827943"; String app_id = "1536766390576806";
String startTime = "2016-01-01 00:00:00"; String startTime = "2016-01-01 00:00:00";
//2017-11-30 17:48:17 //2017-11-30 17:48:17
List<Map<String,Object>> lists = Baijia.getBaijiaAccountData(app_id,startTime); List<Map<String,Object>> lists = Baijia.getBaijiaAccountData(app_id,startTime);
...@@ -26,7 +26,23 @@ public class BaijiaAccountExample { ...@@ -26,7 +26,23 @@ public class BaijiaAccountExample {
headList.add("source"); headList.add("source");
headList.add("url"); headList.add("url");
headList.add("content"); headList.add("content");
poi.exportExcel("D://crawlerdata/百家号-蓝鲸TMT网.xlsx", "蓝鲸TMT网", headList, lists); poi.exportExcel("D://crawlerdata/百家号-太保.xlsx", "太保", headList, lists);
}
// @Test
public void baijiaAccount2Test() {
String app_id = "b_1536766390576806";
String startTime = "2016-01-01 00:00:00";
//2017-11-30 17:48:17
List<Map<String,Object>> lists = Baijia.getBaijiaAccount2Data(app_id,startTime);
PoiExcelUtil poi = PoiExcelUtil.getInstance();
List<String> headList = new ArrayList<String>();
headList.add("title");
headList.add("time");
headList.add("source");
headList.add("url");
headList.add("content");
poi.exportExcel("D://crawlerdata/百家号-俊世太保.xlsx", "俊世太保", headList, lists);
} }
} }
package com.zhiwei.crawler; package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.junit.Test; import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Dayu; import com.zhiwei.parse.Dayu;
public class DayuAccountExample { public class DayuAccountExample {
...@@ -15,12 +17,32 @@ public class DayuAccountExample { ...@@ -15,12 +17,32 @@ public class DayuAccountExample {
//https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true //https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
String mid = "d7300311c1504d24a229c3da345785c6"; // String mid = "d7300311c1504d24a229c3da345785c6";
String name = "大鱼海棠雨"; // String name = "大鱼海棠雨";
String startTime = "2017-12-05 22:08:01"; String startTime = "2018-03-16 00:00:00";
String path = "D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx";
PoiExcelUtil poi = PoiExcelUtil.getInstance();
Map<String,Object> map = poi.importExcel(path, 0);
List<Map<String,Object>> lists = (List<Map<String, Object>>) map.get("body");
List<String> headList = new ArrayList<String>();
headList.add("title");
headList.add("time");
headList.add("content");
headList.add("source");
headList.add("url");
// headList.add("content_id");
// headList.add("origin_id");
// headList.add("xss_item_id");
for(Map<String,Object> data : lists) {
String mid = data.get("mid")+"";
String name = data.get("name")+"";
if(mid.length() < 1 && name.length() < 1) {
continue;
}
List<Map<String,Object>> dataList = Dayu.getDayuAccountData(mid,name,null); List<Map<String,Object>> dataList = Dayu.getDayuAccountData(mid,name,null);
System.out.println(dataList.size()); poi.exportExcel(path, name, headList, dataList);
}
} }
......
...@@ -12,7 +12,7 @@ public class DayuByWordExample { ...@@ -12,7 +12,7 @@ public class DayuByWordExample {
@Test @Test
public void dayuByWordTest() { public void dayuByWordTest() {
String word = "京东"; String word = "沃尔玛";
List<Map<String,Object>> dataList = Dayu.getDayuByWordData(word); List<Map<String,Object>> dataList = Dayu.getDayuByWordData(word);
......
...@@ -15,7 +15,7 @@ public class FenghuangAccountExample { ...@@ -15,7 +15,7 @@ public class FenghuangAccountExample {
public void fenghuangAccountTest() { public void fenghuangAccountTest() {
//所用时间长 1s1篇文章吧 //所用时间长 1s1篇文章吧
//https://api.3g.ifeng.com/client_search_subscribe?k=(凤凰号名称拿id) //https://api.3g.ifeng.com/client_search_subscribe?k=(凤凰号名称拿id)
String id = "276718"; String id = "724";
String[] ids = id.split(","); String[] ids = id.split(",");
for(int i = 0;i < ids.length;i++) { for(int i = 0;i < ids.length;i++) {
try { try {
...@@ -29,7 +29,7 @@ public class FenghuangAccountExample { ...@@ -29,7 +29,7 @@ public class FenghuangAccountExample {
headList.add("source"); headList.add("source");
headList.add("url"); headList.add("url");
headList.add("id"); headList.add("id");
poi.exportExcel("D://crawlerdata/凤凰-另眼看世界.xlsx", ids[i], headList, dataList); poi.exportExcel("D://crawlerdata/凤凰-电商报.xlsx", ids[i], headList, dataList);
} catch (Exception e) { } catch (Exception e) {
continue; continue;
} }
......
...@@ -16,14 +16,15 @@ public class QQAccountExample { ...@@ -16,14 +16,15 @@ public class QQAccountExample {
public void qqAccountTest() { public void qqAccountTest() {
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
Map<String,Object> dataMap = poi.importExcel("D://crawlerdata/天天快报历史文章采集.xlsx", 0); Map<String,Object> dataMap = poi.importExcel("D://crawlerdata//自媒体/天天快报历史文章采集.xlsx", 0);
List<Map<String,Object>> dataList = (List<Map<String, Object>>) dataMap.get("body"); List<Map<String,Object>> dataList = (List<Map<String, Object>>) dataMap.get("body");
String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=000300006218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"; String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=000300006218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for(Map<String,Object> map : dataList) { for(Map<String,Object> map : dataList) {
String child = map.get("帐号链接")+""; String child = map.get("帐号链接")+"";
System.out.println(child.split("chlid=")[1]); // System.out.println(child.split("chlid=")[1]);
List<Map<String,Object>> lists = QQKB.getQQAccountData(child.split("chlid=")[1], cookie); System.out.println((String)map.get("child"));
List<Map<String,Object>> lists = QQKB.getQQAccountData((String)map.get("child"), cookie);
if(lists != null) { if(lists != null) {
for(Map<String,Object> map1 : lists) { for(Map<String,Object> map1 : lists) {
map1.put("name", map.get("呢称")); map1.put("name", map.get("呢称"));
...@@ -43,7 +44,7 @@ public class QQAccountExample { ...@@ -43,7 +44,7 @@ public class QQAccountExample {
headList.add("content"); headList.add("content");
headList.add("url"); headList.add("url");
headList.add("commentid"); headList.add("commentid");
poi.exportExcel("D://crawlerdata/天天快报采集.xlsx", "asd", headList, bodyList); poi.exportExcel("D://crawlerdata//自媒体/天天快报采集-科技编年史.xlsx", "asd", headList, bodyList);
} }
......
...@@ -16,7 +16,7 @@ public class SouhuAccountExample { ...@@ -16,7 +16,7 @@ public class SouhuAccountExample {
@Test @Test
public void souhuAccountTest() { public void souhuAccountTest() {
List<Map<String,Object>> lists = Souhu.getSouHuAccountData("cHBhZzUyMTNjZjAzZTczYUBzb2h1LmNvbQ==","2017-01-01 00:00:00",false); List<Map<String,Object>> lists = Souhu.getSouHuAccountData("MjI5MzAyOTMyMEBzaW5hLnNvaHUuY29t","2016-01-01 00:00:00",false);
System.out.println(lists.size()); System.out.println(lists.size());
List<String> headList = new ArrayList<String>(); List<String> headList = new ArrayList<String>();
headList.add("title"); headList.add("title");
...@@ -28,7 +28,7 @@ public class SouhuAccountExample { ...@@ -28,7 +28,7 @@ public class SouhuAccountExample {
headList.add("newsid"); headList.add("newsid");
headList.add("newsPv"); headList.add("newsPv");
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D:\\crawlerdata\\搜狐号历史文章-蓝媒汇.xlsx", "蓝媒汇", headList, lists); poi.exportExcel("D:\\crawlerdata\\搜狐号历史文章-太保乱谈.xlsx", "太保乱谈", headList, lists);
} }
} }
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.TXNews;
/**
 * Runnable example: collects the search results for one keyword through
 * {@link TXNews#getData(String)} and exports them to an Excel workbook.
 */
public class TXNewsByWordExample {

    /**
     * Fetches the records for the fixed keyword and writes them to the fixed output file.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        final String keyword = "唐嫣";
        final List<Map<String,Object>> rows = TXNews.getData(keyword);
        final PoiExcelUtil excel = PoiExcelUtil.getInstance();

        // Column keys handed to the exporter, in the order they are listed here.
        final List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "content", "url", "id", "source"}) {
            columns.add(column);
        }

        excel.exportExcel("D://crawlerdata/腾讯新闻-唐嫣.xlsx", "腾讯新闻数据", columns, rows);
    }
}
...@@ -14,12 +14,20 @@ public class WangyiCommentExample { ...@@ -14,12 +14,20 @@ public class WangyiCommentExample {
//若出错 可能数据有重复 以id为准 //若出错 可能数据有重复 以id为准
@Test @Test
public void wangyiCommentTest() { public void wangyiCommentTest() {
String url = "http://news.163.com/18/0210/09/DA9B8PVJ000189FH.html"; List<String> urlList = new ArrayList<String>();
urlList.add("https://c.m.163.com/news/a/DCQ42REV05118O92.html?spss=newsapp");
String id = url.split("/")[6].split(".ht")[0]; urlList.add("https://c.m.163.com/news/a/DCPLJ5GB05198R91.html?spss=newsapp");
urlList.add("https://c.m.163.com/news/a/DCRNI7020511CPVM.html?spss=newsapp");
List<Map<String,Object>> bodyList = new ArrayList<>();
for(String url : urlList) {
String id = url.split("a/")[1].split(".ht")[0];
List<Map<String,Object>> lists = Wangyi.getWangyiCommentData(id); List<Map<String,Object>> lists = Wangyi.getWangyiCommentData(id);
System.out.println(lists.size()); System.out.println(lists.size());
if(lists != null) {
bodyList.addAll(lists);
}
}
List<String> headList = new ArrayList<String>(); List<String> headList = new ArrayList<String>();
headList.add("content"); headList.add("content");
headList.add("id"); headList.add("id");
...@@ -29,7 +37,7 @@ public class WangyiCommentExample { ...@@ -29,7 +37,7 @@ public class WangyiCommentExample {
headList.add("unlike"); headList.add("unlike");
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D:\\crawlerdata\\网易评论采集测试.xlsx", "asd", headList, lists); poi.exportExcel("D:\\crawlerdata\\网易评论采集-3.xlsx", "asd", headList, bodyList);
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment