Commit a8ebdd2c by yangchen

自媒体采集部分优化

parent a205f946
...@@ -614,10 +614,40 @@ public class HeadGet { ...@@ -614,10 +614,40 @@ public class HeadGet {
return headerMap; return headerMap;
} }
/**
 * Builds the HTTP request headers sent to the Sohu host {@code api.k.sohu.com}.
 * The map carries a desktop Chrome User-Agent, Accept, Accept-Language,
 * Connection and Host entries, plus an optional Cookie entry.
 *
 * @param cookie value for the {@code Cookie} header; when {@code null} no
 *               Cookie entry is added
 * @return a newly created, mutable map from header name to header value
 */
public static Map<String,String> getSouhuByWordHeaderMap(String cookie) {
    final String userAgent =
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36";
    final String accept =
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";

    Map<String,String> headers = new HashMap<String, String>();
    headers.put("Host", "api.k.sohu.com");
    headers.put("Connection", "keep-alive");
    headers.put("Accept-Language", "zh-CN,zh;q=0.9");
    headers.put("Accept", accept);
    headers.put("User-Agent", userAgent);

    // Anonymous requests simply go out without a Cookie header.
    if (cookie == null) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
/**
 * Builds the HTTP request headers sent to the Xiaomi community search host
 * {@code so.bbs.xiaomi.cn}: a desktop Chrome User-Agent, Accept,
 * Accept-Language, Connection and Host, plus an optional Cookie.
 *
 * @param cookie value for the {@code Cookie} header; skipped when {@code null}
 * @return a newly created, mutable map from header name to header value
 */
public static Map<String,String> getxiaomiShequByWordHeaderMap(String cookie) {
    // Header names and values kept side by side so the fixed set is easy to scan.
    String[][] fixedHeaders = {
        {"User-Agent",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"},
        {"Accept",
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"},
        {"Accept-Language", "zh-CN,zh;q=0.9"},
        {"Connection", "keep-alive"},
        {"Host", "so.bbs.xiaomi.cn"},
    };

    Map<String,String> result = new HashMap<String, String>();
    for (String[] header : fixedHeaders) {
        result.put(header[0], header[1]);
    }
    if (null != cookie) {
        result.put("Cookie", cookie);
    }
    return result;
}
public static void main(String[] args) { public static void main(String[] args) {
String url = "http://comment.dy.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/D75MDLL10524H5KD/comments/newList?offset=0&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&callback=getData&ibc=newspc&_=1514966469573"; String url = "http://so.bbs.xiaomi.cn/?q=%E5%B0%8F%E7%B1%B3%20%E7%94%B5%E9%A5%AD%E7%85%B2%20%E5%BC%80%E8%A3%82&p=1&fid=0&time=63072000&order=1";
String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=00030000fafc45b92e51a92d1a2d1c0536594402729a928137fe205f823d71e18c3e786e6f368baff37f7edc;%20uin=o0497332654;%20skey=MSF4MCe62n;%20sigA2=75E9AE34BD844F7CD19AC30353DE6116A767F02C50C78ABA2FB11B5B1D74324CCEDA1C9D13B6B3719AAA7875B14DBE4C560CB5FB99A5D63390B8041F6C83A48401EA8D5DA7B04E7A;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwvJbQ-Gsn52dfcob8V66AgcW1SAGy8xloQk1nVWfjVvR0b637c-qcRWE7M2QtFLKLsZP8o6dBVABpDhbzRQ92tw;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"; String cookie = "mstuid=1518141097798_2540; Hm_lvt_71558e7b4aa822e282e758f8dc0b88b0=1518141098; lastsource=so.bbs.xiaomi.cn; mstz=||795199218.38||http%3A%2F%2Fso.bbs.xiaomi.cn%2F%3Fq%3D%25e5%25b0%258f%25e7%25b1%25b3%2520%25e7%2594%25b5%25e9%25a5%25ad%25e7%2585%25b2%2520%25e5%25bc%2580%25e8%25a3%2582%7Cp%3D1%7Cfid%3D0%7Ctime%3D31536000%7Corder%3D1|http%3A%2F%2Fso.bbs.xiaomi.cn%2F%3Fq%3D%25e5%25b0%258f%25e7%25b1%25b3%2520%25e7%2594%25b5%25e9%25a5%25ad%25e7%2585%25b2%2520%25e5%25bc%2580%25e8%25a3%2582%7Cp%3D1%7Cfid%3D0%7Ctime%3D63072000%7Corder%3D1; xm_vistor=1518141097798_2540_1518141097798-1518142530797; msttime=http%3A%2F%2Fso.bbs.xiaomi.cn%2F%3Fq%3D%25E5%25B0%258F%25E7%25B1%25B3%2520%25E7%2594%25B5%25E9%25A5%25AD%25E7%2585%25B2%2520%25E5%25BC%2580%25E8%25A3%2582%26p%3D1%26fid%3D0%26time%3D63072000%26order%3D1; msttime1=http%3A%2F%2Fso.bbs.xiaomi.cn%2F%3Fq%3D%25E5%25B0%258F%25E7%25B1%25B3%2520%25E7%2594%25B5%25E9%25A5%25AD%25E7%2585%25B2%2520%25E5%25BC%2580%25E8%25A3%2582%26p%3D1%26fid%3D0%26time%3D63072000%26order%3D1; Hm_lpvt_71558e7b4aa822e282e758f8dc0b88b0=1518142531";
Map<String,String> headerMap = HeadGet.getWangyiCommentHeaderMap(null); Map<String,String> headerMap = HeadGet.getxiaomiShequByWordHeaderMap(null);
String result = HttpClient.executeHttpRequestGet(url, headerMap); String result = HttpClient.executeHttpRequestGet(url, headerMap);
System.out.println(result); System.out.println(result);
System.out.println(result.length()); System.out.println(result.length());
......
...@@ -26,23 +26,23 @@ public class Baijia { ...@@ -26,23 +26,23 @@ public class Baijia {
public static List<Map<String,Object>> getBaijiaAccountData(String app_id,String startTime) { public static List<Map<String,Object>> getBaijiaAccountData(String app_id,String startTime) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
int i = 0; int i = 0;
Map<String,String> headerMap = HeadGet.getBaijiaAccountHeaderMap(null);
try { try {
while(true) { while(true) {
try { try {
String url = "https://baijia.baidu.com/writerlistarticle?ajax=json&app_id="+app_id+"&_limit=20&_skip="; String url = "https://baijia.baidu.com/writerlistarticle?ajax=json&app_id="+app_id+"&_limit=20&_skip=";
logger.info(url+i);
Map<String,String> headerMap = HeadGet.getBaijiaAccountHeaderMap(null);
String result = HttpClient.executeHttpRequestGet(url + i, headerMap); String result = HttpClient.executeHttpRequestGet(url + i, headerMap);
List<Map<String,Object>> list = baijiaAccountAnalysis.getBaijiaAccountData(result, startTime); List<Map<String,Object>> list = baijiaAccountAnalysis.getBaijiaAccountData(result, startTime);
if(list == null || list.size() < 1){ if(list == null || list.size() < 1){
break; break;
} }
i += 20; i += 20;
ZhiWeiTools.sleep(6000); ZhiWeiTools.sleep(5000);
dataList.addAll(list); dataList.addAll(list);
logger.info(url+i+"=============="+dataList.size());
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
ZhiWeiTools.sleep(5000); ZhiWeiTools.sleep(4000);
logger.error("此页解析出错",e.getMessage()); logger.error("此页解析出错",e.getMessage());
continue; continue;
} }
......
...@@ -32,17 +32,23 @@ public class Fenghuang { ...@@ -32,17 +32,23 @@ public class Fenghuang {
public static List<Map<String,Object>> getFenghuangAccountData(String id,String startTime) { public static List<Map<String,Object>> getFenghuangAccountData(String id,String startTime) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
int i = 1; int i = 1;
while(true){ boolean f = true;
while(f){
try { try {
String url = "http://api.3g.ifeng.com/api_wemedia_index?followid=weMedia_"+id+"&page="+i+"&pagesize=20&tag=article"; for(int j = 0;j< 4;j++){
System.out.println("====================采集第"+i+"页"); f = true;
List<Map<String,Object>> list = fenghuangAccountAnalysis.getArticleData(url, startTime); String url = "http://api.3g.ifeng.com/api_wemedia_index?followid=weMedia_"+id+"&page="+i+"&pagesize=20&tag=article";
if(list == null || list.size() < 1) { List<Map<String,Object>> list = fenghuangAccountAnalysis.getArticleData(url, startTime);
break; if(list != null && list.size() > 0) {
dataList.addAll(list);
System.out.println("====================采集第"+i+"页===共获取数据=="+dataList.size());
i++;
ZhiWeiTools.sleep(2000);
break;
}
f = false;
ZhiWeiTools.sleep(2000);
} }
dataList.addAll(list);
ZhiWeiTools.sleep(2000);
i++;
} catch (Exception e) { } catch (Exception e) {
logger.error("程序出错",e.getMessage()); logger.error("程序出错",e.getMessage());
return dataList; return dataList;
......
...@@ -22,6 +22,7 @@ public class Souhu { ...@@ -22,6 +22,7 @@ public class Souhu {
private static SouhuAccountAnalysis souhuAccountAnalysis = new SouhuAccountAnalysis(); private static SouhuAccountAnalysis souhuAccountAnalysis = new SouhuAccountAnalysis();
private static SouhuCommentAnalysis souhuCommentAnalysis = new SouhuCommentAnalysis(); private static SouhuCommentAnalysis souhuCommentAnalysis = new SouhuCommentAnalysis();
/** /**
* *
* @Description 获取链接评论数 * @Description 获取链接评论数
......
package com.zhiwei.parse;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.XiaomiShequByWordAnalysis;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Crawler entry point for keyword searches on the Xiaomi community search
 * site ({@code so.bbs.xiaomi.cn}).
 */
public class Xiaomi {
    private static Logger logger = LoggerFactory.getLogger(Xiaomi.class);
    private static XiaomiShequByWordAnalysis xiaomiShequByWordAnalysis = new XiaomiShequByWordAnalysis();

    /**
     * Fetches search result pages for {@code word}, starting at page 1, and
     * collects the rows parsed from each page until a page yields no rows
     * (parser returned {@code null} or an empty list).
     *
     * @param word search keyword; it is URL-encoded as UTF-8 before being put
     *             into the query string. A {@code null} keyword yields an
     *             empty list instead of a NullPointerException.
     * @return all rows collected so far; never {@code null}, possibly empty
     */
    public static List<Map<String,Object>> getXiaomiByWordData(String word) {
        List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
        if (word == null) {
            logger.warn("getXiaomiByWordData called with a null keyword; nothing to crawl");
            return bodyList;
        }
        Map<String,String> headerMap = HeadGet.getxiaomiShequByWordHeaderMap(null);
        try {
            // The keyword does not change between pages, so encode it once.
            String encodedWord = URLEncoder.encode(word, "UTF-8");
            int page = 1;
            while (true) {
                // NOTE(review): fid/time/order are fixed query values copied from the site URL; meaning not visible here.
                String url = "http://so.bbs.xiaomi.cn/?q=" + encodedWord + "&p=" + page + "&fid=457&time=63072000&order=1";
                String result = HttpClient.executeHttpRequestGet(url, headerMap);
                List<Map<String,Object>> dataList = xiaomiShequByWordAnalysis.getdata(result);
                if (dataList == null || dataList.size() < 1) {
                    // No rows parsed: treat as the end of the result pages.
                    break;
                }
                bodyList.addAll(dataList);
                // Pause between page requests (presumably 5000 ms; depends on ZhiWeiTools.sleep).
                ZhiWeiTools.sleep(5000);
                logger.info("第"+page+"页==========="+bodyList.size());
                page++;
            }
        } catch (UnsupportedEncodingException e) {
            // Keep whatever was collected, but record the failure with its cause.
            logger.error("URL-encoding the search keyword failed", e);
        }
        return bodyList;
    }
}
package com.zhiwei.parse.analysis; package com.zhiwei.parse.analysis;
import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
...@@ -49,13 +48,14 @@ public class BaijiaAccountAnalysis { ...@@ -49,13 +48,14 @@ public class BaijiaAccountAnalysis {
if(url == null) { if(url == null) {
url = "https://baijia.baidu.com/s?old_id=" + id; url = "https://baijia.baidu.com/s?old_id=" + id;
} }
map.put("content", getBaijiaContent(url)); // map.put("content", getBaijiaContent(url));
map.put("content", data.getString("abstract"));
map.put("read_amount", data.getString("read_amount")==null?0:data.getString("read_amount")); map.put("read_amount", data.getString("read_amount")==null?0:data.getString("read_amount"));
map.put("app_id", data.getString("app_id")); map.put("app_id", data.getString("app_id"));
map.put("time", time); map.put("time", time);
map.put("url", url); map.put("url", url);
map.put("source", data.getString("writer_name")); map.put("source", data.getString("writer_name"));
System.out.println(map.toString()); // System.out.println(map.toString());
dataList.add(map); dataList.add(map);
} }
return dataList; return dataList;
......
...@@ -26,11 +26,25 @@ public class FenghuangAccountAnalysis { ...@@ -26,11 +26,25 @@ public class FenghuangAccountAnalysis {
*/ */
public List<Map<String,Object>> getArticleData(String url,String startTime) { public List<Map<String,Object>> getArticleData(String url,String startTime) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
try { try {
Map<String,String> headerMap = HeadGet.getFenghuangAccountHeaderMap(null); Map<String,String> headerMap = HeadGet.getFenghuangAccountHeaderMap(null);
String result = HttpClient.executeHttpRequestGet(url, headerMap); JSONArray jsonArry = null;
JSONObject json = JSONObject.parseObject(result); for(int i = 0;i < 3;i++) {
JSONArray jsonArry = json.getJSONObject("data").getJSONObject("feeds").getJSONArray("list"); try {
String result = HttpClient.executeHttpRequestGet(url, headerMap);
JSONObject json = JSONObject.parseObject(result);
jsonArry = json.getJSONObject("data").getJSONObject("feeds").getJSONArray("list");
if(jsonArry == null || jsonArry.size() < 1) {
continue;
}
} catch (Exception e) {
continue;
}
}
if(jsonArry == null || jsonArry.size() < 1) {
return dataList;
}
for(int i = 0;i < jsonArry.size();i++) { for(int i = 0;i < jsonArry.size();i++) {
try { try {
JSONObject data = jsonArry.getJSONObject(i); JSONObject data = jsonArry.getJSONObject(i);
...@@ -50,11 +64,11 @@ public class FenghuangAccountAnalysis { ...@@ -50,11 +64,11 @@ public class FenghuangAccountAnalysis {
continue; continue;
} }
} }
return dataList;
} catch (Exception e1) { } catch (Exception e1) {
e1.printStackTrace(); e1.printStackTrace();
return null; return dataList;
} }
return dataList;
} }
private static Map<String,Object> getArticle(String articleResult) { private static Map<String,Object> getArticle(String articleResult) {
......
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Parses the HTML of a Xiaomi community search result page into row maps.
 */
public class XiaomiShequByWordAnalysis {
    private static Logger logger = LoggerFactory.getLogger(XiaomiShequByWordAnalysis.class);

    /**
     * Extracts one row per {@code dl} element found under {@code div.search_list}.
     * Each row holds the keys {@code title}, {@code source}, {@code url},
     * {@code time} and {@code content}. Entries whose {@code div.info} text has
     * fewer than six space-separated tokens are skipped, because the time is
     * taken from tokens 4 and 5.
     *
     * @param result HTML text of one search result page
     * @return the parsed rows (possibly empty), or {@code null} when parsing
     *         throws (for example when {@code result} cannot be parsed);
     *         callers must handle both {@code null} and empty
     */
    public List<Map<String,Object>> getdata(String result) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            Document doc = Jsoup.parse(result);
            Elements elements = doc.select("div.search_list").select("dl");
            logger.debug("search result dl elements found: {}", elements.size());
            for(Element element : elements) {
                // Split the info line once; tokens 4 and 5 form the timestamp.
                String[] infoTokens = element.select("dd").select("div.info").text().split(" ");
                if(infoTokens.length < 6) {
                    // Not a regular result entry (no usable time) - skip it.
                    continue;
                }
                Elements titleLink = element.select("dt").select("a");
                Map<String,Object> map = new HashMap<String,Object>();
                map.put("title", titleLink.text());
                map.put("source", "小米社区");
                map.put("url", titleLink.attr("href"));
                map.put("time", infoTokens[4]+" "+infoTokens[5]);
                map.put("content", element.select("dd").select("p").select("a").text());
                dataList.add(map);
            }
            return dataList;
        } catch (Exception e) {
            // Pass the exception to the logger so the cause and stack trace are not lost.
            logger.error("解析出错", e);
            return null;
        }
    }
}
package com.zhiwei.util; package com.zhiwei.util;
import org.slf4j.Logger; import java.util.Calendar;
import org.slf4j.LoggerFactory;
public class TimeUtil { public class TimeUtil {
private static Logger logger = LoggerFactory.getLogger(TimeUtil.class);
public static String timeUtil(String time) { public static String timeUtil(String time) {
Calendar calendar = Calendar.getInstance();
if(time.split("-").length == 2) { if(time.split("-").length == 2) {
time = "2017-"+time+":00"; time = calendar.getWeekYear() + "-" + time + ":00";
}else { }else {
return null; return "20"+time+":00";
} }
return time; return time;
} }
public static void main(String[] args) { public static void main(String[] args) {
String time = "17-12-12 15:01"; String time = "12-12 15:01";
System.out.println(timeUtil(time)); System.out.println(timeUtil(time));
} }
......
package com.zhiwei.crawler; package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.junit.Test; import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Baijia; import com.zhiwei.parse.Baijia;
public class BaijiaAccountExample { public class BaijiaAccountExample {
@Test @Test
public void baijiaAccountTest() { public void baijiaAccountTest() {
String app_id = "1536767984069926"; String app_id = "1536766731827943";
String startTime = ""; String startTime = "2016-01-01 00:00:00";
//2017-11-30 17:48:17 //2017-11-30 17:48:17
List<Map<String,Object>> lists = Baijia.getBaijiaAccountData(app_id,startTime); List<Map<String,Object>> lists = Baijia.getBaijiaAccountData(app_id,startTime);
System.out.println(lists.size()); PoiExcelUtil poi = PoiExcelUtil.getInstance();
List<String> headList = new ArrayList<String>();
headList.add("title");
headList.add("time");
headList.add("read_amount");
headList.add("app_id");
headList.add("source");
headList.add("url");
headList.add("content");
poi.exportExcel("D://crawlerdata/百家号-蓝鲸TMT网.xlsx", "蓝鲸TMT网", headList, lists);
} }
} }
package com.zhiwei.crawler; package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.junit.Test; import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang; import com.zhiwei.parse.Fenghuang;
public class FenghuangAccountExample { public class FenghuangAccountExample {
...@@ -12,13 +14,26 @@ public class FenghuangAccountExample { ...@@ -12,13 +14,26 @@ public class FenghuangAccountExample {
@Test @Test
public void fenghuangAccountTest() { public void fenghuangAccountTest() {
//所用时间长 1s1篇文章吧 //所用时间长 1s1篇文章吧
String id = "733691"; //https://api.3g.ifeng.com/client_search_subscribe?k=(凤凰号名称拿id)
String startTime = "2017-11-15 00:00:00"; //可为空 String id = "276718";
List<Map<String,Object>> dataList = Fenghuang.getFenghuangAccountData(id, startTime); String[] ids = id.split(",");
for(Map<String,Object> map : dataList) { for(int i = 0;i < ids.length;i++) {
System.out.println(map.toString()); try {
String startTime = "2017-01-01 00:00:00"; //可为空
List<Map<String,Object>> dataList = Fenghuang.getFenghuangAccountData(ids[i], startTime);
PoiExcelUtil poi = PoiExcelUtil.getInstance();
List<String> headList = new ArrayList<String>();
headList.add("title");
headList.add("time");
headList.add("text");
headList.add("source");
headList.add("url");
headList.add("id");
poi.exportExcel("D://crawlerdata/凤凰-另眼看世界.xlsx", ids[i], headList, dataList);
} catch (Exception e) {
continue;
}
} }
System.out.println(dataList.size());
} }
} }
...@@ -13,8 +13,8 @@ public class QQAccountExample { ...@@ -13,8 +13,8 @@ public class QQAccountExample {
@Test @Test
public void qqAccountTest() { public void qqAccountTest() {
String child = "5002744"; String child = "5975325";
String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=00030000db3c2ec2393ea968f523f50144db7ab5aec60e79d2509c271bdacdf784e88ac1f58b7493c23ceb15;%20uin=o0497332654;%20skey=M67MOgvFQJ;%20sigA2=D3046D543D9BA50CFE749D63B1F05AF28A281C29B4F1353374AB7A19D9527497A67E507C6829AE44F67C1EA032C2A3728301D2ABC864DA32BCA7D4C7A61609F9F3BC9AE0A7243003;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmUT_jxJCnY5yVwhmL3e2K5FOTRth6jz8SKVHGseA3v9s8UIZxw00LpF1uC9l7W5WL2trdb69LlCvE1s7twReOw;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"; String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=000300006218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";
List<Map<String,Object>> dataList = QQKB.getQQAccountData(child, cookie); List<Map<String,Object>> dataList = QQKB.getQQAccountData(child, cookie);
System.out.println(dataList.size()); System.out.println(dataList.size());
...@@ -25,7 +25,7 @@ public class QQAccountExample { ...@@ -25,7 +25,7 @@ public class QQAccountExample {
headList.add("content"); headList.add("content");
headList.add("url"); headList.add("url");
headList.add("commentid"); headList.add("commentid");
poi.exportExcel("D://crawlerdata/qq-5002744.xlsx", "asd", headList, dataList); poi.exportExcel("D://crawlerdata/qq-5975325.xlsx", "asd", headList, dataList);
} }
......
...@@ -12,9 +12,11 @@ import com.zhiwei.parse.Souhu; ...@@ -12,9 +12,11 @@ import com.zhiwei.parse.Souhu;
public class SouhuAccountExample { public class SouhuAccountExample {
//http://search.sohu.com/?keyword=%E8%99%8E%E5%97%85&source=article&queryType=edit&ie=utf8
@Test @Test
public void souhuAccountTest() { public void souhuAccountTest() {
List<Map<String,Object>> lists = Souhu.getSouHuAccountData("c29odXptdGh5YXRieUBzb2h1LmNvbQ==","2017-01-01 00:00:00",false); List<Map<String,Object>> lists = Souhu.getSouHuAccountData("cHBhZzUyMTNjZjAzZTczYUBzb2h1LmNvbQ==","2017-01-01 00:00:00",false);
System.out.println(lists.size()); System.out.println(lists.size());
List<String> headList = new ArrayList<String>(); List<String> headList = new ArrayList<String>();
headList.add("title"); headList.add("title");
...@@ -26,7 +28,7 @@ public class SouhuAccountExample { ...@@ -26,7 +28,7 @@ public class SouhuAccountExample {
headList.add("newsid"); headList.add("newsid");
headList.add("newsPv"); headList.add("newsPv");
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D:\\crawlerdata\\搜狐号历史文章.xlsx", "sasd", headList, lists); poi.exportExcel("D:\\crawlerdata\\搜狐号历史文章-蓝媒汇.xlsx", "蓝媒汇", headList, lists);
} }
} }
...@@ -14,7 +14,7 @@ public class SouhuCommentExample { ...@@ -14,7 +14,7 @@ public class SouhuCommentExample {
@Test @Test
public void souhuCommentTest() { public void souhuCommentTest() {
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
Map<String,Object> map = poi.importExcel("D://crawlerdata/搜狐评论采集.xlsx", 0); Map<String,Object> map = poi.importExcel("D://crawlerdata/搜狐评论采集.xlsx", 0);
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body"); List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
......
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Xiaomi;
/**
 * Example runner: searches the Xiaomi community for a fixed, comma-separated
 * list of keyword phrases via {@link Xiaomi#getXiaomiByWordData(String)},
 * merges every non-empty result list, and exports the merged rows to an
 * Excel workbook through {@code PoiExcelUtil}.
 */
public class XiaomiShequByWordExample {

    public static void main(String[] args) {
        String word = "小米 电饭煲 锅体开裂,小米 电饭煲 开裂,小米 电饭煲 裂缝,小米 电饭煲 烫伤,小米 电饭煲 变形";
        String[] words = word.split(",");

        // Run one crawl per phrase and pool the rows.
        List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
        for (int idx = 0; idx < words.length; idx++) {
            List<Map<String,Object>> dataList = Xiaomi.getXiaomiByWordData(words[idx]);
            if (dataList == null || dataList.size() <= 0) {
                continue;
            }
            bodyList.addAll(dataList);
        }

        // Column keys, in the order they are passed to the exporter.
        String[] columns = {"title", "time", "source", "url", "content"};
        List<String> headList = new ArrayList<String>();
        for (String column : columns) {
            headList.add(column);
        }

        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        poi.exportExcel("D:\\crawlerdata\\小米-关键词电饭煲相关.xlsx", "小米社区采集", headList, bodyList);
    }
}
package com.zhiwei.crawler; package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.junit.Test; import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Yidianzixun; import com.zhiwei.parse.Yidianzixun;
...@@ -12,12 +14,19 @@ public class YidianzixunAccountExample { ...@@ -12,12 +14,19 @@ public class YidianzixunAccountExample {
@Test @Test
public void yidianzixunAccountTest() { public void yidianzixunAccountTest() {
String channelid = "m133695"; String channelid = "m143901";
String startTime = "2017-09-10 09:42:05"; String startTime = "2017-01-01 00:00:00";
List<Map<String,Object>> dataList = Yidianzixun.getYidianzixunAccountData(channelid, startTime); List<Map<String,Object>> dataList = Yidianzixun.getYidianzixunAccountData(channelid, startTime);
for(Map<String,Object> map : dataList) { PoiExcelUtil poi = PoiExcelUtil.getInstance();
System.out.println(map.toString()); List<String> headList = new ArrayList<String>();
} headList.add("title");
headList.add("time");
headList.add("comment_count");
headList.add("ctype");
headList.add("source");
headList.add("url");
headList.add("summary");
poi.exportExcel("D://crawlerdata/一点资讯-虎嗅.xlsx", "虎嗅", headList, dataList);
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment