Commit 3e3ea4d9 by yangchen

网易历史文章采集添加

parent e77ce092
...@@ -778,10 +778,25 @@ public class HeadGet { ...@@ -778,10 +778,25 @@ public class HeadGet {
return headerMap; return headerMap;
} }
/**
 * Builds the request headers sent to dy.163.com by the NetEase history crawler.
 *
 * @param cookie value for the {@code Cookie} header; when {@code null} no Cookie header is set
 * @return a new mutable map from header name to header value
 */
public static Map<String,String> getWangyiHistoryHeaderMap(String cookie) {
    final Map<String, String> headers = new HashMap<String, String>();
    headers.put("Host", "dy.163.com");
    headers.put("Connection", "keep-alive");
    headers.put("Accept", "*/*");
    headers.put("Accept-Language", "zh-CN,zh;q=0.9");
    headers.put(
            "User-Agent",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
    if (cookie == null) {
        // Anonymous request: leave the Cookie header out entirely.
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
public static void main(String[] args) { public static void main(String[] args) {
String url = "https://a.jiemian.com/index.php?m=user&a=centerArticle&id=100032140&page=1"; String url = "http://dy.163.com/v2/article/detail/CK4OE81O0512974K.html";
String cookie = "pgv_pvi=1395917824; pgv_si=s4065829888"; // String cookie = "pgv_pvi=1395917824; pgv_si=s4065829888";
Map<String,String> headerMap = HeadGet.getAiqiyiBywordHeaderMap(cookie); Map<String,String> headerMap = HeadGet.getWangyiHistoryHeaderMap(null);
// Map<String,Object> paramMap = HeadGet.getTxNewsAccountpageParamMap("1979"); // Map<String,Object> paramMap = HeadGet.getTxNewsAccountpageParamMap("1979");
String result = HttpClient.executeHttpRequestGet(url,null, headerMap); String result = HttpClient.executeHttpRequestGet(url,null, headerMap);
System.out.println(result); System.out.println(result);
......
...@@ -83,6 +83,26 @@ public class Fenghuang { ...@@ -83,6 +83,26 @@ public class Fenghuang {
return dataList; return dataList;
} }
/**
 * Collects every comment page for an article, using the comment endpoint resolved by
 * {@code fenghuangCommentAnalysis.getdocUrl}.
 *
 * <p>Pages are requested starting at 1, with a two second pause before each request,
 * until a page comes back {@code null} or empty.
 *
 * @param url   article URL
 * @param proxy proxy handed to the underlying requests
 * @return all rows gathered, or {@code null} when the comment endpoint could not be resolved
 */
public static List<Map<String,Object>> getFenghuangCommentData2(String url,Proxy proxy) {
    final String pagePrefix = fenghuangCommentAnalysis.getdocUrl(url, proxy);
    if (pagePrefix == null) {
        return null;
    }
    final List<Map<String, Object>> collected = new ArrayList<Map<String, Object>>();
    for (int page = 1; ; page++) {
        final String pageUrl = pagePrefix + page;
        System.out.println(pageUrl);
        ZhiWeiTools.sleep(2000);
        final List<Map<String, Object>> pageRows = fenghuangCommentAnalysis.getData2(pageUrl, proxy);
        if (pageRows == null || pageRows.isEmpty()) {
            // No (more) comments: hand back whatever was gathered so far.
            return collected;
        }
        collected.addAll(pageRows);
    }
}
/** /**
* *
* @Description 获取凤凰评论数 * @Description 获取凤凰评论数
......
...@@ -105,7 +105,7 @@ public class QQKB { ...@@ -105,7 +105,7 @@ public class QQKB {
public static List<Map<String,Object>> getQQKBCommentData(String url,Proxy proxy) { public static List<Map<String,Object>> getQQKBCommentData(String url,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
String comment_id = getCid(url,proxy); String comment_id = getCid(url,proxy);
String article_id = url.split("/")[4]; String article_id = url.split("/")[4].split("\\?")[0];
Map<String,String> headerMap = HeadGet.getQQKBCommentHeaderMap(null); Map<String,String> headerMap = HeadGet.getQQKBCommentHeaderMap(null);
try { try {
Map<String,Object> paramMap = HeadGet.getQQKBCommentParamMap(comment_id, article_id); Map<String,Object> paramMap = HeadGet.getQQKBCommentParamMap(comment_id, article_id);
......
...@@ -5,19 +5,24 @@ import java.util.ArrayList; ...@@ -5,19 +5,24 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.jsoup.Jsoup;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient; import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.WangyiCommentAnalysis; import com.zhiwei.parse.analysis.WangyiCommentAnalysis;
import com.zhiwei.parse.analysis.WangyiHistoryAnalysis;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools; import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class Wangyi { public class Wangyi {
private static Logger logger = LoggerFactory.getLogger(Wangyi.class); private static Logger logger = LoggerFactory.getLogger(Wangyi.class);
private static WangyiCommentAnalysis wangyiCommentAnalysis = new WangyiCommentAnalysis(); private static WangyiCommentAnalysis wangyiCommentAnalysis = new WangyiCommentAnalysis();
private static WangyiHistoryAnalysis wangyiHistoryAnalysis = new WangyiHistoryAnalysis();
/** /**
* *
* @Description 网易评论获取 * @Description 网易评论获取
...@@ -66,4 +71,36 @@ public class Wangyi { ...@@ -66,4 +71,36 @@ public class Wangyi {
return json.getInteger("tcount"); return json.getInteger("tcount");
} }
/**
 * Collects the article history of the NetEase account that published the given article.
 *
 * <p>The page at {@code url} is fetched to read the account's {@code data-wemediaid}
 * attribute and its display name. The account's article list is then requested page by
 * page (20 entries per page) and every page is handed to
 * {@code wangyiHistoryAnalysis.getData}. Paging ends when a page yields no rows, when the
 * response does not report {@code hasNext == true}, or when the same page has failed
 * several times in a row.
 *
 * @param url     URL of an article page of the account
 * @param proxy   proxy handed to the HTTP client
 * @param endTime cut-off time forwarded unchanged to the page parser
 * @return the rows collected so far; empty when the account id cannot be read from the page
 */
public static List<Map<String,Object>> getHistoryData(String url,Proxy proxy,String endTime) {
    // Upper bound on consecutive failures of one page; without it a page that always
    // fails would be retried forever.
    final int maxPageRetries = 3;
    final String idMarker = "data-wemediaid=\"";

    Map<String,String> headerMap = HeadGet.getWangyiHistoryHeaderMap(null);
    List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
    String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
    int idStart = result == null ? -1 : result.indexOf(idMarker);
    if(idStart < 0) {
        logger.error("Cannot read data-wemediaid from page {}", url);
        return bodyList;
    }
    idStart += idMarker.length();
    int idEnd = result.indexOf('"', idStart);
    String wemediaid = idEnd < 0 ? result.substring(idStart) : result.substring(idStart, idEnd);
    String source = Jsoup.parse(result).select("body > div.colum_wrap.fl > div > div.colum_des > div.normal > div.colum_info > h4").text();

    String listUrl = "http://dy.163.com/v2/article/list.do?wemediaId="+wemediaid+"&size=20&pageNo=";
    int page = 1;
    int failures = 0;
    boolean hasNext = true;
    ZhiWeiTools.sleep(1000);
    while(hasNext) {
        try {
            result = HttpClient.executeHttpRequestGet(listUrl+page,proxy, headerMap);
            JSONObject json = JSONObject.parseObject(result);
            List<Map<String,Object>> dataList = wangyiHistoryAnalysis.getData(result,proxy, endTime,source);
            if(dataList == null || dataList.size() < 1) {
                break;
            }
            // Read the paging flag before storing the rows, so that a failure here cannot
            // lead to the same page being added twice on retry. A missing flag ends paging.
            Boolean more = json.getJSONObject("data").getBoolean("hasNext");
            bodyList.addAll(dataList);
            logger.info("数据采集第{}页;目前采集到数据{}条",page,bodyList.size());
            hasNext = Boolean.TRUE.equals(more);
            failures = 0;
            page++;
            ZhiWeiTools.sleep(1000);
        } catch (Exception e) {
            failures++;
            logger.error("Failed to collect page " + page + " (attempt " + failures + " of " + maxPageRetries + ")", e);
            if(failures >= maxPageRetries) {
                break;
            }
            ZhiWeiTools.sleep(1000);
        }
    }
    return bodyList;
}
} }
...@@ -44,23 +44,25 @@ public class FenghuangCommentAnalysis { ...@@ -44,23 +44,25 @@ public class FenghuangCommentAnalysis {
public String getdocUrl(String url,Proxy proxy) { public String getdocUrl(String url,Proxy proxy) {
try { try {
String result = HttpClient.executeHttpRequestGet(url,proxy, null); String result = HttpClient.executeHttpRequestGet(url,proxy, null);
result = result.split("commentsUrl = '")[1].split("',")[0]; if(result.contains("commentUrl\":\"")) {
result = result.split("commentUrl\":\"")[1].split("\",")[0];
System.out.println(result);
if(result.contains("/a")) {
result = result.replace(":", "%3A");
url = "https://user.iclient.ifeng.com/Social_Api_Comment/getCommentList?comments_url="+result+"&hasChild=1&limit=30&page=";
}else { }else {
String docUrl = ""; return "http://comment.ifeng.com/get.php?docUrl="+url.replaceAll(":", "%3A").replaceAll("/", "%2F")+"&format=js&job=1&pageSize=20&p=";
docUrl = url.substring(url.length()-8,url.length());
url = "https://user.iclient.ifeng.com/Social_Api_Comment/getCommentList?comments_url=sub_"+docUrl+"&hasChild=1&limit=30&page=";
} }
System.out.println(result);
url = "http://comment.ifeng.com/get.php?docUrl="+result+"&format=js&job=1&pageSize=20&p=";
// if(url.contains("/a")) {
// url = url.replace(":", "%3A");
// url = "https://user.iclient.ifeng.com/Social_Api_Comment/getCommentList?comments_url="+url+"&hasChild=1&limit=30&page=";
// }else {
// String docUrl = "";
// docUrl = url.substring(url.length()-8,url.length());
// url = "https://user.iclient.ifeng.com/Social_Api_Comment/getCommentList?comments_url=sub_"+docUrl+"&hasChild=1&limit=30&page=";
// }
return url; return url;
} catch (Exception e) { } catch (Exception e) {
logger.error("解析连接出错",e.getMessage()); logger.error("解析连接出错",e.getMessage());
e.printStackTrace();
return null; return null;
} }
...@@ -97,6 +99,48 @@ public class FenghuangCommentAnalysis { ...@@ -97,6 +99,48 @@ public class FenghuangCommentAnalysis {
} }
/**
 * Fetches one comment page and converts its {@code comments} array into rows.
 *
 * <p>The response body is expected to contain {@code commentJsonVarStr___=} followed by a
 * JSON object and one trailing character, which is dropped before parsing.
 *
 * @param url   fully built page URL (endpoint plus page number)
 * @param proxy proxy handed to the HTTP client
 * @return the parsed rows (possibly empty), or {@code null} when the request fails or the
 *         response cannot be parsed
 */
public List<Map<String,Object>> getData2(String url,Proxy proxy) {
    Map<String,String> headerMap = HeadGet.getFenghuangCommentHeaderMap(null);
    List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
    String result;
    try {
        result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
    } catch (Exception e) {
        // Pass the exception itself: a bare extra argument without a {} placeholder is dropped.
        logger.error("链接获取信息失败",e);
        return null;
    }
    if(result == null) {
        logger.error("Empty response for {}", url);
        return null;
    }
    try {
        String[] parts = result.split("commentJsonVarStr___=");
        if(parts.length < 2) {
            logger.error("Response for {} does not contain the comment payload", url);
            return null;
        }
        String payload = parts[1];
        JSONObject json = JSONObject.parseObject(payload.substring(0, payload.length()-1));
        JSONArray jsonArry = json.getJSONArray("comments");
        if(jsonArry == null) {
            logger.error("Response for {} has no comments array", url);
            return null;
        }
        for(int i = 0;i < jsonArry.size(); i ++) {
            Map<String,Object> map = getcommentData2(jsonArry.getJSONObject(i));
            // A row that failed to parse comes back as null; keep it out of the result so
            // callers can iterate without null checks.
            if(map != null) {
                dataList.add(map);
            }
        }
    } catch (Exception e) {
        logger.error("获取信息出错",e);
        return null;
    }
    return dataList;
}
/**
 * Converts one comment JSON object into a row.
 *
 * <p>Reads {@code uname}, {@code comment_contents} (with every {@code &quot;} removed),
 * {@code articel_id}, {@code uptimes}, {@code ip_from} and {@code comment_date}.
 * NOTE(review): the key {@code articel_id} is spelled exactly as found here; confirm it
 * against the actual response.
 *
 * @param json a single element of the {@code comments} array
 * @return the row, or {@code null} when the element cannot be read
 */
private Map<String,Object> getcommentData2(JSONObject json) {
    Map<String,Object> map = new HashMap<String, Object>();
    try {
        String content = json.getString("comment_contents");
        map.put("nickname", json.getString("uname"));
        // Literal replacement is enough here; a missing body no longer discards the row.
        map.put("content", content == null ? null : content.replace("&quot;", ""));
        map.put("id", json.getString("articel_id"));
        map.put("like", json.getString("uptimes"));
        map.put("from", json.getString("ip_from"));
        map.put("time", json.getString("comment_date"));
    } catch (Exception e) {
        // Pass the exception itself: a bare extra argument without a {} placeholder is dropped.
        logger.error("具体解析一条数据出错",e);
        return null;
    }
    return map;
}
/** /**
* *
* @Description (解析评论具体信息) * @Description (解析评论具体信息)
......
package com.zhiwei.parse.analysis;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Turns one page of a NetEase account's article list into rows, downloading each
 * article's detail page to extract its text.
 */
public class WangyiHistoryAnalysis {
    private static final Logger logger = LoggerFactory.getLogger(WangyiHistoryAnalysis.class);

    /**
     * Parses an article-list response and fetches the body of every listed article.
     *
     * <p>Each row carries {@code title}, {@code time}, {@code content}, {@code url} and
     * {@code source}. When {@code endTime} is longer than one character, entries are
     * compared with it as plain strings ({@code String.compareTo}); the first entry whose
     * {@code ptime} is not greater than {@code endTime} ends the collection and the rows
     * gathered up to that point are returned. There is a one second pause after every
     * processed entry.
     *
     * @param result  JSON text of the list response; rows are read from {@code data.list}
     * @param proxy   proxy handed to the HTTP client
     * @param endTime cut-off time; {@code null} or a string of at most one character
     *                disables the check (presumably the same format as {@code ptime};
     *                confirm against the caller)
     * @param source  value stored under {@code source} in every row
     * @return the collected rows; empty when the response has no {@code data.list}
     */
    public List<Map<String,Object>> getData(String result,Proxy proxy,String endTime,String source) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        JSONObject json = JSONObject.parseObject(result);
        JSONObject dataNode = json == null ? null : json.getJSONObject("data");
        JSONArray jsonArry = dataNode == null ? null : dataNode.getJSONArray("list");
        if(jsonArry == null) {
            logger.warn("List response has no data.list node; nothing to collect");
            return dataList;
        }
        Map<String, String> headerMap = HeadGet.getWangyiHistoryHeaderMap(null);
        boolean filterByTime = endTime != null && endTime.length() > 1;
        for(int i = 0;i < jsonArry.size();i++) {
            try {
                JSONObject data = jsonArry.getJSONObject(i);
                String ptime = data.getString("ptime");
                if(filterByTime) {
                    if(ptime == null) {
                        // Cannot be compared against the cut-off: skip this entry.
                        logger.warn("Entry {} has no ptime; skipped", i);
                        continue;
                    }
                    if(ptime.compareTo(endTime) <= 0) {
                        logger.info("超时时间采集范围 跳出采集");
                        return dataList;
                    }
                }
                String url = "http://dy.163.com/v2/article/detail/"+data.getString("docid")+".html";
                String html = HttpClient.executeHttpRequestGet(url, proxy, headerMap);
                if(html == null) {
                    logger.warn("No response for article {}; skipped", url);
                } else {
                    Document doc = Jsoup.parse(html);
                    Map<String,Object> map = new HashMap<String,Object>();
                    map.put("title", data.getString("title"));
                    map.put("time", ptime);
                    map.put("content", doc.select("div.content").text());
                    map.put("url", url);
                    map.put("source", source);
                    System.out.println(map.toString());
                    dataList.add(map);
                }
            } catch (Exception e) {
                // Best effort: one broken entry must not abort the page, but it is reported.
                logger.error("Failed to collect list entry " + i, e);
            }
            ZhiWeiTools.sleep(1000);
        }
        return dataList;
    }
}
...@@ -8,6 +8,7 @@ import org.junit.Test; ...@@ -8,6 +8,7 @@ import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil; import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang; import com.zhiwei.parse.Fenghuang;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class FenghuangCommentExample { public class FenghuangCommentExample {
...@@ -15,7 +16,7 @@ public class FenghuangCommentExample { ...@@ -15,7 +16,7 @@ public class FenghuangCommentExample {
public void fenghuangCommentTest() { public void fenghuangCommentTest() {
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
Map<String,Object> map = poi.importExcel("D://crawlerdata/凤凰评论采集.xlsx", 0); Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", 0);
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body"); List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
List<String> urlList = new ArrayList<String>(); List<String> urlList = new ArrayList<String>();
...@@ -23,35 +24,36 @@ public class FenghuangCommentExample { ...@@ -23,35 +24,36 @@ public class FenghuangCommentExample {
String url = ""; String url = "";
try { try {
url = map1.get("url")+""; url = map1.get("url")+"";
System.out.println(url);
List<Map<String,Object>> dataList = Fenghuang.getFenghuangCommentData(url,null); List<Map<String,Object>> dataList = Fenghuang.getFenghuangCommentData2(url,null);
if(dataList.size() <= 0) { if(dataList == null || dataList.size() <= 0) {
urlList.add(url); urlList.add(url);
} }
if(dataList != null) { if(dataList != null) {
bodyList.addAll(dataList); for(Map<String,Object> m : dataList) {
m.put("from_url", url);
bodyList.add(m);
}
} }
} catch (Exception e) { } catch (Exception e) {
System.out.println(url); System.out.println(url);
e.printStackTrace(); e.printStackTrace();
continue; continue;
} }
ZhiWeiTools.sleep(1000);
} }
List<String> headList = new ArrayList<String>(); List<String> headList = new ArrayList<String>();
headList.add("nickname"); headList.add("nickname");
headList.add("content"); headList.add("content");
headList.add("id"); headList.add("id");
// headList.add("rootid");
headList.add("like"); headList.add("like");
headList.add("unlike");
headList.add("total_num");
headList.add("from"); headList.add("from");
headList.add("source");
headList.add("time"); headList.add("time");
headList.add("from_url");
for(String s : urlList) { for(String s : urlList) {
System.out.println(s); System.out.println(s);
} }
poi.exportExcel("D://crawlerdata/凤凰评论采集.xlsx", "asd", headList, bodyList); poi.exportExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", "评论采集", headList, bodyList);
} }
......
...@@ -23,8 +23,8 @@ public class QQAccountExample { ...@@ -23,8 +23,8 @@ public class QQAccountExample {
for(Map<String,Object> map : dataList) { for(Map<String,Object> map : dataList) {
String child = map.get("帐号链接")+""; String child = map.get("帐号链接")+"";
// System.out.println(child.split("chlid=")[1]); // System.out.println(child.split("chlid=")[1]);
System.out.println((String)map.get("child")); System.out.println(child.split("=")[1]);
List<Map<String,Object>> lists = QQKB.getQQAccountData((String)map.get("child"), cookie,null); List<Map<String,Object>> lists = QQKB.getQQAccountData(child.split("=")[1], cookie,null);
if(lists != null) { if(lists != null) {
for(Map<String,Object> map1 : lists) { for(Map<String,Object> map1 : lists) {
map1.put("name", map.get("呢称")); map1.put("name", map.get("呢称"));
......
...@@ -10,7 +10,7 @@ public class QQKBCommentCountExample { ...@@ -10,7 +10,7 @@ public class QQKBCommentCountExample {
@Test @Test
public void qqkbCommentCountTest() { public void qqkbCommentCountTest() {
String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=0003000049dd058f533cbebb240223ede63b864224f7eebe0f4aeca6a623572bb290a5800741d191a5768bb0;%20uin=o0497332654;%20skey=MIZmc2Oel3;%20sigA2=4282ABA809551D3534C72F999EE8F2A75219ED9452DEF04E4CBCE6B680C2C893C3E1BA617F5E0F387E558888B2ABEDFE87A4A25B16F9066C1154B2BC7A1133CA7B356AB9D3BA26ED;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwgGT4n96Oq-jHALnMUe8UzpoJghQDouvfSSWdh-JOdgAm3jRJUPbux6fcIPghoNxo24xdED8ennAANksJuHiwdw;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"; String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=0003000049dd058f533cbebb240223ede63b864224f7eebe0f4aeca6a623572bb290a5800741d191a5768bb0;%20uin=o0497332654;%20skey=MIZmc2Oel3;%20sigA2=4282ABA809551D3534C72F999EE8F2A75219ED9452DEF04E4CBCE6B680C2C893C3E1BA617F5E0F387E558888B2ABEDFE87A4A25B16F9066C1154B2BC7A1133CA7B356AB9D3BA26ED;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwgGT4n96Oq-jHALnMUe8UzpoJghQDouvfSSWdh-JOdgAm3jRJUPbux6fcIPghoNxo24xdED8ennAANksJuHiwdw;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";
String url = ""; String url = "https://tech.sina.cn/i/gn/2018-04-26/detail-ifztkpin4282154.d.html?pos=18";
int i = QQKB.getCommentCount(cookie, url,null); int i = QQKB.getCommentCount(cookie, url,null);
System.out.println(i); System.out.println(i);
......
...@@ -14,7 +14,9 @@ public class QQKBCommentExample { ...@@ -14,7 +14,9 @@ public class QQKBCommentExample {
//天天快报与腾讯新闻都可用 不用cookie //天天快报与腾讯新闻都可用 不用cookie
@Test @Test
public void qqkbCommentTest() { public void qqkbCommentTest() {
String url = "https://kuaibao.qq.com/s/20180116C0EA8G00"; String url = "http://op.inews.qq.com/m/20180424A0309700?refer=100000355&chl_code=auto&h=0";
//https://kuaibao.qq.com/s/20180423A1PI7400?refer=kb_news
// https://kuaibao.qq.com/s/20180423A0L60800?refer=kb_news
List<Map<String,Object>> dataList = QQKB.getQQKBCommentData(url,null); List<Map<String,Object>> dataList = QQKB.getQQKBCommentData(url,null);
List<String> headList = new ArrayList<String>(); List<String> headList = new ArrayList<String>();
...@@ -26,7 +28,7 @@ public class QQKBCommentExample { ...@@ -26,7 +28,7 @@ public class QQKBCommentExample {
headList.add("content"); //内容 headList.add("content"); //内容
System.out.println(dataList.size()); System.out.println(dataList.size());
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D:\\crawlerdata\\快报评论采集.xlsx", "sada", headList, dataList); poi.exportExcel("D:\\crawlerdata\\快报评论采集-2.xlsx", "sada", headList, dataList);
} }
......
package com.zhiwei.crawler;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
/**
 * Tencent News comment crawler; every collected comment is printed to standard output.
 *
 * @author hero
 * @date 2017-08-10 18:08:41
 */
public class QQNewsCommentListTest {

    public static void main(String[] args) {
        // Add the article URLs to crawl to this list.
        List<String> urlList = new ArrayList<String>();
        for(String url : urlList){
            qqNewsCommentListTest(url);
        }
    }

    /**
     * Prints all comments of one article.
     *
     * <p>The comment id is read from the article page, then the comment endpoint is
     * requested with {@code pageflag=0} and afterwards with {@code pageflag=1..pages},
     * where {@code pages} is recomputed from {@code oritotal} after every response.
     *
     * @param url article URL; also stored with every comment as {@code fromUrl}
     */
    public static void qqNewsCommentListTest(String url) {
        Map<String,String> headerMap = HeaderTool.getCommonHead();
        String newsId = getCommentId(url);
        if(newsId == null){
            // Without a comment id the endpoint would be queried for "article/null".
            System.out.println("cmt_id not found, skipped: "+url);
            return;
        }
        String splitId = "_article"+newsId+"commentv2";
        System.out.println(splitId);
        int pages = 0;
        try {
            String comment_url = buildCommentUrl(newsId, 0);
            System.out.println("commenturl========"+comment_url);
            String html = HttpClientTemplateOK.get(comment_url, null, headerMap);
            if(html!=null){
                pages = printComments(html, splitId, url, "id");
            }else{
                System.out.println("--------------");
            }
            for(int i=1;i<=pages;i++){
                comment_url = buildCommentUrl(newsId, i);
                html = HttpClientTemplateOK.get(comment_url, null, headerMap);
                if(html!=null){
                    // NOTE(review): later pages read the comment id from "mid" while the
                    // first page reads "id"; kept as found, confirm against the response.
                    pages = printComments(html, splitId, url, "mid");
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Builds the comment endpoint URL for the given comment id and {@code pageflag} value. */
    private static String buildCommentUrl(String newsId, int pageFlag) {
        return "http://coral.qq.com/article/"+newsId+"/comment/v2?callback=_article"+newsId
                +"commentv2&orinum=30&oriorder=t&pageflag="+pageFlag+"&source=1&_="+System.currentTimeMillis();
    }

    /**
     * Unwraps one callback response, prints each comment and returns the page count
     * computed from {@code oritotal} (30 comments per page).
     *
     * @param html    raw response text
     * @param splitId callback name preceding the payload
     * @param url     article URL stored as {@code fromUrl}
     * @param idKey   JSON key the comment's {@code _id} is read from
     */
    private static int printComments(String html, String splitId, String url, String idKey) {
        String body = html.split(splitId)[1];
        // Drop the first and last character around the payload before parsing.
        body = body.substring(1, body.length()-1);
        System.out.println(body);
        JSONObject data = JSONObject.parseObject(body).getJSONObject("data");
        JSONArray jsonArray = data.getJSONArray("oriCommList");
        JSONObject userData = data.getJSONObject("userList");
        int pages = (int)Math.ceil((double)data.getIntValue("oritotal")/30.0);
        for(int a = 0;a<jsonArray.size();a++){
            Map<String,Object> doc = new HashMap<String, Object>();
            JSONObject json = jsonArray.getJSONObject(a);
            JSONObject user = userData.getJSONObject(json.getString("userid"));
            if(user!=null){
                doc.put("nick", user.getString("nick"));
                doc.put("gender", user.getString("gender"));
                doc.put("localtion", user.getString("region"));
            }
            doc.put("_id", json.getString(idKey));
            doc.put("content", json.getString("content"));
            // "time" is multiplied by 1000 before being turned into a Date.
            doc.put("time", TimeParse.dateFormartString(new Date(json.getLong("time")*1000), "yyyy-MM-dd HH:mm:ss"));
            doc.put("up", json.getInteger("up"));
            doc.put("pokenum", json.getInteger("pokenum"));
            doc.put("repnum", json.getInteger("repnum"));
            doc.put("fromUrl", url);
            System.out.println("doc==========="+doc);
        }
        return pages;
    }

    /**
     * Reads the comment id from an article page.
     *
     * @param url article URL
     * @return the text between {@code "cmt_id = "} and the next {@code ;}, or {@code null}
     *         when the page cannot be fetched or does not contain the marker
     */
    public static String getCommentId(String url){
        String cmt_id = null;
        Map<String,String> headerMap = HeaderTool.getCommonHead();
        System.out.println(url);
        try {
            String html = HttpClientTemplateOK.get(url, null, headerMap);
            if(html!=null && html.contains("cmt_id = ")){
                cmt_id = html.split("cmt_id = ")[1].split(";")[0];
                System.out.println("cmt_id============"+cmt_id);
                return cmt_id;
            }
        } catch (IOException e) {
            System.out.println("failed to fetch "+url+": "+e);
            return null;
        }
        return cmt_id;
    }
}
package com.zhiwei.crawler;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
/**
 * Sina News comment crawler; every collected comment is printed to standard output.
 *
 * @author hero
 * @date 2017-08-10 18:08:41
 */
public class SinaCommentListTest {

    /** Separator used by {@link #getCommentId(String)} between channel and news id. */
    private static final String ID_SEPARATOR = "=====";

    public static void main(String[] args) {
        // Add the article URLs to crawl to this list.
        List<String> urlList = new ArrayList<String>();
        for(String url : urlList){
            sinaCommentListTest(url);
        }
    }

    /**
     * Prints the first comment page (20 comments) of one article.
     *
     * @param url article URL; also stored with every comment as {@code fromUrl}
     */
    public static void sinaCommentListTest(String url) {
        Map<String,String> headerMap = HeaderTool.getCommonHead();
        // Resolve the ids once: every call downloads the article page again.
        String ids = getCommentId(url);
        String[] parts = ids == null ? new String[0] : ids.split(ID_SEPARATOR);
        if(parts.length < 2){
            System.out.println("channel/newsid not found, skipped: "+url);
            return;
        }
        String channel = parts[0];
        String newsId = parts[1];
        int page = 1;
        try {
            String comment_url = "http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel="+channel+"&newsid="+newsId+"&group=0&compress=0&ie=gbk&oe=gbk&page="+page+"&page_size=20&jsvar=loader_1525576000752_30189682";
            System.out.println("commenturl========"+comment_url);
            String html = HttpClientTemplateOK.get(comment_url, null, headerMap);
            if(html!=null){
                // The JSON payload is everything after the first '='.
                html = html.substring(html.indexOf("=",0)+1,html.length());
                System.out.println(html);
                JSONObject data = JSONObject.parseObject(html).getJSONObject("result");
                JSONArray jsonArray = data.getJSONArray("cmntlist");
                for(int a = 0;a<jsonArray.size();a++){
                    Map<String,Object> doc = new HashMap<String, Object>();
                    JSONObject json = jsonArray.getJSONObject(a);
                    doc.put("_id", json.getString("mid"));
                    doc.put("content", json.getString("content"));
                    doc.put("area", json.getString("area"));
                    doc.put("nick", json.getString("nick"));
                    doc.put("time", json.getString("time"));
                    doc.put("agree", json.getInteger("agree"));
                    doc.put("against", json.getInteger("against"));
                    doc.put("vote", json.getInteger("vote"));
                    doc.put("fromUrl", url);
                    System.out.println("doc==========="+doc);
                }
            }else{
                System.out.println("--------------");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the comment channel and news id from an article page.
     *
     * @param url article URL
     * @return {@code channel + "=====" + newsid}, or {@code null} when the page cannot be
     *         fetched or either value is missing
     */
    public static String getCommentId(String url){
        Map<String,String> headerMap = HeaderTool.getCommonHead();
        System.out.println(url);
        try {
            String html = HttpClientTemplateOK.get(url, null, headerMap);
            if(html!=null && html.contains("newsid")){
                String newsid = quotedValue(html, "newsid: '");
                String channel = quotedValue(html, "channel: '");
                if(newsid != null && channel != null){
                    System.out.println(channel+"============"+newsid);
                    return channel+ID_SEPARATOR+newsid;
                }
            }
        } catch (IOException e) {
            System.out.println("failed to fetch "+url+": "+e);
            return null;
        }
        return null;
    }

    /** Returns the text between {@code marker} and the following {@code ',}, or null when the marker is absent. */
    private static String quotedValue(String html, String marker) {
        int start = html.indexOf(marker);
        if(start < 0){
            return null;
        }
        start += marker.length();
        int end = html.indexOf("',", start);
        return end < 0 ? html.substring(start) : html.substring(start, end);
    }
}
...@@ -16,7 +16,7 @@ public class SouhuAccountExample { ...@@ -16,7 +16,7 @@ public class SouhuAccountExample {
@Test @Test
public void souhuAccountTest() { public void souhuAccountTest() {
List<Map<String,Object>> lists = Souhu.getSouHuAccountData("MjI5MzAyOTMyMEBzaW5hLnNvaHUuY29t","2016-01-01 00:00:00",false,null); List<Map<String,Object>> lists = Souhu.getSouHuAccountData("MjI5MzAyOTMyMEBzaW5hLnNvaHUuY29t","2018-01-01 00:00:00",false,null);
System.out.println(lists.size()); System.out.println(lists.size());
List<String> headList = new ArrayList<String>(); List<String> headList = new ArrayList<String>();
headList.add("title"); headList.add("title");
......
...@@ -8,26 +8,38 @@ import org.junit.Test; ...@@ -8,26 +8,38 @@ import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil; import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Wangyi; import com.zhiwei.parse.Wangyi;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class WangyiCommentExample { public class WangyiCommentExample {
//若出错 可能数据有重复 以id为准 //若出错 可能数据有重复 以id为准
@Test @Test
public void wangyiCommentTest() { public void wangyiCommentTest() {
PoiExcelUtil poi = PoiExcelUtil.getInstance();
Map<String,Object> map = poi.importExcel("D:\\crawlerdata\\自媒体\\网易评论采集.xlsx", 0);
List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
List<String> urlList = new ArrayList<String>(); List<String> urlList = new ArrayList<String>();
urlList.add("https://c.m.163.com/news/a/DCQ42REV05118O92.html?spss=newsapp"); for(Map<String,Object> u : list) {
urlList.add("https://c.m.163.com/news/a/DCPLJ5GB05198R91.html?spss=newsapp"); String url = u.get("链接")+"";
urlList.add("https://c.m.163.com/news/a/DCRNI7020511CPVM.html?spss=newsapp"); urlList.add(url);
}
List<Map<String,Object>> bodyList = new ArrayList<>(); List<Map<String,Object>> bodyList = new ArrayList<>();
for(String url : urlList) { for(String url : urlList) {
String id = url.split("a/")[1].split(".ht")[0]; String id = url.split("/")[url.split("/").length-1].split(".ht")[0];
System.out.println(id);
List<Map<String,Object>> lists = Wangyi.getWangyiCommentData(id,null); List<Map<String,Object>> lists = Wangyi.getWangyiCommentData(id,null);
System.out.println(lists.size()); System.out.println(url+"====="+lists.size());
if(lists != null) { if(lists != null) {
bodyList.addAll(lists); for(Map<String,Object> m : lists) {
m.put("from_url", url);
bodyList.add(m);
} }
} }
ZhiWeiTools.sleep(3000);
}
List<String> headList = new ArrayList<String>(); List<String> headList = new ArrayList<String>();
headList.add("content"); headList.add("content");
headList.add("id"); headList.add("id");
...@@ -35,9 +47,9 @@ public class WangyiCommentExample { ...@@ -35,9 +47,9 @@ public class WangyiCommentExample {
headList.add("name"); headList.add("name");
headList.add("like"); headList.add("like");
headList.add("unlike"); headList.add("unlike");
headList.add("from_url");
PoiExcelUtil poi = PoiExcelUtil.getInstance(); poi.exportExcel("D:\\crawlerdata\\自媒体\\网易评论采集.xlsx", "评论数据", headList, bodyList);
poi.exportExcel("D:\\crawlerdata\\网易评论采集-3.xlsx", "asd", headList, bodyList);
} }
......
package com.zhiwei.crawler;
import com.zhiwei.parse.Wangyi;
/**
 * Manual entry point that runs {@code Wangyi.getHistoryData} for one sample article
 * without a proxy. The returned rows are not used.
 */
public class WangyiHistoryExample {

    public static void main(String[] args) {
        final String articleUrl = "http://dy.163.com/v2/article/detail/CK4OE81O0512974K.html";
        final String endTime = "2016-07-06 00:11:54";
        Wangyi.getHistoryData(articleUrl, null, endTime);
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment