Commit 132e6350 by yangchen

天天快报历史文章采集修正

parent e5ce0110
package com.zhiwei.httpclient;
import java.io.IOException;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
......@@ -652,11 +653,28 @@ public class HeadGet {
return headerMap;
}
/**
 * Builds the HTTP request headers for requests whose Host is d.weibo.com.
 *
 * @param cookie value for the Cookie header; when {@code null} no Cookie header is added
 * @return a new, mutable map of header name to header value
 */
public static Map<String,String> getweiboHeaderMap(String cookie) {
    Map<String, String> headers = new HashMap<String, String>();

    // Fixed headers that are sent on every request.
    headers.put("Host", "d.weibo.com");
    headers.put("Connection", "keep-alive");
    headers.put("Accept","*/*");
    headers.put("Accept-Language", "zh-CN,zh;q=0.9");
    headers.put("Content-Type", "application/x-www-form-urlencoded");
    headers.put("User-Agent",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");

    // The cookie is optional: without one the map is returned as is.
    if (cookie == null) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
/**
 * Manual smoke test: fetches one d.weibo.com page with the headers from
 * {@link #getweiboHeaderMap(String)} and prints the response and its length.
 *
 * <p>Only one request is issued. The previous body declared {@code url},
 * {@code cookie}, {@code headerMap} and {@code result} twice in the same
 * scope, which does not compile; the declarations kept here are the ones
 * whose result was actually printed.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String url = "https://d.weibo.com/1087030002_2975_1003_0?pids=Pl_Core_F4RightUserList__4&page=2&ajaxpagelet=1&__ref=/1087030002_2975_1003_0&_t=FM_151825274677918";

    // null cookie: the header map is built without a Cookie header.
    Map<String,String> headerMap = HeadGet.getweiboHeaderMap(null);
    String result = HttpClient.executeHttpRequestGet(url, headerMap);

    System.out.println(result);
    // Guard in case the client hands back null instead of a body.
    if (result != null) {
        System.out.println(result.length());
    }
}
......
......@@ -33,9 +33,16 @@ public class QQKB {
Map<String,Object> paramMap = HeadGet.getQQAccountOneParamMap(child);
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
try {
String result = HttpClient.executeHttpRequestPost(url, headerMap, paramMap);
List<String> idsList = qqAccountAnalysis.getQQAllIds(result);
System.out.println(idsList.size());
String result = "";
List<String> idsList = new ArrayList<String>();
for(int i = 0;i < 3;i++) {
result = HttpClient.executeHttpRequestPost(url, headerMap, paramMap);
idsList = qqAccountAnalysis.getQQAllIds(result);
if(idsList.size() > 1) {
break;
}
}
System.out.println("此帐号可采集的历史文章数==============="+idsList.size());
url = "http://r.cnews.qq.com/getSubNewsListItems";
String ids = "";
int i = 0;
......@@ -44,6 +51,7 @@ public class QQKB {
i++;
if(i >= 20) {
try {
for(int j = 1; j < 3;j++) {
ids = ids.substring(0,ids.length()-1);
System.out.println(ids);
ZhiWeiTools.sleep(7000);
......@@ -51,9 +59,13 @@ public class QQKB {
paramMap = HeadGet.getQQAccountOtherParamMap(ids);
result = HttpClient.executeHttpRequestPost(url, headerMap, paramMap);
List<Map<String,Object>> list = qqAccountAnalysis.analysisQQAccountData(result);
if(list != null) {
dataList.addAll(list);
break;
}
ids = "";
i = 0;
}
} catch (Exception e) {
ids = "";
paramMap.clear();
......@@ -62,23 +74,26 @@ public class QQKB {
}
}
if(ids.length() > 1) {
for(int j = 1; j < 3;j++) {
ids = ids.substring(0,ids.length()-1);
ZhiWeiTools.sleep(8000);
paramMap.clear();
paramMap = HeadGet.getQQAccountOtherParamMap(ids);
result = HttpClient.executeHttpRequestPost(url, headerMap, paramMap);
List<Map<String,Object>> list = qqAccountAnalysis.analysisQQAccountData(result);
if(list != null) {
dataList.addAll(list);
break;
}
}
}
return dataList;
} catch (Exception e) {
logger.error("获取企鹅号历史文章未完全成功",e.getMessage());
e.printStackTrace();
return dataList;
}
}
/**
*
* @Description 获取天天快报评论
......
......@@ -8,24 +8,42 @@ import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.QQKB;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class QQAccountExample {
/**
 * Reads account rows from the input workbook, fetches the historical articles of
 * every account through {@code QQKB.getQQAccountData} and writes all collected
 * articles to a single output workbook.
 *
 * <p>Each input row is expected to carry the columns "帐号链接" (a link containing
 * a {@code chlid=} parameter) and "呢称"; rows whose link has no {@code chlid=}
 * part are skipped instead of aborting the whole run.
 */
@Test
public void qqAccountTest() {
    PoiExcelUtil poi = PoiExcelUtil.getInstance();
    Map<String,Object> dataMap = poi.importExcel("D://crawlerdata/天天快报历史文章采集.xlsx", 0);
    // Input rows; NOTE(review): assumes the "body" entry is a list of row maps - confirm against PoiExcelUtil.
    List<Map<String,Object>> dataList = (List<Map<String, Object>>) dataMap.get("body");
    String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=000300006218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";

    // Articles of all accounts, in input-row order.
    List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
    for (Map<String,Object> map : dataList) {
        String link = map.get("帐号链接")+"";
        String[] parts = link.split("chlid=");
        if (parts.length < 2) {
            // No account id in the link: nothing to request for this row.
            System.out.println("skip row without chlid: " + link);
            continue;
        }
        String child = parts[1];
        System.out.println(child);

        List<Map<String,Object>> lists = QQKB.getQQAccountData(child, cookie);
        if (lists != null) {
            for (Map<String,Object> map1 : lists) {
                // Tag every article with the account it came from.
                map1.put("name", map.get("呢称"));
                map1.put("主页地址", map.get("帐号链接"));
                bodyList.add(map1);
            }
        }
        System.out.println("采集到的历史文章数总和============="+bodyList.size());
        // Pause between accounts before the next request.
        ZhiWeiTools.sleep(5000);
    }
    System.out.println(dataList.size());

    // Column order of the output sheet.
    List<String> headList = new ArrayList<String>();
    headList.add("name");
    headList.add("主页地址");
    headList.add("title");
    headList.add("time");
    headList.add("content");
    headList.add("url");
    headList.add("commentid");
    // Export the collected articles (bodyList), not the input rows.
    poi.exportExcel("D://crawlerdata/天天快报采集.xlsx", "asd", headList, bodyList);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment