Commit 2c702467 by zhiwei

升级采集核心包

parent a9af9087
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>wechat</artifactId>
<version>1.1.4-SNAPSHOT</version>
<version>1.1.5-SNAPSHOT</version>
<description>
知微微信采集程序,包含
1.微信历史文章采集
......@@ -85,13 +85,13 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.2-SNAPSHOT</version>
<version>0.1.3-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.3.0-RELEASE</version>
<version>0.3.6-RELEASE</version>
<scope>provided</scope>
</dependency>
</dependencies>
......
......@@ -18,7 +18,8 @@ public class WechatAccountFans {
// private static Logger logger = LoggerFactory.getLogger(WechatAccountFans.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private Map<String,String> headerMap;
public WechatAccountFans()
......
......@@ -20,7 +20,8 @@ import com.zhiwei.wechat.entity.WechatAccount;
public class WechatAccountInfo {
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static Logger logger = LoggerFactory.getLogger(WechatAccountInfo.class);
/***
......
/**
* 获取微信文章评论
* @Title: WechatComment.java
* @Package com.zhiwei.wechat.comment
* @Description:获取微信文章评论
* @author hero
* @date 2016年6月25日 上午8:17:37
* @version V1.0
*/ /**
*
*/
package com.zhiwei.wechat.comment;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.wechat.entity.WechatComment;
import com.zhiwei.wechat.readAndLike.AriticleContent;
import com.zhiwei.wechat.util.Tools;
/**
 * Fetches the selected comments and the comment count of a WeChat article.
 *
 * <p>Both entry points are static and report failure through their return value
 * ({@code null} for the list, {@code -1} for the count) rather than by throwing.
 *
 * @author hero
 * @date 2016-06-25 08:17:37
 */
public class WechatCommentList {

    private static WechatComment wc = new WechatComment();

    private static Logger logger = LoggerFactory.getLogger(WechatCommentList.class);

    /**
     * Fetches the selected comments of the article behind {@code url}.
     *
     * <p>The article page is requested first to obtain a cookie and the
     * {@code appmsg_token}; the comment endpoint is then queried with the
     * {@code __biz} and {@code mid} values cut out of {@code url}.
     *
     * @param url article URL; expected to carry the {@code __biz=} and {@code mid=} parameters
     * @param key query-string fragment appended verbatim to the comment request
     * @return the comments built from the {@code elected_comment} array, or {@code null}
     *         when the article has no comments, a required value is missing, or any
     *         request or parsing step fails
     */
    public static List<WechatComment> getWechatCommentList(String url,String key)
    {
        if (url == null) {
            logger.warn("getWechatCommentList called with a null url");
            return null;
        }
        // URLs that already carry a key are used as-is for the cookie request.
        String urlcookie = url.contains("key") ? url : Tools.getWechatCookieUrl(url, key);
        Map<String,String> headerMap = Tools.getWechatHeader();
        try {
            Map<String, String> cookieMap = HttpClientTemplateOK.getCookie(urlcookie, null, headerMap);
            headerMap.put("Referer", url);
            String cookie = cookieMap.get("cookie");
            if (cookie == null) {
                // Previously this surfaced as a NullPointerException that was only printed.
                logger.warn("No cookie returned for {}", urlcookie);
                return null;
            }
            if (cookie.length() > 50) {
                headerMap.put("Cookie", cookie);
            }
            String appmsgToken = Tools.getAppMsgToken(cookieMap.get("htmlBody"));

            String biz = queryValue(url, "__biz=");
            String appmsgid = queryValue(url, "mid=");
            if (biz == null || appmsgid == null) {
                logger.warn("Article url lacks __biz or mid parameter: {}", url);
                return null;
            }
            String commentId = AriticleContent.getCommentId(url, key);
            if (commentId == null || appmsgToken == null) {
                return null;
            }
            if ("0".equals(commentId)) {
                logger.info("此条微信文章没有评论");
                return null;
            }
            String commentUrl = "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" + biz
                    + "&appmsgid=" + appmsgid + "&idx=1&comment_id=" + commentId + "&offset=0&limit=100" + key
                    + "&appmsg_token=" + appmsgToken;
            logger.debug("comment url: {}", commentUrl);

            String htmlBody = HttpClientTemplateOK.get(commentUrl, null, headerMap);
            if (htmlBody == null) {
                return null;
            }
            JSONObject json = JSON.parseObject(htmlBody);
            return wc.constructWechatComment(json.getJSONArray("elected_comment"), url);
        } catch (Exception e) {
            // Pass the throwable itself so the original stack trace reaches the log.
            logger.warn("解析微信文章评论列表时出现问题:", e);
            return null;
        }
    }

    /**
     * Reads the number of selected comments of the article behind {@code url}.
     *
     * @param url article URL; expected to carry the {@code __biz=} and {@code mid=} parameters
     * @param key query-string fragment appended verbatim to the requests
     * @return the {@code elected_comment_total_cnt} value, {@code 0} when the comment id
     *         is {@code "0"}, or {@code -1} when the count cannot be determined
     */
    public static int getWechatCommentCount(String url,String key)
    {
        if (url == null) {
            logger.warn("getWechatCommentCount called with a null url");
            return -1;
        }
        logger.debug("article url: {}", url);

        // The cookie request uses the URL with its fragment replaced by the key.
        String urlNew = url;
        int fragment = url.indexOf("#rd");
        if (fragment < 0) {
            fragment = url.indexOf("#wechat_redirect");
        }
        if (fragment >= 0) {
            urlNew = url.substring(0, fragment) + key;
        }

        String biz = queryValue(url, "__biz=");
        String appmsgid = queryValue(url, "mid=");
        if (biz == null || appmsgid == null) {
            // Used to escape as ArrayIndexOutOfBoundsException; -1 is this method's failure value.
            logger.warn("Article url lacks __biz or mid parameter: {}", url);
            return -1;
        }

        Map<String,String> headerMap = Tools.getWechatHeader();
        String commentId = AriticleContent.getCommentId(url, key);
        if (commentId == null) {
            logger.info("获取评论id失败");
            return -1;
        }
        if ("0".equals(commentId)) {
            logger.info("此条微信文章没有评论");
            return 0;
        }

        String commentUrl = "http://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" + biz
                + "&appmsgid=" + appmsgid + "&idx=1&comment_id=" + commentId + "&offset=0&limit=100" + key;
        try {
            Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(urlNew, null, headerMap);
            headerMap.put("Cookie", cookieMap.get("cookie"));
            String htmlBody = HttpClientTemplateOK.get(commentUrl, null, headerMap);
            logger.debug("comment response: {}", htmlBody);
            if (htmlBody != null) {
                return JSON.parseObject(htmlBody).getIntValue("elected_comment_total_cnt");
            }
        } catch (Exception e) {
            // The previous call had no placeholder, so the failure reason never reached the log.
            logger.warn("更新微信文章评论数时出现问题,问题信息:", e);
        }
        return -1;
    }

    /**
     * Returns the text between the first occurrence of {@code marker} and the next
     * {@code '&'} (or the end of {@code url}), or {@code null} when the marker is absent.
     */
    private static String queryValue(String url, String marker)
    {
        int start = url.indexOf(marker);
        if (start < 0) {
            return null;
        }
        start += marker.length();
        int end = url.indexOf('&', start);
        return end < 0 ? url.substring(start) : url.substring(start, end);
    }
}
///**
// * 获取微信文章评论
// * @Title: WechatComment.java
// * @Package com.zhiwei.wechat.comment
// * @Description:获取微信文章评论
// * @author hero
// * @date 2016年6月25日 上午8:17:37
// * @version V1.0
// */ /**
// *
// */
//package com.zhiwei.wechat.comment;
//
//import java.io.IOException;
//import java.util.List;
//import java.util.Map;
//
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//import com.alibaba.fastjson.JSON;
//import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.crawler.core.HttpBoot;
//import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//import com.zhiwei.wechat.entity.WechatComment;
//import com.zhiwei.wechat.readAndLike.AriticleContent;
//import com.zhiwei.wechat.util.Tools;
//
///**
// * @Description:获取微信文章评论
// * @author hero
// * @date 2016年6月25日 上午8:17:37
// */
//public class WechatCommentList {
//
// private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
//
// private static WechatComment wc = new WechatComment();
//
// private static Logger logger = LoggerFactory.getLogger(WechatCommentList.class);
// /**
// * 根据文章url获取文章评论列表
// * @Description:
// * @param @param url
// * @param @return
// * @return List<WechatComment> 返回类型
// */
// public static List<WechatComment> getWechatCommentList(String url,String key)
// {
// List<WechatComment> wcList = null;
// /*处理url*/
// String urlcookie = url;
// if(!url.contains("key")){
// urlcookie = Tools.getWechatCookieUrl(url, key);
// }
// // 请求头信息
// Map<String,String> headerMap = Tools.getWechatHeader();
// Map<String, String> cookieMap;
// try {
// cookieMap = HttpClientTemplateOK.getCookie(urlcookie, null, headerMap);
// headerMap.put("Referer", url);
// if(cookieMap.get("cookie").length()>50){
// headerMap.put("Cookie", cookieMap.get("cookie")+"");
// }
// String appmsg_token = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
//
// String biz = url.split("__biz=")[1].split("&")[0];
// String appmsgid = url.split("mid=")[1].split("&")[0];
// String comment_id = AriticleContent.getCommentId(url,key);
// if(comment_id!=null && appmsg_token!=null)
// {
// String comment_url = "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz
// + "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key
// + "&appmsg_token=" + appmsg_token;
// /**解析相关数据*/
// System.out.println(comment_url);
// if("0".equals(comment_id))
// {
// logger.info("此条微信文章没有评论");
// }else
// {
// try {
// String htmlBody = HttpClientTemplateOK.get(comment_url, null, headerMap);
// if(htmlBody!=null)
// {
// JSONObject json = JSON.parseObject(htmlBody);
// wcList = wc.constructWechatComment(json.getJSONArray("elected_comment"),url);
// return wcList;
// }
// } catch (Exception e) {
// logger.info("解析微信文章评论列表时出现问题:", e.fillInStackTrace());
// return null;
// }
// }
// }
// } catch (IOException e1) {
// return null;
// } catch (Exception e1) {
// e1.printStackTrace();
// }
//
// return null;
// }
//
//
// /**
// * @Title: getWechatCommentCount
// * @Description: TODO(根据微信文章地址更新微信评论数)
// * @param @param url
// * @param @param key
// * @param @return 设定文件
// * @return int 返回类型
// */
// public static int getWechatCommentCount(String url,String key)
// {
// System.out.println(url);
// /*处理url*/
// String url_new = url;
// if(url.contains("#rd"))
// {
// url_new = url.split("#rd")[0] + key;
// }else if(url.contains("#wechat_redirect"))
// {
// url_new = url.split("#wechat_redirect")[0] + key;
// }
// String biz = url.split("__biz=")[1].split("&")[0];
// String appmsgid = url.split("mid=")[1].split("&")[0];
//
// /**获取网页头信息**/
// Map<String,String> headerMap = Tools.getWechatHeader();
// /*获取评论id*/
// String comment_id = AriticleContent.getCommentId(url,key);
// if(comment_id!=null)
// {
// String comment_url = "http://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz
// + "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key;
// /**解析相关数据*/
//
// if("0".equals(comment_id))
// {
// logger.info("此条微信文章没有评论");
// return 0;
// }else
// {
// try {
// Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url_new, null,headerMap);
// headerMap.put("Cookie", cookieMap.get("cookie"));
// String htmlBody = HttpClientTemplateOK.get(comment_url, null,headerMap);
// System.out.println(htmlBody);
// if(htmlBody!=null)
// {
// JSONObject json = JSON.parseObject(htmlBody);
// return json.getIntValue("elected_comment_total_cnt");
// }
// } catch (Exception e) {
// logger.debug("更新微信文章评论数时出现问题,问题信息:",e.getMessage());
// return -1;
// }
// }
// }else
// {
// logger.info("获取评论id失败");
// return -1;
// }
// return -1;
// }
//
//
//
//}
/**
* 抓取微信公号历史文章数据
* @Title: WechatDataFromHistory.java
* @Package com.zhiwei.wechat.history
* @Description:抓取微信公号历史文章数据
* @author hero
* @date 2016年5月20日 上午10:27:19
* @version V1.0
*/ /**
*
*/
package com.zhiwei.wechat.history;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.tools.timeparse.TimeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.wechat.entity.WechatAricle;
import com.zhiwei.wechat.entity.WechatReadLike;
import com.zhiwei.wechat.readAndLike.AriticleContent;
import com.zhiwei.wechat.readAndLike.WeChatReadAndLike;
import com.zhiwei.wechat.util.Tools;
/**
* @Description:抓取微信公号历史文章数据
* @author Bewilder Z
* @date 2016年5月20日 上午10:27:19
*/
public class WechatDataFromHistory {
private static final Logger log = LogManager.getLogger(WechatDataFromHistory.class);
private boolean updateLike = false; //是否更新点赞阅读数
private Date endDate = null; //采集的结束时间
private List<WechatAricle> result; //数据总集合
private Map<String,String> headerMap; //请求头信息
private boolean follow = false; //是否关注
private String nextId; //采集下一页id
private String key; //更新点赞阅读的key
private boolean next = true; //判断是否有下一页
/**
 * Creates a collector for the history articles of an official account.
 *
 * @param updateLike whether read and like counts are fetched for every article
 * @param endDate    stop date in {@code yyyy-MM-dd} form; {@code null} falls back to {@code 2011-12-30}
 * @param follow     value stored in the {@code follow} field
 */
public WechatDataFromHistory(boolean updateLike,String endDate,
        boolean follow)
{
    // Resolve the default stop date up front so the fields are assigned exactly once.
    String stopDay = (endDate != null) ? endDate : "2011-12-30";

    this.updateLike = updateLike;
    this.follow = follow;
    this.headerMap = Tools.getWechatHeader();
    this.result = new ArrayList<WechatAricle>();
    this.endDate = TimeUtil.parseTime(stopDay, "yyyy-MM-dd");
}
/**
 * Creates a collector with default settings: no read/like update, the default
 * stop date and {@code follow} disabled.
 *
 * <p>Delegates to the three-argument constructor so that {@code result},
 * {@code headerMap} and {@code endDate} are initialised; an empty constructor
 * left them {@code null}, which made every instance method fail with a
 * {@code NullPointerException}.
 */
public WechatDataFromHistory()
{
    this(false, null, false);
}
/**
 * Checks whether a WeChat key is usable by requesting the read/like counters
 * of a fixed sample article with it.
 *
 * @param key   key to verify
 * @param proxy proxy handed to the read/like request
 * @return {@code true} when the sample article reports a read count above zero;
 *         {@code false} otherwise, including when the request throws
 */
public static boolean validateKey(String key,Proxy proxy){
    final String sampleUrl = "http://mp.weixin.qq.com/s?__biz=MzIwNDk0NzEyOQ==&mid=2247484544&idx=2&sn=f64abc4b15badd77b70ca942bc5176d3&scene=0#wechat_redirect";
    boolean valid;
    try {
        WechatReadLike counters = WeChatReadAndLike.getReadAndLike(sampleUrl, key, proxy);
        valid = counters.getRead() > 0;
    } catch (Exception e) {
        log.debug("验证微信key有效性时出现问题,问题为:{}",e.getMessage());
        valid = false;
    }
    return valid;
}
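/**
 * Collects the history articles of an official account, starting from its history page.
 *
 * <p>The first page is requested with the cookie obtained for {@code url}; further pages
 * are requested from the {@code getmsg} variant of the URL, ten entries per request,
 * until the {@code next} flag is cleared.
 *
 * @author hero
 * @param url   history page URL; when {@code updateLike} is set it is expected to contain
 *              the {@code uin} and {@code devicetype} parameters, from which the key is cut
 * @param proxy proxy used for the HTTP requests
 * @return the accumulated article list, or {@code null} when requesting the first page throws
 */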
public List<WechatAricle> getWechatDataFromHistory(String url,Proxy proxy)
{
log.info("url:::::::::{}",url);
if(updateLike)
{
key = "&uin"+url.split("uin")[1].split("devicetype")[0];
}
String firstText = null;
try {
Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url, proxy, headerMap);
//获取cookie
if(cookieMap.get("cookie")!=null){
headerMap.put("Referer", url);
headerMap.put("Cookie", cookieMap.get("cookie"));
firstText = HttpClientTemplateOK.get(url, proxy,headerMap);
}
} catch (Exception e) {
e.printStackTrace();
return null;
}
//采集下一页数据参数,并获取第一页数据
if(firstText != null){
String appToken = getFirst(firstText,proxy);
if(follow == true)
{
next = true;
}
//循环读取微信公号历史数据
int i = 1;
while(next)
{
String nextUrl = url.replace("home", "getmsg") + "&f=json&&offset=" + i*10 + "&count=10&scene=123&is_ok=1&appmsg_token="+appToken;
log.info("下一页地址:{}", nextUrl);
try {
//采集下一页数据参数,并获取此页数据
headerMap.put("Referer", nextUrl);
String nextJson = HttpClientTemplateOK.get(nextUrl, proxy,headerMap);
nextId = getNext(nextJson,proxy);
// System.out.println("nextId============"+nextId);
// if(nextId.equals("1")){
// next = true;
// }else{
// next = false;
///**
// * 抓取微信公号历史文章数据
// * @Title: WechatDataFromHistory.java
// * @Package com.zhiwei.wechat.history
// * @Description:抓取微信公号历史文章数据
// * @author hero
// * @date 2016年5月20日 上午10:27:19
// * @version V1.0
// */ /**
// *
// */
//package com.zhiwei.wechat.history;
//import java.net.Proxy;
//import java.util.ArrayList;
//import java.util.Date;
//import java.util.List;
//import java.util.Map;
//
//import org.apache.logging.log4j.LogManager;
//import org.apache.logging.log4j.Logger;
//
//import com.alibaba.fastjson.JSONArray;
//import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//import com.zhiwei.tools.timeparse.TimeUtil;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//import com.zhiwei.wechat.entity.WechatAricle;
//import com.zhiwei.wechat.entity.WechatReadLike;
//import com.zhiwei.wechat.readAndLike.AriticleContent;
//import com.zhiwei.wechat.readAndLike.WeChatReadAndLike;
//import com.zhiwei.wechat.util.Tools;
//
///**
// * @Description:抓取微信公号历史文章数据
// * @author Bewilder Z
// * @date 2016年5月20日 上午10:27:19
// */
//public class WechatDataFromHistory {
//
// private static final Logger log = LogManager.getLogger(WechatDataFromHistory.class);
//
// private boolean updateLike = false; //是否更新点赞阅读数
//
// private Date endDate = null; //采集的结束时间
//
// private List<WechatAricle> result; //数据总集合
//
// private Map<String,String> headerMap; //请求头信息
//
// private boolean follow = false; //是否关注
//
// private String nextId; //采集下一页id
//
// private String key; //更新点赞阅读的key
//
// private boolean next = true; //判断是否有下一页
//
//
// /**
// *
// * @Description:
// * @param @param updateLike 是否更新点赞数和阅读数
// * @param @param endDate 采集结束时间
// * @return
// */
// public WechatDataFromHistory(boolean updateLike,String endDate,
// boolean follow)
// {
// this.updateLike = updateLike;
// result = new ArrayList<WechatAricle>();
// headerMap = Tools.getWechatHeader();
// this.follow = follow;
// if(endDate == null)
// {
// endDate = "2011-12-30";
// }
// this.endDate = TimeUtil.parseTime(endDate, "yyyy-MM-dd");
// }
//
// public WechatDataFromHistory(){}
//
//
// /**
// * @Title: validateKey
// * @author hero
// * @Description: 验证链接是否有效
// * @param @param key
// * @param @return 设定文件
// * @return boolean 返回类型
// */
// public static boolean validateKey(String key,Proxy proxy){
// String url = "http://mp.weixin.qq.com/s?__biz=MzIwNDk0NzEyOQ==&mid=2247484544&idx=2&sn=f64abc4b15badd77b70ca942bc5176d3&scene=0#wechat_redirect";
// try {
// WechatReadLike wrl = WeChatReadAndLike.getReadAndLike(url, key,proxy);
// if(wrl.getRead()>0){
// return true;
// }else{
// return false;
// }
// } catch (Exception e) {
// log.debug("验证微信key有效性时出现问题,问题为:{}",e.getMessage());
// return false;
// }
// }
//
//
// /**
// * @Title: getWechatDataFromHistory
// * @author hero
// * @Description: 获取微信公众号历史文章
// * @param @param url
// * @param @return 设定文件
// * @return List<WechatAricle> 返回类型
// */
// public List<WechatAricle> getWechatDataFromHistory(String url,Proxy proxy)
// {
// log.info("url:::::::::{}",url);
// if(updateLike)
// {
// key = "&uin"+url.split("uin")[1].split("devicetype")[0];
// }
//
// String firstText = null;
// try {
// Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url, proxy, headerMap);
// //获取cookie
// if(cookieMap.get("cookie")!=null){
//// headerMap.put("Referer", url);
// headerMap.put("Cookie", cookieMap.get("cookie"));
// firstText = HttpClientTemplateOK.get(url, proxy,headerMap);
// }
// } catch (Exception e) {
// e.printStackTrace();
// return null;
// }
// //采集下一页数据参数,并获取第一页数据
// if(firstText != null){
// String appToken = getFirst(firstText,proxy);
// if(follow == true)
// {
// next = true;
// }
//
// //循环读取微信公号历史数据
// int i = 1;
// while(next)
// {
// String nextUrl = url.replace("home", "getmsg") + "&f=json&&offset=" + i*10 + "&count=10&scene=123&is_ok=1&appmsg_token="+appToken;
// log.info("下一页地址:{}", nextUrl);
// try {
// //采集下一页数据参数,并获取此页数据
// headerMap.put("Referer", nextUrl);
// String nextJson = HttpClientTemplateOK.get(nextUrl, proxy,headerMap);
// nextId = getNext(nextJson,proxy);
//// System.out.println("nextId============"+nextId);
//// if(nextId.equals("1")){
//// next = true;
//// }else{
//// next = false;
//// }
// ZhiWeiTools.sleep(3000);
// } catch (Exception e) {
// e.printStackTrace();
// next = false;
// }
// i++;
// }
//
// }else{
// next = false;
// }
//
// return result;
// }
//
// /***
// * 获取公号历史文章
// * @Description:
// * @param @param url
// * @param @param source
// * @param @return
// * @return List<Wechat> 返回类型
// */
// @Deprecated
// public List<WechatAricle> getWechatDataFromHistoryOld(String url,Proxy proxy)
// {
// log.info("url:::::::::{}",url);
// if(updateLike)
// {
// key = "&uin"+url.split("uin")[1].split("devicetype")[0];
// }
//
// String firstText = null;
// try {
// Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url, proxy,headerMap);
// //获取cookie
// headerMap.put("Referer", url);
// headerMap.put("Cookie", cookieMap.get("cookie"));
// firstText = HttpClientTemplateOK.get(url, proxy,headerMap);
// } catch (Exception e) {
// e.printStackTrace();
// return null;
// }
// //采集下一页数据参数,并获取第一页数据
// nextId = getFirstOld(firstText,proxy);
// boolean next = false; //判断是否有下一页
// if(follow == true)
// {
// next = true;
// }
// //循环读取微信公号历史数据
// while(next)
// {
// //没有下一页数据,结束
// if(nextId==null)
// {
// next = false;
// }else //采集下一页数据
// {
// String nextUrl = url.replace("home", "getmsg") + "&f=json&frommsgid=" + nextId + "&count=10&scene=123&is_ok=1";
// log.info("下一页地址:{}", nextUrl);
// try {
// //采集下一页数据参数,并获取此页数据
// headerMap.put("Referer", nextUrl);
// String nextJson = HttpClientTemplateOK.get(nextUrl, null,headerMap);
// nextId = getNext(nextJson,proxy);
// System.out.println("nextId-============="+nextId);
// ZhiWeiTools.sleep(3000);
// } catch (Exception e) {
// e.printStackTrace();
// next = false;
// }
//
// }
// }
//
// return result;
// }
//
//
//
//
// /**
// * @Title: getFirst
// * @Description: TODO(解析第一页数据)
// * @param @param fristText
// * @param @param source
// * @param @return 设定文件
// * @return String 返回类型
// */
// @Deprecated
// public String getFirstOld(String fristText,Proxy proxy)
// {
// fristText = fristText
// .replace("\\", "")
// .replace("'", "")
// .replace("&nbsp;", " ")
// .replace("&quot;", "\"")
// .replace("&amp;", "&")
// .replace("amp;", "")
// .replace("&#39", "'")
// .replace("&gt;", ">")
// .replace("&lt;", "<")
// .replace("&yen;", "¥")
// ;
// log.info("开始解析第一页文章");
// // 截取HTML得到有用的JSON;替换掉转义字符
// if(fristText.contains("msgList ="))
// {
// fristText = fristText.split("msgList = ")[1].split("}}]};")[0]+"}}]}";
// return getNextIdAndAnalysis(fristText,proxy);
// }
// return null;
// }
//
// /**
// * @Title: getFirst
// * @author hero
// * @Description: 截取appmsg_token 值
// * @param @param fristText
// * @param @return 设定文件
// * @return String 返回类型
// */
// private String getFirst(String fristText,Proxy proxy)
// {
// String next = null;
//
// fristText = fristText
// .replace("\\", "")
// .replace("'", "")
// .replace("&nbsp;", " ")
// .replace("&quot;", "\"")
// .replace("&amp;", "&")
// .replace("amp;", "")
// .replace("&#39", "'")
// .replace("&gt;", ">")
// .replace("&lt;", "<")
// .replace("&yen;", "¥")
// ;
// log.info("开始解析第一页文章");
//
// if(fristText.contains("window.appmsg_token = ") && fristText.contains("msgList =")){
// try {
// next = fristText.split("window.appmsg_token = \"")[1].split("\";")[0];
// fristText = fristText.split("msgList = ")[1].split("}}]};")[0]+"}}]}";
// getNextIdAndAnalysis(fristText,proxy);
// return next;
// } catch (Exception e) {
// log.info("截取下一页数据参数出现问题:{}",fristText);
// return null;
// }
// }else{
// log.info("获取下一页数据参数出现问题....{}",fristText);
// }
// return null;
// }
//
//
// /***
// * 解析微信历史文章下一页数据
// * @Description:
// * @param @param nextJosn
// * @param @param key
// * @param @param source
// * @param @return
// * @return String 返回类型
// */
// private String getNext(String nextHtml,Proxy proxy)
// {
// try {
// JSONObject nextJosn = JSONObject.parseObject(nextHtml);
// String nextText = null;
// if(null != nextJosn.getString("general_msg_list"))
// {
// nextText = nextJosn.getString("general_msg_list");
// getNextIdAndAnalysis(nextText,proxy);
// }else
// {
// log.info("下一页数据解析出现问题:{}", nextHtml);
// next = false;
// return null;
// }
// return nextJosn.getInteger("can_msg_continue")+"";
//
// } catch (Exception e) {
// log.info("解析数据有问题:{}", nextHtml);
// next = false;
// return null;
// }
//
//
// }
//
// /**
// * @Title: getNextIdAndAnalysis
// * @Description: TODO(解析下一页所需字段,及数据解析)
// * @param @param text
// * @param @param source
// * @param @return 设定文件
// * @return String 返回类型
// */
// public String getNextIdAndAnalysis(String text,Proxy proxy)
// {
// JSONObject wechatData = JSONObject.parseObject(text);
// JSONArray dataList = wechatData.getJSONArray("list");
// if(dataList.size()==0)
// {
// nextId = null;
// next = false;
// }else
// {
// for(int i = 0;i<dataList.size();i++)
// {
// JSONObject data = dataList.getJSONObject(i);
// //解析时间
// JSONObject dateJson = data.getJSONObject("comm_msg_info");
// long dateTime = dateJson.getLong("datetime");
// Date time = new Date(dateTime*1000);
// nextId = dateJson.getString("id");
// if(time.before(endDate))
// {
// next = false;
// nextId = null;
// }
// //解析文本数据
// if(null != data.getJSONObject("app_msg_ext_info"))
// {
// //解析头条数据
// JSONObject first = data.getJSONObject("app_msg_ext_info");
// String content_url = first.getString("content_url");
// String content = first.getString("digest");
// String title = first.getString("title");
// String img_url = first.getString("cover");
//
// WechatAricle wechatFirst = setWechat(content_url,title
// , time, img_url, content,"true",proxy);
// result.add(wechatFirst);
// //解析其余数据
// JSONArray otherJSON = first.getJSONArray("multi_app_msg_item_list");
// if(otherJSON != null)
// {
// for(int j = 0;j<otherJSON.size();j++)
// {
// JSONObject other = otherJSON.getJSONObject(j);
// String other_content_url = other.getString("content_url");
// String other_content = other.getString("digest");
// String other_title = other.getString("title");
// String other_img_url = other.getString("cover");
//
// WechatAricle wechatOther = setWechat(other_content_url,other_title
// , time, other_img_url, other_content,"false",proxy);
// result.add(wechatOther);
// }
// }else
// {
// log.info("只有一条数据");
// }
ZhiWeiTools.sleep(3000);
} catch (Exception e) {
e.printStackTrace();
next = false;
}
i++;
}
}else{
next = false;
}
return result;
}
/**
 * Collects history articles through the legacy {@code frommsgid} paging request.
 *
 * <p>The first page is fetched with the cookie obtained for {@code url}; further pages
 * are only requested when {@code follow} is set and a next id is available.
 *
 * <p>NOTE(review): {@code getNext} returns the {@code can_msg_continue} value as a string,
 * and that value is used here as the next {@code frommsgid}. This looks inconsistent with
 * the message id that {@code getNextIdAndAnalysis} stores — confirm before relying on
 * this method.
 *
 * @param url   history page URL; when {@code updateLike} is set it is expected to contain
 *              the {@code uin} and {@code devicetype} parameters
 * @param proxy proxy used for every HTTP request of this call
 * @return the accumulated article list, or {@code null} when requesting the first page throws
 * @deprecated a non-deprecated {@code getWechatDataFromHistory(String, Proxy)} exists in this class.
 */
@Deprecated
public List<WechatAricle> getWechatDataFromHistoryOld(String url,Proxy proxy)
{
    log.info("url:::::::::{}",url);
    if(updateLike)
    {
        key = "&uin"+url.split("uin")[1].split("devicetype")[0];
    }

    String firstText;
    try {
        Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url, proxy,headerMap);
        headerMap.put("Referer", url);
        headerMap.put("Cookie", cookieMap.get("cookie"));
        firstText = HttpClientTemplateOK.get(url, proxy,headerMap);
    } catch (Exception e) {
        log.error("Failed to request the first history page: {}", url, e);
        return null;
    }
    if (firstText == null) {
        // Without a page body there is nothing to parse; getFirstOld would throw on null.
        log.warn("First history page returned no body: {}", url);
        return result;
    }

    // Parse the first page and obtain the id used to request the following page.
    nextId = getFirstOld(firstText,proxy);

    // Further pages are requested only when following is enabled and a next id exists.
    while(follow && nextId != null)
    {
        String nextUrl = url.replace("home", "getmsg") + "&f=json&frommsgid=" + nextId + "&count=10&scene=123&is_ok=1";
        log.info("下一页地址:{}", nextUrl);
        try {
            headerMap.put("Referer", nextUrl);
            // Use the caller's proxy here as well; the first request already goes through it.
            String nextJson = HttpClientTemplateOK.get(nextUrl, proxy,headerMap);
            nextId = getNext(nextJson,proxy);
            log.debug("nextId-============={}", nextId);
            ZhiWeiTools.sleep(3000);
        } catch (Exception e) {
            log.error("Failed to request history page: {}", nextUrl, e);
            break;
        }
    }
    return result;
}
/**
 * Parses the first history page for the legacy paging flow.
 *
 * <p>Backslashes and single quotes are removed and a fixed set of HTML entities is
 * decoded; the JSON following {@code msgList = } is then handed to
 * {@code getNextIdAndAnalysis}.
 *
 * @param fristText raw HTML of the first history page
 * @param proxy     proxy passed on to {@code getNextIdAndAnalysis}
 * @return the value returned by {@code getNextIdAndAnalysis}, or {@code null} when the
 *         page does not contain {@code msgList =}
 */
@Deprecated
public String getFirstOld(String fristText,Proxy proxy)
{
    // Applied strictly in this order; later rules depend on earlier ones.
    final String[][] unescapeRules = {
        {"\\", ""},
        {"'", ""},
        {"&nbsp;", " "},
        {"&quot;", "\""},
        {"&amp;", "&"},
        {"amp;", ""},
        {"&#39", "'"},
        {"&gt;", ">"},
        {"&lt;", "<"},
        {"&yen;", "¥"}
    };
    String cleaned = fristText;
    for (String[] rule : unescapeRules) {
        cleaned = cleaned.replace(rule[0], rule[1]);
    }
    log.info("开始解析第一页文章");

    if (!cleaned.contains("msgList =")) {
        return null;
    }
    // Cut the message-list JSON out of the page and restore its closing braces.
    String afterMarker = cleaned.split("msgList = ")[1];
    String msgListJson = afterMarker.split("}}]};")[0] + "}}]}";
    return getNextIdAndAnalysis(msgListJson, proxy);
}
/**
 * Parses the first history page and extracts its {@code appmsg_token}.
 *
 * <p>The page text is unescaped, the token is cut from the
 * {@code window.appmsg_token = "..."} assignment and the JSON following
 * {@code msgList = } is handed to {@code getNextIdAndAnalysis}.
 *
 * @author hero
 * @param fristText raw HTML of the first history page
 * @param proxy     proxy passed on to {@code getNextIdAndAnalysis}
 * @return the {@code appmsg_token}, or {@code null} when either marker is missing or
 *         extraction or parsing throws
 */
private String getFirst(String fristText,Proxy proxy)
{
    String text = fristText.replace("\\", "");
    text = text.replace("'", "");
    text = text.replace("&nbsp;", " ");
    text = text.replace("&quot;", "\"");
    text = text.replace("&amp;", "&");
    text = text.replace("amp;", "");
    text = text.replace("&#39", "'");
    text = text.replace("&gt;", ">");
    text = text.replace("&lt;", "<");
    text = text.replace("&yen;", "¥");
    log.info("开始解析第一页文章");

    boolean hasToken = text.contains("window.appmsg_token = ");
    boolean hasMsgList = text.contains("msgList =");
    if (!(hasToken && hasMsgList)) {
        log.info("获取下一页数据参数出现问题....{}",text);
        return null;
    }

    try {
        String appMsgToken = text.split("window.appmsg_token = \"")[1].split("\";")[0];
        // From here on `text` holds only the message-list JSON; the failure log prints it.
        text = text.split("msgList = ")[1].split("}}]};")[0]+"}}]}";
        getNextIdAndAnalysis(text,proxy);
        return appMsgToken;
    } catch (Exception e) {
        log.info("截取下一页数据参数出现问题:{}",text);
        return null;
    }
}
/**
 * Parses the JSON of a follow-up history page.
 *
 * <p>The {@code general_msg_list} text is handed to {@code getNextIdAndAnalysis}. When
 * that field is missing or anything throws, the {@code next} flag is cleared.
 *
 * @param nextHtml response body of the follow-up page request
 * @param proxy    proxy passed on to {@code getNextIdAndAnalysis}
 * @return the {@code can_msg_continue} value rendered as a string, or {@code null} on failure
 */
private String getNext(String nextHtml,Proxy proxy)
{
    try {
        JSONObject pageJson = JSONObject.parseObject(nextHtml);
        String msgListText = pageJson.getString("general_msg_list");
        if (msgListText == null) {
            log.info("下一页数据解析出现问题:{}", nextHtml);
            next = false;
            return null;
        }
        getNextIdAndAnalysis(msgListText, proxy);
        // String.valueOf(Object) yields "null" for a missing value, as string concatenation does.
        return String.valueOf(pageJson.getInteger("can_msg_continue"));
    } catch (Exception e) {
        log.info("解析数据有问题:{}", nextHtml);
        next = false;
        return null;
    }
}
/**
 * Parses one page of the message list, appends its articles to {@code result} and
 * records the id of the last message seen.
 *
 * <p>Paging is stopped ({@code next = false}, {@code nextId = null}) when the page holds
 * no messages, when the payload has no {@code list} array, or when a message is older
 * than {@code endDate}. A message older than {@code endDate} is still added to the result.
 *
 * @param text  JSON text containing a {@code list} array of messages
 * @param proxy proxy passed on when the article details are collected
 * @return the id of the last message on the page, or {@code null} when paging should stop
 */
public String getNextIdAndAnalysis(String text,Proxy proxy)
{
    JSONObject wechatData = JSONObject.parseObject(text);
    JSONArray dataList = (wechatData == null) ? null : wechatData.getJSONArray("list");
    // A missing list is treated like an empty page instead of failing with a NullPointerException.
    if (dataList == null || dataList.isEmpty())
    {
        nextId = null;
        next = false;
        return nextId;
    }

    for (int i = 0; i < dataList.size(); i++)
    {
        JSONObject data = dataList.getJSONObject(i);

        // Publication time is delivered in seconds.
        JSONObject dateJson = data.getJSONObject("comm_msg_info");
        long dateTime = dateJson.getLong("datetime");
        Date time = new Date(dateTime*1000);
        nextId = dateJson.getString("id");
        if (time.before(endDate))
        {
            next = false;
            nextId = null;
        }

        JSONObject first = data.getJSONObject("app_msg_ext_info");
        if (first == null)
        {
            log.info("不存在相关文章......");
            continue;
        }

        // Headline article of the message.
        WechatAricle wechatFirst = setWechat(first.getString("content_url"), first.getString("title"),
                time, first.getString("cover"), first.getString("digest"), "true", proxy);
        result.add(wechatFirst);

        // Remaining articles of the same message, if any.
        JSONArray otherJSON = first.getJSONArray("multi_app_msg_item_list");
        if (otherJSON == null)
        {
            log.info("只有一条数据");
            continue;
        }
        for (int j = 0; j < otherJSON.size(); j++)
        {
            JSONObject other = otherJSON.getJSONObject(j);
            WechatAricle wechatOther = setWechat(other.getString("content_url"), other.getString("title"),
                    time, other.getString("cover"), other.getString("digest"), "false", proxy);
            result.add(wechatOther);
        }
    }
    return nextId;
}
/**
 * Builds a {@code WechatAricle} from the list data of one article and enriches it with
 * the article page content and, when {@code updateLike} is set, the read and like counts.
 *
 * @param url      article URL; also used as the article id
 * @param title    article title
 * @param datetime publication time of the message the article belongs to
 * @param imgUrl   cover image URL
 * @param content  digest from the list; replaced by the page content when the page can be read
 * @param isFirst  {@code "true"} for the headline article of a message, {@code "false"} otherwise
 * @param proxy    proxy used for the read/like request
 * @return the populated article; read and like counts are {@code -1} when they could not be fetched
 */
private WechatAricle setWechat(String url,String title,
        Date datetime,String imgUrl,String content,String isFirst,Proxy proxy)
{
    WechatAricle wechat = new WechatAricle();
    wechat.setId(url);
    wechat.setTitle(title);
    wechat.setTime(datetime);
    wechat.setImgUrl(imgUrl);
    wechat.setIsFirst(isFirst);

    // Read the article page; its content overrides the digest passed in.
    String source = null;
    Map<String,String> sacMap = AriticleContent.getAriticleContent(url);
    if(sacMap!=null)
    {
        source = sacMap.get("source");
        content = sacMap.get("content");
    }

    if(updateLike)
    {
        // Literal replacement; one pass removes every "amp;" left over from HTML escaping.
        url = url.replace("amp;", "");
        try {
            Thread.sleep(2000);
            WechatReadLike wcrl = WeChatReadAndLike.getReadAndLike(url,key,proxy);
            if (wcrl != null) {
                wechat.setLikeNum(wcrl.getLike());
                wechat.setReadNum(wcrl.getRead());
            } else {
                // Defensive: whether the helper can return null is not visible here.
                wechat.setLikeNum(-1);
                wechat.setReadNum(-1);
            }
        } catch (InterruptedException e) {
            // Restore the interrupt status so callers can observe the interruption.
            Thread.currentThread().interrupt();
            wechat.setLikeNum(-1);
            wechat.setReadNum(-1);
            log.error("获取点赞阅读数出现为题,问题:{}", e.getMessage());
        }
    }

    wechat.setContent(content);
    wechat.setSource(source);
    return wechat;
}
/**
 * Manual check: prints a sample article URL with its JSON-escaping backslashes removed.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String url = "http:\\/\\/mp.weixin.qq.com\\/s?__biz=MjM5NTU0MzI0MA==&mid=2661648551&idx=1&sn=74397ab60184beb0abd4dd3f8c62f7d3&chksm=bda7c9008ad04016a5eac88c8dd18b6bc5797ae780c56e307e11781af257a68a52b7f87dfd8e&scene=27#wechat_redirect";
    // String.replace is literal. replaceAll("\\", "") passes a lone backslash as the regex
    // and throws PatternSyntaxException.
    System.out.println(url.replace("\\", ""));
}
}
// }else
// {
// log.info("不存在相关文章......");
// }
// }
// }
// return nextId;
// }
//
//
//
// /**
// * 给实体类对象赋值
// * @Description:
// * @param @param url
// * @param @param title
// * @param @param source
// * @param @param datetime
// * @param @param key
// * @param @return
// * @return Wechat 返回类型
// */
// private WechatAricle setWechat(String url,String title,
// Date datetime,String imgUrl,String content,String isFirst,Proxy proxy)
// {
// WechatAricle wechat = new WechatAricle();
// wechat.setId(url);
// wechat.setTitle(title);
// wechat.setTime(datetime);
// wechat.setImgUrl(imgUrl);
// wechat.setIsFirst(isFirst);
// //采集文章
// String source = null;
// Map<String,String> sacMap = AriticleContent.getAriticleContent(url);
// if(sacMap!=null)
// {
// source = sacMap.get("source");
// content = sacMap.get("content");
// }
// //更新点赞阅读数
// if(updateLike)
// {
// url = url.replaceAll("amp;", "").replaceAll("amp;", "");
// try {
// Thread.sleep(2000);
// WechatReadLike wcrl = WeChatReadAndLike.getReadAndLike(url,key,proxy);
// wechat.setLikeNum(wcrl.getLike());
// wechat.setReadNum(wcrl.getRead());
// } catch (InterruptedException e) {
// wechat.setLikeNum(-1);
// wechat.setReadNum(-1);
// log.error("获取点赞阅读数出现为题,问题:{}", e.getMessage());
// }
// }
//
// wechat.setContent(content);
// wechat.setSource(source);
// return wechat;
// }
//
//
// public static void main(String[] args) {
// String url = "http:\\/\\/mp.weixin.qq.com\\/s?__biz=MjM5NTU0MzI0MA==&mid=2661648551&idx=1&sn=74397ab60184beb0abd4dd3f8c62f7d3&chksm=bda7c9008ad04016a5eac88c8dd18b6bc5797ae780c56e307e11781af257a68a52b7f87dfd8e&scene=27#wechat_redirect";
// System.out.println(url.replaceAll("\\", ""));
//
// }
//
//
//}
......@@ -17,8 +17,8 @@ import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.wechat.comment.WechatCommentList;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.wechat.util.Tools;
/**
......@@ -28,7 +28,8 @@ import com.zhiwei.wechat.util.Tools;
*/
public class AriticleContent{
private static Logger logger = LoggerFactory.getLogger(WechatCommentList.class);
private static Logger logger = LoggerFactory.getLogger(AriticleContent.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
......@@ -47,7 +48,7 @@ public class AriticleContent{
String content = null;
String source = null;
try {
String htmlBody = HttpClientTemplateOK.get(url, null,headerMap);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string();
Document document = Jsoup.parse(htmlBody);
content = document.select("div.rich_media_content").text();
if(htmlBody.contains("var nickname = ")){
......@@ -79,7 +80,7 @@ public class AriticleContent{
headerMap.put("Referer", url);
String comment_id = null;
try {
String htmlBody = HttpClientTemplateOK.get(url, null,headerMap);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string();
if(htmlBody!=null)
{
Document document = Jsoup.parse(htmlBody);
......
/**
* @Title: WindowsClient.java
* @Package com.wcral.client
* @Description: TODO(用一句话描述该文件做什么)
* @author Bewilder Z
* @date 2015年8月6日 上午9:13:37
* @version V1.0
*/
package com.zhiwei.wechat.readAndLike;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.wechat.entity.WechatReadLike;
import com.zhiwei.wechat.search.WechatAritcleSearch;
import com.zhiwei.wechat.util.Tools;
/**
 * Fetches read and like counters for WeChat articles.
 *
 * <p>Two strategies are offered: a direct request against the article's
 * statistics endpoint using a client key, and a lookup through the Sogou
 * WeChat search result page.
 *
 * @author Abner Liu
 * @date 2015-08-06 09:13:37
 */
public class WeChatReadAndLike {

    private static final Logger logger = LoggerFactory.getLogger(WeChatReadAndLike.class);

    /**
     * Requests the read and like counters of an article using a client key.
     *
     * <p>The article page is loaded first to obtain cookies and the
     * {@code appmsg_token}; both are then sent with a POST to the URL built by
     * {@code Tools.getWechatLikeUrl}. The read counter is taken from
     * {@code real_read_num}, falling back to {@code read_num} when the former
     * is missing or {@code "0"}.
     *
     * @param url   article link
     * @param key   key passed to {@code Tools.getWechatCookieUrl}
     * @param proxy proxy used for both HTTP requests
     * @return never {@code null}; read and like are both -1 when any step fails
     */
    public static WechatReadLike getReadAndLike(String url,String key,Proxy proxy){
        WechatReadLike wLike = new WechatReadLike();
        wLike.setUrl(url);
        try {
            String urlcookie = Tools.getWechatCookieUrl(url, key);
            // Request headers
            Map<String,String> headerMap = Tools.getWechatHeader();
            Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(urlcookie, proxy, headerMap);

            headerMap.put("Referer", urlcookie);
            headerMap.put("Cookie", cookieMap.get("cookie")+"");
            String appmsgToken = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
            logger.debug("appmsg_token is {}", appmsgToken);
            String urlLike = Tools.getWechatLikeUrl(urlcookie, appmsgToken);

            // POST parameters
            HashMap<String,Object> postMap = new HashMap<String,Object>();
            postMap.put("is_only_read", "1");

            // Fetch the statistics
            String htsString = HttpClientTemplateOK.post(urlLike, proxy, headerMap ,postMap);
            logger.debug("read/like response for {} is {}", url, htsString);

            JSONObject stat = JSONObject.parseObject(htsString).getJSONObject("appmsgstat");
            String likeNum = stat.get("like_num").toString();

            // Prefer real_read_num; use read_num when it is absent or reported as "0".
            Object realRead = stat.get("real_read_num");
            String readNum;
            if (realRead == null || "0".equals(realRead.toString())) {
                readNum = stat.get("read_num").toString();
            } else {
                readNum = realRead.toString();
            }

            int read = Integer.parseInt(readNum);
            int like = Integer.parseInt(likeNum);
            wLike.setRead(read);
            wLike.setLike(like);
        } catch (Exception e) {
            // Best effort: callers recognise failure by the -1 markers, but keep a trace of why.
            logger.warn("failed to fetch read/like numbers for {}", url, e);
            wLike.setRead(-1);
            wLike.setLike(-1);
        }
        return wLike;
    }

    /**
     * Looks up the read counter of an article through the Sogou WeChat search page.
     *
     * <p>The first result page for {@code word} is scanned; the entry whose link
     * equals {@code link} supplies the read counter. The like counter is never set
     * by this method.
     *
     * @param word search keyword (URL-encoded as UTF-8 before use)
     * @param time date used for both the {@code ft} and {@code et} query
     *             parameters; anything after the first space is discarded
     * @param link article link to match against the result entries
     * @param wxId account id, sent as {@code usip} and used to resolve the open id
     * @return the entity (url and read are set only when a result matched), or
     *         {@code null} when the request or the parsing throws
     */
    public static WechatReadLike getReadAndLike(String word,
            String time,String link,String wxId){
        WechatReadLike wLike = new WechatReadLike();

        Map<String,String> headerMap = new HashMap<String,String>();
        headerMap.put("Upgrade-Insecure-Requests", "1");
        headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36");
        headerMap.put("Host","weixin.sogou.com");

        if(time.contains(" "))
        {
            time = time.split(" ")[0];
        }

        String openid = WechatAritcleSearch.getOpenId(wxId,null);
        logger.info("openid is {}", openid);

        try {
            String url = "http://weixin.sogou.com/weixin?query=" + URLEncoder.encode(word,"utf-8")
                    + "&type=2&ie=utf8&page=1&interation=&tsn=5&ft="+time + "&et="+ time
                    + "&wxid="+openid+"&usip="+wxId+"&from=tool";
            logger.info("url is {}",url);

            String htmlBody = HttpClientTemplateOK.get(url, null, headerMap);
            if (htmlBody == null) {
                return wLike;
            }

            // Parse the result list
            Document document = Jsoup.parse(htmlBody);
            Elements elements = document.select("div.news-box")
                    .select("ul.news-list").select("li");
            for (Element element : elements)
            {
                try {
                    String urlLink = element.select("div.txt-box").select("h3 >a").attr("href");
                    int readNum;
                    try {
                        readNum = Integer.parseInt(element.select("div.txt-box")
                                .select("div.s-p").select("span.s1").text().trim());
                        logger.info("readNum is {}", readNum);
                    } catch (NumberFormatException e) {
                        // No numeric counter on this entry; treat it as zero.
                        readNum = 0;
                    }
                    if(urlLink.contains("&chksm="))
                    {
                        // Drop the chksm parameter but keep everything from "&3rd" onwards.
                        urlLink = urlLink.split("&chksm=")[0] + "&3rd" + urlLink.split("&3rd")[1];
                    }
                    if(link.equals(urlLink))
                    {
                        wLike.setUrl(link);
                        wLike.setRead(readNum);
                        break;
                    }
                } catch (Exception e) {
                    // A malformed entry must not abort the scan; move on to the next one.
                    logger.debug("skipping unparsable search result entry", e);
                }
            }
        } catch (Exception e) {
            logger.error("failed to fetch read number from sogou for {}", link, e);
            return null;
        }
        return wLike;
    }
}
///**
// * @Title: WindowsClient.java
// * @Package com.wcral.client
// * @Description: TODO(用一句话描述该文件做什么)
// * @author Bewilder Z
// * @date 2015年8月6日 上午9:13:37
// * @version V1.0
// */
//
//package com.zhiwei.wechat.readAndLike;
//
//import java.net.Proxy;
//import java.net.URLEncoder;
//import java.util.HashMap;
//import java.util.Map;
//
//import org.jsoup.Jsoup;
//import org.jsoup.nodes.Document;
//import org.jsoup.nodes.Element;
//import org.jsoup.select.Elements;
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//import com.zhiwei.wechat.entity.WechatReadLike;
//import com.zhiwei.wechat.search.WechatAritcleSearch;
//import com.zhiwei.wechat.util.Tools;
//
///**
// * @ClassName: WindowsClient
// * @Description: TODO(利用windows客戶端進行点赞阅读抓取)
// * @author Abner Liu
// * @date 2015年8月6日 上午9:13:37
// */
//public class WeChatReadAndLike {
//
//
// private static Logger logger = LoggerFactory.getLogger(WeChatReadAndLike.class);
//
// /**
// *
// * @Title: getReadAndLike
// * @Description: 利用windows客戶端進行点赞阅读抓取
// * @param url
// * 微信文章链接
// * @return WeChatReadLike 微信文章实体类
// *
// */
// public static WechatReadLike getReadAndLike(String url,String key,Proxy proxy){
// WechatReadLike wLike = new WechatReadLike();
// try {
// String urlcookie = Tools.getWechatCookieUrl(url, key);
// // 请求头信息
// Map<String,String> headerMap = Tools.getWechatHeader();
// Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(urlcookie, proxy, headerMap);
//
// headerMap.put("Referer", urlcookie);
// headerMap.put("Cookie", cookieMap.get("cookie")+"");
// String appmsg_token = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
// System.out.println("appmsg_token==========="+appmsg_token);
// String urlLike = Tools.getWechatLikeUrl(urlcookie,appmsg_token);
// //设置post请求参数
// HashMap<String,Object> postMap = new HashMap<String,Object>();
// postMap.put("is_only_read", "1");
//
// //获取数据
// String htsString = HttpClientTemplateOK.post(urlLike, proxy, headerMap ,postMap);
// System.out.println(htsString);
// JSONObject jsonObject = JSONObject.parseObject(htsString);
// String like_num = jsonObject.getJSONObject("appmsgstat")
// .get("like_num").toString();
//
// String real_read_num = "";
// try {
// real_read_num = jsonObject.getJSONObject("appmsgstat")
// .get("real_read_num").toString();
// if(real_read_num.equals("0"))
// {
// real_read_num = jsonObject.getJSONObject("appmsgstat")
// .get("read_num").toString();
// }
// } catch (Exception e) {
// real_read_num = jsonObject.getJSONObject("appmsgstat")
// .get("read_num").toString();
// }
// wLike.setUrl(url);
// wLike.setRead(Integer.valueOf(real_read_num));
// wLike.setLike(Integer.valueOf(like_num));
// } catch (Exception e) {
// wLike.setUrl(url);
// wLike.setRead(-1);
// wLike.setLike(-1);
// }
// return wLike;
// }
//
//
//
// /**
// * @Title: getReadAndLike
// * @Description: TODO(通过搜狗微信获取阅读数)
// * @param @param word
// * @param @param time
// * @param @param link
// * @param @param wxId
// * @param @return 设定文件
// * @return WeChatReadLike 返回类型
// */
// public static WechatReadLike getReadAndLike(String word,
// String time,String link,String wxId){
//
// WechatReadLike wLike = new WechatReadLike();
//
// Map<String,String> headerMap = new HashMap<String,String>();
// headerMap.put("Upgrade-Insecure-Requests", "1");
// headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36");
// headerMap.put("Host","weixin.sogou.com");
//
// if(time.contains(" "))
// {
// time = time.split(" ")[0];
// }
//
// String openid = WechatAritcleSearch.getOpenId(wxId,null);
// logger.info("openid is {}", openid);
//
// try {
// String url = "http://weixin.sogou.com/weixin?query=" + URLEncoder.encode(word,"utf-8")
// + "&type=2&ie=utf8&page=1&interation=&tsn=5&ft="+time + "&et="+ time
// + "&wxid="+openid+"&usip="+wxId+"&from=tool";
//
// logger.info("url is {}",url);
//
// String htmlBody = HttpClientTemplateOK.get(url, null, headerMap);
// if(htmlBody!=null)
// {
// try {
// // 解析数据
// Document document = Jsoup.parse(htmlBody);
// Elements elements = document.select("div.news-box")
// .select("ul.news-list").select("li");
// for (Element element : elements)
// {
// try {
// String url_link = element.select("div.txt-box").select("h3 >a").attr("href");
// int readNum = 0;
// try {
// readNum = Integer.valueOf(element.select("div.txt-box")
// .select("div.s-p").select("span.s1").text().trim());
// logger.info("readNum is {}", readNum);
// } catch (Exception e) {
// readNum = 0;
// }
// if(url_link.contains("&chksm="))
// {
// url_link = url_link.split("&chksm=")[0] + "&3rd" + url_link.split("&3rd")[1];
// }
//
// if(link.equals(url_link))
// {
// wLike.setUrl(link);
// wLike.setRead(readNum);
// break;
// }
// } catch (Exception e) {
// continue;
// }
// }
// } catch (Exception e) {
// wLike.setUrl(link);
// wLike.setRead(0);
// return null;
// }
// }
// } catch (Exception e) {
// e.printStackTrace();
// wLike.setUrl(link);
// wLike.setRead(0);
// return null;
// }
// return wLike;
// }
//
//}
......@@ -35,8 +35,8 @@ import com.zhiwei.wechat.entity.WechatAricle;
public class WechatAritcleSearch {
private static Logger logger = LogManager.getLogger(WechatAritcleSearch.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
*
* @Title: wechatKeywordSearch
......
......@@ -13,7 +13,7 @@ import com.zhiwei.crawler.utils.RequestUtils;
public class WechatCount {
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
public static int getWechatCountByWord(String word, String cookie,
String startTime, String endTime, Proxy proxy) {
......
......@@ -5,7 +5,8 @@ import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
......@@ -17,7 +18,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
*/
public class WechatIndex {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
public static void main(String[] args) throws Exception {
......@@ -53,7 +54,7 @@ public class WechatIndex {
headerMap.put("Accept","application/json, text/javascript, */*; q=0.01");
headerMap.put("Cookie","mmsearch_user_key=AStrb5tD4ruSixIDu1cVpTA=; pass_ticket=bbP7ZT5xEUrYe+oOa6ACUw+mgR05TAGGA1P9xnC7fIyaaOnwkWyNQK8aYtva+Gxj; pgv_pvi=4102772736; pgv_si=s1607859200; pgv_pvid=153672700");
String htmlBody = HttpClientTemplateOK.get(url, null,headerMap);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string();
System.out.println(htmlBody);
Thread.sleep(3000);
......
/**
* @Title: WechatDataFromHistoryExample.java
* @Package com.zhiwei.wechat.example
* @Description:微信采集历史文章测试
* @author hero
* @date 2016年5月20日 下午5:47:56
* @version V1.0
*/
/**
*
*/
package com.zhiwei.wechat.example;
import java.util.ArrayList;
import java.util.List;
import com.zhiwei.wechat.entity.WechatAricle;
import com.zhiwei.wechat.history.WechatDataFromHistory;
/**
 * Manual example that crawls the history articles of WeChat accounts.
 *
 * <p>Each entry of the URL list is split on {@code ','} and its first field is
 * passed to {@code WechatDataFromHistory.getWechatDataFromHistory}; the number
 * of returned articles is printed.
 *
 * @author hero
 * @date 2016-05-20 17:47:56
 */
public class WechatDataFromHistoryExample {

    /**
     * Runs the crawl for the hard-coded history URL and prints progress to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        boolean updateLike = false;
        boolean follow = true;
        String endDate = "2017-01-27";
        try {
            List<String> urllist = new ArrayList<String>();
            urllist.add("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MjM5NTU0MzI0MA==&scene=124&uin=MTE4OTQyMDc0MQ%3D%3D&key=df62f0a2a8b7732dca2d1f886b5bd15c398e1fe92940e352837738ea99e5ddc531fc24d5d57a5a43eab11df1e4db7db80aeeddfc06c8f410e159d80df4f822c07c555b4b536b52593f132f39c6868698&devicetype=Windows+8&version=6203005d&lang=zh_CN&a8scene=7&pass_ticket=nMJ5n97UE%2BxdJKqeKp3ovi8slnCMNSYF6Tu%2FgsQ4Phk%2Bc%2B%2BDM5AQy7LT6H%2BBQTc5&winzoom=1");
            System.out.println(urllist.size());
            int i = 0;
            for (String s : urllist) {
                System.out.println("i===========" + i);
                // Advance the progress counter; it previously stayed at 0 for every entry.
                i++;
                // Only the first comma-separated field (the link) is used.
                String url = s.split(",")[0];

                WechatDataFromHistory wdfh = new WechatDataFromHistory(updateLike,endDate,follow);
                System.out.println(url);
                List<WechatAricle> list = wdfh.getWechatDataFromHistory(url,null);
                System.out.println("list size is :" + list.size());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
///**
// * @Title: WechatDataFromHistoryExample.java
// * @Package com.zhiwei.wechat.example
// * @Description:微信采集历史文章测试
// * @author hero
// * @date 2016年5月20日 下午5:47:56
// * @version V1.0
// */
///**
// *
// */
//package com.zhiwei.wechat.example;
//
//import java.util.ArrayList;
//import java.util.List;
//
//import com.zhiwei.wechat.entity.WechatAricle;
//import com.zhiwei.wechat.history.WechatDataFromHistory;
//
///**
// * @Description:微信采集历史文章测试
// * @author hero
// * @date 2016年5月20日 下午5:47:56
// */
//public class WechatDataFromHistoryExample {
//
// public static void main(String[] args) {
// boolean updateLike = false;
// boolean follow = true;
// String endDate = "2017-01-27";
// try {
// List<String> urllist = new ArrayList<String>();
// urllist.add("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MjM5NTU0MzI0MA==&scene=124&uin=MTE4OTQyMDc0MQ%3D%3D&key=df62f0a2a8b7732dca2d1f886b5bd15c398e1fe92940e352837738ea99e5ddc531fc24d5d57a5a43eab11df1e4db7db80aeeddfc06c8f410e159d80df4f822c07c555b4b536b52593f132f39c6868698&devicetype=Windows+8&version=6203005d&lang=zh_CN&a8scene=7&pass_ticket=nMJ5n97UE%2BxdJKqeKp3ovi8slnCMNSYF6Tu%2FgsQ4Phk%2Bc%2B%2BDM5AQy7LT6H%2BBQTc5&winzoom=1");
// System.out.println(urllist.size());
// int i = 0;
// for (String s : urllist) {
// System.out.println("i===========" + i);
// String url = s.split(",")[0];
//// String source = s.split(",")[1];
//
// WechatDataFromHistory wdfh = new WechatDataFromHistory(updateLike,endDate,follow);
// System.out.println(url);
// List<WechatAricle> list = wdfh.getWechatDataFromHistory(url,null);
// System.out.println("list size is :" + list.size());
//
// }
// } catch (Exception e) {
// e.printStackTrace();
// }
// }
//
//
//}
......@@ -40,13 +40,11 @@ public class WechatSearchExample{
public static void wechatSearchExample() throws UnknownHostException
{
List<String> wordList = new ArrayList<String>();
wordList.add("工业互联网");
String idOrName = "吴晓波频道";
wordList.add("京东");
for(String word : wordList)
{
try {
List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearchByAccount(word, idOrName, "2017-12-01", "2018-12-01", ProxyHolder.SOUGOU_INNER_PROXY);
List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, null,"2019-04-08", "2019-04-08", ProxyHolder.SOUGOU_INNER_PROXY.getProxy());
System.out.println("======"+list.size());
for(WechatAricle wechat : list){
System.out.println(wechat.getTitle());
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment