Commit 2c702467 by zhiwei

升级采集核心包

parent a9af9087
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>wechat</artifactId> <artifactId>wechat</artifactId>
<version>1.1.4-SNAPSHOT</version> <version>1.1.5-SNAPSHOT</version>
<description> <description>
知微微信采集程序,包含 知微微信采集程序,包含
1.微信历史文章采集 1.微信历史文章采集
...@@ -85,13 +85,13 @@ ...@@ -85,13 +85,13 @@
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.1.2-SNAPSHOT</version> <version>0.1.3-SNAPSHOT</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.3.0-RELEASE</version> <version>0.3.6-RELEASE</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
</dependencies> </dependencies>
......
...@@ -18,7 +18,8 @@ public class WechatAccountFans { ...@@ -18,7 +18,8 @@ public class WechatAccountFans {
// private static Logger logger = LoggerFactory.getLogger(WechatAccountFans.class); // private static Logger logger = LoggerFactory.getLogger(WechatAccountFans.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private Map<String,String> headerMap; private Map<String,String> headerMap;
public WechatAccountFans() public WechatAccountFans()
......
...@@ -20,7 +20,8 @@ import com.zhiwei.wechat.entity.WechatAccount; ...@@ -20,7 +20,8 @@ import com.zhiwei.wechat.entity.WechatAccount;
public class WechatAccountInfo { public class WechatAccountInfo {
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static Logger logger = LoggerFactory.getLogger(WechatAccountInfo.class); private static Logger logger = LoggerFactory.getLogger(WechatAccountInfo.class);
/*** /***
......
/** ///**
* 获取微信文章评论 // * 获取微信文章评论
* @Title: WechatComment.java // * @Title: WechatComment.java
* @Package com.zhiwei.wechat.comment // * @Package com.zhiwei.wechat.comment
* @Description:获取微信文章评论 // * @Description:获取微信文章评论
* @author hero // * @author hero
* @date 2016年6月25日 上午8:17:37 // * @date 2016年6月25日 上午8:17:37
* @version V1.0 // * @version V1.0
*/ /** // */ /**
* // *
*/ // */
package com.zhiwei.wechat.comment; //package com.zhiwei.wechat.comment;
//
import java.io.IOException; //import java.io.IOException;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.slf4j.Logger; //import org.slf4j.Logger;
import org.slf4j.LoggerFactory; //import org.slf4j.LoggerFactory;
//
import com.alibaba.fastjson.JSON; //import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject; //import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK; //import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.wechat.entity.WechatComment; //import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.wechat.readAndLike.AriticleContent; //import com.zhiwei.wechat.entity.WechatComment;
import com.zhiwei.wechat.util.Tools; //import com.zhiwei.wechat.readAndLike.AriticleContent;
//import com.zhiwei.wechat.util.Tools;
/** //
* @Description:获取微信文章评论 ///**
* @author hero // * @Description:获取微信文章评论
* @date 2016年6月25日 上午8:17:37 // * @author hero
*/ // * @date 2016年6月25日 上午8:17:37
public class WechatCommentList { // */
//public class WechatCommentList {
private static WechatComment wc = new WechatComment(); //
// private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static Logger logger = LoggerFactory.getLogger(WechatCommentList.class); //
/** // private static WechatComment wc = new WechatComment();
* 根据文章url获取文章评论列表 //
* @Description: // private static Logger logger = LoggerFactory.getLogger(WechatCommentList.class);
* @param @param url // /**
* @param @return // * 根据文章url获取文章评论列表
* @return List<WechatComment> 返回类型 // * @Description:
*/ // * @param @param url
public static List<WechatComment> getWechatCommentList(String url,String key) // * @param @return
{ // * @return List<WechatComment> 返回类型
List<WechatComment> wcList = null; // */
/*处理url*/ // public static List<WechatComment> getWechatCommentList(String url,String key)
String urlcookie = url; // {
if(!url.contains("key")){ // List<WechatComment> wcList = null;
urlcookie = Tools.getWechatCookieUrl(url, key); // /*处理url*/
} // String urlcookie = url;
// 请求头信息 // if(!url.contains("key")){
Map<String,String> headerMap = Tools.getWechatHeader(); // urlcookie = Tools.getWechatCookieUrl(url, key);
Map<String, String> cookieMap; // }
try { // // 请求头信息
cookieMap = HttpClientTemplateOK.getCookie(urlcookie, null, headerMap); // Map<String,String> headerMap = Tools.getWechatHeader();
headerMap.put("Referer", url); // Map<String, String> cookieMap;
if(cookieMap.get("cookie").length()>50){ // try {
headerMap.put("Cookie", cookieMap.get("cookie")+""); // cookieMap = HttpClientTemplateOK.getCookie(urlcookie, null, headerMap);
} // headerMap.put("Referer", url);
String appmsg_token = Tools.getAppMsgToken(cookieMap.get("htmlBody")); // if(cookieMap.get("cookie").length()>50){
// headerMap.put("Cookie", cookieMap.get("cookie")+"");
String biz = url.split("__biz=")[1].split("&")[0]; // }
String appmsgid = url.split("mid=")[1].split("&")[0]; // String appmsg_token = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
String comment_id = AriticleContent.getCommentId(url,key); //
if(comment_id!=null && appmsg_token!=null) // String biz = url.split("__biz=")[1].split("&")[0];
{ // String appmsgid = url.split("mid=")[1].split("&")[0];
String comment_url = "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz // String comment_id = AriticleContent.getCommentId(url,key);
+ "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key // if(comment_id!=null && appmsg_token!=null)
+ "&appmsg_token=" + appmsg_token; // {
/**解析相关数据*/ // String comment_url = "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz
System.out.println(comment_url); // + "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key
if("0".equals(comment_id)) // + "&appmsg_token=" + appmsg_token;
{ // /**解析相关数据*/
logger.info("此条微信文章没有评论"); // System.out.println(comment_url);
}else // if("0".equals(comment_id))
{ // {
try { // logger.info("此条微信文章没有评论");
String htmlBody = HttpClientTemplateOK.get(comment_url, null, headerMap); // }else
if(htmlBody!=null) // {
{ // try {
JSONObject json = JSON.parseObject(htmlBody); // String htmlBody = HttpClientTemplateOK.get(comment_url, null, headerMap);
wcList = wc.constructWechatComment(json.getJSONArray("elected_comment"),url); // if(htmlBody!=null)
return wcList; // {
} // JSONObject json = JSON.parseObject(htmlBody);
} catch (Exception e) { // wcList = wc.constructWechatComment(json.getJSONArray("elected_comment"),url);
logger.info("解析微信文章评论列表时出现问题:", e.fillInStackTrace()); // return wcList;
return null; // }
} // } catch (Exception e) {
} // logger.info("解析微信文章评论列表时出现问题:", e.fillInStackTrace());
} // return null;
} catch (IOException e1) { // }
return null; // }
} catch (Exception e1) { // }
e1.printStackTrace(); // } catch (IOException e1) {
} // return null;
// } catch (Exception e1) {
return null; // e1.printStackTrace();
} // }
//
// return null;
/** // }
* @Title: getWechatCommentCount //
* @Description: TODO(根据微信文章地址更新微信评论数) //
* @param @param url // /**
* @param @param key // * @Title: getWechatCommentCount
* @param @return 设定文件 // * @Description: TODO(根据微信文章地址更新微信评论数)
* @return int 返回类型 // * @param @param url
*/ // * @param @param key
public static int getWechatCommentCount(String url,String key) // * @param @return 设定文件
{ // * @return int 返回类型
System.out.println(url); // */
/*处理url*/ // public static int getWechatCommentCount(String url,String key)
String url_new = url; // {
if(url.contains("#rd")) // System.out.println(url);
{ // /*处理url*/
url_new = url.split("#rd")[0] + key; // String url_new = url;
}else if(url.contains("#wechat_redirect")) // if(url.contains("#rd"))
{ // {
url_new = url.split("#wechat_redirect")[0] + key; // url_new = url.split("#rd")[0] + key;
} // }else if(url.contains("#wechat_redirect"))
String biz = url.split("__biz=")[1].split("&")[0]; // {
String appmsgid = url.split("mid=")[1].split("&")[0]; // url_new = url.split("#wechat_redirect")[0] + key;
// }
/**获取网页头信息**/ // String biz = url.split("__biz=")[1].split("&")[0];
Map<String,String> headerMap = Tools.getWechatHeader(); // String appmsgid = url.split("mid=")[1].split("&")[0];
/*获取评论id*/ //
String comment_id = AriticleContent.getCommentId(url,key); // /**获取网页头信息**/
if(comment_id!=null) // Map<String,String> headerMap = Tools.getWechatHeader();
{ // /*获取评论id*/
String comment_url = "http://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz // String comment_id = AriticleContent.getCommentId(url,key);
+ "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key; // if(comment_id!=null)
/**解析相关数据*/ // {
// String comment_url = "http://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz
if("0".equals(comment_id)) // + "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key;
{ // /**解析相关数据*/
logger.info("此条微信文章没有评论"); //
return 0; // if("0".equals(comment_id))
}else // {
{ // logger.info("此条微信文章没有评论");
try { // return 0;
Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url_new, null,headerMap); // }else
headerMap.put("Cookie", cookieMap.get("cookie")); // {
String htmlBody = HttpClientTemplateOK.get(comment_url, null,headerMap); // try {
System.out.println(htmlBody); // Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url_new, null,headerMap);
if(htmlBody!=null) // headerMap.put("Cookie", cookieMap.get("cookie"));
{ // String htmlBody = HttpClientTemplateOK.get(comment_url, null,headerMap);
JSONObject json = JSON.parseObject(htmlBody); // System.out.println(htmlBody);
return json.getIntValue("elected_comment_total_cnt"); // if(htmlBody!=null)
} // {
} catch (Exception e) { // JSONObject json = JSON.parseObject(htmlBody);
logger.debug("更新微信文章评论数时出现问题,问题信息:",e.getMessage()); // return json.getIntValue("elected_comment_total_cnt");
return -1; // }
} // } catch (Exception e) {
} // logger.debug("更新微信文章评论数时出现问题,问题信息:",e.getMessage());
}else // return -1;
{ // }
logger.info("获取评论id失败"); // }
return -1; // }else
} // {
return -1; // logger.info("获取评论id失败");
} // return -1;
// }
// return -1;
// }
} //
//
//
//}
/** ///**
* 抓取微信公号历史文章数据 // * 抓取微信公号历史文章数据
* @Title: WechatDataFromHistory.java // * @Title: WechatDataFromHistory.java
* @Package com.zhiwei.wechat.history // * @Package com.zhiwei.wechat.history
* @Description:抓取微信公号历史文章数据 // * @Description:抓取微信公号历史文章数据
* @author hero // * @author hero
* @date 2016年5月20日 上午10:27:19 // * @date 2016年5月20日 上午10:27:19
* @version V1.0 // * @version V1.0
*/ /** // */ /**
* // *
*/ // */
package com.zhiwei.wechat.history; //package com.zhiwei.wechat.history;
import java.net.Proxy; //import java.net.Proxy;
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.Date; //import java.util.Date;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.apache.logging.log4j.LogManager; //import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; //import org.apache.logging.log4j.Logger;
//
import com.alibaba.fastjson.JSONArray; //import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; //import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK; //import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.tools.timeparse.TimeUtil; //import com.zhiwei.tools.timeparse.TimeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools; //import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.wechat.entity.WechatAricle; //import com.zhiwei.wechat.entity.WechatAricle;
import com.zhiwei.wechat.entity.WechatReadLike; //import com.zhiwei.wechat.entity.WechatReadLike;
import com.zhiwei.wechat.readAndLike.AriticleContent; //import com.zhiwei.wechat.readAndLike.AriticleContent;
import com.zhiwei.wechat.readAndLike.WeChatReadAndLike; //import com.zhiwei.wechat.readAndLike.WeChatReadAndLike;
import com.zhiwei.wechat.util.Tools; //import com.zhiwei.wechat.util.Tools;
//
/** ///**
* @Description:抓取微信公号历史文章数据 // * @Description:抓取微信公号历史文章数据
* @author Bewilder Z // * @author Bewilder Z
* @date 2016年5月20日 上午10:27:19 // * @date 2016年5月20日 上午10:27:19
*/ // */
public class WechatDataFromHistory { //public class WechatDataFromHistory {
//
private static final Logger log = LogManager.getLogger(WechatDataFromHistory.class); // private static final Logger log = LogManager.getLogger(WechatDataFromHistory.class);
//
private boolean updateLike = false; //是否更新点赞阅读数 // private boolean updateLike = false; //是否更新点赞阅读数
//
private Date endDate = null; //采集的结束时间 // private Date endDate = null; //采集的结束时间
//
private List<WechatAricle> result; //数据总集合 // private List<WechatAricle> result; //数据总集合
//
private Map<String,String> headerMap; //请求头信息 // private Map<String,String> headerMap; //请求头信息
//
private boolean follow = false; //是否关注 // private boolean follow = false; //是否关注
//
private String nextId; //采集下一页id // private String nextId; //采集下一页id
//
private String key; //更新点赞阅读的key // private String key; //更新点赞阅读的key
//
private boolean next = true; //判断是否有下一页 // private boolean next = true; //判断是否有下一页
//
//
/** // /**
* // *
* @Description: // * @Description:
* @param @param updateLike 是否更新点赞数和阅读数 // * @param @param updateLike 是否更新点赞数和阅读数
* @param @param endDate 采集结束时间 // * @param @param endDate 采集结束时间
* @return // * @return
*/ // */
public WechatDataFromHistory(boolean updateLike,String endDate, // public WechatDataFromHistory(boolean updateLike,String endDate,
boolean follow) // boolean follow)
{ // {
this.updateLike = updateLike; // this.updateLike = updateLike;
result = new ArrayList<WechatAricle>(); // result = new ArrayList<WechatAricle>();
headerMap = Tools.getWechatHeader(); // headerMap = Tools.getWechatHeader();
this.follow = follow; // this.follow = follow;
if(endDate == null) // if(endDate == null)
{ // {
endDate = "2011-12-30"; // endDate = "2011-12-30";
} // }
this.endDate = TimeUtil.parseTime(endDate, "yyyy-MM-dd"); // this.endDate = TimeUtil.parseTime(endDate, "yyyy-MM-dd");
} // }
//
public WechatDataFromHistory(){} // public WechatDataFromHistory(){}
//
//
/** // /**
* @Title: validateKey // * @Title: validateKey
* @author hero // * @author hero
* @Description: 验证链接是否有效 // * @Description: 验证链接是否有效
* @param @param key // * @param @param key
* @param @return 设定文件 // * @param @return 设定文件
* @return boolean 返回类型 // * @return boolean 返回类型
*/ // */
public static boolean validateKey(String key,Proxy proxy){ // public static boolean validateKey(String key,Proxy proxy){
String url = "http://mp.weixin.qq.com/s?__biz=MzIwNDk0NzEyOQ==&mid=2247484544&idx=2&sn=f64abc4b15badd77b70ca942bc5176d3&scene=0#wechat_redirect"; // String url = "http://mp.weixin.qq.com/s?__biz=MzIwNDk0NzEyOQ==&mid=2247484544&idx=2&sn=f64abc4b15badd77b70ca942bc5176d3&scene=0#wechat_redirect";
try { // try {
WechatReadLike wrl = WeChatReadAndLike.getReadAndLike(url, key,proxy); // WechatReadLike wrl = WeChatReadAndLike.getReadAndLike(url, key,proxy);
if(wrl.getRead()>0){ // if(wrl.getRead()>0){
return true; // return true;
}else{ // }else{
return false; // return false;
} // }
} catch (Exception e) { // } catch (Exception e) {
log.debug("验证微信key有效性时出现问题,问题为:{}",e.getMessage()); // log.debug("验证微信key有效性时出现问题,问题为:{}",e.getMessage());
return false; // return false;
} // }
} // }
//
//
/** // /**
* @Title: getWechatDataFromHistory // * @Title: getWechatDataFromHistory
* @author hero // * @author hero
* @Description: 获取微信公众号历史文章 // * @Description: 获取微信公众号历史文章
* @param @param url // * @param @param url
* @param @return 设定文件 // * @param @return 设定文件
* @return List<WechatAricle> 返回类型 // * @return List<WechatAricle> 返回类型
*/ // */
public List<WechatAricle> getWechatDataFromHistory(String url,Proxy proxy) // public List<WechatAricle> getWechatDataFromHistory(String url,Proxy proxy)
{ // {
log.info("url:::::::::{}",url); // log.info("url:::::::::{}",url);
if(updateLike) // if(updateLike)
{ // {
key = "&uin"+url.split("uin")[1].split("devicetype")[0]; // key = "&uin"+url.split("uin")[1].split("devicetype")[0];
} // }
//
String firstText = null; // String firstText = null;
try { // try {
Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url, proxy, headerMap); // Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url, proxy, headerMap);
//获取cookie // //获取cookie
if(cookieMap.get("cookie")!=null){ // if(cookieMap.get("cookie")!=null){
headerMap.put("Referer", url); //// headerMap.put("Referer", url);
headerMap.put("Cookie", cookieMap.get("cookie")); // headerMap.put("Cookie", cookieMap.get("cookie"));
firstText = HttpClientTemplateOK.get(url, proxy,headerMap); // firstText = HttpClientTemplateOK.get(url, proxy,headerMap);
} // }
} catch (Exception e) { // } catch (Exception e) {
e.printStackTrace(); // e.printStackTrace();
return null; // return null;
} // }
//采集下一页数据参数,并获取第一页数据 // //采集下一页数据参数,并获取第一页数据
if(firstText != null){ // if(firstText != null){
String appToken = getFirst(firstText,proxy); // String appToken = getFirst(firstText,proxy);
if(follow == true) // if(follow == true)
{ // {
next = true;
}
//循环读取微信公号历史数据
int i = 1;
while(next)
{
String nextUrl = url.replace("home", "getmsg") + "&f=json&&offset=" + i*10 + "&count=10&scene=123&is_ok=1&appmsg_token="+appToken;
log.info("下一页地址:{}", nextUrl);
try {
//采集下一页数据参数,并获取此页数据
headerMap.put("Referer", nextUrl);
String nextJson = HttpClientTemplateOK.get(nextUrl, proxy,headerMap);
nextId = getNext(nextJson,proxy);
// System.out.println("nextId============"+nextId);
// if(nextId.equals("1")){
// next = true; // next = true;
// }
//
// //循环读取微信公号历史数据
// int i = 1;
// while(next)
// {
// String nextUrl = url.replace("home", "getmsg") + "&f=json&&offset=" + i*10 + "&count=10&scene=123&is_ok=1&appmsg_token="+appToken;
// log.info("下一页地址:{}", nextUrl);
// try {
// //采集下一页数据参数,并获取此页数据
// headerMap.put("Referer", nextUrl);
// String nextJson = HttpClientTemplateOK.get(nextUrl, proxy,headerMap);
// nextId = getNext(nextJson,proxy);
//// System.out.println("nextId============"+nextId);
//// if(nextId.equals("1")){
//// next = true;
//// }else{
//// next = false;
//// }
// ZhiWeiTools.sleep(3000);
// } catch (Exception e) {
// e.printStackTrace();
// next = false;
// }
// i++;
// }
//
// }else{ // }else{
// next = false; // next = false;
// } // }
ZhiWeiTools.sleep(3000); //
} catch (Exception e) { // return result;
e.printStackTrace(); // }
next = false; //
} // /***
i++; // * 获取公号历史文章
} // * @Description:
// * @param @param url
}else{ // * @param @param source
next = false; // * @param @return
} // * @return List<Wechat> 返回类型
// */
return result; // @Deprecated
} // public List<WechatAricle> getWechatDataFromHistoryOld(String url,Proxy proxy)
// {
/*** // log.info("url:::::::::{}",url);
* 获取公号历史文章 // if(updateLike)
* @Description: // {
* @param @param url // key = "&uin"+url.split("uin")[1].split("devicetype")[0];
* @param @param source // }
* @param @return //
* @return List<Wechat> 返回类型 // String firstText = null;
*/ // try {
@Deprecated // Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url, proxy,headerMap);
public List<WechatAricle> getWechatDataFromHistoryOld(String url,Proxy proxy) // //获取cookie
{ // headerMap.put("Referer", url);
log.info("url:::::::::{}",url); // headerMap.put("Cookie", cookieMap.get("cookie"));
if(updateLike) // firstText = HttpClientTemplateOK.get(url, proxy,headerMap);
{ // } catch (Exception e) {
key = "&uin"+url.split("uin")[1].split("devicetype")[0]; // e.printStackTrace();
} // return null;
// }
String firstText = null; // //采集下一页数据参数,并获取第一页数据
try { // nextId = getFirstOld(firstText,proxy);
Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url, proxy,headerMap); // boolean next = false; //判断是否有下一页
//获取cookie // if(follow == true)
headerMap.put("Referer", url); // {
headerMap.put("Cookie", cookieMap.get("cookie")); // next = true;
firstText = HttpClientTemplateOK.get(url, proxy,headerMap); // }
} catch (Exception e) { // //循环读取微信公号历史数据
e.printStackTrace(); // while(next)
return null; // {
} // //没有下一页数据,结束
//采集下一页数据参数,并获取第一页数据 // if(nextId==null)
nextId = getFirstOld(firstText,proxy); // {
boolean next = false; //判断是否有下一页 // next = false;
if(follow == true) // }else //采集下一页数据
{ // {
next = true; // String nextUrl = url.replace("home", "getmsg") + "&f=json&frommsgid=" + nextId + "&count=10&scene=123&is_ok=1";
} // log.info("下一页地址:{}", nextUrl);
//循环读取微信公号历史数据 // try {
while(next) // //采集下一页数据参数,并获取此页数据
{ // headerMap.put("Referer", nextUrl);
//没有下一页数据,结束 // String nextJson = HttpClientTemplateOK.get(nextUrl, null,headerMap);
if(nextId==null) // nextId = getNext(nextJson,proxy);
{ // System.out.println("nextId-============="+nextId);
next = false; // ZhiWeiTools.sleep(3000);
}else //采集下一页数据 // } catch (Exception e) {
{ // e.printStackTrace();
String nextUrl = url.replace("home", "getmsg") + "&f=json&frommsgid=" + nextId + "&count=10&scene=123&is_ok=1"; // next = false;
log.info("下一页地址:{}", nextUrl); // }
try { //
//采集下一页数据参数,并获取此页数据 // }
headerMap.put("Referer", nextUrl); // }
String nextJson = HttpClientTemplateOK.get(nextUrl, null,headerMap); //
nextId = getNext(nextJson,proxy); // return result;
System.out.println("nextId-============="+nextId); // }
ZhiWeiTools.sleep(3000); //
} catch (Exception e) { //
e.printStackTrace(); //
next = false; //
} // /**
// * @Title: getFirst
} // * @Description: TODO(解析第一页数据)
} // * @param @param fristText
// * @param @param source
return result; // * @param @return 设定文件
} // * @return String 返回类型
// */
// @Deprecated
// public String getFirstOld(String fristText,Proxy proxy)
// {
/** // fristText = fristText
* @Title: getFirst // .replace("\\", "")
* @Description: TODO(解析第一页数据) // .replace("'", "")
* @param @param fristText // .replace("&nbsp;", " ")
* @param @param source // .replace("&quot;", "\"")
* @param @return 设定文件 // .replace("&amp;", "&")
* @return String 返回类型 // .replace("amp;", "")
*/ // .replace("&#39", "'")
@Deprecated // .replace("&gt;", ">")
public String getFirstOld(String fristText,Proxy proxy) // .replace("&lt;", "<")
{ // .replace("&yen;", "¥")
fristText = fristText // ;
.replace("\\", "") // log.info("开始解析第一页文章");
.replace("'", "") // // 截取HTML得到有用的JSON;替换掉转义字符
.replace("&nbsp;", " ") // if(fristText.contains("msgList ="))
.replace("&quot;", "\"") // {
.replace("&amp;", "&") // fristText = fristText.split("msgList = ")[1].split("}}]};")[0]+"}}]}";
.replace("amp;", "") // return getNextIdAndAnalysis(fristText,proxy);
.replace("&#39", "'") // }
.replace("&gt;", ">") // return null;
.replace("&lt;", "<") // }
.replace("&yen;", "¥") //
; // /**
log.info("开始解析第一页文章"); // * @Title: getFirst
// 截取HTML得到有用的JSON;替换掉转义字符 // * @author hero
if(fristText.contains("msgList =")) // * @Description: 截取appmsg_token 值
{ // * @param @param fristText
fristText = fristText.split("msgList = ")[1].split("}}]};")[0]+"}}]}"; // * @param @return 设定文件
return getNextIdAndAnalysis(fristText,proxy); // * @return String 返回类型
} // */
return null; // private String getFirst(String fristText,Proxy proxy)
} // {
// String next = null;
/** //
* @Title: getFirst // fristText = fristText
* @author hero // .replace("\\", "")
* @Description: 截取appmsg_token 值 // .replace("'", "")
* @param @param fristText // .replace("&nbsp;", " ")
* @param @return 设定文件 // .replace("&quot;", "\"")
* @return String 返回类型 // .replace("&amp;", "&")
*/ // .replace("amp;", "")
private String getFirst(String fristText,Proxy proxy) // .replace("&#39", "'")
{ // .replace("&gt;", ">")
String next = null; // .replace("&lt;", "<")
// .replace("&yen;", "¥")
fristText = fristText // ;
.replace("\\", "") // log.info("开始解析第一页文章");
.replace("'", "") //
.replace("&nbsp;", " ") // if(fristText.contains("window.appmsg_token = ") && fristText.contains("msgList =")){
.replace("&quot;", "\"") // try {
.replace("&amp;", "&") // next = fristText.split("window.appmsg_token = \"")[1].split("\";")[0];
.replace("amp;", "") // fristText = fristText.split("msgList = ")[1].split("}}]};")[0]+"}}]}";
.replace("&#39", "'") // getNextIdAndAnalysis(fristText,proxy);
.replace("&gt;", ">") // return next;
.replace("&lt;", "<") // } catch (Exception e) {
.replace("&yen;", "¥") // log.info("截取下一页数据参数出现问题:{}",fristText);
; // return null;
log.info("开始解析第一页文章"); // }
// }else{
if(fristText.contains("window.appmsg_token = ") && fristText.contains("msgList =")){ // log.info("获取下一页数据参数出现问题....{}",fristText);
try { // }
next = fristText.split("window.appmsg_token = \"")[1].split("\";")[0]; // return null;
fristText = fristText.split("msgList = ")[1].split("}}]};")[0]+"}}]}"; // }
getNextIdAndAnalysis(fristText,proxy); //
return next; //
} catch (Exception e) { // /***
log.info("截取下一页数据参数出现问题:{}",fristText); // * 解析微信历史文章下一页数据
return null; // * @Description:
} // * @param @param nextJosn
}else{ // * @param @param key
log.info("获取下一页数据参数出现问题....{}",fristText); // * @param @param source
} // * @param @return
return null; // * @return String 返回类型
} // */
// private String getNext(String nextHtml,Proxy proxy)
// {
/*** // try {
* 解析微信历史文章下一页数据 // JSONObject nextJosn = JSONObject.parseObject(nextHtml);
* @Description: // String nextText = null;
* @param @param nextJosn // if(null != nextJosn.getString("general_msg_list"))
* @param @param key // {
* @param @param source // nextText = nextJosn.getString("general_msg_list");
* @param @return // getNextIdAndAnalysis(nextText,proxy);
* @return String 返回类型 // }else
*/ // {
private String getNext(String nextHtml,Proxy proxy) // log.info("下一页数据解析出现问题:{}", nextHtml);
{ // next = false;
try { // return null;
JSONObject nextJosn = JSONObject.parseObject(nextHtml); // }
String nextText = null; // return nextJosn.getInteger("can_msg_continue")+"";
if(null != nextJosn.getString("general_msg_list")) //
{ // } catch (Exception e) {
nextText = nextJosn.getString("general_msg_list"); // log.info("解析数据有问题:{}", nextHtml);
getNextIdAndAnalysis(nextText,proxy); // next = false;
}else // return null;
{ // }
log.info("下一页数据解析出现问题:{}", nextHtml); //
next = false; //
return null; // }
} //
return nextJosn.getInteger("can_msg_continue")+""; // /**
// * @Title: getNextIdAndAnalysis
} catch (Exception e) { // * @Description: TODO(解析下一页所需字段,及数据解析)
log.info("解析数据有问题:{}", nextHtml); // * @param @param text
next = false; // * @param @param source
return null; // * @param @return 设定文件
} // * @return String 返回类型
// */
// public String getNextIdAndAnalysis(String text,Proxy proxy)
} // {
// JSONObject wechatData = JSONObject.parseObject(text);
/** // JSONArray dataList = wechatData.getJSONArray("list");
* @Title: getNextIdAndAnalysis // if(dataList.size()==0)
* @Description: TODO(解析下一页所需字段,及数据解析) // {
* @param @param text // nextId = null;
* @param @param source // next = false;
* @param @return 设定文件 // }else
* @return String 返回类型 // {
*/ // for(int i = 0;i<dataList.size();i++)
public String getNextIdAndAnalysis(String text,Proxy proxy) // {
{ // JSONObject data = dataList.getJSONObject(i);
JSONObject wechatData = JSONObject.parseObject(text); // //解析时间
JSONArray dataList = wechatData.getJSONArray("list"); // JSONObject dateJson = data.getJSONObject("comm_msg_info");
if(dataList.size()==0) // long dateTime = dateJson.getLong("datetime");
{ // Date time = new Date(dateTime*1000);
nextId = null; // nextId = dateJson.getString("id");
next = false; // if(time.before(endDate))
}else // {
{ // next = false;
for(int i = 0;i<dataList.size();i++) // nextId = null;
{ // }
JSONObject data = dataList.getJSONObject(i); // //解析文本数据
//解析时间 // if(null != data.getJSONObject("app_msg_ext_info"))
JSONObject dateJson = data.getJSONObject("comm_msg_info"); // {
long dateTime = dateJson.getLong("datetime"); // //解析头条数据
Date time = new Date(dateTime*1000); // JSONObject first = data.getJSONObject("app_msg_ext_info");
nextId = dateJson.getString("id"); // String content_url = first.getString("content_url");
if(time.before(endDate)) // String content = first.getString("digest");
{ // String title = first.getString("title");
next = false; // String img_url = first.getString("cover");
nextId = null; //
} // WechatAricle wechatFirst = setWechat(content_url,title
//解析文本数据 // , time, img_url, content,"true",proxy);
if(null != data.getJSONObject("app_msg_ext_info")) // result.add(wechatFirst);
{ // //解析其余数据
//解析头条数据 // JSONArray otherJSON = first.getJSONArray("multi_app_msg_item_list");
JSONObject first = data.getJSONObject("app_msg_ext_info"); // if(otherJSON != null)
String content_url = first.getString("content_url"); // {
String content = first.getString("digest"); // for(int j = 0;j<otherJSON.size();j++)
String title = first.getString("title"); // {
String img_url = first.getString("cover"); // JSONObject other = otherJSON.getJSONObject(j);
// String other_content_url = other.getString("content_url");
WechatAricle wechatFirst = setWechat(content_url,title // String other_content = other.getString("digest");
, time, img_url, content,"true",proxy); // String other_title = other.getString("title");
result.add(wechatFirst); // String other_img_url = other.getString("cover");
//解析其余数据 //
JSONArray otherJSON = first.getJSONArray("multi_app_msg_item_list"); // WechatAricle wechatOther = setWechat(other_content_url,other_title
if(otherJSON != null) // , time, other_img_url, other_content,"false",proxy);
{ // result.add(wechatOther);
for(int j = 0;j<otherJSON.size();j++) // }
{ // }else
JSONObject other = otherJSON.getJSONObject(j); // {
String other_content_url = other.getString("content_url"); // log.info("只有一条数据");
String other_content = other.getString("digest"); // }
String other_title = other.getString("title"); // }else
String other_img_url = other.getString("cover"); // {
// log.info("不存在相关文章......");
WechatAricle wechatOther = setWechat(other_content_url,other_title // }
, time, other_img_url, other_content,"false",proxy); // }
result.add(wechatOther); // }
} // return nextId;
}else // }
{ //
log.info("只有一条数据"); //
} //
}else // /**
{ // * 给实体类对象赋值
log.info("不存在相关文章......"); // * @Description:
} // * @param @param url
} // * @param @param title
} // * @param @param source
return nextId; // * @param @param datetime
} // * @param @param key
// * @param @return
// * @return Wechat 返回类型
// */
/**
 * Builds a {@code WechatAricle} from the fields of one history-list entry.
 *
 * <p>The article page is fetched through {@code AriticleContent.getAriticleContent(url)};
 * when that returns a map, its "source" and "content" entries are used and the
 * {@code content} argument is discarded. When the {@code updateLike} field is set,
 * the read/like counts are fetched as well (after a 2 second pause).
 *
 * @param url      article URL, also used as the entity id
 * @param title    article title
 * @param datetime publish time stored on the entity
 * @param imgUrl   cover image URL
 * @param content  fallback content (the digest), used only when the article page
 *                 could not be fetched
 * @param isFirst  flag stored on the entity as-is (callers in this file pass "false"
 *                 for secondary articles)
 * @param proxy    proxy handed to the read/like lookup
 * @return the populated entity; like/read counts are -1 if the wait was interrupted
 */
private WechatAricle setWechat(String url,String title,
        Date datetime,String imgUrl,String content,String isFirst,Proxy proxy)
{
    WechatAricle wechat = new WechatAricle();
    wechat.setId(url);
    wechat.setTitle(title);
    wechat.setTime(datetime);
    wechat.setImgUrl(imgUrl);
    wechat.setIsFirst(isFirst);

    // Fetch the article page for its source and full content.
    String source = null;
    Map<String,String> sacMap = AriticleContent.getAriticleContent(url);
    if(sacMap!=null)
    {
        source = sacMap.get("source");
        content = sacMap.get("content");
    }

    // Optionally fetch the read/like counts.
    if(updateLike)
    {
        // Literal replacement (no regex needed). The second pass is kept so the result
        // matches the previous behaviour when the first removal re-forms an "amp;".
        url = url.replace("amp;", "").replace("amp;", "");
        try {
            Thread.sleep(2000);
            WechatReadLike wcrl = WeChatReadAndLike.getReadAndLike(url,key,proxy);
            wechat.setLikeNum(wcrl.getLike());
            wechat.setReadNum(wcrl.getRead());
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers up the stack can still observe it.
            Thread.currentThread().interrupt();
            wechat.setLikeNum(-1);
            wechat.setReadNum(-1);
            log.error("获取点赞阅读数出现为题,问题:{}", e.getMessage());
        }
    }

    wechat.setContent(content);
    wechat.setSource(source);
    return wechat;
}
/**
 * Ad-hoc check: strips the JSON escaping backslashes from a sample article URL
 * and prints the result.
 */
public static void main(String[] args) {
    String url = "http:\\/\\/mp.weixin.qq.com\\/s?__biz=MjM5NTU0MzI0MA==&mid=2661648551&idx=1&sn=74397ab60184beb0abd4dd3f8c62f7d3&chksm=bda7c9008ad04016a5eac88c8dd18b6bc5797ae780c56e307e11781af257a68a52b7f87dfd8e&scene=27#wechat_redirect";
    // String.replace works on literal text. replaceAll("\\", "") would pass a lone
    // backslash to the regex compiler, which rejects it with PatternSyntaxException.
    System.out.println(url.replace("\\", ""));
}
}
...@@ -17,8 +17,8 @@ import org.jsoup.nodes.Document; ...@@ -17,8 +17,8 @@ import org.jsoup.nodes.Document;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.wechat.comment.WechatCommentList; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.wechat.util.Tools; import com.zhiwei.wechat.util.Tools;
/** /**
...@@ -28,7 +28,8 @@ import com.zhiwei.wechat.util.Tools; ...@@ -28,7 +28,8 @@ import com.zhiwei.wechat.util.Tools;
*/ */
public class AriticleContent{ public class AriticleContent{
private static Logger logger = LoggerFactory.getLogger(WechatCommentList.class); private static Logger logger = LoggerFactory.getLogger(AriticleContent.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/** /**
...@@ -47,7 +48,7 @@ public class AriticleContent{ ...@@ -47,7 +48,7 @@ public class AriticleContent{
String content = null; String content = null;
String source = null; String source = null;
try { try {
String htmlBody = HttpClientTemplateOK.get(url, null,headerMap); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string();
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
content = document.select("div.rich_media_content").text(); content = document.select("div.rich_media_content").text();
if(htmlBody.contains("var nickname = ")){ if(htmlBody.contains("var nickname = ")){
...@@ -79,7 +80,7 @@ public class AriticleContent{ ...@@ -79,7 +80,7 @@ public class AriticleContent{
headerMap.put("Referer", url); headerMap.put("Referer", url);
String comment_id = null; String comment_id = null;
try { try {
String htmlBody = HttpClientTemplateOK.get(url, null,headerMap); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string();
if(htmlBody!=null) if(htmlBody!=null)
{ {
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
......
/** ///**
* @Title: WindowsClient.java // * @Title: WindowsClient.java
* @Package com.wcral.client // * @Package com.wcral.client
* @Description: TODO(用一句话描述该文件做什么) // * @Description: TODO(用一句话描述该文件做什么)
* @author Bewilder Z // * @author Bewilder Z
* @date 2015年8月6日 上午9:13:37 // * @date 2015年8月6日 上午9:13:37
* @version V1.0 // * @version V1.0
*/ // */
//
package com.zhiwei.wechat.readAndLike; //package com.zhiwei.wechat.readAndLike;
//
import java.net.Proxy; //import java.net.Proxy;
import java.net.URLEncoder; //import java.net.URLEncoder;
import java.util.HashMap; //import java.util.HashMap;
import java.util.Map; //import java.util.Map;
//
import org.jsoup.Jsoup; //import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; //import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; //import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; //import org.jsoup.select.Elements;
import org.slf4j.Logger; //import org.slf4j.Logger;
import org.slf4j.LoggerFactory; //import org.slf4j.LoggerFactory;
//
import com.alibaba.fastjson.JSONObject; //import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK; //import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.wechat.entity.WechatReadLike; //import com.zhiwei.wechat.entity.WechatReadLike;
import com.zhiwei.wechat.search.WechatAritcleSearch; //import com.zhiwei.wechat.search.WechatAritcleSearch;
import com.zhiwei.wechat.util.Tools; //import com.zhiwei.wechat.util.Tools;
//
/**
 * Fetches read and like counts for WeChat articles, either from the WeChat
 * article statistics endpoint (using a client key) or from Sogou WeChat search.
 *
 * @author Abner Liu
 * @date 2015-08-06 09:13:37
 */
public class WeChatReadAndLike {

    private static final Logger logger = LoggerFactory.getLogger(WeChatReadAndLike.class);

    /**
     * Fetches the read and like counts of an article through the WeChat
     * statistics endpoint.
     *
     * @param url   WeChat article URL
     * @param key   client key used to build the cookie URL
     * @param proxy proxy used for both HTTP requests
     * @return never {@code null}; read and like are both -1 when any step fails
     */
    public static WechatReadLike getReadAndLike(String url,String key,Proxy proxy){
        WechatReadLike wLike = new WechatReadLike();
        try {
            String urlcookie = Tools.getWechatCookieUrl(url, key);
            // Request headers; the first call yields the session cookie and the page body.
            Map<String,String> headerMap = Tools.getWechatHeader();
            Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(urlcookie, proxy, headerMap);

            headerMap.put("Referer", urlcookie);
            headerMap.put("Cookie", cookieMap.get("cookie")+"");
            String appmsg_token = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
            logger.debug("appmsg_token==========={}", appmsg_token);
            String urlLike = Tools.getWechatLikeUrl(urlcookie,appmsg_token);

            // POST parameters.
            HashMap<String,Object> postMap = new HashMap<String,Object>();
            postMap.put("is_only_read", "1");

            // Fetch and parse the statistics.
            String htsString = HttpClientTemplateOK.post(urlLike, proxy, headerMap ,postMap);
            logger.debug("{}", htsString);
            JSONObject stat = JSONObject.parseObject(htsString).getJSONObject("appmsgstat");
            String like_num = stat.get("like_num").toString();

            // Prefer real_read_num; fall back to read_num when it is absent or "0".
            Object realRead = stat.get("real_read_num");
            String real_read_num;
            if (realRead == null || "0".equals(realRead.toString())) {
                real_read_num = stat.get("read_num").toString();
            } else {
                real_read_num = realRead.toString();
            }

            wLike.setUrl(url);
            wLike.setRead(Integer.valueOf(real_read_num));
            wLike.setLike(Integer.valueOf(like_num));
        } catch (Exception e) {
            // Best effort: report the failure and hand back the -1 sentinel values.
            logger.warn("failed to fetch read/like counts for {}", url, e);
            wLike.setUrl(url);
            wLike.setRead(-1);
            wLike.setLike(-1);
        }
        return wLike;
    }

    /**
     * Looks up the read count of an article through Sogou WeChat search.
     *
     * @param word search keyword
     * @param time publish date; anything after the first space is dropped
     * @param link article link to match against the search results
     * @param wxId WeChat account id
     * @return the result with url/read set when a search hit equals {@code link};
     *         an unpopulated result when nothing matches or the response body is
     *         {@code null}; {@code null} when the request or parsing fails
     */
    public static WechatReadLike getReadAndLike(String word,
            String time,String link,String wxId){

        WechatReadLike wLike = new WechatReadLike();

        Map<String,String> headerMap = new HashMap<String,String>();
        headerMap.put("Upgrade-Insecure-Requests", "1");
        headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36");
        headerMap.put("Host","weixin.sogou.com");

        if(time.contains(" "))
        {
            time = time.split(" ")[0];
        }

        String openid = WechatAritcleSearch.getOpenId(wxId,null);
        logger.info("openid is {}", openid);

        try {
            String url = "http://weixin.sogou.com/weixin?query=" + URLEncoder.encode(word,"utf-8")
                    + "&type=2&ie=utf8&page=1&interation=&tsn=5&ft="+time + "&et="+ time
                    + "&wxid="+openid+"&usip="+wxId+"&from=tool";

            logger.info("url is {}",url);

            String htmlBody = HttpClientTemplateOK.get(url, null, headerMap);
            if(htmlBody!=null)
            {
                // Parse the result list.
                Document document = Jsoup.parse(htmlBody);
                Elements elements = document.select("div.news-box")
                        .select("ul.news-list").select("li");
                for (Element element : elements)
                {
                    try {
                        String url_link = element.select("div.txt-box").select("h3 >a").attr("href");
                        int readNum = 0;
                        try {
                            readNum = Integer.valueOf(element.select("div.txt-box")
                                    .select("div.s-p").select("span.s1").text().trim());
                            logger.info("readNum is {}", readNum);
                        } catch (Exception e) {
                            // A missing or non-numeric read count is treated as 0.
                            readNum = 0;
                        }
                        if(url_link.contains("&chksm="))
                        {
                            // Drop the chksm parameter but keep everything from "&3rd" on.
                            url_link = url_link.split("&chksm=")[0] + "&3rd" + url_link.split("&3rd")[1];
                        }

                        if(link.equals(url_link))
                        {
                            wLike.setUrl(link);
                            wLike.setRead(readNum);
                            break;
                        }
                    } catch (Exception e) {
                        // A malformed entry must not abort the scan of the remaining results.
                        logger.debug("skipping unparsable search result", e);
                        continue;
                    }
                }
            }
        } catch (Exception e) {
            logger.error("failed to fetch read count from sogou for {}", link, e);
            return null;
        }
        return wLike;
    }

}
...@@ -35,7 +35,7 @@ import com.zhiwei.wechat.entity.WechatAricle; ...@@ -35,7 +35,7 @@ import com.zhiwei.wechat.entity.WechatAricle;
public class WechatAritcleSearch { public class WechatAritcleSearch {
private static Logger logger = LogManager.getLogger(WechatAritcleSearch.class); private static Logger logger = LogManager.getLogger(WechatAritcleSearch.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/** /**
* *
......
...@@ -13,7 +13,7 @@ import com.zhiwei.crawler.utils.RequestUtils; ...@@ -13,7 +13,7 @@ import com.zhiwei.crawler.utils.RequestUtils;
public class WechatCount { public class WechatCount {
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
public static int getWechatCountByWord(String word, String cookie, public static int getWechatCountByWord(String word, String cookie,
String startTime, String endTime, Proxy proxy) { String startTime, String endTime, Proxy proxy) {
......
...@@ -5,7 +5,8 @@ import java.util.HashMap; ...@@ -5,7 +5,8 @@ import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
...@@ -17,7 +18,7 @@ import com.zhiwei.tools.tools.URLCodeUtil; ...@@ -17,7 +18,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
*/ */
public class WechatIndex { public class WechatIndex {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
...@@ -53,7 +54,7 @@ public class WechatIndex { ...@@ -53,7 +54,7 @@ public class WechatIndex {
headerMap.put("Accept","application/json, text/javascript, */*; q=0.01"); headerMap.put("Accept","application/json, text/javascript, */*; q=0.01");
headerMap.put("Cookie","mmsearch_user_key=AStrb5tD4ruSixIDu1cVpTA=; pass_ticket=bbP7ZT5xEUrYe+oOa6ACUw+mgR05TAGGA1P9xnC7fIyaaOnwkWyNQK8aYtva+Gxj; pgv_pvi=4102772736; pgv_si=s1607859200; pgv_pvid=153672700"); headerMap.put("Cookie","mmsearch_user_key=AStrb5tD4ruSixIDu1cVpTA=; pass_ticket=bbP7ZT5xEUrYe+oOa6ACUw+mgR05TAGGA1P9xnC7fIyaaOnwkWyNQK8aYtva+Gxj; pgv_pvi=4102772736; pgv_si=s1607859200; pgv_pvid=153672700");
String htmlBody = HttpClientTemplateOK.get(url, null,headerMap); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string();
System.out.println(htmlBody); System.out.println(htmlBody);
Thread.sleep(3000); Thread.sleep(3000);
......
/** ///**
* @Title: WechatDataFromHistoryExample.java // * @Title: WechatDataFromHistoryExample.java
* @Package com.zhiwei.wechat.example // * @Package com.zhiwei.wechat.example
* @Description:微信采集历史文章测试 // * @Description:微信采集历史文章测试
* @author hero // * @author hero
* @date 2016年5月20日 下午5:47:56 // * @date 2016年5月20日 下午5:47:56
* @version V1.0 // * @version V1.0
*/ // */
/** ///**
* // *
*/ // */
package com.zhiwei.wechat.example; //package com.zhiwei.wechat.example;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
//
import com.zhiwei.wechat.entity.WechatAricle; //import com.zhiwei.wechat.entity.WechatAricle;
import com.zhiwei.wechat.history.WechatDataFromHistory; //import com.zhiwei.wechat.history.WechatDataFromHistory;
//
/**
 * Manual example for crawling the history articles of a WeChat account.
 *
 * @author hero
 * @date 2016-05-20 17:47:56
 */
public class WechatDataFromHistoryExample {

    /**
     * Runs the history crawl for each configured profile URL and prints the
     * number of articles returned.
     */
    public static void main(String[] args) {
        boolean updateLike = false;
        boolean follow = true;
        String endDate = "2017-01-27";
        try {
            List<String> urllist = new ArrayList<String>();
            urllist.add("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MjM5NTU0MzI0MA==&scene=124&uin=MTE4OTQyMDc0MQ%3D%3D&key=df62f0a2a8b7732dca2d1f886b5bd15c398e1fe92940e352837738ea99e5ddc531fc24d5d57a5a43eab11df1e4db7db80aeeddfc06c8f410e159d80df4f822c07c555b4b536b52593f132f39c6868698&devicetype=Windows+8&version=6203005d&lang=zh_CN&a8scene=7&pass_ticket=nMJ5n97UE%2BxdJKqeKp3ovi8slnCMNSYF6Tu%2FgsQ4Phk%2Bc%2B%2BDM5AQy7LT6H%2BBQTc5&winzoom=1");
            System.out.println(urllist.size());
            int i = 0;
            for (String s : urllist) {
                System.out.println("i===========" + i);
                // Advance the progress counter; it was previously printed but never incremented.
                i++;
                // Only the part before the first comma is used as the URL.
                String url = s.split(",")[0];

                WechatDataFromHistory wdfh = new WechatDataFromHistory(updateLike,endDate,follow);
                System.out.println(url);
                List<WechatAricle> list = wdfh.getWechatDataFromHistory(url,null);
                System.out.println("list size is :" + list.size());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}
...@@ -40,13 +40,11 @@ public class WechatSearchExample{ ...@@ -40,13 +40,11 @@ public class WechatSearchExample{
public static void wechatSearchExample() throws UnknownHostException public static void wechatSearchExample() throws UnknownHostException
{ {
List<String> wordList = new ArrayList<String>(); List<String> wordList = new ArrayList<String>();
wordList.add("工业互联网"); wordList.add("京东");
String idOrName = "吴晓波频道";
for(String word : wordList) for(String word : wordList)
{ {
try { try {
List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearchByAccount(word, idOrName, "2017-12-01", "2018-12-01", ProxyHolder.SOUGOU_INNER_PROXY); List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, null,"2019-04-08", "2019-04-08", ProxyHolder.SOUGOU_INNER_PROXY.getProxy());
System.out.println("======"+list.size()); System.out.println("======"+list.size());
for(WechatAricle wechat : list){ for(WechatAricle wechat : list){
System.out.println(wechat.getTitle()); System.out.println(wechat.getTitle());
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment