Commit 424bda47 by yangchen

微信初步版本提交

parents
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>wechat</artifactId>
<version>1.1.0-SNAPSHOT</version>
<description>
知微微信采集程序,包含
1.微信历史文章采集
2.搜狗微信接口关键词采集
3.点赞阅读更新接口
4.根据关键词或微信id查询帐号信息
5.根据文章链接采集评论列表及评论数
</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<developers>
<developer>
<id>Bewilder</id>
<name>zhiwei zhang</name>
<email>zhangzhiwei@zhiweidata.com</email>
</developer>
</developers>
<!-- 打包管理 -->
<build>
<plugins>
<!-- 发布源码 -->
<plugin>
<artifactId>maven-source-plugin</artifactId>
<version>2.4</version>
<configuration>
<attach>true</attach>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version>
</plugin>
<!-- 解决maven test命令时console出现中文乱码 -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<configuration>
<forkMode>once</forkMode>
<argLine>-Dfile.encoding=UTF-8</argLine>
</configuration>
</plugin>
</plugins>
</build>
<!-- 分发管理:管理distribution和supporting files -->
<distributionManagement>
<snapshotRepository>
<id>nexus-releases</id>
<name>User Project Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Project Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository>
</distributionManagement>
<dependencies>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.0.5-SNAPSHOT</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package com.zhiwei.wechat.account;
import java.util.HashMap;
import java.util.Map;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
/**
 * Collects the follower ("fans") growth figures of a WeChat official account
 * from the {@code mp.weixin.qq.com/misc/useranalysis} endpoint.
 *
 * @author hero
 */
public class WechatAccountFans {

    /** Headers common to every request; per-request headers are added to a copy. */
    private Map<String,String> headerMap;

    /** Prepares the headers that do not change between requests. */
    public WechatAccountFans()
    {
        headerMap = new HashMap<String,String>();
        headerMap.put("Host", "mp.weixin.qq.com");
        headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36");
        // The original value carried a stray trailing space.
        headerMap.put("X-Requested-With", "XMLHttpRequest");
    }

    /**
     * Requests the user-analysis report for the given date range and returns the
     * {@code list} array of the first entry of {@code category_list}.
     *
     * @param token  value sent as the {@code token} query parameter
     * @param start  value sent as {@code begin_date} (format not validated here)
     * @param end    value sent as {@code end_date} (format not validated here)
     * @param cookie value sent as the {@code Cookie} header
     * @return the {@code list} array, or {@code null} when the response is empty
     *         or carries no {@code category_list} entry
     * @throws Exception if the HTTP call or the JSON parsing fails
     */
    public JSONArray getWechatAccountFans(String token,String start
            ,String end,String cookie) throws Exception
    {
        String url = "https://mp.weixin.qq.com/misc/useranalysis?&begin_date="+start+"&end_date="+end+"&source=99999999,99999999&token="+token+"&lang=zh_CN&f=json&ajax=1";
        String referer = "https://mp.weixin.qq.com/misc/useranalysis?&token="+token+"&lang=zh_CN";

        // Work on a copy so Referer/Cookie of one call never leak into another call.
        Map<String,String> requestHeaders = new HashMap<String,String>(headerMap);
        requestHeaders.put("Referer", referer);
        requestHeaders.put("Cookie", cookie);

        String htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(url, requestHeaders)).body().string();
        if(htmlBody == null)
        {
            return null;
        }
        JSONObject json = JSONObject.parseObject(htmlBody);
        if(json == null)
        {
            return null;
        }
        // Error responses carry no category_list; report "no data" instead of throwing an NPE.
        JSONArray categories = json.getJSONArray("category_list");
        if(categories == null || categories.isEmpty())
        {
            return null;
        }
        return categories.getJSONObject(0).getJSONArray("list");
    }
}
/**
* 获取微信文章评论
* @Title: WechatComment.java
* @Package com.zhiwei.wechat.comment
* @Description:获取微信文章评论
* @author hero
* @date 2016年6月25日 上午8:17:37
* @version V1.0
*/ /**
*
*/
package com.zhiwei.wechat.comment;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.wechat.entity.WechatComment;
import com.zhiwei.wechat.readAndLike.AriticleContent;
import com.zhiwei.wechat.util.Tools;
/**
 * Fetches the comment list and the comment count of a WeChat article.
 *
 * @author hero
 */
public class WechatCommentList {
    private static WechatComment wc = new WechatComment();
    private static Logger logger = LoggerFactory.getLogger(WechatCommentList.class);

    /**
     * Downloads the selected ("elected") comments of an article.
     *
     * @param url article URL; must contain the {@code __biz=} and {@code mid=} parameters
     * @param key string appended verbatim to the comment request URL and handed to
     *            {@code Tools.getWechatCookieUrl} / {@code AriticleContent.getCommentId}
     *            (NOTE(review): its exact expected shape is not visible here - confirm)
     * @return the parsed comments, or {@code null} when the article has no comments
     *         or any step of the retrieval fails
     */
    public static List<WechatComment> getWechatCommentList(String url,String key)
    {
        /* URL used to obtain the session cookie */
        String urlcookie = url;
        if(!url.contains("key")){
            urlcookie = Tools.getWechatCookieUrl(url, key);
        }
        // request headers
        Map<String,String> headerMap = Tools.getWechatHeader();
        try {
            Map<String, String> cookieMap = HttpClientTemplateOK.getCookie(urlcookie, null, headerMap);
            headerMap.put("Referer", url);
            String cookie = cookieMap.get("cookie");
            // Only cookies longer than 50 characters are forwarded.
            if(cookie != null && cookie.length()>50){
                headerMap.put("Cookie", cookie);
            }
            String appmsg_token = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
            String biz = queryValue(url, "__biz=");
            String appmsgid = queryValue(url, "mid=");
            if(biz == null || appmsgid == null)
            {
                logger.info("微信文章地址缺少__biz或mid参数: {}", url);
                return null;
            }
            String comment_id = AriticleContent.getCommentId(url,key);
            if(comment_id == null || appmsg_token == null)
            {
                return null;
            }
            String comment_url = "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz
                    + "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key
                    + "&appmsg_token=" + appmsg_token;
            logger.debug(comment_url);
            if("0".equals(comment_id))
            {
                logger.info("此条微信文章没有评论");
                return null;
            }
            try {
                String htmlBody = HttpClientTemplateOK.get(comment_url, null, headerMap);
                if(htmlBody!=null)
                {
                    JSONObject json = JSON.parseObject(htmlBody);
                    return wc.constructWechatComment(json.getJSONArray("elected_comment"),url);
                }
            } catch (Exception e) {
                // Pass the exception itself: fillInStackTrace() would overwrite the original trace.
                logger.info("解析微信文章评论列表时出现问题:", e);
                return null;
            }
        } catch (IOException e1) {
            logger.info("获取微信文章评论列表时出现IO问题:", e1);
            return null;
        } catch (Exception e1) {
            logger.info("获取微信文章评论列表时出现问题:", e1);
        }
        return null;
    }

    /**
     * Reads the number of selected comments of an article.
     *
     * @param url article URL; must contain the {@code __biz=} and {@code mid=} parameters
     * @param key string appended verbatim to the article URL (after stripping a
     *            {@code #rd} / {@code #wechat_redirect} fragment) and to the comment URL
     * @return the value of {@code elected_comment_total_cnt}, {@code 0} when the
     *         article has no comment section, or {@code -1} on any failure
     */
    public static int getWechatCommentCount(String url,String key)
    {
        logger.debug(url);
        /* strip the fragment and append the key */
        String url_new = url;
        if(url.contains("#rd"))
        {
            url_new = url.split("#rd")[0] + key;
        }else if(url.contains("#wechat_redirect"))
        {
            url_new = url.split("#wechat_redirect")[0] + key;
        }
        String biz = queryValue(url, "__biz=");
        String appmsgid = queryValue(url, "mid=");
        if(biz == null || appmsgid == null)
        {
            // Previously this surfaced as an unchecked ArrayIndexOutOfBoundsException.
            logger.info("微信文章地址缺少__biz或mid参数: {}", url);
            return -1;
        }
        /* request headers */
        Map<String,String> headerMap = Tools.getWechatHeader();
        /* comment id of the article */
        String comment_id = AriticleContent.getCommentId(url,key);
        if(comment_id == null)
        {
            logger.info("获取评论id失败");
            return -1;
        }
        String comment_url = "http://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz
                + "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key;
        if("0".equals(comment_id))
        {
            logger.info("此条微信文章没有评论");
            return 0;
        }
        try {
            Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url_new, null,headerMap);
            headerMap.put("Cookie", cookieMap.get("cookie"));
            String htmlBody = HttpClientTemplateOK.get(comment_url, null,headerMap);
            logger.debug(htmlBody);
            if(htmlBody!=null)
            {
                JSONObject json = JSON.parseObject(htmlBody);
                return json.getIntValue("elected_comment_total_cnt");
            }
        } catch (Exception e) {
            // Without a "{}" placeholder the message argument was silently dropped; log the exception.
            logger.debug("更新微信文章评论数时出现问题,问题信息:", e);
            return -1;
        }
        return -1;
    }

    /**
     * Returns the text that follows the first occurrence of {@code marker} in
     * {@code url} up to the next {@code &}, or {@code null} when it is absent.
     */
    private static String queryValue(String url, String marker)
    {
        String[] afterMarker = url.split(marker);
        if(afterMarker.length < 2)
        {
            return null;
        }
        String[] value = afterMarker[1].split("&");
        return value.length == 0 ? null : value[0];
    }
}
package com.zhiwei.wechat.entity;
import java.io.Serializable;
/**
* @ClassName: WechatAccount
* @Description: TODO(微信帐号)
* @author Bewilder Z
* @date 2017年1月16日 上午11:44:21
*/
public class WechatAccount implements Serializable{
private static final long serialVersionUID = -5179735277202327683L;
private String id; //公号id
private String name; //公号昵称
private String biz; //公号biz
private String imgurl; //头像地址
private String descript; //描述
private String verified_reason; //认证原因
private String openid; //认证原因
private int article_count_month; //月发文量
private int avg_read_month; //月平均阅读数
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getBiz() {
return biz;
}
public void setBiz(String biz) {
this.biz = biz;
}
public String getImgurl() {
return imgurl;
}
public void setImgurl(String imgurl) {
this.imgurl = imgurl;
}
public String getDescript() {
return descript;
}
public void setDescript(String descript) {
this.descript = descript;
}
public String getVerified_reason() {
return verified_reason;
}
public void setVerified_reason(String verified_reason) {
this.verified_reason = verified_reason;
}
public String getOpenid() {
return openid;
}
public void setOpenid(String openid) {
this.openid = openid;
}
public int getArticle_count_month() {
return article_count_month;
}
public void setArticle_count_month(int article_count_month) {
this.article_count_month = article_count_month;
}
public int getAvg_read_month() {
return avg_read_month;
}
public void setAvg_read_month(int avg_read_month) {
this.avg_read_month = avg_read_month;
}
@Override
public String toString()
{
return "new WechatAccount["
+ "id = " + id
+ ", name = " + name
+ ", biz = " + biz
+ ", imgurl = " + imgurl
+ ", descript = " + descript
+ ", verified_reason = " + verified_reason
+ ", openid = " + openid
+ ", article_count_month = " + article_count_month
+ ", avg_read_month = " + avg_read_month
+ "]";
}
public WechatAccount(){}
public WechatAccount(String id,String name, String biz,
String imgurl,String descript,String verified_reason,
String openid, int article_count_month,int avg_read_month)
{
this.id = id;
this.name = name;
this.biz = biz;
this.imgurl = imgurl;
this.descript = descript;
this.verified_reason = verified_reason;
this.openid = openid;
this.article_count_month = article_count_month;
this.avg_read_month = avg_read_month;
}
}
/**
* @Title: Wechat.java
* @Package com.zhiwei.wechat.bean
* @Description: TODO(用一句话描述该文件做什么)
* @author zhiweizhang
* @date 2015年11月5日 下午4:37:10
* @version V1.0
*/ /**
*
*/
package com.zhiwei.wechat.entity;
import java.util.Date;
/**
* @ClassName: Wechat
* @Description: TODO(微信文章)
* @author Bewilder Z
* @date 2015年11月5日 下午4:37:10
*/
public class WechatAricle {
private String id; //主键
private String title; //标题
private String source; //来源
private String content; //内容
private String imgUrl; //图片地址
private Date time; //发布时间
private int readNum; //阅读数
private int likeNum; //点赞数
private String openId;//openid
private String isFirst; //是否为头条文章
public String getIsFirst() {
return isFirst;
}
public void setIsFirst(String isFirst) {
this.isFirst = isFirst;
}
public String getOpenId() {
return openId;
}
public void setOpenId(String openId) {
this.openId = openId;
}
public String getImgUrl() {
return imgUrl;
}
public void setImgUrl(String imgUrl) {
this.imgUrl = imgUrl;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getSource() {
return source;
}
public void setSource(String source) {
this.source = source;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public Date getTime() {
return time;
}
public void setTime(Date time) {
this.time = time;
}
public int getReadNum() {
return readNum;
}
public void setReadNum(int readNum) {
this.readNum = readNum;
}
public int getLikeNum() {
return likeNum;
}
public void setLikeNum(int likeNum) {
this.likeNum = likeNum;
}
public WechatAricle(){}
public WechatAricle(String id,String title,String source,String content
,Date time,int readNum,int likeNum,String openId,String isFirst)
{
this.id = id.replaceAll("amp;", "");
this.title = title;
this.source = source;
this.content = content;
this.time = time;
this.readNum = readNum;
this.likeNum = likeNum;
this.openId = openId;
this.isFirst = isFirst;
}
@Override
public String toString()
{
return "new Wechat["
+ "id = " + id + ","
+ "title = " + title + ","
+ "source = " + source + ","
+ "content = " + content + ","
+ "time = " + time + ","
+ "readNum = " + readNum + ","
+ "likeNum = " + likeNum + ","
+ "openId = " + openId + ","
+ "isFirst = " + isFirst + ","
+ "]";
}
}
/**
* 微信文章评论数据
* @Title: WechatComment.java
* @Package com.zhiwei.wechat.entity
* @Description:微信文章评论数据
* @author Bewilder Z
* @date 2016年6月24日 下午5:12:11
* @version V1.0
*/
package com.zhiwei.wechat.entity;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONException;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * One comment of a WeChat article, populated from the JSON returned by the
 * comment endpoint.
 *
 * @author hero
 */
public class WechatComment implements Serializable{
    private static final long serialVersionUID = -1232983839480666507L;
    private int id;
    private int my_id;
    private String nick_name;
    private String content;
    private Date time;
    private int like_id;
    private int like_num;
    private int like_status;
    private int is_from_friend;
    private int is_from_me;
    private String from_url;

    /** Creates an empty comment; also used as a factory handle for {@link #constructWechatComment}. */
    public WechatComment(){}

    /**
     * Creates a comment from one JSON element.
     *
     * @param json comment object; {@code null} leaves every field at its default
     * @param url  article URL the comment belongs to, stored as {@code from_url}
     * @throws Exception if a field cannot be converted
     */
    public WechatComment(JSONObject json,String url)throws Exception
    {
        super();
        init(json,url);
    }

    public String getFrom_url() {
        return from_url;
    }
    public void setFrom_url(String from_url) {
        this.from_url = from_url;
    }
    public int getId() {
        return id;
    }
    public void setId(int id) {
        this.id = id;
    }
    public int getMy_id() {
        return my_id;
    }
    public void setMy_id(int my_id) {
        this.my_id = my_id;
    }
    public String getNick_name() {
        return nick_name;
    }
    public void setNick_name(String nick_name) {
        this.nick_name = nick_name;
    }
    public String getContent() {
        return content;
    }
    public void setContent(String content) {
        this.content = content;
    }
    public int getLike_id() {
        return like_id;
    }
    public void setLike_id(int like_id) {
        this.like_id = like_id;
    }
    public int getLike_num() {
        return like_num;
    }
    public void setLike_num(int like_num) {
        this.like_num = like_num;
    }
    public int getLike_status() {
        return like_status;
    }
    public void setLike_status(int like_status) {
        this.like_status = like_status;
    }
    public int getIs_from_friend() {
        return is_from_friend;
    }
    public void setIs_from_friend(int is_from_friend) {
        this.is_from_friend = is_from_friend;
    }
    public int getIs_from_me() {
        return is_from_me;
    }
    public void setIs_from_me(int is_from_me) {
        this.is_from_me = is_from_me;
    }
    public Date getTime() {
        return time;
    }
    public void setTime(Date time) {
        this.time = time;
    }

    @Override
    public String toString()
    {
        return "new WechatComment["
                + "id = " + id
                + ",my_id = " + my_id
                + ",nick_name = " + nick_name
                + ",content = " + content
                + ",time = " + time
                + ",like_id = " + like_id
                + ",like_num = " + like_num
                + ",like_status = " + like_status
                + ",is_from_friend = " + is_from_friend
                + ",is_from_me = " + is_from_me
                + ",from_url = " + from_url
                + "]";
    }

    /**
     * Copies the JSON fields into this object. Numeric fields absent from the
     * JSON become {@code 0} and an absent {@code create_time} leaves {@code time}
     * {@code null}; previously an absent field caused a NullPointerException on unboxing.
     */
    private void init(JSONObject json,String url) throws Exception {
        if(json == null){
            return;
        }
        try {
            id = json.getIntValue("id");
            my_id = json.getIntValue("my_id");
            nick_name = json.getString("nick_name");
            content = json.getString("content");
            // create_time is multiplied by 1000 before being handed to TimeParse
            // (presumably seconds -> milliseconds; confirm against TimeParse).
            Long createTime = json.getLong("create_time");
            time = (createTime == null) ? null : TimeParse.stringFormartDate(createTime*1000L+"");
            like_id = json.getIntValue("like_id");
            like_num = json.getIntValue("like_num");
            like_status = json.getIntValue("like_status");
            is_from_friend = json.getIntValue("is_from_friend");
            is_from_me = json.getIntValue("is_from_me");
            from_url = url;
        } catch (JSONException jsone) {
            throw new Exception(jsone.getMessage() + ":" + json.toString(), jsone);
        }
    }

    /**
     * Converts a JSON array of comments into a list of {@code WechatComment}.
     *
     * @param list comment array; {@code null} yields an empty list
     * @param url  article URL stored on every comment
     * @return the converted comments, never {@code null}
     * @throws Exception if an element cannot be converted
     */
    public List<WechatComment> constructWechatComment(JSONArray list,String url) throws Exception
    {
        if(list == null)
        {
            return new ArrayList<WechatComment>();
        }
        try {
            int size = list.size();
            List<WechatComment> wcList = new ArrayList<WechatComment>(size);
            for (int i = 0; i < size; i++) {
                wcList.add(new WechatComment(list.getJSONObject(i),url));
            }
            return wcList;
        } catch (JSONException e)
        {
            throw new Exception("解析微信文章评论bug:"+e.getMessage(),e);
        }
    }
}
/**
* @Title: WeChatReadLike.java
* @Package com.wcral.bean
* @Description: TODO(用一句话描述该文件做什么)
* @author Abner Liu
* @date 2015年8月6日 上午9:52:36
* @version V1.0
*/
/**
*
*/
package com.zhiwei.wechat.entity;
/**
 * Read and like counters of one WeChat article.
 *
 * @author Bewilder Z
 */
public class WechatReadLike {

    /** Article URL the counters belong to. */
    private String url;
    /** Read count. */
    private int read;
    /** Like count. */
    private int like;
    /** Value stored via {@link #setReal_read(int)}. */
    private int real_read;

    public String getUrl() { return this.url; }

    public void setUrl(String url) { this.url = url; }

    public int getRead() { return this.read; }

    public void setRead(int read) { this.read = read; }

    public int getLike() { return this.like; }

    public void setLike(int like) { this.like = like; }

    public int getReal_read() { return this.real_read; }

    public void setReal_read(int real_read) { this.real_read = real_read; }
}
/**
* 文章具体内容
* @Title: AriticleContent.java
* @Package com.zhiwei.wechat.readAndLike
* @Description:文章具体内容
* @author Bewilder Z
* @date 2016年5月20日 下午2:39:46
* @version V1.0
*/
package com.zhiwei.wechat.readAndLike;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.wechat.comment.WechatCommentList;
import com.zhiwei.wechat.util.Tools;
/**
 * Downloads a WeChat article page and extracts its text, its source account
 * and the id of its comment section.
 *
 * @author hero
 */
public class AriticleContent{
    // Was created for WechatCommentList.class, which filed these messages under the wrong logger.
    private static Logger logger = LoggerFactory.getLogger(AriticleContent.class);

    /**
     * Fetches the article and returns its text and source.
     *
     * @param url article URL; every {@code "amp;"} occurrence is removed before use
     * @return a map with the keys {@code content} and {@code source}, or
     *         {@code null} when the page cannot be fetched or parsed
     */
    public static Map<String,String> getAriticleContent(String url)
    {
        Map<String,String> result = new HashMap<String,String>();
        Map<String,String> headerMap = Tools.getWechatHeader();
        url = url.replaceAll("amp;", "");
        headerMap.put("Referer", url);
        try {
            String htmlBody = HttpClientTemplateOK.get(url, null,headerMap);
            Document document = Jsoup.parse(htmlBody);
            String content = document.select("div.rich_media_content").text();
            String source;
            if(htmlBody.contains("var nickname = ")){
                // Prefer the nickname embedded in the page script.
                source = htmlBody.split("var nickname = \"")[1].split("\";")[0];
            }else{
                source = document.select("div#meta_content").select("span#profileBt").text();
            }
            result.put("content", content);
            result.put("source", source);
            return result;
        } catch (Exception e) {
            // Without a "{}" placeholder the message argument was silently dropped; log the exception.
            logger.debug("获取微信文章内容或来源时出现问题,", e);
            return null;
        }
    }

    /**
     * Reads the {@code comment_id} variable from the scripts of the article page.
     *
     * @param url article URL; {@code "amp;"} is removed and {@code "&key=" + key} appended
     * @param key value of the {@code key} query parameter
     * @return the comment id, or {@code null} when the page is unavailable or
     *         contains no {@code var comment_id = "..."} assignment
     */
    public static String getCommentId(String url,String key)
    {
        Map<String,String> headerMap = new HashMap<String,String>();
        url = url.replaceAll("amp;", "")+"&key="+key;
        headerMap.put("Referer", url);
        String comment_id = null;
        try {
            String htmlBody = HttpClientTemplateOK.get(url, null,headerMap);
            if(htmlBody!=null)
            {
                Document document = Jsoup.parse(htmlBody);
                String content = document.select("script").html();
                comment_id = content.split("var comment_id = \"")[1].split("\"")[0];
                logger.info("comment_id : " + comment_id);
            }
        } catch (Exception e) {
            logger.debug("获取微信文章评论id时出现问题,", e);
            return null;
        }
        return comment_id;
    }

    /** Manual smoke test against a fixed article. */
    public static void main(String[] args) {
        String key = "9ed31d4918c154c810272b09930a4bc0f0cdef34aac4d35a975d8c81fc3261892cf249fedfe7fbacd61a36ecf44d54d1e537ca555379ab6223a63c2c5abf062af1272a2c79a54ee0296ae5d8f22f2092";
        String url = "https://mp.weixin.qq.com/s?__biz=MjM5NTE1NTc0MA==&mid=2652457522&idx=1&sn=66496e63dd39097ffb21545a81c45812&scene=0&key="+key+"&ascene=1&uin=MTE4OTQyMDc0MQ%3D%3D";
        System.out.println(getCommentId(url,key));
    }
}
/**
* @Title: WindowsClient.java
* @Package com.wcral.client
* @Description: TODO(用一句话描述该文件做什么)
* @author Bewilder Z
* @date 2015年8月6日 上午9:13:37
* @version V1.0
*/
package com.zhiwei.wechat.readAndLike;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.wechat.entity.WechatReadLike;
import com.zhiwei.wechat.search.WechatAritcleSearch;
import com.zhiwei.wechat.util.Tools;
/**
 * Retrieves read and like counters of WeChat articles, either from the WeChat
 * article endpoint or from the Sogou WeChat search result page.
 *
 * @author Abner Liu
 */
public class WeChatReadAndLike {
    private static Logger logger = LoggerFactory.getLogger(WeChatReadAndLike.class);

    /**
     * Reads the counters from the URL produced by {@code Tools.getWechatLikeUrl}.
     *
     * @param url   article URL
     * @param key   key handed to {@code Tools.getWechatCookieUrl}
     * @param proxy proxy passed to the HTTP helper calls
     * @return an entity holding the URL and the counters; on any failure both
     *         {@code read} and {@code like} are {@code -1}. Never {@code null}.
     */
    public static WechatReadLike getReadAndLike(String url,String key,Proxy proxy){
        WechatReadLike wLike = new WechatReadLike();
        try {
            String urlcookie = Tools.getWechatCookieUrl(url, key);
            // request headers
            Map<String,String> headerMap = Tools.getWechatHeader();
            Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(urlcookie, proxy, headerMap);
            headerMap.put("Referer", urlcookie);
            headerMap.put("Cookie", cookieMap.get("cookie")+"");
            String appmsg_token = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
            logger.debug("appmsg_token is {}", appmsg_token);
            String urlLike = Tools.getWechatLikeUrl(urlcookie,appmsg_token);
            // POST parameters
            HashMap<String,Object> postMap = new HashMap<String,Object>();
            postMap.put("is_only_read", "1");
            String htsString = HttpClientTemplateOK.post(urlLike, proxy, headerMap ,postMap);
            logger.debug(htsString);

            JSONObject stat = JSONObject.parseObject(htsString).getJSONObject("appmsgstat");
            String like_num = stat.get("like_num").toString();
            // Use real_read_num unless it is absent or "0"; then fall back to read_num.
            Object realRead = stat.get("real_read_num");
            String read_num;
            if(realRead == null || "0".equals(realRead.toString()))
            {
                read_num = stat.get("read_num").toString();
            }else
            {
                read_num = realRead.toString();
            }
            wLike.setUrl(url);
            wLike.setRead(Integer.parseInt(read_num));
            wLike.setLike(Integer.parseInt(like_num));
        } catch (Exception e) {
            // The failure used to be swallowed without any trace.
            logger.info("获取微信文章点赞阅读数时出现问题:", e);
            wLike.setUrl(url);
            wLike.setRead(-1);
            wLike.setLike(-1);
        }
        return wLike;
    }

    /**
     * Looks the article up on Sogou WeChat search and reads its read count from
     * the result list.
     *
     * @param word search keyword
     * @param time date used for both {@code ft} and {@code et}; anything after the
     *             first space is discarded
     * @param link article link that a result must equal to be accepted
     * @param wxId WeChat account id used to resolve the openid and sent as {@code usip}
     * @return an entity with URL and read count when a result matches; an entity
     *         with unset fields when nothing matches or the page is unavailable;
     *         {@code null} when the request or the parsing fails
     */
    public static WechatReadLike getReadAndLike(String word,
            String time,String link,String wxId){
        WechatReadLike wLike = new WechatReadLike();
        Map<String,String> headerMap = new HashMap<String,String>();
        headerMap.put("Upgrade-Insecure-Requests", "1");
        headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36");
        headerMap.put("Host","weixin.sogou.com");
        if(time.contains(" "))
        {
            time = time.split(" ")[0];
        }
        String openid = WechatAritcleSearch.getOpenId(wxId);
        logger.info("openid is {}", openid);
        try {
            String url = "http://weixin.sogou.com/weixin?query=" + URLEncoder.encode(word,"utf-8")
                    + "&type=2&ie=utf8&page=1&interation=&tsn=5&ft="+time + "&et="+ time
                    + "&wxid="+openid+"&usip="+wxId+"&from=tool";
            logger.info("url is {}",url);
            String htmlBody = HttpClientTemplateOK.get(url, null, headerMap);
            if(htmlBody == null)
            {
                return wLike;
            }
            Document document = Jsoup.parse(htmlBody);
            Elements elements = document.select("div.news-box")
                    .select("ul.news-list").select("li");
            for (Element element : elements)
            {
                try {
                    String url_link = element.select("div.txt-box").select("h3 >a").attr("href");
                    int readNum;
                    try {
                        readNum = Integer.parseInt(element.select("div.txt-box")
                                .select("div.s-p").select("span.s1").text().trim());
                        logger.info("readNum is {}", readNum);
                    } catch (Exception e) {
                        // No parsable read count on this result.
                        readNum = 0;
                    }
                    if(url_link.contains("&chksm="))
                    {
                        // Drop the chksm parameter but keep everything from "&3rd" on.
                        url_link = url_link.split("&chksm=")[0] + "&3rd" + url_link.split("&3rd")[1];
                    }
                    if(link.equals(url_link))
                    {
                        wLike.setUrl(link);
                        wLike.setRead(readNum);
                        break;
                    }
                } catch (Exception e) {
                    // A malformed result must not abort the scan of the remaining ones.
                    logger.debug("解析搜狗微信搜索结果时出现问题:", e);
                }
            }
        } catch (Exception e) {
            logger.info("通过搜狗微信获取阅读数时出现问题:", e);
            return null;
        }
        return wLike;
    }
}
package com.zhiwei.wechat.search;
import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.wechat.entity.WechatAricle;
import com.zhiwei.wechat.util.Tools;
/**
 * Collects WeChat articles by keyword through the Sogou WeChat search pages.
 *
 * @author Bewilder Z
 */
public class WechatAritcleSearch {
    private static Logger logger = LoggerFactory.getLogger(WechatAritcleSearch.class);

    /**
     * Searches Sogou WeChat for a keyword and walks the result pages for as long
     * as the pager offers a "下一页" (next page) link.
     *
     * @param word      keyword
     * @param tsn       time range selector sent as {@code tsn}: 1 (one day), 2 (one week),
     *                  3 (one month), 4 (one year), 5 (custom range, uses
     *                  {@code startTime}/{@code endTime})
     * @param startTime range start, {@code yyyy-MM-dd}; only sent when {@code tsn == 5}
     * @param endTime   range end, {@code yyyy-MM-dd}; only sent when {@code tsn == 5}
     * @param cookie    cookie of a logged-in user, or {@code null} to send none
     * @param proxy     proxy handed to the HTTP call
     * @return the collected articles; {@code null} when a page cannot be parsed
     * @throws Exception if the HTTP call fails
     * @throws UnsupportedEncodingException if UTF-8 is not available
     */
    public static List<WechatAricle> wechatKeywordSearch(String word, int tsn,
            String startTime,String endTime,String cookie,Proxy proxy)
                    throws Exception, UnsupportedEncodingException
    {
        List<WechatAricle> result = new ArrayList<WechatAricle>();
        Map<String,String> headerMap = new HashMap<String,String>();
        headerMap.put("Upgrade-Insecure-Requests", "1");
        headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36");
        headerMap.put("Host","weixin.sogou.com");
        if(cookie!=null){
            headerMap.put("Cookie",cookie);
        }
        String encodedWord = URLEncoder.encode(word,"UTF-8");
        boolean hasNextPage = true;
        int page = 1;
        while(hasNextPage)
        {
            String url = "http://weixin.sogou.com/weixin?type=2&query="
                    + encodedWord+"&ie=utf8&_sug_=n&_sug_type_="
                    + "&ri=1&sourceid=sugg&sst0="+System.currentTimeMillis()
                    +"&tsn="+tsn + "&page="+page;
            if(tsn==5)
            {
                url = url + "&ft="+startTime + "&et="+ endTime
                        + "&wxid=&usip=&interation=&from=tool";
            }
            headerMap.put("Referer",url);
            String htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy, false).body().string();
            if(htmlBody == null)
            {
                logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
                // Neither the page number nor the loop flag would change, so retrying
                // here could never terminate; stop and return what has been collected.
                break;
            }
            try {
                Document document = Jsoup.parse(htmlBody);
                Elements elements = document.select("div.news-box")
                        .select("ul.news-list").select("li");
                for (Element element : elements)
                {
                    try {
                        result.add(parseArticle(element));
                    } catch (Exception e) {
                        // Skip a malformed result and keep the others.
                        logger.debug("解析数据出现错误:{}",e.getMessage());
                    }
                }
                // Continue only while the pager still shows a "next page" link.
                String pageNext = document.select("[id=pagebar_container]>a").text();
                if(pageNext.contains("下一页")){
                    page++;
                }else{
                    hasNextPage = false;
                }
            } catch (Exception e) {
                logger.debug("获取数据出现问题:{}",e.getMessage());
                return null;
            }
        }
        return result;
    }

    /** Builds one article from a single {@code li} element of the result list. */
    private static WechatAricle parseArticle(Element element) throws Exception
    {
        Elements textBox = element.select("div.txt-box");
        String title = textBox.select("h3").text();
        String link = textBox.select("h3 >a").attr("href");
        // Equivalent to the former two-branch lookup: when the element holds no
        // p.txt-info at all this selection is empty as well and yields "".
        String content = textBox.select("p.txt-info").text();
        Elements sourceBox = textBox.select("div.s-p");
        String source = sourceBox.select("a").text();
        String openid = sourceBox.select("a").attr("i");
        // The "t" attribute is multiplied by 1000 to build the Date.
        Date date = new Date(Long.parseLong(sourceBox.attr("t")) * 1000);
        int readNum;
        try {
            readNum = Integer.parseInt(sourceBox.select("span.s1").text().trim());
        } catch (Exception e) {
            // No parsable read count on this result.
            readNum = 0;
        }
        title = ZhiWeiTools.SBC2DBC(title);
        content = ZhiWeiTools.SBC2DBC(content);
        return new WechatAricle(link, title, source, content, date, readNum, 0,openid,"unknow");
    }

    /**
     * Resolves the openid of a WeChat account through Sogou.
     *
     * @param wxId WeChat account id; URL-encoded before being sent
     * @return the {@code openid} field of the JSON response, or {@code null} when
     *         the request fails or the response is not the expected JSON
     */
    public static String getOpenId(String wxId)
    {
        String openId = null;
        try {
            String url = "http://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query="
                    + URLEncoder.encode(wxId, "UTF-8");
            Map<String,String> headerMap = Tools.getWechatHeader();
            String htmlBody = HttpClientTemplateOK.get(url, null,headerMap);
            if(htmlBody!=null)
            {
                JSONObject json = JSONObject.parseObject(htmlBody);
                openId = json.getString("openid");
            }
        } catch (Exception e) {
            openId = null;
            logger.info("获取openid时出现问题:", e);
        }
        return openId;
    }
}
package com.zhiwei.wechat.search;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
/**
 * Demo driver that queries the WeChat index endpoint for a keyword and prints
 * the raw responses.
 *
 * @author Bewilder Z
 */
public class WechatIndex {

    /**
     * Splits a fixed period with {@code TimeParse.getTimeMap} and queries the
     * index once per returned start/end pair.
     */
    public static void main(String[] args) throws Exception {
        String word = "百度";
        Map<String,String> timeLine = TimeParse.getTimeMap(
                "2016-01-01 00:00:00", "2017-03-24 00:00:00", "dd", 7);
        for (Entry<String,String> window : timeLine.entrySet()) {
            // The endpoint is given each bound divided by 1000.
            long from = TimeParse.stringFormartDate(window.getKey()).getTime() / 1000L;
            long to = TimeParse.stringFormartDate(window.getValue()).getTime() / 1000L;
            getWechatIndex(word, from, to);
        }
    }

    /**
     * Requests the index of {@code word} between the two timestamps, prints the
     * request URL and the response body, then sleeps for three seconds.
     *
     * @param word      keyword, URL-encoded via {@code URLCodeUtil}
     * @param startTime value sent as {@code start_time}
     * @param endTime   value sent as {@code end_time}
     * @throws Exception if the request fails or the sleep is interrupted
     */
    public static void getWechatIndex(String word,long startTime,long endTime) throws Exception {
        StringBuilder target = new StringBuilder("https://search.weixin.qq.com/cgi-bin/searchweb/getwxindex?query=");
        target.append(URLCodeUtil.getURLEncode(word, "utf-8"));
        target.append("&start_time=").append(startTime);
        target.append("&end_time=").append(endTime);
        // The "_" parameter carries the current time in milliseconds.
        target.append("&_=").append(new Date().getTime());
        String url = target.toString();
        System.out.println(url);

        Map<String,String> headers = new HashMap<String,String>();
        headers.put("Host", "search.weixin.qq.com");
        headers.put("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like Mac OS X) AppleWebKit/602.4.6 (KHTML, like Gecko) Mobile/14D27 MicroMessenger/6.5.5 NetType/4G Language/zh_CN");
        headers.put("Referer", url);
        headers.put("X-Requested-With","XMLHttpRequest");
        headers.put("Accept","application/json, text/javascript, */*; q=0.01");
        // Hard-coded session cookie captured from a client.
        headers.put("Cookie","mmsearch_user_key=AStrb5tD4ruSixIDu1cVpTA=; pass_ticket=bbP7ZT5xEUrYe+oOa6ACUw+mgR05TAGGA1P9xnC7fIyaaOnwkWyNQK8aYtva+Gxj; pgv_pvi=4102772736; pgv_si=s1607859200; pgv_pvid=153672700");

        String response = HttpClientTemplateOK.get(url, null, headers);
        System.out.println(response);
        // Pause between consecutive requests.
        Thread.sleep(3000);
    }
}
package com.zhiwei.wechat.util;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class Tools {

    /** Text that immediately precedes the token value in an article page. */
    private static final String APPMSG_TOKEN_PREFIX = "window.appmsg_token = \"";

    /** Text that immediately follows the token value in an article page. */
    private static final String APPMSG_TOKEN_SUFFIX = "\";";

    /**
     * Builds the HTTP request headers used when crawling WeChat pages.
     *
     * @return a new mutable map of header name to header value; a fresh map is created
     *         on every call
     */
    public static Map<String,String> getWechatHeader()
    {
        Map<String, String> headerMap = new HashMap<String, String>();
        headerMap.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
        headerMap.put("Accept-Language", "zh-CN,zh;q=0.8");
        headerMap.put("Connection", "keep-alive");
        headerMap.put("Upgrade-Insecure-Requests", "1");
        headerMap.put("Host", "mp.weixin.qq.com");
        headerMap.put("Origin", "http://mp.weixin.qq.com");
        headerMap.put("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.691.400 QQBrowser/9.0.2524.400");
        return headerMap;
    }

    /**
     * Builds the link used to obtain a cookie from an article URL and a key.
     *
     * <p>The {@code http://mp.weixin.qq.com/s?} prefix is rewritten to the
     * {@code profile_ext?action=getmsg&} endpoint. When {@code key} is non-null, a trailing
     * {@code #rd} or {@code #wechat_redirect} fragment (checked in that order) is cut off
     * and {@code key} is appended verbatim.
     *
     * @param url article URL; must not be {@code null}
     * @param key text appended to the rewritten URL, or {@code null} to append nothing
     *            (in which case any fragment is left in place)
     * @return the rewritten URL
     */
    public static String getWechatCookieUrl(String url,String key)
    {
        String rewritten = url.replace("http://mp.weixin.qq.com/s?",
                "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&");
        if (key == null) {
            return rewritten;
        }
        int fragmentAt = rewritten.indexOf("#rd");
        if (fragmentAt < 0) {
            fragmentAt = rewritten.indexOf("#wechat_redirect");
        }
        String base = fragmentAt < 0 ? rewritten : rewritten.substring(0, fragmentAt);
        return base + key;
    }

    /**
     * Builds the link used to fetch the like/read counters of an article.
     *
     * <p>Everything after the first {@code __biz} in {@code url} is carried over to the
     * {@code getappmsgext} endpoint and {@code appmsg_token} is appended as a parameter.
     *
     * @param url          article URL containing a {@code __biz} parameter; must not be {@code null}
     * @param appmsg_token token appended as {@code &appmsg_token=...}
     * @return the counter URL
     * @throws ArrayIndexOutOfBoundsException if {@code url} does not contain {@code __biz}
     */
    public static String getWechatLikeUrl(String url,String appmsg_token)
    {
        // Split on the literal marker with limit 2. The previous pattern "s?__biz" was
        // interpreted as a regex (optional 's'), and an unlimited split would truncate
        // the query string at a second "__biz".
        String afterBiz = url.split("__biz", 2)[1];
        return "http://mp.weixin.qq.com/mp/getappmsgext?__biz" + afterBiz + "&appmsg_token=" + appmsg_token;
    }

    /**
     * Reads a GBK-encoded text file and returns its lines.
     *
     * @param fileName path of the external keyword file
     * @return the lines of the file in order, or {@code null} if the file could not be
     *         opened or read (the stack trace is printed in that case)
     */
    public static List<String> getFileName(String fileName) {
        List<String> lines = new ArrayList<String>();
        FileInputStream in = null;
        BufferedReader reader = null;
        try {
            in = new FileInputStream(fileName);
            reader = new BufferedReader(new InputStreamReader(in, "GBK"));
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            // Always release the file handle, including when reading fails part-way.
            closeQuietly(reader);
            closeQuietly(in);
        }
        return lines;
    }

    /**
     * Extracts the {@code appmsg_token} value from an article page; the token is used
     * when updating like/read counters.
     *
     * @author hero
     * @param htmlBody page source, may be {@code null} or empty
     * @return the text between {@code window.appmsg_token = "} and the next {@code ";},
     *         or {@code null} if the page is {@code null}, the assignment is absent, or
     *         the value is not terminated
     */
    public static String getAppMsgToken(String htmlBody){
        if (htmlBody == null) {
            return null;
        }
        int prefixAt = htmlBody.indexOf(APPMSG_TOKEN_PREFIX);
        if (prefixAt < 0) {
            return null;
        }
        int valueStart = prefixAt + APPMSG_TOKEN_PREFIX.length();
        int valueEnd = htmlBody.indexOf(APPMSG_TOKEN_SUFFIX, valueStart);
        if (valueEnd < 0) {
            // No closing quote: do not hand back the remainder of the page as a token.
            return null;
        }
        return htmlBody.substring(valueStart, valueEnd);
    }

    /** Closes {@code resource} if it is non-null, ignoring any failure to close. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (java.io.IOException ignored) {
            // Best effort: the data has already been read or the read already failed.
        }
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment