Commit 424bda47 by yangchen

微信初步版本提交

parents
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the com.zhiwei:wechat crawler module. -->
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>wechat</artifactId>
<version>1.1.0-SNAPSHOT</version>
<description>
知微微信采集程序,包含
1.微信历史文章采集
2.搜狗微信接口关键词采集
3.点赞阅读更新接口
4.根据关键词或微信id查询帐号信息
5.根据文章链接采集评论列表及评论数
</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<developers>
<developer>
<id>Bewilder</id>
<name>zhiwei zhang</name>
<email>zhangzhiwei@zhiweidata.com</email>
</developer>
</developers>
<!-- Build and packaging configuration -->
<build>
<plugins>
<!-- Publish the sources jar alongside the main artifact -->
<plugin>
<artifactId>maven-source-plugin</artifactId>
<version>2.4</version>
<configuration>
<attach>true</attach>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version>
</plugin>
<!-- Passes -Dfile.encoding=UTF-8 to the test JVM so Chinese text is not garbled in the console during "mvn test" -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<configuration>
<forkMode>once</forkMode>
<argLine>-Dfile.encoding=UTF-8</argLine>
</configuration>
</plugin>
</plugins>
</build>
<!-- Distribution management: where snapshot and release artifacts are deployed -->
<distributionManagement>
<!-- NOTE(review): this snapshot repository uses the same id as the release repository below; confirm that is intended -->
<snapshotRepository>
<id>nexus-releases</id>
<name>User Porject Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Porject Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository>
</distributionManagement>
<dependencies>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.0.5-SNAPSHOT</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package com.zhiwei.wechat.account;
import java.util.HashMap;
import java.util.Map;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
/**
 * Collects the follower-growth statistics of a WeChat official account from
 * the {@code mp.weixin.qq.com/misc/useranalysis} endpoint.
 *
 * @author hero
 */
public class WechatAccountFans {
    /** Headers common to every request; per-call headers are added to a copy. */
    private Map<String,String> headerMap;

    /**
     * Prepares the static request headers (host, user agent, AJAX marker).
     */
    public WechatAccountFans()
    {
        headerMap = new HashMap<String,String>();
        headerMap.put("Host", "mp.weixin.qq.com");
        headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36");
        // The original value carried a stray trailing space.
        headerMap.put("X-Requested-With", "XMLHttpRequest");
    }

    /**
     * Fetches the follower-growth list for the given date range.
     *
     * @param token  value sent as the {@code token} query parameter
     * @param start  value sent as {@code begin_date}
     * @param end    value sent as {@code end_date}
     * @param cookie value sent as the {@code Cookie} request header
     * @return the {@code list} array of the first {@code category_list} entry,
     *         or {@code null} when the response has no body, is not a JSON
     *         object, or contains no category entry
     * @throws Exception if the HTTP call or the JSON parsing fails
     */
    public JSONArray getWechatAccountFans(String token,String start
            ,String end,String cookie) throws Exception
    {
        String url = "https://mp.weixin.qq.com/misc/useranalysis?&begin_date="+start+"&end_date="+end+"&source=99999999,99999999&token="+token+"&lang=zh_CN&f=json&ajax=1";
        String referer = "https://mp.weixin.qq.com/misc/useranalysis?&token="+token+"&lang=zh_CN";

        // Work on a copy so one call's Referer/Cookie never leaks into another call.
        Map<String,String> requestHeaders = new HashMap<String,String>(headerMap);
        requestHeaders.put("Referer", referer);
        requestHeaders.put("Cookie", cookie);

        String htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(url, requestHeaders)).body().string();
        if (htmlBody == null) {
            return null;
        }
        JSONObject json = JSONObject.parseObject(htmlBody);
        if (json == null) {
            return null;
        }
        JSONArray categories = json.getJSONArray("category_list");
        if (categories == null || categories.isEmpty()) {
            return null;
        }
        JSONObject firstCategory = categories.getJSONObject(0);
        return firstCategory == null ? null : firstCategory.getJSONArray("list");
    }
}
package com.zhiwei.wechat.account;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.wechat.entity.WechatAccount;
public class WechatAccountInfo {
private static Logger logger = LoggerFactory.getLogger(WechatAccountInfo.class);
/***
* @Title: getWechatAccount
* @Description: TODO(根据帐号id查询帐号信息)
* @param @param id
* @param @param name
* @param @param biz
* @param @return 设定文件
* @return WechatAccount 返回类型
*/
public static WechatAccount getUserInfoById(String id,Proxy proxy)
{
String url = "http://weixin.sogou.com/weixin?type=1&query=" + id +"&ie=utf8&_sug_=n&_sug_type_=";
System.out.println(url);
try {
String htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(url),proxy,false).body().string();
if(htmlBody != null)
{
return anaSislyAccount(url,htmlBody, id, "id");
}
} catch (Exception e) {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", e.getMessage());
return null;
}
return null;
}
/**
* @deprecated:根据用户名和id精准匹配微信公号信息
* @param String name
* @param String id
* **/
public static WechatAccount getUserInfoByName(String name,Proxy proxy)
{
String query = URLCodeUtil.getURLEncode(name, "utf-8");
for(int i = 1;i<=3;i++)
{
String url = "http://weixin.sogou.com/weixin?type=1&query=" + query +"&ie=utf8&_sug_=n&_sug_type_=&page="+i;
logger.info("url:{}",url);
try {
String htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(url),proxy,false).body().string();
if(htmlBody != null)
{
WechatAccount wa = anaSislyAccount(url,htmlBody, name, "name");
if(wa!=null)
{
return wa;
}
}else
{
logger.info("数据不存在...........");
}
} catch (Exception e) {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", e.getMessage());
return null;
}
}
return null;
}
public static List<WechatAccount> searchWechatAccount(String word)
{
List<WechatAccount> list = new ArrayList<>();
String query = URLCodeUtil.getURLEncode(word, "utf-8");
boolean more = true;
int i = 1;
while(more)
{
String url = "http://weixin.sogou.com/weixin?type=1&query=" + query +"&ie=utf8&_sug_=n&_sug_type_=&page="+i;
logger.info("url:{}",url);
try {
String htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(url)).body().string();
if(htmlBody != null)
{
/** 解析页面 */
list.addAll(anaSislyAccountList(url, htmlBody));
/** 判断是否有下一页 **/
Document document = Jsoup.parse(htmlBody);
if (document.select("a#sogou_next") == null) {
more = false;
} else {
if (!document.select("div#pagebar_container").text().contains("下一页")) {
more = false;
}
}
}else
{
more = false;
logger.info("数据不存在...........");
}
} catch (Exception e) {
e.printStackTrace();
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", e.fillInStackTrace());
more = false;
}
i++;
}
return list;
}
/**
* @Title: anaSislyAccount
* @Description: TODO(解析并获取帐号)
* @param @param htmlBody
* @param @param name
* @param @param biz
* @param @return 设定文件
* @return WechatAccount 返回类型
*/
private static WechatAccount anaSislyAccount(String url,String htmlBody
,String matchKey,String type)
{
Document document = Jsoup.parse(htmlBody);
if(htmlBody.contains("noresult_part1_container") ||
htmlBody.contains("501 Not Implemented"))
{
logger.info("暂无与“{}”相关的官方认证订阅号。",matchKey);
}else
{
String readurl = htmlBody.split("account_anti_url = \"")[1].split("\";")[0];
Elements element = document.select("div.news-box")
.select("ul.news-list2").select("li");
// System.out.println("数据大小:"+element.size());
logger.info("数据大小:"+element.size());
for (int i = 0; i < element.size(); i++)
{
String openid = element.get(i).attr("d");
String userName = element.get(i).select("div.gzh-box2")
.select("div.txt-box").select("p.tit").text();
String id = element.get(i).select("div.gzh-box2")
.select("div.txt-box").select("p.info")
.select("label").text();
int article_count_month = 0;
int avg_read_month = 0;
String imgurl = element.get(i).select("div.gzh-box2")
.select("div.img-box").select("img").attr("src");
String descript = "";
String info = "";
String bizR = "";
/**只有一个简介或者认证原因或者最近文章**/
try {
if(element.get(i).select("dl").size()==1)
{
String text = element.get(i).select("dl").get(0).text();
if(text.contains("功能介绍"))
{
descript = text;
}
if(text.contains("认证"))
{
info = text;
}
if(text.contains("最近文章"))
{
// bizR = element.get(i).select("dl").get(0).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
}
}
/**只有简介、认证原因 或者 简介、最近文章 或者 认证原因、最近文章**/
if(element.get(i).select("dl").size()==2)
{
String text = element.get(i).select("dl").get(0).text();
String text2 = element.get(i).select("dl").get(1).text();
/**有简介;认证或者最近文章*/
if(text.contains("功能介绍") )
{
descript = text;
if(text2.contains("认证"))
{
info = text2;
}else if(text2.contains("最近文章")){
// bizR = element.get(i).select("dl").get(1).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
}
}else if(text.contains("认证"))
{
info = text;
// bizR = element.get(i).select("dl").get(1).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
}
}
if(element.get(i).select("dl").size()==3)
{
descript = element.get(i).select("dl").get(0).text();
info = element.get(i).select("dl").get(1).text();
// bizR = element.get(i).select("dl").get(2).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
}
} catch (Exception e) {
e.printStackTrace();
}
switch (type) {
case "name":
// System.out.println(userName+"========="+matchKey);
if (userName.equals(matchKey)) {
String avg = getAvgRead(url, readurl, openid);
if (avg != null) {
article_count_month = Integer.valueOf(avg.split(",")[0]);
avg_read_month = Integer.valueOf(avg.split(",")[1]);
}
return new WechatAccount(id, userName, bizR, imgurl, descript, info, openid,
article_count_month, avg_read_month);
}
break;
case "id":
// System.out.println(id+"=========="+matchKey);
if (id.equals(matchKey)) {
String avg = getAvgRead(url, readurl, openid);
if (avg != null) {
article_count_month = Integer.valueOf(avg.split(",")[0]);
avg_read_month = Integer.valueOf(avg.split(",")[1]);
}
return new WechatAccount(id, userName, bizR, imgurl, descript, info, openid,
article_count_month, avg_read_month);
}
break;
}
}
}
return null;
}
/**
* @Title: anaSislyAccount
* @Description: TODO(解析并获取帐号列表)
* @param @param htmlBody
* @param @param name
* @param @param biz
* @param @return 设定文件
* @return WechatAccount 返回类型
*/
private static List<WechatAccount> anaSislyAccountList(String url,String htmlBody)
{
List<WechatAccount> list = new ArrayList<WechatAccount>();
Document document = Jsoup.parse(htmlBody);
if(htmlBody.contains("noresult_part1_container") ||
htmlBody.contains("501 Not Implemented"))
{
logger.info("暂无与“{}”相关的官方认证订阅号。");
}else
{
String readurl = htmlBody.split("account_anti_url = \"")[1].split("\";")[0];
JSONObject avgJson = getAvgRead(url, readurl);
Elements element = document.select("div.news-box")
.select("ul.news-list2").select("li");
logger.info("数据大小:"+element.size());
for (int i = 0; i < element.size(); i++)
{
String openid = element.get(i).attr("d");
String userName = element.get(i).select("div.gzh-box2")
.select("div.txt-box").select("p.tit").text();
String id = element.get(i).select("div.gzh-box2")
.select("div.txt-box").select("p.info")
.select("label").text();
int article_count_month = 0;
int avg_read_month = 0;
String imgurl = element.get(i).select("div.gzh-box2")
.select("div.img-box").select("img").attr("src");
String descript = "";
String info = "";
String bizR = "";
/**只有一个简介或者认证原因或者最近文章**/
if(element.get(i).select("dl").size()==1)
{
String text = element.get(i).select("dl").get(0).text();
if(text.contains("功能介绍"))
{
descript = text;
}
if(text.contains("认证"))
{
info = text;
}
if(text.contains("最近文章"))
{
// bizR = element.get(i).select("dl").get(0).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
}
}
/**只有简介、认证原因 或者 简介、最近文章 或者 认证原因、最近文章**/
if(element.get(i).select("dl").size()==2)
{
String text = element.get(i).select("dl").get(0).text();
String text2 = element.get(i).select("dl").get(1).text();
/**有简介;认证或者最近文章*/
if(text.contains("功能介绍"))
{
descript = text;
if(text2.contains("认证"))
{
info = text2;
}
}
/**有认证和最近文章**/
if(text.contains("认证"))
{
info = text;
}
}
if(element.get(i).select("dl").size()==3)
{
descript = element.get(i).select("dl").get(0).text();
info = element.get(i).select("dl").get(1).text();
}
if(avgJson!=null && avgJson.containsKey("msg")){
JSONObject data = avgJson.getJSONObject("msg");
if(openid!=null){
String avg = data.getString(openid);
if (avg != null) {
article_count_month = Integer.valueOf(avg.split(",")[0]);
avg_read_month = Integer.valueOf(avg.split(",")[1]);
}
}
}
WechatAccount wechatAccount = new WechatAccount(id,userName,bizR,imgurl,descript,info,openid,article_count_month,avg_read_month);
list.add(wechatAccount);
}
}
return list;
}
/**
* @Title: getAvgRead
* @Description: TODO(更新平均阅读数)
* @param @param url
* @param @param readUrl
* @param @param openid
* @param @return 设定文件
* @return String 返回类型
*/
private static String getAvgRead(String url,String readUrl,String openid)
{
Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("Referer", url);
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
readUrl = "http://weixin.sogou.com" + readUrl;
try {
String htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(readUrl,headerMap)).body().string();
if(htmlBody != null)
{
JSONObject json = JSONObject.parseObject(htmlBody);
JSONObject data = json.getJSONObject("msg");
if(data.containsKey(openid))
{
return data.getString(openid);
}
}
} catch (Exception e) {
e.printStackTrace();
return null;
}
return null;
}
private static JSONObject getAvgRead(String url,String readUrl) {
Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("Referer", url);
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
readUrl = "http://weixin.sogou.com" + readUrl;
try {
String htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(readUrl,headerMap)).body().string();
if(htmlBody != null)
{
JSONObject json = JSONObject.parseObject(htmlBody);
return json.getJSONObject("msg");
}
return null;
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
}
/**
* 获取微信文章评论
* @Title: WechatComment.java
* @Package com.zhiwei.wechat.comment
* @Description:获取微信文章评论
* @author hero
* @date 2016年6月25日 上午8:17:37
* @version V1.0
*/ /**
*
*/
package com.zhiwei.wechat.comment;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.wechat.entity.WechatComment;
import com.zhiwei.wechat.readAndLike.AriticleContent;
import com.zhiwei.wechat.util.Tools;
/**
 * Fetches the comment list and the comment count of a WeChat article through
 * the {@code mp.weixin.qq.com/mp/appmsg_comment} endpoint.
 *
 * @author hero
 */
public class WechatCommentList {
    private static WechatComment wc = new WechatComment();
    private static Logger logger = LoggerFactory.getLogger(WechatCommentList.class);

    /**
     * Returns the comments of the article at {@code url}.
     *
     * @param url article URL; must carry the {@code __biz=} and {@code mid=} parameters
     * @param key query-string fragment appended to the comment request
     * @return the parsed comments, or {@code null} when the article has no
     *         comments, a required value cannot be obtained, or any request or
     *         parsing step fails
     */
    public static List<WechatComment> getWechatCommentList(String url,String key)
    {
        String urlcookie = url;
        if(!url.contains("key")){
            urlcookie = Tools.getWechatCookieUrl(url, key);
        }
        Map<String,String> headerMap = Tools.getWechatHeader();
        try {
            Map<String, String> cookieMap = HttpClientTemplateOK.getCookie(urlcookie, null, headerMap);
            headerMap.put("Referer", url);
            String cookie = cookieMap.get("cookie");
            // Short values are not forwarded, as in the original length check.
            if(cookie != null && cookie.length() > 50){
                headerMap.put("Cookie", cookie);
            }
            String appmsg_token = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
            String biz = extractParam(url, "__biz=");
            String appmsgid = extractParam(url, "mid=");
            if(biz == null || appmsgid == null)
            {
                logger.info("Article url lacks __biz or mid parameter: {}", url);
                return null;
            }
            String comment_id = AriticleContent.getCommentId(url,key);
            if(comment_id == null || appmsg_token == null)
            {
                return null;
            }
            String comment_url = "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz
                    + "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key
                    + "&appmsg_token=" + appmsg_token;
            logger.debug("comment url: {}", comment_url);
            if("0".equals(comment_id))
            {
                logger.info("此条微信文章没有评论");
                return null;
            }
            try {
                String htmlBody = HttpClientTemplateOK.get(comment_url, null, headerMap);
                if(htmlBody != null)
                {
                    JSONObject json = JSON.parseObject(htmlBody);
                    return wc.constructWechatComment(json.getJSONArray("elected_comment"),url);
                }
            } catch (Exception e) {
                logger.info("解析微信文章评论列表时出现问题:", e);
                return null;
            }
        } catch (IOException e1) {
            logger.info("I/O failure while fetching comments of {}", url, e1);
            return null;
        } catch (Exception e1) {
            logger.info("Unexpected failure while fetching comments of {}", url, e1);
        }
        return null;
    }

    /**
     * Returns the number of selected comments of the article at {@code url}.
     *
     * @param url article URL; must carry the {@code __biz=} and {@code mid=} parameters
     * @param key query-string fragment appended to the article and comment requests
     * @return the {@code elected_comment_total_cnt} value, {@code 0} when the
     *         comment id is {@code "0"}, or {@code -1} on any failure
     */
    public static int getWechatCommentCount(String url,String key)
    {
        logger.debug("article url: {}", url);
        // Replace the fragment with the key so the cookie request is authenticated.
        String url_new = url;
        if(url.contains("#rd"))
        {
            url_new = url.split("#rd")[0] + key;
        }else if(url.contains("#wechat_redirect"))
        {
            url_new = url.split("#wechat_redirect")[0] + key;
        }
        String biz = extractParam(url, "__biz=");
        String appmsgid = extractParam(url, "mid=");
        if(biz == null || appmsgid == null)
        {
            logger.info("Article url lacks __biz or mid parameter: {}", url);
            return -1;
        }
        Map<String,String> headerMap = Tools.getWechatHeader();
        String comment_id = AriticleContent.getCommentId(url,key);
        if(comment_id == null)
        {
            logger.info("获取评论id失败");
            return -1;
        }
        if("0".equals(comment_id))
        {
            logger.info("此条微信文章没有评论");
            return 0;
        }
        String comment_url = "http://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz
                + "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key;
        try {
            Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url_new, null,headerMap);
            String cookie = cookieMap.get("cookie");
            if(cookie != null)
            {
                headerMap.put("Cookie", cookie);
            }
            String htmlBody = HttpClientTemplateOK.get(comment_url, null,headerMap);
            logger.debug("comment response: {}", htmlBody);
            if(htmlBody != null)
            {
                JSONObject json = JSON.parseObject(htmlBody);
                return json.getIntValue("elected_comment_total_cnt");
            }
        } catch (Exception e) {
            // The original format string had no placeholder, so the message was never printed.
            logger.debug("更新微信文章评论数时出现问题,问题信息:{}",e.getMessage());
            return -1;
        }
        return -1;
    }

    /** Returns the text following the first {@code marker} up to the next '&', or null when the marker is absent. */
    private static String extractParam(String url, String marker)
    {
        int start = url.indexOf(marker);
        if (start < 0) {
            return null;
        }
        start += marker.length();
        int end = url.indexOf('&', start);
        return end < 0 ? url.substring(start) : url.substring(start, end);
    }
}
package com.zhiwei.wechat.entity;
import java.io.Serializable;
/**
* @ClassName: WechatAccount
* @Description: TODO(微信帐号)
* @author Bewilder Z
* @date 2017年1月16日 上午11:44:21
*/
public class WechatAccount implements Serializable{
private static final long serialVersionUID = -5179735277202327683L;
private String id; //公号id
private String name; //公号昵称
private String biz; //公号biz
private String imgurl; //头像地址
private String descript; //描述
private String verified_reason; //认证原因
private String openid; //认证原因
private int article_count_month; //月发文量
private int avg_read_month; //月平均阅读数
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getBiz() {
return biz;
}
public void setBiz(String biz) {
this.biz = biz;
}
public String getImgurl() {
return imgurl;
}
public void setImgurl(String imgurl) {
this.imgurl = imgurl;
}
public String getDescript() {
return descript;
}
public void setDescript(String descript) {
this.descript = descript;
}
public String getVerified_reason() {
return verified_reason;
}
public void setVerified_reason(String verified_reason) {
this.verified_reason = verified_reason;
}
public String getOpenid() {
return openid;
}
public void setOpenid(String openid) {
this.openid = openid;
}
public int getArticle_count_month() {
return article_count_month;
}
public void setArticle_count_month(int article_count_month) {
this.article_count_month = article_count_month;
}
public int getAvg_read_month() {
return avg_read_month;
}
public void setAvg_read_month(int avg_read_month) {
this.avg_read_month = avg_read_month;
}
@Override
public String toString()
{
return "new WechatAccount["
+ "id = " + id
+ ", name = " + name
+ ", biz = " + biz
+ ", imgurl = " + imgurl
+ ", descript = " + descript
+ ", verified_reason = " + verified_reason
+ ", openid = " + openid
+ ", article_count_month = " + article_count_month
+ ", avg_read_month = " + avg_read_month
+ "]";
}
public WechatAccount(){}
public WechatAccount(String id,String name, String biz,
String imgurl,String descript,String verified_reason,
String openid, int article_count_month,int avg_read_month)
{
this.id = id;
this.name = name;
this.biz = biz;
this.imgurl = imgurl;
this.descript = descript;
this.verified_reason = verified_reason;
this.openid = openid;
this.article_count_month = article_count_month;
this.avg_read_month = avg_read_month;
}
}
/**
* @Title: Wechat.java
* @Package com.zhiwei.wechat.bean
* @Description: TODO(用一句话描述该文件做什么)
* @author zhiweizhang
* @date 2015年11月5日 下午4:37:10
* @version V1.0
*/ /**
*
*/
package com.zhiwei.wechat.entity;
import java.util.Date;
/**
* @ClassName: Wechat
* @Description: TODO(微信文章)
* @author Bewilder Z
* @date 2015年11月5日 下午4:37:10
*/
public class WechatAricle {
private String id; //主键
private String title; //标题
private String source; //来源
private String content; //内容
private String imgUrl; //图片地址
private Date time; //发布时间
private int readNum; //阅读数
private int likeNum; //点赞数
private String openId;//openid
private String isFirst; //是否为头条文章
public String getIsFirst() {
return isFirst;
}
public void setIsFirst(String isFirst) {
this.isFirst = isFirst;
}
public String getOpenId() {
return openId;
}
public void setOpenId(String openId) {
this.openId = openId;
}
public String getImgUrl() {
return imgUrl;
}
public void setImgUrl(String imgUrl) {
this.imgUrl = imgUrl;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getSource() {
return source;
}
public void setSource(String source) {
this.source = source;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public Date getTime() {
return time;
}
public void setTime(Date time) {
this.time = time;
}
public int getReadNum() {
return readNum;
}
public void setReadNum(int readNum) {
this.readNum = readNum;
}
public int getLikeNum() {
return likeNum;
}
public void setLikeNum(int likeNum) {
this.likeNum = likeNum;
}
public WechatAricle(){}
public WechatAricle(String id,String title,String source,String content
,Date time,int readNum,int likeNum,String openId,String isFirst)
{
this.id = id.replaceAll("amp;", "");
this.title = title;
this.source = source;
this.content = content;
this.time = time;
this.readNum = readNum;
this.likeNum = likeNum;
this.openId = openId;
this.isFirst = isFirst;
}
@Override
public String toString()
{
return "new Wechat["
+ "id = " + id + ","
+ "title = " + title + ","
+ "source = " + source + ","
+ "content = " + content + ","
+ "time = " + time + ","
+ "readNum = " + readNum + ","
+ "likeNum = " + likeNum + ","
+ "openId = " + openId + ","
+ "isFirst = " + isFirst + ","
+ "]";
}
}
/**
* 微信文章评论数据
* @Title: WechatComment.java
* @Package com.zhiwei.wechat.entity
* @Description:微信文章评论数据
* @author Bewilder Z
* @date 2016年6月24日 下午5:12:11
* @version V1.0
*/
package com.zhiwei.wechat.entity;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONException;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * Plain data holder for one comment of a WeChat article, together with the
 * JSON-to-object conversion used by the comment crawler.
 *
 * @author hero
 */
public class WechatComment implements Serializable{
    private static final long serialVersionUID = -1232983839480666507L;
    private int id;
    private int my_id;
    private String nick_name;
    private String content;
    private Date time;
    private int like_id;
    private int like_num;
    private int like_status;
    private int is_from_friend;
    private int is_from_me;
    private String from_url;

    public String getFrom_url() {
        return from_url;
    }
    public void setFrom_url(String from_url) {
        this.from_url = from_url;
    }
    public int getId() {
        return id;
    }
    public void setId(int id) {
        this.id = id;
    }
    public int getMy_id() {
        return my_id;
    }
    public void setMy_id(int my_id) {
        this.my_id = my_id;
    }
    public String getNick_name() {
        return nick_name;
    }
    public void setNick_name(String nick_name) {
        this.nick_name = nick_name;
    }
    public String getContent() {
        return content;
    }
    public void setContent(String content) {
        this.content = content;
    }
    public int getLike_id() {
        return like_id;
    }
    public void setLike_id(int like_id) {
        this.like_id = like_id;
    }
    public int getLike_num() {
        return like_num;
    }
    public void setLike_num(int like_num) {
        this.like_num = like_num;
    }
    public int getLike_status() {
        return like_status;
    }
    public void setLike_status(int like_status) {
        this.like_status = like_status;
    }
    public int getIs_from_friend() {
        return is_from_friend;
    }
    public void setIs_from_friend(int is_from_friend) {
        this.is_from_friend = is_from_friend;
    }
    public int getIs_from_me() {
        return is_from_me;
    }
    public void setIs_from_me(int is_from_me) {
        this.is_from_me = is_from_me;
    }
    public Date getTime() {
        return time;
    }
    public void setTime(Date time) {
        this.time = time;
    }

    /** Renders every field as {@code name = value}, separated by commas. */
    @Override
    public String toString()
    {
        return "new WechatComment["
                + "id = " + id
                + ",my_id = " + my_id
                + ",nick_name = " + nick_name
                + ",content = " + content
                + ",time = " + time
                + ",like_id = " + like_id
                + ",like_num = " + like_num
                + ",like_status = " + like_status
                + ",is_from_friend = " + is_from_friend
                + ",is_from_me = " + is_from_me
                + ",from_url = " + from_url
                + "]";
    }

    /**
     * Builds a comment from its JSON representation.
     *
     * @param json comment object; {@code null} leaves every field at its default
     * @param url  article URL stored as {@code from_url}
     * @throws Exception if a JSON value cannot be converted
     */
    public WechatComment(JSONObject json,String url)throws Exception
    {
        super();
        init(json,url);
    }

    /** Creates a comment with every field left at its default value. */
    public WechatComment(){}

    /**
     * Copies the JSON values into the fields. Missing numeric keys are read
     * as {@code 0} and a missing {@code create_time} leaves {@code time}
     * {@code null}, instead of failing with a {@link NullPointerException}.
     */
    private void init(JSONObject json,String url) throws Exception {
        if(json == null){
            return;
        }
        try {
            id = json.getIntValue("id");
            my_id = json.getIntValue("my_id");
            nick_name = json.getString("nick_name");
            content = json.getString("content");
            Long createTime = json.getLong("create_time");
            // create_time is multiplied by 1000 before being handed to the parser.
            time = (createTime == null)
                    ? null
                    : TimeParse.stringFormartDate(createTime * 1000L + "");
            like_id = json.getIntValue("like_id");
            like_num = json.getIntValue("like_num");
            like_status = json.getIntValue("like_status");
            is_from_friend = json.getIntValue("is_from_friend");
            is_from_me = json.getIntValue("is_from_me");
            from_url = url;
        } catch (JSONException jsone) {
            throw new Exception(jsone.getMessage() + ":" + json.toString(), jsone);
        }
    }

    /**
     * Converts a JSON array of comments into a list.
     *
     * @param list comment array; {@code null} yields an empty list
     * @param url  article URL stored on every comment
     * @return one {@link WechatComment} per array element, in order
     * @throws Exception if an element cannot be converted
     */
    public List<WechatComment> constructWechatComment(JSONArray list,String url) throws Exception
    {
        if (list == null) {
            return new ArrayList<WechatComment>();
        }
        try {
            int size = list.size();
            List<WechatComment> wcList = new ArrayList<WechatComment>(size);
            for (int i = 0; i < size; i++) {
                wcList.add(new WechatComment(list.getJSONObject(i),url));
            }
            return wcList;
        } catch (JSONException e)
        {
            throw new Exception("解析微信文章评论bug:"+e.getMessage(),e);
        }
    }
}
/**
* @Title: WeChatReadLike.java
* @Package com.wcral.bean
* @Description: TODO(用一句话描述该文件做什么)
* @author Abner Liu
* @date 2015年8月6日 上午9:52:36
* @version V1.0
*/
/**
*
*/
package com.zhiwei.wechat.entity;
/**
 * Plain data holder for the read and like counters of a WeChat article.
 *
 * @author Bewilder Z
 */
public class WechatReadLike {
    /** Article URL. */
    private String url;
    /** Read counter. */
    private int read;
    /** Like counter. */
    private int like;
    /** Value of the {@code real_read} counter. */
    private int real_read;

    public String getUrl() { return this.url; }

    public void setUrl(String url) { this.url = url; }

    public int getRead() { return this.read; }

    public void setRead(int read) { this.read = read; }

    public int getLike() { return this.like; }

    public void setLike(int like) { this.like = like; }

    public int getReal_read() { return this.real_read; }

    public void setReal_read(int real_read) { this.real_read = real_read; }
}
/**
* 抓取微信公号历史文章数据
* @Title: WechatDataFromHistory.java
* @Package com.zhiwei.wechat.history
* @Description:抓取微信公号历史文章数据
* @author hero
* @date 2016年5月20日 上午10:27:19
* @version V1.0
*/ /**
*
*/
package com.zhiwei.wechat.history;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.tools.timeparse.TimeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.wechat.entity.WechatAricle;
import com.zhiwei.wechat.entity.WechatReadLike;
import com.zhiwei.wechat.readAndLike.AriticleContent;
import com.zhiwei.wechat.readAndLike.WeChatReadAndLike;
import com.zhiwei.wechat.util.Tools;
/**
* @Description:抓取微信公号历史文章数据
* @author Bewilder Z
* @date 2016年5月20日 上午10:27:19
*/
public class WechatDataFromHistory {
// Logger shared by all instances of this crawler.
private static final Logger log = LoggerFactory.getLogger(WechatDataFromHistory.class);
private boolean updateLike = false; // whether like/read counts are refreshed
private Date endDate = null; // end time of the crawl
private List<WechatAricle> result; // accumulated result list
private Map<String,String> headerMap; // request headers
private boolean follow = false; // "followed" flag; when set, paging beyond the first page is enabled
private String nextId; // id used to fetch the next page
private String key; // key used when refreshing like/read counts
private boolean next = true; // whether there is a next page
/**
 * Creates a history crawler with an empty result list and the request
 * headers supplied by {@code Tools.getWechatHeader()}.
 *
 * @param updateLike whether like and read counts are refreshed
 * @param endDate    end date of the crawl in {@code yyyy-MM-dd} form;
 *                   {@code null} falls back to {@code "2011-12-30"}
 * @param follow     stored in the {@code follow} flag
 */
public WechatDataFromHistory(boolean updateLike,String endDate,
        boolean follow)
{
    String cutoff = (endDate != null) ? endDate : "2011-12-30";
    this.follow = follow;
    this.updateLike = updateLike;
    this.result = new ArrayList<WechatAricle>();
    this.headerMap = Tools.getWechatHeader();
    this.endDate = TimeUtil.parseTime(cutoff, "yyyy-MM-dd");
}

/** Creates an instance without initialising the result list, headers or end date. */
public WechatDataFromHistory(){}
/**
 * Checks whether {@code key} is still usable by requesting the read count of
 * a fixed, hard-coded article with it.
 *
 * @author hero
 * @param key   key under test
 * @param proxy proxy handed to the read/like request
 * @return {@code true} when the reported read count is greater than zero;
 *         {@code false} otherwise or when the request fails
 */
public static boolean validateKey(String key,Proxy proxy){
    final String probeUrl = "http://mp.weixin.qq.com/s?__biz=MzIwNDk0NzEyOQ==&mid=2247484544&idx=2&sn=f64abc4b15badd77b70ca942bc5176d3&scene=0#wechat_redirect";
    boolean valid;
    try {
        WechatReadLike counters = WeChatReadAndLike.getReadAndLike(probeUrl, key,proxy);
        valid = counters.getRead() > 0;
    } catch (Exception e) {
        log.debug("验证微信key有效性时出现问题,问题为:{}",e.getMessage());
        valid = false;
    }
    return valid;
}
/**
 * Crawls the history articles of an official account, starting from the
 * given history page and then requesting further pages ten entries at a time
 * for as long as the {@code next} flag stays set.
 *
 * @author hero
 * @param url   history page URL; when like/read refreshing is enabled it must
 *              contain {@code "uin"}, from which the key is cut out
 * @param proxy proxy handed to every request
 * @return the accumulated result list, or {@code null} when the first
 *         request fails
 * @throws ArrayIndexOutOfBoundsException when like/read refreshing is enabled
 *         and {@code url} does not contain {@code "uin"}
 */
public List<WechatAricle> getWechatDataFromHistory(String url,Proxy proxy)
{
    log.info("url:::::::::{}",url);
    if(updateLike)
    {
        key = "&uin"+url.split("uin")[1].split("devicetype")[0];
    }
    String firstText = null;
    try {
        Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url, proxy, headerMap);
        // The first page is only requested once a cookie has been obtained.
        if(cookieMap.get("cookie")!=null){
            headerMap.put("Referer", url);
            headerMap.put("Cookie", cookieMap.get("cookie"));
            firstText = HttpClientTemplateOK.get(url, proxy,headerMap);
        }
    } catch (Exception e) {
        log.error("Failed to load the first history page: {}", url, e);
        return null;
    }
    if(firstText == null){
        next = false;
        return result;
    }
    // Parses the first page and yields the token required by the paging requests.
    String appToken = getFirst(firstText,proxy);
    if(follow)
    {
        next = true;
    }
    int i = 1;
    while(next)
    {
        String nextUrl = url.replace("home", "getmsg") + "&f=json&&offset=" + i*10 + "&count=10&scene=123&is_ok=1&appmsg_token="+appToken;
        log.info("下一页地址:{}", nextUrl);
        try {
            headerMap.put("Referer", nextUrl);
            String nextJson = HttpClientTemplateOK.get(nextUrl, proxy,headerMap);
            nextId = getNext(nextJson,proxy);
            // Pause between page requests.
            ZhiWeiTools.sleep(3000);
        } catch (Exception e) {
            log.error("Failed to load history page: {}", nextUrl, e);
            next = false;
        }
        i++;
    }
    return result;
}
/**
 * Crawls the history articles of an official account using the older
 * {@code frommsgid}-based paging.
 *
 * @param url   history page URL; when like/read refreshing is enabled it must
 *              contain {@code "uin"}, from which the key is cut out
 * @param proxy proxy handed to every request, including the paging requests
 * @return the accumulated result list, or {@code null} when the first
 *         request fails
 * @throws ArrayIndexOutOfBoundsException when like/read refreshing is enabled
 *         and {@code url} does not contain {@code "uin"}
 */
@Deprecated
public List<WechatAricle> getWechatDataFromHistoryOld(String url,Proxy proxy)
{
    log.info("url:::::::::{}",url);
    if(updateLike)
    {
        key = "&uin"+url.split("uin")[1].split("devicetype")[0];
    }
    String firstText = null;
    try {
        Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url, proxy,headerMap);
        headerMap.put("Referer", url);
        headerMap.put("Cookie", cookieMap.get("cookie"));
        firstText = HttpClientTemplateOK.get(url, proxy,headerMap);
    } catch (Exception e) {
        log.error("Failed to load the first history page: {}", url, e);
        return null;
    }
    if(firstText == null)
    {
        // Nothing to parse; avoid dereferencing a missing page body.
        return result;
    }
    // Parses the first page and yields the id from which the next page starts.
    nextId = getFirstOld(firstText,proxy);
    // Further pages are only followed when the follow flag is set.
    boolean hasNext = follow;
    while(hasNext && nextId != null)
    {
        String nextUrl = url.replace("home", "getmsg") + "&f=json&frommsgid=" + nextId + "&count=10&scene=123&is_ok=1";
        log.info("下一页地址:{}", nextUrl);
        try {
            headerMap.put("Referer", nextUrl);
            // Use the caller's proxy here as well, like the first request does.
            String nextJson = HttpClientTemplateOK.get(nextUrl, proxy,headerMap);
            nextId = getNext(nextJson,proxy);
            log.debug("nextId: {}", nextId);
            ZhiWeiTools.sleep(3000);
        } catch (Exception e) {
            log.error("Failed to load history page: {}", nextUrl, e);
            hasNext = false;
        }
    }
    return result;
}
/**
 * Parses the first history page for the legacy paging flow.
 *
 * <p>Strips backslashes and single quotes, decodes a handful of HTML entities, cuts the
 * JSON that follows "msgList = " out of the page and passes it to
 * {@code getNextIdAndAnalysis}.
 *
 * @param fristText raw HTML of the first history page
 * @param proxy     proxy handed on to {@code getNextIdAndAnalysis}
 * @return the value returned by {@code getNextIdAndAnalysis}, or {@code null} when the
 *         page contains no "msgList =" marker
 */
@Deprecated
public String getFirstOld(String fristText,Proxy proxy)
{
    // Remove escaping and decode entities so the embedded JSON becomes parseable.
    String cleaned = fristText.replace("\\", "");
    cleaned = cleaned.replace("'", "");
    cleaned = cleaned.replace("&nbsp;", " ");
    cleaned = cleaned.replace("&quot;", "\"");
    cleaned = cleaned.replace("&amp;", "&");
    cleaned = cleaned.replace("amp;", "");
    cleaned = cleaned.replace("&#39", "'");
    cleaned = cleaned.replace("&gt;", ">");
    cleaned = cleaned.replace("&lt;", "<");
    cleaned = cleaned.replace("&yen;", "¥");

    log.info("开始解析第一页文章");

    if (!cleaned.contains("msgList =")) {
        return null;
    }
    // Keep the text between "msgList = " and the closing "}}]};", then restore the
    // closing brackets without the trailing semicolon.
    String afterMarker = cleaned.split("msgList = ")[1];
    String msgListJson = afterMarker.split("}}]};")[0] + "}}]}";
    return getNextIdAndAnalysis(msgListJson, proxy);
}
/**
 * Parses the first history page and extracts the appmsg_token used for paging.
 *
 * <p>The articles embedded after "msgList = " are parsed through
 * {@code getNextIdAndAnalysis} as a side effect.
 *
 * @author hero
 * @param fristText raw HTML of the first history page
 * @param proxy     proxy handed on to {@code getNextIdAndAnalysis}
 * @return the appmsg_token, or {@code null} when either marker is missing or the
 *         extraction/parsing throws
 */
private String getFirst(String fristText,Proxy proxy)
{
    // Remove escaping and decode entities so the embedded JSON becomes parseable.
    String page = fristText.replace("\\", "");
    page = page.replace("'", "");
    page = page.replace("&nbsp;", " ");
    page = page.replace("&quot;", "\"");
    page = page.replace("&amp;", "&");
    page = page.replace("amp;", "");
    page = page.replace("&#39", "'");
    page = page.replace("&gt;", ">");
    page = page.replace("&lt;", "<");
    page = page.replace("&yen;", "¥");

    log.info("开始解析第一页文章");

    boolean hasToken = page.contains("window.appmsg_token = ");
    boolean hasMsgList = page.contains("msgList =");
    if (!hasToken || !hasMsgList) {
        log.info("获取下一页数据参数出现问题....{}",page);
        return null;
    }

    try {
        String appMsgToken = page.split("window.appmsg_token = \"")[1].split("\";")[0];
        // From here on "page" holds only the message-list JSON; the catch block logs
        // whichever form the variable has when the failure happens.
        page = page.split("msgList = ")[1].split("}}]};")[0]+"}}]}";
        getNextIdAndAnalysis(page, proxy);
        return appMsgToken;
    } catch (Exception e) {
        log.info("截取下一页数据参数出现问题:{}",page);
        return null;
    }
}
/**
 * Parses the JSON of a follow-up history page.
 *
 * <p>The "general_msg_list" member is forwarded to {@code getNextIdAndAnalysis}. When it
 * is missing, or anything throws, the {@code next} field is cleared.
 *
 * @param nextHtml JSON text returned for the follow-up page
 * @param proxy    proxy handed on to {@code getNextIdAndAnalysis}
 * @return the "can_msg_continue" value rendered as text, or {@code null} on failure
 */
private String getNext(String nextHtml,Proxy proxy)
{
    try {
        JSONObject page = JSONObject.parseObject(nextHtml);
        String messageList = page.getString("general_msg_list");
        if (messageList == null) {
            log.info("下一页数据解析出现问题:{}", nextHtml);
            next = false;
            return null;
        }
        getNextIdAndAnalysis(messageList, proxy);
        // A missing flag is rendered as the text "null", exactly like string concatenation.
        return String.valueOf(page.getInteger("can_msg_continue"));
    } catch (Exception e) {
        log.info("解析数据有问题:{}", nextHtml);
        next = false;
        return null;
    }
}
/**
 * Parses one page of history messages and appends the articles to the {@code result} field.
 *
 * <p>For every entry of "list" the {@code nextId} field is set to the message id; when
 * the message time lies before {@code endDate}, {@code next} is cleared and
 * {@code nextId} is reset to {@code null}. An empty list clears both as well.
 *
 * @param text  JSON text holding a "list" array of messages
 * @param proxy proxy handed on to {@code setWechat}
 * @return the {@code nextId} field as left after processing the page
 */
public String getNextIdAndAnalysis(String text,Proxy proxy)
{
    JSONArray messages = JSONObject.parseObject(text).getJSONArray("list");
    if (messages.size() == 0) {
        nextId = null;
        next = false;
        return nextId;
    }

    for (int idx = 0; idx < messages.size(); idx++) {
        JSONObject message = messages.getJSONObject(idx);

        // "datetime" is multiplied by 1000 to build the Date.
        JSONObject info = message.getJSONObject("comm_msg_info");
        long epochSeconds = info.getLong("datetime");
        Date publishedAt = new Date(epochSeconds * 1000);

        nextId = info.getString("id");
        if (publishedAt.before(endDate)) {
            next = false;
            nextId = null;
        }

        JSONObject headline = message.getJSONObject("app_msg_ext_info");
        if (headline == null) {
            log.info("不存在相关文章......");
            continue;
        }

        // Headline article of the message.
        String headlineUrl = headline.getString("content_url");
        String headlineDigest = headline.getString("digest");
        String headlineTitle = headline.getString("title");
        String headlineCover = headline.getString("cover");
        result.add(setWechat(headlineUrl, headlineTitle, publishedAt,
                headlineCover, headlineDigest, "true", proxy));

        // Remaining articles bundled with the same message.
        JSONArray extras = headline.getJSONArray("multi_app_msg_item_list");
        if (extras == null) {
            log.info("只有一条数据");
            continue;
        }
        for (int k = 0; k < extras.size(); k++) {
            JSONObject extra = extras.getJSONObject(k);
            String extraUrl = extra.getString("content_url");
            String extraDigest = extra.getString("digest");
            String extraTitle = extra.getString("title");
            String extraCover = extra.getString("cover");
            result.add(setWechat(extraUrl, extraTitle, publishedAt,
                    extraCover, extraDigest, "false", proxy));
        }
    }
    return nextId;
}
/**
 * Builds a {@code WechatAricle} from the parsed list fields and the fetched article page.
 *
 * @param url      article URL; also stored as the article id
 * @param title    article title
 * @param datetime publication time
 * @param imgUrl   cover image URL
 * @param content  digest from the list; replaced by the fetched article text when
 *                 {@code AriticleContent.getAriticleContent} returns a map
 * @param isFirst  "true" for the headline article of a message, "false" otherwise
 * @param proxy    proxy used when requesting the like/read counters
 * @return the populated article; like/read counters are only set when the
 *         {@code updateLike} field is true
 */
private WechatAricle setWechat(String url,String title,
        Date datetime,String imgUrl,String content,String isFirst,Proxy proxy)
{
    WechatAricle wechat = new WechatAricle();
    wechat.setId(url);
    wechat.setTitle(title);
    wechat.setTime(datetime);
    wechat.setImgUrl(imgUrl);
    wechat.setIsFirst(isFirst);

    // Fetch the article page for its account name and full text.
    String source = null;
    Map<String,String> sacMap = AriticleContent.getAriticleContent(url);
    if (sacMap != null) {
        source = sacMap.get("source");
        content = sacMap.get("content");
    }

    // Refresh the like/read counters.
    if (updateLike) {
        // Two passes, as before, so an "amp;" formed by the first removal is dropped too.
        url = url.replace("amp;", "").replace("amp;", "");
        try {
            Thread.sleep(2000);
            WechatReadLike wcrl = WeChatReadAndLike.getReadAndLike(url,key,proxy);
            wechat.setLikeNum(wcrl.getLike());
            wechat.setReadNum(wcrl.getRead());
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up can see the interruption.
            Thread.currentThread().interrupt();
            wechat.setLikeNum(-1);
            wechat.setReadNum(-1);
            log.error("获取点赞阅读数出现为题,问题:{}", e.getMessage());
        }
    }

    wechat.setContent(content);
    wechat.setSource(source);
    return wechat;
}
public static void main(String[] args) {
    // Manual check: print a JSON-escaped article URL with its backslashes removed.
    String url = "http:\\/\\/mp.weixin.qq.com\\/s?__biz=MjM5NTU0MzI0MA==&mid=2661648551&idx=1&sn=74397ab60184beb0abd4dd3f8c62f7d3&chksm=bda7c9008ad04016a5eac88c8dd18b6bc5797ae780c56e307e11781af257a68a52b7f87dfd8e&scene=27#wechat_redirect";
    // replaceAll("\\", "") compiles a lone backslash as a regex and throws
    // PatternSyntaxException; replace() treats the backslash literally.
    System.out.println(url.replace("\\", ""));
}
}
/**
* 文章具体内容
* @Title: AriticleContent.java
* @Package com.zhiwei.wechat.readAndLike
* @Description:文章具体内容
* @author Bewilder Z
* @date 2016年5月20日 下午2:39:46
* @version V1.0
*/
package com.zhiwei.wechat.readAndLike;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.wechat.comment.WechatCommentList;
import com.zhiwei.wechat.util.Tools;
/**
 * Reads details from a WeChat article page: body text, account name and comment id.
 *
 * @author hero
 * @date 2016年5月20日 下午2:39:46
 */
public class AriticleContent{
    // Use this class as the logger category (it previously logged as WechatCommentList).
    private static final Logger logger = LoggerFactory.getLogger(AriticleContent.class);

    /**
     * Downloads an article page and extracts its text and account name.
     *
     * @param url article URL; every "amp;" fragment is removed before the request
     * @return a map with the keys "content" and "source", or {@code null} when the
     *         download or the extraction throws
     */
    public static Map<String,String> getAriticleContent(String url)
    {
        Map<String,String> result = new HashMap<String,String>();
        Map<String,String> headerMap = Tools.getWechatHeader();
        // Two passes, as before, so an "amp;" formed by the first removal is dropped too.
        url = url.replaceAll("amp;", "").replaceAll("amp;", "");
        headerMap.put("Referer", url);
        try {
            String htmlBody = HttpClientTemplateOK.get(url, null, headerMap);
            Document document = Jsoup.parse(htmlBody);
            String content = document.select("div.rich_media_content").text();
            String source;
            if (htmlBody.contains("var nickname = ")) {
                // Prefer the account name embedded in the page script.
                source = htmlBody.split("var nickname = \"")[1].split("\";")[0];
            } else {
                source = document.select("div#meta_content").select("span#profileBt").text();
            }
            result.put("content", content);
            result.put("source", source);
            return result;
        } catch (Exception e) {
            // Pass the exception itself: the message has no placeholder, so the former
            // e.getMessage() argument never appeared in the log.
            logger.debug("获取微信文章内容或来源时出现问题,", e);
            return null;
        }
    }

    /**
     * Reads the comment id embedded in the scripts of an article page.
     *
     * @param url article URL; "amp;" fragments are removed and "&amp;key=" plus {@code key}
     *            is appended before the request
     * @param key key appended to the URL
     * @return the text following {@code var comment_id = "} up to the next quote,
     *         or {@code null} when the page is missing or the extraction throws
     */
    public static String getCommentId(String url,String key)
    {
        Map<String,String> headerMap = new HashMap<String,String>();
        url = url.replaceAll("amp;", "")+"&key="+key;
        headerMap.put("Referer", url);
        String comment_id = null;
        try {
            String htmlBody = HttpClientTemplateOK.get(url, null, headerMap);
            if (htmlBody != null) {
                Document document = Jsoup.parse(htmlBody);
                String content = document.select("script").html();
                comment_id = content.split("var comment_id = \"")[1].split("\"")[0];
                logger.info("comment_id : " + comment_id);
            }
        } catch (Exception e) {
            // Pass the exception itself so the failure reason is actually logged.
            logger.debug("获取微信文章评论id时出现问题,", e);
            return null;
        }
        return comment_id;
    }

    /** Manual check: prints the comment id of a sample article. */
    public static void main(String[] args) {
        String key = "9ed31d4918c154c810272b09930a4bc0f0cdef34aac4d35a975d8c81fc3261892cf249fedfe7fbacd61a36ecf44d54d1e537ca555379ab6223a63c2c5abf062af1272a2c79a54ee0296ae5d8f22f2092";
        String url = "https://mp.weixin.qq.com/s?__biz=MjM5NTE1NTc0MA==&mid=2652457522&idx=1&sn=66496e63dd39097ffb21545a81c45812&scene=0&key="+key+"&ascene=1&uin=MTE4OTQyMDc0MQ%3D%3D";
        System.out.println(getCommentId(url,key));
    }
}
/**
* @Title: WeChatReadAndLike.java
* @Package com.zhiwei.wechat.readAndLike
* @Description: Fetches the like and read counters of WeChat articles
* @author Bewilder Z
* @date 2015年8月6日 上午9:13:37
* @version V1.0
*/
package com.zhiwei.wechat.readAndLike;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.wechat.entity.WechatReadLike;
import com.zhiwei.wechat.search.WechatAritcleSearch;
import com.zhiwei.wechat.util.Tools;
/**
 * Fetches like and read counters for WeChat articles, either from the WeChat
 * "getappmsgext" endpoint or from the Sogou WeChat search result page.
 *
 * @author Abner Liu
 * @date 2015年8月6日 上午9:13:37
 */
public class WeChatReadAndLike {
    private static final Logger logger = LoggerFactory.getLogger(WeChatReadAndLike.class);

    /**
     * Requests the like and read counters of an article from WeChat.
     *
     * @param url   article URL
     * @param key   key fragment appended to the URL by {@code Tools.getWechatCookieUrl}
     * @param proxy proxy used for the requests
     * @return an object carrying {@code url} and the counters; both counters are -1 when
     *         any step fails. Never {@code null}.
     */
    public static WechatReadLike getReadAndLike(String url,String key,Proxy proxy){
        WechatReadLike wLike = new WechatReadLike();
        try {
            String urlcookie = Tools.getWechatCookieUrl(url, key);
            // Request headers.
            Map<String,String> headerMap = Tools.getWechatHeader();
            Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(urlcookie, proxy, headerMap);
            headerMap.put("Referer", urlcookie);
            headerMap.put("Cookie", cookieMap.get("cookie")+"");
            String appmsg_token = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
            logger.debug("appmsg_token==========={}", appmsg_token);
            String urlLike = Tools.getWechatLikeUrl(urlcookie,appmsg_token);

            // POST parameters.
            HashMap<String,Object> postMap = new HashMap<String,Object>();
            postMap.put("is_only_read", "1");

            String htsString = HttpClientTemplateOK.post(urlLike, proxy, headerMap ,postMap);
            logger.debug("{}", htsString);

            JSONObject stat = JSONObject.parseObject(htsString).getJSONObject("appmsgstat");
            String like_num = stat.get("like_num").toString();

            // Prefer real_read_num; fall back to read_num when it is absent or "0".
            Object realRead = stat.get("real_read_num");
            String real_read_num = (realRead == null) ? "0" : realRead.toString();
            if (real_read_num.equals("0")) {
                real_read_num = stat.get("read_num").toString();
            }

            wLike.setUrl(url);
            wLike.setRead(Integer.valueOf(real_read_num));
            wLike.setLike(Integer.valueOf(like_num));
        } catch (Exception e) {
            // Report the failure instead of swallowing it; -1 marks "unknown" counters.
            logger.debug("获取点赞阅读数出现问题,url:{}", url, e);
            wLike.setUrl(url);
            wLike.setRead(-1);
            wLike.setLike(-1);
        }
        return wLike;
    }

    /**
     * Looks up the read count of an article on the Sogou WeChat search page.
     *
     * @param word search keyword
     * @param time publication time; only the part before the first space is used, as both
     *             the start and the end of the search range
     * @param link article link that a search hit must equal
     * @param wxId WeChat account id
     * @return an object whose url/read fields are set when a hit equals {@code link}
     *         (left untouched otherwise), or {@code null} when the request or the
     *         parsing throws
     */
    public static WechatReadLike getReadAndLike(String word,
            String time,String link,String wxId){
        WechatReadLike wLike = new WechatReadLike();
        Map<String,String> headerMap = new HashMap<String,String>();
        headerMap.put("Upgrade-Insecure-Requests", "1");
        headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36");
        headerMap.put("Host","weixin.sogou.com");
        if (time.contains(" ")) {
            time = time.split(" ")[0];
        }
        String openid = WechatAritcleSearch.getOpenId(wxId);
        logger.info("openid is {}", openid);
        try {
            String url = "http://weixin.sogou.com/weixin?query=" + URLEncoder.encode(word,"utf-8")
                    + "&type=2&ie=utf8&page=1&interation=&tsn=5&ft="+time + "&et="+ time
                    + "&wxid="+openid+"&usip="+wxId+"&from=tool";
            logger.info("url is {}",url);
            String htmlBody = HttpClientTemplateOK.get(url, null, headerMap);
            if (htmlBody != null) {
                Document document = Jsoup.parse(htmlBody);
                Elements elements = document.select("div.news-box")
                        .select("ul.news-list").select("li");
                for (Element element : elements) {
                    try {
                        String url_link = element.select("div.txt-box").select("h3 >a").attr("href");
                        int readNum;
                        try {
                            readNum = Integer.parseInt(element.select("div.txt-box")
                                    .select("div.s-p").select("span.s1").text().trim());
                            logger.info("readNum is {}", readNum);
                        } catch (NumberFormatException ignored) {
                            // No numeric read count on this hit.
                            readNum = 0;
                        }
                        if (url_link.contains("&chksm=")) {
                            // Drop the chksm parameter but keep everything from "&3rd" on.
                            url_link = url_link.split("&chksm=")[0] + "&3rd" + url_link.split("&3rd")[1];
                        }
                        if (link.equals(url_link)) {
                            wLike.setUrl(link);
                            wLike.setRead(readNum);
                            break;
                        }
                    } catch (Exception ignored) {
                        // A malformed hit must not abort the scan of the remaining hits.
                        continue;
                    }
                }
            }
        } catch (Exception e) {
            // Callers receive null on failure, so nothing is written to wLike here.
            logger.error("通过搜狗微信获取阅读数出现问题,link:{}", link, e);
            return null;
        }
        return wLike;
    }
}
package com.zhiwei.wechat.search;
import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.wechat.entity.WechatAricle;
import com.zhiwei.wechat.util.Tools;
/**
 * Collects WeChat articles by keyword from the Sogou WeChat search pages and looks up
 * the openid of an account.
 *
 * @author Bewilder Z
 * @date 2016年10月14日 上午9:40:18
 */
public class WechatAritcleSearch {
    private static final Logger logger = LoggerFactory.getLogger(WechatAritcleSearch.class);

    /**
     * Searches Sogou WeChat for articles matching a keyword, following the pager until
     * no "next page" link is present.
     *
     * @param word      keyword
     * @param tsn       time range selector: 1 (one day), 2 (one week), 3 (one month),
     *                  4 (one year), 5 (custom range given by startTime/endTime)
     * @param startTime range start, format yyyy-MM-dd; only used when {@code tsn} is 5
     * @param endTime   range end, format yyyy-MM-dd; only used when {@code tsn} is 5
     * @param cookie    cookie of a logged-in user, or {@code null}; per the original
     *                  author's note at most 10 pages are available without login
     * @param proxy     proxy used for the requests
     * @return the collected articles, or {@code null} when parsing a result page throws
     * @throws Exception when a request fails
     * @throws UnsupportedEncodingException when the keyword cannot be URL-encoded
     */
    public static List<WechatAricle> wechatKeywordSearch(String word, int tsn,
            String startTime,String endTime,String cookie,Proxy proxy)
            throws Exception, UnsupportedEncodingException
    {
        List<WechatAricle> result = new ArrayList<WechatAricle>();
        Map<String,String> headerMap = new HashMap<String,String>();
        headerMap.put("Upgrade-Insecure-Requests", "1");
        headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36");
        headerMap.put("Host","weixin.sogou.com");
        if (cookie != null) {
            headerMap.put("Cookie",cookie);
        }

        boolean hasMore = true;
        int page = 1;
        while (hasMore) {
            String url = "http://weixin.sogou.com/weixin?type=2&query="
                    + URLEncoder.encode(word,"UTF-8")+"&ie=utf8&_sug_=n&_sug_type_="
                    + "&ri=1&sourceid=sugg&sst0="+System.currentTimeMillis()
                    +"&tsn="+tsn + "&page="+page;
            if (tsn == 5) {
                url = url + "&ft="+startTime + "&et="+ endTime
                        + "&wxid=&usip=&interation=&from=tool";
            }
            headerMap.put("Referer",url);

            String htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy, false).body().string();
            if (htmlBody == null) {
                logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
                // Stop here: without this the loop would request the same page forever.
                hasMore = false;
                continue;
            }

            try {
                Document document = Jsoup.parse(htmlBody);
                Elements elements = document.select("div.news-box")
                        .select("ul.news-list").select("li");
                for (Element element : elements) {
                    try {
                        result.add(parseArticle(element));
                    } catch (Exception e) {
                        // Skip a hit that cannot be parsed and keep the others.
                        logger.debug("解析数据出现错误:{}",e.getMessage());
                    }
                }
                // Continue only while the pager offers a "next page" link.
                String pageNext = document.select("[id=pagebar_container]>a").text();
                if (pageNext.contains("下一页")) {
                    page++;
                } else {
                    hasMore = false;
                }
            } catch (Exception e) {
                logger.debug("获取数据出现问题:{}",e.getMessage());
                return null;
            }
        }
        return result;
    }

    /** Converts one search hit (an {@code li} of the result list) into an article. */
    private static WechatAricle parseArticle(Element element) {
        Elements txtBox = element.select("div.txt-box");
        String title = txtBox.select("h3").text();
        String link = txtBox.select("h3 >a").attr("href");
        // The former isEmpty() branch could only ever yield "", which this selection
        // also yields when no p.txt-info exists, so one lookup is equivalent.
        String content = txtBox.select("p.txt-info").text();

        Elements meta = txtBox.select("div.s-p");
        String source = meta.select("a").text();
        String openid = meta.select("a").attr("i");
        // The "t" attribute is multiplied by 1000 to build the Date.
        String putDate = meta.attr("t");
        Date date = new Date(Long.parseLong(putDate) * 1000);

        int readNum;
        try {
            readNum = Integer.parseInt(meta.select("span.s1").text().trim());
        } catch (NumberFormatException ignored) {
            // No numeric read count on this hit.
            readNum = 0;
        }

        title = ZhiWeiTools.SBC2DBC(title);
        content = ZhiWeiTools.SBC2DBC(content);
        return new WechatAricle(link, title, source, content, date, readNum, 0,openid,"unknow");
    }

    /**
     * Looks up the openid of a WeChat account through Sogou.
     *
     * @param wxId WeChat account id, appended to the query URL as-is
     * @return the "openid" member of the JSON response, or {@code null} when the request
     *         returns nothing or fails
     */
    public static String getOpenId(String wxId)
    {
        String openId = null;
        String url = "http://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query="+wxId;
        Map<String,String> headerMap = Tools.getWechatHeader();
        try {
            String htmlBody = HttpClientTemplateOK.get(url, null, headerMap);
            if (htmlBody != null) {
                JSONObject json = JSONObject.parseObject(htmlBody);
                openId = json.getString("openid");
            }
        } catch (Exception e) {
            openId = null;
            logger.error("获取微信openid出现问题,wxId:{}", wxId, e);
        }
        return openId;
    }
}
package com.zhiwei.wechat.search;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
/**
 * Queries the WeChat index search endpoint for a keyword and prints the raw response.
 *
 * @author Bewilder Z
 * @date 2017年3月24日 下午2:52:01
 */
public class WechatIndex {

    /**
     * Manual run: splits a fixed period into windows via {@code TimeParse.getTimeMap}
     * and queries the index for each window.
     */
    public static void main(String[] args) throws Exception {
        final String keyword = "百度";
        final String periodStart = "2016-01-01 00:00:00";
        final String periodEnd = "2017-03-24 00:00:00";
        Map<String,String> windows = TimeParse.getTimeMap(periodStart, periodEnd, "dd", 7);
        for (Entry<String,String> window : windows.entrySet()) {
            Date from = TimeParse.stringFormartDate(window.getKey());
            Date to = TimeParse.stringFormartDate(window.getValue());
            // The endpoint is called with second-resolution timestamps.
            long fromSeconds = from.getTime()/1000L;
            long toSeconds = to.getTime()/1000L;
            getWechatIndex(keyword, fromSeconds, toSeconds);
        }
    }

    /**
     * Requests the index data for one time window, prints the URL and the response
     * body, then sleeps for three seconds.
     *
     * @param word      keyword
     * @param startTime window start, passed as the start_time query parameter
     * @param endTime   window end, passed as the end_time query parameter
     * @throws Exception when the request fails or the sleep is interrupted
     */
    public static void getWechatIndex(String word,long startTime,long endTime) throws Exception
    {
        StringBuilder query = new StringBuilder("https://search.weixin.qq.com/cgi-bin/searchweb/getwxindex?query=");
        query.append(URLCodeUtil.getURLEncode(word, "utf-8"));
        query.append("&start_time=").append(startTime);
        query.append("&end_time=").append(endTime);
        // Current time in milliseconds, sent as the "_" parameter.
        query.append("&_=").append(new Date().getTime());
        String url = query.toString();
        System.out.println(url);

        Map<String,String> headers = new HashMap<String,String>();
        headers.put("Host", "search.weixin.qq.com");
        headers.put("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like Mac OS X) AppleWebKit/602.4.6 (KHTML, like Gecko) Mobile/14D27 MicroMessenger/6.5.5 NetType/4G Language/zh_CN");
        headers.put("Referer", url);
        headers.put("X-Requested-With","XMLHttpRequest");
        headers.put("Accept","application/json, text/javascript, */*; q=0.01");
        // Hard-coded session cookie sent with every request.
        headers.put("Cookie","mmsearch_user_key=AStrb5tD4ruSixIDu1cVpTA=; pass_ticket=bbP7ZT5xEUrYe+oOa6ACUw+mgR05TAGGA1P9xnC7fIyaaOnwkWyNQK8aYtva+Gxj; pgv_pvi=4102772736; pgv_si=s1607859200; pgv_pvid=153672700");

        String responseBody = HttpClientTemplateOK.get(url, null, headers);
        System.out.println(responseBody);
        Thread.sleep(3000);
    }
}
package com.zhiwei.wechat.util;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class Tools {
/**
* @Title: getWechatHeader
* @Description: TODO(设置微信采集头信息)
* @param @return 设定文件
* @return Map<String,String> 返回类型
*/
public static Map<String,String> getWechatHeader()
{
Map<String, String> headerMap = new HashMap<String, String>();
headerMap.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
headerMap.put("Accept-Language", "zh-CN,zh;q=0.8");
headerMap.put("Connection", "keep-alive");
headerMap.put("Upgrade-Insecure-Requests", "1");
headerMap.put("Host", "mp.weixin.qq.com");
headerMap.put("Origin", "http://mp.weixin.qq.com");
headerMap.put("User-Agent",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.691.400 QQBrowser/9.0.2524.400");
return headerMap;
}
/**
* @Title: getWechatLikeUrl
* @Description: TODO(根据url和key拼接获取cookie链接)
* @param @param url
* @param @param key
* @param @return 设定文件
* @return String 返回类型
*/
public static String getWechatCookieUrl(String url,String key)
{
url = url.replace("http://mp.weixin.qq.com/s?", "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&");
String url_new = url;
if(key != null)
{
if(url.contains("#rd"))
{
url_new = url.split("#rd")[0] + key;
}else if(url.contains("#wechat_redirect"))
{
url_new = url.split("#wechat_redirect")[0] + key;
}else{
url_new = url + key;
}
}
return url_new;
}
/**
* @Title: getWechatLikeUrl
* @Description: TODO(拼接获取点赞阅读链接)
* @param @param url
* @param @return 设定文件
* @return String 返回类型
*/
public static String getWechatLikeUrl(String url,String appmsg_token)
{
return "http://mp.weixin.qq.com/mp/getappmsgext?__biz" + url.split("s?__biz")[1]+"&appmsg_token="+appmsg_token;
}
/**
* 读取关键词信息
* @param String fileName
* 外部关键词文件名
* @return List
* **/
public static List<String> getFileName(String fileName) {
List<String> list = new ArrayList<String>();
try {
BufferedReader br=new BufferedReader(new InputStreamReader(new FileInputStream(fileName),"GBK"));
String line = "";
while((line = br.readLine())!=null)
{
list.add(line);
}
br.close();
} catch (Exception e) {
e.printStackTrace();
return null;
}
return list;
}
/**
* @Title: getAppMsgToken
* @author hero
* @Description: TODO(获取appmsg_token,用于更新点赞阅读)
* @param @param htmlBody
* @param @return 设定文件
* @return String 返回类型
*/
public static String getAppMsgToken(String htmlBody){
String appmsg_token = null;
if(htmlBody !=null && !"".equals(htmlBody)){
if(htmlBody.contains("appmsg_token")){
try {
appmsg_token = htmlBody.split("window.appmsg_token = \"")[1].split("\";")[0];
} catch (Exception e) {
return null;
}
}
}
return appmsg_token;
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment