Commit a5057f65 by zhiwei

处理搜狗微信搜索链接中出现两次https的问题

parent 09b58307
...@@ -26,6 +26,49 @@ ...@@ -26,6 +26,49 @@
</developer> </developer>
</developers> </developers>
<!-- 分发管理:管理distribution和supporting files -->
<distributionManagement>
<snapshotRepository>
<id>nexus-releases</id>
<name>User Porject Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Porject Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository>
</distributionManagement>
<dependencies>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.2.0-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.6.6.8-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>automaticmark-client</artifactId>
<version>2.1.7-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.71</version>
<scope>provided</scope>
</dependency>
</dependencies>
<!-- 打包管理 --> <!-- 打包管理 -->
<build> <build>
<plugins> <plugins>
...@@ -63,36 +106,4 @@ ...@@ -63,36 +106,4 @@
</plugin> </plugin>
</plugins> </plugins>
</build> </build>
<!-- 分发管理:管理distribution和supporting files -->
<distributionManagement>
<snapshotRepository>
<id>nexus-releases</id>
<name>User Porject Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Porject Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository>
</distributionManagement>
<dependencies>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.5-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.5.6.3-RELEASE</version>
<scope>provided</scope>
</dependency>
</dependencies>
</project> </project>
\ No newline at end of file
...@@ -5,21 +5,19 @@ import java.util.Map; ...@@ -5,21 +5,19 @@ import java.util.Map;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.wechat.util.HtmlDownUtil;
import org.apache.commons.lang3.StringUtils;
/** /**
* @ClassName: WechatAccountFans * @ClassName: WechatAccountFans
* @Description: TODO(微信公众号粉丝增量采集程序) * @Description: 微信公众号粉丝增量采集程序
* @author hero * @author hero
* @date 2017年2月8日 上午11:36:11 * @date 2017年2月8日 上午11:36:11
*/ */
public class WechatAccountFans { public class WechatAccountFans {
// private static Logger logger = LoggerFactory.getLogger(WechatAccountFans.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private Map<String,String> headerMap; private Map<String,String> headerMap;
public WechatAccountFans() public WechatAccountFans()
...@@ -45,8 +43,8 @@ public class WechatAccountFans { ...@@ -45,8 +43,8 @@ public class WechatAccountFans {
headerMap.put("Referer", referer); headerMap.put("Referer", referer);
headerMap.put("Cookie", cookie); headerMap.put("Cookie", cookie);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string(); String htmlBody = HtmlDownUtil.downloadHtml(url,headerMap, ProxyHolder.NAT_HEAVY_PROXY);
if(htmlBody != null) if(StringUtils.isNotBlank(htmlBody))
{ {
JSONObject json = JSONObject.parseObject(htmlBody); JSONObject json = JSONObject.parseObject(htmlBody);
JSONObject category_list = json.getJSONArray("category_list").getJSONObject(0); JSONObject category_list = json.getJSONArray("category_list").getJSONObject(0);
......
///**
// * 获取微信文章评论
// * @Title: WechatComment.java
// * @Package com.zhiwei.wechat.comment
// * @Description:获取微信文章评论
// * @author hero
// * @date 2016年6月25日 上午8:17:37
// * @version V1.0
// */ /**
// *
// */
//package com.zhiwei.wechat.comment;
//
//import java.io.IOException;
//import java.util.List;
//import java.util.Map;
//
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//import com.alibaba.fastjson.JSON;
//import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.crawler.core.HttpBoot;
//import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//import com.zhiwei.wechat.entity.WechatComment;
//import com.zhiwei.wechat.readAndLike.AriticleContent;
//import com.zhiwei.wechat.util.Tools;
//
///**
// * @Description:获取微信文章评论
// * @author hero
// * @date 2016年6月25日 上午8:17:37
// */
//public class WechatCommentList {
//
// private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
//
// private static WechatComment wc = new WechatComment();
//
// private static Logger logger = LoggerFactory.getLogger(WechatCommentList.class);
// /**
// * 根据文章url获取文章评论列表
// * @Description:
// * @param @param url
// * @param @return
// * @return List<WechatComment> 返回类型
// */
// public static List<WechatComment> getWechatCommentList(String url,String key)
// {
// List<WechatComment> wcList = null;
// /*处理url*/
// String urlcookie = url;
// if(!url.contains("key")){
// urlcookie = Tools.getWechatCookieUrl(url, key);
// }
// // 请求头信息
// Map<String,String> headerMap = Tools.getWechatHeader();
// Map<String, String> cookieMap;
// try {
// cookieMap = HttpClientTemplateOK.getCookie(urlcookie, null, headerMap);
// headerMap.put("Referer", url);
// if(cookieMap.get("cookie").length()>50){
// headerMap.put("Cookie", cookieMap.get("cookie")+"");
// }
// String appmsg_token = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
//
// String biz = url.split("__biz=")[1].split("&")[0];
// String appmsgid = url.split("mid=")[1].split("&")[0];
// String comment_id = AriticleContent.getCommentId(url,key);
// if(comment_id!=null && appmsg_token!=null)
// {
// String comment_url = "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz
// + "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key
// + "&appmsg_token=" + appmsg_token;
// /**解析相关数据*/
// System.out.println(comment_url);
// if("0".equals(comment_id))
// {
// logger.info("此条微信文章没有评论");
// }else
// {
// try {
// String htmlBody = HttpClientTemplateOK.get(comment_url, null, headerMap);
// if(htmlBody!=null)
// {
// JSONObject json = JSON.parseObject(htmlBody);
// wcList = wc.constructWechatComment(json.getJSONArray("elected_comment"),url);
// return wcList;
// }
// } catch (Exception e) {
// logger.info("解析微信文章评论列表时出现问题:", e.fillInStackTrace());
// return null;
// }
// }
// }
// } catch (IOException e1) {
// return null;
// } catch (Exception e1) {
// e1.printStackTrace();
// }
//
// return null;
// }
//
//
// /**
// * @Title: getWechatCommentCount
// * @Description: TODO(根据微信文章地址更新微信评论数)
// * @param @param url
// * @param @param key
// * @param @return 设定文件
// * @return int 返回类型
// */
// public static int getWechatCommentCount(String url,String key)
// {
// System.out.println(url);
// /*处理url*/
// String url_new = url;
// if(url.contains("#rd"))
// {
// url_new = url.split("#rd")[0] + key;
// }else if(url.contains("#wechat_redirect"))
// {
// url_new = url.split("#wechat_redirect")[0] + key;
// }
// String biz = url.split("__biz=")[1].split("&")[0];
// String appmsgid = url.split("mid=")[1].split("&")[0];
//
// /**获取网页头信息**/
// Map<String,String> headerMap = Tools.getWechatHeader();
// /*获取评论id*/
// String comment_id = AriticleContent.getCommentId(url,key);
// if(comment_id!=null)
// {
// String comment_url = "http://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz
// + "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key;
// /**解析相关数据*/
//
// if("0".equals(comment_id))
// {
// logger.info("此条微信文章没有评论");
// return 0;
// }else
// {
// try {
// Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url_new, null,headerMap);
// headerMap.put("Cookie", cookieMap.get("cookie"));
// String htmlBody = HttpClientTemplateOK.get(comment_url, null,headerMap);
// System.out.println(htmlBody);
// if(htmlBody!=null)
// {
// JSONObject json = JSON.parseObject(htmlBody);
// return json.getIntValue("elected_comment_total_cnt");
// }
// } catch (Exception e) {
// logger.debug("更新微信文章评论数时出现问题,问题信息:",e.getMessage());
// return -1;
// }
// }
// }else
// {
// logger.info("获取评论id失败");
// return -1;
// }
// return -1;
// }
//
//
//
//}
...@@ -22,13 +22,13 @@ public class WechatAccount implements Serializable{ ...@@ -22,13 +22,13 @@ public class WechatAccount implements Serializable{
private String descript; //描述 private String descript; //描述
private String verified_reason; //认证原因 private String verifiedReason; //认证原因
private String openid; //认证原因 private String openid; //认证原因
private int article_count_month; //月发文量 private Integer articleCountMonth; //月发文量
private int avg_read_month; //月平均阅读数 private Integer avgReadMonth; //月平均阅读数
public String getId() { public String getId() {
return id; return id;
...@@ -70,12 +70,12 @@ public class WechatAccount implements Serializable{ ...@@ -70,12 +70,12 @@ public class WechatAccount implements Serializable{
this.descript = descript; this.descript = descript;
} }
public String getVerified_reason() { public String getVerifiedReason() {
return verified_reason; return verifiedReason;
} }
public void setVerified_reason(String verified_reason) { public void setVerifiedReason(String verifiedReason) {
this.verified_reason = verified_reason; this.verifiedReason = verifiedReason;
} }
public String getOpenid() { public String getOpenid() {
...@@ -86,20 +86,20 @@ public class WechatAccount implements Serializable{ ...@@ -86,20 +86,20 @@ public class WechatAccount implements Serializable{
this.openid = openid; this.openid = openid;
} }
public int getArticle_count_month() { public Integer getArticleCountMonth() {
return article_count_month; return articleCountMonth;
} }
public void setArticle_count_month(int article_count_month) { public void setArticleCountMonth(Integer articleCountMonth) {
this.article_count_month = article_count_month; this.articleCountMonth = articleCountMonth;
} }
public int getAvg_read_month() { public Integer getAvgReadMonth() {
return avg_read_month; return avgReadMonth;
} }
public void setAvg_read_month(int avg_read_month) { public void setAvgReadMonth(Integer avgReadMonth) {
this.avg_read_month = avg_read_month; this.avgReadMonth = avgReadMonth;
} }
...@@ -112,10 +112,10 @@ public class WechatAccount implements Serializable{ ...@@ -112,10 +112,10 @@ public class WechatAccount implements Serializable{
+ ", biz = " + biz + ", biz = " + biz
+ ", imgurl = " + imgurl + ", imgurl = " + imgurl
+ ", descript = " + descript + ", descript = " + descript
+ ", verified_reason = " + verified_reason + ", verifiedReason = " + verifiedReason
+ ", openid = " + openid + ", openid = " + openid
+ ", article_count_month = " + article_count_month + ", articleCountMonth = " + articleCountMonth
+ ", avg_read_month = " + avg_read_month + ", avgReadMonth = " + avgReadMonth
+ "]"; + "]";
} }
...@@ -124,18 +124,18 @@ public class WechatAccount implements Serializable{ ...@@ -124,18 +124,18 @@ public class WechatAccount implements Serializable{
public WechatAccount(String id,String name, String biz, public WechatAccount(String id,String name, String biz,
String imgurl,String descript,String verified_reason, String imgurl,String descript,String verifiedReason,
String openid, int article_count_month,int avg_read_month) String openid, Integer articleCountMonth,Integer avgReadMonth)
{ {
this.id = id; this.id = id;
this.name = name; this.name = name;
this.biz = biz; this.biz = biz;
this.imgurl = imgurl; this.imgurl = imgurl;
this.descript = descript; this.descript = descript;
this.verified_reason = verified_reason; this.verifiedReason = verifiedReason;
this.openid = openid; this.openid = openid;
this.article_count_month = article_count_month; this.articleCountMonth = articleCountMonth;
this.avg_read_month = avg_read_month; this.avgReadMonth = avgReadMonth;
} }
} }
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
package com.zhiwei.wechat.entity; package com.zhiwei.wechat.entity;
import java.util.Date; import java.util.Date;
import java.util.List;
/** /**
* @ClassName: Wechat * @ClassName: Wechat
...@@ -28,8 +29,6 @@ public class WechatAricle { ...@@ -28,8 +29,6 @@ public class WechatAricle {
private String content; //内容 private String content; //内容
private String imgUrl; //图片地址
private Date time; //发布时间 private Date time; //发布时间
private int readNum; //阅读数 private int readNum; //阅读数
...@@ -46,64 +45,79 @@ public class WechatAricle { ...@@ -46,64 +45,79 @@ public class WechatAricle {
private String user_name;//微信公众号初始id private String user_name;//微信公众号初始id
private String rootSource;
private List<String> imgUrls;
public String getIsFirst() { public String getIsFirst() {
return isFirst; return isFirst;
} }
public void setIsFirst(String isFirst) { public void setIsFirst(String isFirst) {
this.isFirst = isFirst; this.isFirst = isFirst;
} }
public String getOpenId() { public String getOpenId() {
return openId; return openId;
} }
public void setOpenId(String openId) { public void setOpenId(String openId) {
this.openId = openId; this.openId = openId;
} }
public String getImgUrl() {
return imgUrl;
}
public void setImgUrl(String imgUrl) {
this.imgUrl = imgUrl;
}
public String getId() { public String getId() {
return id; return id;
} }
public void setId(String id) { public void setId(String id) {
this.id = id; this.id = id;
} }
public String getTitle() { public String getTitle() {
return title; return title;
} }
public void setTitle(String title) { public void setTitle(String title) {
this.title = title; this.title = title;
} }
public String getSource() { public String getSource() {
return source; return source;
} }
public void setSource(String source) { public void setSource(String source) {
this.source = source; this.source = source;
} }
public String getContent() { public String getContent() {
return content; return content;
} }
public void setContent(String content) { public void setContent(String content) {
this.content = content; this.content = content;
} }
public Date getTime() { public Date getTime() {
return time; return time;
} }
public void setTime(Date time) { public void setTime(Date time) {
this.time = time; this.time = time;
} }
public int getReadNum() { public int getReadNum() {
return readNum; return readNum;
} }
public void setReadNum(int readNum) { public void setReadNum(int readNum) {
this.readNum = readNum; this.readNum = readNum;
} }
public int getLikeNum() { public int getLikeNum() {
return likeNum; return likeNum;
} }
public void setLikeNum(int likeNum) { public void setLikeNum(int likeNum) {
this.likeNum = likeNum; this.likeNum = likeNum;
} }
...@@ -112,31 +126,55 @@ public class WechatAricle { ...@@ -112,31 +126,55 @@ public class WechatAricle {
public String getBiz() { public String getBiz() {
return biz; return biz;
} }
public String getWxId() { public String getWxId() {
return wxId; return wxId;
} }
public String getUser_name() { public String getUser_name() {
return user_name; return user_name;
} }
public void setBiz(String biz) { public void setBiz(String biz) {
this.biz = biz; this.biz = biz;
} }
public void setWxId(String wxId) { public void setWxId(String wxId) {
this.wxId = wxId; this.wxId = wxId;
} }
public List<String> getImgUrls() {
return imgUrls;
}
public void setImgUrls(List<String> imgUrls) {
this.imgUrls = imgUrls;
}
public void setUser_name(String user_name) { public void setUser_name(String user_name) {
this.user_name = user_name; this.user_name = user_name;
} }
public WechatAricle(){}
public WechatAricle(String id,String title,String source,String content public String getRootSource() {
,Date time,int readNum,int likeNum,String openId,String isFirst) return rootSource;
{ }
public void setRootSource(String rootSource) {
this.rootSource = rootSource;
}
public WechatAricle() {
}
public WechatAricle(String id, String title, String source, String content
, Date time, String rootSource, List<String> imgUrls,int readNum, int likeNum, String openId, String isFirst) {
this.id = id.replaceAll("amp;", ""); this.id = id.replaceAll("amp;", "");
this.title = title; this.title = title;
this.source = source; this.source = source;
this.content = content; this.content = content;
this.time = time; this.time = time;
this.rootSource = rootSource;
this.imgUrls = imgUrls;
this.readNum = readNum; this.readNum = readNum;
this.likeNum = likeNum; this.likeNum = likeNum;
this.openId = openId; this.openId = openId;
...@@ -145,14 +183,15 @@ public class WechatAricle { ...@@ -145,14 +183,15 @@ public class WechatAricle {
@Override @Override
public String toString() public String toString() {
{
return "new Wechat[" return "new Wechat["
+ "id = " + id + "," + "id = " + id + ","
+ "title = " + title + "," + "title = " + title + ","
+ "source = " + source + "," + "source = " + source + ","
+ "content = " + content + "," + "content = " + content + ","
+ "time = " + time + "," + "time = " + time + ","
+ "rootSource = " + rootSource + ","
+ "imgUrls = " + imgUrls + ","
+ "readNum = " + readNum + "," + "readNum = " + readNum + ","
+ "likeNum = " + likeNum + "," + "likeNum = " + likeNum + ","
+ "openId = " + openId + "," + "openId = " + openId + ","
......
...@@ -12,13 +12,15 @@ package com.zhiwei.wechat.readAndLike; ...@@ -12,13 +12,15 @@ package com.zhiwei.wechat.readAndLike;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.wechat.util.HtmlDownUtil;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.wechat.util.Tools; import com.zhiwei.wechat.util.Tools;
/** /**
...@@ -48,7 +50,7 @@ public class AriticleContent{ ...@@ -48,7 +50,7 @@ public class AriticleContent{
String content = null; String content = null;
String source = null; String source = null;
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string(); String htmlBody = HtmlDownUtil.downloadHtml(url, headerMap, ProxyHolder.NAT_HEAVY_PROXY);
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
content = document.select("div.rich_media_content").text(); content = document.select("div.rich_media_content").text();
if(htmlBody.contains("var nickname = ")){ if(htmlBody.contains("var nickname = ")){
...@@ -80,8 +82,8 @@ public class AriticleContent{ ...@@ -80,8 +82,8 @@ public class AriticleContent{
headerMap.put("Referer", url); headerMap.put("Referer", url);
String comment_id = null; String comment_id = null;
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string(); String htmlBody = HtmlDownUtil.downloadHtml(url, headerMap, ProxyHolder.NAT_HEAVY_PROXY);
if(htmlBody!=null) if(StringUtils.isNotBlank(htmlBody))
{ {
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
String content = document.select("script").html(); String content = document.select("script").html();
......
///**
// * @Title: WindowsClient.java
// * @Package com.wcral.client
// * @Description: TODO(用一句话描述该文件做什么)
// * @author Bewilder Z
// * @date 2015年8月6日 上午9:13:37
// * @version V1.0
// */
//
//package com.zhiwei.wechat.readAndLike;
//
//import java.net.Proxy;
//import java.net.URLEncoder;
//import java.util.HashMap;
//import java.util.Map;
//
//import org.jsoup.Jsoup;
//import org.jsoup.nodes.Document;
//import org.jsoup.nodes.Element;
//import org.jsoup.select.Elements;
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//import com.zhiwei.wechat.entity.WechatReadLike;
//import com.zhiwei.wechat.search.WechatAritcleSearch;
//import com.zhiwei.wechat.util.Tools;
//
///**
// * @ClassName: WindowsClient
// * @Description: TODO(利用windows客戶端進行点赞阅读抓取)
// * @author Abner Liu
// * @date 2015年8月6日 上午9:13:37
// */
//public class WeChatReadAndLike {
//
//
// private static Logger logger = LoggerFactory.getLogger(WeChatReadAndLike.class);
//
// /**
// *
// * @Title: getReadAndLike
// * @Description: 利用windows客戶端進行点赞阅读抓取
// * @param url
// * 微信文章链接
// * @return WeChatReadLike 微信文章实体类
// *
// */
// public static WechatReadLike getReadAndLike(String url,String key,Proxy proxy){
// WechatReadLike wLike = new WechatReadLike();
// try {
// String urlcookie = Tools.getWechatCookieUrl(url, key);
// // 请求头信息
// Map<String,String> headerMap = Tools.getWechatHeader();
// Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(urlcookie, proxy, headerMap);
//
// headerMap.put("Referer", urlcookie);
// headerMap.put("Cookie", cookieMap.get("cookie")+"");
// String appmsg_token = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
// System.out.println("appmsg_token==========="+appmsg_token);
// String urlLike = Tools.getWechatLikeUrl(urlcookie,appmsg_token);
// //设置post请求参数
// HashMap<String,Object> postMap = new HashMap<String,Object>();
// postMap.put("is_only_read", "1");
//
// //获取数据
// String htsString = HttpClientTemplateOK.post(urlLike, proxy, headerMap ,postMap);
// System.out.println(htsString);
// JSONObject jsonObject = JSONObject.parseObject(htsString);
// String like_num = jsonObject.getJSONObject("appmsgstat")
// .get("like_num").toString();
//
// String real_read_num = "";
// try {
// real_read_num = jsonObject.getJSONObject("appmsgstat")
// .get("real_read_num").toString();
// if(real_read_num.equals("0"))
// {
// real_read_num = jsonObject.getJSONObject("appmsgstat")
// .get("read_num").toString();
// }
// } catch (Exception e) {
// real_read_num = jsonObject.getJSONObject("appmsgstat")
// .get("read_num").toString();
// }
// wLike.setUrl(url);
// wLike.setRead(Integer.valueOf(real_read_num));
// wLike.setLike(Integer.valueOf(like_num));
// } catch (Exception e) {
// wLike.setUrl(url);
// wLike.setRead(-1);
// wLike.setLike(-1);
// }
// return wLike;
// }
//
//
//
// /**
// * @Title: getReadAndLike
// * @Description: TODO(通过搜狗微信获取阅读数)
// * @param @param word
// * @param @param time
// * @param @param link
// * @param @param wxId
// * @param @return 设定文件
// * @return WeChatReadLike 返回类型
// */
// public static WechatReadLike getReadAndLike(String word,
// String time,String link,String wxId){
//
// WechatReadLike wLike = new WechatReadLike();
//
// Map<String,String> headerMap = new HashMap<String,String>();
// headerMap.put("Upgrade-Insecure-Requests", "1");
// headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36");
// headerMap.put("Host","weixin.sogou.com");
//
// if(time.contains(" "))
// {
// time = time.split(" ")[0];
// }
//
// String openid = WechatAritcleSearch.getOpenId(wxId,null);
// logger.info("openid is {}", openid);
//
// try {
// String url = "http://weixin.sogou.com/weixin?query=" + URLEncoder.encode(word,"utf-8")
// + "&type=2&ie=utf8&page=1&interation=&tsn=5&ft="+time + "&et="+ time
// + "&wxid="+openid+"&usip="+wxId+"&from=tool";
//
// logger.info("url is {}",url);
//
// String htmlBody = HttpClientTemplateOK.get(url, null, headerMap);
// if(htmlBody!=null)
// {
// try {
// // 解析数据
// Document document = Jsoup.parse(htmlBody);
// Elements elements = document.select("div.news-box")
// .select("ul.news-list").select("li");
// for (Element element : elements)
// {
// try {
// String url_link = element.select("div.txt-box").select("h3 >a").attr("href");
// int readNum = 0;
// try {
// readNum = Integer.valueOf(element.select("div.txt-box")
// .select("div.s-p").select("span.s1").text().trim());
// logger.info("readNum is {}", readNum);
// } catch (Exception e) {
// readNum = 0;
// }
// if(url_link.contains("&chksm="))
// {
// url_link = url_link.split("&chksm=")[0] + "&3rd" + url_link.split("&3rd")[1];
// }
//
// if(link.equals(url_link))
// {
// wLike.setUrl(link);
// wLike.setRead(readNum);
// break;
// }
// } catch (Exception e) {
// continue;
// }
// }
// } catch (Exception e) {
// wLike.setUrl(link);
// wLike.setRead(0);
// return null;
// }
// }
// } catch (Exception e) {
// e.printStackTrace();
// wLike.setUrl(link);
// wLike.setRead(0);
// return null;
// }
// return wLike;
// }
//
//}
...@@ -5,16 +5,13 @@ import java.net.URLEncoder; ...@@ -5,16 +5,13 @@ import java.net.URLEncoder;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import com.zhiwei.wechat.util.HtmlDownUtil;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
public class WechatCount { public class WechatCount {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
public static int getWechatCountByWord(String word, String cookie, public static int getWechatCountByWord(String word, String cookie,
String startTime, String endTime, Proxy proxy) { String startTime, String endTime, Proxy proxy) {
Map<String, String> headerMap = getWechatCount(cookie); Map<String, String> headerMap = getWechatCount(cookie);
...@@ -24,9 +21,7 @@ public class WechatCount { ...@@ -24,9 +21,7 @@ public class WechatCount {
+ startTime.split(" ")[0] + "&et=" + endTime.split(" ")[0] + startTime.split(" ")[0] + "&et=" + endTime.split(" ")[0]
+ "&interation=&wxid=&usip="; + "&interation=&wxid=&usip=";
headerMap.put("Referer", url); headerMap.put("Referer", url);
String result = httpBoot String result = HtmlDownUtil.downloadHtml(url, headerMap, proxy);
.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)
.body().string();
String s = ""; String s = "";
int n = -1; int n = -1;
if (result.contains("找到约") && result.contains("条结果")) { if (result.contains("找到约") && result.contains("条结果")) {
......
package com.zhiwei.wechat.search;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
/**
 * @ClassName: WechatIndex
 * @Description: WeChat index search
 * @author Bewilder Z
 * @date 2017-03-24 14:52:01
 */
public class WechatIndex {

    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /**
     * Demo entry point: splits a fixed date range into windows via
     * {@code TimeParse.getTimeMap(start, end, "dd", 7)} and queries the index
     * for the keyword once per window.
     *
     * @param args unused
     * @throws Exception propagated from time parsing or from {@link #getWechatIndex}
     */
    public static void main(String[] args) throws Exception {
        String keyword = "百度";
        String rangeStart = "2016-01-01 00:00:00";
        String rangeEnd = "2017-03-24 00:00:00";

        Map<String, String> windows = TimeParse.getTimeMap(rangeStart, rangeEnd, "dd", 7);
        for (Entry<String, String> window : windows.entrySet()) {
            // Window bounds are converted from epoch milliseconds to epoch seconds.
            long fromSeconds = TimeParse.stringFormartDate(window.getKey()).getTime() / 1000L;
            long toSeconds = TimeParse.stringFormartDate(window.getValue()).getTime() / 1000L;
            getWechatIndex(keyword, fromSeconds, toSeconds);
        }
    }

    /**
     * Requests the WeChat index endpoint for one keyword and time window,
     * prints the request URL and the raw response body to standard output,
     * then sleeps for three seconds.
     *
     * @param word      search keyword; URL-encoded as UTF-8 before being sent
     * @param startTime window start, sent as {@code start_time}
     *                  ({@code main} passes epoch seconds)
     * @param endTime   window end, sent as {@code end_time}
     *                  ({@code main} passes epoch seconds)
     * @throws Exception on encoding, HTTP, or interruption failures
     */
    public static void getWechatIndex(String word, long startTime, long endTime) throws Exception {
        StringBuilder target = new StringBuilder("https://search.weixin.qq.com/cgi-bin/searchweb/getwxindex?query=");
        target.append(URLCodeUtil.getURLEncode(word, "utf-8"));
        target.append("&start_time=").append(startTime);
        target.append("&end_time=").append(endTime);
        // Trailing "_" parameter carries the current time in milliseconds.
        target.append("&_=").append(System.currentTimeMillis());
        String url = target.toString();
        System.out.println(url);

        Map<String, String> headerMap = buildHeaders(url);
        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string();
        System.out.println(htmlBody);

        // Fixed pause after every request.
        Thread.sleep(3000);
    }

    /**
     * Builds the request headers for the index endpoint, using the request
     * URL itself as the Referer.
     */
    private static Map<String, String> buildHeaders(String url) {
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("Host", "search.weixin.qq.com");
        headers.put("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like Mac OS X) AppleWebKit/602.4.6 (KHTML, like Gecko) Mobile/14D27 MicroMessenger/6.5.5 NetType/4G Language/zh_CN");
        headers.put("Referer", url);
        headers.put("X-Requested-With","XMLHttpRequest");
        headers.put("Accept","application/json, text/javascript, */*; q=0.01");
        // NOTE(review): session cookie is hard-coded here; presumably it expires — confirm.
        headers.put("Cookie","mmsearch_user_key=AStrb5tD4ruSixIDu1cVpTA=; pass_ticket=bbP7ZT5xEUrYe+oOa6ACUw+mgR05TAGGA1P9xnC7fIyaaOnwkWyNQK8aYtva+Gxj; pgv_pvi=4102772736; pgv_si=s1607859200; pgv_pvid=153672700");
        return headers;
    }
}
package com.zhiwei.wechat.search; package com.zhiwei.wechat.search;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.core.utils.RequestUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
......
package com.zhiwei.wechat.util;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.wechat.search.WechatAritcleSearch;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.IOException;
import java.net.Proxy;
import java.util.Map;
/**
 * @ProjectName: wechat
 * @ClassName: HtmlDownUtil
 * @Author: admin
 * @Description: Downloads page content over HTTP
 * @Date: 2020/8/3 8:57
 * @Version: 1.0
 */
public class HtmlDownUtil {

    /** Logger for this class (previously registered under WechatAritcleSearch by mistake). */
    private static final Logger logger = LogManager.getLogger(HtmlDownUtil.class);

    /** Shared HTTP client, built with {@code retryTimes(3)}. */
    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /** Header name under which the fixed cookie is stored in the caller's header map. */
    private static final String COOKIE_HEADER = "cookie";

    /** Fixed cookie value added to every request issued by this class. */
    private static final String COOKIE_VALUE = "com_sohu_websearch_ITEM_PER_PAGE=100;";

    /**
     * Fetches the response body of {@code url} as a string, routing the
     * request through the given proxy.
     *
     * <p>Side effect: the entry {@code "cookie"} is written into
     * {@code headMap} before the request is built, so the caller's map is
     * modified and must be mutable. The key is lower-case; a caller-supplied
     * {@code "Cookie"} entry is a different map key and is left untouched.
     *
     * @param url     address to request with GET
     * @param headMap request headers; must not be {@code null}
     * @param proxy   proxy handed to {@code HttpBoot.syncCall}
     * @return the response body text
     * @throws IOException if the request or reading the body fails
     */
    public static String downloadHtml(String url, Map<String, String> headMap, Proxy proxy) throws IOException {
        headMap.put(COOKIE_HEADER, COOKIE_VALUE);
        // NOTE(review): the trailing `false` flag is defined by HttpBoot.syncCall — confirm its meaning.
        // try-with-resources closes the response on every path.
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), proxy, false)) {
            return response.body().string();
        }
    }

    /**
     * Fetches the response body of {@code url} as a string, routing the
     * request through the given proxy holder.
     *
     * <p>Side effect: the entry {@code "cookie"} is written into
     * {@code headMap} before the request is built, so the caller's map is
     * modified and must be mutable. The key is lower-case; a caller-supplied
     * {@code "Cookie"} entry is a different map key and is left untouched.
     *
     * @param url         address to request with GET
     * @param headMap     request headers; must not be {@code null}
     * @param proxyHolder proxy holder handed to {@code HttpBoot.syncCall}
     * @return the response body text
     * @throws IOException if the request or reading the body fails
     */
    public static String downloadHtml(String url, Map<String, String> headMap, ProxyHolder proxyHolder) throws IOException {
        headMap.put(COOKIE_HEADER, COOKIE_VALUE);
        // NOTE(review): the trailing `false` flag is defined by HttpBoot.syncCall — confirm its meaning.
        // try-with-resources closes the response on every path.
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), proxyHolder, false)) {
            return response.body().string();
        }
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment