Commit a5057f65 by zhiwei

处理搜狗微信搜索链接中出现两次https的问题

parent 09b58307
......@@ -26,6 +26,49 @@
</developer>
</developers>
<!-- 分发管理:管理distribution和supporting files -->
<distributionManagement>
<snapshotRepository>
<id>nexus-releases</id>
<name>User Project Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Project Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository>
</distributionManagement>
<dependencies>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.2.0-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.6.6.8-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>automaticmark-client</artifactId>
<version>2.1.7-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.71</version>
<scope>provided</scope>
</dependency>
</dependencies>
<!-- 打包管理 -->
<build>
<plugins>
......@@ -63,36 +106,4 @@
</plugin>
</plugins>
</build>
<!-- 分发管理:管理distribution和supporting files -->
<distributionManagement>
<snapshotRepository>
<id>nexus-releases</id>
<name>User Project Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Project Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository>
</distributionManagement>
<dependencies>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.5-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.5.6.3-RELEASE</version>
<scope>provided</scope>
</dependency>
</dependencies>
</project>
\ No newline at end of file
......@@ -5,21 +5,19 @@ import java.util.Map;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.wechat.util.HtmlDownUtil;
import org.apache.commons.lang3.StringUtils;
/**
* @ClassName: WechatAccountFans
* @Description: TODO(微信公众号粉丝增量采集程序)
* @Description: 微信公众号粉丝增量采集程序
* @author hero
* @date 2017年2月8日 上午11:36:11
*/
public class WechatAccountFans {
// private static Logger logger = LoggerFactory.getLogger(WechatAccountFans.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private Map<String,String> headerMap;
public WechatAccountFans()
......@@ -45,8 +43,8 @@ public class WechatAccountFans {
headerMap.put("Referer", referer);
headerMap.put("Cookie", cookie);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string();
if(htmlBody != null)
String htmlBody = HtmlDownUtil.downloadHtml(url,headerMap, ProxyHolder.NAT_HEAVY_PROXY);
if(StringUtils.isNotBlank(htmlBody))
{
JSONObject json = JSONObject.parseObject(htmlBody);
JSONObject category_list = json.getJSONArray("category_list").getJSONObject(0);
......
package com.zhiwei.wechat.account;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.wechat.util.HtmlDownUtil;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.seimicrawler.xpath.JXDocument;
import org.seimicrawler.xpath.JXNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.wechat.entity.WechatAccount;
public class WechatAccountInfo {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static Logger logger = LoggerFactory.getLogger(WechatAccountInfo.class);
/***
* @Title: getWechatAccount
* @Description: TODO(根据帐号id查询帐号信息)
* @Description: 根据帐号id查询帐号信息
* @param @param id
* @param @param name
* @param @param biz
* @param @return 设定文件
* @return WechatAccount 返回类型
*/
public static WechatAccount getUserInfoById(String id,Proxy proxy)
{
String url = "http://weixin.sogou.com/weixin?type=1&query=" + id +"&ie=utf8&_sug_=n&_sug_type_=";
public static WechatAccount getUserInfoById(String id, Proxy proxy) {
String url = "http://weixin.sogou.com/weixin?type=1&query=" + id + "&ie=utf8&_sug_=n&_sug_type_=";
System.out.println(url);
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy,false).body().string();
if(htmlBody != null)
{
return anaSislyAccount(url,htmlBody, id, "id");
String htmlBody = HtmlDownUtil.downloadHtml(url, HeaderTool.getCommonHead(), proxy);
if (StringUtils.isNotBlank(htmlBody)) {
return anaSislyAccount(url, htmlBody, id, "id");
}
} catch (Exception e) {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", e.getMessage());
......@@ -52,28 +49,23 @@ public class WechatAccountInfo {
/**
* @param name
* @param proxy
* @Description: Looks up a WeChat official account whose name exactly matches the given name.
* @param String name
* @param String id
* **/
public static WechatAccount getUserInfoByName(String name,Proxy proxy)
{
**/
public static WechatAccount getUserInfoByName(String name, Proxy proxy) {
String query = URLCodeUtil.getURLEncode(name, "utf-8");
for(int i = 1;i<=3;i++)
{
String url = "http://weixin.sogou.com/weixin?type=1&query=" + query +"&ie=utf8&_sug_=n&_sug_type_=&page="+i;
logger.info("url:{}",url);
for (int i = 1; i <= 3; i++) {
String url = "http://weixin.sogou.com/weixin?type=1&query=" + query + "&ie=utf8&_sug_=n&_sug_type_=&page=" + i;
logger.info("url:{}", url);
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy).body().string();
if(htmlBody != null)
{
WechatAccount wa = anaSislyAccount(url,htmlBody, name, "name");
if(wa!=null)
{
String htmlBody = HtmlDownUtil.downloadHtml(url, HeaderTool.getCommonHead(), proxy);
if (StringUtils.isNotBlank(htmlBody)) {
WechatAccount wa = anaSislyAccount(url, htmlBody, name, "name");
if (wa != null) {
return wa;
}
}else
{
} else {
logger.info("数据不存在...........");
}
} catch (Exception e) {
......@@ -85,22 +77,17 @@ public class WechatAccountInfo {
}
public static List<WechatAccount> searchWechatAccount(String word)
{
public static List<WechatAccount> searchWechatAccount(String word) {
List<WechatAccount> list = new ArrayList<>();
String query = URLCodeUtil.getURLEncode(word, "utf-8");
boolean more = true;
int i = 1;
while(more)
{
String url = "http://weixin.sogou.com/weixin?type=1&query=" + query +"&ie=utf8&_sug_=n&_sug_type_=&page="+i;
logger.info("url:{}",url);
while (more) {
String url = "http://weixin.sogou.com/weixin?type=1&query=" + query + "&ie=utf8&_sug_=n&_sug_type_=&page=" + i;
logger.info("url:{}", url);
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string();
if(htmlBody != null)
{
String htmlBody = HtmlDownUtil.downloadHtml(url, HeaderTool.getCommonHead(), ProxyHolder.NAT_HEAVY_PROXY);
if (StringUtils.isNotBlank(htmlBody)) {
/** 解析页面 */
list.addAll(anaSislyAccountList(url, htmlBody));
/** 判断是否有下一页 **/
......@@ -112,8 +99,7 @@ public class WechatAccountInfo {
more = false;
}
}
}else
{
} else {
more = false;
logger.info("数据不存在...........");
}
......@@ -128,101 +114,36 @@ public class WechatAccountInfo {
}
/**
* @Title: anaSislyAccount
* @Description: TODO(解析并获取帐号)
* @param @param htmlBody
* @param @param name
* @param @param biz
* @param @return 设定文件
* @return WechatAccount 返回类型
* @Title: anaSislyAccount
* @Description: TODO(解析并获取帐号)
*/
private static WechatAccount anaSislyAccount(String url,String htmlBody
,String matchKey,String type)
{
Document document = Jsoup.parse(htmlBody);
if(htmlBody.contains("noresult_part1_container") ||
htmlBody.contains("501 Not Implemented"))
{
logger.info("暂无与“{}”相关的官方认证订阅号。",matchKey);
}else
{
private static WechatAccount anaSislyAccount(String url, String htmlBody
, String matchKey, String type) {
JXDocument jxDocument = JXDocument.create(htmlBody);
if (htmlBody.contains("noresult_part1_container") ||
htmlBody.contains("501 Not Implemented")) {
logger.info("暂无与“{}”相关的官方认证订阅号。", matchKey);
} else {
String readurl = htmlBody.split("account_anti_url = \"")[1].split("\";")[0];
Elements element = document.select("div.news-box")
.select("ul.news-list2").select("li");
List<JXNode> jxNodeList = jxDocument.selN("//div[@class='news-box']/ul[@class='news-list2']/li");
// System.out.println("数据大小:"+element.size());
logger.info("数据大小:"+element.size());
for (int i = 0; i < element.size(); i++)
{
String openid = element.get(i).attr("d");
String userName = element.get(i).select("div.gzh-box2")
.select("div.txt-box").select("p.tit").text();
String id = element.get(i).select("div.gzh-box2")
.select("div.txt-box").select("p.info")
.select("label").text();
int article_count_month = 0;
int avg_read_month = 0;
String imgurl = element.get(i).select("div.gzh-box2")
.select("div.img-box").select("img").attr("src");
String descript = "";
String info = "";
String bizR = "";
/**只有一个简介或者认证原因或者最近文章**/
try {
if(element.get(i).select("dl").size()==1)
{
String text = element.get(i).select("dl").get(0).text();
if(text.contains("功能介绍"))
{
descript = text;
}
if(text.contains("认证"))
{
info = text;
}
if(text.contains("最近文章"))
{
// bizR = element.get(i).select("dl").get(0).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
}
}
/**只有简介、认证原因 或者 简介、最近文章 或者 认证原因、最近文章**/
if(element.get(i).select("dl").size()==2)
{
String text = element.get(i).select("dl").get(0).text();
String text2 = element.get(i).select("dl").get(1).text();
/**有简介;认证或者最近文章*/
if(text.contains("功能介绍") )
{
descript = text;
if(text2.contains("认证"))
{
info = text2;
}else if(text2.contains("最近文章")){
// bizR = element.get(i).select("dl").get(1).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
}
}else if(text.contains("认证"))
{
info = text;
// bizR = element.get(i).select("dl").get(1).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
}
}
if(element.get(i).select("dl").size()==3)
{
descript = element.get(i).select("dl").get(0).text();
info = element.get(i).select("dl").get(1).text();
// bizR = element.get(i).select("dl").get(2).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
}
} catch (Exception e) {
e.printStackTrace();
}
logger.info("数据大小:{}", jxNodeList.size());
if (Objects.nonNull(jxNodeList) && !jxNodeList.isEmpty()) {
for (JXNode jxNode : jxNodeList) {
WechatAccount wechatAccount = anysislyWechatAccount(jxNode);
if (Objects.nonNull(wechatAccount)) {
String userName = wechatAccount.getName();
String openid = wechatAccount.getOpenid();
String id = wechatAccount.getId();
Integer articleCountMonth = null;
Integer avgReadMonth = null;
switch (type) {
case "name":
......@@ -230,11 +151,13 @@ public class WechatAccountInfo {
if (userName.equals(matchKey)) {
String avg = getAvgRead(url, readurl, openid);
if (avg != null) {
article_count_month = Integer.valueOf(avg.split(",")[0]);
avg_read_month = Integer.valueOf(avg.split(",")[1]);
articleCountMonth = Integer.valueOf(avg.split(",")[0]);
avgReadMonth = Integer.valueOf(avg.split(",")[1]);
}
return new WechatAccount(id, userName, bizR, imgurl, descript, info, openid,
article_count_month, avg_read_month);
wechatAccount.setAvgReadMonth(avgReadMonth);
wechatAccount.setArticleCountMonth(articleCountMonth);
return wechatAccount;
}
break;
......@@ -243,150 +166,156 @@ public class WechatAccountInfo {
if (id.equals(matchKey)) {
String avg = getAvgRead(url, readurl, openid);
if (avg != null) {
article_count_month = Integer.valueOf(avg.split(",")[0]);
avg_read_month = Integer.valueOf(avg.split(",")[1]);
articleCountMonth = Integer.valueOf(avg.split(",")[0]);
avgReadMonth = Integer.valueOf(avg.split(",")[1]);
}
return new WechatAccount(id, userName, bizR, imgurl, descript, info, openid,
article_count_month, avg_read_month);
wechatAccount.setAvgReadMonth(avgReadMonth);
wechatAccount.setArticleCountMonth(articleCountMonth);
return wechatAccount;
}
break;
}
}
}
return null;
}
}
return null;
}
/**
* @Title: anaSislyAccount
* @Description: TODO(解析并获取帐号列表)
* @param @param htmlBody
* @param @param name
* @param @param biz
* @param @return 设定文件
* @return WechatAccount 返回类型
* @Title: anaSislyAccount
* @Description: TODO(解析并获取帐号列表)
*/
private static List<WechatAccount> anaSislyAccountList(String url,String htmlBody)
{
private static List<WechatAccount> anaSislyAccountList(String url, String htmlBody) {
List<WechatAccount> list = new ArrayList<WechatAccount>();
Document document = Jsoup.parse(htmlBody);
if(htmlBody.contains("noresult_part1_container") ||
htmlBody.contains("501 Not Implemented"))
{
JXDocument jxDocument = JXDocument.create(htmlBody);
if (htmlBody.contains("noresult_part1_container") ||
htmlBody.contains("501 Not Implemented")) {
logger.info("暂无与“{}”相关的官方认证订阅号。");
}else
{
} else {
String readurl = htmlBody.split("account_anti_url = \"")[1].split("\";")[0];
JSONObject avgJson = getAvgRead(url, readurl);
Elements element = document.select("div.news-box")
.select("ul.news-list2").select("li");
logger.info("数据大小:"+element.size());
for (int i = 0; i < element.size(); i++)
{
String openid = element.get(i).attr("d");
String userName = element.get(i).select("div.gzh-box2")
.select("div.txt-box").select("p.tit").text();
String id = element.get(i).select("div.gzh-box2")
.select("div.txt-box").select("p.info")
.select("label").text();
List<JXNode> jxNodeList = jxDocument.selN("//div[@class='news-box']/ul[@class='news-list2']/li");
if(Objects.nonNull(jxNodeList) && !jxNodeList.isEmpty()){
for(JXNode jxNode : jxNodeList){
WechatAccount wechatAccount = anysislyWechatAccount(jxNode);
String openid = wechatAccount.getOpenid();
if(avgJson!=null && avgJson.containsKey("msg")){
JSONObject data = avgJson.getJSONObject("msg");
Integer articleCountMonth = null;
Integer avgReadMonth = null;
if(openid!=null){
String avg = data.getString(openid);
if (avg != null) {
wechatAccount.setAvgReadMonth(avgReadMonth);
wechatAccount.setArticleCountMonth(articleCountMonth);
}
}
}
list.add(wechatAccount);
}
}
}
return list;
}
/**
* 解析账号信息
* @param jxNode
* @return
*/
private static WechatAccount anysislyWechatAccount(JXNode jxNode) {
String openid = jxNode.selOne("//li").asElement().attr("d");
String userName = jxNode.selOne("//p[@class='tit']/a/em").asElement().text();
String id = jxNode.selOne("//p[@class='info']/label").asElement().text();
int article_count_month = 0;
int avg_read_month = 0;
String imgurl = element.get(i).select("div.gzh-box2")
.select("div.img-box").select("img").attr("src");
String imgurl = jxNode.selOne("//div[@class='img-box']/a").asElement().attr("src");
String descript = "";
String info = "";
String bizR = "";
/**只有一个简介或者认证原因或者最近文章**/
if(element.get(i).select("dl").size()==1)
{
String text = element.get(i).select("dl").get(0).text();
if(text.contains("功能介绍"))
{
try {
if (jxNode.sel("//dl").size() == 1) {
String text = jxNode.sel("//dl").get(0).asElement().text();
if (text.contains("功能介绍")) {
descript = text;
}
if(text.contains("认证"))
{
if (text.contains("认证")) {
info = text;
}
if(text.contains("最近文章"))
{
if (text.contains("最近文章")) {
// bizR = element.get(i).select("dl").get(0).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
}
}
/**只有简介、认证原因 或者 简介、最近文章 或者 认证原因、最近文章**/
if(element.get(i).select("dl").size()==2)
{
String text = element.get(i).select("dl").get(0).text();
String text2 = element.get(i).select("dl").get(1).text();
if (jxNode.sel("//dl").size() == 2) {
String text = jxNode.sel("//dl").get(0).asElement().text();
String text2 = jxNode.sel("//dl").get(1).asElement().text();
/**有简介;认证或者最近文章*/
if(text.contains("功能介绍"))
{
if (text.contains("功能介绍")) {
descript = text;
if(text2.contains("认证"))
{
if (text2.contains("认证")) {
info = text2;
} else if (text2.contains("最近文章")) {
// bizR = element.get(i).select("dl").get(1).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
}
}
/**有认证和最近文章**/
if(text.contains("认证"))
{
} else if (text.contains("认证")) {
info = text;
// bizR = element.get(i).select("dl").get(1).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
}
}
if(element.get(i).select("dl").size()==3)
{
descript = element.get(i).select("dl").get(0).text();
info = element.get(i).select("dl").get(1).text();
if (jxNode.sel("//dl").size() == 3) {
descript = jxNode.sel("//dl").get(0).asElement().text();
info = jxNode.sel("//dl").get(1).asElement().text();
// bizR = element.get(i).select("dl").get(2).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
}
if(avgJson!=null && avgJson.containsKey("msg")){
JSONObject data = avgJson.getJSONObject("msg");
if(openid!=null){
String avg = data.getString(openid);
if (avg != null) {
article_count_month = Integer.valueOf(avg.split(",")[0]);
avg_read_month = Integer.valueOf(avg.split(",")[1]);
}
}
}
WechatAccount wechatAccount = new WechatAccount(id,userName,bizR,imgurl,descript,info,openid,article_count_month,avg_read_month);
list.add(wechatAccount);
}
return new WechatAccount(id, userName, bizR, imgurl, descript, info, openid, article_count_month, avg_read_month);
} catch (Exception e) {
e.printStackTrace();
}
return list;
return null;
}
/**
* @Title: getAvgRead
* @Description: TODO(更新平均阅读数)
* @param @param url
* @param @param readUrl
* @param @param openid
* @param @return 设定文件
* @return String 返回类型
* @Title: getAvgRead
* @Description: TODO(更新平均阅读数)
*/
private static String getAvgRead(String url,String readUrl,String openid)
{
Map<String,String> headerMap = new HashMap<String,String>();
private static String getAvgRead(String url, String readUrl, String openid) {
Map<String, String> headerMap = new HashMap<String, String>();
headerMap.put("Referer", url);
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
readUrl = "http://weixin.sogou.com" + readUrl;
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(readUrl,headerMap)).body().string();
if(htmlBody != null)
{
String htmlBody = HtmlDownUtil.downloadHtml(readUrl, headerMap, ProxyHolder.NAT_HEAVY_PROXY);
if (StringUtils.isNotBlank(htmlBody)) {
JSONObject json = JSONObject.parseObject(htmlBody);
JSONObject data = json.getJSONObject("msg");
if(data.containsKey(openid))
{
if (data.containsKey(openid)) {
return data.getString(openid);
}
}
......@@ -399,16 +328,14 @@ public class WechatAccountInfo {
}
private static JSONObject getAvgRead(String url,String readUrl) {
Map<String,String> headerMap = new HashMap<String,String>();
private static JSONObject getAvgRead(String url, String readUrl) {
Map<String, String> headerMap = new HashMap<String, String>();
headerMap.put("Referer", url);
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
readUrl = "http://weixin.sogou.com" + readUrl;
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(readUrl,headerMap)).body().string();
if(htmlBody != null)
{
String htmlBody = HtmlDownUtil.downloadHtml(readUrl, headerMap, ProxyHolder.NAT_HEAVY_PROXY);
if (StringUtils.isNotBlank(htmlBody)) {
JSONObject json = JSONObject.parseObject(htmlBody);
return json.getJSONObject("msg");
}
......
///**
// * 获取微信文章评论
// * @Title: WechatComment.java
// * @Package com.zhiwei.wechat.comment
// * @Description:获取微信文章评论
// * @author hero
// * @date 2016年6月25日 上午8:17:37
// * @version V1.0
// */ /**
// *
// */
//package com.zhiwei.wechat.comment;
//
//import java.io.IOException;
//import java.util.List;
//import java.util.Map;
//
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//import com.alibaba.fastjson.JSON;
//import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.crawler.core.HttpBoot;
//import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//import com.zhiwei.wechat.entity.WechatComment;
//import com.zhiwei.wechat.readAndLike.AriticleContent;
//import com.zhiwei.wechat.util.Tools;
//
///**
// * @Description:获取微信文章评论
// * @author hero
// * @date 2016年6月25日 上午8:17:37
// */
//public class WechatCommentList {
//
// private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
//
// private static WechatComment wc = new WechatComment();
//
// private static Logger logger = LoggerFactory.getLogger(WechatCommentList.class);
// /**
// * 根据文章url获取文章评论列表
// * @Description:
// * @param @param url
// * @param @return
// * @return List<WechatComment> 返回类型
// */
// public static List<WechatComment> getWechatCommentList(String url,String key)
// {
// List<WechatComment> wcList = null;
// /*处理url*/
// String urlcookie = url;
// if(!url.contains("key")){
// urlcookie = Tools.getWechatCookieUrl(url, key);
// }
// // 请求头信息
// Map<String,String> headerMap = Tools.getWechatHeader();
// Map<String, String> cookieMap;
// try {
// cookieMap = HttpClientTemplateOK.getCookie(urlcookie, null, headerMap);
// headerMap.put("Referer", url);
// if(cookieMap.get("cookie").length()>50){
// headerMap.put("Cookie", cookieMap.get("cookie")+"");
// }
// String appmsg_token = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
//
// String biz = url.split("__biz=")[1].split("&")[0];
// String appmsgid = url.split("mid=")[1].split("&")[0];
// String comment_id = AriticleContent.getCommentId(url,key);
// if(comment_id!=null && appmsg_token!=null)
// {
// String comment_url = "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz
// + "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key
// + "&appmsg_token=" + appmsg_token;
// /**解析相关数据*/
// System.out.println(comment_url);
// if("0".equals(comment_id))
// {
// logger.info("此条微信文章没有评论");
// }else
// {
// try {
// String htmlBody = HttpClientTemplateOK.get(comment_url, null, headerMap);
// if(htmlBody!=null)
// {
// JSONObject json = JSON.parseObject(htmlBody);
// wcList = wc.constructWechatComment(json.getJSONArray("elected_comment"),url);
// return wcList;
// }
// } catch (Exception e) {
// logger.info("解析微信文章评论列表时出现问题:", e.fillInStackTrace());
// return null;
// }
// }
// }
// } catch (IOException e1) {
// return null;
// } catch (Exception e1) {
// e1.printStackTrace();
// }
//
// return null;
// }
//
//
// /**
// * @Title: getWechatCommentCount
// * @Description: TODO(根据微信文章地址更新微信评论数)
// * @param @param url
// * @param @param key
// * @param @return 设定文件
// * @return int 返回类型
// */
// public static int getWechatCommentCount(String url,String key)
// {
// System.out.println(url);
// /*处理url*/
// String url_new = url;
// if(url.contains("#rd"))
// {
// url_new = url.split("#rd")[0] + key;
// }else if(url.contains("#wechat_redirect"))
// {
// url_new = url.split("#wechat_redirect")[0] + key;
// }
// String biz = url.split("__biz=")[1].split("&")[0];
// String appmsgid = url.split("mid=")[1].split("&")[0];
//
// /**获取网页头信息**/
// Map<String,String> headerMap = Tools.getWechatHeader();
// /*获取评论id*/
// String comment_id = AriticleContent.getCommentId(url,key);
// if(comment_id!=null)
// {
// String comment_url = "http://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz
// + "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key;
// /**解析相关数据*/
//
// if("0".equals(comment_id))
// {
// logger.info("此条微信文章没有评论");
// return 0;
// }else
// {
// try {
// Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url_new, null,headerMap);
// headerMap.put("Cookie", cookieMap.get("cookie"));
// String htmlBody = HttpClientTemplateOK.get(comment_url, null,headerMap);
// System.out.println(htmlBody);
// if(htmlBody!=null)
// {
// JSONObject json = JSON.parseObject(htmlBody);
// return json.getIntValue("elected_comment_total_cnt");
// }
// } catch (Exception e) {
// logger.debug("更新微信文章评论数时出现问题,问题信息:",e.getMessage());
// return -1;
// }
// }
// }else
// {
// logger.info("获取评论id失败");
// return -1;
// }
// return -1;
// }
//
//
//
//}
......@@ -22,13 +22,13 @@ public class WechatAccount implements Serializable{
private String descript; //描述
private String verified_reason; //认证原因
private String verifiedReason; //认证原因
private String openid; // account openid
private int article_count_month; //月发文量
private Integer articleCountMonth; //月发文量
private int avg_read_month; //月平均阅读数
private Integer avgReadMonth; //月平均阅读数
public String getId() {
return id;
......@@ -70,12 +70,12 @@ public class WechatAccount implements Serializable{
this.descript = descript;
}
public String getVerified_reason() {
return verified_reason;
public String getVerifiedReason() {
return verifiedReason;
}
public void setVerified_reason(String verified_reason) {
this.verified_reason = verified_reason;
public void setVerifiedReason(String verifiedReason) {
this.verifiedReason = verifiedReason;
}
public String getOpenid() {
......@@ -86,20 +86,20 @@ public class WechatAccount implements Serializable{
this.openid = openid;
}
public int getArticle_count_month() {
return article_count_month;
public Integer getArticleCountMonth() {
return articleCountMonth;
}
public void setArticle_count_month(int article_count_month) {
this.article_count_month = article_count_month;
public void setArticleCountMonth(Integer articleCountMonth) {
this.articleCountMonth = articleCountMonth;
}
public int getAvg_read_month() {
return avg_read_month;
public Integer getAvgReadMonth() {
return avgReadMonth;
}
public void setAvg_read_month(int avg_read_month) {
this.avg_read_month = avg_read_month;
public void setAvgReadMonth(Integer avgReadMonth) {
this.avgReadMonth = avgReadMonth;
}
......@@ -112,10 +112,10 @@ public class WechatAccount implements Serializable{
+ ", biz = " + biz
+ ", imgurl = " + imgurl
+ ", descript = " + descript
+ ", verified_reason = " + verified_reason
+ ", verifiedReason = " + verifiedReason
+ ", openid = " + openid
+ ", article_count_month = " + article_count_month
+ ", avg_read_month = " + avg_read_month
+ ", articleCountMonth = " + articleCountMonth
+ ", avgReadMonth = " + avgReadMonth
+ "]";
}
......@@ -124,18 +124,18 @@ public class WechatAccount implements Serializable{
public WechatAccount(String id,String name, String biz,
String imgurl,String descript,String verified_reason,
String openid, int article_count_month,int avg_read_month)
String imgurl,String descript,String verifiedReason,
String openid, Integer articleCountMonth,Integer avgReadMonth)
{
this.id = id;
this.name = name;
this.biz = biz;
this.imgurl = imgurl;
this.descript = descript;
this.verified_reason = verified_reason;
this.verifiedReason = verifiedReason;
this.openid = openid;
this.article_count_month = article_count_month;
this.avg_read_month = avg_read_month;
this.articleCountMonth = articleCountMonth;
this.avgReadMonth = avgReadMonth;
}
}
......@@ -11,6 +11,7 @@
package com.zhiwei.wechat.entity;
import java.util.Date;
import java.util.List;
/**
* @ClassName: Wechat
......@@ -28,8 +29,6 @@ public class WechatAricle {
private String content; //内容
private String imgUrl; //图片地址
private Date time; //发布时间
private int readNum; //阅读数
......@@ -46,64 +45,79 @@ public class WechatAricle {
private String user_name;//微信公众号初始id
private String rootSource;
private List<String> imgUrls;
public String getIsFirst() {
return isFirst;
}
public void setIsFirst(String isFirst) {
this.isFirst = isFirst;
}
public String getOpenId() {
return openId;
}
public void setOpenId(String openId) {
this.openId = openId;
}
public String getImgUrl() {
return imgUrl;
}
public void setImgUrl(String imgUrl) {
this.imgUrl = imgUrl;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getSource() {
return source;
}
public void setSource(String source) {
this.source = source;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public Date getTime() {
return time;
}
public void setTime(Date time) {
this.time = time;
}
public int getReadNum() {
return readNum;
}
public void setReadNum(int readNum) {
this.readNum = readNum;
}
public int getLikeNum() {
return likeNum;
}
public void setLikeNum(int likeNum) {
this.likeNum = likeNum;
}
......@@ -112,31 +126,55 @@ public class WechatAricle {
public String getBiz() {
return biz;
}
public String getWxId() {
return wxId;
}
public String getUser_name() {
return user_name;
}
public void setBiz(String biz) {
this.biz = biz;
}
public void setWxId(String wxId) {
this.wxId = wxId;
}
public List<String> getImgUrls() {
return imgUrls;
}
public void setImgUrls(List<String> imgUrls) {
this.imgUrls = imgUrls;
}
public void setUser_name(String user_name) {
this.user_name = user_name;
}
public WechatAricle(){}
public WechatAricle(String id,String title,String source,String content
,Date time,int readNum,int likeNum,String openId,String isFirst)
{
public String getRootSource() {
return rootSource;
}
public void setRootSource(String rootSource) {
this.rootSource = rootSource;
}
public WechatAricle() {
}
public WechatAricle(String id, String title, String source, String content
, Date time, String rootSource, List<String> imgUrls,int readNum, int likeNum, String openId, String isFirst) {
this.id = id.replaceAll("amp;", "");
this.title = title;
this.source = source;
this.content = content;
this.time = time;
this.rootSource = rootSource;
this.imgUrls = imgUrls;
this.readNum = readNum;
this.likeNum = likeNum;
this.openId = openId;
......@@ -145,14 +183,15 @@ public class WechatAricle {
@Override
public String toString()
{
public String toString() {
return "new Wechat["
+ "id = " + id + ","
+ "title = " + title + ","
+ "source = " + source + ","
+ "content = " + content + ","
+ "time = " + time + ","
+ "rootSource = " + rootSource + ","
+ "imgUrls = " + imgUrls + ","
+ "readNum = " + readNum + ","
+ "likeNum = " + likeNum + ","
+ "openId = " + openId + ","
......
///**
// * 抓取微信公号历史文章数据
// * @Title: WechatDataFromHistory.java
// * @Package com.zhiwei.wechat.history
// * @Description:抓取微信公号历史文章数据
// * @author hero
// * @date 2016年5月20日 上午10:27:19
// * @version V1.0
// */ /**
// *
// */
//package com.zhiwei.wechat.history;
//import java.net.Proxy;
//import java.util.ArrayList;
//import java.util.Date;
//import java.util.List;
//import java.util.Map;
//
//import org.apache.logging.log4j.LogManager;
//import org.apache.logging.log4j.Logger;
//
//import com.alibaba.fastjson.JSONArray;
//import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//import com.zhiwei.tools.timeparse.TimeUtil;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//import com.zhiwei.wechat.entity.WechatAricle;
//import com.zhiwei.wechat.entity.WechatReadLike;
//import com.zhiwei.wechat.readAndLike.AriticleContent;
//import com.zhiwei.wechat.readAndLike.WeChatReadAndLike;
//import com.zhiwei.wechat.util.Tools;
//
///**
// * @Description:抓取微信公号历史文章数据
// * @author Bewilder Z
// * @date 2016年5月20日 上午10:27:19
// */
//public class WechatDataFromHistory {
//
// private static final Logger log = LogManager.getLogger(WechatDataFromHistory.class);
//
// private boolean updateLike = false; //是否更新点赞阅读数
//
// private Date endDate = null; //采集的结束时间
//
// private List<WechatAricle> result; //数据总集合
//
// private Map<String,String> headerMap; //请求头信息
//
// private boolean follow = false; //是否关注
//
// private String nextId; //采集下一页id
//
// private String key; //更新点赞阅读的key
//
// private boolean next = true; //判断是否有下一页
//
//
// /**
// *
// * @Description:
// * @param @param updateLike 是否更新点赞数和阅读数
// * @param @param endDate 采集结束时间
// * @return
// */
// public WechatDataFromHistory(boolean updateLike,String endDate,
// boolean follow)
// {
// this.updateLike = updateLike;
// result = new ArrayList<WechatAricle>();
// headerMap = Tools.getWechatHeader();
// this.follow = follow;
// if(endDate == null)
// {
// endDate = "2011-12-30";
// }
// this.endDate = TimeUtil.parseTime(endDate, "yyyy-MM-dd");
// }
//
// public WechatDataFromHistory(){}
//
//
// /**
// * @Title: validateKey
// * @author hero
// * @Description: 验证链接是否有效
// * @param @param key
// * @param @return 设定文件
// * @return boolean 返回类型
// */
// public static boolean validateKey(String key,Proxy proxy){
// String url = "http://mp.weixin.qq.com/s?__biz=MzIwNDk0NzEyOQ==&mid=2247484544&idx=2&sn=f64abc4b15badd77b70ca942bc5176d3&scene=0#wechat_redirect";
// try {
// WechatReadLike wrl = WeChatReadAndLike.getReadAndLike(url, key,proxy);
// if(wrl.getRead()>0){
// return true;
// }else{
// return false;
// }
// } catch (Exception e) {
// log.debug("验证微信key有效性时出现问题,问题为:{}",e.getMessage());
// return false;
// }
// }
//
//
// /**
// * @Title: getWechatDataFromHistory
// * @author hero
// * @Description: 获取微信公众号历史文章
// * @param @param url
// * @param @return 设定文件
// * @return List<WechatAricle> 返回类型
// */
// public List<WechatAricle> getWechatDataFromHistory(String url,Proxy proxy)
// {
// log.info("url:::::::::{}",url);
// if(updateLike)
// {
// key = "&uin"+url.split("uin")[1].split("devicetype")[0];
// }
//
// String firstText = null;
// try {
// Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url, proxy, headerMap);
// //获取cookie
// if(cookieMap.get("cookie")!=null){
//// headerMap.put("Referer", url);
// headerMap.put("Cookie", cookieMap.get("cookie"));
// firstText = HttpClientTemplateOK.get(url, proxy,headerMap);
// }
// } catch (Exception e) {
// e.printStackTrace();
// return null;
// }
// //采集下一页数据参数,并获取第一页数据
// if(firstText != null){
// String appToken = getFirst(firstText,proxy);
// if(follow == true)
// {
// next = true;
// }
//
// //循环读取微信公号历史数据
// int i = 1;
// while(next)
// {
// String nextUrl = url.replace("home", "getmsg") + "&f=json&&offset=" + i*10 + "&count=10&scene=123&is_ok=1&appmsg_token="+appToken;
// log.info("下一页地址:{}", nextUrl);
// try {
// //采集下一页数据参数,并获取此页数据
// headerMap.put("Referer", nextUrl);
// String nextJson = HttpClientTemplateOK.get(nextUrl, proxy,headerMap);
// nextId = getNext(nextJson,proxy);
//// System.out.println("nextId============"+nextId);
//// if(nextId.equals("1")){
//// next = true;
//// }else{
//// next = false;
//// }
// ZhiWeiTools.sleep(3000);
// } catch (Exception e) {
// e.printStackTrace();
// next = false;
// }
// i++;
// }
//
// }else{
// next = false;
// }
//
// return result;
// }
//
// /***
// * 获取公号历史文章
// * @Description:
// * @param @param url
// * @param @param source
// * @param @return
// * @return List<Wechat> 返回类型
// */
// @Deprecated
// public List<WechatAricle> getWechatDataFromHistoryOld(String url,Proxy proxy)
// {
// log.info("url:::::::::{}",url);
// if(updateLike)
// {
// key = "&uin"+url.split("uin")[1].split("devicetype")[0];
// }
//
// String firstText = null;
// try {
// Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url, proxy,headerMap);
// //获取cookie
// headerMap.put("Referer", url);
// headerMap.put("Cookie", cookieMap.get("cookie"));
// firstText = HttpClientTemplateOK.get(url, proxy,headerMap);
// } catch (Exception e) {
// e.printStackTrace();
// return null;
// }
// //采集下一页数据参数,并获取第一页数据
// nextId = getFirstOld(firstText,proxy);
// boolean next = false; //判断是否有下一页
// if(follow == true)
// {
// next = true;
// }
// //循环读取微信公号历史数据
// while(next)
// {
// //没有下一页数据,结束
// if(nextId==null)
// {
// next = false;
// }else //采集下一页数据
// {
// String nextUrl = url.replace("home", "getmsg") + "&f=json&frommsgid=" + nextId + "&count=10&scene=123&is_ok=1";
// log.info("下一页地址:{}", nextUrl);
// try {
// //采集下一页数据参数,并获取此页数据
// headerMap.put("Referer", nextUrl);
// String nextJson = HttpClientTemplateOK.get(nextUrl, null,headerMap);
// nextId = getNext(nextJson,proxy);
// System.out.println("nextId-============="+nextId);
// ZhiWeiTools.sleep(3000);
// } catch (Exception e) {
// e.printStackTrace();
// next = false;
// }
//
// }
// }
//
// return result;
// }
//
//
//
//
// /**
// * @Title: getFirst
// * @Description: TODO(解析第一页数据)
// * @param @param fristText
// * @param @param source
// * @param @return 设定文件
// * @return String 返回类型
// */
// @Deprecated
// public String getFirstOld(String fristText,Proxy proxy)
// {
// fristText = fristText
// .replace("\\", "")
// .replace("'", "")
// .replace("&nbsp;", " ")
// .replace("&quot;", "\"")
// .replace("&amp;", "&")
// .replace("amp;", "")
// .replace("&#39", "'")
// .replace("&gt;", ">")
// .replace("&lt;", "<")
// .replace("&yen;", "¥")
// ;
// log.info("开始解析第一页文章");
// // 截取HTML得到有用的JSON;替换掉转义字符
// if(fristText.contains("msgList ="))
// {
// fristText = fristText.split("msgList = ")[1].split("}}]};")[0]+"}}]}";
// return getNextIdAndAnalysis(fristText,proxy);
// }
// return null;
// }
//
// /**
// * @Title: getFirst
// * @author hero
// * @Description: 截取appmsg_token 值
// * @param @param fristText
// * @param @return 设定文件
// * @return String 返回类型
// */
// private String getFirst(String fristText,Proxy proxy)
// {
// String next = null;
//
// fristText = fristText
// .replace("\\", "")
// .replace("'", "")
// .replace("&nbsp;", " ")
// .replace("&quot;", "\"")
// .replace("&amp;", "&")
// .replace("amp;", "")
// .replace("&#39", "'")
// .replace("&gt;", ">")
// .replace("&lt;", "<")
// .replace("&yen;", "¥")
// ;
// log.info("开始解析第一页文章");
//
// if(fristText.contains("window.appmsg_token = ") && fristText.contains("msgList =")){
// try {
// next = fristText.split("window.appmsg_token = \"")[1].split("\";")[0];
// fristText = fristText.split("msgList = ")[1].split("}}]};")[0]+"}}]}";
// getNextIdAndAnalysis(fristText,proxy);
// return next;
// } catch (Exception e) {
// log.info("截取下一页数据参数出现问题:{}",fristText);
// return null;
// }
// }else{
// log.info("获取下一页数据参数出现问题....{}",fristText);
// }
// return null;
// }
//
//
// /***
// * 解析微信历史文章下一页数据
// * @Description:
// * @param @param nextJosn
// * @param @param key
// * @param @param source
// * @param @return
// * @return String 返回类型
// */
// private String getNext(String nextHtml,Proxy proxy)
// {
// try {
// JSONObject nextJosn = JSONObject.parseObject(nextHtml);
// String nextText = null;
// if(null != nextJosn.getString("general_msg_list"))
// {
// nextText = nextJosn.getString("general_msg_list");
// getNextIdAndAnalysis(nextText,proxy);
// }else
// {
// log.info("下一页数据解析出现问题:{}", nextHtml);
// next = false;
// return null;
// }
// return nextJosn.getInteger("can_msg_continue")+"";
//
// } catch (Exception e) {
// log.info("解析数据有问题:{}", nextHtml);
// next = false;
// return null;
// }
//
//
// }
//
// /**
// * @Title: getNextIdAndAnalysis
// * @Description: TODO(解析下一页所需字段,及数据解析)
// * @param @param text
// * @param @param source
// * @param @return 设定文件
// * @return String 返回类型
// */
// public String getNextIdAndAnalysis(String text,Proxy proxy)
// {
// JSONObject wechatData = JSONObject.parseObject(text);
// JSONArray dataList = wechatData.getJSONArray("list");
// if(dataList.size()==0)
// {
// nextId = null;
// next = false;
// }else
// {
// for(int i = 0;i<dataList.size();i++)
// {
// JSONObject data = dataList.getJSONObject(i);
// //解析时间
// JSONObject dateJson = data.getJSONObject("comm_msg_info");
// long dateTime = dateJson.getLong("datetime");
// Date time = new Date(dateTime*1000);
// nextId = dateJson.getString("id");
// if(time.before(endDate))
// {
// next = false;
// nextId = null;
// }
// //解析文本数据
// if(null != data.getJSONObject("app_msg_ext_info"))
// {
// //解析头条数据
// JSONObject first = data.getJSONObject("app_msg_ext_info");
// String content_url = first.getString("content_url");
// String content = first.getString("digest");
// String title = first.getString("title");
// String img_url = first.getString("cover");
//
// WechatAricle wechatFirst = setWechat(content_url,title
// , time, img_url, content,"true",proxy);
// result.add(wechatFirst);
// //解析其余数据
// JSONArray otherJSON = first.getJSONArray("multi_app_msg_item_list");
// if(otherJSON != null)
// {
// for(int j = 0;j<otherJSON.size();j++)
// {
// JSONObject other = otherJSON.getJSONObject(j);
// String other_content_url = other.getString("content_url");
// String other_content = other.getString("digest");
// String other_title = other.getString("title");
// String other_img_url = other.getString("cover");
//
// WechatAricle wechatOther = setWechat(other_content_url,other_title
// , time, other_img_url, other_content,"false",proxy);
// result.add(wechatOther);
// }
// }else
// {
// log.info("只有一条数据");
// }
// }else
// {
// log.info("不存在相关文章......");
// }
// }
// }
// return nextId;
// }
//
//
//
// /**
// * 给实体类对象赋值
// * @Description:
// * @param @param url
// * @param @param title
// * @param @param source
// * @param @param datetime
// * @param @param key
// * @param @return
// * @return Wechat 返回类型
// */
// private WechatAricle setWechat(String url,String title,
// Date datetime,String imgUrl,String content,String isFirst,Proxy proxy)
// {
// WechatAricle wechat = new WechatAricle();
// wechat.setId(url);
// wechat.setTitle(title);
// wechat.setTime(datetime);
// wechat.setImgUrl(imgUrl);
// wechat.setIsFirst(isFirst);
// //采集文章
// String source = null;
// Map<String,String> sacMap = AriticleContent.getAriticleContent(url);
// if(sacMap!=null)
// {
// source = sacMap.get("source");
// content = sacMap.get("content");
// }
// //更新点赞阅读数
// if(updateLike)
// {
// url = url.replaceAll("amp;", "").replaceAll("amp;", "");
// try {
// Thread.sleep(2000);
// WechatReadLike wcrl = WeChatReadAndLike.getReadAndLike(url,key,proxy);
// wechat.setLikeNum(wcrl.getLike());
// wechat.setReadNum(wcrl.getRead());
// } catch (InterruptedException e) {
// wechat.setLikeNum(-1);
// wechat.setReadNum(-1);
// log.error("获取点赞阅读数出现为题,问题:{}", e.getMessage());
// }
// }
//
// wechat.setContent(content);
// wechat.setSource(source);
// return wechat;
// }
//
//
// public static void main(String[] args) {
// String url = "http:\\/\\/mp.weixin.qq.com\\/s?__biz=MjM5NTU0MzI0MA==&mid=2661648551&idx=1&sn=74397ab60184beb0abd4dd3f8c62f7d3&chksm=bda7c9008ad04016a5eac88c8dd18b6bc5797ae780c56e307e11781af257a68a52b7f87dfd8e&scene=27#wechat_redirect";
// System.out.println(url.replaceAll("\\", ""));
//
// }
//
//
//}
......@@ -12,13 +12,15 @@ package com.zhiwei.wechat.readAndLike;
import java.util.HashMap;
import java.util.Map;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.wechat.util.HtmlDownUtil;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.wechat.util.Tools;
/**
......@@ -48,7 +50,7 @@ public class AriticleContent{
String content = null;
String source = null;
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string();
String htmlBody = HtmlDownUtil.downloadHtml(url, headerMap, ProxyHolder.NAT_HEAVY_PROXY);
Document document = Jsoup.parse(htmlBody);
content = document.select("div.rich_media_content").text();
if(htmlBody.contains("var nickname = ")){
......@@ -80,8 +82,8 @@ public class AriticleContent{
headerMap.put("Referer", url);
String comment_id = null;
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string();
if(htmlBody!=null)
String htmlBody = HtmlDownUtil.downloadHtml(url, headerMap, ProxyHolder.NAT_HEAVY_PROXY);
if(StringUtils.isNotBlank(htmlBody))
{
Document document = Jsoup.parse(htmlBody);
String content = document.select("script").html();
......
///**
// * @Title: WindowsClient.java
// * @Package com.wcral.client
// * @Description: TODO(用一句话描述该文件做什么)
// * @author Bewilder Z
// * @date 2015年8月6日 上午9:13:37
// * @version V1.0
// */
//
//package com.zhiwei.wechat.readAndLike;
//
//import java.net.Proxy;
//import java.net.URLEncoder;
//import java.util.HashMap;
//import java.util.Map;
//
//import org.jsoup.Jsoup;
//import org.jsoup.nodes.Document;
//import org.jsoup.nodes.Element;
//import org.jsoup.select.Elements;
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//import com.zhiwei.wechat.entity.WechatReadLike;
//import com.zhiwei.wechat.search.WechatAritcleSearch;
//import com.zhiwei.wechat.util.Tools;
//
///**
// * @ClassName: WindowsClient
// * @Description: TODO(利用windows客戶端進行点赞阅读抓取)
// * @author Abner Liu
// * @date 2015年8月6日 上午9:13:37
// */
//public class WeChatReadAndLike {
//
//
// private static Logger logger = LoggerFactory.getLogger(WeChatReadAndLike.class);
//
// /**
// *
// * @Title: getReadAndLike
// * @Description: 利用windows客戶端進行点赞阅读抓取
// * @param url
// * 微信文章链接
// * @return WeChatReadLike 微信文章实体类
// *
// */
// public static WechatReadLike getReadAndLike(String url,String key,Proxy proxy){
// WechatReadLike wLike = new WechatReadLike();
// try {
// String urlcookie = Tools.getWechatCookieUrl(url, key);
// // 请求头信息
// Map<String,String> headerMap = Tools.getWechatHeader();
// Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(urlcookie, proxy, headerMap);
//
// headerMap.put("Referer", urlcookie);
// headerMap.put("Cookie", cookieMap.get("cookie")+"");
// String appmsg_token = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
// System.out.println("appmsg_token==========="+appmsg_token);
// String urlLike = Tools.getWechatLikeUrl(urlcookie,appmsg_token);
// //设置post请求参数
// HashMap<String,Object> postMap = new HashMap<String,Object>();
// postMap.put("is_only_read", "1");
//
// //获取数据
// String htsString = HttpClientTemplateOK.post(urlLike, proxy, headerMap ,postMap);
// System.out.println(htsString);
// JSONObject jsonObject = JSONObject.parseObject(htsString);
// String like_num = jsonObject.getJSONObject("appmsgstat")
// .get("like_num").toString();
//
// String real_read_num = "";
// try {
// real_read_num = jsonObject.getJSONObject("appmsgstat")
// .get("real_read_num").toString();
// if(real_read_num.equals("0"))
// {
// real_read_num = jsonObject.getJSONObject("appmsgstat")
// .get("read_num").toString();
// }
// } catch (Exception e) {
// real_read_num = jsonObject.getJSONObject("appmsgstat")
// .get("read_num").toString();
// }
// wLike.setUrl(url);
// wLike.setRead(Integer.valueOf(real_read_num));
// wLike.setLike(Integer.valueOf(like_num));
// } catch (Exception e) {
// wLike.setUrl(url);
// wLike.setRead(-1);
// wLike.setLike(-1);
// }
// return wLike;
// }
//
//
//
// /**
// * @Title: getReadAndLike
// * @Description: TODO(通过搜狗微信获取阅读数)
// * @param @param word
// * @param @param time
// * @param @param link
// * @param @param wxId
// * @param @return 设定文件
// * @return WeChatReadLike 返回类型
// */
// public static WechatReadLike getReadAndLike(String word,
// String time,String link,String wxId){
//
// WechatReadLike wLike = new WechatReadLike();
//
// Map<String,String> headerMap = new HashMap<String,String>();
// headerMap.put("Upgrade-Insecure-Requests", "1");
// headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36");
// headerMap.put("Host","weixin.sogou.com");
//
// if(time.contains(" "))
// {
// time = time.split(" ")[0];
// }
//
// String openid = WechatAritcleSearch.getOpenId(wxId,null);
// logger.info("openid is {}", openid);
//
// try {
// String url = "http://weixin.sogou.com/weixin?query=" + URLEncoder.encode(word,"utf-8")
// + "&type=2&ie=utf8&page=1&interation=&tsn=5&ft="+time + "&et="+ time
// + "&wxid="+openid+"&usip="+wxId+"&from=tool";
//
// logger.info("url is {}",url);
//
// String htmlBody = HttpClientTemplateOK.get(url, null, headerMap);
// if(htmlBody!=null)
// {
// try {
// // 解析数据
// Document document = Jsoup.parse(htmlBody);
// Elements elements = document.select("div.news-box")
// .select("ul.news-list").select("li");
// for (Element element : elements)
// {
// try {
// String url_link = element.select("div.txt-box").select("h3 >a").attr("href");
// int readNum = 0;
// try {
// readNum = Integer.valueOf(element.select("div.txt-box")
// .select("div.s-p").select("span.s1").text().trim());
// logger.info("readNum is {}", readNum);
// } catch (Exception e) {
// readNum = 0;
// }
// if(url_link.contains("&chksm="))
// {
// url_link = url_link.split("&chksm=")[0] + "&3rd" + url_link.split("&3rd")[1];
// }
//
// if(link.equals(url_link))
// {
// wLike.setUrl(link);
// wLike.setRead(readNum);
// break;
// }
// } catch (Exception e) {
// continue;
// }
// }
// } catch (Exception e) {
// wLike.setUrl(link);
// wLike.setRead(0);
// return null;
// }
// }
// } catch (Exception e) {
// e.printStackTrace();
// wLike.setUrl(link);
// wLike.setRead(0);
// return null;
// }
// return wLike;
// }
//
//}
......@@ -5,22 +5,19 @@ import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.*;;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.wechat.util.HtmlDownUtil;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.wechat.entity.WechatAricle;
import org.seimicrawler.xpath.JXDocument;
import org.seimicrawler.xpath.JXNode;
/**
* @ClassName: WechatAritcleSearch
......@@ -31,7 +28,6 @@ import com.zhiwei.wechat.entity.WechatAricle;
public class WechatAritcleSearch {
private static Logger logger = LogManager.getLogger(WechatAritcleSearch.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* 根据关键词在搜狗微信搜索微信文章,不包含全文
......@@ -39,49 +35,33 @@ public class WechatAritcleSearch {
* @param
* word 关键词
* @param
* tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* 5(某一时间段内与startTime和endTime配合使用)
* @param
* startTime 开始时间 格式为yyyy-MM-dd
* @param
* endTime 结束时间 格式为yyyy-MM-dd
* @param
* cookie 用户登录后的cookie(不登录最多10页)
* proxy 代理
* @param
* pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null
* @throws
* Exception
* @return List<Wechat> 返回类型
*/
public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String cookie,String startTime, String endTime,
Proxy proxy, Integer pages) throws Exception{
public static List<WechatAricle> wechatKeywordSearch(String word, Proxy proxy, Integer pages) throws Exception{
List<WechatAricle> result = new ArrayList<>();
Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com");
if(StringUtils.isNotBlank(cookie)) {
headerMap.put("cookie", cookie);
}
headerMap.put("cookie", "com_sohu_websearch_ITEM_PER_PAGE=100;");
boolean f = true;
int page = 1;
while (f) {
String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query="+URLEncoder.encode(word, "UTF-8")+"&ie=utf8&_sug_=n&_sug_type_=&page="+ page;
// String searchUrl = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8")
// + "&ie=utf8&_sug_=n&_sug_type_=&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis()
// + "&tsn=" + tsn + "&page=" + page;
// if (tsn == 5) {
// searchUrl = searchUrl + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool";
// }
headerMap.put("Referer", searchUrl);
// 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy, false).body().string();
try{
String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
// 解析数据
if (StringUtils.isNotBlank(htmlBody)) {
Document document = Jsoup.parse(htmlBody);
result.addAll(analysis(document));
JXDocument jxDocument = JXDocument.create(htmlBody);
result.addAll(analysis(jxDocument));
// 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text();
String pageNext = jxDocument.selNOne("//a[@id='sogou_next']").asElement().text();
if (pageNext.contains("下一页")) {
page++;
} else {
......@@ -90,10 +70,13 @@ public class WechatAritcleSearch {
} else {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
}
// ZhiWeiTools.sleep(100);
if(pages!=null && pages==page) {
break;
}
}catch (IOException e){
logger.error("根据关键词获取微信文章失败,错误为: {}", e);
}
}
return result;
}
......@@ -105,7 +88,7 @@ public class WechatAritcleSearch {
/**
*
* @Title: wechatKeywordSearch
* @Description: TODO(根据关键词在搜狗微信搜索微信文章,包含全文)
* @Description: 根据关键词在搜狗微信搜索微信文章,包含全文
* @param @param
* word 关键词
* @param @param
......@@ -115,8 +98,6 @@ public class WechatAritcleSearch {
* startTime 开始时间 格式为yyyy-MM-dd
* @param @param
* endTime 结束时间 格式为yyyy-MM-dd
* @param @param
* cookie 用户登录后的cookie(不登录最多10页)
* @param @return
* @param @throws
* ZhiWeiException
......@@ -124,30 +105,25 @@ public class WechatAritcleSearch {
* UnsupportedEncodingException 设定文件
* @return List<Wechat> 返回类型
*/
public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String startTime, String endTime,
Proxy proxy,ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException {
public static List<WechatAricle> wechatKeywordSearch(String word,
Proxy proxy, ProxyHolder proxyHolder) throws Exception {
List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com");
headerMap.put("cookie", "com_sohu_websearch_ITEM_PER_PAGE=100;");
boolean f = true;
int page = 1;
while (f) {
String searchUrl = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8")
+ "&ie=utf8&_sug_=n&_sug_type_=" + "&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis()
+ "&tsn=" + tsn + "&page=" + page;
if (tsn == 5) {
searchUrl = searchUrl + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool";
}
String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query="+URLEncoder.encode(word, "UTF-8")+"&ie=utf8&_sug_=n&_sug_type_=&page="+ page;
headerMap.put("Referer", searchUrl);
// 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy, false).body().string();
String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
// 解析数据
if (StringUtils.isNotBlank(htmlBody)) {
Document document = Jsoup.parse(htmlBody);
result.addAll(analysis(document));
JXDocument jxDocument = JXDocument.create(htmlBody);
result.addAll(analysis(jxDocument));
// 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text();
String pageNext = jxDocument.selNOne("//a[@id='sogou_next']").asElement().text();
if (pageNext.contains("下一页")) {
page++;
} else {
......@@ -170,7 +146,7 @@ public class WechatAritcleSearch {
*/
private static WechatAricle getWechatAricleInfo(String url, ProxyHolder proxy,WechatAricle wechatAricle){
try {
String contentHtml = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy.getProxy()).body().string();
String contentHtml = HtmlDownUtil.downloadHtml(url, HeaderTool.getCommonHead(), proxy.getProxy());
String content = null;
String time = null;
String source = null;
......@@ -178,20 +154,36 @@ public class WechatAritcleSearch {
String title = null;
String user_name = null;
String wxId = null;
List<String> imgUrls = null;
String rootSource = null;
if(contentHtml!=null){
Document document = Jsoup.parse(contentHtml);
title = document.select("title").text();
wxId = document.select("p.profile_meta").get(0).select("span.profile_meta_value").text();
JXDocument jxDocument = JXDocument.create(contentHtml);
title = jxDocument.selNOne("//h2[@id='activity-name']").asElement().text();
wxId = jxDocument.selNOne("//p[@class='profile_meta'][1]/span[@class='profile_meta_value']").asElement().text();
if(contentHtml.contains("js_article")){
content = document.select("div#js_article").text();
if(contentHtml.contains("js_content")){
content = jxDocument.selNOne("//div[@id='js_content']").asElement().text();
}else if(contentHtml.contains("js_share_content")){
content = document.select("div#js_share_content").text();
content = jxDocument.selNOne("//div[@id='js_share_content']").asElement().text();
}
if(contentHtml.contains("content_tpl")){
String text = document.select("script#content_tpl").html();
String text = jxDocument.selNOne("//script[@id='content_tpl']").asElement().text();
content = Jsoup.parse(text).text();
}
//解析文章图片地址
if(Objects.nonNull(jxDocument.selN("//div[@id='js_content']//img"))){
imgUrls = new ArrayList<>();
List<JXNode> imgNodeList = jxDocument.selN("//div[@id='js_content']//img");
for(JXNode imgNode : imgNodeList){
String imgUrl = imgNode.selOne("//img").asElement().attr("href");
imgUrls.add(imgUrl);
}
}
//解析来源
if(Objects.nonNull(jxDocument.selNOne("//span[@id='copyright_logo']"))){
rootSource = jxDocument.selNOne("//span[@id='profileBt']/a[@id='js_name']").asElement().text();
}
if(contentHtml.contains("d.nick_name = ")){
time = contentHtml.split("d.ct = \"")[1].split("\";")[0];
......@@ -211,6 +203,9 @@ public class WechatAritcleSearch {
wechatAricle.setTime(new Date(Long.valueOf(time)*1000));
wechatAricle.setSource(source);
}
wechatAricle.setImgUrls(imgUrls);
wechatAricle.setRootSource(rootSource);
wechatAricle.setBiz(biz);
wechatAricle.setContent(content);
wechatAricle.setWxId(wxId);
......@@ -241,7 +236,6 @@ public class WechatAritcleSearch {
List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com");
if(idOrName==null || idOrName.equals("")){
throw new IllegalArgumentException("要检索的昵称或id不能为空");
}
......@@ -259,12 +253,12 @@ public class WechatAritcleSearch {
headerMap.put("Referer", searchUrl);
// 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxyHolder, true).body().string();
String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxyHolder);
if (StringUtils.isNotBlank(htmlBody)) {
Document document = Jsoup.parse(htmlBody);
result.addAll(analysis(document));
JXDocument jxDocument = JXDocument.create(htmlBody);
result.addAll(analysis(jxDocument));
// 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text();
String pageNext = jxDocument.selNOne("//a[@id='sogou_next']").asElement().text();
if (pageNext.contains("下一页")) {
page++;
} else {
......@@ -280,15 +274,15 @@ public class WechatAritcleSearch {
/**
* 解析数据
* @param document
* @param jxDocument
* @return
*/
private static List<WechatAricle> analysis(Document document){
private static List<WechatAricle> analysis(JXDocument jxDocument){
List<WechatAricle> result = new ArrayList<WechatAricle>();
// 解析数据
try {
// 解析数据
Elements elements = document.select("div.news-box").select("ul.news-list").select("li");
List<JXNode> jxNodeList = jxDocument.selN("//div[@class='news-box']/ul[@class='news-list']/li");
String title = null;
String link = null;
String content = null;
......@@ -297,34 +291,31 @@ public class WechatAritcleSearch {
String putDate = null;
Date date = null;
WechatAricle wechat = null;
for (Element element : elements) {
if(Objects.nonNull(jxNodeList) && !jxNodeList.isEmpty()){
for (JXNode jxNode : jxNodeList) {
try {
title = element.select("div.txt-box").select("h3").text();
link = element.select("div.txt-box").select("h3 >a").attr("href");
title = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().text();
link = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().attr("href");
if(!link.contains("weixin.sogou.com")){
link = "https://weixin.sogou.com" + link;
}
if (element.select("p.txt-info").isEmpty()) {
content = element.select("p.txt-info").text();
} else {
content = element.select("div.txt-box").select("p.txt-info").text();
if (Objects.nonNull(jxNode.selOne("//div[@class='txt-box']/p"))) {
content = jxNode.selOne("//div[@class='txt-box']/p").asElement().text();
}
// System.out.println("content======================"+content);
source = element.select("div.txt-box").select("div.s-p").select("a").text();
openid = element.select("div.txt-box").select("div.s-p").select("a").attr("i");
putDate = element.select("div.txt-box").select("div.s-p").attr("t");
source = jxNode.selOne("//div[@class='s-p']/a").asElement().text();
openid = jxNode.selOne("//div[@class='s-p']/a").asElement().attr("i");
putDate = jxNode.selOne("//div[@class='s-p']").asElement().attr("t");
date = new Date(Long.valueOf(putDate) * 1000);
int readNum = 0;
try {
readNum = Integer.valueOf(element.select("div.txt-box").select("div.s-p")
.select("span.s1").text().trim());
readNum = Integer.valueOf(jxNode.selOne("//div[@class='s-p']/span[@class='s1']").asElement().text().trim());
} catch (Exception e) {
readNum = 0;
}
title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content);
if(StringUtils.isNotBlank(title)){
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
wechat = new WechatAricle(link, title, source, content, date, null, null,readNum, 0, openid, "unknow");
result.add(wechat);
}
} catch (Exception e) {
......@@ -332,6 +323,8 @@ public class WechatAritcleSearch {
continue;
}
}
}
// logger.info("数据总页数为:{}", page);
} catch (Exception e) {
logger.debug("获取数据出现问题:{}", e.getMessage());
......@@ -356,10 +349,10 @@ public class WechatAritcleSearch {
for(int i = 1;i < 3;i++) {
try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), proxyHolder).body().string();
if (htmlBody != null) {
JSONObject json = JSONObject.parseObject(htmlBody);
openId = json.getString("openid");
htmlBody = HtmlDownUtil.downloadHtml(url,null, proxyHolder);
if (StringUtils.isNotBlank(htmlBody)) {
JSONObject jsonObject = JSONObject.parseObject(htmlBody);
openId = jsonObject.getString("openid");
return openId;
}
} catch (Exception e) {
......
......@@ -5,16 +5,13 @@ import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;
import com.zhiwei.wechat.util.HtmlDownUtil;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
public class WechatCount {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
public static int getWechatCountByWord(String word, String cookie,
String startTime, String endTime, Proxy proxy) {
Map<String, String> headerMap = getWechatCount(cookie);
......@@ -24,9 +21,7 @@ public class WechatCount {
+ startTime.split(" ")[0] + "&et=" + endTime.split(" ")[0]
+ "&interation=&wxid=&usip=";
headerMap.put("Referer", url);
String result = httpBoot
.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)
.body().string();
String result = HtmlDownUtil.downloadHtml(url, headerMap, proxy);
String s = "";
int n = -1;
if (result.contains("找到约") && result.contains("条结果")) {
......
package com.zhiwei.wechat.search;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
/**
* @ClassName: WechatIndex
* @Description: 微信指数搜索
* @author Bewilder Z
* @date 2017年3月24日 下午2:52:01
*/
public class WechatIndex {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
public static void main(String[] args) throws Exception {
String word = "百度";
String startTime = "2016-01-01 00:00:00";
String endTime = "2017-03-24 00:00:00";
Map<String,String> timeLine = TimeParse.getTimeMap(startTime, endTime, "dd", 7);
for(Entry<String,String> entry: timeLine.entrySet())
{
Date st = TimeParse.stringFormartDate(entry.getKey());
Date et = TimeParse.stringFormartDate(entry.getValue());
getWechatIndex(word,st.getTime()/1000L,et.getTime()/1000L);
}
}
public static void getWechatIndex(String word,long startTime,long endTime) throws Exception
{
String url = "https://search.weixin.qq.com/cgi-bin/searchweb/getwxindex?query="
+URLCodeUtil.getURLEncode(word, "utf-8")+"&start_time="+startTime+"&end_time="+endTime+"&_="+new Date().getTime();
// String urlCookie = "https://search.weixin.qq.com/cgi-bin/searchweb/getjsapiticket?sign_url=https%253A%252F%252Fsearch.weixin.qq.com%252Fcgi-bin%252Fsearchweb%252Fclientjump%253Ftag%253Dwxindex%2526exportkey%253DAStrb5tD4ruSixIDu1cVpTA%25253D%2526pass_ticket%253DbbP7ZT5xEUrYe%25252BoOa6ACUw%25252BmgR05TAGGA1P9xnC7fIyaaOnwkWyNQK8aYtva%25252BGxj&_=1490341301892";
System.out.println(url);
Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("Host", "search.weixin.qq.com");
headerMap.put("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like Mac OS X) AppleWebKit/602.4.6 (KHTML, like Gecko) Mobile/14D27 MicroMessenger/6.5.5 NetType/4G Language/zh_CN");
headerMap.put("Referer", url);
headerMap.put("X-Requested-With","XMLHttpRequest");
headerMap.put("Accept","application/json, text/javascript, */*; q=0.01");
headerMap.put("Cookie","mmsearch_user_key=AStrb5tD4ruSixIDu1cVpTA=; pass_ticket=bbP7ZT5xEUrYe+oOa6ACUw+mgR05TAGGA1P9xnC7fIyaaOnwkWyNQK8aYtva+Gxj; pgv_pvi=4102772736; pgv_si=s1607859200; pgv_pvid=153672700");
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string();
System.out.println(htmlBody);
Thread.sleep(3000);
// Document htmlBody = Jsoup.connect(url)
// .header("Host", "search.weixin.qq.com")
// .header("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like Mac OS X) AppleWebKit/602.4.6 (KHTML, like Gecko) Mobile/14D27 MicroMessenger/6.5.5 NetType/4G Language/zh_CN")
// .header("Referer", url)
// .header("X-Requested-With","XMLHttpRequest")
// .header("Accept","application/json, text/javascript, */*; q=0.01")
// .header("Cookie", "mmsearch_user_key=AfNSrJx116RWkWvTuVC949k=; pass_ticket=bbP7ZT5xEUrYe+oOa6ACUw+mgR05TAGGA1P9xnC7fIyaaOnwkWyNQK8aYtva+Gxj; pgv_pvi=4102772736; pgv_si=s1607859200; pgv_pvid=153672700")
// .ignoreHttpErrors(false)
// .ignoreContentType(true)
// .timeout(3000)
// .get();
}
}
package com.zhiwei.wechat.search;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
......
package com.zhiwei.wechat.util;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.wechat.search.WechatAritcleSearch;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.IOException;
import java.net.Proxy;
import java.util.HashMap;
import java.util.Map;
/**
 * Web page download helper: issues a GET request through a shared
 * {@code HttpBoot} client and returns the response body as text.
 *
 * @ProjectName: wechat
 * @ClassName: HtmlDownUtil
 * @Author: admin
 * @Description: web page data download
 * @Date: 2020/8/3 8:57
 * @Version: 1.0
 */
public class HtmlDownUtil {

    private static final Logger logger = LogManager.getLogger(HtmlDownUtil.class);

    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /**
     * Cookie attached to every request.
     * NOTE(review): presumably asks the search site for 100 items per page; confirm.
     */
    private static final String ITEM_PER_PAGE_COOKIE = "com_sohu_websearch_ITEM_PER_PAGE=100;";

    /**
     * Downloads the page at {@code url} through the given proxy.
     *
     * @param url     address to fetch
     * @param headMap extra request headers; may be {@code null}. The map is
     *                copied, so the caller's instance is never modified.
     * @param proxy   proxy handed to the HTTP client
     * @return the response body as a string
     * @throws IOException if the request or reading the body fails
     */
    public static String downloadHtml(String url, Map<String, String> headMap, Proxy proxy) throws IOException {
        Map<String, String> headers = withItemPerPageCookie(headMap);
        // try-with-resources closes the response even when reading the body fails.
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headers), proxy, false)) {
            return response.body().string();
        }
    }

    /**
     * Downloads the page at {@code url} using a proxy taken from a proxy holder.
     *
     * @param url         address to fetch
     * @param headMap     extra request headers; may be {@code null}. The map is
     *                    copied, so the caller's instance is never modified.
     * @param proxyHolder proxy holder handed to the HTTP client
     * @return the response body as a string
     * @throws IOException if the request or reading the body fails
     */
    public static String downloadHtml(String url, Map<String, String> headMap, ProxyHolder proxyHolder) throws IOException {
        Map<String, String> headers = withItemPerPageCookie(headMap);
        // try-with-resources closes the response even when reading the body fails.
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headers), proxyHolder, false)) {
            return response.body().string();
        }
    }

    /**
     * Builds the header map actually sent: a copy of the caller's headers
     * (or an empty map when none were given) plus the fixed "cookie" entry.
     */
    private static Map<String, String> withItemPerPageCookie(Map<String, String> headMap) {
        Map<String, String> headers = (headMap == null)
                ? new HashMap<String, String>()
                : new HashMap<String, String>(headMap);
        headers.put("cookie", ITEM_PER_PAGE_COOKIE);
        return headers;
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment