Commit a5057f65 by zhiwei

处理搜狗微信搜索链接中出现两次https的问题

parent 09b58307
...@@ -26,6 +26,49 @@ ...@@ -26,6 +26,49 @@
</developer> </developer>
</developers> </developers>
<!-- 分发管理:管理distribution和supporting files -->
<distributionManagement>
<snapshotRepository>
<id>nexus-releases</id>
<name>User Project Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Project Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository>
</distributionManagement>
<dependencies>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.2.0-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.6.6.8-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>automaticmark-client</artifactId>
<version>2.1.7-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.71</version>
<scope>provided</scope>
</dependency>
</dependencies>
<!-- 打包管理 --> <!-- 打包管理 -->
<build> <build>
<plugins> <plugins>
...@@ -63,36 +106,4 @@ ...@@ -63,36 +106,4 @@
</plugin> </plugin>
</plugins> </plugins>
</build> </build>
<!-- 分发管理:管理distribution和supporting files -->
<distributionManagement>
<snapshotRepository>
<id>nexus-releases</id>
<name>User Project Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Project Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository>
</distributionManagement>
<dependencies>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.5-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.5.6.3-RELEASE</version>
<scope>provided</scope>
</dependency>
</dependencies>
</project> </project>
\ No newline at end of file
...@@ -5,21 +5,19 @@ import java.util.Map; ...@@ -5,21 +5,19 @@ import java.util.Map;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.wechat.util.HtmlDownUtil;
import org.apache.commons.lang3.StringUtils;
/** /**
* @ClassName: WechatAccountFans * @ClassName: WechatAccountFans
* @Description: TODO(微信公众号粉丝增量采集程序) * @Description: 微信公众号粉丝增量采集程序
* @author hero * @author hero
* @date 2017年2月8日 上午11:36:11 * @date 2017年2月8日 上午11:36:11
*/ */
public class WechatAccountFans { public class WechatAccountFans {
// private static Logger logger = LoggerFactory.getLogger(WechatAccountFans.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private Map<String,String> headerMap; private Map<String,String> headerMap;
public WechatAccountFans() public WechatAccountFans()
...@@ -45,8 +43,8 @@ public class WechatAccountFans { ...@@ -45,8 +43,8 @@ public class WechatAccountFans {
headerMap.put("Referer", referer); headerMap.put("Referer", referer);
headerMap.put("Cookie", cookie); headerMap.put("Cookie", cookie);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string(); String htmlBody = HtmlDownUtil.downloadHtml(url,headerMap, ProxyHolder.NAT_HEAVY_PROXY);
if(htmlBody != null) if(StringUtils.isNotBlank(htmlBody))
{ {
JSONObject json = JSONObject.parseObject(htmlBody); JSONObject json = JSONObject.parseObject(htmlBody);
JSONObject category_list = json.getJSONArray("category_list").getJSONObject(0); JSONObject category_list = json.getJSONArray("category_list").getJSONObject(0);
......
package com.zhiwei.wechat.account; package com.zhiwei.wechat.account;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList; import java.util.*;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.wechat.util.HtmlDownUtil;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.seimicrawler.xpath.JXDocument;
import org.seimicrawler.xpath.JXNode;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.wechat.entity.WechatAccount; import com.zhiwei.wechat.entity.WechatAccount;
public class WechatAccountInfo { public class WechatAccountInfo {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
private static Logger logger = LoggerFactory.getLogger(WechatAccountInfo.class); private static Logger logger = LoggerFactory.getLogger(WechatAccountInfo.class);
/*** /***
* @Title: getWechatAccount * @Title: getWechatAccount
* @Description: TODO(根据帐号id查询帐号信息) * @Description: 根据帐号id查询帐号信息
* @param @param id * @param @param id
* @param @param name * @param @param name
* @param @param biz * @param @param biz
* @param @return 设定文件 * @param @return 设定文件
* @return WechatAccount 返回类型 * @return WechatAccount 返回类型
*/ */
public static WechatAccount getUserInfoById(String id,Proxy proxy) public static WechatAccount getUserInfoById(String id, Proxy proxy) {
{ String url = "http://weixin.sogou.com/weixin?type=1&query=" + id + "&ie=utf8&_sug_=n&_sug_type_=";
String url = "http://weixin.sogou.com/weixin?type=1&query=" + id +"&ie=utf8&_sug_=n&_sug_type_=";
System.out.println(url); System.out.println(url);
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy,false).body().string(); String htmlBody = HtmlDownUtil.downloadHtml(url, HeaderTool.getCommonHead(), proxy);
if(htmlBody != null) if (StringUtils.isNotBlank(htmlBody)) {
{ return anaSislyAccount(url, htmlBody, id, "id");
return anaSislyAccount(url,htmlBody, id, "id");
} }
} catch (Exception e) { } catch (Exception e) {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", e.getMessage()); logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", e.getMessage());
...@@ -52,28 +49,23 @@ public class WechatAccountInfo { ...@@ -52,28 +49,23 @@ public class WechatAccountInfo {
/** /**
* @param name
* @param proxy
* @Description: 根据用户名和id精准匹配微信公号信息 * @Description: 根据用户名和id精准匹配微信公号信息
* @param String name **/
* @param String id public static WechatAccount getUserInfoByName(String name, Proxy proxy) {
* **/
public static WechatAccount getUserInfoByName(String name,Proxy proxy)
{
String query = URLCodeUtil.getURLEncode(name, "utf-8"); String query = URLCodeUtil.getURLEncode(name, "utf-8");
for(int i = 1;i<=3;i++) for (int i = 1; i <= 3; i++) {
{ String url = "http://weixin.sogou.com/weixin?type=1&query=" + query + "&ie=utf8&_sug_=n&_sug_type_=&page=" + i;
String url = "http://weixin.sogou.com/weixin?type=1&query=" + query +"&ie=utf8&_sug_=n&_sug_type_=&page="+i; logger.info("url:{}", url);
logger.info("url:{}",url);
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy).body().string(); String htmlBody = HtmlDownUtil.downloadHtml(url, HeaderTool.getCommonHead(), proxy);
if(htmlBody != null) if (StringUtils.isNotBlank(htmlBody)) {
{ WechatAccount wa = anaSislyAccount(url, htmlBody, name, "name");
WechatAccount wa = anaSislyAccount(url,htmlBody, name, "name"); if (wa != null) {
if(wa!=null)
{
return wa; return wa;
} }
}else } else {
{
logger.info("数据不存在..........."); logger.info("数据不存在...........");
} }
} catch (Exception e) { } catch (Exception e) {
...@@ -85,22 +77,17 @@ public class WechatAccountInfo { ...@@ -85,22 +77,17 @@ public class WechatAccountInfo {
} }
public static List<WechatAccount> searchWechatAccount(String word) {
public static List<WechatAccount> searchWechatAccount(String word)
{
List<WechatAccount> list = new ArrayList<>(); List<WechatAccount> list = new ArrayList<>();
String query = URLCodeUtil.getURLEncode(word, "utf-8"); String query = URLCodeUtil.getURLEncode(word, "utf-8");
boolean more = true; boolean more = true;
int i = 1; int i = 1;
while(more) while (more) {
{ String url = "http://weixin.sogou.com/weixin?type=1&query=" + query + "&ie=utf8&_sug_=n&_sug_type_=&page=" + i;
String url = "http://weixin.sogou.com/weixin?type=1&query=" + query +"&ie=utf8&_sug_=n&_sug_type_=&page="+i; logger.info("url:{}", url);
logger.info("url:{}",url);
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string(); String htmlBody = HtmlDownUtil.downloadHtml(url, HeaderTool.getCommonHead(), ProxyHolder.NAT_HEAVY_PROXY);
if(htmlBody != null) if (StringUtils.isNotBlank(htmlBody)) {
{
/** 解析页面 */ /** 解析页面 */
list.addAll(anaSislyAccountList(url, htmlBody)); list.addAll(anaSislyAccountList(url, htmlBody));
/** 判断是否有下一页 **/ /** 判断是否有下一页 **/
...@@ -112,8 +99,7 @@ public class WechatAccountInfo { ...@@ -112,8 +99,7 @@ public class WechatAccountInfo {
more = false; more = false;
} }
} }
}else } else {
{
more = false; more = false;
logger.info("数据不存在..........."); logger.info("数据不存在...........");
} }
...@@ -128,101 +114,36 @@ public class WechatAccountInfo { ...@@ -128,101 +114,36 @@ public class WechatAccountInfo {
} }
/** /**
* @Title: anaSislyAccount
* @Description: TODO(解析并获取帐号)
* @param @param htmlBody * @param @param htmlBody
* @param @param name * @param @param name
* @param @param biz * @param @param biz
* @param @return 设定文件 * @param @return 设定文件
* @return WechatAccount 返回类型 * @return WechatAccount 返回类型
* @Title: anaSislyAccount
* @Description: TODO(解析并获取帐号)
*/ */
private static WechatAccount anaSislyAccount(String url,String htmlBody private static WechatAccount anaSislyAccount(String url, String htmlBody
,String matchKey,String type) , String matchKey, String type) {
{ JXDocument jxDocument = JXDocument.create(htmlBody);
Document document = Jsoup.parse(htmlBody); if (htmlBody.contains("noresult_part1_container") ||
if(htmlBody.contains("noresult_part1_container") || htmlBody.contains("501 Not Implemented")) {
htmlBody.contains("501 Not Implemented")) logger.info("暂无与“{}”相关的官方认证订阅号。", matchKey);
{ } else {
logger.info("暂无与“{}”相关的官方认证订阅号。",matchKey);
}else
{
String readurl = htmlBody.split("account_anti_url = \"")[1].split("\";")[0]; String readurl = htmlBody.split("account_anti_url = \"")[1].split("\";")[0];
Elements element = document.select("div.news-box") List<JXNode> jxNodeList = jxDocument.selN("//div[@class='news-box']/ul[@class='news-list2']/li");
.select("ul.news-list2").select("li");
// System.out.println("数据大小:"+element.size()); // System.out.println("数据大小:"+element.size());
logger.info("数据大小:"+element.size()); logger.info("数据大小:{}", jxNodeList.size());
for (int i = 0; i < element.size(); i++) if (Objects.nonNull(jxNodeList) && !jxNodeList.isEmpty()) {
{ for (JXNode jxNode : jxNodeList) {
String openid = element.get(i).attr("d");
String userName = element.get(i).select("div.gzh-box2") WechatAccount wechatAccount = anysislyWechatAccount(jxNode);
.select("div.txt-box").select("p.tit").text(); if (Objects.nonNull(wechatAccount)) {
String id = element.get(i).select("div.gzh-box2") String userName = wechatAccount.getName();
.select("div.txt-box").select("p.info") String openid = wechatAccount.getOpenid();
.select("label").text(); String id = wechatAccount.getId();
int article_count_month = 0; Integer articleCountMonth = null;
int avg_read_month = 0; Integer avgReadMonth = null;
String imgurl = element.get(i).select("div.gzh-box2")
.select("div.img-box").select("img").attr("src");
String descript = "";
String info = "";
String bizR = "";
/**只有一个简介或者认证原因或者最近文章**/
try {
if(element.get(i).select("dl").size()==1)
{
String text = element.get(i).select("dl").get(0).text();
if(text.contains("功能介绍"))
{
descript = text;
}
if(text.contains("认证"))
{
info = text;
}
if(text.contains("最近文章"))
{
// bizR = element.get(i).select("dl").get(0).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
}
}
/**只有简介、认证原因 或者 简介、最近文章 或者 认证原因、最近文章**/
if(element.get(i).select("dl").size()==2)
{
String text = element.get(i).select("dl").get(0).text();
String text2 = element.get(i).select("dl").get(1).text();
/**有简介;认证或者最近文章*/
if(text.contains("功能介绍") )
{
descript = text;
if(text2.contains("认证"))
{
info = text2;
}else if(text2.contains("最近文章")){
// bizR = element.get(i).select("dl").get(1).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
}
}else if(text.contains("认证"))
{
info = text;
// bizR = element.get(i).select("dl").get(1).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
}
}
if(element.get(i).select("dl").size()==3)
{
descript = element.get(i).select("dl").get(0).text();
info = element.get(i).select("dl").get(1).text();
// bizR = element.get(i).select("dl").get(2).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
}
} catch (Exception e) {
e.printStackTrace();
}
switch (type) { switch (type) {
case "name": case "name":
...@@ -230,11 +151,13 @@ public class WechatAccountInfo { ...@@ -230,11 +151,13 @@ public class WechatAccountInfo {
if (userName.equals(matchKey)) { if (userName.equals(matchKey)) {
String avg = getAvgRead(url, readurl, openid); String avg = getAvgRead(url, readurl, openid);
if (avg != null) { if (avg != null) {
article_count_month = Integer.valueOf(avg.split(",")[0]); articleCountMonth = Integer.valueOf(avg.split(",")[0]);
avg_read_month = Integer.valueOf(avg.split(",")[1]); avgReadMonth = Integer.valueOf(avg.split(",")[1]);
} }
return new WechatAccount(id, userName, bizR, imgurl, descript, info, openid, wechatAccount.setAvgReadMonth(avgReadMonth);
article_count_month, avg_read_month); wechatAccount.setArticleCountMonth(articleCountMonth);
return wechatAccount;
} }
break; break;
...@@ -243,150 +166,156 @@ public class WechatAccountInfo { ...@@ -243,150 +166,156 @@ public class WechatAccountInfo {
if (id.equals(matchKey)) { if (id.equals(matchKey)) {
String avg = getAvgRead(url, readurl, openid); String avg = getAvgRead(url, readurl, openid);
if (avg != null) { if (avg != null) {
article_count_month = Integer.valueOf(avg.split(",")[0]); articleCountMonth = Integer.valueOf(avg.split(",")[0]);
avg_read_month = Integer.valueOf(avg.split(",")[1]); avgReadMonth = Integer.valueOf(avg.split(",")[1]);
} }
return new WechatAccount(id, userName, bizR, imgurl, descript, info, openid, wechatAccount.setAvgReadMonth(avgReadMonth);
article_count_month, avg_read_month); wechatAccount.setArticleCountMonth(articleCountMonth);
return wechatAccount;
} }
break; break;
} }
} }
} }
return null;
} }
}
return null;
}
/** /**
* @Title: anaSislyAccount
* @Description: TODO(解析并获取帐号列表)
* @param @param htmlBody * @param @param htmlBody
* @param @param name * @param @param name
* @param @param biz * @param @param biz
* @param @return 设定文件 * @param @return 设定文件
* @return WechatAccount 返回类型 * @return WechatAccount 返回类型
* @Title: anaSislyAccountList
* @Description: TODO(解析并获取帐号列表)
*/ */
private static List<WechatAccount> anaSislyAccountList(String url,String htmlBody) private static List<WechatAccount> anaSislyAccountList(String url, String htmlBody) {
{
List<WechatAccount> list = new ArrayList<WechatAccount>(); List<WechatAccount> list = new ArrayList<WechatAccount>();
Document document = Jsoup.parse(htmlBody); JXDocument jxDocument = JXDocument.create(htmlBody);
if(htmlBody.contains("noresult_part1_container") || if (htmlBody.contains("noresult_part1_container") ||
htmlBody.contains("501 Not Implemented")) htmlBody.contains("501 Not Implemented")) {
{
logger.info("暂无与“{}”相关的官方认证订阅号。"); logger.info("暂无与“{}”相关的官方认证订阅号。");
}else } else {
{
String readurl = htmlBody.split("account_anti_url = \"")[1].split("\";")[0]; String readurl = htmlBody.split("account_anti_url = \"")[1].split("\";")[0];
JSONObject avgJson = getAvgRead(url, readurl); JSONObject avgJson = getAvgRead(url, readurl);
Elements element = document.select("div.news-box") List<JXNode> jxNodeList = jxDocument.selN("//div[@class='news-box']/ul[@class='news-list2']/li");
.select("ul.news-list2").select("li"); if(Objects.nonNull(jxNodeList) && !jxNodeList.isEmpty()){
logger.info("数据大小:"+element.size()); for(JXNode jxNode : jxNodeList){
for (int i = 0; i < element.size(); i++) WechatAccount wechatAccount = anysislyWechatAccount(jxNode);
{ String openid = wechatAccount.getOpenid();
String openid = element.get(i).attr("d"); if(avgJson!=null && avgJson.containsKey("msg")){
String userName = element.get(i).select("div.gzh-box2") JSONObject data = avgJson.getJSONObject("msg");
.select("div.txt-box").select("p.tit").text(); Integer articleCountMonth = null;
String id = element.get(i).select("div.gzh-box2") Integer avgReadMonth = null;
.select("div.txt-box").select("p.info") if(openid!=null){
.select("label").text(); String avg = data.getString(openid);
if (avg != null) {
wechatAccount.setAvgReadMonth(avgReadMonth);
wechatAccount.setArticleCountMonth(articleCountMonth);
}
}
}
list.add(wechatAccount);
}
}
}
return list;
}
/**
* 解析账号信息
* @param jxNode
* @return
*/
private static WechatAccount anysislyWechatAccount(JXNode jxNode) {
String openid = jxNode.selOne("//li").asElement().attr("d");
String userName = jxNode.selOne("//p[@class='tit']/a/em").asElement().text();
String id = jxNode.selOne("//p[@class='info']/label").asElement().text();
int article_count_month = 0; int article_count_month = 0;
int avg_read_month = 0; int avg_read_month = 0;
String imgurl = element.get(i).select("div.gzh-box2") String imgurl = jxNode.selOne("//div[@class='img-box']/a").asElement().attr("src");
.select("div.img-box").select("img").attr("src");
String descript = ""; String descript = "";
String info = ""; String info = "";
String bizR = ""; String bizR = "";
/**只有一个简介或者认证原因或者最近文章**/ /**只有一个简介或者认证原因或者最近文章**/
if(element.get(i).select("dl").size()==1)
{ try {
String text = element.get(i).select("dl").get(0).text(); if (jxNode.sel("//dl").size() == 1) {
if(text.contains("功能介绍")) String text = jxNode.sel("//dl").get(0).asElement().text();
{ if (text.contains("功能介绍")) {
descript = text; descript = text;
} }
if(text.contains("认证")) if (text.contains("认证")) {
{
info = text; info = text;
} }
if(text.contains("最近文章")) if (text.contains("最近文章")) {
{
// bizR = element.get(i).select("dl").get(0).select("a") // bizR = element.get(i).select("dl").get(0).select("a")
// .attr("href").split("biz=")[1].split("&")[0]; // .attr("href").split("biz=")[1].split("&")[0];
} }
} }
/**只有简介、认证原因 或者 简介、最近文章 或者 认证原因、最近文章**/ /**只有简介、认证原因 或者 简介、最近文章 或者 认证原因、最近文章**/
if(element.get(i).select("dl").size()==2) if (jxNode.sel("//dl").size() == 2) {
{ String text = jxNode.sel("//dl").get(0).asElement().text();
String text = element.get(i).select("dl").get(0).text(); String text2 = jxNode.sel("//dl").get(1).asElement().text();
String text2 = element.get(i).select("dl").get(1).text();
/**有简介;认证或者最近文章*/ /**有简介;认证或者最近文章*/
if(text.contains("功能介绍")) if (text.contains("功能介绍")) {
{
descript = text; descript = text;
if(text2.contains("认证")) if (text2.contains("认证")) {
{
info = text2; info = text2;
} else if (text2.contains("最近文章")) {
// bizR = element.get(i).select("dl").get(1).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
} }
} } else if (text.contains("认证")) {
/**有认证和最近文章**/
if(text.contains("认证"))
{
info = text; info = text;
// bizR = element.get(i).select("dl").get(1).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
} }
} }
if(element.get(i).select("dl").size()==3) if (jxNode.sel("//dl").size() == 3) {
{ descript = jxNode.sel("//dl").get(0).asElement().text();
descript = element.get(i).select("dl").get(0).text(); info = jxNode.sel("//dl").get(1).asElement().text();
info = element.get(i).select("dl").get(1).text(); // bizR = element.get(i).select("dl").get(2).select("a")
// .attr("href").split("biz=")[1].split("&")[0];
} }
if(avgJson!=null && avgJson.containsKey("msg")){ return new WechatAccount(id, userName, bizR, imgurl, descript, info, openid, article_count_month, avg_read_month);
JSONObject data = avgJson.getJSONObject("msg");
if(openid!=null){ } catch (Exception e) {
String avg = data.getString(openid); e.printStackTrace();
if (avg != null) {
article_count_month = Integer.valueOf(avg.split(",")[0]);
avg_read_month = Integer.valueOf(avg.split(",")[1]);
}
}
}
WechatAccount wechatAccount = new WechatAccount(id,userName,bizR,imgurl,descript,info,openid,article_count_month,avg_read_month);
list.add(wechatAccount);
}
} }
return list; return null;
} }
/** /**
* @Title: getAvgRead
* @Description: TODO(更新平均阅读数)
* @param @param url * @param @param url
* @param @param readUrl * @param @param readUrl
* @param @param openid * @param @param openid
* @param @return 设定文件 * @param @return 设定文件
* @return String 返回类型 * @return String 返回类型
* @Title: getAvgRead
* @Description: TODO(更新平均阅读数)
*/ */
private static String getAvgRead(String url,String readUrl,String openid) private static String getAvgRead(String url, String readUrl, String openid) {
{ Map<String, String> headerMap = new HashMap<String, String>();
Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("Referer", url); headerMap.put("Referer", url);
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"); headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
readUrl = "http://weixin.sogou.com" + readUrl; readUrl = "http://weixin.sogou.com" + readUrl;
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(readUrl,headerMap)).body().string(); String htmlBody = HtmlDownUtil.downloadHtml(readUrl, headerMap, ProxyHolder.NAT_HEAVY_PROXY);
if(htmlBody != null) if (StringUtils.isNotBlank(htmlBody)) {
{
JSONObject json = JSONObject.parseObject(htmlBody); JSONObject json = JSONObject.parseObject(htmlBody);
JSONObject data = json.getJSONObject("msg"); JSONObject data = json.getJSONObject("msg");
if(data.containsKey(openid)) if (data.containsKey(openid)) {
{
return data.getString(openid); return data.getString(openid);
} }
} }
...@@ -399,16 +328,14 @@ public class WechatAccountInfo { ...@@ -399,16 +328,14 @@ public class WechatAccountInfo {
} }
private static JSONObject getAvgRead(String url, String readUrl) {
private static JSONObject getAvgRead(String url,String readUrl) { Map<String, String> headerMap = new HashMap<String, String>();
Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("Referer", url); headerMap.put("Referer", url);
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"); headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
readUrl = "http://weixin.sogou.com" + readUrl; readUrl = "http://weixin.sogou.com" + readUrl;
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(readUrl,headerMap)).body().string(); String htmlBody = HtmlDownUtil.downloadHtml(readUrl, headerMap, ProxyHolder.NAT_HEAVY_PROXY);
if(htmlBody != null) if (StringUtils.isNotBlank(htmlBody)) {
{
JSONObject json = JSONObject.parseObject(htmlBody); JSONObject json = JSONObject.parseObject(htmlBody);
return json.getJSONObject("msg"); return json.getJSONObject("msg");
} }
......
///**
// * 获取微信文章评论
// * @Title: WechatComment.java
// * @Package com.zhiwei.wechat.comment
// * @Description:获取微信文章评论
// * @author hero
// * @date 2016年6月25日 上午8:17:37
// * @version V1.0
// */ /**
// *
// */
//package com.zhiwei.wechat.comment;
//
//import java.io.IOException;
//import java.util.List;
//import java.util.Map;
//
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//import com.alibaba.fastjson.JSON;
//import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.crawler.core.HttpBoot;
//import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//import com.zhiwei.wechat.entity.WechatComment;
//import com.zhiwei.wechat.readAndLike.AriticleContent;
//import com.zhiwei.wechat.util.Tools;
//
///**
// * @Description:获取微信文章评论
// * @author hero
// * @date 2016年6月25日 上午8:17:37
// */
//public class WechatCommentList {
//
// private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
//
// private static WechatComment wc = new WechatComment();
//
// private static Logger logger = LoggerFactory.getLogger(WechatCommentList.class);
// /**
// * 根据文章url获取文章评论列表
// * @Description:
// * @param @param url
// * @param @return
// * @return List<WechatComment> 返回类型
// */
// public static List<WechatComment> getWechatCommentList(String url,String key)
// {
// List<WechatComment> wcList = null;
// /*处理url*/
// String urlcookie = url;
// if(!url.contains("key")){
// urlcookie = Tools.getWechatCookieUrl(url, key);
// }
// // 请求头信息
// Map<String,String> headerMap = Tools.getWechatHeader();
// Map<String, String> cookieMap;
// try {
// cookieMap = HttpClientTemplateOK.getCookie(urlcookie, null, headerMap);
// headerMap.put("Referer", url);
// if(cookieMap.get("cookie").length()>50){
// headerMap.put("Cookie", cookieMap.get("cookie")+"");
// }
// String appmsg_token = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
//
// String biz = url.split("__biz=")[1].split("&")[0];
// String appmsgid = url.split("mid=")[1].split("&")[0];
// String comment_id = AriticleContent.getCommentId(url,key);
// if(comment_id!=null && appmsg_token!=null)
// {
// String comment_url = "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz
// + "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key
// + "&appmsg_token=" + appmsg_token;
// /**解析相关数据*/
// System.out.println(comment_url);
// if("0".equals(comment_id))
// {
// logger.info("此条微信文章没有评论");
// }else
// {
// try {
// String htmlBody = HttpClientTemplateOK.get(comment_url, null, headerMap);
// if(htmlBody!=null)
// {
// JSONObject json = JSON.parseObject(htmlBody);
// wcList = wc.constructWechatComment(json.getJSONArray("elected_comment"),url);
// return wcList;
// }
// } catch (Exception e) {
// logger.info("解析微信文章评论列表时出现问题:", e.fillInStackTrace());
// return null;
// }
// }
// }
// } catch (IOException e1) {
// return null;
// } catch (Exception e1) {
// e1.printStackTrace();
// }
//
// return null;
// }
//
//
// /**
// * @Title: getWechatCommentCount
// * @Description: TODO(根据微信文章地址更新微信评论数)
// * @param @param url
// * @param @param key
// * @param @return 设定文件
// * @return int 返回类型
// */
// public static int getWechatCommentCount(String url,String key)
// {
// System.out.println(url);
// /*处理url*/
// String url_new = url;
// if(url.contains("#rd"))
// {
// url_new = url.split("#rd")[0] + key;
// }else if(url.contains("#wechat_redirect"))
// {
// url_new = url.split("#wechat_redirect")[0] + key;
// }
// String biz = url.split("__biz=")[1].split("&")[0];
// String appmsgid = url.split("mid=")[1].split("&")[0];
//
// /**获取网页头信息**/
// Map<String,String> headerMap = Tools.getWechatHeader();
// /*获取评论id*/
// String comment_id = AriticleContent.getCommentId(url,key);
// if(comment_id!=null)
// {
// String comment_url = "http://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz
// + "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key;
// /**解析相关数据*/
//
// if("0".equals(comment_id))
// {
// logger.info("此条微信文章没有评论");
// return 0;
// }else
// {
// try {
// Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url_new, null,headerMap);
// headerMap.put("Cookie", cookieMap.get("cookie"));
// String htmlBody = HttpClientTemplateOK.get(comment_url, null,headerMap);
// System.out.println(htmlBody);
// if(htmlBody!=null)
// {
// JSONObject json = JSON.parseObject(htmlBody);
// return json.getIntValue("elected_comment_total_cnt");
// }
// } catch (Exception e) {
// logger.debug("更新微信文章评论数时出现问题,问题信息:",e.getMessage());
// return -1;
// }
// }
// }else
// {
// logger.info("获取评论id失败");
// return -1;
// }
// return -1;
// }
//
//
//
//}
...@@ -22,13 +22,13 @@ public class WechatAccount implements Serializable{ ...@@ -22,13 +22,13 @@ public class WechatAccount implements Serializable{
private String descript; //描述 private String descript; //描述
private String verified_reason; //认证原因 private String verifiedReason; //认证原因
private String openid; //公众号openid private String openid; //公众号openid
private int article_count_month; //月发文量 private Integer articleCountMonth; //月发文量
private int avg_read_month; //月平均阅读数 private Integer avgReadMonth; //月平均阅读数
public String getId() { public String getId() {
return id; return id;
...@@ -70,12 +70,12 @@ public class WechatAccount implements Serializable{ ...@@ -70,12 +70,12 @@ public class WechatAccount implements Serializable{
this.descript = descript; this.descript = descript;
} }
public String getVerified_reason() { public String getVerifiedReason() {
return verified_reason; return verifiedReason;
} }
public void setVerified_reason(String verified_reason) { public void setVerifiedReason(String verifiedReason) {
this.verified_reason = verified_reason; this.verifiedReason = verifiedReason;
} }
public String getOpenid() { public String getOpenid() {
...@@ -86,20 +86,20 @@ public class WechatAccount implements Serializable{ ...@@ -86,20 +86,20 @@ public class WechatAccount implements Serializable{
this.openid = openid; this.openid = openid;
} }
public int getArticle_count_month() { public Integer getArticleCountMonth() {
return article_count_month; return articleCountMonth;
} }
public void setArticle_count_month(int article_count_month) { public void setArticleCountMonth(Integer articleCountMonth) {
this.article_count_month = article_count_month; this.articleCountMonth = articleCountMonth;
} }
public int getAvg_read_month() { public Integer getAvgReadMonth() {
return avg_read_month; return avgReadMonth;
} }
public void setAvg_read_month(int avg_read_month) { public void setAvgReadMonth(Integer avgReadMonth) {
this.avg_read_month = avg_read_month; this.avgReadMonth = avgReadMonth;
} }
...@@ -112,10 +112,10 @@ public class WechatAccount implements Serializable{ ...@@ -112,10 +112,10 @@ public class WechatAccount implements Serializable{
+ ", biz = " + biz + ", biz = " + biz
+ ", imgurl = " + imgurl + ", imgurl = " + imgurl
+ ", descript = " + descript + ", descript = " + descript
+ ", verified_reason = " + verified_reason + ", verifiedReason = " + verifiedReason
+ ", openid = " + openid + ", openid = " + openid
+ ", article_count_month = " + article_count_month + ", articleCountMonth = " + articleCountMonth
+ ", avg_read_month = " + avg_read_month + ", avgReadMonth = " + avgReadMonth
+ "]"; + "]";
} }
...@@ -124,18 +124,18 @@ public class WechatAccount implements Serializable{ ...@@ -124,18 +124,18 @@ public class WechatAccount implements Serializable{
public WechatAccount(String id,String name, String biz, public WechatAccount(String id,String name, String biz,
String imgurl,String descript,String verified_reason, String imgurl,String descript,String verifiedReason,
String openid, int article_count_month,int avg_read_month) String openid, Integer articleCountMonth,Integer avgReadMonth)
{ {
this.id = id; this.id = id;
this.name = name; this.name = name;
this.biz = biz; this.biz = biz;
this.imgurl = imgurl; this.imgurl = imgurl;
this.descript = descript; this.descript = descript;
this.verified_reason = verified_reason; this.verifiedReason = verifiedReason;
this.openid = openid; this.openid = openid;
this.article_count_month = article_count_month; this.articleCountMonth = articleCountMonth;
this.avg_read_month = avg_read_month; this.avgReadMonth = avgReadMonth;
} }
} }
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
package com.zhiwei.wechat.entity; package com.zhiwei.wechat.entity;
import java.util.Date; import java.util.Date;
import java.util.List;
/** /**
* @ClassName: Wechat * @ClassName: Wechat
...@@ -28,8 +29,6 @@ public class WechatAricle { ...@@ -28,8 +29,6 @@ public class WechatAricle {
private String content; //内容 private String content; //内容
private String imgUrl; //图片地址
private Date time; //发布时间 private Date time; //发布时间
private int readNum; //阅读数 private int readNum; //阅读数
...@@ -46,64 +45,79 @@ public class WechatAricle { ...@@ -46,64 +45,79 @@ public class WechatAricle {
private String user_name;//微信公众号初始id private String user_name;//微信公众号初始id
private String rootSource;
private List<String> imgUrls;
public String getIsFirst() { public String getIsFirst() {
return isFirst; return isFirst;
} }
public void setIsFirst(String isFirst) { public void setIsFirst(String isFirst) {
this.isFirst = isFirst; this.isFirst = isFirst;
} }
public String getOpenId() { public String getOpenId() {
return openId; return openId;
} }
public void setOpenId(String openId) { public void setOpenId(String openId) {
this.openId = openId; this.openId = openId;
} }
public String getImgUrl() {
return imgUrl;
}
public void setImgUrl(String imgUrl) {
this.imgUrl = imgUrl;
}
public String getId() { public String getId() {
return id; return id;
} }
public void setId(String id) { public void setId(String id) {
this.id = id; this.id = id;
} }
public String getTitle() { public String getTitle() {
return title; return title;
} }
public void setTitle(String title) { public void setTitle(String title) {
this.title = title; this.title = title;
} }
public String getSource() { public String getSource() {
return source; return source;
} }
public void setSource(String source) { public void setSource(String source) {
this.source = source; this.source = source;
} }
public String getContent() { public String getContent() {
return content; return content;
} }
public void setContent(String content) { public void setContent(String content) {
this.content = content; this.content = content;
} }
public Date getTime() { public Date getTime() {
return time; return time;
} }
public void setTime(Date time) { public void setTime(Date time) {
this.time = time; this.time = time;
} }
public int getReadNum() { public int getReadNum() {
return readNum; return readNum;
} }
public void setReadNum(int readNum) { public void setReadNum(int readNum) {
this.readNum = readNum; this.readNum = readNum;
} }
public int getLikeNum() { public int getLikeNum() {
return likeNum; return likeNum;
} }
public void setLikeNum(int likeNum) { public void setLikeNum(int likeNum) {
this.likeNum = likeNum; this.likeNum = likeNum;
} }
...@@ -112,31 +126,55 @@ public class WechatAricle { ...@@ -112,31 +126,55 @@ public class WechatAricle {
public String getBiz() { public String getBiz() {
return biz; return biz;
} }
public String getWxId() { public String getWxId() {
return wxId; return wxId;
} }
public String getUser_name() { public String getUser_name() {
return user_name; return user_name;
} }
public void setBiz(String biz) { public void setBiz(String biz) {
this.biz = biz; this.biz = biz;
} }
public void setWxId(String wxId) { public void setWxId(String wxId) {
this.wxId = wxId; this.wxId = wxId;
} }
/**
 * Returns the image URL list stored on this article.
 *
 * @return the current {@code imgUrls} reference (not a copy); {@code null} if it was never assigned
 */
public List<String> getImgUrls() {
return imgUrls;
}
/**
 * Replaces the image URL list of this article.
 *
 * @param imgUrls list to store; the reference is kept as-is, without copying
 */
public void setImgUrls(List<String> imgUrls) {
this.imgUrls = imgUrls;
}
public void setUser_name(String user_name) { public void setUser_name(String user_name) {
this.user_name = user_name; this.user_name = user_name;
} }
public WechatAricle(){}
public WechatAricle(String id,String title,String source,String content public String getRootSource() {
,Date time,int readNum,int likeNum,String openId,String isFirst) return rootSource;
{ }
public void setRootSource(String rootSource) {
this.rootSource = rootSource;
}
/**
 * Creates an empty article; no field is assigned here, so every field keeps
 * its Java default until a setter is called.
 */
public WechatAricle() {
}
public WechatAricle(String id, String title, String source, String content
, Date time, String rootSource, List<String> imgUrls,int readNum, int likeNum, String openId, String isFirst) {
this.id = id.replaceAll("amp;", ""); this.id = id.replaceAll("amp;", "");
this.title = title; this.title = title;
this.source = source; this.source = source;
this.content = content; this.content = content;
this.time = time; this.time = time;
this.rootSource = rootSource;
this.imgUrls = imgUrls;
this.readNum = readNum; this.readNum = readNum;
this.likeNum = likeNum; this.likeNum = likeNum;
this.openId = openId; this.openId = openId;
...@@ -145,14 +183,15 @@ public class WechatAricle { ...@@ -145,14 +183,15 @@ public class WechatAricle {
@Override @Override
public String toString() public String toString() {
{
return "new Wechat[" return "new Wechat["
+ "id = " + id + "," + "id = " + id + ","
+ "title = " + title + "," + "title = " + title + ","
+ "source = " + source + "," + "source = " + source + ","
+ "content = " + content + "," + "content = " + content + ","
+ "time = " + time + "," + "time = " + time + ","
+ "rootSource = " + rootSource + ","
+ "imgUrls = " + imgUrls + ","
+ "readNum = " + readNum + "," + "readNum = " + readNum + ","
+ "likeNum = " + likeNum + "," + "likeNum = " + likeNum + ","
+ "openId = " + openId + "," + "openId = " + openId + ","
......
///**
// * 抓取微信公号历史文章数据
// * @Title: WechatDataFromHistory.java
// * @Package com.zhiwei.wechat.history
// * @Description:抓取微信公号历史文章数据
// * @author hero
// * @date 2016年5月20日 上午10:27:19
// * @version V1.0
// */ /**
// *
// */
//package com.zhiwei.wechat.history;
//import java.net.Proxy;
//import java.util.ArrayList;
//import java.util.Date;
//import java.util.List;
//import java.util.Map;
//
//import org.apache.logging.log4j.LogManager;
//import org.apache.logging.log4j.Logger;
//
//import com.alibaba.fastjson.JSONArray;
//import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//import com.zhiwei.tools.timeparse.TimeUtil;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//import com.zhiwei.wechat.entity.WechatAricle;
//import com.zhiwei.wechat.entity.WechatReadLike;
//import com.zhiwei.wechat.readAndLike.AriticleContent;
//import com.zhiwei.wechat.readAndLike.WeChatReadAndLike;
//import com.zhiwei.wechat.util.Tools;
//
///**
// * @Description:抓取微信公号历史文章数据
// * @author Bewilder Z
// * @date 2016年5月20日 上午10:27:19
// */
//public class WechatDataFromHistory {
//
// private static final Logger log = LogManager.getLogger(WechatDataFromHistory.class);
//
// private boolean updateLike = false; //是否更新点赞阅读数
//
// private Date endDate = null; //采集的结束时间
//
// private List<WechatAricle> result; //数据总集合
//
// private Map<String,String> headerMap; //请求头信息
//
// private boolean follow = false; //是否关注
//
// private String nextId; //采集下一页id
//
// private String key; //更新点赞阅读的key
//
// private boolean next = true; //判断是否有下一页
//
//
// /**
// *
// * @Description:
// * @param @param updateLike 是否更新点赞数和阅读数
// * @param @param endDate 采集结束时间
// * @return
// */
// public WechatDataFromHistory(boolean updateLike,String endDate,
// boolean follow)
// {
// this.updateLike = updateLike;
// result = new ArrayList<WechatAricle>();
// headerMap = Tools.getWechatHeader();
// this.follow = follow;
// if(endDate == null)
// {
// endDate = "2011-12-30";
// }
// this.endDate = TimeUtil.parseTime(endDate, "yyyy-MM-dd");
// }
//
// public WechatDataFromHistory(){}
//
//
// /**
// * @Title: validateKey
// * @author hero
// * @Description: 验证链接是否有效
// * @param @param key
// * @param @return 设定文件
// * @return boolean 返回类型
// */
// public static boolean validateKey(String key,Proxy proxy){
// String url = "http://mp.weixin.qq.com/s?__biz=MzIwNDk0NzEyOQ==&mid=2247484544&idx=2&sn=f64abc4b15badd77b70ca942bc5176d3&scene=0#wechat_redirect";
// try {
// WechatReadLike wrl = WeChatReadAndLike.getReadAndLike(url, key,proxy);
// if(wrl.getRead()>0){
// return true;
// }else{
// return false;
// }
// } catch (Exception e) {
// log.debug("验证微信key有效性时出现问题,问题为:{}",e.getMessage());
// return false;
// }
// }
//
//
// /**
// * @Title: getWechatDataFromHistory
// * @author hero
// * @Description: 获取微信公众号历史文章
// * @param @param url
// * @param @return 设定文件
// * @return List<WechatAricle> 返回类型
// */
// public List<WechatAricle> getWechatDataFromHistory(String url,Proxy proxy)
// {
// log.info("url:::::::::{}",url);
// if(updateLike)
// {
// key = "&uin"+url.split("uin")[1].split("devicetype")[0];
// }
//
// String firstText = null;
// try {
// Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url, proxy, headerMap);
// //获取cookie
// if(cookieMap.get("cookie")!=null){
//// headerMap.put("Referer", url);
// headerMap.put("Cookie", cookieMap.get("cookie"));
// firstText = HttpClientTemplateOK.get(url, proxy,headerMap);
// }
// } catch (Exception e) {
// e.printStackTrace();
// return null;
// }
// //采集下一页数据参数,并获取第一页数据
// if(firstText != null){
// String appToken = getFirst(firstText,proxy);
// if(follow == true)
// {
// next = true;
// }
//
// //循环读取微信公号历史数据
// int i = 1;
// while(next)
// {
// String nextUrl = url.replace("home", "getmsg") + "&f=json&&offset=" + i*10 + "&count=10&scene=123&is_ok=1&appmsg_token="+appToken;
// log.info("下一页地址:{}", nextUrl);
// try {
// //采集下一页数据参数,并获取此页数据
// headerMap.put("Referer", nextUrl);
// String nextJson = HttpClientTemplateOK.get(nextUrl, proxy,headerMap);
// nextId = getNext(nextJson,proxy);
//// System.out.println("nextId============"+nextId);
//// if(nextId.equals("1")){
//// next = true;
//// }else{
//// next = false;
//// }
// ZhiWeiTools.sleep(3000);
// } catch (Exception e) {
// e.printStackTrace();
// next = false;
// }
// i++;
// }
//
// }else{
// next = false;
// }
//
// return result;
// }
//
// /***
// * 获取公号历史文章
// * @Description:
// * @param @param url
// * @param @param source
// * @param @return
// * @return List<Wechat> 返回类型
// */
// @Deprecated
// public List<WechatAricle> getWechatDataFromHistoryOld(String url,Proxy proxy)
// {
// log.info("url:::::::::{}",url);
// if(updateLike)
// {
// key = "&uin"+url.split("uin")[1].split("devicetype")[0];
// }
//
// String firstText = null;
// try {
// Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url, proxy,headerMap);
// //获取cookie
// headerMap.put("Referer", url);
// headerMap.put("Cookie", cookieMap.get("cookie"));
// firstText = HttpClientTemplateOK.get(url, proxy,headerMap);
// } catch (Exception e) {
// e.printStackTrace();
// return null;
// }
// //采集下一页数据参数,并获取第一页数据
// nextId = getFirstOld(firstText,proxy);
// boolean next = false; //判断是否有下一页
// if(follow == true)
// {
// next = true;
// }
// //循环读取微信公号历史数据
// while(next)
// {
// //没有下一页数据,结束
// if(nextId==null)
// {
// next = false;
// }else //采集下一页数据
// {
// String nextUrl = url.replace("home", "getmsg") + "&f=json&frommsgid=" + nextId + "&count=10&scene=123&is_ok=1";
// log.info("下一页地址:{}", nextUrl);
// try {
// //采集下一页数据参数,并获取此页数据
// headerMap.put("Referer", nextUrl);
// String nextJson = HttpClientTemplateOK.get(nextUrl, null,headerMap);
// nextId = getNext(nextJson,proxy);
// System.out.println("nextId-============="+nextId);
// ZhiWeiTools.sleep(3000);
// } catch (Exception e) {
// e.printStackTrace();
// next = false;
// }
//
// }
// }
//
// return result;
// }
//
//
//
//
// /**
// * @Title: getFirstOld
// * @Description: 解析第一页数据(旧版接口,已废弃)
// * @param @param fristText
// * @param @param proxy
// * @param @return 设定文件
// * @return String 返回类型
// */
// @Deprecated
// public String getFirstOld(String fristText,Proxy proxy)
// {
// fristText = fristText
// .replace("\\", "")
// .replace("'", "")
// .replace("&nbsp;", " ")
// .replace("&quot;", "\"")
// .replace("&amp;", "&")
// .replace("amp;", "")
// .replace("&#39", "'")
// .replace("&gt;", ">")
// .replace("&lt;", "<")
// .replace("&yen;", "¥")
// ;
// log.info("开始解析第一页文章");
// // 截取HTML得到有用的JSON;替换掉转义字符
// if(fristText.contains("msgList ="))
// {
// fristText = fristText.split("msgList = ")[1].split("}}]};")[0]+"}}]}";
// return getNextIdAndAnalysis(fristText,proxy);
// }
// return null;
// }
//
// /**
// * @Title: getFirst
// * @author hero
// * @Description: 截取appmsg_token 值
// * @param @param fristText
// * @param @return 设定文件
// * @return String 返回类型
// */
// private String getFirst(String fristText,Proxy proxy)
// {
// String next = null;
//
// fristText = fristText
// .replace("\\", "")
// .replace("'", "")
// .replace("&nbsp;", " ")
// .replace("&quot;", "\"")
// .replace("&amp;", "&")
// .replace("amp;", "")
// .replace("&#39", "'")
// .replace("&gt;", ">")
// .replace("&lt;", "<")
// .replace("&yen;", "¥")
// ;
// log.info("开始解析第一页文章");
//
// if(fristText.contains("window.appmsg_token = ") && fristText.contains("msgList =")){
// try {
// next = fristText.split("window.appmsg_token = \"")[1].split("\";")[0];
// fristText = fristText.split("msgList = ")[1].split("}}]};")[0]+"}}]}";
// getNextIdAndAnalysis(fristText,proxy);
// return next;
// } catch (Exception e) {
// log.info("截取下一页数据参数出现问题:{}",fristText);
// return null;
// }
// }else{
// log.info("获取下一页数据参数出现问题....{}",fristText);
// }
// return null;
// }
//
//
// /***
// * 解析微信历史文章下一页数据
// * @Description:
// * @param @param nextJosn
// * @param @param key
// * @param @param source
// * @param @return
// * @return String 返回类型
// */
// private String getNext(String nextHtml,Proxy proxy)
// {
// try {
// JSONObject nextJosn = JSONObject.parseObject(nextHtml);
// String nextText = null;
// if(null != nextJosn.getString("general_msg_list"))
// {
// nextText = nextJosn.getString("general_msg_list");
// getNextIdAndAnalysis(nextText,proxy);
// }else
// {
// log.info("下一页数据解析出现问题:{}", nextHtml);
// next = false;
// return null;
// }
// return nextJosn.getInteger("can_msg_continue")+"";
//
// } catch (Exception e) {
// log.info("解析数据有问题:{}", nextHtml);
// next = false;
// return null;
// }
//
//
// }
//
// /**
// * @Title: getNextIdAndAnalysis
// * @Description: TODO(解析下一页所需字段,及数据解析)
// * @param @param text
// * @param @param source
// * @param @return 设定文件
// * @return String 返回类型
// */
// public String getNextIdAndAnalysis(String text,Proxy proxy)
// {
// JSONObject wechatData = JSONObject.parseObject(text);
// JSONArray dataList = wechatData.getJSONArray("list");
// if(dataList.size()==0)
// {
// nextId = null;
// next = false;
// }else
// {
// for(int i = 0;i<dataList.size();i++)
// {
// JSONObject data = dataList.getJSONObject(i);
// //解析时间
// JSONObject dateJson = data.getJSONObject("comm_msg_info");
// long dateTime = dateJson.getLong("datetime");
// Date time = new Date(dateTime*1000);
// nextId = dateJson.getString("id");
// if(time.before(endDate))
// {
// next = false;
// nextId = null;
// }
// //解析文本数据
// if(null != data.getJSONObject("app_msg_ext_info"))
// {
// //解析头条数据
// JSONObject first = data.getJSONObject("app_msg_ext_info");
// String content_url = first.getString("content_url");
// String content = first.getString("digest");
// String title = first.getString("title");
// String img_url = first.getString("cover");
//
// WechatAricle wechatFirst = setWechat(content_url,title
// , time, img_url, content,"true",proxy);
// result.add(wechatFirst);
// //解析其余数据
// JSONArray otherJSON = first.getJSONArray("multi_app_msg_item_list");
// if(otherJSON != null)
// {
// for(int j = 0;j<otherJSON.size();j++)
// {
// JSONObject other = otherJSON.getJSONObject(j);
// String other_content_url = other.getString("content_url");
// String other_content = other.getString("digest");
// String other_title = other.getString("title");
// String other_img_url = other.getString("cover");
//
// WechatAricle wechatOther = setWechat(other_content_url,other_title
// , time, other_img_url, other_content,"false",proxy);
// result.add(wechatOther);
// }
// }else
// {
// log.info("只有一条数据");
// }
// }else
// {
// log.info("不存在相关文章......");
// }
// }
// }
// return nextId;
// }
//
//
//
// /**
// * 给实体类对象赋值
// * @Description:
// * @param @param url
// * @param @param title
// * @param @param source
// * @param @param datetime
// * @param @param key
// * @param @return
// * @return Wechat 返回类型
// */
// private WechatAricle setWechat(String url,String title,
// Date datetime,String imgUrl,String content,String isFirst,Proxy proxy)
// {
// WechatAricle wechat = new WechatAricle();
// wechat.setId(url);
// wechat.setTitle(title);
// wechat.setTime(datetime);
// wechat.setImgUrl(imgUrl);
// wechat.setIsFirst(isFirst);
// //采集文章
// String source = null;
// Map<String,String> sacMap = AriticleContent.getAriticleContent(url);
// if(sacMap!=null)
// {
// source = sacMap.get("source");
// content = sacMap.get("content");
// }
// //更新点赞阅读数
// if(updateLike)
// {
// url = url.replaceAll("amp;", "").replaceAll("amp;", "");
// try {
// Thread.sleep(2000);
// WechatReadLike wcrl = WeChatReadAndLike.getReadAndLike(url,key,proxy);
// wechat.setLikeNum(wcrl.getLike());
// wechat.setReadNum(wcrl.getRead());
// } catch (InterruptedException e) {
// wechat.setLikeNum(-1);
// wechat.setReadNum(-1);
// log.error("获取点赞阅读数出现为题,问题:{}", e.getMessage());
// }
// }
//
// wechat.setContent(content);
// wechat.setSource(source);
// return wechat;
// }
//
//
// public static void main(String[] args) {
// String url = "http:\\/\\/mp.weixin.qq.com\\/s?__biz=MjM5NTU0MzI0MA==&mid=2661648551&idx=1&sn=74397ab60184beb0abd4dd3f8c62f7d3&chksm=bda7c9008ad04016a5eac88c8dd18b6bc5797ae780c56e307e11781af257a68a52b7f87dfd8e&scene=27#wechat_redirect";
// System.out.println(url.replaceAll("\\", ""));
//
// }
//
//
//}
...@@ -12,13 +12,15 @@ package com.zhiwei.wechat.readAndLike; ...@@ -12,13 +12,15 @@ package com.zhiwei.wechat.readAndLike;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.wechat.util.HtmlDownUtil;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.wechat.util.Tools; import com.zhiwei.wechat.util.Tools;
/** /**
...@@ -48,7 +50,7 @@ public class AriticleContent{ ...@@ -48,7 +50,7 @@ public class AriticleContent{
String content = null; String content = null;
String source = null; String source = null;
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string(); String htmlBody = HtmlDownUtil.downloadHtml(url, headerMap, ProxyHolder.NAT_HEAVY_PROXY);
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
content = document.select("div.rich_media_content").text(); content = document.select("div.rich_media_content").text();
if(htmlBody.contains("var nickname = ")){ if(htmlBody.contains("var nickname = ")){
...@@ -80,8 +82,8 @@ public class AriticleContent{ ...@@ -80,8 +82,8 @@ public class AriticleContent{
headerMap.put("Referer", url); headerMap.put("Referer", url);
String comment_id = null; String comment_id = null;
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string(); String htmlBody = HtmlDownUtil.downloadHtml(url, headerMap, ProxyHolder.NAT_HEAVY_PROXY);
if(htmlBody!=null) if(StringUtils.isNotBlank(htmlBody))
{ {
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
String content = document.select("script").html(); String content = document.select("script").html();
......
///**
// * @Title: WeChatReadAndLike.java
// * @Package com.zhiwei.wechat.readAndLike
// * @Description: 抓取微信文章的阅读数和点赞数
// * @author Bewilder Z
// * @date 2015年8月6日 上午9:13:37
// * @version V1.0
// */
//
//package com.zhiwei.wechat.readAndLike;
//
//import java.net.Proxy;
//import java.net.URLEncoder;
//import java.util.HashMap;
//import java.util.Map;
//
//import org.jsoup.Jsoup;
//import org.jsoup.nodes.Document;
//import org.jsoup.nodes.Element;
//import org.jsoup.select.Elements;
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//import com.zhiwei.wechat.entity.WechatReadLike;
//import com.zhiwei.wechat.search.WechatAritcleSearch;
//import com.zhiwei.wechat.util.Tools;
//
///**
// * @ClassName: WeChatReadAndLike
// * @Description: 利用 Windows 客户端进行点赞数、阅读数抓取
// * @author Abner Liu
// * @date 2015年8月6日 上午9:13:37
// */
//public class WeChatReadAndLike {
//
//
// private static Logger logger = LoggerFactory.getLogger(WeChatReadAndLike.class);
//
// /**
// *
// * @Title: getReadAndLike
// * @Description: 利用windows客戶端進行点赞阅读抓取
// * @param url
// * 微信文章链接
// * @return WeChatReadLike 微信文章实体类
// *
// */
// public static WechatReadLike getReadAndLike(String url,String key,Proxy proxy){
// WechatReadLike wLike = new WechatReadLike();
// try {
// String urlcookie = Tools.getWechatCookieUrl(url, key);
// // 请求头信息
// Map<String,String> headerMap = Tools.getWechatHeader();
// Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(urlcookie, proxy, headerMap);
//
// headerMap.put("Referer", urlcookie);
// headerMap.put("Cookie", cookieMap.get("cookie")+"");
// String appmsg_token = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
// System.out.println("appmsg_token==========="+appmsg_token);
// String urlLike = Tools.getWechatLikeUrl(urlcookie,appmsg_token);
// //设置post请求参数
// HashMap<String,Object> postMap = new HashMap<String,Object>();
// postMap.put("is_only_read", "1");
//
// //获取数据
// String htsString = HttpClientTemplateOK.post(urlLike, proxy, headerMap ,postMap);
// System.out.println(htsString);
// JSONObject jsonObject = JSONObject.parseObject(htsString);
// String like_num = jsonObject.getJSONObject("appmsgstat")
// .get("like_num").toString();
//
// String real_read_num = "";
// try {
// real_read_num = jsonObject.getJSONObject("appmsgstat")
// .get("real_read_num").toString();
// if(real_read_num.equals("0"))
// {
// real_read_num = jsonObject.getJSONObject("appmsgstat")
// .get("read_num").toString();
// }
// } catch (Exception e) {
// real_read_num = jsonObject.getJSONObject("appmsgstat")
// .get("read_num").toString();
// }
// wLike.setUrl(url);
// wLike.setRead(Integer.valueOf(real_read_num));
// wLike.setLike(Integer.valueOf(like_num));
// } catch (Exception e) {
// wLike.setUrl(url);
// wLike.setRead(-1);
// wLike.setLike(-1);
// }
// return wLike;
// }
//
//
//
// /**
// * @Title: getReadAndLike
// * @Description: TODO(通过搜狗微信获取阅读数)
// * @param @param word
// * @param @param time
// * @param @param link
// * @param @param wxId
// * @param @return 设定文件
// * @return WeChatReadLike 返回类型
// */
// public static WechatReadLike getReadAndLike(String word,
// String time,String link,String wxId){
//
// WechatReadLike wLike = new WechatReadLike();
//
// Map<String,String> headerMap = new HashMap<String,String>();
// headerMap.put("Upgrade-Insecure-Requests", "1");
// headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36");
// headerMap.put("Host","weixin.sogou.com");
//
// if(time.contains(" "))
// {
// time = time.split(" ")[0];
// }
//
// String openid = WechatAritcleSearch.getOpenId(wxId,null);
// logger.info("openid is {}", openid);
//
// try {
// String url = "http://weixin.sogou.com/weixin?query=" + URLEncoder.encode(word,"utf-8")
// + "&type=2&ie=utf8&page=1&interation=&tsn=5&ft="+time + "&et="+ time
// + "&wxid="+openid+"&usip="+wxId+"&from=tool";
//
// logger.info("url is {}",url);
//
// String htmlBody = HttpClientTemplateOK.get(url, null, headerMap);
// if(htmlBody!=null)
// {
// try {
// // 解析数据
// Document document = Jsoup.parse(htmlBody);
// Elements elements = document.select("div.news-box")
// .select("ul.news-list").select("li");
// for (Element element : elements)
// {
// try {
// String url_link = element.select("div.txt-box").select("h3 >a").attr("href");
// int readNum = 0;
// try {
// readNum = Integer.valueOf(element.select("div.txt-box")
// .select("div.s-p").select("span.s1").text().trim());
// logger.info("readNum is {}", readNum);
// } catch (Exception e) {
// readNum = 0;
// }
// if(url_link.contains("&chksm="))
// {
// url_link = url_link.split("&chksm=")[0] + "&3rd" + url_link.split("&3rd")[1];
// }
//
// if(link.equals(url_link))
// {
// wLike.setUrl(link);
// wLike.setRead(readNum);
// break;
// }
// } catch (Exception e) {
// continue;
// }
// }
// } catch (Exception e) {
// wLike.setUrl(link);
// wLike.setRead(0);
// return null;
// }
// }
// } catch (Exception e) {
// e.printStackTrace();
// wLike.setUrl(link);
// wLike.setRead(0);
// return null;
// }
// return wLike;
// }
//
//}
...@@ -5,22 +5,19 @@ import java.io.UnsupportedEncodingException; ...@@ -5,22 +5,19 @@ import java.io.UnsupportedEncodingException;
import java.net.Proxy; import java.net.Proxy;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.util.*;; import java.util.*;;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.wechat.util.HtmlDownUtil;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.httpclient.HeaderTool; import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.wechat.entity.WechatAricle; import com.zhiwei.wechat.entity.WechatAricle;
import org.seimicrawler.xpath.JXDocument;
import org.seimicrawler.xpath.JXNode;
/** /**
* @ClassName: WechatAritcleSearch * @ClassName: WechatAritcleSearch
...@@ -31,7 +28,6 @@ import com.zhiwei.wechat.entity.WechatAricle; ...@@ -31,7 +28,6 @@ import com.zhiwei.wechat.entity.WechatAricle;
public class WechatAritcleSearch { public class WechatAritcleSearch {
private static Logger logger = LogManager.getLogger(WechatAritcleSearch.class); private static Logger logger = LogManager.getLogger(WechatAritcleSearch.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/** /**
* 根据关键词在搜狗微信搜索微信文章,不包含全文 * 根据关键词在搜狗微信搜索微信文章,不包含全文
...@@ -39,49 +35,33 @@ public class WechatAritcleSearch { ...@@ -39,49 +35,33 @@ public class WechatAritcleSearch {
* @param * @param
* word 关键词 * word 关键词
* @param * @param
* tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内); * proxy 代理
* 5(某一时间段内与startTime和endTime配合使用)
* @param
* startTime 开始时间 格式为yyyy-MM-dd
* @param
* endTime 结束时间 格式为yyyy-MM-dd
* @param
* cookie 用户登录后的cookie(不登录最多10页)
* @param * @param
* pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null * pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null
* @throws * @throws
* Exception * Exception
* @return List<Wechat> 返回类型 * @return List<Wechat> 返回类型
*/ */
public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String cookie,String startTime, String endTime, public static List<WechatAricle> wechatKeywordSearch(String word, Proxy proxy, Integer pages) throws Exception{
Proxy proxy, Integer pages) throws Exception{
List<WechatAricle> result = new ArrayList<>(); List<WechatAricle> result = new ArrayList<>();
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com"); headerMap.put("Host", "weixin.sogou.com");
if(StringUtils.isNotBlank(cookie)) { headerMap.put("cookie", "com_sohu_websearch_ITEM_PER_PAGE=100;");
headerMap.put("cookie", cookie);
}
boolean f = true; boolean f = true;
int page = 1; int page = 1;
while (f) { while (f) {
String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query="+URLEncoder.encode(word, "UTF-8")+"&ie=utf8&_sug_=n&_sug_type_=&page="+ page; String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query="+URLEncoder.encode(word, "UTF-8")+"&ie=utf8&_sug_=n&_sug_type_=&page="+ page;
// String searchUrl = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8")
// + "&ie=utf8&_sug_=n&_sug_type_=&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis()
// + "&tsn=" + tsn + "&page=" + page;
// if (tsn == 5) {
// searchUrl = searchUrl + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool";
// }
headerMap.put("Referer", searchUrl); headerMap.put("Referer", searchUrl);
// 获取数据 // 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy, false).body().string(); try{
String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
// 解析数据 // 解析数据
if (StringUtils.isNotBlank(htmlBody)) { if (StringUtils.isNotBlank(htmlBody)) {
Document document = Jsoup.parse(htmlBody); JXDocument jxDocument = JXDocument.create(htmlBody);
result.addAll(analysis(document)); result.addAll(analysis(jxDocument));
// 解析最大可寻页码 // 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text(); String pageNext = jxDocument.selNOne("//a[@id='sogou_next']").asElement().text();
if (pageNext.contains("下一页")) { if (pageNext.contains("下一页")) {
page++; page++;
} else { } else {
...@@ -90,10 +70,13 @@ public class WechatAritcleSearch { ...@@ -90,10 +70,13 @@ public class WechatAritcleSearch {
} else { } else {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody); logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
} }
// ZhiWeiTools.sleep(100);
if(pages!=null && pages==page) { if(pages!=null && pages==page) {
break; break;
} }
}catch (IOException e){
logger.error("根据关键词获取微信文章失败,错误为: {}", e);
}
} }
return result; return result;
} }
...@@ -105,7 +88,7 @@ public class WechatAritcleSearch { ...@@ -105,7 +88,7 @@ public class WechatAritcleSearch {
/** /**
* *
* @Title: wechatKeywordSearch * @Title: wechatKeywordSearch
* @Description: TODO(根据关键词在搜狗微信搜索微信文章,包含全文) * @Description: 根据关键词在搜狗微信搜索微信文章,包含全文
* @param @param * @param @param
* word 关键词 * word 关键词
* @param @param * @param @param
...@@ -115,8 +98,6 @@ public class WechatAritcleSearch { ...@@ -115,8 +98,6 @@ public class WechatAritcleSearch {
* startTime 开始时间 格式为yyyy-MM-dd * startTime 开始时间 格式为yyyy-MM-dd
* @param @param * @param @param
* endTime 结束时间 格式为yyyy-MM-dd * endTime 结束时间 格式为yyyy-MM-dd
* @param @param
* cookie 用户登录后的cookie(不登录最多10页)
* @param @return * @param @return
* @param @throws * @param @throws
* ZhiWeiException * ZhiWeiException
...@@ -124,30 +105,25 @@ public class WechatAritcleSearch { ...@@ -124,30 +105,25 @@ public class WechatAritcleSearch {
* UnsupportedEncodingException 设定文件 * UnsupportedEncodingException 设定文件
* @return List<Wechat> 返回类型 * @return List<Wechat> 返回类型
*/ */
public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String startTime, String endTime, public static List<WechatAricle> wechatKeywordSearch(String word,
Proxy proxy,ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException { Proxy proxy, ProxyHolder proxyHolder) throws Exception {
List<WechatAricle> result = new ArrayList<WechatAricle>(); List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com"); headerMap.put("Host", "weixin.sogou.com");
headerMap.put("cookie", "com_sohu_websearch_ITEM_PER_PAGE=100;");
boolean f = true; boolean f = true;
int page = 1; int page = 1;
while (f) { while (f) {
String searchUrl = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8") String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query="+URLEncoder.encode(word, "UTF-8")+"&ie=utf8&_sug_=n&_sug_type_=&page="+ page;
+ "&ie=utf8&_sug_=n&_sug_type_=" + "&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis()
+ "&tsn=" + tsn + "&page=" + page;
if (tsn == 5) {
searchUrl = searchUrl + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool";
}
headerMap.put("Referer", searchUrl); headerMap.put("Referer", searchUrl);
// 获取数据 // 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy, false).body().string(); String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
// 解析数据 // 解析数据
if (StringUtils.isNotBlank(htmlBody)) { if (StringUtils.isNotBlank(htmlBody)) {
Document document = Jsoup.parse(htmlBody); JXDocument jxDocument = JXDocument.create(htmlBody);
result.addAll(analysis(document)); result.addAll(analysis(jxDocument));
// 解析最大可寻页码 // 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text(); String pageNext = jxDocument.selNOne("//a[@id='sogou_next']").asElement().text();
if (pageNext.contains("下一页")) { if (pageNext.contains("下一页")) {
page++; page++;
} else { } else {
...@@ -170,7 +146,7 @@ public class WechatAritcleSearch { ...@@ -170,7 +146,7 @@ public class WechatAritcleSearch {
*/ */
private static WechatAricle getWechatAricleInfo(String url, ProxyHolder proxy,WechatAricle wechatAricle){ private static WechatAricle getWechatAricleInfo(String url, ProxyHolder proxy,WechatAricle wechatAricle){
try { try {
String contentHtml = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy.getProxy()).body().string(); String contentHtml = HtmlDownUtil.downloadHtml(url, HeaderTool.getCommonHead(), proxy.getProxy());
String content = null; String content = null;
String time = null; String time = null;
String source = null; String source = null;
...@@ -178,20 +154,36 @@ public class WechatAritcleSearch { ...@@ -178,20 +154,36 @@ public class WechatAritcleSearch {
String title = null; String title = null;
String user_name = null; String user_name = null;
String wxId = null; String wxId = null;
List<String> imgUrls = null;
String rootSource = null;
if(contentHtml!=null){ if(contentHtml!=null){
Document document = Jsoup.parse(contentHtml); JXDocument jxDocument = JXDocument.create(contentHtml);
title = document.select("title").text(); title = jxDocument.selNOne("//h2[@id='activity-name']").asElement().text();
wxId = document.select("p.profile_meta").get(0).select("span.profile_meta_value").text(); wxId = jxDocument.selNOne("//p[@class='profile_meta'][1]/span[@class='profile_meta_value']").asElement().text();
if(contentHtml.contains("js_article")){ if(contentHtml.contains("js_content")){
content = document.select("div#js_article").text(); content = jxDocument.selNOne("//div[@id='js_content']").asElement().text();
}else if(contentHtml.contains("js_share_content")){ }else if(contentHtml.contains("js_share_content")){
content = document.select("div#js_share_content").text(); content = jxDocument.selNOne("//div[@id='js_share_content']").asElement().text();
} }
if(contentHtml.contains("content_tpl")){ if(contentHtml.contains("content_tpl")){
String text = document.select("script#content_tpl").html(); String text = jxDocument.selNOne("//script[@id='content_tpl']").asElement().text();
content = Jsoup.parse(text).text(); content = Jsoup.parse(text).text();
} }
//解析文章图片地址
if(Objects.nonNull(jxDocument.selN("//div[@id='js_content']//img"))){
imgUrls = new ArrayList<>();
List<JXNode> imgNodeList = jxDocument.selN("//div[@id='js_content']//img");
for(JXNode imgNode : imgNodeList){
String imgUrl = imgNode.selOne("//img").asElement().attr("href");
imgUrls.add(imgUrl);
}
}
//解析来源
if(Objects.nonNull(jxDocument.selNOne("//span[@id='copyright_logo']"))){
rootSource = jxDocument.selNOne("//span[@id='profileBt']/a[@id='js_name']").asElement().text();
}
if(contentHtml.contains("d.nick_name = ")){ if(contentHtml.contains("d.nick_name = ")){
time = contentHtml.split("d.ct = \"")[1].split("\";")[0]; time = contentHtml.split("d.ct = \"")[1].split("\";")[0];
...@@ -211,6 +203,9 @@ public class WechatAritcleSearch { ...@@ -211,6 +203,9 @@ public class WechatAritcleSearch {
wechatAricle.setTime(new Date(Long.valueOf(time)*1000)); wechatAricle.setTime(new Date(Long.valueOf(time)*1000));
wechatAricle.setSource(source); wechatAricle.setSource(source);
} }
wechatAricle.setImgUrls(imgUrls);
wechatAricle.setRootSource(rootSource);
wechatAricle.setBiz(biz); wechatAricle.setBiz(biz);
wechatAricle.setContent(content); wechatAricle.setContent(content);
wechatAricle.setWxId(wxId); wechatAricle.setWxId(wxId);
...@@ -241,7 +236,6 @@ public class WechatAritcleSearch { ...@@ -241,7 +236,6 @@ public class WechatAritcleSearch {
List<WechatAricle> result = new ArrayList<WechatAricle>(); List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com"); headerMap.put("Host", "weixin.sogou.com");
if(idOrName==null || idOrName.equals("")){ if(idOrName==null || idOrName.equals("")){
throw new IllegalArgumentException("要检索的昵称或id不能为空"); throw new IllegalArgumentException("要检索的昵称或id不能为空");
} }
...@@ -259,12 +253,12 @@ public class WechatAritcleSearch { ...@@ -259,12 +253,12 @@ public class WechatAritcleSearch {
headerMap.put("Referer", searchUrl); headerMap.put("Referer", searchUrl);
// 获取数据 // 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxyHolder, true).body().string(); String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxyHolder);
if (StringUtils.isNotBlank(htmlBody)) { if (StringUtils.isNotBlank(htmlBody)) {
Document document = Jsoup.parse(htmlBody); JXDocument jxDocument = JXDocument.create(htmlBody);
result.addAll(analysis(document)); result.addAll(analysis(jxDocument));
// 解析最大可寻页码 // 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text(); String pageNext = jxDocument.selNOne("//a[@id='sogou_next']").asElement().text();
if (pageNext.contains("下一页")) { if (pageNext.contains("下一页")) {
page++; page++;
} else { } else {
...@@ -280,15 +274,15 @@ public class WechatAritcleSearch { ...@@ -280,15 +274,15 @@ public class WechatAritcleSearch {
/** /**
* 解析数据 * 解析数据
* @param document * @param jxDocument
* @return * @return
*/ */
private static List<WechatAricle> analysis(Document document){ private static List<WechatAricle> analysis(JXDocument jxDocument){
List<WechatAricle> result = new ArrayList<WechatAricle>(); List<WechatAricle> result = new ArrayList<WechatAricle>();
// 解析数据 // 解析数据
try { try {
// 解析数据 // 解析数据
Elements elements = document.select("div.news-box").select("ul.news-list").select("li"); List<JXNode> jxNodeList = jxDocument.selN("//div[@class='news-box']/ul[@class='news-list']/li");
String title = null; String title = null;
String link = null; String link = null;
String content = null; String content = null;
...@@ -297,34 +291,31 @@ public class WechatAritcleSearch { ...@@ -297,34 +291,31 @@ public class WechatAritcleSearch {
String putDate = null; String putDate = null;
Date date = null; Date date = null;
WechatAricle wechat = null; WechatAricle wechat = null;
for (Element element : elements) { if(Objects.nonNull(jxNodeList) && !jxNodeList.isEmpty()){
for (JXNode jxNode : jxNodeList) {
try { try {
title = element.select("div.txt-box").select("h3").text(); title = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().text();
link = element.select("div.txt-box").select("h3 >a").attr("href"); link = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().attr("href");
if(!link.contains("weixin.sogou.com")){ if(!link.contains("weixin.sogou.com")){
link = "https://weixin.sogou.com" + link; link = "https://weixin.sogou.com" + link;
} }
if (element.select("p.txt-info").isEmpty()) { if (Objects.nonNull(jxNode.selOne("//div[@class='txt-box']/p"))) {
content = element.select("p.txt-info").text(); content = jxNode.selOne("//div[@class='txt-box']/p").asElement().text();
} else {
content = element.select("div.txt-box").select("p.txt-info").text();
} }
// System.out.println("content======================"+content); source = jxNode.selOne("//div[@class='s-p']/a").asElement().text();
source = element.select("div.txt-box").select("div.s-p").select("a").text(); openid = jxNode.selOne("//div[@class='s-p']/a").asElement().attr("i");
openid = element.select("div.txt-box").select("div.s-p").select("a").attr("i"); putDate = jxNode.selOne("//div[@class='s-p']").asElement().attr("t");
putDate = element.select("div.txt-box").select("div.s-p").attr("t");
date = new Date(Long.valueOf(putDate) * 1000); date = new Date(Long.valueOf(putDate) * 1000);
int readNum = 0; int readNum = 0;
try { try {
readNum = Integer.valueOf(element.select("div.txt-box").select("div.s-p") readNum = Integer.valueOf(jxNode.selOne("//div[@class='s-p']/span[@class='s1']").asElement().text().trim());
.select("span.s1").text().trim());
} catch (Exception e) { } catch (Exception e) {
readNum = 0; readNum = 0;
} }
title = ZhiWeiTools.SBC2DBC(title); title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content); content = ZhiWeiTools.SBC2DBC(content);
if(StringUtils.isNotBlank(title)){ if(StringUtils.isNotBlank(title)){
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow"); wechat = new WechatAricle(link, title, source, content, date, null, null,readNum, 0, openid, "unknow");
result.add(wechat); result.add(wechat);
} }
} catch (Exception e) { } catch (Exception e) {
...@@ -332,6 +323,8 @@ public class WechatAritcleSearch { ...@@ -332,6 +323,8 @@ public class WechatAritcleSearch {
continue; continue;
} }
} }
}
// logger.info("数据总页数为:{}", page); // logger.info("数据总页数为:{}", page);
} catch (Exception e) { } catch (Exception e) {
logger.debug("获取数据出现问题:{}", e.getMessage()); logger.debug("获取数据出现问题:{}", e.getMessage());
...@@ -356,10 +349,10 @@ public class WechatAritcleSearch { ...@@ -356,10 +349,10 @@ public class WechatAritcleSearch {
for(int i = 1;i < 3;i++) { for(int i = 1;i < 3;i++) {
try { try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), proxyHolder).body().string(); htmlBody = HtmlDownUtil.downloadHtml(url,null, proxyHolder);
if (htmlBody != null) { if (StringUtils.isNotBlank(htmlBody)) {
JSONObject json = JSONObject.parseObject(htmlBody); JSONObject jsonObject = JSONObject.parseObject(htmlBody);
openId = json.getString("openid"); openId = jsonObject.getString("openid");
return openId; return openId;
} }
} catch (Exception e) { } catch (Exception e) {
......
...@@ -5,16 +5,13 @@ import java.net.URLEncoder; ...@@ -5,16 +5,13 @@ import java.net.URLEncoder;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import com.zhiwei.wechat.util.HtmlDownUtil;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
public class WechatCount { public class WechatCount {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
public static int getWechatCountByWord(String word, String cookie, public static int getWechatCountByWord(String word, String cookie,
String startTime, String endTime, Proxy proxy) { String startTime, String endTime, Proxy proxy) {
Map<String, String> headerMap = getWechatCount(cookie); Map<String, String> headerMap = getWechatCount(cookie);
...@@ -24,9 +21,7 @@ public class WechatCount { ...@@ -24,9 +21,7 @@ public class WechatCount {
+ startTime.split(" ")[0] + "&et=" + endTime.split(" ")[0] + startTime.split(" ")[0] + "&et=" + endTime.split(" ")[0]
+ "&interation=&wxid=&usip="; + "&interation=&wxid=&usip=";
headerMap.put("Referer", url); headerMap.put("Referer", url);
String result = httpBoot String result = HtmlDownUtil.downloadHtml(url, headerMap, proxy);
.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)
.body().string();
String s = ""; String s = "";
int n = -1; int n = -1;
if (result.contains("找到约") && result.contains("条结果")) { if (result.contains("找到约") && result.contains("条结果")) {
......
package com.zhiwei.wechat.search;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
/**
 * WeChat index search: queries the {@code getwxindex} endpoint on
 * search.weixin.qq.com and prints the raw response.
 *
 * @author Bewilder Z
 * @date 2017-03-24 14:52:01
 */
public class WechatIndex {

    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /**
     * Demo entry point: builds a time map for a fixed keyword and date range
     * and requests the index once per map entry.
     *
     * @param args unused
     * @throws Exception if time parsing or any request fails
     */
    public static void main(String[] args) throws Exception {
        String keyword = "百度";
        String rangeStart = "2016-01-01 00:00:00";
        String rangeEnd = "2017-03-24 00:00:00";
        // NOTE(review): presumably splits the range into 7-day windows - confirm against TimeParse.getTimeMap.
        Map<String, String> windows = TimeParse.getTimeMap(rangeStart, rangeEnd, "dd", 7);
        for (Entry<String, String> window : windows.entrySet()) {
            Date windowStart = TimeParse.stringFormartDate(window.getKey());
            Date windowEnd = TimeParse.stringFormartDate(window.getValue());
            // The endpoint is called with epoch seconds, hence the division by 1000.
            long startSeconds = windowStart.getTime() / 1000L;
            long endSeconds = windowEnd.getTime() / 1000L;
            getWechatIndex(keyword, startSeconds, endSeconds);
        }
    }

    /**
     * Requests the WeChat index for one keyword and time window, prints the
     * request URL and the response body to standard output, then sleeps for
     * three seconds.
     *
     * @param word      keyword to look up; URL-encoded as utf-8 before use
     * @param startTime value sent as the {@code start_time} query parameter
     * @param endTime   value sent as the {@code end_time} query parameter
     * @throws Exception if encoding, the HTTP call, or the sleep fails
     */
    public static void getWechatIndex(String word, long startTime, long endTime) throws Exception {
        StringBuilder target = new StringBuilder("https://search.weixin.qq.com/cgi-bin/searchweb/getwxindex?query=");
        target.append(URLCodeUtil.getURLEncode(word, "utf-8"));
        target.append("&start_time=").append(startTime);
        target.append("&end_time=").append(endTime);
        // The trailing "_" parameter carries the current time in milliseconds.
        target.append("&_=").append(System.currentTimeMillis());
        String url = target.toString();
        System.out.println(url);

        // Headers imitate the WeChat in-app browser on iOS; the cookie is a fixed captured session.
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("Host", "search.weixin.qq.com");
        headers.put("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like Mac OS X) AppleWebKit/602.4.6 (KHTML, like Gecko) Mobile/14D27 MicroMessenger/6.5.5 NetType/4G Language/zh_CN");
        headers.put("Referer", url);
        headers.put("X-Requested-With", "XMLHttpRequest");
        headers.put("Accept", "application/json, text/javascript, */*; q=0.01");
        headers.put("Cookie", "mmsearch_user_key=AStrb5tD4ruSixIDu1cVpTA=; pass_ticket=bbP7ZT5xEUrYe+oOa6ACUw+mgR05TAGGA1P9xnC7fIyaaOnwkWyNQK8aYtva+Gxj; pgv_pvi=4102772736; pgv_si=s1607859200; pgv_pvid=153672700");

        String responseBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headers)).body().string();
        System.out.println(responseBody);

        // Pause between consecutive requests.
        Thread.sleep(3000);
    }
}
package com.zhiwei.wechat.search; package com.zhiwei.wechat.search;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.core.utils.RequestUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
......
package com.zhiwei.wechat.util;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.wechat.search.WechatAritcleSearch;
import okhttp3.Response;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.IOException;
import java.net.Proxy;
import java.util.HashMap;
import java.util.Map;
/**
 * Downloads page bodies over HTTP.
 *
 * <p>Every request carries the {@code cookie} header
 * {@code com_sohu_websearch_ITEM_PER_PAGE=100;}, replacing any value the
 * caller supplied under that exact key.
 *
 * @ProjectName: wechat
 * @ClassName: HtmlDownUtil
 * @Author: admin
 * @Date: 2020/8/3 8:57
 * @Version: 1.0
 */
public class HtmlDownUtil {

    private static final Logger logger = LogManager.getLogger(HtmlDownUtil.class);

    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /** Header key under which the fixed cookie is sent. */
    private static final String COOKIE_HEADER = "cookie";

    /** Cookie value attached to every request. */
    private static final String ITEM_PER_PAGE_COOKIE = "com_sohu_websearch_ITEM_PER_PAGE=100;";

    /**
     * Fetches the body of {@code url} through the given proxy.
     *
     * @param url     address to request with GET
     * @param headMap request headers; may be {@code null}, and is never modified
     * @param proxy   proxy passed through to the HTTP client
     * @return the response body as a string
     * @throws IOException if the request or reading the body fails
     */
    public static String downloadHtml(String url, Map<String, String> headMap, Proxy proxy) throws IOException {
        Map<String, String> headers = withItemPerPageCookie(headMap);
        // try-with-resources closes the response on every exit path.
        // NOTE(review): the meaning of the trailing `false` flag is defined by HttpBoot.syncCall - confirm there.
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headers), proxy, false)) {
            return response.body().string();
        }
    }

    /**
     * Fetches the body of {@code url} using a proxy holder.
     *
     * @param url         address to request with GET
     * @param headMap     request headers; may be {@code null}, and is never modified
     * @param proxyHolder proxy holder passed through to the HTTP client
     * @return the response body as a string
     * @throws IOException if the request or reading the body fails
     */
    public static String downloadHtml(String url, Map<String, String> headMap, ProxyHolder proxyHolder) throws IOException {
        Map<String, String> headers = withItemPerPageCookie(headMap);
        // try-with-resources closes the response on every exit path.
        // NOTE(review): the meaning of the trailing `false` flag is defined by HttpBoot.syncCall - confirm there.
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headers), proxyHolder, false)) {
            return response.body().string();
        }
    }

    /**
     * Builds the header map actually sent: a copy of the caller's headers
     * (empty when {@code null}) plus the fixed cookie. Copying keeps the
     * caller's map untouched and makes a {@code null} argument safe.
     */
    private static Map<String, String> withItemPerPageCookie(Map<String, String> headMap) {
        Map<String, String> headers = new HashMap<String, String>();
        if (headMap != null) {
            headers.putAll(headMap);
        }
        headers.put(COOKIE_HEADER, ITEM_PER_PAGE_COOKIE);
        return headers;
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment