Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
W
wechat
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
wechat
Commits
2c702467
Commit
2c702467
authored
Apr 19, 2019
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
升级采集核心包
parent
a9af9087
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
911 additions
and
906 deletions
+911
-906
pom.xml
+3
-3
src/main/java/com/zhiwei/wechat/account/WechatAccountFans.java
+2
-1
src/main/java/com/zhiwei/wechat/account/WechatAccountInfo.java
+2
-1
src/main/java/com/zhiwei/wechat/comment/WechatCommentList.java
+170
-167
src/main/java/com/zhiwei/wechat/history/WechatDataFromHistory.java
+482
-482
src/main/java/com/zhiwei/wechat/readAndLike/AriticleContent.java
+6
-5
src/main/java/com/zhiwei/wechat/readAndLike/WeChatReadAndLike.java
+185
-185
src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
+1
-1
src/main/java/com/zhiwei/wechat/search/WechatCount.java
+1
-1
src/main/java/com/zhiwei/wechat/search/WechatIndex.java
+4
-3
src/test/java/com/zhiwei/wechat/example/WechatDataFromHistoryExample.java
+53
-53
src/test/java/com/zhiwei/wechat/example/WechatSearchExample.java
+2
-4
No files found.
pom.xml
View file @
2c702467
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
wechat
</artifactId>
<version>
1.1.
4
-SNAPSHOT
</version>
<version>
1.1.
5
-SNAPSHOT
</version>
<description>
知微微信采集程序,包含
1.微信历史文章采集
...
...
@@ -85,13 +85,13 @@
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.1.
2
-SNAPSHOT
</version>
<version>
0.1.
3
-SNAPSHOT
</version>
<scope>
provided
</scope>
</dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.3.
0
-RELEASE
</version>
<version>
0.3.
6
-RELEASE
</version>
<scope>
provided
</scope>
</dependency>
</dependencies>
...
...
src/main/java/com/zhiwei/wechat/account/WechatAccountFans.java
View file @
2c702467
...
...
@@ -18,7 +18,8 @@ public class WechatAccountFans {
// private static Logger logger = LoggerFactory.getLogger(WechatAccountFans.class);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
Map
<
String
,
String
>
headerMap
;
public
WechatAccountFans
()
...
...
src/main/java/com/zhiwei/wechat/account/WechatAccountInfo.java
View file @
2c702467
...
...
@@ -20,7 +20,8 @@ import com.zhiwei.wechat.entity.WechatAccount;
public
class
WechatAccountInfo
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WechatAccountInfo
.
class
);
/***
...
...
src/main/java/com/zhiwei/wechat/comment/WechatCommentList.java
View file @
2c702467
/**
* 获取微信文章评论
* @Title: WechatComment.java
* @Package com.zhiwei.wechat.comment
* @Description:获取微信文章评论
* @author hero
* @date 2016年6月25日 上午8:17:37
* @version V1.0
*/
/**
*
*/
package
com
.
zhiwei
.
wechat
.
comment
;
import
java.io.IOException
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
import
com.zhiwei.wechat.entity.WechatComment
;
import
com.zhiwei.wechat.readAndLike.AriticleContent
;
import
com.zhiwei.wechat.util.Tools
;
/**
 * Retrieves comment data for WeChat articles through the
 * {@code mp.weixin.qq.com/mp/appmsg_comment?action=getcomment} endpoint.
 *
 * <p>Both entry points are static. For the failure modes handled here they report
 * problems through the class logger and signal failure via their return value
 * ({@code null} or {@code -1}) instead of throwing.
 *
 * @author hero
 */
public class WechatCommentList {

    /** Shared instance used to convert the JSON comment array into entities. */
    private static WechatComment wc = new WechatComment();

    private static Logger logger = LoggerFactory.getLogger(WechatCommentList.class);

    /**
     * Fetches the comment list of an article.
     *
     * <p>The article page is requested first to obtain a cookie and the
     * {@code appmsg_token}; the comment endpoint is then queried and its
     * {@code elected_comment} array is handed to
     * {@link WechatComment#constructWechatComment}.
     *
     * @param url article URL; must contain the {@code __biz=} and {@code mid=} parameters
     * @param key authentication query fragment appended verbatim to the request URLs
     * @return the parsed comments, or {@code null} when the article has no comments
     *         (comment id {@code "0"}) or any step fails
     */
    public static List<WechatComment> getWechatCommentList(String url, String key) {
        if (url == null) {
            logger.info("Cannot fetch comments: article url is null");
            return null;
        }
        // Validate the URL before spending a network round trip on it.
        String biz = extractQueryValue(url, "__biz=");
        String appmsgid = extractQueryValue(url, "mid=");
        if (biz == null || appmsgid == null) {
            logger.info("Cannot fetch comments: url lacks __biz or mid parameter: {}", url);
            return null;
        }

        // The page request needs the key; add it unless the url already carries one.
        String urlcookie = url.contains("key") ? url : Tools.getWechatCookieUrl(url, key);

        Map<String, String> headerMap = Tools.getWechatHeader();
        try {
            Map<String, String> cookieMap = HttpClientTemplateOK.getCookie(urlcookie, null, headerMap);
            if (cookieMap == null) {
                logger.info("Cannot fetch comments: no response for {}", urlcookie);
                return null;
            }
            headerMap.put("Referer", url);
            String cookie = cookieMap.get("cookie");
            // Only cookies longer than 50 characters are forwarded (threshold kept from the original code).
            if (cookie != null && cookie.length() > 50) {
                headerMap.put("Cookie", cookie);
            }
            String appmsgToken = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
            String commentId = AriticleContent.getCommentId(url, key);
            if (commentId == null || appmsgToken == null) {
                return null;
            }
            if ("0".equals(commentId)) {
                logger.info("此条微信文章没有评论");
                return null;
            }

            String commentUrl = "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" + biz
                    + "&appmsgid=" + appmsgid + "&idx=1&comment_id=" + commentId + "&offset=0&limit=100" + key
                    + "&appmsg_token=" + appmsgToken;
            logger.debug("comment url: {}", commentUrl);

            try {
                String htmlBody = HttpClientTemplateOK.get(commentUrl, null, headerMap);
                if (htmlBody == null) {
                    return null;
                }
                JSONObject json = JSON.parseObject(htmlBody);
                return wc.constructWechatComment(json.getJSONArray("elected_comment"), url);
            } catch (Exception e) {
                // Pass the exception itself so the original stack trace is preserved.
                logger.info("解析微信文章评论列表时出现问题:", e);
                return null;
            }
        } catch (IOException e1) {
            logger.info("I/O failure while preparing the comment request for {}", url, e1);
            return null;
        } catch (Exception e1) {
            logger.info("Unexpected failure while preparing the comment request for {}", url, e1);
            return null;
        }
    }

    /**
     * Fetches the number of comments of an article.
     *
     * @param url article URL; must contain the {@code __biz=} and {@code mid=} parameters
     * @param key authentication query fragment appended verbatim to the request URLs
     * @return the {@code elected_comment_total_cnt} value of the response, {@code 0} when
     *         the comment id is {@code "0"}, or {@code -1} on any failure
     */
    public static int getWechatCommentCount(String url, String key) {
        if (url == null) {
            logger.info("Cannot fetch comment count: article url is null");
            return -1;
        }
        logger.debug("article url: {}", url);

        // Drop the "#rd" / "#wechat_redirect" fragment and append the key in its place.
        String pageUrl = url;
        int fragmentAt = url.indexOf("#rd");
        if (fragmentAt < 0) {
            fragmentAt = url.indexOf("#wechat_redirect");
        }
        if (fragmentAt >= 0) {
            pageUrl = url.substring(0, fragmentAt) + key;
        }

        String biz = extractQueryValue(url, "__biz=");
        String appmsgid = extractQueryValue(url, "mid=");
        if (biz == null || appmsgid == null) {
            logger.info("Cannot fetch comment count: url lacks __biz or mid parameter: {}", url);
            return -1;
        }

        Map<String, String> headerMap = Tools.getWechatHeader();
        String commentId = AriticleContent.getCommentId(url, key);
        if (commentId == null) {
            logger.info("获取评论id失败");
            return -1;
        }
        if ("0".equals(commentId)) {
            logger.info("此条微信文章没有评论");
            return 0;
        }

        // NOTE(review): this endpoint is called over http while getWechatCommentList uses https
        // and an appmsg_token; kept as in the original, confirm whether that is intentional.
        String commentUrl = "http://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" + biz
                + "&appmsgid=" + appmsgid + "&idx=1&comment_id=" + commentId + "&offset=0&limit=100" + key;
        try {
            Map<String, String> cookieMap = HttpClientTemplateOK.getCookie(pageUrl, null, headerMap);
            String cookie = (cookieMap == null) ? null : cookieMap.get("cookie");
            if (cookie != null) {
                headerMap.put("Cookie", cookie);
            }
            String htmlBody = HttpClientTemplateOK.get(commentUrl, null, headerMap);
            logger.debug("comment response: {}", htmlBody);
            if (htmlBody != null) {
                JSONObject json = JSON.parseObject(htmlBody);
                return json.getIntValue("elected_comment_total_cnt");
            }
        } catch (Exception e) {
            // The original message had no placeholder, so the detail was never printed.
            logger.debug("更新微信文章评论数时出现问题,问题信息:{}", e.getMessage(), e);
            return -1;
        }
        return -1;
    }

    /**
     * Returns the text following the first occurrence of {@code marker} up to the next
     * {@code '&'} (or the end of the string), or {@code null} when the marker is absent.
     */
    private static String extractQueryValue(String url, String marker) {
        int start = url.indexOf(marker);
        if (start < 0) {
            return null;
        }
        start += marker.length();
        int end = url.indexOf('&', start);
        return (end < 0) ? url.substring(start) : url.substring(start, end);
    }
}
///**
// * 获取微信文章评论
// * @Title: WechatComment.java
// * @Package com.zhiwei.wechat.comment
// * @Description:获取微信文章评论
// * @author hero
// * @date 2016年6月25日 上午8:17:37
// * @version V1.0
// */ /**
// *
// */
//package com.zhiwei.wechat.comment;
//
//import java.io.IOException;
//import java.util.List;
//import java.util.Map;
//
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//import com.alibaba.fastjson.JSON;
//import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.crawler.core.HttpBoot;
//import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//import com.zhiwei.wechat.entity.WechatComment;
//import com.zhiwei.wechat.readAndLike.AriticleContent;
//import com.zhiwei.wechat.util.Tools;
//
///**
// * @Description:获取微信文章评论
// * @author hero
// * @date 2016年6月25日 上午8:17:37
// */
//public class WechatCommentList {
//
// private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
//
// private static WechatComment wc = new WechatComment();
//
// private static Logger logger = LoggerFactory.getLogger(WechatCommentList.class);
// /**
// * 根据文章url获取文章评论列表
// * @Description:
// * @param @param url
// * @param @return
// * @return List<WechatComment> 返回类型
// */
// public static List<WechatComment> getWechatCommentList(String url,String key)
// {
// List<WechatComment> wcList = null;
// /*处理url*/
// String urlcookie = url;
// if(!url.contains("key")){
// urlcookie = Tools.getWechatCookieUrl(url, key);
// }
// // 请求头信息
// Map<String,String> headerMap = Tools.getWechatHeader();
// Map<String, String> cookieMap;
// try {
// cookieMap = HttpClientTemplateOK.getCookie(urlcookie, null, headerMap);
// headerMap.put("Referer", url);
// if(cookieMap.get("cookie").length()>50){
// headerMap.put("Cookie", cookieMap.get("cookie")+"");
// }
// String appmsg_token = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
//
// String biz = url.split("__biz=")[1].split("&")[0];
// String appmsgid = url.split("mid=")[1].split("&")[0];
// String comment_id = AriticleContent.getCommentId(url,key);
// if(comment_id!=null && appmsg_token!=null)
// {
// String comment_url = "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz
// + "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key
// + "&appmsg_token=" + appmsg_token;
// /**解析相关数据*/
// System.out.println(comment_url);
// if("0".equals(comment_id))
// {
// logger.info("此条微信文章没有评论");
// }else
// {
// try {
// String htmlBody = HttpClientTemplateOK.get(comment_url, null, headerMap);
// if(htmlBody!=null)
// {
// JSONObject json = JSON.parseObject(htmlBody);
// wcList = wc.constructWechatComment(json.getJSONArray("elected_comment"),url);
// return wcList;
// }
// } catch (Exception e) {
// logger.info("解析微信文章评论列表时出现问题:", e.fillInStackTrace());
// return null;
// }
// }
// }
// } catch (IOException e1) {
// return null;
// } catch (Exception e1) {
// e1.printStackTrace();
// }
//
// return null;
// }
//
//
// /**
// * @Title: getWechatCommentCount
// * @Description: TODO(根据微信文章地址更新微信评论数)
// * @param @param url
// * @param @param key
// * @param @return 设定文件
// * @return int 返回类型
// */
// public static int getWechatCommentCount(String url,String key)
// {
// System.out.println(url);
// /*处理url*/
// String url_new = url;
// if(url.contains("#rd"))
// {
// url_new = url.split("#rd")[0] + key;
// }else if(url.contains("#wechat_redirect"))
// {
// url_new = url.split("#wechat_redirect")[0] + key;
// }
// String biz = url.split("__biz=")[1].split("&")[0];
// String appmsgid = url.split("mid=")[1].split("&")[0];
//
// /**获取网页头信息**/
// Map<String,String> headerMap = Tools.getWechatHeader();
// /*获取评论id*/
// String comment_id = AriticleContent.getCommentId(url,key);
// if(comment_id!=null)
// {
// String comment_url = "http://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz=" +biz
// + "&appmsgid=" + appmsgid +"&idx=1&comment_id="+comment_id+"&offset=0&limit=100"+key;
// /**解析相关数据*/
//
// if("0".equals(comment_id))
// {
// logger.info("此条微信文章没有评论");
// return 0;
// }else
// {
// try {
// Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url_new, null,headerMap);
// headerMap.put("Cookie", cookieMap.get("cookie"));
// String htmlBody = HttpClientTemplateOK.get(comment_url, null,headerMap);
// System.out.println(htmlBody);
// if(htmlBody!=null)
// {
// JSONObject json = JSON.parseObject(htmlBody);
// return json.getIntValue("elected_comment_total_cnt");
// }
// } catch (Exception e) {
// logger.debug("更新微信文章评论数时出现问题,问题信息:",e.getMessage());
// return -1;
// }
// }
// }else
// {
// logger.info("获取评论id失败");
// return -1;
// }
// return -1;
// }
//
//
//
//}
src/main/java/com/zhiwei/wechat/history/WechatDataFromHistory.java
View file @
2c702467
/**
* 抓取微信公号历史文章数据
* @Title: WechatDataFromHistory.java
* @Package com.zhiwei.wechat.history
* @Description:抓取微信公号历史文章数据
* @author hero
* @date 2016年5月20日 上午10:27:19
* @version V1.0
*/
/**
*
*/
package
com
.
zhiwei
.
wechat
.
history
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Map
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
import
com.zhiwei.tools.timeparse.TimeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.wechat.entity.WechatAricle
;
import
com.zhiwei.wechat.entity.WechatReadLike
;
import
com.zhiwei.wechat.readAndLike.AriticleContent
;
import
com.zhiwei.wechat.readAndLike.WeChatReadAndLike
;
import
com.zhiwei.wechat.util.Tools
;
/**
* @Description:抓取微信公号历史文章数据
* @author Bewilder Z
* @date 2016年5月20日 上午10:27:19
*/
public
class
WechatDataFromHistory
{
private
static
final
Logger
log
=
LogManager
.
getLogger
(
WechatDataFromHistory
.
class
);
private
boolean
updateLike
=
false
;
// Whether like/read counts are fetched for every article (checked in setWechat).
private
Date
endDate
=
null
;
// End time of the crawl; getNextIdAndAnalysis compares each article time against it.
private
List
<
WechatAricle
>
result
;
// All articles collected so far.
private
Map
<
String
,
String
>
headerMap
;
// Request headers sent with every HTTP call.
private
boolean
follow
=
false
;
// "Followed" flag (original comment: whether the account is followed); when true the paging loops are forced on.
private
String
nextId
;
// Id used to request the next page.
private
String
key
;
// Key fragment passed to WeChatReadAndLike when updating like/read counts.
private
boolean
next
=
true
;
// Whether another page should be fetched.
/**
*
* @Description:
* @param @param updateLike 是否更新点赞数和阅读数
* @param @param endDate 采集结束时间
* @return
*/
public
WechatDataFromHistory
(
boolean
updateLike
,
String
endDate
,
boolean
follow
)
{
this
.
updateLike
=
updateLike
;
result
=
new
ArrayList
<
WechatAricle
>();
headerMap
=
Tools
.
getWechatHeader
();
this
.
follow
=
follow
;
if
(
endDate
==
null
)
{
endDate
=
"2011-12-30"
;
}
this
.
endDate
=
TimeUtil
.
parseTime
(
endDate
,
"yyyy-MM-dd"
);
}
public
WechatDataFromHistory
(){}
/**
 * Checks whether a key is still usable by requesting the read/like counts of a
 * fixed reference article with it.
 *
 * @author hero
 * @param key   key fragment to validate
 * @param proxy proxy handed to the read/like request
 * @return {@code true} when the reported read count is greater than zero;
 *         {@code false} otherwise or when the request throws
 */
public static boolean validateKey(String key, Proxy proxy) {
    final String probeUrl = "http://mp.weixin.qq.com/s?__biz=MzIwNDk0NzEyOQ==&mid=2247484544&idx=2&sn=f64abc4b15badd77b70ca942bc5176d3&scene=0#wechat_redirect";
    boolean valid;
    try {
        WechatReadLike readLike = WeChatReadAndLike.getReadAndLike(probeUrl, key, proxy);
        valid = readLike.getRead() > 0;
    } catch (Exception e) {
        log.debug("验证微信key有效性时出现问题,问题为:{}", e.getMessage());
        valid = false;
    }
    return valid;
}
/**
 * Collects the history articles of an official account.
 *
 * <p>The first page is requested with the cookie obtained from {@code url}; further
 * pages are requested from the {@code getmsg} variant of the URL in steps of ten
 * until the {@code next} flag is cleared (by the parsers or by an error).
 *
 * @author hero
 * @param url   history ("home") URL; when like/read updating is enabled it must contain
 *              a {@code uin} parameter from which the key is cut
 * @param proxy proxy used for every request
 * @return the accumulated result list, or {@code null} when the first request throws
 */
public List<WechatAricle> getWechatDataFromHistory(String url, Proxy proxy) {
    log.info("url:::::::::{}", url);
    if (updateLike) {
        key = "&uin" + url.split("uin")[1].split("devicetype")[0];
    }

    String firstText = null;
    try {
        Map<String, String> cookieMap = HttpClientTemplateOK.getCookie(url, proxy, headerMap);
        // Without a cookie the first page is not requested at all.
        if (cookieMap.get("cookie") != null) {
            headerMap.put("Referer", url);
            headerMap.put("Cookie", cookieMap.get("cookie"));
            firstText = HttpClientTemplateOK.get(url, proxy, headerMap);
        }
    } catch (Exception e) {
        log.error("Failed to load the first history page: {}", url, e);
        return null;
    }

    if (firstText == null) {
        next = false;
        return result;
    }

    // Parses the first page into result and yields the token needed for paging.
    String appToken = getFirst(firstText, proxy);
    // NOTE(review): next starts out true, so follow == false does not prevent paging, while
    // follow == true re-enables paging even if the first page already cleared the flag.
    // Confirm the intended meaning of follow before changing this.
    if (follow) {
        next = true;
    }

    int page = 1;
    while (next) {
        String nextUrl = url.replace("home", "getmsg") + "&f=json&&offset=" + page * 10
                + "&count=10&scene=123&is_ok=1&appmsg_token=" + appToken;
        log.info("下一页地址:{}", nextUrl);
        try {
            headerMap.put("Referer", nextUrl);
            String nextJson = HttpClientTemplateOK.get(nextUrl, proxy, headerMap);
            // getNext parses the page and clears next on failure; the returned
            // can_msg_continue value is stored but does not end the loop here.
            nextId = getNext(nextJson, proxy);
            ZhiWeiTools.sleep(3000);
        } catch (Exception e) {
            log.error("Failed to load history page {}: {}", page, nextUrl, e);
            next = false;
        }
        page++;
    }
    return result;
}
/**
 * Collects the history articles of an official account using the older
 * {@code frommsgid}-based paging.
 *
 * @param url   history ("home") URL; when like/read updating is enabled it must contain
 *              a {@code uin} parameter from which the key is cut
 * @param proxy proxy used for the first request and for article parsing
 * @return the accumulated result list, or {@code null} when the first request throws
 * @deprecated superseded by {@link #getWechatDataFromHistory(String, Proxy)}
 */
@Deprecated
public List<WechatAricle> getWechatDataFromHistoryOld(String url, Proxy proxy) {
    log.info("url:::::::::{}", url);
    if (updateLike) {
        key = "&uin" + url.split("uin")[1].split("devicetype")[0];
    }

    String firstText;
    try {
        Map<String, String> cookieMap = HttpClientTemplateOK.getCookie(url, proxy, headerMap);
        headerMap.put("Referer", url);
        headerMap.put("Cookie", cookieMap.get("cookie"));
        firstText = HttpClientTemplateOK.get(url, proxy, headerMap);
    } catch (Exception e) {
        log.error("Failed to load the first history page: {}", url, e);
        return null;
    }
    if (firstText == null) {
        // Nothing to parse; previously this ended in a NullPointerException.
        return result;
    }

    // Parses the first page and yields the id from which the next page starts.
    nextId = getFirstOld(firstText, proxy);

    // Paging only happens for followed accounts and while a next id is known.
    boolean hasNext = follow;
    while (hasNext && nextId != null) {
        String nextUrl = url.replace("home", "getmsg") + "&f=json&frommsgid=" + nextId
                + "&count=10&scene=123&is_ok=1";
        log.info("下一页地址:{}", nextUrl);
        try {
            headerMap.put("Referer", nextUrl);
            // NOTE(review): follow-up pages are requested without the proxy (null), unlike
            // the first page; kept as in the original, confirm whether that is intended.
            String nextJson = HttpClientTemplateOK.get(nextUrl, null, headerMap);
            // getNext returns the can_msg_continue flag, not a message id. The id of the
            // last parsed message is written to the nextId field by getNextIdAndAnalysis,
            // so the return value must not overwrite it; null only signals a failed page.
            String canContinue = getNext(nextJson, proxy);
            if (canContinue == null) {
                nextId = null;
            }
            log.debug("nextId-============={}", nextId);
            ZhiWeiTools.sleep(3000);
        } catch (Exception e) {
            log.error("Failed to load history page: {}", nextUrl, e);
            hasNext = false;
        }
    }
    return result;
}
/**
* @Title: getFirst
* @Description: TODO(解析第一页数据)
* @param @param fristText
* @param @param source
* @param @return 设定文件
* @return String 返回类型
*/
@Deprecated
public
String
getFirstOld
(
String
fristText
,
Proxy
proxy
)
{
fristText
=
fristText
.
replace
(
"\\"
,
""
)
.
replace
(
"'"
,
""
)
.
replace
(
" "
,
" "
)
.
replace
(
"""
,
"\""
)
.
replace
(
"&"
,
"&"
)
.
replace
(
"amp;"
,
""
)
.
replace
(
"'"
,
"'"
)
.
replace
(
">"
,
">"
)
.
replace
(
"<"
,
"<"
)
.
replace
(
"¥"
,
"¥"
)
;
log
.
info
(
"开始解析第一页文章"
);
// 截取HTML得到有用的JSON;替换掉转义字符
if
(
fristText
.
contains
(
"msgList ="
))
{
fristText
=
fristText
.
split
(
"msgList = "
)[
1
].
split
(
"}}]};"
)[
0
]+
"}}]}"
;
return
getNextIdAndAnalysis
(
fristText
,
proxy
);
}
return
null
;
}
/**
* @Title: getFirst
* @author hero
* @Description: 截取appmsg_token 值
* @param @param fristText
* @param @return 设定文件
* @return String 返回类型
*/
private
String
getFirst
(
String
fristText
,
Proxy
proxy
)
{
String
next
=
null
;
fristText
=
fristText
.
replace
(
"\\"
,
""
)
.
replace
(
"'"
,
""
)
.
replace
(
" "
,
" "
)
.
replace
(
"""
,
"\""
)
.
replace
(
"&"
,
"&"
)
.
replace
(
"amp;"
,
""
)
.
replace
(
"'"
,
"'"
)
.
replace
(
">"
,
">"
)
.
replace
(
"<"
,
"<"
)
.
replace
(
"¥"
,
"¥"
)
;
log
.
info
(
"开始解析第一页文章"
);
if
(
fristText
.
contains
(
"window.appmsg_token = "
)
&&
fristText
.
contains
(
"msgList ="
)){
try
{
next
=
fristText
.
split
(
"window.appmsg_token = \""
)[
1
].
split
(
"\";"
)[
0
];
fristText
=
fristText
.
split
(
"msgList = "
)[
1
].
split
(
"}}]};"
)[
0
]+
"}}]}"
;
getNextIdAndAnalysis
(
fristText
,
proxy
);
return
next
;
}
catch
(
Exception
e
)
{
log
.
info
(
"截取下一页数据参数出现问题:{}"
,
fristText
);
return
null
;
}
}
else
{
log
.
info
(
"获取下一页数据参数出现问题....{}"
,
fristText
);
}
return
null
;
}
/**
 * Parses the JSON of a follow-up history page.
 *
 * <p>The {@code general_msg_list} string is handed to
 * {@link #getNextIdAndAnalysis(String, Proxy)}. When it is missing or anything fails,
 * the {@code next} flag is cleared so that paging stops.
 *
 * @param nextHtml response body of the page request
 * @param proxy    proxy passed through to article parsing
 * @return the {@code can_msg_continue} value rendered as a string (the text
 *         {@code "null"} when the field is absent), or {@code null} on failure
 */
private String getNext(String nextHtml, Proxy proxy) {
    try {
        JSONObject nextJosn = JSONObject.parseObject(nextHtml);
        String nextText = nextJosn.getString("general_msg_list");
        if (nextText == null) {
            log.info("下一页数据解析出现问题:{}", nextHtml);
            next = false;
            return null;
        }
        getNextIdAndAnalysis(nextText, proxy);
        return nextJosn.getInteger("can_msg_continue") + "";
    } catch (Exception e) {
        // Log the cause as well; the original discarded it.
        log.info("解析数据有问题:{}", nextHtml, e);
        next = false;
        return null;
    }
}
/**
 * Parses one page of message JSON, appends every article to the result list and
 * updates the paging state ({@code nextId} and {@code next}).
 *
 * <p>For each entry of the {@code list} array the headline article
 * ({@code app_msg_ext_info}) and any items of its {@code multi_app_msg_item_list}
 * are converted with {@code setWechat}.
 *
 * @param text  JSON text with a top-level {@code list} array
 * @param proxy proxy passed through to article parsing
 * @return the id of the last processed message, or {@code null} when the list is
 *         missing or empty, or the last message is older than the end date
 */
public String getNextIdAndAnalysis(String text, Proxy proxy) {
    JSONObject wechatData = JSONObject.parseObject(text);
    JSONArray dataList = (wechatData == null) ? null : wechatData.getJSONArray("list");
    // A missing list is treated like an empty one instead of failing with an NPE.
    if (dataList == null || dataList.size() == 0) {
        nextId = null;
        next = false;
        return nextId;
    }

    for (int i = 0; i < dataList.size(); i++) {
        JSONObject data = dataList.getJSONObject(i);

        // Publication time; datetime is multiplied by 1000 before building the Date.
        JSONObject dateJson = data.getJSONObject("comm_msg_info");
        long dateTime = dateJson.getLong("datetime");
        Date time = new Date(dateTime * 1000);
        nextId = dateJson.getString("id");
        // NOTE(review): paging is stopped here, but the article itself is still added
        // below even though it is older than endDate; confirm that this is intended.
        if (time.before(endDate)) {
            next = false;
            nextId = null;
        }

        JSONObject first = data.getJSONObject("app_msg_ext_info");
        if (first == null) {
            log.info("不存在相关文章......");
            continue;
        }

        // Headline article of the message.
        WechatAricle wechatFirst = setWechat(first.getString("content_url"), first.getString("title"),
                time, first.getString("cover"), first.getString("digest"), "true", proxy);
        result.add(wechatFirst);

        // Remaining articles of a multi-article message.
        JSONArray otherJSON = first.getJSONArray("multi_app_msg_item_list");
        if (otherJSON == null) {
            log.info("只有一条数据");
            continue;
        }
        for (int j = 0; j < otherJSON.size(); j++) {
            JSONObject other = otherJSON.getJSONObject(j);
            WechatAricle wechatOther = setWechat(other.getString("content_url"), other.getString("title"),
                    time, other.getString("cover"), other.getString("digest"), "false", proxy);
            result.add(wechatOther);
        }
    }
    return nextId;
}
/**
 * Builds an article entity from the values parsed out of the history JSON.
 *
 * <p>The article page is fetched through {@code AriticleContent.getAriticleContent};
 * when that yields a map, its {@code source} and {@code content} entries replace the
 * digest passed in. When like/read updating is enabled, the counts are fetched after
 * a two-second pause; {@code -1} is stored for both when they cannot be obtained.
 *
 * @param url      article URL, also used as the entity id
 * @param title    article title
 * @param datetime publication time
 * @param imgUrl   cover image URL
 * @param content  digest used as content when the article page cannot be read
 * @param isFirst  {@code "true"} for the headline article of a message, else {@code "false"}
 * @param proxy    proxy used for the read/like request
 * @return the populated entity
 */
private WechatAricle setWechat(String url, String title, Date datetime, String imgUrl,
        String content, String isFirst, Proxy proxy) {
    WechatAricle wechat = new WechatAricle();
    wechat.setId(url);
    wechat.setTitle(title);
    wechat.setTime(datetime);
    wechat.setImgUrl(imgUrl);
    wechat.setIsFirst(isFirst);

    // Fetch the article itself.
    String source = null;
    Map<String, String> sacMap = AriticleContent.getAriticleContent(url);
    if (sacMap != null) {
        source = sacMap.get("source");
        content = sacMap.get("content");
    }

    // Update like and read counts.
    if (updateLike) {
        // Literal removal of leftover "amp;" fragments; one non-regex pass is sufficient.
        String cleanUrl = url.replace("amp;", "");
        try {
            Thread.sleep(2000);
            WechatReadLike wcrl = WeChatReadAndLike.getReadAndLike(cleanUrl, key, proxy);
            if (wcrl != null) {
                wechat.setLikeNum(wcrl.getLike());
                wechat.setReadNum(wcrl.getRead());
            } else {
                wechat.setLikeNum(-1);
                wechat.setReadNum(-1);
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            wechat.setLikeNum(-1);
            wechat.setReadNum(-1);
            log.error("获取点赞阅读数出现为题,问题:{}", e.getMessage());
        }
    }

    wechat.setContent(content);
    wechat.setSource(source);
    return wechat;
}
/**
 * Manual check: prints a JSON-escaped article URL with its backslashes removed.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String url = "http:\\/\\/mp.weixin.qq.com\\/s?__biz=MjM5NTU0MzI0MA==&mid=2661648551&idx=1&sn=74397ab60184beb0abd4dd3f8c62f7d3&chksm=bda7c9008ad04016a5eac88c8dd18b6bc5797ae780c56e307e11781af257a68a52b7f87dfd8e&scene=27#wechat_redirect";
    // replaceAll("\\", "") compiles the regex "\" and throws PatternSyntaxException;
    // replace() works on the literal backslash.
    System.out.println(url.replace("\\", ""));
}
}
//
// return result;
// }
//
// /***
// * 获取公号历史文章
// * @Description:
// * @param @param url
// * @param @param source
// * @param @return
// * @return List<Wechat> 返回类型
// */
// @Deprecated
// public List<WechatAricle> getWechatDataFromHistoryOld(String url,Proxy proxy)
// {
// log.info("url:::::::::{}",url);
// if(updateLike)
// {
// key = "&uin"+url.split("uin")[1].split("devicetype")[0];
// }
//
// String firstText = null;
// try {
// Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(url, proxy,headerMap);
// //获取cookie
// headerMap.put("Referer", url);
// headerMap.put("Cookie", cookieMap.get("cookie"));
// firstText = HttpClientTemplateOK.get(url, proxy,headerMap);
// } catch (Exception e) {
// e.printStackTrace();
// return null;
// }
// //采集下一页数据参数,并获取第一页数据
// nextId = getFirstOld(firstText,proxy);
// boolean next = false; //判断是否有下一页
// if(follow == true)
// {
// next = true;
// }
// //循环读取微信公号历史数据
// while(next)
// {
// //没有下一页数据,结束
// if(nextId==null)
// {
// next = false;
// }else //采集下一页数据
// {
// String nextUrl = url.replace("home", "getmsg") + "&f=json&frommsgid=" + nextId + "&count=10&scene=123&is_ok=1";
// log.info("下一页地址:{}", nextUrl);
// try {
// //采集下一页数据参数,并获取此页数据
// headerMap.put("Referer", nextUrl);
// String nextJson = HttpClientTemplateOK.get(nextUrl, null,headerMap);
// nextId = getNext(nextJson,proxy);
// System.out.println("nextId-============="+nextId);
// ZhiWeiTools.sleep(3000);
// } catch (Exception e) {
// e.printStackTrace();
// next = false;
// }
//
// }
// }
//
// return result;
// }
//
//
//
//
// /**
// * @Title: getFirst
// * @Description: TODO(解析第一页数据)
// * @param @param fristText
// * @param @param source
// * @param @return 设定文件
// * @return String 返回类型
// */
// @Deprecated
// public String getFirstOld(String fristText,Proxy proxy)
// {
// fristText = fristText
// .replace("\\", "")
// .replace("'", "")
// .replace(" ", " ")
// .replace(""", "\"")
// .replace("&", "&")
// .replace("amp;", "")
// .replace("'", "'")
// .replace(">", ">")
// .replace("<", "<")
// .replace("¥", "¥")
// ;
// log.info("开始解析第一页文章");
// // 截取HTML得到有用的JSON;替换掉转义字符
// if(fristText.contains("msgList ="))
// {
// fristText = fristText.split("msgList = ")[1].split("}}]};")[0]+"}}]}";
// return getNextIdAndAnalysis(fristText,proxy);
// }
// return null;
// }
//
// /**
// * @Title: getFirst
// * @author hero
// * @Description: 截取appmsg_token 值
// * @param @param fristText
// * @param @return 设定文件
// * @return String 返回类型
// */
// private String getFirst(String fristText,Proxy proxy)
// {
// String next = null;
//
// fristText = fristText
// .replace("\\", "")
// .replace("'", "")
// .replace(" ", " ")
// .replace(""", "\"")
// .replace("&", "&")
// .replace("amp;", "")
// .replace("'", "'")
// .replace(">", ">")
// .replace("<", "<")
// .replace("¥", "¥")
// ;
// log.info("开始解析第一页文章");
//
// if(fristText.contains("window.appmsg_token = ") && fristText.contains("msgList =")){
// try {
// next = fristText.split("window.appmsg_token = \"")[1].split("\";")[0];
// fristText = fristText.split("msgList = ")[1].split("}}]};")[0]+"}}]}";
// getNextIdAndAnalysis(fristText,proxy);
// return next;
// } catch (Exception e) {
// log.info("截取下一页数据参数出现问题:{}",fristText);
// return null;
// }
// }else{
// log.info("获取下一页数据参数出现问题....{}",fristText);
// }
// return null;
// }
//
//
// /***
// * 解析微信历史文章下一页数据
// * @Description:
// * @param @param nextJosn
// * @param @param key
// * @param @param source
// * @param @return
// * @return String 返回类型
// */
// private String getNext(String nextHtml,Proxy proxy)
// {
// try {
// JSONObject nextJosn = JSONObject.parseObject(nextHtml);
// String nextText = null;
// if(null != nextJosn.getString("general_msg_list"))
// {
// nextText = nextJosn.getString("general_msg_list");
// getNextIdAndAnalysis(nextText,proxy);
// }else
// {
// log.info("下一页数据解析出现问题:{}", nextHtml);
// next = false;
// return null;
// }
// return nextJosn.getInteger("can_msg_continue")+"";
//
// } catch (Exception e) {
// log.info("解析数据有问题:{}", nextHtml);
// next = false;
// return null;
// }
//
//
// }
//
// /**
// * @Title: getNextIdAndAnalysis
// * @Description: TODO(解析下一页所需字段,及数据解析)
// * @param @param text
// * @param @param source
// * @param @return 设定文件
// * @return String 返回类型
// */
// public String getNextIdAndAnalysis(String text,Proxy proxy)
// {
// JSONObject wechatData = JSONObject.parseObject(text);
// JSONArray dataList = wechatData.getJSONArray("list");
// if(dataList.size()==0)
// {
// nextId = null;
// next = false;
// }else
// {
// for(int i = 0;i<dataList.size();i++)
// {
// JSONObject data = dataList.getJSONObject(i);
// //解析时间
// JSONObject dateJson = data.getJSONObject("comm_msg_info");
// long dateTime = dateJson.getLong("datetime");
// Date time = new Date(dateTime*1000);
// nextId = dateJson.getString("id");
// if(time.before(endDate))
// {
// next = false;
// nextId = null;
// }
// //解析文本数据
// if(null != data.getJSONObject("app_msg_ext_info"))
// {
// //解析头条数据
// JSONObject first = data.getJSONObject("app_msg_ext_info");
// String content_url = first.getString("content_url");
// String content = first.getString("digest");
// String title = first.getString("title");
// String img_url = first.getString("cover");
//
// WechatAricle wechatFirst = setWechat(content_url,title
// , time, img_url, content,"true",proxy);
// result.add(wechatFirst);
// //解析其余数据
// JSONArray otherJSON = first.getJSONArray("multi_app_msg_item_list");
// if(otherJSON != null)
// {
// for(int j = 0;j<otherJSON.size();j++)
// {
// JSONObject other = otherJSON.getJSONObject(j);
// String other_content_url = other.getString("content_url");
// String other_content = other.getString("digest");
// String other_title = other.getString("title");
// String other_img_url = other.getString("cover");
//
// WechatAricle wechatOther = setWechat(other_content_url,other_title
// , time, other_img_url, other_content,"false",proxy);
// result.add(wechatOther);
// }
// }else
// {
// log.info("只有一条数据");
// }
// }else
// {
// log.info("不存在相关文章......");
// }
// }
// }
// return nextId;
// }
//
//
//
// /**
// * 给实体类对象赋值
// * @Description:
// * @param @param url
// * @param @param title
// * @param @param source
// * @param @param datetime
// * @param @param key
// * @param @return
// * @return Wechat 返回类型
// */
// private WechatAricle setWechat(String url,String title,
// Date datetime,String imgUrl,String content,String isFirst,Proxy proxy)
// {
// WechatAricle wechat = new WechatAricle();
// wechat.setId(url);
// wechat.setTitle(title);
// wechat.setTime(datetime);
// wechat.setImgUrl(imgUrl);
// wechat.setIsFirst(isFirst);
// //采集文章
// String source = null;
// Map<String,String> sacMap = AriticleContent.getAriticleContent(url);
// if(sacMap!=null)
// {
// source = sacMap.get("source");
// content = sacMap.get("content");
// }
// //更新点赞阅读数
// if(updateLike)
// {
// url = url.replaceAll("amp;", "").replaceAll("amp;", "");
// try {
// Thread.sleep(2000);
// WechatReadLike wcrl = WeChatReadAndLike.getReadAndLike(url,key,proxy);
// wechat.setLikeNum(wcrl.getLike());
// wechat.setReadNum(wcrl.getRead());
// } catch (InterruptedException e) {
// wechat.setLikeNum(-1);
// wechat.setReadNum(-1);
// log.error("获取点赞阅读数出现为题,问题:{}", e.getMessage());
// }
// }
//
// wechat.setContent(content);
// wechat.setSource(source);
// return wechat;
// }
//
//
// public static void main(String[] args) {
// String url = "http:\\/\\/mp.weixin.qq.com\\/s?__biz=MjM5NTU0MzI0MA==&mid=2661648551&idx=1&sn=74397ab60184beb0abd4dd3f8c62f7d3&chksm=bda7c9008ad04016a5eac88c8dd18b6bc5797ae780c56e307e11781af257a68a52b7f87dfd8e&scene=27#wechat_redirect";
// System.out.println(url.replaceAll("\\", ""));
//
// }
//
//
//}
src/main/java/com/zhiwei/wechat/readAndLike/AriticleContent.java
View file @
2c702467
...
...
@@ -17,8 +17,8 @@ import org.jsoup.nodes.Document;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.
tools.httpclient.HttpClientTemplateOK
;
import
com.zhiwei.
wechat.comment.WechatCommentList
;
import
com.zhiwei.
crawler.core.HttpBoot
;
import
com.zhiwei.
crawler.utils.RequestUtils
;
import
com.zhiwei.wechat.util.Tools
;
/**
...
...
@@ -28,7 +28,8 @@ import com.zhiwei.wechat.util.Tools;
*/
public
class
AriticleContent
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WechatCommentList
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
AriticleContent
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
...
...
@@ -47,7 +48,7 @@ public class AriticleContent{
String
content
=
null
;
String
source
=
null
;
try
{
String
htmlBody
=
HttpClientTemplateOK
.
get
(
url
,
null
,
headerMap
);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
)).
body
().
string
(
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
content
=
document
.
select
(
"div.rich_media_content"
).
text
();
if
(
htmlBody
.
contains
(
"var nickname = "
)){
...
...
@@ -79,7 +80,7 @@ public class AriticleContent{
headerMap
.
put
(
"Referer"
,
url
);
String
comment_id
=
null
;
try
{
String
htmlBody
=
HttpClientTemplateOK
.
get
(
url
,
null
,
headerMap
);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
)).
body
().
string
(
);
if
(
htmlBody
!=
null
)
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
...
...
src/main/java/com/zhiwei/wechat/readAndLike/WeChatReadAndLike.java
View file @
2c702467
/**
 * @Title: WeChatReadAndLike.java
 * @Package com.zhiwei.wechat.readAndLike
 * @Description: Fetches read and like counts for WeChat articles.
 * @author Bewilder Z
 * @date 2015-08-06 09:13:37
 * @version V1.0
 */
package
com
.
zhiwei
.
wechat
.
readAndLike
;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.util.HashMap
;
import
java.util.Map
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
import
com.zhiwei.wechat.entity.WechatReadLike
;
import
com.zhiwei.wechat.search.WechatAritcleSearch
;
import
com.zhiwei.wechat.util.Tools
;
/**
 * Fetches read and like counts for WeChat articles.
 *
 * <p>Two lookups are offered: one that calls the statistics URL built by
 * {@code Tools.getWechatLikeUrl} using a caller-supplied key, and one that
 * scrapes the read count from a Sogou WeChat search result page.
 *
 * <p>NOTE(review): the original description said the key-based lookup works
 * "via the Windows client"; that cannot be confirmed from this class alone.
 *
 * @author Abner Liu
 * @date 2015-08-06 09:13:37
 */
public class WeChatReadAndLike {

    private static final Logger logger = LoggerFactory.getLogger(WeChatReadAndLike.class);

    /**
     * Fetches the read and like counts of a single article.
     *
     * <p>The method first requests the cookie URL to obtain a cookie and the
     * page body, extracts the {@code appmsg_token} from that body, then posts
     * to the like/read URL and parses the {@code appmsgstat} object of the
     * JSON response.
     *
     * @param url   WeChat article link
     * @param key   key passed to {@code Tools.getWechatCookieUrl}
     * @param proxy proxy handed to the HTTP helper (passed through unchanged)
     * @return a {@code WechatReadLike} carrying {@code url}; on any failure
     *         both the read and the like count are set to {@code -1}
     */
    public static WechatReadLike getReadAndLike(String url, String key, Proxy proxy) {
        WechatReadLike wLike = new WechatReadLike();
        wLike.setUrl(url);
        try {
            String urlcookie = Tools.getWechatCookieUrl(url, key);
            // Request headers
            Map<String, String> headerMap = Tools.getWechatHeader();
            Map<String, String> cookieMap = HttpClientTemplateOK.getCookie(urlcookie, proxy, headerMap);

            headerMap.put("Referer", urlcookie);
            headerMap.put("Cookie", cookieMap.get("cookie") + "");
            String appmsgToken = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
            logger.debug("appmsg_token is {}", appmsgToken);
            String urlLike = Tools.getWechatLikeUrl(urlcookie, appmsgToken);

            // POST parameters
            HashMap<String, Object> postMap = new HashMap<String, Object>();
            postMap.put("is_only_read", "1");

            String htsString = HttpClientTemplateOK.post(urlLike, proxy, headerMap, postMap);
            logger.debug("read/like response: {}", htsString);

            JSONObject stat = JSONObject.parseObject(htsString).getJSONObject("appmsgstat");
            String likeNum = stat.get("like_num").toString();

            // Prefer real_read_num; fall back to read_num when it is absent or "0".
            Object realRead = stat.get("real_read_num");
            String readNum;
            if (realRead == null || "0".equals(realRead.toString())) {
                readNum = stat.get("read_num").toString();
            } else {
                readNum = realRead.toString();
            }

            wLike.setRead(Integer.parseInt(readNum));
            wLike.setLike(Integer.parseInt(likeNum));
        } catch (Exception e) {
            // Best effort: callers receive -1/-1 instead of an exception.
            logger.warn("Failed to fetch read/like counts for {}", url, e);
            wLike.setRead(-1);
            wLike.setLike(-1);
        }
        return wLike;
    }

    /**
     * Looks up the read count of an article through Sogou WeChat search.
     *
     * <p>Searches for {@code word} and walks the result list; the first result
     * whose normalized link equals {@code link} supplies the read count.
     *
     * @param word search keyword (URL-encoded as UTF-8 before use)
     * @param time value used for both the {@code ft} and {@code et} query
     *             parameters; anything after the first space is dropped
     * @param link article link to match against the search results
     * @param wxId account id, used to resolve the openid and sent as {@code usip}
     * @return a {@code WechatReadLike} with url and read count set when a
     *         result matches; an unpopulated instance when nothing matches or
     *         the response body is {@code null}; {@code null} when the request
     *         or parsing fails
     */
    public static WechatReadLike getReadAndLike(String word, String time, String link, String wxId) {
        WechatReadLike wLike = new WechatReadLike();

        Map<String, String> headerMap = new HashMap<String, String>();
        headerMap.put("Upgrade-Insecure-Requests", "1");
        headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36");
        headerMap.put("Host", "weixin.sogou.com");

        if (time.contains(" ")) {
            time = time.split(" ")[0];
        }

        String openid = WechatAritcleSearch.getOpenId(wxId, null);
        logger.info("openid is {}", openid);

        try {
            String url = "http://weixin.sogou.com/weixin?query=" + URLEncoder.encode(word, "utf-8")
                    + "&type=2&ie=utf8&page=1&interation=&tsn=5&ft=" + time + "&et=" + time
                    + "&wxid=" + openid + "&usip=" + wxId + "&from=tool";
            logger.info("url is {}", url);

            String htmlBody = HttpClientTemplateOK.get(url, null, headerMap);
            if (htmlBody != null) {
                // Parse the result list
                Document document = Jsoup.parse(htmlBody);
                Elements elements = document.select("div.news-box")
                        .select("ul.news-list").select("li");
                for (Element element : elements) {
                    try {
                        String urlLink = element.select("div.txt-box").select("h3 >a").attr("href");
                        int readNum = 0;
                        try {
                            readNum = Integer.parseInt(element.select("div.txt-box")
                                    .select("div.s-p").select("span.s1").text().trim());
                            logger.info("readNum is {}", readNum);
                        } catch (NumberFormatException ignored) {
                            // Non-numeric or missing read count is treated as 0.
                            readNum = 0;
                        }
                        if (urlLink.contains("&chksm=")) {
                            // Drop the chksm parameter, keeping the "&3rd" segment.
                            urlLink = urlLink.split("&chksm=")[0] + "&3rd" + urlLink.split("&3rd")[1];
                        }
                        if (link.equals(urlLink)) {
                            wLike.setUrl(link);
                            wLike.setRead(readNum);
                            break;
                        }
                    } catch (Exception e) {
                        // A malformed result entry is skipped; the remaining entries are still checked.
                        logger.debug("Skipping unparsable search result", e);
                        continue;
                    }
                }
            }
        } catch (Exception e) {
            logger.error("Failed to fetch read count from Sogou for link {}", link, e);
            return null;
        }
        return wLike;
    }
}
/
//
**
//
* @Title: WindowsClient.java
//
* @Package com.wcral.client
//
* @Description: TODO(用一句话描述该文件做什么)
//
* @author Bewilder Z
//
* @date 2015年8月6日 上午9:13:37
//
* @version V1.0
//
*/
//
//
package com.zhiwei.wechat.readAndLike;
//
//
import java.net.Proxy;
//
import java.net.URLEncoder;
//
import java.util.HashMap;
//
import java.util.Map;
//
//
import org.jsoup.Jsoup;
//
import org.jsoup.nodes.Document;
//
import org.jsoup.nodes.Element;
//
import org.jsoup.select.Elements;
//
import org.slf4j.Logger;
//
import org.slf4j.LoggerFactory;
//
//
import com.alibaba.fastjson.JSONObject;
//
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//
import com.zhiwei.wechat.entity.WechatReadLike;
//
import com.zhiwei.wechat.search.WechatAritcleSearch;
//
import com.zhiwei.wechat.util.Tools;
//
/
//
**
//
* @ClassName: WindowsClient
//
* @Description: TODO(利用windows客戶端進行点赞阅读抓取)
//
* @author Abner Liu
//
* @date 2015年8月6日 上午9:13:37
//
*/
//
public class WeChatReadAndLike {
//
//
//
private static Logger logger = LoggerFactory.getLogger(WeChatReadAndLike.class);
//
//
/**
//
*
//
* @Title: getReadAndLike
//
* @Description: 利用windows客戶端進行点赞阅读抓取
//
* @param url
//
* 微信文章链接
//
* @return WeChatReadLike 微信文章实体类
//
*
//
*/
//
public static WechatReadLike getReadAndLike(String url,String key,Proxy proxy){
//
WechatReadLike wLike = new WechatReadLike();
//
try {
//
String urlcookie = Tools.getWechatCookieUrl(url, key);
//
// 请求头信息
//
Map<String,String> headerMap = Tools.getWechatHeader();
//
Map<String,String> cookieMap = HttpClientTemplateOK.getCookie(urlcookie, proxy, headerMap);
//
//
headerMap.put("Referer", urlcookie);
//
headerMap.put("Cookie", cookieMap.get("cookie")+"");
//
String appmsg_token = Tools.getAppMsgToken(cookieMap.get("htmlBody"));
//
System.out.println("appmsg_token==========="+appmsg_token);
//
String urlLike = Tools.getWechatLikeUrl(urlcookie,appmsg_token);
//
//设置post请求参数
//
HashMap<String,Object> postMap = new HashMap<String,Object>();
//
postMap.put("is_only_read", "1");
//
//
//获取数据
//
String htsString = HttpClientTemplateOK.post(urlLike, proxy, headerMap ,postMap);
//
System.out.println(htsString);
//
JSONObject jsonObject = JSONObject.parseObject(htsString);
//
String like_num = jsonObject.getJSONObject("appmsgstat")
//
.get("like_num").toString();
//
//
String real_read_num = "";
//
try {
//
real_read_num = jsonObject.getJSONObject("appmsgstat")
//
.get("real_read_num").toString();
//
if(real_read_num.equals("0"))
//
{
//
real_read_num = jsonObject.getJSONObject("appmsgstat")
//
.get("read_num").toString();
//
}
//
} catch (Exception e) {
//
real_read_num = jsonObject.getJSONObject("appmsgstat")
//
.get("read_num").toString();
//
}
//
wLike.setUrl(url);
//
wLike.setRead(Integer.valueOf(real_read_num));
//
wLike.setLike(Integer.valueOf(like_num));
//
} catch (Exception e) {
//
wLike.setUrl(url);
//
wLike.setRead(-1);
//
wLike.setLike(-1);
//
}
//
return wLike;
//
}
//
//
//
//
/**
//
* @Title: getReadAndLike
//
* @Description: TODO(通过搜狗微信获取阅读数)
//
* @param @param word
//
* @param @param time
//
* @param @param link
//
* @param @param wxId
//
* @param @return 设定文件
//
* @return WeChatReadLike 返回类型
//
*/
//
public static WechatReadLike getReadAndLike(String word,
//
String time,String link,String wxId){
//
//
WechatReadLike wLike = new WechatReadLike();
//
//
Map<String,String> headerMap = new HashMap<String,String>();
//
headerMap.put("Upgrade-Insecure-Requests", "1");
//
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36");
//
headerMap.put("Host","weixin.sogou.com");
//
//
if(time.contains(" "))
//
{
//
time = time.split(" ")[0];
//
}
//
//
String openid = WechatAritcleSearch.getOpenId(wxId,null);
//
logger.info("openid is {}", openid);
//
//
try {
//
String url = "http://weixin.sogou.com/weixin?query=" + URLEncoder.encode(word,"utf-8")
//
+ "&type=2&ie=utf8&page=1&interation=&tsn=5&ft="+time + "&et="+ time
//
+ "&wxid="+openid+"&usip="+wxId+"&from=tool";
//
//
logger.info("url is {}",url);
//
//
String htmlBody = HttpClientTemplateOK.get(url, null, headerMap);
//
if(htmlBody!=null)
//
{
//
try {
//
// 解析数据
//
Document document = Jsoup.parse(htmlBody);
//
Elements elements = document.select("div.news-box")
//
.select("ul.news-list").select("li");
//
for (Element element : elements)
//
{
//
try {
//
String url_link = element.select("div.txt-box").select("h3 >a").attr("href");
//
int readNum = 0;
//
try {
//
readNum = Integer.valueOf(element.select("div.txt-box")
//
.select("div.s-p").select("span.s1").text().trim());
//
logger.info("readNum is {}", readNum);
//
} catch (Exception e) {
//
readNum = 0;
//
}
//
if(url_link.contains("&chksm="))
//
{
//
url_link = url_link.split("&chksm=")[0] + "&3rd" + url_link.split("&3rd")[1];
//
}
//
//
if(link.equals(url_link))
//
{
//
wLike.setUrl(link);
//
wLike.setRead(readNum);
//
break;
//
}
//
} catch (Exception e) {
//
continue;
//
}
//
}
//
} catch (Exception e) {
//
wLike.setUrl(link);
//
wLike.setRead(0);
//
return null;
//
}
//
}
//
} catch (Exception e) {
//
e.printStackTrace();
//
wLike.setUrl(link);
//
wLike.setRead(0);
//
return null;
//
}
//
return wLike;
//
}
//
//
}
src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
View file @
2c702467
...
...
@@ -35,7 +35,7 @@ import com.zhiwei.wechat.entity.WechatAricle;
public
class
WechatAritcleSearch
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
WechatAritcleSearch
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
*
...
...
src/main/java/com/zhiwei/wechat/search/WechatCount.java
View file @
2c702467
...
...
@@ -13,7 +13,7 @@ import com.zhiwei.crawler.utils.RequestUtils;
public
class
WechatCount
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
public
static
int
getWechatCountByWord
(
String
word
,
String
cookie
,
String
startTime
,
String
endTime
,
Proxy
proxy
)
{
...
...
src/main/java/com/zhiwei/wechat/search/WechatIndex.java
View file @
2c702467
...
...
@@ -5,7 +5,8 @@ import java.util.HashMap;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
...
...
@@ -17,7 +18,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
*/
public
class
WechatIndex
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
public
static
void
main
(
String
[]
args
)
throws
Exception
{
...
...
@@ -53,7 +54,7 @@ public class WechatIndex {
headerMap
.
put
(
"Accept"
,
"application/json, text/javascript, */*; q=0.01"
);
headerMap
.
put
(
"Cookie"
,
"mmsearch_user_key=AStrb5tD4ruSixIDu1cVpTA=; pass_ticket=bbP7ZT5xEUrYe+oOa6ACUw+mgR05TAGGA1P9xnC7fIyaaOnwkWyNQK8aYtva+Gxj; pgv_pvi=4102772736; pgv_si=s1607859200; pgv_pvid=153672700"
);
String
htmlBody
=
HttpClientTemplateOK
.
get
(
url
,
null
,
headerMap
);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
)).
body
().
string
(
);
System
.
out
.
println
(
htmlBody
);
Thread
.
sleep
(
3000
);
...
...
src/test/java/com/zhiwei/wechat/example/WechatDataFromHistoryExample.java
View file @
2c702467
/**
* @Title: WechatDataFromHistoryExample.java
* @Package com.zhiwei.wechat.example
* @Description:微信采集历史文章测试
* @author hero
* @date 2016年5月20日 下午5:47:56
* @version V1.0
*/
/**
*
*/
package
com
.
zhiwei
.
wechat
.
example
;
import
java.util.ArrayList
;
import
java.util.List
;
import
com.zhiwei.wechat.entity.WechatAricle
;
import
com.zhiwei.wechat.history.WechatDataFromHistory
;
/**
 * Example that collects the historical articles of a WeChat account.
 *
 * @author hero
 * @date 2016-05-20 17:47:56
 */
public class WechatDataFromHistoryExample {

    /**
     * Runs the history collector once for every URL in a hard-coded list and
     * prints the number of articles returned for each.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        boolean updateLike = false;
        boolean follow = true;
        String endDate = "2017-01-27";
        try {
            List<String> urllist = new ArrayList<String>();
            urllist.add("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MjM5NTU0MzI0MA==&scene=124&uin=MTE4OTQyMDc0MQ%3D%3D&key=df62f0a2a8b7732dca2d1f886b5bd15c398e1fe92940e352837738ea99e5ddc531fc24d5d57a5a43eab11df1e4db7db80aeeddfc06c8f410e159d80df4f822c07c555b4b536b52593f132f39c6868698&devicetype=Windows+8&version=6203005d&lang=zh_CN&a8scene=7&pass_ticket=nMJ5n97UE%2BxdJKqeKp3ovi8slnCMNSYF6Tu%2FgsQ4Phk%2Bc%2B%2BDM5AQy7LT6H%2BBQTc5&winzoom=1");
            System.out.println(urllist.size());
            int i = 0;
            for (String s : urllist) {
                System.out.println("i===========" + i);
                // Advance the counter so each entry prints its own index.
                i++;
                // Only the part before the first comma is used as the URL.
                String url = s.split(",")[0];
                WechatDataFromHistory wdfh = new WechatDataFromHistory(updateLike, endDate, follow);
                System.out.println(url);
                List<WechatAricle> list = wdfh.getWechatDataFromHistory(url, null);
                System.out.println("list size is :" + list.size());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
/
//
**
//
* @Title: WechatDataFromHistoryExample.java
//
* @Package com.zhiwei.wechat.example
//
* @Description:微信采集历史文章测试
//
* @author hero
//
* @date 2016年5月20日 下午5:47:56
//
* @version V1.0
//
*/
/
//
**
//
*
//
*/
//
package com.zhiwei.wechat.example;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
//
import com.zhiwei.wechat.entity.WechatAricle;
//
import com.zhiwei.wechat.history.WechatDataFromHistory;
//
/
//
**
//
* @Description:微信采集历史文章测试
//
* @author hero
//
* @date 2016年5月20日 下午5:47:56
//
*/
//
public class WechatDataFromHistoryExample {
//
//
public static void main(String[] args) {
//
boolean updateLike = false;
//
boolean follow = true;
//
String endDate = "2017-01-27";
//
try {
//
List<String> urllist = new ArrayList<String>();
//
urllist.add("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MjM5NTU0MzI0MA==&scene=124&uin=MTE4OTQyMDc0MQ%3D%3D&key=df62f0a2a8b7732dca2d1f886b5bd15c398e1fe92940e352837738ea99e5ddc531fc24d5d57a5a43eab11df1e4db7db80aeeddfc06c8f410e159d80df4f822c07c555b4b536b52593f132f39c6868698&devicetype=Windows+8&version=6203005d&lang=zh_CN&a8scene=7&pass_ticket=nMJ5n97UE%2BxdJKqeKp3ovi8slnCMNSYF6Tu%2FgsQ4Phk%2Bc%2B%2BDM5AQy7LT6H%2BBQTc5&winzoom=1");
//
System.out.println(urllist.size());
//
int i = 0;
//
for (String s : urllist) {
//
System.out.println("i===========" + i);
//
String url = s.split(",")[0];
//
//
String source = s.split(",")[1];
//
//
WechatDataFromHistory wdfh = new WechatDataFromHistory(updateLike,endDate,follow);
//
System.out.println(url);
//
List<WechatAricle> list = wdfh.getWechatDataFromHistory(url,null);
//
System.out.println("list size is :" + list.size());
//
//
}
//
} catch (Exception e) {
//
e.printStackTrace();
//
}
//
}
//
//
//
}
src/test/java/com/zhiwei/wechat/example/WechatSearchExample.java
View file @
2c702467
...
...
@@ -40,13 +40,11 @@ public class WechatSearchExample{
public
static
void
wechatSearchExample
()
throws
UnknownHostException
{
List
<
String
>
wordList
=
new
ArrayList
<
String
>();
wordList
.
add
(
"工业互联网"
);
String
idOrName
=
"吴晓波频道"
;
wordList
.
add
(
"京东"
);
for
(
String
word
:
wordList
)
{
try
{
List
<
WechatAricle
>
list
=
WechatAritcleSearch
.
wechatKeywordSearch
ByAccount
(
word
,
idOrName
,
"2017-12-01"
,
"2018-12-01"
,
ProxyHolder
.
SOUGOU_INNER_PROXY
);
List
<
WechatAricle
>
list
=
WechatAritcleSearch
.
wechatKeywordSearch
(
word
,
5
,
null
,
"2019-04-08"
,
"2019-04-08"
,
ProxyHolder
.
SOUGOU_INNER_PROXY
.
getProxy
()
);
System
.
out
.
println
(
"======"
+
list
.
size
());
for
(
WechatAricle
wechat
:
list
){
System
.
out
.
println
(
wechat
.
getTitle
());
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment