Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
W
wechat
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
wechat
Commits
ab9c3fd4
Commit
ab9c3fd4
authored
Aug 25, 2020
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
处理微信链接出现重复拼接问题
parent
1cbcc794
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
308 additions
and
322 deletions
+308
-322
src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
+308
-322
No files found.
src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
View file @
ab9c3fd4
...
@@ -7,6 +7,7 @@ import java.net.URLEncoder;
...
@@ -7,6 +7,7 @@ import java.net.URLEncoder;
import
java.util.*
;;
import
java.util.*
;;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.URIUtils
;
import
com.zhiwei.wechat.util.HtmlDownUtil
;
import
com.zhiwei.wechat.util.HtmlDownUtil
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.LogManager
;
...
@@ -20,347 +21,332 @@ import org.seimicrawler.xpath.JXDocument;
...
@@ -20,347 +21,332 @@ import org.seimicrawler.xpath.JXDocument;
import
org.seimicrawler.xpath.JXNode
;
import
org.seimicrawler.xpath.JXNode
;
/**
/**
* @author Bewilder Z
* @ClassName: WechatAritcleSearch
* @ClassName: WechatAritcleSearch
* @Description: TODO(在搜索接口根据关键词采集微信文章)
* @Description: TODO(在搜索接口根据关键词采集微信文章)
* @author Bewilder Z
* @date 2016年10月14日 上午9:40:18
* @date 2016年10月14日 上午9:40:18
*/
*/
public
class
WechatAritcleSearch
{
public
class
WechatAritcleSearch
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
WechatAritcleSearch
.
class
);
private
static
Logger
logger
=
LogManager
.
getLogger
(
WechatAritcleSearch
.
class
);
/**
 * Searches Sogou WeChat for articles matching a keyword and returns the entries parsed from
 * the search-result pages (the article full text is not fetched here).
 *
 * <p>Pages are requested one at a time starting at page 1. Paging stops when the result page
 * has no "next page" link, when the page counter reaches {@code pages}, or after several
 * consecutive failed fetches. Whatever was collected up to that point is returned.
 *
 * @param word  keyword to search for
 * @param proxy proxy handed to the downloader
 * @param pages page number at which to stop, i.e. the wanted page count plus one
 *              (pass 21 to get the first 20 pages); {@code null} for no limit
 * @return the collected articles; never {@code null}, possibly empty
 * @throws Exception if the keyword cannot be URL-encoded or an unexpected (non-I/O) error occurs
 */
public static List<WechatAricle> wechatKeywordSearch(String word, Proxy proxy, Integer pages) throws Exception {
    // A blank body or an I/O error leaves the page counter untouched, so without a bound the
    // loop would request the same URL forever.
    final int maxConsecutiveFailures = 3;

    List<WechatAricle> result = new ArrayList<>();
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    headerMap.put("Host", "weixin.sogou.com");
    headerMap.put("cookie", "com_sohu_websearch_ITEM_PER_PAGE=100;");

    // The keyword does not change between pages, so encode it once.
    String encodedWord = URLEncoder.encode(word, "UTF-8");

    int consecutiveFailures = 0;
    int page = 1;
    boolean f = true;
    while (f) {
        String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query=" + encodedWord
                + "&ie=utf8&_sug_=n&_sug_type_=&page=" + page;
        headerMap.put("Referer", searchUrl);
        try {
            // Fetch the result page.
            String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
            if (StringUtils.isNotBlank(htmlBody)) {
                consecutiveFailures = 0;
                JXDocument jxDocument = JXDocument.create(htmlBody);

                // analysis() returns null when parsing fails as a whole; skip instead of NPE-ing.
                List<WechatAricle> parsed = analysis(jxDocument);
                if (parsed != null) {
                    result.addAll(parsed);
                }

                // The "next page" anchor is absent on the last page; treat that as "no more pages".
                JXNode nextNode = jxDocument.selNOne("//a[@id='sogou_next']");
                String pageNext = nextNode == null ? "" : nextNode.asElement().text();
                if (pageNext.contains("下一页")) {
                    page++;
                } else {
                    f = false;
                }
            } else {
                logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
                consecutiveFailures++;
            }
            if (pages != null && pages == page) {
                break;
            }
        } catch (IOException e) {
            // Pass the exception as the trailing argument so the stack trace is logged too.
            logger.error("根据关键词获取微信文章失败,错误为: {}", e.getMessage(), e);
            consecutiveFailures++;
        }
        if (consecutiveFailures >= maxConsecutiveFailures) {
            break;
        }
    }
    return result;
}
/**
 * Searches Sogou WeChat for articles matching a keyword and returns the entries parsed from
 * the search-result pages.
 *
 * <p>Pages are requested one at a time starting at page 1 until the result page has no
 * "next page" link, or until several consecutive fetches return a blank body. Download
 * errors are not caught here and propagate to the caller.
 *
 * @param word        keyword to search for
 * @param proxy       proxy handed to the downloader
 * @param proxyHolder not used by this method's body; kept so existing callers still compile
 * @return the collected articles; never {@code null}, possibly empty
 * @throws Exception if the keyword cannot be URL-encoded or the download fails
 */
public static List<WechatAricle> wechatKeywordSearch(String word, Proxy proxy, ProxyHolder proxyHolder) throws Exception {
    // A blank body leaves the page counter untouched, so without a bound the loop would
    // request the same URL forever.
    final int maxConsecutiveFailures = 3;

    List<WechatAricle> result = new ArrayList<WechatAricle>();
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    headerMap.put("Host", "weixin.sogou.com");
    headerMap.put("cookie", "com_sohu_websearch_ITEM_PER_PAGE=100;");

    // The keyword does not change between pages, so encode it once.
    String encodedWord = URLEncoder.encode(word, "UTF-8");

    int consecutiveFailures = 0;
    int page = 1;
    boolean f = true;
    while (f) {
        String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query=" + encodedWord
                + "&ie=utf8&_sug_=n&_sug_type_=&page=" + page;
        headerMap.put("Referer", searchUrl);

        // Fetch the result page.
        String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
        if (StringUtils.isNotBlank(htmlBody)) {
            consecutiveFailures = 0;
            JXDocument jxDocument = JXDocument.create(htmlBody);

            // analysis() returns null when parsing fails as a whole; skip instead of NPE-ing.
            List<WechatAricle> parsed = analysis(jxDocument);
            if (parsed != null) {
                result.addAll(parsed);
            }

            // The "next page" anchor is absent on the last page; treat that as "no more pages".
            JXNode nextNode = jxDocument.selNOne("//a[@id='sogou_next']");
            String pageNext = nextNode == null ? "" : nextNode.asElement().text();
            if (pageNext.contains("下一页")) {
                page++;
            } else {
                f = false;
            }
        } else {
            logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
            consecutiveFailures++;
            if (consecutiveFailures >= maxConsecutiveFailures) {
                break;
            }
        }
    }
    return result;
}
/**
* 获取全文及来源
*
* @param url
* @param proxy
* @param wechatAricle
* @return
* @throws IOException
*/
private
static
WechatAricle
getWechatAricleInfo
(
String
url
,
ProxyHolder
proxy
,
WechatAricle
wechatAricle
)
{
try
{
String
contentHtml
=
HtmlDownUtil
.
downloadHtml
(
url
,
HeaderTool
.
getCommonHead
(),
proxy
.
getProxy
());
String
content
=
null
;
String
time
=
null
;
String
source
=
null
;
String
biz
=
null
;
String
title
=
null
;
String
user_name
=
null
;
String
wxId
=
null
;
List
<
String
>
imgUrls
=
null
;
String
rootSource
=
null
;
if
(
contentHtml
!=
null
)
{
JXDocument
jxDocument
=
JXDocument
.
create
(
contentHtml
);
title
=
jxDocument
.
selNOne
(
"//h2[@id='activity-name']"
).
asElement
().
text
();
wxId
=
jxDocument
.
selNOne
(
"//p[@class='profile_meta'][1]/span[@class='profile_meta_value']"
).
asElement
().
text
();
/**
if
(
contentHtml
.
contains
(
"js_content"
))
{
* 根据关键词在搜狗微信搜索微信文章,不包含全文
content
=
jxDocument
.
selNOne
(
"//div[@id='js_content']"
).
asElement
().
text
();
* @Title: wechatKeywordSearch
}
else
if
(
contentHtml
.
contains
(
"js_share_content"
))
{
* @param
content
=
jxDocument
.
selNOne
(
"//div[@id='js_share_content']"
).
asElement
().
text
();
* word 关键词
}
* @param
if
(
contentHtml
.
contains
(
"content_tpl"
))
{
* proxy 代理
String
text
=
jxDocument
.
selNOne
(
"//script[@id='content_tpl']"
).
asElement
().
text
();
* @param
content
=
Jsoup
.
parse
(
text
).
text
();
* pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null
}
* @throws
//解析文章图片地址
* Exception
if
(
Objects
.
nonNull
(
jxDocument
.
selN
(
"//div[@id='js_content']//img"
)))
{
* @return List<Wechat> 返回类型
imgUrls
=
new
ArrayList
<>();
*/
List
<
JXNode
>
imgNodeList
=
jxDocument
.
selN
(
"//div[@id='js_content']//img"
);
public
static
List
<
WechatAricle
>
wechatKeywordSearch
(
String
word
,
Proxy
proxy
,
Integer
pages
)
throws
Exception
{
for
(
JXNode
imgNode
:
imgNodeList
)
{
List
<
WechatAricle
>
result
=
new
ArrayList
<>();
String
imgUrl
=
imgNode
.
selOne
(
"//img"
).
asElement
().
attr
(
"href"
);
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
imgUrls
.
add
(
imgUrl
);
headerMap
.
put
(
"Host"
,
"weixin.sogou.com"
);
}
headerMap
.
put
(
"cookie"
,
"com_sohu_websearch_ITEM_PER_PAGE=100;"
);
}
boolean
f
=
true
;
//解析来源
int
page
=
1
;
if
(
Objects
.
nonNull
(
jxDocument
.
selNOne
(
"//span[@id='copyright_logo']"
)))
{
rootSource
=
jxDocument
.
selNOne
(
"//span[@id='profileBt']/a[@id='js_name']"
).
asElement
().
text
();
}
while
(
f
)
{
String
searchUrl
=
"https://weixin.sogou.com/weixin?type=2&s_from=input&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"&ie=utf8&_sug_=n&_sug_type_=&page="
+
page
;
headerMap
.
put
(
"Referer"
,
searchUrl
);
// 获取数据
try
{
String
htmlBody
=
HtmlDownUtil
.
downloadHtml
(
searchUrl
,
headerMap
,
proxy
);
// 解析数据
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
JXDocument
jxDocument
=
JXDocument
.
create
(
htmlBody
);
result
.
addAll
(
analysis
(
jxDocument
));
// 解析最大可寻页码
String
pageNext
=
jxDocument
.
selNOne
(
"//a[@id='sogou_next']"
).
asElement
().
text
();
if
(
pageNext
.
contains
(
"下一页"
))
{
page
++;
}
else
{
f
=
false
;
}
}
else
{
logger
.
info
(
"根据关键词获取微信文章失败,返回的数据结果集: {}"
,
htmlBody
);
}
if
(
pages
!=
null
&&
pages
==
page
)
{
break
;
}
}
catch
(
IOException
e
){
logger
.
error
(
"根据关键词获取微信文章失败,错误为: {}"
,
e
);
}
}
if
(
contentHtml
.
contains
(
"d.nick_name = "
))
{
return
result
;
time
=
contentHtml
.
split
(
"d.ct = \""
)[
1
].
split
(
"\";"
)[
0
];
}
source
=
contentHtml
.
split
(
"d.nick_name = \""
)[
1
].
split
(
"\";"
)[
0
];
biz
=
contentHtml
.
split
(
"d.biz = \""
)[
1
].
split
(
"\""
)[
0
];
user_name
=
contentHtml
.
split
(
"d.user_name = \""
)[
1
].
split
(
"\""
)[
0
];
}
else
if
(
contentHtml
.
contains
(
"var nickname = "
))
{
time
=
contentHtml
.
split
(
"var ct = \""
)[
1
].
split
(
"\";"
)[
0
];
source
=
contentHtml
.
split
(
"var nickname = \""
)[
1
].
split
(
"\";"
)[
0
];
biz
=
contentHtml
.
split
(
"var appuin = \"\"||\""
)[
1
].
split
(
"\""
)[
0
];
user_name
=
contentHtml
.
split
(
"var user_name = \""
)[
1
].
split
(
"\""
)[
0
];
}
}
if
(
wechatAricle
==
null
)
{
wechatAricle
=
new
WechatAricle
();
wechatAricle
.
setTitle
(
title
);
wechatAricle
.
setTime
(
new
Date
(
Long
.
valueOf
(
time
)
*
1000
));
wechatAricle
.
setSource
(
source
);
}
/**
wechatAricle
.
setImgUrls
(
imgUrls
);
*
wechatAricle
.
setRootSource
(
rootSource
);
* @Title: wechatKeywordSearch
wechatAricle
.
setBiz
(
biz
);
* @Description: 根据关键词在搜狗微信搜索微信文章,包含全文
wechatAricle
.
setContent
(
content
);
* @param @param
wechatAricle
.
setWxId
(
wxId
);
* word 关键词
wechatAricle
.
setUser_name
(
user_name
);
* @param @param
* tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* 5(某一时间段内与startTime和endTime配合使用)
* @param @param
* startTime 开始时间 格式为yyyy-MM-dd
* @param @param
* endTime 结束时间 格式为yyyy-MM-dd
* @param @return
* @param @throws
* ZhiWeiException
* @param @throws
* UnsupportedEncodingException 设定文件
* @return List<Wechat> 返回类型
*/
public
static
List
<
WechatAricle
>
wechatKeywordSearch
(
String
word
,
Proxy
proxy
,
ProxyHolder
proxyHolder
)
throws
Exception
{
List
<
WechatAricle
>
result
=
new
ArrayList
<
WechatAricle
>();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"weixin.sogou.com"
);
headerMap
.
put
(
"cookie"
,
"com_sohu_websearch_ITEM_PER_PAGE=100;"
);
boolean
f
=
true
;
int
page
=
1
;
while
(
f
)
{
String
searchUrl
=
"https://weixin.sogou.com/weixin?type=2&s_from=input&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"&ie=utf8&_sug_=n&_sug_type_=&page="
+
page
;
headerMap
.
put
(
"Referer"
,
searchUrl
);
// 获取数据
String
htmlBody
=
HtmlDownUtil
.
downloadHtml
(
searchUrl
,
headerMap
,
proxy
);
// 解析数据
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
JXDocument
jxDocument
=
JXDocument
.
create
(
htmlBody
);
result
.
addAll
(
analysis
(
jxDocument
));
// 解析最大可寻页码
String
pageNext
=
jxDocument
.
selNOne
(
"//a[@id='sogou_next']"
).
asElement
().
text
();
if
(
pageNext
.
contains
(
"下一页"
))
{
page
++;
}
else
{
f
=
false
;
}
}
else
{
logger
.
info
(
"根据关键词获取微信文章失败,返回的数据结果集: {}"
,
htmlBody
);
}
}
return
result
;
}
/**
}
catch
(
Exception
e
)
{
* 获取全文及来源
e
.
printStackTrace
();
* @param url
return
wechatAricle
;
* @param proxy
}
* @param wechatAricle
return
wechatAricle
;
* @return
}
* @throws IOException
*/
private
static
WechatAricle
getWechatAricleInfo
(
String
url
,
ProxyHolder
proxy
,
WechatAricle
wechatAricle
){
try
{
String
contentHtml
=
HtmlDownUtil
.
downloadHtml
(
url
,
HeaderTool
.
getCommonHead
(),
proxy
.
getProxy
());
String
content
=
null
;
String
time
=
null
;
String
source
=
null
;
String
biz
=
null
;
String
title
=
null
;
String
user_name
=
null
;
String
wxId
=
null
;
List
<
String
>
imgUrls
=
null
;
String
rootSource
=
null
;
if
(
contentHtml
!=
null
){
JXDocument
jxDocument
=
JXDocument
.
create
(
contentHtml
);
title
=
jxDocument
.
selNOne
(
"//h2[@id='activity-name']"
).
asElement
().
text
();
wxId
=
jxDocument
.
selNOne
(
"//p[@class='profile_meta'][1]/span[@class='profile_meta_value']"
).
asElement
().
text
();
if
(
contentHtml
.
contains
(
"js_content"
)){
content
=
jxDocument
.
selNOne
(
"//div[@id='js_content']"
).
asElement
().
text
();
}
else
if
(
contentHtml
.
contains
(
"js_share_content"
)){
content
=
jxDocument
.
selNOne
(
"//div[@id='js_share_content']"
).
asElement
().
text
();
}
if
(
contentHtml
.
contains
(
"content_tpl"
)){
String
text
=
jxDocument
.
selNOne
(
"//script[@id='content_tpl']"
).
asElement
().
text
();
content
=
Jsoup
.
parse
(
text
).
text
();
}
//解析文章图片地址
if
(
Objects
.
nonNull
(
jxDocument
.
selN
(
"//div[@id='js_content']//img"
))){
imgUrls
=
new
ArrayList
<>();
List
<
JXNode
>
imgNodeList
=
jxDocument
.
selN
(
"//div[@id='js_content']//img"
);
for
(
JXNode
imgNode
:
imgNodeList
){
String
imgUrl
=
imgNode
.
selOne
(
"//img"
).
asElement
().
attr
(
"href"
);
imgUrls
.
add
(
imgUrl
);
}
}
//解析来源
if
(
Objects
.
nonNull
(
jxDocument
.
selNOne
(
"//span[@id='copyright_logo']"
))){
rootSource
=
jxDocument
.
selNOne
(
"//span[@id='profileBt']/a[@id='js_name']"
).
asElement
().
text
();
}
/**
* 根据关键词采集指定时间+账号的数据
*
* @param word
* @param idOrName
* @param startTime
* @param endTime
* @param proxyHolder
* @return
* @throws Exception
* @throws UnsupportedEncodingException
*/
public
static
List
<
WechatAricle
>
wechatKeywordSearchByAccount
(
String
word
,
String
idOrName
,
String
startTime
,
String
endTime
,
ProxyHolder
proxyHolder
)
throws
Exception
,
UnsupportedEncodingException
{
List
<
WechatAricle
>
result
=
new
ArrayList
<
WechatAricle
>();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"weixin.sogou.com"
);
if
(
idOrName
==
null
||
idOrName
.
equals
(
""
))
{
throw
new
IllegalArgumentException
(
"要检索的昵称或id不能为空"
);
}
String
openId
=
getOpenId
(
idOrName
,
proxyHolder
);
boolean
f
=
false
;
if
(
openId
!=
null
)
{
f
=
true
;
}
int
page
=
1
;
if
(
contentHtml
.
contains
(
"d.nick_name = "
)){
while
(
f
)
{
time
=
contentHtml
.
split
(
"d.ct = \""
)[
1
].
split
(
"\";"
)[
0
];
String
searchUrl
=
"https://weixin.sogou.com/weixin?type=2&ie=utf8&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
source
=
contentHtml
.
split
(
"d.nick_name = \""
)[
1
].
split
(
"\";"
)[
0
];
+
"&tsn=5&ft="
+
startTime
+
"&et="
+
endTime
+
"&interation=&page="
+
page
+
"&wxid="
+
openId
biz
=
contentHtml
.
split
(
"d.biz = \""
)[
1
].
split
(
"\""
)[
0
];
+
"&usip="
+
URLEncoder
.
encode
(
idOrName
,
"UTF-8"
);
user_name
=
contentHtml
.
split
(
"d.user_name = \""
)[
1
].
split
(
"\""
)[
0
];
}
else
if
(
contentHtml
.
contains
(
"var nickname = "
)){
time
=
contentHtml
.
split
(
"var ct = \""
)[
1
].
split
(
"\";"
)[
0
];
source
=
contentHtml
.
split
(
"var nickname = \""
)[
1
].
split
(
"\";"
)[
0
];
biz
=
contentHtml
.
split
(
"var appuin = \"\"||\""
)[
1
].
split
(
"\""
)[
0
];
user_name
=
contentHtml
.
split
(
"var user_name = \""
)[
1
].
split
(
"\""
)[
0
];
}
}
if
(
wechatAricle
==
null
)
{
wechatAricle
=
new
WechatAricle
();
wechatAricle
.
setTitle
(
title
);
wechatAricle
.
setTime
(
new
Date
(
Long
.
valueOf
(
time
)*
1000
));
wechatAricle
.
setSource
(
source
);
}
wechatAricle
.
setImgUrls
(
imgUrls
);
headerMap
.
put
(
"Referer"
,
searchUrl
);
wechatAricle
.
setRootSource
(
rootSource
);
// 获取数据
wechatAricle
.
setBiz
(
biz
);
String
htmlBody
=
HtmlDownUtil
.
downloadHtml
(
searchUrl
,
headerMap
,
proxyHolder
);
wechatAricle
.
setContent
(
content
);
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
wechatAricle
.
setWxId
(
wxId
);
JXDocument
jxDocument
=
JXDocument
.
create
(
htmlBody
);
wechatAricle
.
setUser_name
(
user_name
);
result
.
addAll
(
analysis
(
jxDocument
));
// 解析最大可寻页码
}
catch
(
Exception
e
)
{
String
pageNext
=
jxDocument
.
selNOne
(
"//a[@id='sogou_next']"
).
asElement
().
text
();
e
.
printStackTrace
();
if
(
pageNext
.
contains
(
"下一页"
))
{
return
wechatAricle
;
page
++;
}
}
else
{
return
wechatAricle
;
f
=
false
;
}
}
}
else
{
logger
.
info
(
"根据关键词获取微信文章失败,返回的数据结果集: {}"
,
htmlBody
);
}
/**
}
* 根据关键词采集指定时间+账号的数据
return
result
;
* @param word
}
* @param idOrName
* @param startTime
* @param endTime
* @param proxyHolder
* @return
* @throws Exception
* @throws UnsupportedEncodingException
*/
/**
 * Searches Sogou WeChat for articles matching a keyword, restricted to one account and a
 * date range.
 *
 * <p>The account is first resolved to an open id via {@code getOpenId}; when that yields
 * {@code null} no search is performed and an empty list is returned. Otherwise pages are
 * requested one at a time starting at page 1 until the result page has no "next page" link,
 * or until several consecutive fetches return a blank body.
 *
 * @param word        keyword to search for
 * @param idOrName    account id or nickname; must not be {@code null} or empty
 * @param startTime   range start, placed verbatim into the {@code ft} query parameter
 * @param endTime     range end, placed verbatim into the {@code et} query parameter
 * @param proxyHolder proxy holder handed to the downloader and to {@code getOpenId}
 * @return the collected articles; never {@code null}, possibly empty
 * @throws IllegalArgumentException     if {@code idOrName} is {@code null} or empty
 * @throws UnsupportedEncodingException if UTF-8 encoding of the query values is unavailable
 * @throws Exception                    if the download fails
 */
public static List<WechatAricle> wechatKeywordSearchByAccount(String word, String idOrName, String startTime,
        String endTime, ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException {
    // A blank body leaves the page counter untouched, so without a bound the loop would
    // request the same URL forever.
    final int maxConsecutiveFailures = 3;

    List<WechatAricle> result = new ArrayList<WechatAricle>();
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    headerMap.put("Host", "weixin.sogou.com");
    if (idOrName == null || idOrName.equals("")) {
        throw new IllegalArgumentException("要检索的昵称或id不能为空");
    }

    // Without an open id there is nothing to search for; the loop below is skipped.
    String openId = getOpenId(idOrName, proxyHolder);
    boolean f = openId != null;

    // Neither value changes between pages, so encode them once.
    String encodedWord = URLEncoder.encode(word, "UTF-8");
    String encodedAccount = URLEncoder.encode(idOrName, "UTF-8");

    int consecutiveFailures = 0;
    int page = 1;
    while (f) {
        String searchUrl = "https://weixin.sogou.com/weixin?type=2&ie=utf8&query=" + encodedWord
                + "&tsn=5&ft=" + startTime + "&et=" + endTime + "&interation=&page=" + page + "&wxid=" + openId
                + "&usip=" + encodedAccount;
        headerMap.put("Referer", searchUrl);

        // Fetch the result page.
        String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxyHolder);
        if (StringUtils.isNotBlank(htmlBody)) {
            consecutiveFailures = 0;
            JXDocument jxDocument = JXDocument.create(htmlBody);

            // analysis() returns null when parsing fails as a whole; skip instead of NPE-ing.
            List<WechatAricle> parsed = analysis(jxDocument);
            if (parsed != null) {
                result.addAll(parsed);
            }

            // The "next page" anchor is absent on the last page; treat that as "no more pages".
            JXNode nextNode = jxDocument.selNOne("//a[@id='sogou_next']");
            String pageNext = nextNode == null ? "" : nextNode.asElement().text();
            if (pageNext.contains("下一页")) {
                page++;
            } else {
                f = false;
            }
        } else {
            logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
            consecutiveFailures++;
            if (consecutiveFailures >= maxConsecutiveFailures) {
                break;
            }
        }
    }
    return result;
}
/**
* 解析数据
*
* @param jxDocument
* @return
*/
private
static
List
<
WechatAricle
>
analysis
(
JXDocument
jxDocument
)
{
List
<
WechatAricle
>
result
=
new
ArrayList
<
WechatAricle
>();
// 解析数据
try
{
// 解析数据
List
<
JXNode
>
jxNodeList
=
jxDocument
.
selN
(
"//div[@class='news-box']/ul[@class='news-list']/li"
);
String
title
=
null
;
String
link
=
null
;
String
content
=
null
;
String
source
=
null
;
String
openid
=
null
;
String
putDate
=
null
;
Date
date
=
null
;
WechatAricle
wechat
=
null
;
if
(
Objects
.
nonNull
(
jxNodeList
)
&&
!
jxNodeList
.
isEmpty
())
{
for
(
JXNode
jxNode
:
jxNodeList
)
{
try
{
title
=
jxNode
.
selOne
(
"//div[@class='txt-box']/h3/a"
).
asElement
().
text
();
link
=
jxNode
.
selOne
(
"//div[@class='txt-box']/h3/a"
).
asElement
().
attr
(
"href"
);
link
=
URIUtils
.
resolve
(
"https://weixin.sogou.com"
,
link
);
if
(
Objects
.
nonNull
(
jxNode
.
selOne
(
"//div[@class='txt-box']/p"
)))
{
content
=
jxNode
.
selOne
(
"//div[@class='txt-box']/p"
).
asElement
().
text
();
}
source
=
jxNode
.
selOne
(
"//div[@class='s-p']/a"
).
asElement
().
text
();
openid
=
jxNode
.
selOne
(
"//div[@class='s-p']/a"
).
asElement
().
attr
(
"i"
);
putDate
=
jxNode
.
selOne
(
"//div[@class='s-p']"
).
asElement
().
attr
(
"t"
);
date
=
new
Date
(
Long
.
valueOf
(
putDate
)
*
1000
);
int
readNum
=
0
;
try
{
readNum
=
Integer
.
valueOf
(
jxNode
.
selOne
(
"//div[@class='s-p']/span[@class='s1']"
).
asElement
().
text
().
trim
());
}
catch
(
Exception
e
)
{
readNum
=
0
;
}
title
=
ZhiWeiTools
.
SBC2DBC
(
title
);
content
=
ZhiWeiTools
.
SBC2DBC
(
content
);
if
(
StringUtils
.
isNotBlank
(
title
))
{
wechat
=
new
WechatAricle
(
link
,
title
,
source
,
content
,
date
,
null
,
null
,
readNum
,
0
,
openid
,
"unknow"
);
result
.
add
(
wechat
);
}
}
catch
(
Exception
e
)
{
logger
.
debug
(
"解析数据出现错误:{}"
,
e
.
getMessage
());
continue
;
}
}
/**
}
* 解析数据
// logger.info("数据总页数为:{}", page);
* @param jxDocument
}
catch
(
Exception
e
)
{
* @return
logger
.
debug
(
"获取数据出现问题:{}"
,
e
.
getMessage
());
*/
return
null
;
private
static
List
<
WechatAricle
>
analysis
(
JXDocument
jxDocument
){
}
List
<
WechatAricle
>
result
=
new
ArrayList
<
WechatAricle
>();
return
result
;
// 解析数据
}
try
{
// 解析数据
List
<
JXNode
>
jxNodeList
=
jxDocument
.
selN
(
"//div[@class='news-box']/ul[@class='news-list']/li"
);
String
title
=
null
;
String
link
=
null
;
String
content
=
null
;
String
source
=
null
;
String
openid
=
null
;
String
putDate
=
null
;
Date
date
=
null
;
WechatAricle
wechat
=
null
;
if
(
Objects
.
nonNull
(
jxNodeList
)
&&
!
jxNodeList
.
isEmpty
()){
for
(
JXNode
jxNode
:
jxNodeList
)
{
try
{
title
=
jxNode
.
selOne
(
"//div[@class='txt-box']/h3/a"
).
asElement
().
text
();
link
=
jxNode
.
selOne
(
"//div[@class='txt-box']/h3/a"
).
asElement
().
attr
(
"href"
);
if
(!
link
.
contains
(
"weixin.sogou.com"
)){
link
=
"https://weixin.sogou.com"
+
link
;
}
if
(
Objects
.
nonNull
(
jxNode
.
selOne
(
"//div[@class='txt-box']/p"
)))
{
content
=
jxNode
.
selOne
(
"//div[@class='txt-box']/p"
).
asElement
().
text
();
}
source
=
jxNode
.
selOne
(
"//div[@class='s-p']/a"
).
asElement
().
text
();
openid
=
jxNode
.
selOne
(
"//div[@class='s-p']/a"
).
asElement
().
attr
(
"i"
);
putDate
=
jxNode
.
selOne
(
"//div[@class='s-p']"
).
asElement
().
attr
(
"t"
);
date
=
new
Date
(
Long
.
valueOf
(
putDate
)
*
1000
);
int
readNum
=
0
;
try
{
readNum
=
Integer
.
valueOf
(
jxNode
.
selOne
(
"//div[@class='s-p']/span[@class='s1']"
).
asElement
().
text
().
trim
());
}
catch
(
Exception
e
)
{
readNum
=
0
;
}
title
=
ZhiWeiTools
.
SBC2DBC
(
title
);
content
=
ZhiWeiTools
.
SBC2DBC
(
content
);
if
(
StringUtils
.
isNotBlank
(
title
)){
wechat
=
new
WechatAricle
(
link
,
title
,
source
,
content
,
date
,
null
,
null
,
readNum
,
0
,
openid
,
"unknow"
);
result
.
add
(
wechat
);
}
}
catch
(
Exception
e
)
{
logger
.
debug
(
"解析数据出现错误:{}"
,
e
.
getMessage
());
continue
;
}
}
}
/**
// logger.info("数据总页数为:{}", page);
* @param @param wxId
}
catch
(
Exception
e
)
{
* @param @return 设定文件
logger
.
debug
(
"获取数据出现问题:{}"
,
e
.
getMessage
());
* @return String 返回类型
return
null
;
* @Title: getOpenId
}
* @Description: 获取微信wxID
return
result
;
*/
}
public
static
String
getOpenId
(
String
idOrName
,
ProxyHolder
proxyHolder
)
{
String
openId
=
null
;
String
url
=
"https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query="
+
URLCodeUtil
.
getURLEncode
(
idOrName
,
"utf-8"
);
String
htmlBody
;
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
/**
try
{
* @Title: getOpenId
htmlBody
=
HtmlDownUtil
.
downloadHtml
(
url
,
null
,
proxyHolder
);
* @Description: 获取微信wxID
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
* @param @param
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
* wxId
openId
=
jsonObject
.
getString
(
"openid"
);
* @param @return
return
openId
;
* 设定文件
}
* @return String 返回类型
}
catch
(
Exception
e
)
{
*/
e
.
printStackTrace
();
public
static
String
getOpenId
(
String
idOrName
,
ProxyHolder
proxyHolder
)
{
openId
=
null
;
String
openId
=
null
;
}
String
url
=
"https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query="
+
URLCodeUtil
.
getURLEncode
(
idOrName
,
"utf-8"
);
}
String
htmlBody
;
return
openId
;
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
}
try
{
htmlBody
=
HtmlDownUtil
.
downloadHtml
(
url
,
null
,
proxyHolder
);
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
openId
=
jsonObject
.
getString
(
"openid"
);
return
openId
;
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
openId
=
null
;
}
}
return
openId
;
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment