Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
W
wechat
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
wechat
Commits
ab9c3fd4
Commit
ab9c3fd4
authored
Aug 25, 2020
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
处理微信链接出现重复拼接问题
parent
1cbcc794
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
308 additions
and
322 deletions
+308
-322
src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
+308
-322
No files found.
src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
View file @
ab9c3fd4
...
...
@@ -7,6 +7,7 @@ import java.net.URLEncoder;
import
java.util.*
;;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.URIUtils
;
import
com.zhiwei.wechat.util.HtmlDownUtil
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
...
...
@@ -20,347 +21,332 @@ import org.seimicrawler.xpath.JXDocument;
import
org.seimicrawler.xpath.JXNode
;
/**
* @author Bewilder Z
* @ClassName: WechatAritcleSearch
* @Description: TODO(在搜索接口根据关键词采集微信文章)
* @author Bewilder Z
* @date 2016年10月14日 上午9:40:18
*/
public
class
WechatAritcleSearch
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
WechatAritcleSearch
.
class
);
private
static
Logger
logger
=
LogManager
.
getLogger
(
WechatAritcleSearch
.
class
);
/**
* 根据关键词在搜狗微信搜索微信文章,不包含全文
*
* @param word 关键词
* @param proxy 代理
* @param pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null
* @return List<Wechat> 返回类型
* @throws Exception
* @Title: wechatKeywordSearch
*/
public
static
List
<
WechatAricle
>
wechatKeywordSearch
(
String
word
,
Proxy
proxy
,
Integer
pages
)
throws
Exception
{
List
<
WechatAricle
>
result
=
new
ArrayList
<>();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"weixin.sogou.com"
);
headerMap
.
put
(
"cookie"
,
"com_sohu_websearch_ITEM_PER_PAGE=100;"
);
boolean
f
=
true
;
int
page
=
1
;
while
(
f
)
{
String
searchUrl
=
"https://weixin.sogou.com/weixin?type=2&s_from=input&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
+
"&ie=utf8&_sug_=n&_sug_type_=&page="
+
page
;
headerMap
.
put
(
"Referer"
,
searchUrl
);
// 获取数据
try
{
String
htmlBody
=
HtmlDownUtil
.
downloadHtml
(
searchUrl
,
headerMap
,
proxy
);
// 解析数据
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
JXDocument
jxDocument
=
JXDocument
.
create
(
htmlBody
);
result
.
addAll
(
analysis
(
jxDocument
));
// 解析最大可寻页码
String
pageNext
=
jxDocument
.
selNOne
(
"//a[@id='sogou_next']"
).
asElement
().
text
();
if
(
pageNext
.
contains
(
"下一页"
))
{
page
++;
}
else
{
f
=
false
;
}
}
else
{
logger
.
info
(
"根据关键词获取微信文章失败,返回的数据结果集: {}"
,
htmlBody
);
}
if
(
pages
!=
null
&&
pages
==
page
)
{
break
;
}
}
catch
(
IOException
e
)
{
logger
.
error
(
"根据关键词获取微信文章失败,错误为: {}"
,
e
);
}
}
return
result
;
}
/**
* @param @param word 关键词
* @param @param tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* 5(某一时间段内与startTime和endTime配合使用)
* @param @param startTime 开始时间 格式为yyyy-MM-dd
* @param @param endTime 结束时间 格式为yyyy-MM-dd
* @param @return
* @param @throws ZhiWeiException
* @param @throws UnsupportedEncodingException 设定文件
* @return List<Wechat> 返回类型
* @Title: wechatKeywordSearch
* @Description: 根据关键词在搜狗微信搜索微信文章, 包含全文
*/
public
static
List
<
WechatAricle
>
wechatKeywordSearch
(
String
word
,
Proxy
proxy
,
ProxyHolder
proxyHolder
)
throws
Exception
{
List
<
WechatAricle
>
result
=
new
ArrayList
<
WechatAricle
>();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"weixin.sogou.com"
);
headerMap
.
put
(
"cookie"
,
"com_sohu_websearch_ITEM_PER_PAGE=100;"
);
boolean
f
=
true
;
int
page
=
1
;
while
(
f
)
{
String
searchUrl
=
"https://weixin.sogou.com/weixin?type=2&s_from=input&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
+
"&ie=utf8&_sug_=n&_sug_type_=&page="
+
page
;
headerMap
.
put
(
"Referer"
,
searchUrl
);
// 获取数据
String
htmlBody
=
HtmlDownUtil
.
downloadHtml
(
searchUrl
,
headerMap
,
proxy
);
// 解析数据
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
JXDocument
jxDocument
=
JXDocument
.
create
(
htmlBody
);
result
.
addAll
(
analysis
(
jxDocument
));
// 解析最大可寻页码
String
pageNext
=
jxDocument
.
selNOne
(
"//a[@id='sogou_next']"
).
asElement
().
text
();
if
(
pageNext
.
contains
(
"下一页"
))
{
page
++;
}
else
{
f
=
false
;
}
}
else
{
logger
.
info
(
"根据关键词获取微信文章失败,返回的数据结果集: {}"
,
htmlBody
);
}
}
return
result
;
}
/**
* 获取全文及来源
*
* @param url
* @param proxy
* @param wechatAricle
* @return
* @throws IOException
*/
private
static
WechatAricle
getWechatAricleInfo
(
String
url
,
ProxyHolder
proxy
,
WechatAricle
wechatAricle
)
{
try
{
String
contentHtml
=
HtmlDownUtil
.
downloadHtml
(
url
,
HeaderTool
.
getCommonHead
(),
proxy
.
getProxy
());
String
content
=
null
;
String
time
=
null
;
String
source
=
null
;
String
biz
=
null
;
String
title
=
null
;
String
user_name
=
null
;
String
wxId
=
null
;
List
<
String
>
imgUrls
=
null
;
String
rootSource
=
null
;
if
(
contentHtml
!=
null
)
{
JXDocument
jxDocument
=
JXDocument
.
create
(
contentHtml
);
title
=
jxDocument
.
selNOne
(
"//h2[@id='activity-name']"
).
asElement
().
text
();
wxId
=
jxDocument
.
selNOne
(
"//p[@class='profile_meta'][1]/span[@class='profile_meta_value']"
).
asElement
().
text
();
/**
* 根据关键词在搜狗微信搜索微信文章,不包含全文
* @Title: wechatKeywordSearch
* @param
* word 关键词
* @param
* proxy 代理
* @param
* pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null
* @throws
* Exception
* @return List<Wechat> 返回类型
*/
public
static
List
<
WechatAricle
>
wechatKeywordSearch
(
String
word
,
Proxy
proxy
,
Integer
pages
)
throws
Exception
{
List
<
WechatAricle
>
result
=
new
ArrayList
<>();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"weixin.sogou.com"
);
headerMap
.
put
(
"cookie"
,
"com_sohu_websearch_ITEM_PER_PAGE=100;"
);
boolean
f
=
true
;
int
page
=
1
;
if
(
contentHtml
.
contains
(
"js_content"
))
{
content
=
jxDocument
.
selNOne
(
"//div[@id='js_content']"
).
asElement
().
text
();
}
else
if
(
contentHtml
.
contains
(
"js_share_content"
))
{
content
=
jxDocument
.
selNOne
(
"//div[@id='js_share_content']"
).
asElement
().
text
();
}
if
(
contentHtml
.
contains
(
"content_tpl"
))
{
String
text
=
jxDocument
.
selNOne
(
"//script[@id='content_tpl']"
).
asElement
().
text
();
content
=
Jsoup
.
parse
(
text
).
text
();
}
//解析文章图片地址
if
(
Objects
.
nonNull
(
jxDocument
.
selN
(
"//div[@id='js_content']//img"
)))
{
imgUrls
=
new
ArrayList
<>();
List
<
JXNode
>
imgNodeList
=
jxDocument
.
selN
(
"//div[@id='js_content']//img"
);
for
(
JXNode
imgNode
:
imgNodeList
)
{
String
imgUrl
=
imgNode
.
selOne
(
"//img"
).
asElement
().
attr
(
"href"
);
imgUrls
.
add
(
imgUrl
);
}
}
//解析来源
if
(
Objects
.
nonNull
(
jxDocument
.
selNOne
(
"//span[@id='copyright_logo']"
)))
{
rootSource
=
jxDocument
.
selNOne
(
"//span[@id='profileBt']/a[@id='js_name']"
).
asElement
().
text
();
}
while
(
f
)
{
String
searchUrl
=
"https://weixin.sogou.com/weixin?type=2&s_from=input&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"&ie=utf8&_sug_=n&_sug_type_=&page="
+
page
;
headerMap
.
put
(
"Referer"
,
searchUrl
);
// 获取数据
try
{
String
htmlBody
=
HtmlDownUtil
.
downloadHtml
(
searchUrl
,
headerMap
,
proxy
);
// 解析数据
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
JXDocument
jxDocument
=
JXDocument
.
create
(
htmlBody
);
result
.
addAll
(
analysis
(
jxDocument
));
// 解析最大可寻页码
String
pageNext
=
jxDocument
.
selNOne
(
"//a[@id='sogou_next']"
).
asElement
().
text
();
if
(
pageNext
.
contains
(
"下一页"
))
{
page
++;
}
else
{
f
=
false
;
}
}
else
{
logger
.
info
(
"根据关键词获取微信文章失败,返回的数据结果集: {}"
,
htmlBody
);
}
if
(
pages
!=
null
&&
pages
==
page
)
{
break
;
}
}
catch
(
IOException
e
){
logger
.
error
(
"根据关键词获取微信文章失败,错误为: {}"
,
e
);
}
}
return
result
;
}
if
(
contentHtml
.
contains
(
"d.nick_name = "
))
{
time
=
contentHtml
.
split
(
"d.ct = \""
)[
1
].
split
(
"\";"
)[
0
];
source
=
contentHtml
.
split
(
"d.nick_name = \""
)[
1
].
split
(
"\";"
)[
0
];
biz
=
contentHtml
.
split
(
"d.biz = \""
)[
1
].
split
(
"\""
)[
0
];
user_name
=
contentHtml
.
split
(
"d.user_name = \""
)[
1
].
split
(
"\""
)[
0
];
}
else
if
(
contentHtml
.
contains
(
"var nickname = "
))
{
time
=
contentHtml
.
split
(
"var ct = \""
)[
1
].
split
(
"\";"
)[
0
];
source
=
contentHtml
.
split
(
"var nickname = \""
)[
1
].
split
(
"\";"
)[
0
];
biz
=
contentHtml
.
split
(
"var appuin = \"\"||\""
)[
1
].
split
(
"\""
)[
0
];
user_name
=
contentHtml
.
split
(
"var user_name = \""
)[
1
].
split
(
"\""
)[
0
];
}
}
if
(
wechatAricle
==
null
)
{
wechatAricle
=
new
WechatAricle
();
wechatAricle
.
setTitle
(
title
);
wechatAricle
.
setTime
(
new
Date
(
Long
.
valueOf
(
time
)
*
1000
));
wechatAricle
.
setSource
(
source
);
}
/**
*
* @Title: wechatKeywordSearch
* @Description: 根据关键词在搜狗微信搜索微信文章,包含全文
* @param @param
* word 关键词
* @param @param
* tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* 5(某一时间段内与startTime和endTime配合使用)
* @param @param
* startTime 开始时间 格式为yyyy-MM-dd
* @param @param
* endTime 结束时间 格式为yyyy-MM-dd
* @param @return
* @param @throws
* ZhiWeiException
* @param @throws
* UnsupportedEncodingException 设定文件
* @return List<Wechat> 返回类型
*/
public
static
List
<
WechatAricle
>
wechatKeywordSearch
(
String
word
,
Proxy
proxy
,
ProxyHolder
proxyHolder
)
throws
Exception
{
List
<
WechatAricle
>
result
=
new
ArrayList
<
WechatAricle
>();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"weixin.sogou.com"
);
headerMap
.
put
(
"cookie"
,
"com_sohu_websearch_ITEM_PER_PAGE=100;"
);
boolean
f
=
true
;
int
page
=
1
;
while
(
f
)
{
String
searchUrl
=
"https://weixin.sogou.com/weixin?type=2&s_from=input&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"&ie=utf8&_sug_=n&_sug_type_=&page="
+
page
;
headerMap
.
put
(
"Referer"
,
searchUrl
);
// 获取数据
String
htmlBody
=
HtmlDownUtil
.
downloadHtml
(
searchUrl
,
headerMap
,
proxy
);
// 解析数据
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
JXDocument
jxDocument
=
JXDocument
.
create
(
htmlBody
);
result
.
addAll
(
analysis
(
jxDocument
));
// 解析最大可寻页码
String
pageNext
=
jxDocument
.
selNOne
(
"//a[@id='sogou_next']"
).
asElement
().
text
();
if
(
pageNext
.
contains
(
"下一页"
))
{
page
++;
}
else
{
f
=
false
;
}
}
else
{
logger
.
info
(
"根据关键词获取微信文章失败,返回的数据结果集: {}"
,
htmlBody
);
}
}
return
result
;
}
wechatAricle
.
setImgUrls
(
imgUrls
);
wechatAricle
.
setRootSource
(
rootSource
);
wechatAricle
.
setBiz
(
biz
);
wechatAricle
.
setContent
(
content
);
wechatAricle
.
setWxId
(
wxId
);
wechatAricle
.
setUser_name
(
user_name
);
/**
* 获取全文及来源
* @param url
* @param proxy
* @param wechatAricle
* @return
* @throws IOException
*/
private
static
WechatAricle
getWechatAricleInfo
(
String
url
,
ProxyHolder
proxy
,
WechatAricle
wechatAricle
){
try
{
String
contentHtml
=
HtmlDownUtil
.
downloadHtml
(
url
,
HeaderTool
.
getCommonHead
(),
proxy
.
getProxy
());
String
content
=
null
;
String
time
=
null
;
String
source
=
null
;
String
biz
=
null
;
String
title
=
null
;
String
user_name
=
null
;
String
wxId
=
null
;
List
<
String
>
imgUrls
=
null
;
String
rootSource
=
null
;
if
(
contentHtml
!=
null
){
JXDocument
jxDocument
=
JXDocument
.
create
(
contentHtml
);
title
=
jxDocument
.
selNOne
(
"//h2[@id='activity-name']"
).
asElement
().
text
();
wxId
=
jxDocument
.
selNOne
(
"//p[@class='profile_meta'][1]/span[@class='profile_meta_value']"
).
asElement
().
text
();
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
wechatAricle
;
}
return
wechatAricle
;
}
if
(
contentHtml
.
contains
(
"js_content"
)){
content
=
jxDocument
.
selNOne
(
"//div[@id='js_content']"
).
asElement
().
text
();
}
else
if
(
contentHtml
.
contains
(
"js_share_content"
)){
content
=
jxDocument
.
selNOne
(
"//div[@id='js_share_content']"
).
asElement
().
text
();
}
if
(
contentHtml
.
contains
(
"content_tpl"
)){
String
text
=
jxDocument
.
selNOne
(
"//script[@id='content_tpl']"
).
asElement
().
text
();
content
=
Jsoup
.
parse
(
text
).
text
();
}
//解析文章图片地址
if
(
Objects
.
nonNull
(
jxDocument
.
selN
(
"//div[@id='js_content']//img"
))){
imgUrls
=
new
ArrayList
<>();
List
<
JXNode
>
imgNodeList
=
jxDocument
.
selN
(
"//div[@id='js_content']//img"
);
for
(
JXNode
imgNode
:
imgNodeList
){
String
imgUrl
=
imgNode
.
selOne
(
"//img"
).
asElement
().
attr
(
"href"
);
imgUrls
.
add
(
imgUrl
);
}
}
//解析来源
if
(
Objects
.
nonNull
(
jxDocument
.
selNOne
(
"//span[@id='copyright_logo']"
))){
rootSource
=
jxDocument
.
selNOne
(
"//span[@id='profileBt']/a[@id='js_name']"
).
asElement
().
text
();
}
/**
* 根据关键词采集指定时间+账号的数据
*
* @param word
* @param idOrName
* @param startTime
* @param endTime
* @param proxyHolder
* @return
* @throws Exception
* @throws UnsupportedEncodingException
*/
public
static
List
<
WechatAricle
>
wechatKeywordSearchByAccount
(
String
word
,
String
idOrName
,
String
startTime
,
String
endTime
,
ProxyHolder
proxyHolder
)
throws
Exception
,
UnsupportedEncodingException
{
List
<
WechatAricle
>
result
=
new
ArrayList
<
WechatAricle
>();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"weixin.sogou.com"
);
if
(
idOrName
==
null
||
idOrName
.
equals
(
""
))
{
throw
new
IllegalArgumentException
(
"要检索的昵称或id不能为空"
);
}
String
openId
=
getOpenId
(
idOrName
,
proxyHolder
);
boolean
f
=
false
;
if
(
openId
!=
null
)
{
f
=
true
;
}
int
page
=
1
;
if
(
contentHtml
.
contains
(
"d.nick_name = "
)){
time
=
contentHtml
.
split
(
"d.ct = \""
)[
1
].
split
(
"\";"
)[
0
];
source
=
contentHtml
.
split
(
"d.nick_name = \""
)[
1
].
split
(
"\";"
)[
0
];
biz
=
contentHtml
.
split
(
"d.biz = \""
)[
1
].
split
(
"\""
)[
0
];
user_name
=
contentHtml
.
split
(
"d.user_name = \""
)[
1
].
split
(
"\""
)[
0
];
}
else
if
(
contentHtml
.
contains
(
"var nickname = "
)){
time
=
contentHtml
.
split
(
"var ct = \""
)[
1
].
split
(
"\";"
)[
0
];
source
=
contentHtml
.
split
(
"var nickname = \""
)[
1
].
split
(
"\";"
)[
0
];
biz
=
contentHtml
.
split
(
"var appuin = \"\"||\""
)[
1
].
split
(
"\""
)[
0
];
user_name
=
contentHtml
.
split
(
"var user_name = \""
)[
1
].
split
(
"\""
)[
0
];
}
}
if
(
wechatAricle
==
null
)
{
wechatAricle
=
new
WechatAricle
();
wechatAricle
.
setTitle
(
title
);
wechatAricle
.
setTime
(
new
Date
(
Long
.
valueOf
(
time
)*
1000
));
wechatAricle
.
setSource
(
source
);
}
while
(
f
)
{
String
searchUrl
=
"https://weixin.sogou.com/weixin?type=2&ie=utf8&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
+
"&tsn=5&ft="
+
startTime
+
"&et="
+
endTime
+
"&interation=&page="
+
page
+
"&wxid="
+
openId
+
"&usip="
+
URLEncoder
.
encode
(
idOrName
,
"UTF-8"
);
wechatAricle
.
setImgUrls
(
imgUrls
);
wechatAricle
.
setRootSource
(
rootSource
);
wechatAricle
.
setBiz
(
biz
);
wechatAricle
.
setContent
(
content
);
wechatAricle
.
setWxId
(
wxId
);
wechatAricle
.
setUser_name
(
user_name
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
wechatAricle
;
}
return
wechatAricle
;
}
/**
* 根据关键词采集指定时间+账号的数据
* @param word
* @param idOrName
* @param startTime
* @param endTime
* @param proxyHolder
* @return
* @throws Exception
* @throws UnsupportedEncodingException
*/
public
static
List
<
WechatAricle
>
wechatKeywordSearchByAccount
(
String
word
,
String
idOrName
,
String
startTime
,
String
endTime
,
ProxyHolder
proxyHolder
)
throws
Exception
,
UnsupportedEncodingException
{
List
<
WechatAricle
>
result
=
new
ArrayList
<
WechatAricle
>();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"weixin.sogou.com"
);
if
(
idOrName
==
null
||
idOrName
.
equals
(
""
)){
throw
new
IllegalArgumentException
(
"要检索的昵称或id不能为空"
);
}
String
openId
=
getOpenId
(
idOrName
,
proxyHolder
);
boolean
f
=
false
;
if
(
openId
!=
null
){
f
=
true
;
}
int
page
=
1
;
headerMap
.
put
(
"Referer"
,
searchUrl
);
// 获取数据
String
htmlBody
=
HtmlDownUtil
.
downloadHtml
(
searchUrl
,
headerMap
,
proxyHolder
);
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
JXDocument
jxDocument
=
JXDocument
.
create
(
htmlBody
);
result
.
addAll
(
analysis
(
jxDocument
));
// 解析最大可寻页码
String
pageNext
=
jxDocument
.
selNOne
(
"//a[@id='sogou_next']"
).
asElement
().
text
();
if
(
pageNext
.
contains
(
"下一页"
))
{
page
++;
}
else
{
f
=
false
;
}
}
else
{
logger
.
info
(
"根据关键词获取微信文章失败,返回的数据结果集: {}"
,
htmlBody
);
}
}
return
result
;
}
while
(
f
)
{
String
searchUrl
=
"https://weixin.sogou.com/weixin?type=2&ie=utf8&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
+
"&tsn=5&ft="
+
startTime
+
"&et="
+
endTime
+
"&interation=&page="
+
page
+
"&wxid="
+
openId
+
"&usip="
+
URLEncoder
.
encode
(
idOrName
,
"UTF-8"
);
headerMap
.
put
(
"Referer"
,
searchUrl
);
// 获取数据
String
htmlBody
=
HtmlDownUtil
.
downloadHtml
(
searchUrl
,
headerMap
,
proxyHolder
);
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
JXDocument
jxDocument
=
JXDocument
.
create
(
htmlBody
);
result
.
addAll
(
analysis
(
jxDocument
));
// 解析最大可寻页码
String
pageNext
=
jxDocument
.
selNOne
(
"//a[@id='sogou_next']"
).
asElement
().
text
();
if
(
pageNext
.
contains
(
"下一页"
))
{
page
++;
}
else
{
f
=
false
;
}
}
else
{
logger
.
info
(
"根据关键词获取微信文章失败,返回的数据结果集: {}"
,
htmlBody
);
}
}
return
result
;
}
/**
* 解析数据
*
* @param jxDocument
* @return
*/
private
static
List
<
WechatAricle
>
analysis
(
JXDocument
jxDocument
)
{
List
<
WechatAricle
>
result
=
new
ArrayList
<
WechatAricle
>();
// 解析数据
try
{
// 解析数据
List
<
JXNode
>
jxNodeList
=
jxDocument
.
selN
(
"//div[@class='news-box']/ul[@class='news-list']/li"
);
String
title
=
null
;
String
link
=
null
;
String
content
=
null
;
String
source
=
null
;
String
openid
=
null
;
String
putDate
=
null
;
Date
date
=
null
;
WechatAricle
wechat
=
null
;
if
(
Objects
.
nonNull
(
jxNodeList
)
&&
!
jxNodeList
.
isEmpty
())
{
for
(
JXNode
jxNode
:
jxNodeList
)
{
try
{
title
=
jxNode
.
selOne
(
"//div[@class='txt-box']/h3/a"
).
asElement
().
text
();
link
=
jxNode
.
selOne
(
"//div[@class='txt-box']/h3/a"
).
asElement
().
attr
(
"href"
);
link
=
URIUtils
.
resolve
(
"https://weixin.sogou.com"
,
link
);
if
(
Objects
.
nonNull
(
jxNode
.
selOne
(
"//div[@class='txt-box']/p"
)))
{
content
=
jxNode
.
selOne
(
"//div[@class='txt-box']/p"
).
asElement
().
text
();
}
source
=
jxNode
.
selOne
(
"//div[@class='s-p']/a"
).
asElement
().
text
();
openid
=
jxNode
.
selOne
(
"//div[@class='s-p']/a"
).
asElement
().
attr
(
"i"
);
putDate
=
jxNode
.
selOne
(
"//div[@class='s-p']"
).
asElement
().
attr
(
"t"
);
date
=
new
Date
(
Long
.
valueOf
(
putDate
)
*
1000
);
int
readNum
=
0
;
try
{
readNum
=
Integer
.
valueOf
(
jxNode
.
selOne
(
"//div[@class='s-p']/span[@class='s1']"
).
asElement
().
text
().
trim
());
}
catch
(
Exception
e
)
{
readNum
=
0
;
}
title
=
ZhiWeiTools
.
SBC2DBC
(
title
);
content
=
ZhiWeiTools
.
SBC2DBC
(
content
);
if
(
StringUtils
.
isNotBlank
(
title
))
{
wechat
=
new
WechatAricle
(
link
,
title
,
source
,
content
,
date
,
null
,
null
,
readNum
,
0
,
openid
,
"unknow"
);
result
.
add
(
wechat
);
}
}
catch
(
Exception
e
)
{
logger
.
debug
(
"解析数据出现错误:{}"
,
e
.
getMessage
());
continue
;
}
}
/**
* 解析数据
* @param jxDocument
* @return
*/
private
static
List
<
WechatAricle
>
analysis
(
JXDocument
jxDocument
){
List
<
WechatAricle
>
result
=
new
ArrayList
<
WechatAricle
>();
// 解析数据
try
{
// 解析数据
List
<
JXNode
>
jxNodeList
=
jxDocument
.
selN
(
"//div[@class='news-box']/ul[@class='news-list']/li"
);
String
title
=
null
;
String
link
=
null
;
String
content
=
null
;
String
source
=
null
;
String
openid
=
null
;
String
putDate
=
null
;
Date
date
=
null
;
WechatAricle
wechat
=
null
;
if
(
Objects
.
nonNull
(
jxNodeList
)
&&
!
jxNodeList
.
isEmpty
()){
for
(
JXNode
jxNode
:
jxNodeList
)
{
try
{
title
=
jxNode
.
selOne
(
"//div[@class='txt-box']/h3/a"
).
asElement
().
text
();
link
=
jxNode
.
selOne
(
"//div[@class='txt-box']/h3/a"
).
asElement
().
attr
(
"href"
);
if
(!
link
.
contains
(
"weixin.sogou.com"
)){
link
=
"https://weixin.sogou.com"
+
link
;
}
if
(
Objects
.
nonNull
(
jxNode
.
selOne
(
"//div[@class='txt-box']/p"
)))
{
content
=
jxNode
.
selOne
(
"//div[@class='txt-box']/p"
).
asElement
().
text
();
}
source
=
jxNode
.
selOne
(
"//div[@class='s-p']/a"
).
asElement
().
text
();
openid
=
jxNode
.
selOne
(
"//div[@class='s-p']/a"
).
asElement
().
attr
(
"i"
);
putDate
=
jxNode
.
selOne
(
"//div[@class='s-p']"
).
asElement
().
attr
(
"t"
);
date
=
new
Date
(
Long
.
valueOf
(
putDate
)
*
1000
);
int
readNum
=
0
;
try
{
readNum
=
Integer
.
valueOf
(
jxNode
.
selOne
(
"//div[@class='s-p']/span[@class='s1']"
).
asElement
().
text
().
trim
());
}
catch
(
Exception
e
)
{
readNum
=
0
;
}
title
=
ZhiWeiTools
.
SBC2DBC
(
title
);
content
=
ZhiWeiTools
.
SBC2DBC
(
content
);
if
(
StringUtils
.
isNotBlank
(
title
)){
wechat
=
new
WechatAricle
(
link
,
title
,
source
,
content
,
date
,
null
,
null
,
readNum
,
0
,
openid
,
"unknow"
);
result
.
add
(
wechat
);
}
}
catch
(
Exception
e
)
{
logger
.
debug
(
"解析数据出现错误:{}"
,
e
.
getMessage
());
continue
;
}
}
}
// logger.info("数据总页数为:{}", page);
}
catch
(
Exception
e
)
{
logger
.
debug
(
"获取数据出现问题:{}"
,
e
.
getMessage
());
return
null
;
}
return
result
;
}
}
// logger.info("数据总页数为:{}", page);
}
catch
(
Exception
e
)
{
logger
.
debug
(
"获取数据出现问题:{}"
,
e
.
getMessage
());
return
null
;
}
return
result
;
}
/**
* @param @param wxId
* @param @return 设定文件
* @return String 返回类型
* @Title: getOpenId
* @Description: 获取微信wxID
*/
public
static
String
getOpenId
(
String
idOrName
,
ProxyHolder
proxyHolder
)
{
String
openId
=
null
;
String
url
=
"https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query="
+
URLCodeUtil
.
getURLEncode
(
idOrName
,
"utf-8"
);
String
htmlBody
;
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
/**
* @Title: getOpenId
* @Description: 获取微信wxID
* @param @param
* wxId
* @param @return
* 设定文件
* @return String 返回类型
*/
public
static
String
getOpenId
(
String
idOrName
,
ProxyHolder
proxyHolder
)
{
String
openId
=
null
;
String
url
=
"https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query="
+
URLCodeUtil
.
getURLEncode
(
idOrName
,
"utf-8"
);
String
htmlBody
;
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
try
{
htmlBody
=
HtmlDownUtil
.
downloadHtml
(
url
,
null
,
proxyHolder
);
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
openId
=
jsonObject
.
getString
(
"openid"
);
return
openId
;
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
openId
=
null
;
}
}
return
openId
;
}
try
{
htmlBody
=
HtmlDownUtil
.
downloadHtml
(
url
,
null
,
proxyHolder
);
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
openId
=
jsonObject
.
getString
(
"openid"
);
return
openId
;
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
openId
=
null
;
}
}
return
openId
;
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment