Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
W
wechat
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
wechat
Commits
ab9c3fd4
Commit
ab9c3fd4
authored
Aug 25, 2020
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
处理微信链接出现重复拼接问题
parent
1cbcc794
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
51 additions
and
65 deletions
+51
-65
src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
+51
-65
No files found.
src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
View file @
ab9c3fd4
...
...
@@ -7,6 +7,7 @@ import java.net.URLEncoder;
import
java.util.*
;;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.URIUtils
;
import
com.zhiwei.wechat.util.HtmlDownUtil
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
...
...
@@ -20,9 +21,9 @@ import org.seimicrawler.xpath.JXDocument;
import
org.seimicrawler.xpath.JXNode
;
/**
* @author Bewilder Z
* @ClassName: WechatAritcleSearch
* @Description: TODO(在搜索接口根据关键词采集微信文章)
* @author Bewilder Z
* @date 2016年10月14日 上午9:40:18
*/
public
class
WechatAritcleSearch
{
...
...
@@ -31,18 +32,15 @@ public class WechatAritcleSearch {
/**
* 根据关键词在搜狗微信搜索微信文章,不包含全文
* @Title: wechatKeywordSearch
* @param
* word 关键词
* @param
* proxy 代理
* @param
* pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null
* @throws
* Exception
*
* @param word 关键词
* @param proxy 代理
* @param pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null
* @return List<Wechat> 返回类型
* @throws Exception
* @Title: wechatKeywordSearch
*/
public
static
List
<
WechatAricle
>
wechatKeywordSearch
(
String
word
,
Proxy
proxy
,
Integer
pages
)
throws
Exception
{
public
static
List
<
WechatAricle
>
wechatKeywordSearch
(
String
word
,
Proxy
proxy
,
Integer
pages
)
throws
Exception
{
List
<
WechatAricle
>
result
=
new
ArrayList
<>();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"weixin.sogou.com"
);
...
...
@@ -51,10 +49,10 @@ public class WechatAritcleSearch {
int
page
=
1
;
while
(
f
)
{
String
searchUrl
=
"https://weixin.sogou.com/weixin?type=2&s_from=input&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"&ie=utf8&_sug_=n&_sug_type_=&page="
+
page
;
String
searchUrl
=
"https://weixin.sogou.com/weixin?type=2&s_from=input&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
+
"&ie=utf8&_sug_=n&_sug_type_=&page="
+
page
;
headerMap
.
put
(
"Referer"
,
searchUrl
);
// 获取数据
try
{
try
{
String
htmlBody
=
HtmlDownUtil
.
downloadHtml
(
searchUrl
,
headerMap
,
proxy
);
// 解析数据
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
...
...
@@ -70,10 +68,10 @@ public class WechatAritcleSearch {
}
else
{
logger
.
info
(
"根据关键词获取微信文章失败,返回的数据结果集: {}"
,
htmlBody
);
}
if
(
pages
!=
null
&&
pages
==
page
)
{
if
(
pages
!=
null
&&
pages
==
page
)
{
break
;
}
}
catch
(
IOException
e
)
{
}
catch
(
IOException
e
)
{
logger
.
error
(
"根据关键词获取微信文章失败,错误为: {}"
,
e
);
}
...
...
@@ -82,28 +80,18 @@ public class WechatAritcleSearch {
}
/**
*
* @Title: wechatKeywordSearch
* @Description: 根据关键词在搜狗微信搜索微信文章,包含全文
* @param @param
* word 关键词
* @param @param
* tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* @param @param word 关键词
* @param @param tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* 5(某一时间段内与startTime和endTime配合使用)
* @param @param
* startTime 开始时间 格式为yyyy-MM-dd
* @param @param
* endTime 结束时间 格式为yyyy-MM-dd
* @param @param startTime 开始时间 格式为yyyy-MM-dd
* @param @param endTime 结束时间 格式为yyyy-MM-dd
* @param @return
* @param @throws
* ZhiWeiException
* @param @throws
* UnsupportedEncodingException 设定文件
* @param @throws ZhiWeiException
* @param @throws UnsupportedEncodingException 设定文件
* @return List<Wechat> 返回类型
* @Title: wechatKeywordSearch
* @Description: 根据关键词在搜狗微信搜索微信文章, 包含全文
*/
public
static
List
<
WechatAricle
>
wechatKeywordSearch
(
String
word
,
Proxy
proxy
,
ProxyHolder
proxyHolder
)
throws
Exception
{
...
...
@@ -114,7 +102,7 @@ public class WechatAritcleSearch {
boolean
f
=
true
;
int
page
=
1
;
while
(
f
)
{
String
searchUrl
=
"https://weixin.sogou.com/weixin?type=2&s_from=input&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"&ie=utf8&_sug_=n&_sug_type_=&page="
+
page
;
String
searchUrl
=
"https://weixin.sogou.com/weixin?type=2&s_from=input&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
+
"&ie=utf8&_sug_=n&_sug_type_=&page="
+
page
;
headerMap
.
put
(
"Referer"
,
searchUrl
);
// 获取数据
String
htmlBody
=
HtmlDownUtil
.
downloadHtml
(
searchUrl
,
headerMap
,
proxy
);
...
...
@@ -138,13 +126,14 @@ public class WechatAritcleSearch {
/**
* 获取全文及来源
*
* @param url
* @param proxy
* @param wechatAricle
* @return
* @throws IOException
*/
private
static
WechatAricle
getWechatAricleInfo
(
String
url
,
ProxyHolder
proxy
,
WechatAricle
wechatAricle
)
{
private
static
WechatAricle
getWechatAricleInfo
(
String
url
,
ProxyHolder
proxy
,
WechatAricle
wechatAricle
)
{
try
{
String
contentHtml
=
HtmlDownUtil
.
downloadHtml
(
url
,
HeaderTool
.
getCommonHead
(),
proxy
.
getProxy
());
String
content
=
null
;
...
...
@@ -156,51 +145,51 @@ public class WechatAritcleSearch {
String
wxId
=
null
;
List
<
String
>
imgUrls
=
null
;
String
rootSource
=
null
;
if
(
contentHtml
!=
null
)
{
if
(
contentHtml
!=
null
)
{
JXDocument
jxDocument
=
JXDocument
.
create
(
contentHtml
);
title
=
jxDocument
.
selNOne
(
"//h2[@id='activity-name']"
).
asElement
().
text
();
wxId
=
jxDocument
.
selNOne
(
"//p[@class='profile_meta'][1]/span[@class='profile_meta_value']"
).
asElement
().
text
();
if
(
contentHtml
.
contains
(
"js_content"
))
{
if
(
contentHtml
.
contains
(
"js_content"
))
{
content
=
jxDocument
.
selNOne
(
"//div[@id='js_content']"
).
asElement
().
text
();
}
else
if
(
contentHtml
.
contains
(
"js_share_content"
))
{
}
else
if
(
contentHtml
.
contains
(
"js_share_content"
))
{
content
=
jxDocument
.
selNOne
(
"//div[@id='js_share_content']"
).
asElement
().
text
();
}
if
(
contentHtml
.
contains
(
"content_tpl"
))
{
if
(
contentHtml
.
contains
(
"content_tpl"
))
{
String
text
=
jxDocument
.
selNOne
(
"//script[@id='content_tpl']"
).
asElement
().
text
();
content
=
Jsoup
.
parse
(
text
).
text
();
}
//解析文章图片地址
if
(
Objects
.
nonNull
(
jxDocument
.
selN
(
"//div[@id='js_content']//img"
)))
{
if
(
Objects
.
nonNull
(
jxDocument
.
selN
(
"//div[@id='js_content']//img"
)))
{
imgUrls
=
new
ArrayList
<>();
List
<
JXNode
>
imgNodeList
=
jxDocument
.
selN
(
"//div[@id='js_content']//img"
);
for
(
JXNode
imgNode
:
imgNodeList
)
{
for
(
JXNode
imgNode
:
imgNodeList
)
{
String
imgUrl
=
imgNode
.
selOne
(
"//img"
).
asElement
().
attr
(
"href"
);
imgUrls
.
add
(
imgUrl
);
}
}
//解析来源
if
(
Objects
.
nonNull
(
jxDocument
.
selNOne
(
"//span[@id='copyright_logo']"
)))
{
if
(
Objects
.
nonNull
(
jxDocument
.
selNOne
(
"//span[@id='copyright_logo']"
)))
{
rootSource
=
jxDocument
.
selNOne
(
"//span[@id='profileBt']/a[@id='js_name']"
).
asElement
().
text
();
}
if
(
contentHtml
.
contains
(
"d.nick_name = "
))
{
if
(
contentHtml
.
contains
(
"d.nick_name = "
))
{
time
=
contentHtml
.
split
(
"d.ct = \""
)[
1
].
split
(
"\";"
)[
0
];
source
=
contentHtml
.
split
(
"d.nick_name = \""
)[
1
].
split
(
"\";"
)[
0
];
biz
=
contentHtml
.
split
(
"d.biz = \""
)[
1
].
split
(
"\""
)[
0
];
user_name
=
contentHtml
.
split
(
"d.user_name = \""
)[
1
].
split
(
"\""
)[
0
];
}
else
if
(
contentHtml
.
contains
(
"var nickname = "
))
{
}
else
if
(
contentHtml
.
contains
(
"var nickname = "
))
{
time
=
contentHtml
.
split
(
"var ct = \""
)[
1
].
split
(
"\";"
)[
0
];
source
=
contentHtml
.
split
(
"var nickname = \""
)[
1
].
split
(
"\";"
)[
0
];
biz
=
contentHtml
.
split
(
"var appuin = \"\"||\""
)[
1
].
split
(
"\""
)[
0
];
user_name
=
contentHtml
.
split
(
"var user_name = \""
)[
1
].
split
(
"\""
)[
0
];
}
}
if
(
wechatAricle
==
null
)
{
if
(
wechatAricle
==
null
)
{
wechatAricle
=
new
WechatAricle
();
wechatAricle
.
setTitle
(
title
);
wechatAricle
.
setTime
(
new
Date
(
Long
.
valueOf
(
time
)*
1000
));
wechatAricle
.
setTime
(
new
Date
(
Long
.
valueOf
(
time
)
*
1000
));
wechatAricle
.
setSource
(
source
);
}
...
...
@@ -219,9 +208,9 @@ public class WechatAritcleSearch {
}
/**
* 根据关键词采集指定时间+账号的数据
*
* @param word
* @param idOrName
* @param startTime
...
...
@@ -236,20 +225,20 @@ public class WechatAritcleSearch {
List
<
WechatAricle
>
result
=
new
ArrayList
<
WechatAricle
>();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"weixin.sogou.com"
);
if
(
idOrName
==
null
||
idOrName
.
equals
(
""
))
{
if
(
idOrName
==
null
||
idOrName
.
equals
(
""
))
{
throw
new
IllegalArgumentException
(
"要检索的昵称或id不能为空"
);
}
String
openId
=
getOpenId
(
idOrName
,
proxyHolder
);
boolean
f
=
false
;
if
(
openId
!=
null
)
{
if
(
openId
!=
null
)
{
f
=
true
;
}
int
page
=
1
;
while
(
f
)
{
String
searchUrl
=
"https://weixin.sogou.com/weixin?type=2&ie=utf8&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
+
"&tsn=5&ft="
+
startTime
+
"&et="
+
endTime
+
"&interation=&page="
+
page
+
"&wxid="
+
openId
+
"&usip="
+
URLEncoder
.
encode
(
idOrName
,
"UTF-8"
);
+
"&tsn=5&ft="
+
startTime
+
"&et="
+
endTime
+
"&interation=&page="
+
page
+
"&wxid="
+
openId
+
"&usip="
+
URLEncoder
.
encode
(
idOrName
,
"UTF-8"
);
headerMap
.
put
(
"Referer"
,
searchUrl
);
// 获取数据
...
...
@@ -274,10 +263,11 @@ public class WechatAritcleSearch {
/**
* 解析数据
*
* @param jxDocument
* @return
*/
private
static
List
<
WechatAricle
>
analysis
(
JXDocument
jxDocument
)
{
private
static
List
<
WechatAricle
>
analysis
(
JXDocument
jxDocument
)
{
List
<
WechatAricle
>
result
=
new
ArrayList
<
WechatAricle
>();
// 解析数据
try
{
...
...
@@ -291,14 +281,12 @@ public class WechatAritcleSearch {
String
putDate
=
null
;
Date
date
=
null
;
WechatAricle
wechat
=
null
;
if
(
Objects
.
nonNull
(
jxNodeList
)
&&
!
jxNodeList
.
isEmpty
())
{
if
(
Objects
.
nonNull
(
jxNodeList
)
&&
!
jxNodeList
.
isEmpty
())
{
for
(
JXNode
jxNode
:
jxNodeList
)
{
try
{
title
=
jxNode
.
selOne
(
"//div[@class='txt-box']/h3/a"
).
asElement
().
text
();
link
=
jxNode
.
selOne
(
"//div[@class='txt-box']/h3/a"
).
asElement
().
attr
(
"href"
);
if
(!
link
.
contains
(
"weixin.sogou.com"
)){
link
=
"https://weixin.sogou.com"
+
link
;
}
link
=
URIUtils
.
resolve
(
"https://weixin.sogou.com"
,
link
);
if
(
Objects
.
nonNull
(
jxNode
.
selOne
(
"//div[@class='txt-box']/p"
)))
{
content
=
jxNode
.
selOne
(
"//div[@class='txt-box']/p"
).
asElement
().
text
();
}
...
...
@@ -314,8 +302,8 @@ public class WechatAritcleSearch {
}
title
=
ZhiWeiTools
.
SBC2DBC
(
title
);
content
=
ZhiWeiTools
.
SBC2DBC
(
content
);
if
(
StringUtils
.
isNotBlank
(
title
))
{
wechat
=
new
WechatAricle
(
link
,
title
,
source
,
content
,
date
,
null
,
null
,
readNum
,
0
,
openid
,
"unknow"
);
if
(
StringUtils
.
isNotBlank
(
title
))
{
wechat
=
new
WechatAricle
(
link
,
title
,
source
,
content
,
date
,
null
,
null
,
readNum
,
0
,
openid
,
"unknow"
);
result
.
add
(
wechat
);
}
}
catch
(
Exception
e
)
{
...
...
@@ -334,22 +322,20 @@ public class WechatAritcleSearch {
}
/**
* @param @param wxId
* @param @return 设定文件
* @return String 返回类型
* @Title: getOpenId
* @Description: 获取微信wxID
* @param @param
* wxId
* @param @return
* 设定文件
* @return String 返回类型
*/
public
static
String
getOpenId
(
String
idOrName
,
ProxyHolder
proxyHolder
)
{
String
openId
=
null
;
String
url
=
"https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query="
+
URLCodeUtil
.
getURLEncode
(
idOrName
,
"utf-8"
);
String
htmlBody
;
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
try
{
htmlBody
=
HtmlDownUtil
.
downloadHtml
(
url
,
null
,
proxyHolder
);
htmlBody
=
HtmlDownUtil
.
downloadHtml
(
url
,
null
,
proxyHolder
);
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
openId
=
jsonObject
.
getString
(
"openid"
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment