Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
W
wechat
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
wechat
Commits
7db2a9e8
Commit
7db2a9e8
authored
Oct 29, 2019
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
分享链接消失并失效,解析改为有验证码的链接
parent
7ad96e77
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
156 additions
and
90 deletions
+156
-90
pom.xml
+2
-2
src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
+78
-25
src/test/java/com/zhiwei/wechat/example/WechatSearchExample.java
+76
-63
No files found.
pom.xml
View file @
7db2a9e8
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
wechat
</artifactId>
<version>
1.1.
7-SNAPSHOT
</version>
<version>
1.1.
8-SNAPSHOT
</version>
<description>
知微微信采集程序,包含
1.微信历史文章采集
...
...
@@ -91,7 +91,7 @@
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.5.
2-SNAPSHOT
</version>
<version>
0.5.
5.6-SNAPSHOT
</version>
<scope>
provided
</scope>
</dependency>
</dependencies>
...
...
src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
View file @
7db2a9e8
...
...
@@ -4,10 +4,9 @@ import java.io.IOException;
import
java.io.UnsupportedEncodingException
;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.*
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.logging.log4j.LogManager
;
...
...
@@ -54,7 +53,7 @@ public class WechatAritcleSearch {
* @param
* pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null
* @throws
*
ZhiWeiException
*
Exception
* @return List<Wechat> 返回类型
*/
public
static
List
<
WechatAricle
>
wechatKeywordSearch
(
String
word
,
int
tsn
,
String
cookie
,
String
startTime
,
String
endTime
,
...
...
@@ -68,16 +67,15 @@ public class WechatAritcleSearch {
boolean
f
=
true
;
int
page
=
1
;
while
(
f
)
{
String
url
=
"http://weixin.sogou.com/weixin?type=2&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
String
searchUrl
=
"http://weixin.sogou.com/weixin?type=2&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
+
"&ie=utf8&_sug_=n&_sug_type_=&ri=1&sourceid=sugg&sst0="
+
System
.
currentTimeMillis
()
+
"&tsn="
+
tsn
+
"&page="
+
page
;
if
(
tsn
==
5
)
{
url
=
url
+
"&ft="
+
startTime
+
"&et="
+
endTime
+
"&wxid=&usip=&interation=&from=tool"
;
searchUrl
=
searchUrl
+
"&ft="
+
startTime
+
"&et="
+
endTime
+
"&wxid=&usip=&interation=&from=tool"
;
}
System
.
out
.
println
(
url
);
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"Referer"
,
searchUrl
);
// 获取数据
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
).
body
().
string
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
searchUrl
,
headerMap
),
proxy
,
false
).
body
().
string
();
// 解析数据
if
(
htmlBody
!=
null
)
{
try
{
...
...
@@ -95,7 +93,10 @@ public class WechatAritcleSearch {
for
(
Element
element
:
elements
)
{
try
{
title
=
element
.
select
(
"div.txt-box"
).
select
(
"h3"
).
text
();
link
=
element
.
select
(
"div.txt-box"
).
select
(
"h3 >a"
).
attr
(
"data-share"
);
link
=
element
.
select
(
"div.txt-box"
).
select
(
"h3 >a"
).
attr
(
"href"
);
// link = getRealLink(link, searchUrl);
content
=
""
;
if
(
element
.
select
(
"p.txt-info"
).
isEmpty
())
{
content
=
element
.
select
(
"p.txt-info"
).
text
();
...
...
@@ -121,6 +122,7 @@ public class WechatAritcleSearch {
wechat
=
new
WechatAricle
(
link
,
title
,
source
,
content
,
date
,
readNum
,
0
,
openid
,
"unknow"
);
result
.
add
(
wechat
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
logger
.
debug
(
"解析数据出现错误:{}"
,
e
);
}
}
...
...
@@ -183,15 +185,15 @@ public class WechatAritcleSearch {
boolean
f
=
true
;
int
page
=
1
;
while
(
f
)
{
String
url
=
"http://weixin.sogou.com/weixin?type=2&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
String
searchUrl
=
"http://weixin.sogou.com/weixin?type=2&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
+
"&ie=utf8&_sug_=n&_sug_type_="
+
"&ri=1&sourceid=sugg&sst0="
+
System
.
currentTimeMillis
()
+
"&tsn="
+
tsn
+
"&page="
+
page
;
if
(
tsn
==
5
)
{
url
=
url
+
"&ft="
+
startTime
+
"&et="
+
endTime
+
"&wxid=&usip=&interation=&from=tool"
;
searchUrl
=
searchUrl
+
"&ft="
+
startTime
+
"&et="
+
endTime
+
"&wxid=&usip=&interation=&from=tool"
;
}
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"Referer"
,
searchUrl
);
// 获取数据
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
).
body
().
string
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
searchUrl
,
headerMap
),
proxy
,
false
).
body
().
string
();
// 解析数据
if
(
htmlBody
!=
null
)
{
try
{
...
...
@@ -209,7 +211,8 @@ public class WechatAritcleSearch {
for
(
Element
element
:
elements
)
{
try
{
title
=
element
.
select
(
"div.txt-box"
).
select
(
"h3"
).
text
();
link
=
element
.
select
(
"div.txt-box"
).
select
(
"h3 >a"
).
attr
(
"data-share"
);
link
=
element
.
select
(
"div.txt-box"
).
select
(
"h3 >a"
).
attr
(
"href"
);
// link = getRealLink(link, searchUrl);
content
=
""
;
if
(
element
.
select
(
"p.txt-info"
).
isEmpty
())
{
content
=
element
.
select
(
"p.txt-info"
).
text
();
...
...
@@ -263,7 +266,6 @@ public class WechatAritcleSearch {
* 获取全文及来源
* @param url
* @param proxy
* @param headerMap
* @param wechatAricle
* @return
* @throws IOException
...
...
@@ -329,10 +331,8 @@ public class WechatAritcleSearch {
* 根据关键词采集指定时间+账号的数据
* @param word
* @param idOrName
* @param tsn
* @param startTime
* @param endTime
* @param proxy
* @param proxyHolder
* @return
* @throws Exception
...
...
@@ -355,14 +355,13 @@ public class WechatAritcleSearch {
int
page
=
1
;
while
(
f
)
{
String
url
=
"https://weixin.sogou.com/weixin?type=2&ie=utf8&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
String
searchUrl
=
"https://weixin.sogou.com/weixin?type=2&ie=utf8&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
+
"&tsn=5&ft="
+
startTime
+
"&et="
+
endTime
+
"&interation=&page="
+
page
+
"&wxid="
+
openId
+
"&usip="
+
URLEncoder
.
encode
(
idOrName
,
"UTF-8"
);
headerMap
.
put
(
"Referer"
,
url
);
System
.
out
.
println
(
url
);
headerMap
.
put
(
"Referer"
,
searchUrl
);
// 获取数据
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxyHolder
,
true
).
body
().
string
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
searchUrl
,
headerMap
),
proxyHolder
,
true
).
body
().
string
();
// 解析数据
if
(
htmlBody
!=
null
)
{
try
{
...
...
@@ -380,7 +379,8 @@ public class WechatAritcleSearch {
for
(
Element
element
:
elements
)
{
try
{
title
=
element
.
select
(
"div.txt-box"
).
select
(
"h3"
).
text
();
link
=
element
.
select
(
"div.txt-box"
).
select
(
"h3 >a"
).
attr
(
"data-share"
);
link
=
element
.
select
(
"div.txt-box"
).
select
(
"h3 >a"
).
attr
(
"href"
);
// link = getRealLink(link, searchUrl);
content
=
""
;
if
(
element
.
select
(
"p.txt-info"
).
isEmpty
())
{
content
=
element
.
select
(
"p.txt-info"
).
text
();
...
...
@@ -431,11 +431,64 @@ public class WechatAritcleSearch {
}
/**
 * Signs a Sogou search-result link and resolves it to the final article URL.
 *
 * <p>The link is prefixed with {@code https://weixin.sogou.com}. When the resulting URL
 * contains a {@code url=} parameter and no {@code &k=} parameter yet, a random {@code k}
 * in the range [1, 100] is chosen and the single character located {@code 25 + k}
 * positions after the start of {@code url=} is appended as {@code h}
 * ({@code &k=<k>&h=<char>}). If that position lies beyond the end of the URL, {@code h}
 * is left empty instead of failing. Links without {@code url=} or already carrying
 * {@code &k=} are passed on unsigned.
 *
 * <p>NOTE(review): the offset 25 presumably mirrors the site's own client-side script and
 * may change when the site changes; confirm against the current search page.
 *
 * @param originalUrl link taken from the search result; presumably host-relative
 *                    (path and query) since the host is prepended here
 * @param searchUrl   URL of the search page, forwarded to {@link #getFinalUrl} as the Referer
 * @return the value returned by {@link #getFinalUrl} for the signed URL
 * @throws Exception if resolving the signed URL fails
 */
public static String getRealLink(String originalUrl, String searchUrl) throws Exception {
    String signedUrl = "https://weixin.sogou.com" + originalUrl;

    // Random k in [1, 100].
    int k = (int) (Math.floor(100 * Math.random()) + 1);
    int urlParamIndex = signedUrl.indexOf("url=");
    boolean alreadySigned = signedUrl.indexOf("&k=") != -1;

    // Only sign links that have a url= parameter and have not been signed before;
    // appending unconditionally would produce "&h=null" for every other link.
    if (urlParamIndex != -1 && !alreadySigned) {
        int hIndex = urlParamIndex + 25 + k;
        // Guard the bounds: short links must not raise StringIndexOutOfBoundsException.
        String h = hIndex < signedUrl.length() ? signedUrl.substring(hIndex, hIndex + 1) : "";
        signedUrl += "&k=" + k + "&h=" + h;
    }

    return getFinalUrl(signedUrl, searchUrl);
}
/**
 * Matches every {@code url += '...';} statement in the fetched page; group 1 is the
 * quoted fragment. Compiled once because {@link Pattern} instances are immutable and reusable.
 */
private static final Pattern FINAL_URL_FRAGMENT_PATTERN = Pattern.compile("url \\+= \'(.*?)\';");

/**
 * Fetches the given link and reassembles the final URL from the page body.
 *
 * <p>The page is requested with browser-like headers, the supplied Referer and a fixed
 * cookie. Every {@code url += '...';} fragment found in the response body is concatenated
 * in order of appearance.
 *
 * <p>NOTE(review): the Cookie header is a hard-coded session captured at development time;
 * it will presumably expire and should be supplied by the caller or configuration.
 *
 * @param originalUrl URL to request
 * @param rerferer    value sent as the {@code Referer} header
 * @return the concatenated fragments (an empty string when the body contains none),
 *         or {@code null} when the response body is blank
 * @throws Exception if the HTTP call or reading the body fails
 */
public static String getFinalUrl(String originalUrl, String rerferer) throws Exception {
    Map<String, String> headerMap = new HashMap<>();
    headerMap.put("Sec-Fetch-Mode", "navigate");
    headerMap.put("Sec-Fetch-User", "?1");
    headerMap.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
    headerMap.put("Sec-Fetch-Site", "same-origin");
    headerMap.put("Referer", rerferer);
    headerMap.put("Cookie", "SUID=EAD6E7733765860A5AEAE09C000ACA78; SUV=00C351E873E7D6EA5AEBCB68E5B81671; wuid=AAGyrPzuHwAAAAqLFD3eFgAAGwY=; pgv_pvi=5713931264; GOTO=; ssuid=5316643370; pex=C864C03270DED3DD8A06887A372DA219231FFAC25A9D64AE09E82AED12E416AC; weixinIndexVisited=1; ABTEST=8|1572271712|v1; SNUID=C5F9D7432F2ABAD638CB0A7A30803056; sct=917; JSESSIONID=aaaR-8KOdPrlZ_KSPKs4w; PHPSESSID=oc296ck54mc3jbgvnu2mar6r40; IPLOC=CN3302");

    String htmlBody = httpBoot
            .syncCall(RequestUtils.wrapGet(originalUrl, headerMap), ProxyHolder.NAT_HEAVY_PROXY)
            .body()
            .string();

    if (StringUtils.isNotBlank(htmlBody)) {
        // The page builds the target address piecewise; stitch the pieces back together.
        StringBuilder finalUrl = new StringBuilder();
        Matcher fragmentMatcher = FINAL_URL_FRAGMENT_PATTERN.matcher(htmlBody);
        while (fragmentMatcher.find()) {
            finalUrl.append(fragmentMatcher.group(1));
        }
        return finalUrl.toString();
    }
    return null;
}
/**
* @Title: getOpenId
* @Description:
TODO(获取微信wxID)
* @Description:
获取微信wxID
* @param @param
* wxId
* @param @return
...
...
src/test/java/com/zhiwei/wechat/example/WechatSearchExample.java
View file @
7db2a9e8
//package com.zhiwei.wechat.example;
//
//import java.io.UnsupportedEncodingException;
//import java.net.UnknownHostException;
//import java.util.ArrayList;
//import java.util.List;
//
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.wechat.entity.WechatAricle;
//import com.zhiwei.wechat.search.WechatAritcleSearch;
//
///**
// * @ClassName: WechatSearchExample
// * @Description: TODO(根据关键词等采集数据)
// * @author hero
// * @date 2016年12月16日 上午9:15:42
// */
//public class WechatSearchExample{
//
// private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class);
// private static final String registry = "zookeeper://192.168.0.36:2181";
// private static final String group = "local";
//
// public static void main(String[] args) {
// ProxyFactory.init(registry, group, GroupType.PROVIDER);
// try {
// WechatSearchExample.wechatSearchExample();
// } catch (UnknownHostException e) {
// e.printStackTrace();
// }
// }
//
//
// public static void wechatSearchExample() throws UnknownHostException
// {
// List<String> wordList = new ArrayList<String>();
// wordList.add("京东");
// for(String word : wordList)
package
com
.
zhiwei
.
wechat
.
example
;
import
java.io.IOException
;
import
java.io.UnsupportedEncodingException
;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.net.UnknownHostException
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.wechat.entity.WechatAricle
;
import
com.zhiwei.wechat.search.WechatAritcleSearch
;
/**
 * Manual example that runs a keyword search through {@link WechatAritcleSearch} and prints
 * the number of articles found followed by each article id.
 *
 * <p>Requires the proxy registry configured below to be reachable.
 *
 * @author hero
 */
public class WechatSearchExample {

    private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class);

    /** Registry address handed to {@link ProxyFactory#init}. */
    private static final String registry = "zookeeper://192.168.0.36:2181";

    /** Group name handed to {@link ProxyFactory#init}. */
    private static final String group = "local";

    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /** Proxy used for the search; assigned in {@link #main} after the proxy factory is initialised. */
    private static Proxy proxy = null;

    /**
     * Initialises the proxy factory, picks the proxy and runs the example search.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ProxyFactory.init(registry, group, GroupType.PROVIDER, 10000018);
        proxy = ProxyHolder.SOUGOU_INNER_PROXY.getProxy();
        try {
            wechatSearchExample();
        } catch (UnknownHostException hostFailure) {
            hostFailure.printStackTrace();
        }
    }

    /**
     * Searches each configured keyword for the fixed date 2019-10-28 and prints the results.
     * Failures for a keyword are printed and do not stop the remaining keywords.
     *
     * @throws UnknownHostException declared for callers; not raised by the visible code
     */
    public static void wechatSearchExample() throws UnknownHostException {
        List<String> keywords = new ArrayList<>();
        keywords.add("京东");

        for (String keyword : keywords) {
            try {
                List<WechatAricle> articles = WechatAritcleSearch.wechatKeywordSearch(
                        keyword, 5, null, "2019-10-28", "2019-10-28", proxy, 51);
                System.out.println("======" + articles.size());
                for (WechatAricle article : articles) {
                    System.out.println(article.getId());
                }
            } catch (Exception failure) {
                // Covers UnsupportedEncodingException as well; both were handled identically.
                failure.printStackTrace();
            }
        }
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment