Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
W
wechat
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
wechat
Commits
3ea331c1
Commit
3ea331c1
authored
Dec 19, 2018
by
[zhangzhiwei]
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加搜狗微信根据账号+关键词采集数据
parent
60e4b279
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
126 additions
and
25 deletions
+126
-25
src/main/java/com/zhiwei/wechat/readAndLike/WeChatReadAndLike.java
+1
-1
src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
+115
-20
src/main/java/com/zhiwei/wechat/util/Tools.java
+1
-1
src/test/java/com/zhiwei/wechat/example/WechatSearchExample.java
+9
-3
No files found.
src/main/java/com/zhiwei/wechat/readAndLike/WeChatReadAndLike.java
View file @
3ea331c1
...
...
@@ -122,7 +122,7 @@ public class WeChatReadAndLike {
time
=
time
.
split
(
" "
)[
0
];
}
String
openid
=
WechatAritcleSearch
.
getOpenId
(
wxId
);
String
openid
=
WechatAritcleSearch
.
getOpenId
(
wxId
,
null
);
logger
.
info
(
"openid is {}"
,
openid
);
try
{
...
...
src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
View file @
3ea331c1
...
...
@@ -21,10 +21,9 @@ import com.zhiwei.crawler.core.HttpBoot;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.
httpclient.HttpClientTemplateOK
;
import
com.zhiwei.tools.
tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.wechat.entity.WechatAricle
;
import
com.zhiwei.wechat.util.Tools
;
/**
* @ClassName: WechatAritcleSearch
...
...
@@ -60,13 +59,10 @@ public class WechatAritcleSearch {
* @return List<Wechat> 返回类型
*/
public
static
List
<
WechatAricle
>
wechatKeywordSearch
(
String
word
,
int
tsn
,
String
startTime
,
String
endTime
,
String
cookie
,
Proxy
proxy
)
throws
Exception
,
UnsupportedEncodingException
{
Proxy
proxy
)
throws
Exception
,
UnsupportedEncodingException
{
List
<
WechatAricle
>
result
=
new
ArrayList
<
WechatAricle
>();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"weixin.sogou.com"
);
if
(
cookie
!=
null
)
{
headerMap
.
put
(
"Cookie"
,
cookie
);
}
boolean
f
=
true
;
int
page
=
1
;
...
...
@@ -174,13 +170,10 @@ public class WechatAritcleSearch {
* @return List<Wechat> 返回类型
*/
public
static
List
<
WechatAricle
>
wechatKeywordSearch
(
String
word
,
int
tsn
,
String
startTime
,
String
endTime
,
String
cookie
,
Proxy
proxy
,
ProxyHolder
proxyHolder
)
throws
Exception
,
UnsupportedEncodingException
{
Proxy
proxy
,
ProxyHolder
proxyHolder
)
throws
Exception
,
UnsupportedEncodingException
{
List
<
WechatAricle
>
result
=
new
ArrayList
<
WechatAricle
>();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"weixin.sogou.com"
);
if
(
cookie
!=
null
)
{
headerMap
.
put
(
"Cookie"
,
cookie
);
}
boolean
f
=
true
;
int
page
=
1
;
...
...
@@ -272,7 +265,6 @@ public class WechatAritcleSearch {
*/
private
static
WechatAricle
getContentAndSource
(
String
url
,
ProxyHolder
proxy
,
WechatAricle
wechatAricle
){
try
{
// String htmlBody = HttpClientTemplateOK.get(url, proxy, null);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
.
getProxy
()).
body
().
string
();
if
(
htmlBody
!=
null
){
Document
document
=
Jsoup
.
parse
(
htmlBody
);
...
...
@@ -310,10 +302,114 @@ public class WechatAritcleSearch {
}
// public static void main(String[] args) {
// String url = "https://mp.weixin.qq.com/s?src=11&timestamp=1540521001&ver=1205&signature=12dtyhMA3Xi7lzUhGUFyEpJmWPlnaLAwDVXMUi-tcFXHJbIYDKuLm76sdQUAZxkEjyGby22amJ4AnxIM4oS0ivtAS6ibs4F3OO8-jwoFLk4Pd6d8AhZdj94Z1gQdhdIQ&new=1";
// getContentAndSource(url, null, null);
// }
/**
 * Searches Sogou WeChat for articles that match a keyword, restricted to a
 * single account and a publication-date window.
 *
 * <p>The account is first resolved through {@code getOpenId}; when no open id
 * is returned, no search request is issued and an empty list is returned.
 * Result pages are then requested one after another for as long as the pager
 * of the returned page contains a "下一页" (next page) link.
 *
 * @param word        keyword to search for; URL-encoded as UTF-8 before use
 * @param idOrName    account id or nickname to restrict the search to; must
 *                    not be {@code null} or empty
 * @param startTime   start of the date window, placed verbatim into the
 *                    {@code ft} query parameter
 * @param endTime     end of the date window, placed verbatim into the
 *                    {@code et} query parameter
 * @param proxyHolder proxy holder handed to every HTTP call made here
 * @return the collected articles (possibly empty), or {@code null} when
 *         processing a result page fails with an unexpected exception
 * @throws IllegalArgumentException     if {@code idOrName} is {@code null} or empty
 * @throws UnsupportedEncodingException declared by {@code URLEncoder.encode}
 * @throws Exception                    if an HTTP request fails
 */
public static List<WechatAricle> wechatKeywordSearchByAccount(String word, String idOrName, String startTime,
        String endTime, ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException {
    if (idOrName == null || idOrName.equals("")) {
        throw new IllegalArgumentException("要检索的昵称或id不能为空");
    }

    List<WechatAricle> result = new ArrayList<WechatAricle>();
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    headerMap.put("Host", "weixin.sogou.com");

    String openId = getOpenId(idOrName, proxyHolder);
    if (openId == null) {
        // The account could not be resolved, so there is nothing to search.
        return result;
    }

    // Loop-invariant pieces of the request URL.
    String encodedWord = URLEncoder.encode(word, "UTF-8");
    String encodedAccount = URLEncoder.encode(idOrName, "UTF-8");

    int page = 1;
    boolean hasNextPage = true;
    while (hasNextPage) {
        String url = "https://weixin.sogou.com/weixin?type=2&ie=utf8&query=" + encodedWord
                + "&tsn=5&ft=" + startTime + "&et=" + endTime + "&interation=&page=" + page
                + "&wxid=" + openId + "&usip=" + encodedAccount;
        headerMap.put("Referer", url);
        logger.debug("request url: {}", url);

        // Fetch the result page.
        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxyHolder, true)
                .body().string();
        if (htmlBody == null) {
            logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
            // Stop here: the page counter has not advanced, so looping again
            // would re-request the same URL forever.
            break;
        }

        try {
            Document document = Jsoup.parse(htmlBody);
            Elements items = document.select("div.news-box").select("ul.news-list").select("li");
            for (Element item : items) {
                try {
                    result.add(parseAccountSearchItem(item, proxyHolder));
                } catch (Exception e) {
                    // One malformed entry must not abort the rest of the page.
                    logger.debug("解析数据出现错误:{}", e.getMessage());
                }
            }

            // Keep paging only while the pager still offers a "next page" link.
            String pagerText = document.select("[id=pagebar_container]>a").text();
            if (pagerText.contains("下一页")) {
                page++;
            } else {
                hasNextPage = false;
            }
        } catch (Exception e) {
            logger.debug("获取数据出现问题:{}", e.getMessage(), e);
            // Existing contract: a page-level failure is reported as null.
            return null;
        }
    }
    return result;
}

/**
 * Builds one article from a single {@code li} entry of a Sogou result page
 * and passes it through {@code getContentAndSource} using the entry's link.
 *
 * @param element     the {@code li} element of one search hit
 * @param proxyHolder proxy holder forwarded to {@code getContentAndSource}
 * @return whatever {@code getContentAndSource} returns for the parsed article
 * @throws Exception if the entry cannot be parsed, e.g. when the {@code t}
 *                   attribute is not a number; declared broadly because the
 *                   helpers called here are defined elsewhere
 */
private static WechatAricle parseAccountSearchItem(Element element, ProxyHolder proxyHolder) throws Exception {
    Elements textBox = element.select("div.txt-box");
    Elements meta = textBox.select("div.s-p");

    String title = textBox.select("h3").text();
    String link = textBox.select("h3 >a").attr("href");
    // The former isEmpty() branch was redundant: both of its arms produced
    // exactly this value.
    String content = textBox.select("p.txt-info").text();
    String source = meta.select("a").text();
    String openid = meta.select("a").attr("i");

    // The "t" attribute is multiplied by 1000 before being handed to Date.
    Date date = new Date(Long.parseLong(meta.attr("t")) * 1000L);

    int readNum;
    try {
        readNum = Integer.parseInt(meta.select("span.s1").text().trim());
    } catch (NumberFormatException e) {
        // Missing or non-numeric read count falls back to 0.
        readNum = 0;
    }

    WechatAricle wechat = new WechatAricle(link, ZhiWeiTools.SBC2DBC(title), source,
            ZhiWeiTools.SBC2DBC(content), date, readNum, 0, openid, "unknow");
    return getContentAndSource(link, proxyHolder, wechat);
}
/**
* @Title: getOpenId
...
...
@@ -324,20 +420,19 @@ public class WechatAritcleSearch {
* 设定文件
* @return String 返回类型
*/
public
static
String
getOpenId
(
String
wxId
)
{
public
static
String
getOpenId
(
String
idOrName
,
ProxyHolder
proxyHolder
)
{
String
openId
=
null
;
String
url
=
"http://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query="
+
wxId
;
Map
<
String
,
String
>
headerMap
=
Tools
.
getWechatHeader
();
String
url
=
"https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query="
+
URLCodeUtil
.
getURLEncode
(
idOrName
,
"utf-8"
);
String
htmlBody
;
try
{
htmlBody
=
HttpClientTemplateOK
.
get
(
url
,
null
,
headerMap
);
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxyHolder
,
true
).
body
().
string
();
System
.
out
.
println
(
htmlBody
);
if
(
htmlBody
!=
null
)
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
openId
=
json
.
getString
(
"openid"
);
}
}
catch
(
Exception
e
)
{
openId
=
null
;
e
.
printStackTrace
();
}
return
openId
;
...
...
src/main/java/com/zhiwei/wechat/util/Tools.java
View file @
3ea331c1
...
...
@@ -24,7 +24,7 @@ public class Tools {
headerMap
.
put
(
"Connection"
,
"keep-alive"
);
headerMap
.
put
(
"Upgrade-Insecure-Requests"
,
"1"
);
headerMap
.
put
(
"Host"
,
"mp.weixin.qq.com"
);
headerMap
.
put
(
"Origin"
,
"http://mp.weixin.qq.com"
);
headerMap
.
put
(
"Origin"
,
"http
s
://mp.weixin.qq.com"
);
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.691.400 QQBrowser/9.0.2524.400"
);
return
headerMap
;
...
...
src/test/java/com/zhiwei/wechat/example/WechatSearchExample.java
View file @
3ea331c1
...
...
@@ -8,6 +8,9 @@ import java.util.List;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.wechat.entity.WechatAricle
;
import
com.zhiwei.wechat.search.WechatAritcleSearch
;
import
com.zhiwei.wechat.util.Tools
;
...
...
@@ -21,9 +24,11 @@ import com.zhiwei.wechat.util.Tools;
public
class
WechatSearchExample
{
private
Logger
logger
=
LoggerFactory
.
getLogger
(
WechatSearchExample
.
class
);
private
static
final
String
registry
=
"zookeeper://192.168.0.36:2181"
;
private
static
final
String
group
=
"local"
;
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
registry
,
group
,
GroupType
.
PROVIDER
);
try
{
WechatSearchExample
.
wechatSearchExample
();
}
catch
(
UnknownHostException
e
)
{
...
...
@@ -35,12 +40,13 @@ public class WechatSearchExample{
public
static
void
wechatSearchExample
()
throws
UnknownHostException
{
List
<
String
>
wordList
=
new
ArrayList
<
String
>();
wordList
.
add
(
"QQ 涉密邮件 间谍"
);
wordList
.
add
(
"工业互联网"
);
String
idOrName
=
"吴晓波频道"
;
for
(
String
word
:
wordList
)
{
try
{
List
<
WechatAricle
>
list
=
WechatAritcleSearch
.
wechatKeywordSearch
(
word
,
5
,
"2018-11-29"
,
"2018-11-29"
,
null
,
null
);
List
<
WechatAricle
>
list
=
WechatAritcleSearch
.
wechatKeywordSearch
ByAccount
(
word
,
idOrName
,
"2017-12-01"
,
"2018-12-01"
,
ProxyHolder
.
SOUGOU_INNER_PROXY
);
System
.
out
.
println
(
"======"
+
list
.
size
());
for
(
WechatAricle
wechat
:
list
){
System
.
out
.
println
(
wechat
.
getTitle
());
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment