Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
W
wechat
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
wechat
Commits
97909cfe
Commit
97909cfe
authored
Nov 05, 2018
by
[zhangzhiwei]
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加代理采集全文功能
parent
87c407d1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
122 additions
and
3 deletions
+122
-3
src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
+122
-3
No files found.
src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
View file @
97909cfe
...
...
@@ -19,6 +19,7 @@ import org.jsoup.select.Elements;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
...
...
@@ -36,10 +37,15 @@ public class WechatAritcleSearch {
private
static
Logger
logger
=
LogManager
.
getLogger
(
WechatAritcleSearch
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
/**
*
* @Title: wechatKeywordSearch
* @Description: TODO(根据关键词在搜狗微信搜索微信文章)
* @Description: TODO(根据关键词在搜狗微信搜索微信文章
,不包含全文
)
* @param @param
* word 关键词
* @param @param
...
...
@@ -119,7 +125,120 @@ public class WechatAritcleSearch {
title
=
ZhiWeiTools
.
SBC2DBC
(
title
);
content
=
ZhiWeiTools
.
SBC2DBC
(
content
);
wechat
=
new
WechatAricle
(
link
,
title
,
source
,
content
,
date
,
readNum
,
0
,
openid
,
"unknow"
);
wechat
=
getContentAndSource
(
link
,
proxy
,
wechat
);
result
.
add
(
wechat
);
}
catch
(
Exception
e
)
{
logger
.
debug
(
"解析数据出现错误:{}"
,
e
.
getMessage
());
continue
;
}
}
// 解析最大可寻页码
String
pageNext
=
document
.
select
(
"[id=pagebar_container]>a"
).
text
();
if
(
pageNext
.
contains
(
"下一页"
))
{
page
++;
}
else
{
f
=
false
;
}
// logger.info("数据总页数为:{}", page);
}
catch
(
Exception
e
)
{
logger
.
debug
(
"获取数据出现问题:{}"
,
e
.
getMessage
());
return
null
;
}
}
else
{
logger
.
info
(
"根据关键词获取微信文章失败,返回的数据结果集: {}"
,
htmlBody
);
}
// ZhiWeiTools.sleep(100);
}
return
result
;
}
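/**
*
* @Title: wechatKeywordSearch
* @Description: Searches Sogou WeChat (weixin.sogou.com) for WeChat articles matching a keyword, including the full text of each article.
* @param word
* search keyword
* @param tsn
* time range to collect: 1 (within one day); 2 (within one week); 3 (within one month); 4 (within one year);
* 5 (a custom period, used together with startTime and endTime)
* @param startTime
* start time, formatted as yyyy-MM-dd
* @param endTime
* end time, formatted as yyyy-MM-dd
* @param cookie
* cookie of a logged-in user (without login at most 10 pages are available); may be null, in which case no Cookie header is sent
* @param proxy
* proxy used for the search-result page requests
* @param proxyHolder
* proxy holder passed on to getContentAndSource when fetching each article
* @return List<WechatAricle> the articles collected
* @throws Exception
* as declared by the signature (the original comment named ZhiWeiException)
* @throws UnsupportedEncodingException
* as declared by the signature
*/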
public
static
List
<
WechatAricle
>
wechatKeywordSearch
(
String
word
,
int
tsn
,
String
startTime
,
String
endTime
,
String
cookie
,
Proxy
proxy
,
ProxyHolder
proxyHolder
)
throws
Exception
,
UnsupportedEncodingException
{
List
<
WechatAricle
>
result
=
new
ArrayList
<
WechatAricle
>();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"weixin.sogou.com"
);
if
(
cookie
!=
null
)
{
headerMap
.
put
(
"Cookie"
,
cookie
);
}
boolean
f
=
true
;
int
page
=
1
;
while
(
f
)
{
String
url
=
"http://weixin.sogou.com/weixin?type=2&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
+
"&ie=utf8&_sug_=n&_sug_type_="
+
"&ri=1&sourceid=sugg&sst0="
+
System
.
currentTimeMillis
()
+
"&tsn="
+
tsn
+
"&page="
+
page
;
if
(
tsn
==
5
)
{
url
=
url
+
"&ft="
+
startTime
+
"&et="
+
endTime
+
"&wxid=&usip=&interation=&from=tool"
;
}
headerMap
.
put
(
"Referer"
,
url
);
// 获取数据
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
).
body
().
string
();
// 解析数据
if
(
htmlBody
!=
null
)
{
try
{
// 解析数据
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Elements
elements
=
document
.
select
(
"div.news-box"
).
select
(
"ul.news-list"
).
select
(
"li"
);
String
title
=
null
;
String
link
=
null
;
String
content
=
null
;
String
source
=
null
;
String
openid
=
null
;
String
putDate
=
null
;
Date
date
=
null
;
WechatAricle
wechat
=
null
;
for
(
Element
element
:
elements
)
{
try
{
title
=
element
.
select
(
"div.txt-box"
).
select
(
"h3"
).
text
();
link
=
element
.
select
(
"div.txt-box"
).
select
(
"h3 >a"
).
attr
(
"href"
);
content
=
""
;
if
(
element
.
select
(
"p.txt-info"
).
isEmpty
())
{
content
=
element
.
select
(
"p.txt-info"
).
text
();
}
else
{
content
=
element
.
select
(
"div.txt-box"
).
select
(
"p.txt-info"
).
text
();
}
// System.out.println("content======================"+content);
source
=
element
.
select
(
"div.txt-box"
).
select
(
"div.s-p"
).
select
(
"a"
).
text
();
openid
=
element
.
select
(
"div.txt-box"
).
select
(
"div.s-p"
).
select
(
"a"
).
attr
(
"i"
);
putDate
=
element
.
select
(
"div.txt-box"
).
select
(
"div.s-p"
).
attr
(
"t"
);
date
=
new
Date
(
Long
.
valueOf
(
putDate
)
*
1000
);
int
readNum
=
0
;
try
{
readNum
=
Integer
.
valueOf
(
element
.
select
(
"div.txt-box"
).
select
(
"div.s-p"
)
.
select
(
"span.s1"
).
text
().
trim
());
}
catch
(
Exception
e
)
{
readNum
=
0
;
}
title
=
ZhiWeiTools
.
SBC2DBC
(
title
);
content
=
ZhiWeiTools
.
SBC2DBC
(
content
);
wechat
=
new
WechatAricle
(
link
,
title
,
source
,
content
,
date
,
readNum
,
0
,
openid
,
"unknow"
);
wechat
=
getContentAndSource
(
link
,
proxyHolder
,
wechat
);
result
.
add
(
wechat
);
}
catch
(
Exception
e
)
{
logger
.
debug
(
"解析数据出现错误:{}"
,
e
.
getMessage
());
...
...
@@ -155,7 +274,7 @@ public class WechatAritcleSearch {
* @return
* @throws IOException
*/
private
static
WechatAricle
getContentAndSource
(
String
url
,
Proxy
proxy
,
WechatAricle
wechatAricle
){
private
static
WechatAricle
getContentAndSource
(
String
url
,
Proxy
Holder
proxy
,
WechatAricle
wechatAricle
){
try
{
// String htmlBody = HttpClientTemplateOK.get(url, proxy, null);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
).
body
().
string
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment