Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
W
wechat
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
wechat
Commits
87c407d1
Commit
87c407d1
authored
Oct 26, 2018
by
[zhangzhiwei]
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
微信添加全文及来源采集
parent
fd3dac6f
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
175 additions
and
158 deletions
+175
-158
pom.xml
+1
-1
src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
+111
-96
src/test/java/com/zhiwei/wechat/example/WechatSearchExample.java
+63
-61
No files found.
pom.xml
View file @
87c407d1
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
wechat
</artifactId>
<artifactId>
wechat
</artifactId>
<version>
1.1.
0
-SNAPSHOT
</version>
<version>
1.1.
1
-SNAPSHOT
</version>
<description>
<description>
知微微信采集程序,包含
知微微信采集程序,包含
1.微信历史文章采集
1.微信历史文章采集
...
...
src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
View file @
87c407d1
...
@@ -26,134 +26,126 @@ import com.zhiwei.wechat.entity.WechatAricle;
...
@@ -26,134 +26,126 @@ import com.zhiwei.wechat.entity.WechatAricle;
import
com.zhiwei.wechat.util.Tools
;
import
com.zhiwei.wechat.util.Tools
;
/**
/**
* @ClassName: WechatAritcleSearch
* @ClassName: WechatAritcleSearch
* @Description: TODO(在搜索接口根据关键词采集微信文章)
* @Description: TODO(在搜索接口根据关键词采集微信文章)
* @author Bewilder Z
* @author Bewilder Z
* @date 2016年10月14日 上午9:40:18
* @date 2016年10月14日 上午9:40:18
*/
*/
public
class
WechatAritcleSearch
{
public
class
WechatAritcleSearch
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
WechatAritcleSearch
.
class
);
private
static
Logger
logger
=
LogManager
.
getLogger
(
WechatAritcleSearch
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
/**
/**
*
*
* @Title: wechatKeywordSearch
* @Title: wechatKeywordSearch
* @Description: TODO(根据关键词在搜狗微信搜索微信文章)
* @Description: TODO(根据关键词在搜狗微信搜索微信文章)
* @param @param word 关键词
* @param @param
* @param @param tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* word 关键词
* 5(某一时间段内与startTime和endTime配合使用)
* @param @param
* @param @param startTime 开始时间 格式为yyyy-MM-dd
* tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* @param @param endTime 结束时间 格式为yyyy-MM-dd
* 5(某一时间段内与startTime和endTime配合使用)
* @param @param cookie 用户登录后的cookie(不登录最多10页)
* @param @param
* startTime 开始时间 格式为yyyy-MM-dd
* @param @param
* endTime 结束时间 格式为yyyy-MM-dd
* @param @param
* cookie 用户登录后的cookie(不登录最多10页)
* @param @return
* @param @return
* @param @throws ZhiWeiException
* @param @throws
* @param @throws UnsupportedEncodingException 设定文件
* ZhiWeiException
* @param @throws
* UnsupportedEncodingException 设定文件
* @return List<Wechat> 返回类型
* @return List<Wechat> 返回类型
*/
*/
public
static
List
<
WechatAricle
>
wechatKeywordSearch
(
String
word
,
int
tsn
,
public
static
List
<
WechatAricle
>
wechatKeywordSearch
(
String
word
,
int
tsn
,
String
startTime
,
String
endTime
,
String
startTime
,
String
endTime
,
String
cookie
,
Proxy
proxy
)
String
cookie
,
Proxy
proxy
)
throws
Exception
,
UnsupportedEncodingException
{
throws
Exception
,
UnsupportedEncodingException
{
List
<
WechatAricle
>
result
=
new
ArrayList
<
WechatAricle
>();
List
<
WechatAricle
>
result
=
new
ArrayList
<
WechatAricle
>();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
headerMap
.
put
(
"Host"
,
"weixin.sogou.com"
);
headerMap
.
put
(
"Host"
,
"weixin.sogou.com"
);
if
(
cookie
!=
null
)
{
if
(
cookie
!=
null
)
{
headerMap
.
put
(
"Cookie"
,
cookie
);
headerMap
.
put
(
"Cookie"
,
cookie
);
}
}
boolean
f
=
true
;
boolean
f
=
true
;
int
page
=
1
;
int
page
=
1
;
while
(
f
)
while
(
f
)
{
{
String
url
=
"http://weixin.sogou.com/weixin?type=2&query="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
String
url
=
"http://weixin.sogou.com/weixin?type=2&query="
+
"&ie=utf8&_sug_=n&_sug_type_="
+
"&ri=1&sourceid=sugg&sst0="
+
System
.
currentTimeMillis
()
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"&ie=utf8&_sug_=n&_sug_type_="
+
"&tsn="
+
tsn
+
"&page="
+
page
;
+
"&ri=1&sourceid=sugg&sst0="
+
System
.
currentTimeMillis
()
if
(
tsn
==
5
)
{
+
"&tsn="
+
tsn
+
"&page="
+
page
;
url
=
url
+
"&ft="
+
startTime
+
"&et="
+
endTime
+
"&wxid=&usip=&interation=&from=tool"
;
if
(
tsn
==
5
)
{
url
=
url
+
"&ft="
+
startTime
+
"&et="
+
endTime
+
"&wxid=&usip=&interation=&from=tool"
;
}
}
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"Referer"
,
url
);
//获取数据
//
获取数据
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
).
body
().
string
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
).
body
().
string
();
//解析数据
// 解析数据
if
(
htmlBody
!=
null
)
if
(
htmlBody
!=
null
)
{
{
try
{
try
{
// 解析数据
// 解析数据
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Elements
elements
=
document
.
select
(
"div.news-box"
)
Elements
elements
=
document
.
select
(
"div.news-box"
).
select
(
"ul.news-list"
).
select
(
"li"
);
.
select
(
"ul.news-list"
).
select
(
"li"
);
String
title
=
null
;
String
title
=
null
;
String
link
=
null
;
String
link
=
null
;
String
content
=
null
;
String
content
=
null
;
String
source
=
null
;
String
source
=
null
;
String
openid
=
null
;
String
openid
=
null
;
String
putDate
=
null
;
String
putDate
=
null
;
Date
date
=
null
;
Date
date
=
null
;
WechatAricle
wechat
=
null
;
WechatAricle
wechat
=
null
;
for
(
Element
element
:
elements
)
for
(
Element
element
:
elements
)
{
{
try
{
try
{
title
=
element
.
select
(
"div.txt-box"
).
select
(
"h3"
).
text
();
title
=
element
.
select
(
"div.txt-box"
).
select
(
"h3"
).
text
();
link
=
element
.
select
(
"div.txt-box"
).
select
(
"h3 >a"
).
attr
(
"href"
);
link
=
element
.
select
(
"div.txt-box"
).
select
(
"h3 >a"
).
attr
(
"href"
);
content
=
""
;
content
=
""
;
if
(
element
.
select
(
"p.txt-info"
).
isEmpty
())
if
(
element
.
select
(
"p.txt-info"
).
isEmpty
())
{
{
content
=
element
.
select
(
"p.txt-info"
).
text
();
content
=
element
.
select
(
"p.txt-info"
).
text
();
}
else
}
else
{
{
content
=
element
.
select
(
"div.txt-box"
).
select
(
"p.txt-info"
).
text
();
content
=
element
.
select
(
"div.txt-box"
).
select
(
"p.txt-info"
).
text
();
}
}
// System.out.println("content======================"+content);
// System.out.println("content======================"+content);
source
=
element
.
select
(
"div.txt-box"
).
select
(
"div.s-p"
)
source
=
element
.
select
(
"div.txt-box"
).
select
(
"div.s-p"
).
select
(
"a"
).
text
();
.
select
(
"a"
).
text
();
openid
=
element
.
select
(
"div.txt-box"
).
select
(
"div.s-p"
).
select
(
"a"
).
attr
(
"i"
);
openid
=
element
.
select
(
"div.txt-box"
)
putDate
=
element
.
select
(
"div.txt-box"
).
select
(
"div.s-p"
).
attr
(
"t"
);
.
select
(
"div.s-p"
).
select
(
"a"
).
attr
(
"i"
);
putDate
=
element
.
select
(
"div.txt-box"
)
.
select
(
"div.s-p"
).
attr
(
"t"
);
date
=
new
Date
(
Long
.
valueOf
(
putDate
)
*
1000
);
date
=
new
Date
(
Long
.
valueOf
(
putDate
)
*
1000
);
int
readNum
=
0
;
int
readNum
=
0
;
try
{
try
{
readNum
=
Integer
.
valueOf
(
element
.
select
(
"div.txt-box"
)
readNum
=
Integer
.
valueOf
(
element
.
select
(
"div.txt-box"
)
.
select
(
"div.s-p"
)
.
select
(
"
div.s-p"
).
select
(
"
span.s1"
).
text
().
trim
());
.
select
(
"span.s1"
).
text
().
trim
());
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
readNum
=
0
;
readNum
=
0
;
}
}
title
=
ZhiWeiTools
.
SBC2DBC
(
title
);
title
=
ZhiWeiTools
.
SBC2DBC
(
title
);
content
=
ZhiWeiTools
.
SBC2DBC
(
content
);
content
=
ZhiWeiTools
.
SBC2DBC
(
content
);
wechat
=
new
WechatAricle
(
link
,
title
,
source
,
content
,
date
,
readNum
,
0
,
openid
,
"unknow"
);
wechat
=
new
WechatAricle
(
link
,
title
,
source
,
content
,
date
,
readNum
,
0
,
openid
,
"unknow"
);
wechat
=
getContentAndSource
(
url
,
proxy
,
headerMap
,
wechat
);
wechat
=
getContentAndSource
(
link
,
proxy
,
wechat
);
result
.
add
(
wechat
);
result
.
add
(
wechat
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
debug
(
"解析数据出现错误:{}"
,
e
.
getMessage
());
logger
.
debug
(
"解析数据出现错误:{}"
,
e
.
getMessage
());
continue
;
continue
;
}
}
}
}
// 解析最大可寻页码
// 解析最大可寻页码
String
pageNext
=
document
.
select
(
"[id=pagebar_container]>a"
).
text
();
String
pageNext
=
document
.
select
(
"[id=pagebar_container]>a"
).
text
();
if
(
pageNext
.
contains
(
"下一页"
))
{
if
(
pageNext
.
contains
(
"下一页"
))
{
page
++;
page
++;
}
else
{
}
else
{
f
=
false
;
f
=
false
;
}
}
//
logger.info("数据总页数为:{}", page);
//
logger.info("数据总页数为:{}", page);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
debug
(
"获取数据出现问题:{}"
,
e
.
getMessage
());
logger
.
debug
(
"获取数据出现问题:{}"
,
e
.
getMessage
());
return
null
;
return
null
;
}
}
}
else
}
else
{
{
logger
.
info
(
"根据关键词获取微信文章失败,返回的数据结果集: {}"
,
htmlBody
);
logger
.
info
(
"根据关键词获取微信文章失败,返回的数据结果集: {}"
,
htmlBody
);
}
}
//
ZhiWeiTools.sleep(100);
//
ZhiWeiTools.sleep(100);
}
}
return
result
;
return
result
;
}
}
/**
/**
* 获取全文及来源
* 获取全文及来源
* @param url
* @param url
...
@@ -163,45 +155,68 @@ public class WechatAritcleSearch {
...
@@ -163,45 +155,68 @@ public class WechatAritcleSearch {
* @return
* @return
* @throws IOException
* @throws IOException
*/
*/
private
static
WechatAricle
getContentAndSource
(
String
url
,
Proxy
proxy
,
Map
<
String
,
String
>
headerMap
,
WechatAricle
wechatAricle
)
throws
IOException
{
private
static
WechatAricle
getContentAndSource
(
String
url
,
Proxy
proxy
,
WechatAricle
wechatAricle
){
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
).
body
().
string
();
try
{
if
(
htmlBody
!=
null
){
// String htmlBody = HttpClientTemplateOK.get(url, proxy, null);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
).
body
().
string
();
String
content
=
document
.
select
(
"div#js_content"
).
text
();
if
(
htmlBody
!=
null
){
String
source
=
document
.
select
(
"a#js_name"
).
text
();
Document
document
=
Jsoup
.
parse
(
htmlBody
);
if
(
content
!=
null
){
String
content
=
null
;
wechatAricle
.
setContent
(
content
);
String
source
=
null
;
}
String
text
=
null
;
if
(
source
!=
null
){
if
(
htmlBody
.
contains
(
"js_article"
)){
wechatAricle
.
setSource
(
source
);
content
=
document
.
select
(
"div#js_article"
).
text
();
}
else
if
(
htmlBody
.
contains
(
"js_share_content"
)){
content
=
document
.
select
(
"div#js_share_content"
).
text
();
}
if
(
htmlBody
.
contains
(
"content_tpl"
)){
text
=
document
.
select
(
"script#content_tpl"
).
html
();
text
=
Jsoup
.
parse
(
text
).
text
();
}
content
=
content
+
text
;
if
(
htmlBody
.
contains
(
"js_name"
)){
source
=
document
.
select
(
"a#js_name"
).
text
().
trim
();
}
else
if
(
htmlBody
.
contains
(
"account_nickname"
)){
source
=
document
.
select
(
"div.account_nickname"
).
text
().
trim
();
}
// System.out.println(source+"=========="+content);
if
(
content
!=
null
&&
content
.
length
()>
50
){
wechatAricle
.
setContent
(
content
);
}
if
(
source
!=
null
&&
content
.
length
()>
0
){
wechatAricle
.
setSource
(
source
);
}
}
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
wechatAricle
;
}
}
return
wechatAricle
;
return
wechatAricle
;
}
}
// public static void main(String[] args) {
// String url = "https://mp.weixin.qq.com/s?src=11&timestamp=1540521001&ver=1205&signature=12dtyhMA3Xi7lzUhGUFyEpJmWPlnaLAwDVXMUi-tcFXHJbIYDKuLm76sdQUAZxkEjyGby22amJ4AnxIM4oS0ivtAS6ibs4F3OO8-jwoFLk4Pd6d8AhZdj94Z1gQdhdIQ&new=1";
// getContentAndSource(url, null, null);
// }
/**
/**
* @Title: getOpenId
* @Title: getOpenId
* @Description: TODO(获取微信wxID)
* @Description: TODO(获取微信wxID)
* @param @param wxId
* @param @param
* @param @return 设定文件
* wxId
* @param @return
* 设定文件
* @return String 返回类型
* @return String 返回类型
*/
*/
public
static
String
getOpenId
(
String
wxId
)
public
static
String
getOpenId
(
String
wxId
)
{
{
String
openId
=
null
;
String
openId
=
null
;
String
url
=
"http://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query="
+
wxId
;
String
url
=
"http://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query="
+
wxId
;
Map
<
String
,
String
>
headerMap
=
Tools
.
getWechatHeader
();
Map
<
String
,
String
>
headerMap
=
Tools
.
getWechatHeader
();
String
htmlBody
;
String
htmlBody
;
try
{
try
{
htmlBody
=
HttpClientTemplateOK
.
get
(
url
,
null
,
headerMap
);
htmlBody
=
HttpClientTemplateOK
.
get
(
url
,
null
,
headerMap
);
if
(
htmlBody
!=
null
)
if
(
htmlBody
!=
null
)
{
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
);
openId
=
json
.
getString
(
"openid"
);
openId
=
json
.
getString
(
"openid"
);
}
}
...
@@ -209,8 +224,8 @@ public class WechatAritcleSearch {
...
@@ -209,8 +224,8 @@ public class WechatAritcleSearch {
openId
=
null
;
openId
=
null
;
e
.
printStackTrace
();
e
.
printStackTrace
();
}
}
return
openId
;
return
openId
;
}
}
}
}
src/test/java/com/zhiwei/wechat/example/WechatSearchExample.java
View file @
87c407d1
//package com.zhiwei.wechat.example;
package
com
.
zhiwei
.
wechat
.
example
;
//
//import java.io.UnsupportedEncodingException;
import
java.io.UnsupportedEncodingException
;
//import java.net.UnknownHostException;
import
java.net.UnknownHostException
;
//import java.util.ArrayList;
import
java.util.ArrayList
;
//import java.util.List;
import
java.util.List
;
//
//import org.junit.Test;
import
org.slf4j.Logger
;
//import org.slf4j.Logger;
import
org.slf4j.LoggerFactory
;
//import org.slf4j.LoggerFactory;
//
import
com.zhiwei.wechat.entity.WechatAricle
;
//import com.zhiwei.wechat.entity.WechatAricle;
import
com.zhiwei.wechat.search.WechatAritcleSearch
;
//import com.zhiwei.wechat.search.WechatAritcleSearch;
import
com.zhiwei.wechat.util.Tools
;
//import com.zhiwei.wechat.util.Tools;
//
/**
///**
* @ClassName: WechatSearchExample
// * @ClassName: WechatSearchExample
* @Description: TODO(根据关键词等采集数据)
// * @Description: TODO(根据关键词等采集数据)
* @author hero
// * @author hero
* @date 2016年12月16日 上午9:15:42
// * @date 2016年12月16日 上午9:15:42
*/
// */
public
class
WechatSearchExample
{
//public class WechatSearchExample{
//
private
Logger
logger
=
LoggerFactory
.
getLogger
(
WechatSearchExample
.
class
);
// private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class);
//
// public static String cookie = "IPLOC=CN3302; SUID=EAD6E7733220910A000000005941E93A; SUV=1497491773102567; ABTEST=7|1497603317|v1; weixinIndexVisited=1; ppinf=5|1498107937|1499317537|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToyNzolRTYlOUElOTclRTYlQkElOUYlRTYlODMlOTF8Y3J0OjEwOjE0OTgxMDc5Mzd8cmVmbmljazoyNzolRTYlOUElOTclRTYlQkElOUYlRTYlODMlOTF8dXNlcmlkOjQ0Om85dDJsdUJ6dUhpQ2IxcnB3OUZ0QWk4WTN5S0lAd2VpeGluLnNvaHUuY29tfA; pprdig=I4bAcCm_wsn8RDnyejcfFQ-1gxkd2q3VhMOcLSGlyEXZaT3Oq0fbbNN1wslhlmUEMSAMcqhwDG46ZYpKwnHMjFWGtWLqB0qzu8HfI0uCja08CIEt6hWICe66kYCzJNvEiXuu26wBjE47Zivcb8p4XD1CSxh5qRl59DYYDFXIrzM; sgid=08-27429961-AVlLUCFlKgO0FEox1ElfuR0; ld=Jlllllllll2ByW6ElllllVOUXJkllllltMKQfkllllwlllll4ylll5@@@@@@@@@@; LSTMV=405%2C353; LCLKINT=8709; SNUID=B08DBC295B5F0970DCAD6F2C5B1D68B2; ppmdig=1498817001000000c7e9b5e47114b70495487a6f03e36c6c; JSESSIONID=aaavdFFFwNH4Y_-_f0OZv; sct=10";
public
static
void
main
(
String
[]
args
)
{
//
try
{
// @Test
WechatSearchExample
.
wechatSearchExample
();
// public void wechatSearchExample() throws UnknownHostException
}
catch
(
UnknownHostException
e
)
{
// {
e
.
printStackTrace
();
// List<String> wordList = new ArrayList<String>();
}
// wordList.add("王石");
}
//
// String fileName = "E:\\微博mid.csv";
// List<String> wechatIds= Tools.getFileName(fileName);
public
static
void
wechatSearchExample
()
throws
UnknownHostException
//
{
// for(String word : wordList)
List
<
String
>
wordList
=
new
ArrayList
<
String
>();
// {
wordList
.
add
(
"马化腾 知乎"
);
//
// try {
for
(
String
word
:
wordList
)
// List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, "2017-06-25", "2017-06-25", cookie);
{
// } catch (UnsupportedEncodingException e) {
try
{
// e.printStackTrace();
List
<
WechatAricle
>
list
=
WechatAritcleSearch
.
wechatKeywordSearch
(
word
,
5
,
"2018-10-25"
,
"2018-10-25"
,
null
,
null
);
// } catch (Exception e) {
}
catch
(
UnsupportedEncodingException
e
)
{
// e.printStackTrace();
e
.
printStackTrace
();
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
// for(String wxId : wechatIds)
// {
// try {
// logger.info("需要采集的wxId:::{}", wxId);
//
// } catch (UnsupportedEncodingException e) {
// e.printStackTrace();
// } catch (Exception e) {
// e.printStackTrace();
// }
// }
// }
//// for(String wxId : wechatIds)
}
//// {
}
//// try {
//// logger.info("需要采集的wxId:::{}", wxId);
////
}
//// } catch (UnsupportedEncodingException e) {
//// e.printStackTrace();
//// } catch (Exception e) {
//// e.printStackTrace();
//// }
//// }
// }
// }
//
//
//}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment