Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
H
HttpClient-Jsoup-GetData
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
俞宁
HttpClient-Jsoup-GetData
Commits
84056e85
Commit
84056e85
authored
Aug 27, 2020
by
俞宁
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
对搜狗微信网进行数据的爬取
parent
9b7d8114
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
388 additions
and
0 deletions
+388
-0
src/main/java/com/zhiwei/httpclient/GetKey.java
+33
-0
src/main/java/com/zhiwei/httpclient/GetSignature.java
+47
-0
src/main/java/com/zhiwei/httpclient/GetWebData.java
+165
-0
src/main/java/com/zhiwei/httpclient/IfExistNextPage.java
+89
-0
src/main/java/com/zhiwei/httpclient/StockUtils.java
+54
-0
No files found.
src/main/java/com/zhiwei/httpclient/GetKey.java
0 → 100644
View file @
84056e85
package
com
.
zhiwei
.
httpclient
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
java.util.HashMap
;
import
java.util.Map
;
/**
 * Reads the {@code d} attribute of every result item on one search-result page.
 *
 * <p>Elsewhere in this project the returned keys are used to look up per-item
 * values in an asynchronously loaded JSON document.
 */
public class GetKey {

    /**
     * Downloads search-result page {@code i} and maps each item's {@code d}
     * attribute to the zero-based position of that item in the list.
     *
     * <p>An item without a {@code d} attribute contributes the empty string as
     * its key; when two items share a key, the later position overwrites the
     * earlier one.
     *
     * @param i page number inserted into the {@code page=} query parameter
     * @return map from {@code d} attribute value to item position
     * @throws Exception if the page cannot be fetched
     */
    public static HashMap<String, Integer> getkeyUtils(int i) throws Exception {
        final String pageUrl =
                "https://weixin.sogou.com/weixin?query=%E5%BE%AE%E4%BF%A1&_sug_type_=&s_from=input&_sug_=n&type=&page="
                        + i
                        + "&ie=utf8";

        // Narrow to the result list first, then to the <li> entries inside it.
        final Elements items = Jsoup.connect(pageUrl).get()
                .select("ul.news-list2")
                .select("li");

        final HashMap<String, Integer> positionByKey = new HashMap<String, Integer>();
        for (int position = 0; position < items.size(); position++) {
            positionByKey.put(items.get(position).attr("d"), position);
        }
        return positionByKey;
    }
}
src/main/java/com/zhiwei/httpclient/GetSignature.java
0 → 100644
View file @
84056e85
package
com
.
zhiwei
.
httpclient
;
import
net.sf.json.JSONArray
;
import
net.sf.json.JSONObject
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
javax.script.ScriptEngine
;
import
javax.script.ScriptEngineManager
;
import
java.net.URL
;
public class GetSignature {

    /** Name of the JavaScript global whose value this class extracts. */
    private static final String SIGNATURE_VARIABLE = "account_anti_url";

    /**
     * Fetches {@code url}, finds the inline {@code <script>} blocks that mention
     * {@code account_anti_url}, evaluates them with a JavaScript engine and
     * returns the value of that global variable.
     *
     * <p>If several scripts mention the variable, the value produced by the last
     * one wins. Each script is evaluated in its own fresh engine.
     *
     * <p>NOTE(review): this evaluates script text downloaded from a remote site.
     * That is acceptable only as long as the site is trusted; consider extracting
     * the value with a regular expression instead.
     *
     * @param url address of the page to inspect
     * @return the value of {@code account_anti_url}, or {@code null} when no
     *         inline script defines it
     * @throws IllegalStateException if a matching script is found but the JVM
     *         provides no JavaScript {@link ScriptEngine}
     * @throws Exception if the page cannot be fetched or a script fails to evaluate
     */
    public static Object getsignatgure(String url) throws Exception {
        Object obj = null;

        // Parse the page with jsoup (3 second timeout).
        Document doc = Jsoup.parse(new URL(url), 3000);
        Elements eles = doc.getElementsByTag("script");

        ScriptEngineManager engineManager = new ScriptEngineManager();
        for (Element ele : eles) {
            // Only scripts that mention the variable are of interest.
            if (!ele.toString().contains(SIGNATURE_VARIABLE)) {
                continue;
            }
            // A <script> tag that only mentions the name in an attribute has no
            // inline body; childNode(0) would throw for it.
            if (ele.childNodeSize() == 0) {
                continue;
            }
            // First child node holds the script source without the surrounding tag.
            String script = ele.childNode(0).toString();

            ScriptEngine engine = engineManager.getEngineByName("javascript");
            if (engine == null) {
                // getEngineByName returns null when no matching engine is installed.
                throw new IllegalStateException(
                        "No JavaScript ScriptEngine is available on this JVM; cannot evaluate "
                                + SIGNATURE_VARIABLE);
            }
            engine.eval(script);
            obj = engine.get(SIGNATURE_VARIABLE);
        }
        return obj;
    }
}
src/main/java/com/zhiwei/httpclient/GetWebData.java
0 → 100644
View file @
84056e85
package
com
.
zhiwei
.
httpclient
;
import
com.alibaba.fastjson.JSONObject
;
import
org.seimicrawler.xpath.JXDocument
;
import
org.seimicrawler.xpath.JXNode
;
import
java.io.IOException
;
import
java.text.ParseException
;
import
java.util.*
;
import
static
java
.
lang
.
Thread
.
sleep
;
/**
 * Entry point of the crawler: walks every search-result page, prints the fields
 * of each listed account and then prints the monthly article count obtained
 * from an asynchronously loaded JSON document.
 *
 * <p>The public static counters {@link #x}, {@link #y} and {@link #i} carry the
 * current position between the methods of this class, so the class is not safe
 * for concurrent use.
 */
public class GetWebData {

    /** Search URL up to and including {@code page=}; the page number and suffix are appended. */
    private static final String SEARCH_URL_PREFIX =
            "https://weixin.sogou.com/weixin?query=%E5%BE%AE%E4%BF%A1&_sug_type_=&s_from=input&_sug_=n&type=1&page=";
    private static final String SEARCH_URL_SUFFIX = "&ie=utf8";
    private static final String SITE_ROOT = "https://weixin.sogou.com";
    /** Pause after each processed item, in milliseconds. */
    private static final long ITEM_DELAY_MILLIS = 3000;

    /** 1-based number of the result page being processed; advanced by {@link #main}. */
    public static int x = 1;
    /** Number of items processed so far on the current page; reset by {@link #main} per page. */
    public static int y = 0;
    /** Zero-based index of the item whose async value is being looked up ({@code y - 1}). */
    public static int i = 0;

    /**
     * Determines the number of result pages, then downloads and processes each one.
     *
     * @param args unused
     * @throws Exception if the page count cannot be determined or a page cannot be fetched
     */
    public static void main(String[] args) throws ParseException, IOException, Exception {
        int totalpages = IfExistNextPage.getTotalpages();
        for (int page = 1; page <= totalpages; page++) {
            String content = StockUtils.getHtmlByUrl(SEARCH_URL_PREFIX + page + SEARCH_URL_SUFFIX);
            parseHtmlByXpath(content);
            x++;
            y = 0;
        }
    }

    /**
     * Prints the fields of every result item found in {@code content} and, for
     * each item, fetches and prints its monthly article count.
     *
     * <p>A {@code null} page body is reported and skipped. Required fields that
     * are missing are printed with an empty value. After each item the method
     * sleeps for three seconds; if the thread is interrupted while sleeping, the
     * interrupt flag is restored and the remaining items are skipped.
     *
     * <p>NOTE(review): for every item this re-downloads the search page (to read
     * the signature), the async JSON, and the search page again (inside
     * {@link #parseHtmlByXpath2}). These look invariant per page and could
     * probably be fetched once; left unchanged because the two search URLs differ
     * in their {@code type=} parameter and the equivalence could not be confirmed.
     *
     * @param content HTML of one search-result page, possibly {@code null}
     * @throws ParseException never thrown by the current implementation; kept for callers
     * @throws IOException never thrown by the current implementation; kept for callers
     */
    public static void parseHtmlByXpath(String content) throws ParseException, IOException {
        if (content == null) {
            System.err.println("Page " + x + " returned no content; skipping it.");
            i = 0;
            return;
        }

        JXDocument jxDocument = JXDocument.create(content);
        // Every <li> of the result list is one account entry.
        List<JXNode> jxNodeList = jxDocument.selN("//div[@class='news-box']/ul/li");

        for (JXNode jxNode : jxNodeList) {
            JXNode titleNode = jxNode.selOne("//div[@class='txt-box']/p[@class='tit']/a");
            System.out.println("标题:" + (titleNode == null ? "" : titleNode.asElement().text()));

            JXNode linkNode = jxNode.selOne("//div[@class='gzh-box2']/div[@class='img-box']/a");
            System.out.println("链接:" + (linkNode == null ? "" : linkNode.asElement().attr("href")));

            JXNode idNode = jxNode.selOne("//div[@class='txt-box']/p[@class='info']/label");
            System.out.println("微信号:" + (idNode == null ? "" : idNode.asElement().text()));

            // Optional fields: each XPath is evaluated once and reused.
            printOptionalText(jxNode.selOne("//dl[1]/dd"), "功能介绍:", "没有功能介绍");
            printOptionalText(jxNode.selOne("//dl[2]/dd"), "微信认证:", "没有微信认证");
            printOptionalText(jxNode.selOne("//dl[3]/dd/a"), "最近文章:", "没有最近文章。");

            y = y + 1;

            printMonthlyArticleCount();

            if (!pauseBetweenItems()) {
                // Interrupted: stop instead of continuing without any delay.
                break;
            }
        }
        i = 0;
    }

    /**
     * Looks up, in the async JSON document {@code content}, the value belonging
     * to the item at index {@code y - 1} of page {@link #x} and prints the part
     * before the first comma as the monthly article count.
     *
     * <p>Sets {@link #i} to {@code y - 1} as a side effect.
     *
     * @param content JSON text expected to contain a {@code msg} object keyed by item key
     * @throws IOException if {@code content} is {@code null} or has no {@code msg} object
     * @throws Exception if the JSON cannot be parsed or the item keys cannot be fetched
     */
    public static void parseHtmlByXpath2(String content) throws IOException, Exception {
        i = y - 1;

        if (content == null) {
            throw new IOException("Async response for page " + x + " is empty");
        }
        JSONObject object = (JSONObject) JSONObject.parse(content);
        JSONObject msg = object == null ? null : object.getJSONObject("msg");
        if (msg == null) {
            throw new IOException("Async response for page " + x + " has no \"msg\" object");
        }

        // Item keys of the current page, mapped to their position in the list.
        HashMap<String, Integer> c = GetKey.getkeyUtils(x);
        ArrayList<String> keys = getAllKey(c, i);
        Object value = keys.isEmpty() ? null : msg.get(keys.get(0));

        if (value == null) {
            System.out.println("没有发布文章");
        } else {
            // The value is comma-separated; the first part is the monthly count.
            String[] strings = value.toString().split(",");
            System.out.println("月发布" + (strings.length > 0 ? strings[0] : "") + "篇文章");
        }
    }

    /**
     * Returns every key of {@code hm} whose value equals {@code value}.
     *
     * @param hm map to search
     * @param value value to look for
     * @return the matching keys, in the map's iteration order; empty if none match
     */
    public static <K> ArrayList<K> getAllKey(HashMap<K, ?> hm, Integer value) {
        ArrayList<K> list = new ArrayList<K>();
        for (Map.Entry<K, ?> entry : hm.entrySet()) {
            if (Objects.equals(entry.getValue(), value)) {
                list.add(entry.getKey());
            }
        }
        return list;
    }

    /** Prints {@code label} plus the node's text, or {@code missingMessage} when the node is absent. */
    private static void printOptionalText(JXNode node, String label, String missingMessage) {
        if (node == null) {
            System.out.println(missingMessage);
        } else {
            System.out.println(label + node.asElement().text());
        }
    }

    /** Fetches the signature and async JSON for the current page and prints the current item's count. */
    private static void printMonthlyArticleCount() {
        try {
            Object signature = GetSignature.getsignatgure(SEARCH_URL_PREFIX + x + SEARCH_URL_SUFFIX);
            if (signature == null) {
                System.err.println("No signature found on page " + x + "; skipping monthly article count.");
                return;
            }
            // The signature is a site-relative path to the async JSON document.
            String ybcontent = StockUtils.getHtmlByUrl(SITE_ROOT + signature);
            parseHtmlByXpath2(ybcontent);
        } catch (Exception e) {
            // Best effort: a failure for one item must not abort the crawl.
            e.printStackTrace();
        }
    }

    /** Sleeps between items; returns {@code false} (with the interrupt flag restored) if interrupted. */
    private static boolean pauseBetweenItems() {
        try {
            sleep(ITEM_DELAY_MILLIS);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }
}
src/main/java/com/zhiwei/httpclient/IfExistNextPage.java
0 → 100644
View file @
84056e85
package
com
.
zhiwei
.
httpclient
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
javax.script.ScriptEngine
;
import
javax.script.ScriptEngineManager
;
import
java.net.URL
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
static
java
.
lang
.
Thread
.
getDefaultUncaughtExceptionHandler
;
import
static
java
.
lang
.
Thread
.
sleep
;
/**
 * Counts the search-result pages by following the "next page" link of the
 * pagination bar until a page no longer offers one.
 */
public class IfExistNextPage {

    private static final String SEARCH_URL = "https://weixin.sogou.com/weixin";
    /** Query string of the first result page. */
    private static final String FIRST_PAGE_QUERY =
            "?query=%E5%BE%AE%E4%BF%A1&_sug_type_=&s_from=input&_sug_=n&type=1&page=1&ie=utf8";
    /** Value of the {@code uigs} attribute that marks the "next page" link. */
    private static final String NEXT_PAGE_MARKER = "page_next";
    /** Pause between two page requests, in milliseconds. */
    private static final long REQUEST_DELAY_MILLIS = 1000;

    /**
     * Walks the result pages starting at page 1 and returns how many were fetched.
     *
     * <p>On each page the last anchor inside {@code div.p-fy} is inspected: if its
     * {@code uigs} attribute is {@code page_next}, its {@code href} is followed;
     * otherwise the walk ends. A page without any pagination anchors also ends
     * the walk, so such a page can no longer cause the previous link to be
     * fetched over and over.
     *
     * <p>The method pauses one second between requests. If the thread is
     * interrupted during the pause, the interrupt flag is restored and the count
     * reached so far is returned.
     *
     * @return number of pages fetched (at least 1 when the first request succeeds)
     * @throws Exception if a page cannot be fetched
     */
    public static int getTotalpages() throws Exception {
        int pageCount = 0;
        String nexturl = FIRST_PAGE_QUERY;

        while (true) {
            Document document = Jsoup.connect(SEARCH_URL + nexturl).get();
            pageCount++;

            Elements links = document.select("div.p-fy").select("a");
            if (links.isEmpty()) {
                // No pagination bar: single page of results, or an unexpected page.
                break;
            }

            // The "next page" link, when present, is the last anchor of the bar.
            Element lastLink = links.get(links.size() - 1);
            if (!NEXT_PAGE_MARKER.equals(lastLink.attr("uigs"))) {
                break;
            }
            nexturl = lastLink.attr("href");
            if (nexturl.isEmpty()) {
                break;
            }

            try {
                sleep(REQUEST_DELAY_MILLIS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                break;
            }
        }
        return pageCount;
    }

    /**
     * Returns every key of {@code hm} whose value equals {@code value}.
     *
     * @param hm map to search
     * @param value value to look for
     * @return the matching keys, in the map's iteration order; empty if none match
     */
    public static <K> ArrayList<K> getAllKey(HashMap<K, ?> hm, Integer value) {
        ArrayList<K> list = new ArrayList<K>();
        for (K key : hm.keySet()) {
            Object candidate = hm.get(key);
            boolean matches = candidate == null ? value == null : candidate.equals(value);
            if (matches) {
                list.add(key);
            }
        }
        return list;
    }
}
src/main/java/com/zhiwei/httpclient/StockUtils.java
0 → 100644
View file @
84056e85
package
com
.
zhiwei
.
httpclient
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.HttpResponse
;
import
org.apache.http.HttpStatus
;
import
org.apache.http.client.methods.HttpGet
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.HttpClients
;
import
org.apache.http.util.EntityUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
java.io.IOException
;
/**
 * HTTP helper that downloads a page while presenting browser-like request headers.
 */
public class StockUtils {

    /**
     * Performs a GET request for {@code url} and returns the response body
     * decoded as UTF-8.
     *
     * <p>Failures are reported on standard output and result in {@code null}:
     * this covers both exceptions during the request and any status other than
     * 200.
     *
     * <p>NOTE(review): {@code Referer} and {@code cookie} are each added twice
     * with different values, so both values are sent. Which one the server
     * honours is not visible here; confirm and keep a single value of each.
     *
     * <p>NOTE(review): the cookie values are hard-coded session data. They will
     * expire and should not live in source control; consider loading them from
     * configuration.
     *
     * @param url address to fetch
     * @return the response body, or {@code null} if the request failed, returned
     *         a non-200 status, or had no entity
     * @throws IOException declared for callers; request failures are caught and
     *         reported, not propagated
     */
    public static String getHtmlByUrl(String url) throws IOException {
        String html = null;

        // Build the request before opening the client so that a rejected URL
        // cannot leave an unclosed client behind.
        HttpGet httpGet = new HttpGet(url);
        // Present browser-like headers.
        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36");
        httpGet.addHeader("Referer", "https://weixin.sogou.com/weixin?type=1&s_from=input&query=%E5%BE%AE%E4%BF%A1&ie=utf8&_sug_=y&_sug_type_=&w=01019900&sut=3104&sst0=1598233518739&lkt=1%2C1598233518638%2C1598233518638");
        httpGet.addHeader("cookie", "ssuid=7721315200; IPLOC=CN3302; SUID=FD3CB33C3118960A000000005E6EC315; SUV=1589806411593000; pgv_pvi=8877030400; sw_uuid=2356830386; wuid=AAGybl+xLwAAAAqLMX9P4QUAGwY=; CXID=2A332FD98F354B28467C13BEE5751651; LCLKINT=1823; LSTMV=522%2C185; ABTEST=0|1597799749|v1; weixinIndexVisited=1; JSESSIONID=aaaXXkIyD8HIAcrOi3Yox; PHPSESSID=nv30eatifl87ngsfpb2bldcg56; SNUID=B6E081A7C2C76F58857D3FBAC250DF69; seccodeRight=success; successCount=1|Mon, 24 Aug 2020 07:14:34 GMT; refresh=1");
        httpGet.addHeader("Referer", "https://weixin.sogou.com/websearch/weixin/pc/anti_account.jsp?t=1598235364118&signature=zEH6t4vtzmgi-lZOgXZ*wnxyO0xCCmI*98uRvLGbuvdOYmdXdNTo0m9UnsAIjFTJswHTHB-*ndspBdCd-6Wcq9DEKhmxAGRLKPE5SE7EJaSzCG4E4uotravrUcaPDoakBuCXin0uTIVmHkN6zhISeLHZQ6c5yaxgjbhTmqab7Gojgm5t6vZ*II-L*Mnj3E5WpD9YtUFSMBlnDNIrph7IKqSMUFrqlohvdt05KiICwetWRjmLUBXxfZK4FfeoALORnYDQQy8ZXvgVlg9M5tftR5HpBPU-or4rNgtH2yMkpVnQ9HV3B5M98cUcStRyaN016eD6DHgUu8ysQKm0dLfvIS0fIqlxF4YKLRL6vCaFeF8GNVUo2rSPZBcl1MQmyjRHxTXLhs4seZ2CLiTPUiNzBaoTiSCuEtlf6pSozZSnx0Y=");
        httpGet.addHeader("cookie", "ssuid=7721315200; IPLOC=CN3302; SUID=FD3CB33C3118960A000000005E6EC315; SUV=1589806411593000; pgv_pvi=8877030400; sw_uuid=2356830386; wuid=AAGybl+xLwAAAAqLMX9P4QUAGwY=; CXID=2A332FD98F354B28467C13BEE5751651; LCLKINT=1823; LSTMV=522%2C185; ABTEST=0|1597799749|v1; weixinIndexVisited=1; SNUID=742341650105AC94BDD0F52601C16B72; JSESSIONID=aaaXXkIyD8HIAcrOi3Yox");

        // try-with-resources closes the client, and with it the connection,
        // on every path.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            HttpResponse httpResponse = httpClient.execute(httpGet);
            int rtValue = httpResponse.getStatusLine().getStatusCode();
            if (rtValue == HttpStatus.SC_OK) {
                HttpEntity entity = httpResponse.getEntity();
                if (entity != null) {
                    html = EntityUtils.toString(entity, "UTF-8");
                }
            } else {
                // Make the reason for a null result visible to the operator.
                System.out.println("访问【" + url + "】返回了状态码:" + rtValue);
            }
        } catch (Exception e) {
            System.out.println("访问【" + url + "】出现了异常!");
            e.printStackTrace();
        }
        return html;
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment