Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
W
weiboDomain
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
xuyimeng
weiboDomain
Commits
306c37e1
Commit
306c37e1
authored
Mar 16, 2018
by
chenweitao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修复了些不规范代码导致的bug
parent
a402132e
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
37 additions
and
29 deletions
+37
-29
src/main/java/com/zhiweidata/weiboDomain/crawler/JsoupHtml.java
+15
-10
src/main/java/com/zhiweidata/weiboDomain/quartz/crawlerQuartz.java
+1
-1
src/main/java/com/zhiweidata/weiboDomain/service/MongoSerivce.java
+17
-11
src/main/java/com/zhiweidata/weiboDomain/start/Start.java
+4
-7
No files found.
src/main/java/com/zhiweidata/weiboDomain/crawler/JsoupHtml.java
View file @
306c37e1
...
...
@@ -173,21 +173,26 @@ public class JsoupHtml {
str
=
json
.
getString
(
"html"
);
doc
=
Jsoup
.
parse
(
str
);
Elements
a
=
doc
.
getElementsBy
Tag
(
"a
"
);
Elements
a
=
doc
.
getElementsBy
Class
(
"page S_txt1
"
);
int
num
=
0
;
for
(
Element
e
:
a
)
{
if
(
"page"
.
equals
(
e
.
attr
(
"bpfilter"
))
&&
"page S_txt1"
.
equals
(
e
.
attr
(
"class"
)))
{
if
(
Integer
.
parseInt
(
e
.
text
())
>
num
)
{
num
=
Integer
.
parseInt
(
e
.
text
());
}
}
if
(
"page"
.
equals
(
a
.
last
().
attr
(
"bpfilter"
)))
{
num
=
Integer
.
parseInt
(
a
.
last
().
text
());
}
// for (Element e : a)
// {
// if ("page".equals(e.attr("bpfilter")) && "page S_txt1".equals(e.attr("class")))
// {
// if (Integer.parseInt(e.text()) > num)
// {
// num = Integer.parseInt(e.text());
// }
// }
// }
return
num
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
0
;
}
...
...
src/main/java/com/zhiweidata/weiboDomain/quartz/crawlerQuartz.java
View file @
306c37e1
...
...
@@ -36,7 +36,7 @@ public class crawlerQuartz {
long
start
=
System
.
currentTimeMillis
();
String
cookie
=
"
SINAGLOBAL=1413878352487.6208.1509610656233; UM_distinctid=16053b396fea93-0a1b0a92fd2d7-5f19311c-1fa400-16053b396ffa9d; un=15757871020; UOR=,www.weibo.com,www.baidu.com; login_sid_t=c3935e41ed072cfa20cbd1462a51e1b6; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=3630121226743.0815.1520321567340; ULV=1520321568035:15:2:1:3630121226743.0815.1520321567340:1519887954422; SSOLoginState=1520321599; SCF=AuCAEA8HE7llKecGgDC9mXez57Y8TrXZB-bbR3xw1Rg_-hZyItSqXebc8xU1toDVlo8zXAxor6wCq9fcmjOxtlI.; SUHB=0FQrE4a4pmNdGS; ALF=1551948727; SUB=_2AkMt_rd6f8NxqwJRmP0RzWPrbI90wg3EieKbokahJRMxHRl-yT83qlc5tRB6Bn6ZlSJDljRT4MPLb3O5AT8RtbcROuaE; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_; YF-Page-G0=aabeaa17d9557111c805fb15a9959531
"
;
String
cookie
=
"
YF-Page-G0=b98b45d9bba85e843a07e69c0880151a; SUB=_2AkMt9Wyxf8NxqwJRmP0RzWPrbI90wg3EieKbqZ1qJRMxHRl-yj83qhwMtRB6BnVCXs0VTavYQWuC4hgT0djGew9Twhmm; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_
"
;
serice
.
crawlerData
(
cookie
);
long
end
=
System
.
currentTimeMillis
();
log
.
info
(
time
+
"次运行耗时:"
+
(
end
-
start
)
+
"\t毫秒"
);
...
...
src/main/java/com/zhiweidata/weiboDomain/service/MongoSerivce.java
View file @
306c37e1
...
...
@@ -15,6 +15,7 @@ import java.util.ArrayList;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
java.util.Random
;
import
javax.annotation.Resource
;
...
...
@@ -49,26 +50,31 @@ public class MongoSerivce {
public
void
crawlerData
(
String
cookie
)
{
Map
<
String
,
String
>
map
=
groupSet
();
for
(
String
domain
:
map
.
keySet
())
{
String
domainId
=
map
.
get
(
domain
);
log
.
info
(
"【{}】页开始爬取..............."
,
domain
);
int
sum
=
parseAndInsert
(
domain
,
domainId
,
cookie
);
log
.
info
(
"【{}】页所有数据爬取结束..............."
,
domain
);
tagDao
.
updateByState
(
domain
,
2
);
log
.
info
(
"【{}】所有页数据存储成功,共计【{}】条数据"
,
domain
,
sum
);
for
(
Entry
<
String
,
String
>
en
:
map
.
entrySet
())
{
log
.
info
(
"【{}】页开始爬取..............."
,
en
.
getKey
());
int
sum
=
parseAndInsert
(
en
.
getKey
(),
en
.
getValue
(),
cookie
);
log
.
info
(
"【{}】页所有数据爬取结束..............."
,
en
.
getKey
());
tagDao
.
updateByState
(
en
.
getKey
(),
2
);
log
.
info
(
"【{}】所有页数据存储成功,共计【{}】条数据"
,
en
.
getKey
(),
sum
);
}
log
.
info
(
"所有页面爬取结束,程序结束"
);
tagDao
.
findAll
().
forEach
(
a
->
tagDao
.
updateByState
(
a
.
getDomain
(),
0
));
log
.
info
(
"所有页面爬取结束,程序结束,重置所有主标签状态"
);
}
private
int
getPageNum
(
String
domainId
,
String
cookie
)
{
int
index
=
0
;
while
(
true
)
{
String
page
=
crawler
.
getPage
(
domainId
,
cookie
);
crawler
.
sleep
(
3000L
);
String
page
=
crawler
.
getPage
(
domainId
,
cookie
);
int
num
=
jsoupHtml
.
parsePage
(
page
);
if
(
num
!=
0
)
{
return
num
;
}
if
(++
index
>
10
)
{
log
.
error
(
"【{}】未获取到页码"
);
return
0
;
}
}
}
...
...
@@ -158,6 +164,6 @@ public class MongoSerivce {
tagDao
.
updateByState
(
key
,
0
);
}
domainDao
.
createColl
();
//
domainDao.createColl();
}
}
src/main/java/com/zhiweidata/weiboDomain/start/Start.java
View file @
306c37e1
...
...
@@ -23,19 +23,16 @@ import com.zhiweidata.weiboDomain.service.MongoSerivce;
* @date 2018年2月23日 下午3:09:33
*/
public
class
Start
{
//
private static ApplicationContext ctx = new ClassPathXmlApplicationContext("applicationContext.xml");
//
private static MongoSerivce serice = ctx.getBean(MongoSerivce.class);
private
static
ApplicationContext
ctx
=
new
ClassPathXmlApplicationContext
(
"applicationContext.xml"
);
private
static
MongoSerivce
serice
=
ctx
.
getBean
(
MongoSerivce
.
class
);
public
static
void
main
(
String
[]
args
)
{
ApplicationContext
ctx
=
new
ClassPathXmlApplicationContext
(
"applicationContext.xml"
);
System
.
out
.
println
(
"微博热门榜单采集开始..."
);
//程序主体切换至com.zhiweidata.weiboDomain.quartz定时器
// String cookie = "SINAGLOBAL=1413878352487.6208.1509610656233; UM_distinctid=16053b396fea93-0a1b0a92fd2d7-5f19311c-1fa400-16053b396ffa9d; un=15757871020; UOR=,www.weibo.com,www.baidu.com; login_sid_t=c3935e41ed072cfa20cbd1462a51e1b6; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=3630121226743.0815.1520321567340; ULV=1520321568035:15:2:1:3630121226743.0815.1520321567340:1519887954422; SSOLoginState=1520321599; SCF=AuCAEA8HE7llKecGgDC9mXez57Y8TrXZB-bbR3xw1Rg_-hZyItSqXebc8xU1toDVlo8zXAxor6wCq9fcmjOxtlI.; SUHB=0FQrE4a4pmNdGS; ALF=1551948727; SUB=_2AkMt_rd6f8NxqwJRmP0RzWPrbI90wg3EieKbokahJRMxHRl-yT83qlc5tRB6Bn6ZlSJDljRT4MPLb3O5AT8RtbcROuaE; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_; YF-Page-G0=aabeaa17d9557111c805fb15a9959531";
// 初始化程序状态,在再次爬取时调用
// String cookie = "YF-Page-G0=b98b45d9bba85e843a07e69c0880151a; SUB=_2AkMt9Wyxf8NxqwJRmP0RzWPrbI90wg3EieKbqZ1qJRMxHRl-yj83qhwMtRB6BnVCXs0VTavYQWuC4hgT0djGew9Twhmm; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_";
// 初始化程序状态,在再次爬取时调用
// 断点续传时,注释掉
// serice.initTag();
// serice.crawlerData(cookie);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment