Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
247e637d
Commit
247e637d
authored
Nov 30, 2018
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
douban topic crawler
parent
36eb5887
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
236 additions
and
0 deletions
+236
-0
src/main/java/com/zhiwei/parse/Douban.java
+141
-0
src/main/java/com/zhiwei/parse/analysis/DoubanCommentAnalysis.java
+66
-0
src/test/java/com/zhiwei/Comment/DoubanCommentTest.java
+29
-0
No files found.
src/main/java/com/zhiwei/parse/Douban.java
0 → 100644
View file @
247e637d
package
com
.
zhiwei
.
parse
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.apache.poi.xwpf.usermodel.BodyElementType
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.parse.analysis.DoubanCommentAnalysis
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Static helpers that crawl Douban group search results and the comments of
 * a single Douban group topic.
 *
 * <p>Both entry points page through the site with a fixed pause between
 * requests and stop at the first exception, which is logged rather than
 * rethrown.
 */
public class Douban {

    private static final Logger logger = LoggerFactory.getLogger(Douban.class);

    private static final DoubanCommentAnalysis doubanCommentAnalysis = new DoubanCommentAnalysis();

    private static final HttpBoot httpBoot = new HttpBoot();

    /** Browser-like User-Agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36";

    /** Step of the {@code start} query parameter between search result pages. */
    private static final int TOPICS_PER_PAGE = 20;

    /** Step of the {@code start} query parameter between comment pages. */
    private static final int COMMENTS_PER_PAGE = 100;

    /** Pause between two consecutive requests, in milliseconds. */
    private static final int REQUEST_INTERVAL_MILLIS = 1500;

    /**
     * Crawls Douban group search results for a keyword, newest first.
     *
     * <p>Each returned map holds the keys {@code _id} (topic link),
     * {@code title}, {@code group}, {@code time} and {@code reply_count}.
     * Paging stops as soon as a page contributes no row whose time is at or
     * after {@code stime}, or when a request or parse step throws.
     *
     * @param word   search keyword; it is URL-encoded before being sent
     * @param proxy  proxy handed to the HTTP client
     * @param cookie value of the {@code Cookie} request header
     * @param stime  lower time bound, compared lexicographically against the
     *               {@code title} attribute of the time cell; a {@code null}
     *               value makes the comparison throw, which ends the crawl on
     *               the first row
     * @return the topics collected so far (possibly empty, never {@code null})
     */
    public static List<Map<String, Object>> doubanTopicGetByWord(String word, Proxy proxy, String cookie, String stime) {
        Map<String, String> headerMap = buildHeaders(cookie);
        List<Map<String, Object>> bodyList = new ArrayList<>();
        // The keyword part of the URL never changes, so build it once.
        String searchPrefix = "https://www.douban.com/group/search?q="
                + URLCodeUtil.getURLEncode(word, "utf-8") + "&start=";
        int page = 0;
        boolean more = true;
        while (more) {
            int sizeBeforePage = bodyList.size();
            String url = searchPrefix + page * TOPICS_PER_PAGE + "&cat=1013&sort=time";
            headerMap.put("Referer", url);
            try {
                String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
                if (htmlBody == null) {
                    // Without a body there is nothing to page over; stopping
                    // here avoids requesting further pages forever.
                    more = false;
                } else {
                    Elements rows = Jsoup.parse(htmlBody).select("div.topics").select("tr.pl");
                    for (Element row : rows) {
                        String time = row.select("td.td-time").attr("title");
                        if (time.compareTo(stime) < 0) {
                            continue;
                        }
                        Elements subjectLinks = row.select("td.td-subject").select("a");
                        Elements cells = row.select("td");
                        Map<String, Object> map = new HashMap<>();
                        map.put("_id", subjectLinks.attr("href"));
                        map.put("title", subjectLinks.text());
                        // The group name is read from the fourth cell; a short
                        // row yields an empty name instead of aborting the crawl.
                        map.put("group", cells.size() > 3 ? cells.get(3).text() : "");
                        map.put("time", time);
                        map.put("reply_count", parseReplyCount(row.select("td.td-reply").select("span").text()));
                        bodyList.add(map);
                    }
                    // A page that added nothing means every row was older than stime.
                    if (sizeBeforePage == bodyList.size()) {
                        more = false;
                    }
                    logger.info("采集到 第 {} 页 ,一共采集到 {} 条 是否有下一页 {}", page, bodyList.size(), more);
                }
                ZhiWeiTools.sleep(REQUEST_INTERVAL_MILLIS);
                page++;
            } catch (Exception e) {
                more = false;
                logger.error("豆瓣 topic 采集出错 {}", e);
            }
        }
        return bodyList;
    }

    /**
     * Crawls the comments of a Douban group topic.
     *
     * <p>Any fragment ({@code #...}) is removed from {@code url}, then pages
     * are requested with {@code ?start=0, 100, 200, ...}. Paging continues
     * only while a page contributes between 96 and 104 entries; any other
     * count, or an exception, ends the crawl.
     *
     * @param url    topic URL, optionally carrying a fragment
     * @param proxy  proxy handed to the HTTP client
     * @param cookie value of the {@code Cookie} request header
     * @return the comments collected so far (possibly empty, never {@code null})
     */
    public static List<Map<String, Object>> getDoubanComment(String url, Proxy proxy, String cookie) {
        // substring instead of split("#")[0]: split returns an empty array
        // for the input "#", which would throw outside the try block.
        int fragmentStart = url.indexOf('#');
        if (fragmentStart >= 0) {
            url = url.substring(0, fragmentStart);
        }
        Map<String, String> headerMap = buildHeaders(cookie);
        List<Map<String, Object>> dataList = new ArrayList<>();
        boolean more = true;
        int page = 0;
        while (more) {
            try {
                String result = httpBoot.syncCall(
                        RequestUtils.wrapGet(url + "?start=" + page * COMMENTS_PER_PAGE, headerMap), proxy)
                        .body().string();
                int sizeBeforePage = dataList.size();
                dataList.addAll(doubanCommentAnalysis.getData(result));
                page++;
                int added = dataList.size() - sizeBeforePage;
                if (added <= 95 || added >= 105) {
                    more = false;
                }
                ZhiWeiTools.sleep(REQUEST_INTERVAL_MILLIS);
                logger.info("评论采集到 第 {} 页 ,一共采集到 {} 条数据 ,more : {}", page, dataList.size(), more);
            } catch (Exception e) {
                logger.error("Exception {}", e);
                more = false;
            }
        }
        return dataList;
    }

    /** Builds the request headers shared by both crawlers. */
    private static Map<String, String> buildHeaders(String cookie) {
        Map<String, String> headerMap = new HashMap<>();
        headerMap.put("Host", "www.douban.com");
        headerMap.put("User-Agent", USER_AGENT);
        headerMap.put("Cookie", cookie);
        return headerMap;
    }

    /** Reads the number in front of "回应" in a reply cell; returns 0 when none can be parsed. */
    private static int parseReplyCount(String replyText) {
        String[] parts = replyText.split("回应");
        if (parts.length == 0) {
            return 0;
        }
        String digits = parts[0].trim();
        if (digits.isEmpty()) {
            return 0;
        }
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException e) {
            return 0;
        }
    }
}
src/main/java/com/zhiwei/parse/analysis/DoubanCommentAnalysis.java
0 → 100644
View file @
247e637d
package
com
.
zhiwei
.
parse
.
analysis
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.apache.commons.lang3.math.NumberUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
/**
 * Parses the HTML of a Douban group topic page into one map per comment.
 */
public class DoubanCommentAnalysis {

    private static final Logger logger = LoggerFactory.getLogger(DoubanCommentAnalysis.class);

    /**
     * Extracts the comments found under {@code #comments} in a topic page.
     *
     * <p>Every {@code li} below {@code #comments} produces one map with the
     * keys {@code source} (text of the author link), {@code time},
     * {@code content}, {@code id} (the {@code data-cid} attribute) and
     * {@code like} (see {@link #getLikeNum(String, String)}).
     *
     * @param result raw HTML of the topic page
     * @return the parsed comments, or an empty list when parsing throws
     */
    public List<Map<String, Object>> getData(String result) {
        try {
            List<Map<String, Object>> bodyList = new ArrayList<>();
            Document doc = Jsoup.parse(result);
            Elements elements = doc.select("#comments").select("li");
            for (Element element : elements) {
                Map<String, Object> map = new HashMap<>();
                String source = element.select("div.reply-doc.content > div.bg-img-green > h4 > a").text();
                String time = element.select("div.reply-doc.content > div.bg-img-green > h4 > span").text();
                String content = element.select("div.reply-doc.content > p").text();
                String id = element.select("li").attr("data-cid");
                map.put("source", source);
                map.put("time", time);
                map.put("content", content);
                map.put("id", id);
                // Without an id the lookup key would be just "c" and could
                // match unrelated keys in the page, so report 0 instead.
                map.put("like", id.isEmpty() ? 0 : getLikeNum(result, "c" + id));
                bodyList.add(map);
            }
            return bodyList;
        } catch (Exception e) {
            logger.error("解析错误 {}", e);
        }
        return Collections.emptyList();
    }

    /**
     * Looks up the like count stored in the page as {@code <id>":<number>}.
     *
     * <p>The previous pattern ended in the character class {@code [0,5]}
     * where a quantifier was evidently intended, so it only recognised
     * counts whose second character was {@code 0}, {@code 5} or a comma.
     * The whole run of digits after the colon is now captured.
     *
     * @param result raw page text to search
     * @param id     lookup key, i.e. {@code "c"} followed by the comment id
     * @return the first parsable count for {@code id}, or 0 when none is found
     */
    private int getLikeNum(String result, String id) {
        // quote() keeps regex metacharacters in the id from changing the pattern.
        Matcher matcher = Pattern.compile(Pattern.quote(id) + "\":\\s*(\\d+)").matcher(result);
        while (matcher.find()) {
            try {
                return Integer.parseInt(matcher.group(1));
            } catch (NumberFormatException e) {
                // Digit run too large for an int; try the next occurrence.
            }
        }
        return 0;
    }
}
src/test/java/com/zhiwei/Comment/DoubanCommentTest.java
0 → 100644
View file @
247e637d
package
com
.
zhiwei
.
Comment
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Douban
;
/**
 * Manual end-to-end check: crawls the comments of one Douban topic and
 * exports them to an Excel workbook.
 */
public class DoubanCommentTest {

    /**
     * Fetches the comments of a fixed topic without a proxy and writes the
     * columns source, time, like, content and id to an .xlsx file.
     */
    @Test
    public void f() {
        String topicUrl = "https://www.douban.com/group/topic/72528866/";
        // NOTE(review): this looks like a cookie captured from a logged-in
        // browser session - consider loading it from configuration instead.
        String sessionCookie = "bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280; dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543562805%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_ses.100001.8cb4=*; douban-profile-remind=1; loc-last-index-location-id=\"118173\"; _vwo_uuid_v2=D85F60C118B0AF465035D9CC7BBFDA7A6|4bf255e1e3a2e9aeede3708192f5f1bc; __utma=30149280.824403997.1543559458.1543562809.1543564973.3; __utmz=30149280.1543564973.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.2.1543566557.1543559542.; __utmb=30149280.70.5.1543566539352";

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        List<Map<String, Object>> comments = Douban.getDoubanComment(topicUrl, null, sessionCookie);

        // Column order of the exported sheet.
        List<String> columns = new ArrayList<>();
        for (String column : new String[] {"source", "time", "like", "content", "id"}) {
            columns.add(column);
        }

        // NOTE(review): the output path is an absolute Windows path - verify
        // it exists on the machine running this test.
        excel.exportExcel("D://crawlerdata//自媒体/douban评论采集-2.xlsx", "asd", columns, comments);
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment