Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
W
weibohotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
weibohotcrawler
Commits
50be39af
Commit
50be39af
authored
Dec 08, 2017
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
微博榜单1小时,社会,热门获取
parents
Show whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
995 additions
and
0 deletions
+995
-0
pom.xml
+109
-0
src/main/java/com/zhiwei/weibocrawler/config/Config.java
+30
-0
src/main/java/com/zhiwei/weibocrawler/crawler/DataCrawlerStart.java
+29
-0
src/main/java/com/zhiwei/weibocrawler/crawler/DataUpdate.java
+65
-0
src/main/java/com/zhiwei/weibocrawler/crawler/GetData.java
+38
-0
src/main/java/com/zhiwei/weibocrawler/crawler/getdata/WeiboBangdanData.java
+40
-0
src/main/java/com/zhiwei/weibocrawler/crawler/getdata/WeiboCrawlerAnalysis.java
+131
-0
src/main/java/com/zhiwei/weibocrawler/crawler/getdata/WeiboHotData.java
+41
-0
src/main/java/com/zhiwei/weibocrawler/crawler/getdata/WeiboSocietyData.java
+43
-0
src/main/java/com/zhiwei/weibocrawler/email/SendEmail.java
+69
-0
src/main/java/com/zhiwei/weibocrawler/httpclient/HttpClientDemo.java
+58
-0
src/main/java/com/zhiwei/weibocrawler/queue/ListQueue.java
+137
-0
src/main/java/com/zhiwei/weibocrawler/rsidClient/DataQueue.java
+61
-0
src/main/java/com/zhiwei/weibocrawler/rsidClient/RsidClientDAO.java
+51
-0
src/main/java/com/zhiwei/weibocrawler/rsidClient/UpdateQueue.java
+70
-0
src/test/java/weibotest/HotWeiboTest.java
+23
-0
No files found.
pom.xml
0 → 100644
View file @
50be39af
<project
xmlns=
"http://maven.apache.org/POM/4.0.0"
xmlns:xsi=
"http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation=
"http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
weibohotcrawler
</artifactId>
<version>
0.0.1-SNAPSHOT
</version>
<name>
weibohotcrawler
</name>
<description>
微博热搜1小时榜单,社会、热点采集程序
</description>
<dependencies>
<dependency>
<groupId>
com.zhiwei.middleware
</groupId>
<artifactId>
rsid-client
</artifactId>
<version>
0.0.2-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
javax.mail
</groupId>
<artifactId>
mail
</artifactId>
<version>
1.4.7
</version>
</dependency>
<dependency>
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
<version>
3.8.1
</version>
<scope>
test
</scope>
</dependency>
<dependency>
<groupId>
org.apache.httpcomponents
</groupId>
<artifactId>
httpcore
</artifactId>
<version>
4.4.6
</version>
</dependency>
<dependency>
<groupId>
org.apache.httpcomponents
</groupId>
<artifactId>
httpclient
</artifactId>
<version>
4.5.3
</version>
</dependency>
<dependency>
<groupId>
org.jsoup
</groupId>
<artifactId>
jsoup
</artifactId>
<version>
1.8.3
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei
</groupId>
<artifactId>
weibobusiness
</artifactId>
<version>
0.0.5-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei
</groupId>
<artifactId>
zhiweiTools
</artifactId>
<version>
0.0.6-SNAPSHOT
</version>
</dependency>
</dependencies>
<!-- 打包管理 -->
<build>
<plugins>
<!-- 发布源码 -->
<plugin>
<artifactId>
maven-source-plugin
</artifactId>
<version>
2.4
</version>
<configuration>
<attach>
true
</attach>
</configuration>
<executions>
<execution>
<phase>
compile
</phase>
<goals>
<goal>
jar
</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>
org.apache.maven.plugins
</groupId>
<artifactId>
maven-javadoc-plugin
</artifactId>
<version>
2.10.4
</version>
</plugin>
<!-- Force UTF-8 in the forked test JVM so Chinese console output from "mvn test" is not garbled -->
<plugin>
<groupId>
org.apache.maven.plugins
</groupId>
<artifactId>
maven-surefire-plugin
</artifactId>
<version>
2.19.1
</version>
<configuration>
<forkMode>
once
</forkMode>
<argLine>
-Dfile.encoding=UTF-8
</argLine>
</configuration>
</plugin>
</plugins>
</build>
<!-- 分发管理:管理distribution和supporting files -->
<distributionManagement>
<snapshotRepository>
<id>
nexus-releases
</id>
<name>
User Project Snapshot
</name>
<url>
http://192.168.0.30:8081/nexus/content/repositories/snapshots/
</url>
<uniqueVersion>
true
</uniqueVersion>
</snapshotRepository>
<repository>
<id>
nexus-releases
</id>
<name>
User Project Release
</name>
<url>
http://192.168.0.30:8081/nexus/content/repositories/releases/
</url>
</repository>
</distributionManagement>
</project>
\ No newline at end of file
src/main/java/com/zhiwei/weibocrawler/config/Config.java
0 → 100644
View file @
50be39af
package
com
.
zhiwei
.
weibocrawler
.
config
;
import
java.io.InputStream
;
import
java.util.Properties
;
/**
 * Holds the rsid-client settings, read once from {@code rsidClient.properties}
 * on the thread context class loader when this class is initialised.
 *
 * <p>If the resource is missing or cannot be read, the problem is reported on
 * standard error and the fields keep their default value of {@code null}.
 */
public class Config {

    /** Value of the {@code rsidUrl} property, or {@code null} if loading failed. */
    public static String rsidUrl;

    /** Value of the {@code rsidGroup} property, or {@code null} if loading failed. */
    public static String rsidGroup;

    /** Value of the {@code redisWeiboKey} property, or {@code null} if loading failed. */
    public static String redisWeiboKey;

    static {
        InputStream is = null;
        try {
            is = Thread.currentThread().getContextClassLoader()
                    .getResourceAsStream("rsidClient.properties");
            if (is == null) {
                // getResourceAsStream signals "not found" with null rather than an exception.
                System.err.println("rsidClient.properties was not found on the classpath");
            } else {
                Properties conf = new Properties();
                conf.load(is);
                rsidUrl = conf.getProperty("rsidUrl");
                rsidGroup = conf.getProperty("rsidGroup");
                redisWeiboKey = conf.getProperty("redisWeiboKey");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close the stream even when load() fails part-way through.
            if (is != null) {
                try {
                    is.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
    }
}
src/main/java/com/zhiwei/weibocrawler/crawler/DataCrawlerStart.java
0 → 100644
View file @
50be39af
package
com
.
zhiwei
.
weibocrawler
.
crawler
;
import
java.util.concurrent.Executors
;
import
java.util.concurrent.ScheduledExecutorService
;
import
java.util.concurrent.TimeUnit
;
import
com.zhiwei.weibocrawler.crawler.getdata.WeiboBangdanData
;
import
com.zhiwei.weibocrawler.crawler.getdata.WeiboHotData
;
import
com.zhiwei.weibocrawler.crawler.getdata.WeiboSocietyData
;
/**
 * Registers the three Weibo list crawlers on one scheduled thread pool.
 */
public class DataCrawlerStart {

    private ScheduledExecutorService scheduled;

    /** Creates the scheduler backed by a pool of four threads. */
    public DataCrawlerStart() {
        scheduled = Executors.newScheduledThreadPool(4);
    }

    /**
     * Schedules each crawler with a fixed delay between the end of one run and
     * the start of the next. All delays are in milliseconds.
     */
    public void start() {
        repeat(new WeiboBangdanData(), 2000, 15 * 1000);
        repeat(new WeiboHotData(), 1000, 20 * 1000);
        repeat(new WeiboSocietyData(), 3000, 19 * 1000);
    }

    /** Hands one task to the scheduler with the given initial and repeat delays (milliseconds). */
    private void repeat(Runnable task, long initialDelayMillis, long delayMillis) {
        scheduled.scheduleWithFixedDelay(task, initialDelayMillis, delayMillis, TimeUnit.MILLISECONDS);
    }
}
src/main/java/com/zhiwei/weibocrawler/crawler/DataUpdate.java
0 → 100644
View file @
50be39af
package
com
.
zhiwei
.
weibocrawler
.
crawler
;
import
java.util.List
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.weibobusiness.weibo4j.model.Status
;
import
com.zhiwei.weibocrawler.crawler.getdata.WeiboCrawlerAnalysis
;
import
com.zhiwei.weibocrawler.rsidClient.DataQueue
;
import
com.zhiwei.weibocrawler.rsidClient.UpdateQueue
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
/**
 * Background worker that takes queued mids from {@link DataQueue}, looks up
 * the corresponding statuses and appends them to {@link UpdateQueue}.
 *
 * <p>The loop wakes once per {@code ZhiWeiTools.sleep(1000)} call and flushes
 * when either more than 60 wake-ups have passed since the last flush or the
 * mid queue holds at least 50 entries. Each flush takes at most 48 mids.
 *
 * @author byte-zbs
 * @Date 2017-12-07 16:06:02
 * @version 1.0.0
 */
public class DataUpdate implements Runnable {

    private static Logger logger = LoggerFactory.getLogger(DataUpdate.class);

    /** Token passed to {@code WeiboCrawlerAnalysis.getWeiboData}. */
    private String token;

    /** Wake-ups counted since the last flush; starts at 1. */
    private int i = 1;

    /**
     * @param token token used for the status lookups
     */
    public DataUpdate(String token) {
        super();
        this.token = token;
        logger.info("更新程序启动------------");
    }

    /**
     * Runs the polling loop forever; any exception inside one iteration is
     * logged and the loop continues.
     */
    @Override
    public void run() {
        while (true) {
            try {
                ZhiWeiTools.sleep(1000);
                if (i > 60 || DataQueue.linkQueue.size() >= 50) {
                    logger.info("此次更新更新队列中数据量" + DataQueue.linkQueue.size());
                    List<String> midList = DataQueue.get(48);
                    if (midList != null && midList.size() > 0) {
                        List<Status> list = WeiboCrawlerAnalysis.getWeiboData(midList, token);
                        // The lookup may return null on failure; treat that as "nothing fetched".
                        if (list != null) {
                            logger.info("更新数据量" + list.size());
                            UpdateQueue.add(list);
                        } else {
                            logger.info("更新数据量" + 0);
                        }
                    }
                    i = 1;
                } else {
                    i++;
                }
            } catch (Exception e) {
                // Pass the exception itself: a message without "{}" would drop e.getMessage().
                logger.error("更新程序出错", e);
            }
        }
    }
}
src/main/java/com/zhiwei/weibocrawler/crawler/GetData.java
0 → 100644
View file @
50be39af
package
com
.
zhiwei
.
weibocrawler
.
crawler
;
import
java.util.Map
;
import
com.zhiwei.weibocrawler.rsidClient.UpdateQueue
;
/**
 * Facade that launches the crawlers and the update worker, and exposes the
 * collected data.
 *
 * @author byte-zbs
 * @Date 2017-12-07 17:35:53
 * @version 1.0.0
 */
public class GetData {

    /**
     * Starts the scheduled crawlers and then a {@link DataUpdate} worker on a
     * new thread.
     *
     * @param token token handed to the {@code DataUpdate} worker
     */
    public static void start(String token) {
        DataCrawlerStart crawlers = new DataCrawlerStart();
        crawlers.start();

        Runnable updater = new DataUpdate(token);
        Thread updaterThread = new Thread(updater);
        updaterThread.start();
    }

    /**
     * Fetches up to {@code count} collected entries by delegating to
     * {@code UpdateQueue.get(count)}.
     *
     * @param count maximum number of entries to take
     * @return the map produced by {@code UpdateQueue.get}
     */
    public static Map<String, Object> getWeiboData(int count) {
        Map<String, Object> batch = UpdateQueue.get(count);
        return batch;
    }
}
src/main/java/com/zhiwei/weibocrawler/crawler/getdata/WeiboBangdanData.java
0 → 100644
View file @
50be39af
package
com
.
zhiwei
.
weibocrawler
.
crawler
.
getdata
;
import
java.util.Date
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
/**
 * Crawler task for the Weibo ranking list (request parameter
 * {@code category=9999}).
 *
 * <p>Each run requests pages 0 through 39 one after another, pausing via
 * {@code ZhiWeiTools.sleep(12000)} after every page.
 *
 * @author byte-zbs
 * @Date 2017-12-07 17:36:26
 * @version 1.0.0
 */
public class WeiboBangdanData implements Runnable {

    private static Logger logger = LoggerFactory.getLogger(WeiboBangdanData.class);

    private static WeiboCrawlerAnalysis weiboCrawlerAnalysis = new WeiboCrawlerAnalysis();

    /**
     * Fetches every page of the list; a failure on one page is logged and the
     * loop moves on to the next page.
     */
    @Override
    public void run() {
        logger.info("微博榜单数据采集开始=========================");
        for (int i = 0; i <= 39; i++) {
            try {
                // __rnd carries the current time in milliseconds.
                String url = "https://weibo.com/a/aj/transform/loadingmoreunlogin?ajwvr=6&category=9999&page="
                        + i + "&lefnav=0&__rnd=" + System.currentTimeMillis();
                weiboCrawlerAnalysis.getWeiboHotMid(url);
                ZhiWeiTools.sleep(12000);
            } catch (Exception e) {
                // Pass the exception itself: a message without "{}" would drop e.getMessage().
                logger.error("出错====榜单的出错了", e);
                ZhiWeiTools.sleep(20);
            }
        }
        logger.info("微博榜单数据采集结束=========================");
    }
}
src/main/java/com/zhiwei/weibocrawler/crawler/getdata/WeiboCrawlerAnalysis.java
0 → 100644
View file @
50be39af
package
com
.
zhiwei
.
weibocrawler
.
crawler
.
getdata
;
import
java.util.ArrayList
;
import
java.util.List
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.weibobusiness.business.SearchBusiness
;
import
com.zhiwei.weibobusiness.weibo4j.model.Status
;
import
com.zhiwei.weibobusiness.weibo4j.model.StatusWapper
;
import
com.zhiwei.weibobusiness.weibo4j.model.WeiboException
;
import
com.zhiwei.weibocrawler.httpclient.HttpClientDemo
;
import
com.zhiwei.weibocrawler.rsidClient.RsidClientDAO
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
import
com.zhiwei.weibocrawler.rsidClient.DataQueue
;
/**
 * Fetches Weibo list pages, extracts the mids they contain and resolves mids
 * into {@code Status} objects.
 *
 * @author byte-zbs
 * @Date 2017-12-07 17:36:49
 * @version 1.0.0
 */
public class WeiboCrawlerAnalysis {

    private static Logger logger = LoggerFactory.getLogger(WeiboCrawlerAnalysis.class);

    /** Number of mids sent in one lookup call (same batch size the loop used before). */
    private static final int BATCH_SIZE = 49;

    /**
     * Resolves mids into statuses, sending them to
     * {@code SearchBusiness.showStatusBusniess} in comma-separated batches.
     *
     * <p>A batch that fails with a {@code WeiboException} is logged and
     * skipped; statuses from the other batches are still returned.
     *
     * @param midsList      mids to look up; may be {@code null} or empty
     * @param businessToken token used to construct the {@code SearchBusiness}
     * @return the statuses that could be fetched; never {@code null}, empty
     *         when there was nothing to look up or every batch failed
     */
    public static List<Status> getWeiboData(List<String> midsList, String businessToken) {
        List<Status> statuses = new ArrayList<Status>();
        if (midsList == null || midsList.isEmpty()) {
            return statuses;
        }
        SearchBusiness searchBusiness = new SearchBusiness(businessToken);
        StringBuilder batch = new StringBuilder();
        int inBatch = 0;
        for (String mid : midsList) {
            if (inBatch > 0) {
                batch.append(',');
            }
            batch.append(mid);
            inBatch++;
            if (inBatch >= BATCH_SIZE) {
                fetchBatch(searchBusiness, batch.toString(), statuses);
                // Always start the next batch clean, whether or not this one succeeded.
                batch.setLength(0);
                inBatch = 0;
            }
        }
        // Only send a trailing batch when it actually contains mids.
        if (inBatch > 0) {
            fetchBatch(searchBusiness, batch.toString(), statuses);
        }
        return statuses;
    }

    /** Looks up one comma-separated batch of mids and appends the resulting statuses to {@code into}. */
    private static void fetchBatch(SearchBusiness searchBusiness, String mids, List<Status> into) {
        try {
            logger.debug(mids);
            StatusWapper statusWapper = searchBusiness.showStatusBusniess(mids);
            if (statusWapper != null && statusWapper.getStatuses() != null) {
                into.addAll(statusWapper.getStatuses());
            }
        } catch (WeiboException e) {
            logger.error("数据更新出错部分mids=========" + mids, e);
        }
    }

    /**
     * Downloads one list page and queues the mids found in it. Any exception
     * is logged and not propagated.
     *
     * @param url page URL to request
     */
    public void getWeiboHotMid(String url) {
        try {
            String result = HttpClientDemo.executeHttpRequestGet(url);
            getWeiboData(result);
        } catch (Exception e) {
            logger.error("数据解析出错", e);
        }
    }

    /**
     * Parses a JSON response whose {@code data} field holds an HTML fragment,
     * and offers each feed item's mid to {@code DataQueue} when
     * {@code RsidClientDAO.isWeiboExit} returns {@code true} for it.
     *
     * @param result raw response body; may be {@code null}
     */
    private void getWeiboData(String result) {
        try {
            JSONObject json = (JSONObject) JSONObject.parse(result);
            String s = (json == null) ? null : json.getString("data");
            if (s == null) {
                logger.warn("数据解析出错: response is empty or has no \"data\" field");
                return;
            }
            Document document = Jsoup.parse(s);
            Elements elements = document.select("div.UG_contents").select("ul.clearfix")
                    .select("div[action-type=feed_list_item]");
            for (Element element : elements) {
                try {
                    String mid = element.attr("mid");
                    if (mid.length() == 0) {
                        // attr() yields "" when the attribute is absent; nothing to queue.
                        continue;
                    }
                    if (mid.length() > 16) {
                        // Keep only the last 16 characters of longer values.
                        mid = mid.substring(mid.length() - 16);
                    }
                    // NOTE(review): presumably true means "not seen before" — confirm against RsidClient.addFilterUrl.
                    if (RsidClientDAO.isWeiboExit(mid)) {
                        DataQueue.offer(mid);
                    }
                } catch (Exception e) {
                    logger.error("数据解析出错", e);
                    ZhiWeiTools.sleep(200);
                }
            }
        } catch (Exception e) {
            logger.error("数据解析出错", e);
            ZhiWeiTools.sleep(200);
        }
    }
}
src/main/java/com/zhiwei/weibocrawler/crawler/getdata/WeiboHotData.java
0 → 100644
View file @
50be39af
package
com
.
zhiwei
.
weibocrawler
.
crawler
.
getdata
;
import
java.util.Date
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
/**
 * Crawler task for the Weibo "hot" list (request parameter
 * {@code category=0}).
 *
 * <p>Each run requests pages 0 through 9 one after another, pausing via
 * {@code ZhiWeiTools.sleep(11000)} after every page.
 *
 * @author byte-zbs
 * @Date 2017-12-07 17:37:02
 * @version 1.0.0
 */
public class WeiboHotData implements Runnable {

    private static Logger logger = LoggerFactory.getLogger(WeiboHotData.class);

    private static WeiboCrawlerAnalysis weiboCrawlerAnalysis = new WeiboCrawlerAnalysis();

    /**
     * Fetches every page of the list; a failure on one page is logged and the
     * loop moves on to the next page.
     */
    @Override
    public void run() {
        logger.info("微博热门数据采集开始======================");
        for (int i = 0; i <= 9; i++) {
            try {
                // __rnd carries the current time in milliseconds.
                String url = "https://weibo.com/a/aj/transform/loadingmoreunlogin?ajwvr=6&category=0&page="
                        + i + "&lefnav=0&__rnd=" + System.currentTimeMillis();
                weiboCrawlerAnalysis.getWeiboHotMid(url);
                ZhiWeiTools.sleep(11000);
            } catch (Exception e) {
                // Pass the exception itself: a message without "{}" would drop e.getMessage().
                logger.error("======= 微博热门的出错了", e);
                ZhiWeiTools.sleep(20);
            }
        }
        logger.info("微博热门数据采集完成======================");
    }
}
src/main/java/com/zhiwei/weibocrawler/crawler/getdata/WeiboSocietyData.java
0 → 100644
View file @
50be39af
package
com
.
zhiwei
.
weibocrawler
.
crawler
.
getdata
;
import
java.util.Date
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
/**
 * Crawler task for the Weibo "society" list (request parameter
 * {@code category=7}).
 *
 * <p>Each run requests pages 0 through 9 one after another, pausing via
 * {@code ZhiWeiTools.sleep(11000)} after every page.
 *
 * @author byte-zbs
 * @Date 2017-12-07 17:37:16
 * @version 1.0.0
 */
public class WeiboSocietyData implements Runnable {

    private static Logger logger = LoggerFactory.getLogger(WeiboSocietyData.class);

    private static WeiboCrawlerAnalysis weiboCrawlerAnalysis = new WeiboCrawlerAnalysis();

    /**
     * Fetches every page of the list; a failure on one page is logged and the
     * loop moves on to the next page.
     */
    @Override
    public void run() {
        logger.info("微博社会数据采集开始======================");
        for (int i = 0; i <= 9; i++) {
            try {
                // __rnd carries the current time in milliseconds.
                String url = "https://weibo.com/a/aj/transform/loadingmoreunlogin?ajwvr=6&category=7&page="
                        + i + "&lefnav=0&__rnd=" + System.currentTimeMillis();
                weiboCrawlerAnalysis.getWeiboHotMid(url);
                ZhiWeiTools.sleep(11000);
            } catch (Exception e) {
                // Pass the exception itself: a message without "{}" would drop e.getMessage().
                logger.error("====== 微博社会的出错了", e);
                ZhiWeiTools.sleep(20);
            }
        }
        logger.info("微博社会数据采集完成======================");
    }
}
src/main/java/com/zhiwei/weibocrawler/email/SendEmail.java
0 → 100644
View file @
50be39af
package
com
.
zhiwei
.
weibocrawler
.
email
;
import
java.io.UnsupportedEncodingException
;
import
java.util.Date
;
import
java.util.Properties
;
import
javax.mail.MessagingException
;
import
javax.mail.NoSuchProviderException
;
import
javax.mail.Session
;
import
javax.mail.Transport
;
import
javax.mail.internet.InternetAddress
;
import
javax.mail.internet.MimeMessage
;
/**
 * Sends an alert e-mail through the {@code smtp.163.com} SMTP server.
 */
public class SendEmail {

    // NOTE(review): the account, its password and the recipient are hard-coded
    // in source control. They are kept here only so behaviour is unchanged;
    // move them to external configuration and rotate the password.
    private static final String SENDER = "18271694195@163.com";
    private static final String SENDER_PASSWORD = "zhejiang289";
    private static final String RECIPIENT = "497332654@qq.com";

    /** Manual smoke test: sends one message with placeholder content. */
    public static void main(String[] args) {
        sendMessage("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ");
    }

    /**
     * Sends {@code content} as the HTML body of an alert e-mail. Failures are
     * printed to standard error and not propagated.
     *
     * @param content body of the message
     */
    public static void sendMessage(String content) {
        Properties props = new Properties();
        props.setProperty("mail.transport.protocol", "smtp");
        props.setProperty("mail.smtp.host", "smtp.163.com");
        props.setProperty("mail.smtp.auth", "true");
        // NOTE(review): no SSL/TLS properties are set here (an SSL-on-465 setup
        // was previously commented out) — confirm whether the credentials
        // should travel over an encrypted connection.

        Session session = Session.getInstance(props);
        session.setDebug(true);

        Transport transport = null;
        try {
            MimeMessage message = createMimeMessage(session, SENDER, RECIPIENT, content);
            transport = session.getTransport();
            transport.connect(SENDER, SENDER_PASSWORD);
            transport.sendMessage(message, message.getAllRecipients());
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        } catch (NoSuchProviderException e) {
            e.printStackTrace();
        } catch (MessagingException e) {
            e.printStackTrace();
        } finally {
            // Release the connection even when connect() or sendMessage() fails.
            if (transport != null) {
                try {
                    transport.close();
                } catch (MessagingException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Builds the alert message. Errors are propagated so that a partially
     * initialised message is never sent.
     *
     * @param session     mail session the message belongs to
     * @param sendMail    sender address
     * @param receiveMail recipient address
     * @param content     HTML body
     * @return the fully populated message
     * @throws UnsupportedEncodingException if the sender's personal name cannot be encoded
     * @throws MessagingException           if any message field cannot be set
     */
    private static MimeMessage createMimeMessage(Session session, String sendMail, String receiveMail,
            String content) throws UnsupportedEncodingException, MessagingException {
        MimeMessage message = new MimeMessage(session);
        message.setFrom(new InternetAddress(sendMail, "zhiwei", "UTF-8"));
        message.setRecipient(MimeMessage.RecipientType.TO, new InternetAddress(receiveMail));
        message.setSubject("微博热门数据采集出错了");
        message.setContent(content, "text/html;charset=UTF-8");
        message.setSentDate(new Date());
        message.saveChanges();
        return message;
    }
}
src/main/java/com/zhiwei/weibocrawler/httpclient/HttpClientDemo.java
0 → 100644
View file @
50be39af
package
com
.
zhiwei
.
weibocrawler
.
httpclient
;
import
java.io.IOException
;
import
java.util.HashMap
;
import
java.util.Map
;
import
java.util.Map.Entry
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.client.methods.HttpGet
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.HttpClients
;
import
org.apache.http.util.EntityUtils
;
/**
 * Minimal HTTP GET helper that sends browser-like headers to weibo.com.
 */
public class HttpClientDemo {

    /** Number of attempts made before giving up. */
    private static final int MAX_ATTEMPTS = 3;

    /** Connect and socket timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 8000;

    /**
     * Performs a GET request, retrying up to three times.
     *
     * <p>A failed attempt is printed to standard error and retried; the body
     * of the first successful attempt is returned.
     *
     * @param url address to request
     * @return the response body, or {@code null} if every attempt failed
     * @throws IOException declared for compatibility with existing callers;
     *         failures inside an attempt are handled by retrying
     */
    public static String executeHttpRequestGet(String url) throws IOException {
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            CloseableHttpClient httpClient = null;
            try {
                HttpGet httpGet = new HttpGet(url);
                RequestConfig requestConfig = RequestConfig.custom()
                        .setSocketTimeout(TIMEOUT_MILLIS)
                        .setConnectTimeout(TIMEOUT_MILLIS)
                        .build();
                httpClient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build();
                applyBrowserHeaders(httpGet);
                // Use the charset declared by the response; fall back to UTF-8
                // instead of the library default of ISO-8859-1.
                return EntityUtils.toString(httpClient.execute(httpGet).getEntity(), "UTF-8");
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                if (httpClient != null) {
                    try {
                        httpClient.close();
                    } catch (IOException e) {
                        // A close failure must not discard a body that was already read.
                        e.printStackTrace();
                    }
                }
            }
        }
        return null;
    }

    /** Sets the fixed set of browser-like request headers on {@code httpGet}. */
    private static void applyBrowserHeaders(HttpGet httpGet) {
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
        httpGet.setHeader("Accept", "*/*");
        // "br" is deliberately not advertised: this client only decodes gzip
        // and deflate, so a brotli response would be read as raw bytes.
        httpGet.setHeader("Accept-Encoding", "gzip, deflate");
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9");
        httpGet.setHeader("Connection", "keep-alive");
        httpGet.setHeader("Content-Type", "application/x-www-form-urlencoded");
        httpGet.setHeader("Host", "weibo.com");
    }
}
src/main/java/com/zhiwei/weibocrawler/queue/ListQueue.java
0 → 100644
View file @
50be39af
package
com
.
zhiwei
.
weibocrawler
.
queue
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.concurrent.BlockingQueue
;
import
java.util.concurrent.LinkedBlockingQueue
;
/**
 * Thin wrapper around a {@link LinkedBlockingQueue} that adds batch
 * operations.
 *
 * @author byte-zbs
 * @Date 2017-12-07 17:42:34
 * @version 1.0.0
 * @param <E> element type
 */
public class ListQueue<E> {

    private final BlockingQueue<E> linkQueue = new LinkedBlockingQueue<E>();

    /**
     * @return {@code true} if the queue holds no elements
     */
    public boolean empty() {
        return linkQueue.isEmpty();
    }

    /**
     * Returns the head of the queue without removing it.
     *
     * @return the head element
     * @throws java.util.NoSuchElementException if the queue is empty
     */
    public E element() {
        return linkQueue.element();
    }

    /**
     * Returns up to {@code count} elements from the head of the queue, in
     * queue order, without removing them.
     *
     * @param count maximum number of elements to return
     * @return the first elements of the queue; empty if the queue is empty or
     *         {@code count} is not positive
     */
    public List<E> element(int count) {
        List<E> list = new ArrayList<E>();
        if (count <= 0) {
            return list;
        }
        // Iterate rather than call element() repeatedly, which would return
        // the same head every time.
        for (E e : linkQueue) {
            list.add(e);
            if (list.size() >= count) {
                break;
            }
        }
        return list;
    }

    /**
     * Removes and returns the head of the queue.
     *
     * @return the head element, or {@code null} if the queue is empty
     */
    public E poll() {
        return linkQueue.poll();
    }

    /**
     * Removes and returns up to {@code count} elements; if fewer are
     * available, all of them are returned.
     *
     * @param count maximum number of elements to remove
     * @return the removed elements in queue order
     */
    public List<E> poll(int count) {
        List<E> list = new ArrayList<E>();
        for (int i = 0; i < count; i++) {
            E e = linkQueue.poll();
            if (e == null) {
                break;
            }
            list.add(e);
        }
        return list;
    }

    /**
     * Removes and returns the elements that were in the queue when the call
     * started.
     *
     * @return the removed elements in queue order
     */
    public List<E> pollAll() {
        return poll(this.size());
    }

    /**
     * Appends one element.
     *
     * @param e element to add
     * @return {@code true} if the element was added
     */
    public boolean offer(E e) {
        return linkQueue.offer(e);
    }

    /**
     * Appends every element of {@code lists}, stopping at the first one that
     * cannot be added.
     *
     * @param lists elements to add
     * @return {@code true} if all elements were added
     */
    public boolean offer(List<E> lists) {
        for (E e : lists) {
            if (!linkQueue.offer(e)) {
                return false;
            }
        }
        return true;
    }

    /**
     * @return the number of elements currently in the queue
     */
    public int size() {
        return linkQueue.size();
    }
}
src/main/java/com/zhiwei/weibocrawler/rsidClient/DataQueue.java
0 → 100644
View file @
50be39af
package
com
.
zhiwei
.
weibocrawler
.
rsidClient
;
import
java.util.ArrayList
;
import
java.util.List
;
import
com.zhiwei.weibocrawler.queue.ListQueue
;
import
com.zhiwei.weibocrawler.rsidClient.RsidClientDAO
;
/**
 * Static queue of mids waiting to be resolved into statuses.
 *
 * @author byte-zbs
 * @Date 2017-12-07 13:56:55
 * @version 1.0.0
 */
public class DataQueue {

    /** Queue of mids that have already passed the duplicate filter. */
    public static ListQueue<String> linkQueue = new ListQueue<String>();

    /**
     * Prints the current queue size and appends {@code mid} without any
     * further filtering.
     *
     * @param mid mid to queue
     */
    public static void offer(String mid) {
        System.out.println("更新队列中的数据大小====" + linkQueue.size());
        linkQueue.offer(mid);
    }

    /**
     * Appends each mid for which {@code RsidClientDAO.isWeiboExit} returns
     * {@code true}.
     *
     * @param mids mids to filter and queue; {@code null} or empty is ignored
     */
    public static void add(List<String> mids) {
        if (mids == null || mids.isEmpty()) {
            return;
        }
        for (String mid : mids) {
            if (RsidClientDAO.isWeiboExit(mid)) {
                linkQueue.offer(mid);
            }
        }
    }

    /**
     * Removes and returns up to {@code count} mids.
     *
     * @param count maximum number of mids to take
     * @return the removed mids, or {@code null} if the queue was empty
     */
    public static List<String> get(int count) {
        if (linkQueue == null || linkQueue.size() <= 0) {
            return null;
        }
        // poll(count) stops at the first empty poll, so the result never
        // contains null even if the queue drains while we are reading.
        return linkQueue.poll(count);
    }
}
src/main/java/com/zhiwei/weibocrawler/rsidClient/RsidClientDAO.java
0 → 100644
View file @
50be39af
package
com
.
zhiwei
.
weibocrawler
.
rsidClient
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.rsid.core.RsidClient
;
import
com.zhiwei.weibocrawler.config.Config
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
/**
 * Duplicate check for mids, backed by an {@code RsidClient} built from the
 * values in {@link Config}.
 *
 * @author byte-zbs
 * @Date 2017-12-06 17:21:15
 * @version 1.0.0
 */
public class RsidClientDAO {

    // Connection settings (URL, group, filter key) come from Config.
    private static RsidClient client = RsidClient.build(Config.rsidUrl, Config.rsidGroup);

    private static Logger logger = LoggerFactory.getLogger(RsidClientDAO.class);

    /**
     * Submits {@code mid} to the filter and returns the filter's answer.
     *
     * <p>The call is tried up to three times, pausing via
     * {@code ZhiWeiTools.sleep(200)} after each failure, so that a transient
     * timeout does not defeat the check.
     *
     * @param mid mid to check
     * @return the value returned by {@code client.addFilterUrl}, or
     *         {@code false} if all three attempts threw
     */
    public static boolean isWeiboExit(String mid) {
        for (int i = 0; i < 3; i++) {
            try {
                return client.addFilterUrl(mid, false, Config.redisWeiboKey);
            } catch (Exception e) {
                // Log the exception as thrown; fillInStackTrace() would
                // overwrite the trace that shows where the failure happened.
                logger.error("判断此条微博消息是否存在出现问题", e);
                ZhiWeiTools.sleep(200);
            }
        }
        return false;
    }
}
src/main/java/com/zhiwei/weibocrawler/rsidClient/UpdateQueue.java
0 → 100644
View file @
50be39af
package
com
.
zhiwei
.
weibocrawler
.
rsidClient
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.weibobusiness.weibo4j.model.Status
;
import
com.zhiwei.weibocrawler.queue.ListQueue
;
/**
 * Static output queue holding the statuses that are ready to be handed to
 * callers.
 *
 * @author byte-zbs
 * @Date 2017-12-07 13:56:55
 * @version 1.0.0
 */
public class UpdateQueue {

    /** Final queue of collected statuses. */
    public static ListQueue<Status> linkQueue = new ListQueue<Status>();

    /**
     * Prints the current queue size and appends one status.
     *
     * @param status status to queue
     */
    public static void add(Status status) {
        System.out.println("更新后队列中的数据大小====" + linkQueue.size());
        linkQueue.offer(status);
    }

    /**
     * Appends every status in {@code lists}.
     *
     * @param lists statuses to queue; {@code null} is ignored, as are
     *              {@code null} entries
     */
    public static void add(List<Status> lists) {
        if (lists == null) {
            return;
        }
        for (Status status : lists) {
            if (status != null) {
                linkQueue.offer(status);
            }
        }
    }

    /**
     * Removes up to {@code count} statuses and reports them in a map.
     *
     * <p>The map has three keys: {@code "messages"} (a status text),
     * {@code "data"} (the removed statuses, or {@code null} when the queue
     * was empty) and {@code "remain_count"} (the number of statuses left).
     *
     * @param count maximum number of statuses to take
     * @return the result map; never {@code null}
     */
    public static Map<String, Object> get(int count) {
        Map<String, Object> result = new HashMap<String, Object>();
        List<Status> dataList = null;
        if (linkQueue != null && linkQueue.size() > 0) {
            // poll(count) stops at the first empty poll, so the list never
            // contains null even if the queue drains while we are reading.
            dataList = linkQueue.poll(count);
            result.put("messages", "数据获取完成");
            result.put("data", dataList);
            result.put("remain_count", linkQueue.size());
        } else {
            result.put("messages", "暂无数据");
            result.put("data", dataList);
            result.put("remain_count", 0);
        }
        return result;
    }
}
src/test/java/weibotest/HotWeiboTest.java
0 → 100644
View file @
50be39af
package
weibotest
;
import
java.util.Map
;
import
com.zhiwei.weibocrawler.crawler.GetData
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
/**
 * Manual harness: starts the crawlers and prints a batch of collected data
 * every ten seconds until the process is stopped.
 */
public class HotWeiboTest {

    /**
     * @param args optional; {@code args[0]} is used as the access token when
     *             present
     */
    public static void main(String[] args) {
        // NOTE(review): an access token is committed in source. It remains as
        // a fallback so existing runs behave the same; pass the token as the
        // first argument instead and revoke this one.
        String token = (args != null && args.length > 0)
                ? args[0]
                : "2.00HUuC3C3_jZ8E0c00a67ab8xbOHqB";

        // Start crawling.
        GetData.start(token);

        // Poll for collected data.
        while (true) {
            Map<String, Object> data = GetData.getWeiboData(50);
            System.out.println(data);
            ZhiWeiTools.sleep(10 * 1000);
        }
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment