Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
W
weiboDomain
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
xuyimeng
weiboDomain
Commits
158abbbc
Commit
158abbbc
authored
Feb 28, 2018
by
win7
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
sss
parents
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
1198 additions
and
0 deletions
+1198
-0
pom.xml
+65
-0
src/main/java/com/zhiweidata/weiboDomain/crawler/HttpclientInstance.java
+96
-0
src/main/java/com/zhiweidata/weiboDomain/crawler/JsoupHtml.java
+188
-0
src/main/java/com/zhiweidata/weiboDomain/crawler/WeiboDomainCrawler.java
+238
-0
src/main/java/com/zhiweidata/weiboDomain/dao/DomainDao.java
+37
-0
src/main/java/com/zhiweidata/weiboDomain/dao/TagDao.java
+30
-0
src/main/java/com/zhiweidata/weiboDomain/dao/impl/DomainDaoImpl.java
+91
-0
src/main/java/com/zhiweidata/weiboDomain/dao/impl/TagDaoImpl.java
+57
-0
src/main/java/com/zhiweidata/weiboDomain/entity/DomainTag.java
+33
-0
src/main/java/com/zhiweidata/weiboDomain/entity/WeiboDomain.java
+44
-0
src/main/java/com/zhiweidata/weiboDomain/excel/DBOExp.java
+129
-0
src/main/java/com/zhiweidata/weiboDomain/excel/SimpeExcelReport.java
+0
-0
src/main/java/com/zhiweidata/weiboDomain/service/MongoSerivce.java
+155
-0
src/main/java/com/zhiweidata/weiboDomain/start/Start.java
+35
-0
No files found.
pom.xml
0 → 100644
View file @
158abbbc
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.zhiweidata.weiboDomain</groupId>
  <artifactId>weiboDomain</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <!--
    The sources use lambdas, java.util.function and java.time, so Java 8 is the
    minimum language level; older maven-compiler-plugin defaults are lower and
    would reject the code. The sources also contain non-ASCII comments and log
    messages, so the encoding is pinned instead of depending on the platform default.
  -->
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
    </dependency>
    <dependency>
      <groupId>org.springframework.data</groupId>
      <artifactId>spring-data-mongodb</artifactId>
      <version>1.10.10.RELEASE</version>
    </dependency>
    <dependency>
      <groupId>org.springframework</groupId>
      <artifactId>spring-test</artifactId>
      <version>4.3.14.RELEASE</version>
    </dependency>
    <dependency>
      <groupId>org.mongodb</groupId>
      <artifactId>mongo-java-driver</artifactId>
      <version>3.2.2</version>
    </dependency>
    <dependency>
      <groupId>org.projectlombok</groupId>
      <artifactId>lombok</artifactId>
      <version>1.16.20</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.10.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.5</version>
    </dependency>
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.5</version>
    </dependency>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-lang3</artifactId>
      <version>3.7</version>
    </dependency>
    <dependency>
      <groupId>net.sf.json-lib</groupId>
      <artifactId>json-lib</artifactId>
      <version>2.4</version>
      <classifier>jdk15</classifier>
    </dependency>
    <!-- In-house artifact; NOTE(review): presumably resolved from a private repository - confirm. -->
    <dependency>
      <groupId>com.zhiwei</groupId>
      <artifactId>jxlzw</artifactId>
      <version>0.0.2-SNAPSHOT</version>
    </dependency>
  </dependencies>
</project>
\ No newline at end of file
src/main/java/com/zhiweidata/weiboDomain/crawler/HttpclientInstance.java
0 → 100644
View file @
158abbbc
/**
* @Title: HttpclientInstance.java
* @Package com.zhiweidata.weiboDomain.crawler
* @Description: Factory for pooled Apache HttpClient instances with a randomly chosen User-Agent.
* @author xuyimeng
* @date 2018年2月23日 下午1:54:32
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
crawler
;
import
java.util.List
;
import
javax.net.ssl.SSLContext
;
import
org.apache.http.HttpHost
;
import
org.apache.http.client.CookieStore
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.config.Registry
;
import
org.apache.http.config.RegistryBuilder
;
import
org.apache.http.conn.socket.ConnectionSocketFactory
;
import
org.apache.http.conn.socket.PlainConnectionSocketFactory
;
import
org.apache.http.conn.ssl.SSLConnectionSocketFactory
;
import
org.apache.http.cookie.Cookie
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.HttpClientBuilder
;
import
org.apache.http.impl.client.HttpClients
;
import
org.apache.http.impl.conn.PoolingHttpClientConnectionManager
;
import
org.apache.http.ssl.SSLContexts
;
/**
 * Factory for {@link CloseableHttpClient} instances.
 *
 * <p>Every client built here owns its own pooling connection manager, sends a
 * User-Agent picked at random from a fixed list, and uses the cookie store
 * supplied by the caller.
 *
 * @author xuyimeng
 */
public class HttpclientInstance {

    // Candidate User-Agent values; randomUserAgent() returns one of them.
    private static final String[] userAgent = {
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        "NokiaX2-02/2.0 (11.79) Profile/MIDP-2.1 Configuration/CLDC-1.1 Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2;.NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2) UCBrowser8.4.0.159/70/352",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    };

    /**
     * Builds a pooled client without a proxy.
     *
     * @param cookieStore default cookie store for the client; passed through unchanged
     * @return a new client
     */
    public static CloseableHttpClient generateClient(CookieStore cookieStore) {
        return generateClient(null, cookieStore);
    }

    /**
     * Builds a pooled client whose default request configuration routes through the given proxy.
     *
     * @param httpHost    proxy host placed in the default request config; may be {@code null}
     * @param cookieStore default cookie store for the client; passed through unchanged
     * @return a new client
     */
    public static CloseableHttpClient generateClient(HttpHost httpHost, CookieStore cookieStore) {
        // Plain sockets for http, the system-default SSL context for https.
        SSLConnectionSocketFactory httpsSockets =
                new SSLConnectionSocketFactory(SSLContexts.createSystemDefault());
        Registry<ConnectionSocketFactory> schemes = RegistryBuilder.<ConnectionSocketFactory>create()
                .register("http", PlainConnectionSocketFactory.INSTANCE)
                .register("https", httpsSockets)
                .build();

        // Connection pool: at most 200 connections overall and 20 per route.
        PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager(schemes);
        pool.setMaxTotal(200);
        pool.setDefaultMaxPerRoute(20);

        RequestConfig defaults = RequestConfig.custom().setProxy(httpHost).build();

        return HttpClients.custom()
                .setUserAgent(randomUserAgent())
                .setConnectionManager(pool)
                .setDefaultRequestConfig(defaults)
                .setDefaultCookieStore(cookieStore)
                .build();
    }

    /**
     * Picks one entry of the User-Agent list at random.
     *
     * @return a User-Agent string
     */
    public static String randomUserAgent() {
        int pick = (int) (Math.random() * userAgent.length);
        return userAgent[pick];
    }
}
src/main/java/com/zhiweidata/weiboDomain/crawler/JsoupHtml.java
0 → 100644
View file @
158abbbc
/**
* @Title: JsoupHtml.java
* @Package com.zhiweidata.weiboDomain.crawler
* @Description: Parses Weibo pagelet responses into WeiboDomain records and page counts.
* @author xuyimeng
* @date 2018年2月23日 下午3:16:06
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
crawler
;
import
java.time.LocalDateTime
;
import
java.time.format.DateTimeFormatter
;
import
java.util.ArrayList
;
import
java.util.List
;
import
javax.print.Doc
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
com.zhiweidata.weiboDomain.entity.WeiboDomain
;
import
net.sf.json.JSONObject
;
/**
 * Parses Weibo "domain" pages.
 *
 * <p>The page body is not plain HTML: it is delivered as a JavaScript call of the form
 * {@code FM.view({... "html": "..."})}. The parser first extracts the JSON argument,
 * reads its {@code html} member and then parses that fragment with jsoup.
 *
 * @author xuyimeng
 */
public class JsoupHtml {

    /** Layout of the timestamp written to each parsed record. */
    private static final DateTimeFormatter UPDATE_TIME_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    private static JsoupHtml jsoupHtml = new JsoupHtml();

    private JsoupHtml() {}

    /**
     * @return the single shared parser instance
     */
    public static JsoupHtml getInstance() {
        return jsoupHtml;
    }

    /**
     * Converts one user-list pagelet response into records.
     *
     * <p>Only the first {@code <script>} element of {@code html} is read. Every
     * {@code <dd class="mod_info S_line1">} inside its embedded HTML becomes one record.
     *
     * @param html   raw pagelet response
     * @param domain domain name stored on every record and used as the id prefix
     * @return the parsed records; empty when the response contains no script element
     */
    public List<WeiboDomain> parseData(String html, String domain) {
        List<WeiboDomain> result = new ArrayList<>();
        Elements scripts = Jsoup.parse(html).getElementsByTag("script");
        if (scripts.isEmpty()) {
            return result;
        }
        // data() is safe for script tags without inline content, unlike childNode(0).
        Document doc = Jsoup.parse(getHtml(scripts.get(0).data()));
        for (Element card : doc.getElementsByTag("dd")) {
            if (card.attr("class").equals("mod_info S_line1")) {
                result.add(parseUserCard(card, domain));
            }
        }
        return result;
    }

    /** Builds one record from a user card by dispatching on the class of each nested div. */
    private WeiboDomain parseUserCard(Element card, String domain) {
        WeiboDomain user = new WeiboDomain();
        String uid = "";
        for (Element div : card.getElementsByTag("div")) {
            switch (div.attr("class")) {
                case "info_name W_fb W_f14":
                    uid = fillIdentity(user, div);
                    break;
                case "info_connect":
                    Elements counts = div.getElementsByTag("em");
                    user.setFriends_count(Integer.parseInt(counts.get(0).text()));
                    // Stored as text: the setter takes a String, so the value is not parsed.
                    user.setFollowers_count(counts.get(1).text());
                    user.setStatuses_count(Integer.parseInt(counts.get(2).text()));
                    break;
                case "info_add":
                    user.setLocation(div.getElementsByTag("span").get(0).text());
                    break;
                case "info_intro":
                    user.setDescription(div.getElementsByTag("span").get(0).text());
                    break;
                case "info_relation":
                    // Text after the first ':'; a card without it no longer aborts the parse.
                    String[] parts = div.text().split(":");
                    if (parts.length > 1) {
                        user.setTag(parts[1]);
                    }
                    break;
                default:
                    break;
            }
        }
        user.setDomain(domain);
        user.setUpdateTime(LocalDateTime.now().format(UPDATE_TIME_FORMAT));
        user.setId(domain + "_" + uid);
        return user;
    }

    /** Fills uid, url, name, VIP flag and gender from the name div; returns the uid. */
    private String fillIdentity(WeiboDomain user, Element nameDiv) {
        Element nameLink = nameDiv.getElementsByClass("S_txt1").get(0);
        // The usercard attribute starts with "id=<uid>", optionally followed by "&...".
        String uid = nameLink.attr("usercard").split("&")[0].replace("id=", "");
        user.setUid(uid);
        user.setUrl(nameLink.attr("href"));
        user.setName(nameLink.attr("title"));

        boolean sawIcon = false;
        boolean male = false;
        for (Element icon : nameDiv.getElementsByTag("i")) {
            sawIcon = true;
            String iconClass = icon.attr("class");
            if (iconClass.equals("W_icon icon_member")) {
                user.setVip(true);
            }
            if (iconClass.equals("W_icon icon_male")) {
                male = true;
            }
        }
        // Decide once all icons are seen. The previous per-icon if/else let any later
        // non-male icon (for example the member icon) overwrite "m" with "f".
        // As before, gender stays unset when the div has no icons at all.
        if (sawIcon) {
            user.setGender(male ? "m" : "f");
        }
        return uid;
    }

    /**
     * Extracts the {@code html} member from a {@code FM.view({...})} script body.
     *
     * @param str script text
     * @return the embedded HTML fragment
     */
    private String getHtml(String str) {
        JSONObject json = JSONObject.fromObject(extractJsonPayload(str));
        return json.getString("html");
    }

    /**
     * Returns the text from the first '{' to the last '}' of the script.
     *
     * <p>The earlier approach removed every ')' from the script, which also deleted
     * closing parentheses inside the scraped HTML. When no braces are found the input
     * is returned unchanged so the JSON parser reports the problem.
     */
    private static String extractJsonPayload(String script) {
        int start = script.indexOf('{');
        int end = script.lastIndexOf('}');
        if (start < 0 || end < start) {
            return script;
        }
        return script.substring(start, end + 1);
    }

    /**
     * Reads the highest page number from the pager of a domain landing page.
     *
     * <p>The pager is taken from the last script whose text contains
     * {@code content.signInPeople.index}; page links are {@code <a>} elements with
     * {@code bpfilter="page"} and {@code class="page S_txt1"}.
     *
     * @param page raw HTML of the landing page
     * @return the largest page number found, or 0 when the page cannot be parsed
     */
    public int parsePage(String page) {
        try {
            String payload = "";
            for (Element script : Jsoup.parse(page).getElementsByTag("script")) {
                // data() returns "" for scripts without inline content (e.g. src-only tags);
                // childNode(0) threw there, which made this method return 0 for the whole page.
                String data = script.data();
                if (data.contains("content.signInPeople.index")) {
                    payload = data;
                }
            }
            Document doc = Jsoup.parse(getHtml(payload));
            int num = 0;
            for (Element link : doc.getElementsByTag("a")) {
                if ("page".equals(link.attr("bpfilter")) && "page S_txt1".equals(link.attr("class"))) {
                    num = Math.max(num, Integer.parseInt(link.text()));
                }
            }
            return num;
        } catch (Exception e) {
            // Best effort by design: callers treat 0 as "not available yet" and retry.
            return 0;
        }
    }
}
src/main/java/com/zhiweidata/weiboDomain/crawler/WeiboDomainCrawler.java
0 → 100644
View file @
158abbbc
/**
* @Title: WeiboDomainCrawler.java
* @Package com.zhiweidata.weiboDomain.crawler
* @Description: HTTP crawler that downloads Weibo domain pages with retry and charset detection.
* @author xuyimeng
* @date 2018年2月23日 下午1:59:46
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
crawler
;
import
java.io.ByteArrayInputStream
;
import
java.io.ByteArrayOutputStream
;
import
java.io.IOException
;
import
java.io.InputStream
;
import
java.nio.charset.Charset
;
import
java.util.function.Predicate
;
import
org.apache.commons.io.IOUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.http.Header
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.HttpHeaders
;
import
org.apache.http.HttpHost
;
import
org.apache.http.HttpStatus
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.client.entity.GzipDecompressingEntity
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpGet
;
import
org.apache.http.entity.ContentType
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
/**
 * Downloads pages from d.weibo.com.
 *
 * <p>All instances share one pooled HTTP client. A download is retried until the
 * body passes a content check, the server answers 200 with an unusable body, or
 * the attempt limit is reached.
 *
 * @author xuyimeng
 */
public class WeiboDomainCrawler {

    /** Upper bound on attempts per request, so a dead network or expired cookie cannot loop forever. */
    private static final int MAX_ATTEMPTS = 10;

    /** Pause between attempts, in milliseconds. */
    private static final long RETRY_DELAY_MILLIS = 3000L;

    private static final CloseableHttpClient client = HttpclientInstance.generateClient(null);

    // Proxy for outgoing requests. Nothing in this class assigns it, so it is always null here.
    private HttpHost httpHost;

    /**
     * Downloads the given URL.
     *
     * @param url    absolute URL to fetch
     * @param cookie value sent as the Cookie header
     * @return the response body, or {@code null} when no usable body was obtained
     */
    public String getHtml(String url, String cookie) {
        return get(createHttpGet(url, cookie));
    }

    /**
     * Downloads the landing page {@code https://d.weibo.com/<domainId>}.
     *
     * @param domainId path segment identifying the domain
     * @param cookie   value sent as the Cookie header
     * @return the response body, or {@code null} when no usable body was obtained
     */
    public String getPage(String domainId, String cookie) {
        return get(createHttpGet("https://d.weibo.com/" + domainId, cookie));
    }

    /**
     * Executes the request with the default content check: a body is rejected when it is
     * null or empty, contains {@code empty_con clearfix}, or lacks {@code follow_item S_line2}.
     */
    private String get(HttpGet httpGet) {
        Predicate<String> rejected = s -> (s == null || "".equals(s))
                || s.contains("empty_con clearfix")
                || !s.contains("follow_item S_line2");
        return get(httpGet, rejected);
    }

    /**
     * Executes the request, retrying on failures.
     *
     * @param httpGet   request to execute
     * @param predicate returns {@code true} for bodies that must be rejected
     * @return the first accepted body; {@code null} when the server answered 200 with a
     *         rejected body, when all attempts failed, or when the thread was interrupted
     */
    private String get(HttpGet httpGet, Predicate<String> predicate) {
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            if (attempt > 1) {
                // Throttle every retry. Non-200 answers used to be retried with no pause.
                sleep(RETRY_DELAY_MILLIS);
                if (Thread.currentThread().isInterrupted()) {
                    return null;
                }
            }
            // try-with-resources returns the connection to the pool even when reading fails.
            try (CloseableHttpResponse response = client.execute(httpGet)) {
                String responseContent = getResponseContent(response.getEntity());
                if (!predicate.test(responseContent)) {
                    return responseContent;
                }
                if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                    // A successful answer with unusable content will not improve on retry.
                    return null;
                }
            } catch (IOException | RuntimeException ignored) {
                // Network or parsing problem: fall through to the next attempt.
            }
        }
        return null;
    }

    /**
     * Reads the entity body and decodes it to text.
     *
     * <p>The charset comes from the Content-Type header. When the header has none, the body
     * is decoded with the platform default charset and searched for a {@code <meta>} charset;
     * if one is found the body is decoded again with it.
     *
     * @param httpEntity response entity; may be {@code null}
     * @return decoded body, or {@code null} when {@code httpEntity} is {@code null}
     * @throws IOException when the body cannot be read
     */
    public static String getResponseContent(final HttpEntity httpEntity) throws IOException {
        if (httpEntity == null) {
            return null;
        }
        byte[] body;
        try (InputStream in = openContent(httpEntity)) {
            body = IOUtils.toByteArray(in);
        }

        Charset charset = null;
        ContentType contentType = ContentType.get(httpEntity);
        if (contentType != null) {
            charset = contentType.getCharset();
        }
        if (charset == null) {
            String content = new String(body, Charset.defaultCharset());
            charset = getHtmlCharset(content);
            if (charset == null) {
                return content;
            }
        }
        return new String(body, charset);
    }

    /** Opens the entity stream, unwrapping gzip when the entity still declares that encoding. */
    private static InputStream openContent(HttpEntity httpEntity) throws IOException {
        Header encoding = httpEntity.getContentEncoding();
        // NOTE(review): with default client settings HttpClient decompresses gzip itself,
        // so this branch presumably only matters when that is disabled - confirm.
        if (encoding != null && "gzip".equalsIgnoreCase(encoding.getValue())) {
            return new GzipDecompressingEntity(httpEntity).getContent();
        }
        return httpEntity.getContent();
    }

    /**
     * Looks for a charset declared in the {@code <meta>} tags of an HTML document.
     *
     * <p>Both {@code content="...; charset=X"} and {@code charset="X"} are recognised.
     * Declarations that are empty, malformed or unsupported are skipped.
     *
     * @param html document text; may be {@code null} or empty
     * @return the first usable charset, or {@code null} when none is declared
     */
    public static Charset getHtmlCharset(final String html) {
        if (StringUtils.isEmpty(html)) {
            return null;
        }
        for (Element meta : Jsoup.parse(html).select("meta")) {
            String metaContent = meta.attr("content");
            String metaCharset = meta.attr("charset");
            String declared = null;
            int at = metaContent.indexOf("charset=");
            if (at >= 0) {
                declared = metaContent.substring(at + "charset=".length());
            } else if (!StringUtils.isEmpty(metaCharset)) {
                declared = metaCharset;
            }
            Charset charset = toCharset(declared);
            if (charset != null) {
                return charset;
            }
        }
        return null;
    }

    /** Resolves a declared charset name; returns null instead of throwing on bad names. */
    private static Charset toCharset(String declared) {
        if (declared == null) {
            return null;
        }
        // Drop anything after the charset token, e.g. further ';'-separated parameters.
        int semicolon = declared.indexOf(';');
        String name = (semicolon >= 0 ? declared.substring(0, semicolon) : declared).trim();
        if (name.isEmpty()) {
            return null;
        }
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException ignored) {
            // Covers both illegal and unsupported charset names.
            return null;
        }
    }

    /**
     * Creates the GET request with browser-like headers and the session cookie.
     */
    private HttpGet createHttpGet(String url, String cookie) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(getRequestConfig());
        httpGet.setHeader(HttpHeaders.USER_AGENT,
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
        httpGet.addHeader(HttpHeaders.ACCEPT,
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        // "br" is deliberately not advertised: nothing in this class can decode Brotli,
        // so a Brotli-encoded answer would come back as unreadable bytes.
        httpGet.addHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
        httpGet.addHeader(HttpHeaders.ACCEPT_LANGUAGE, "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7");
        httpGet.addHeader(HttpHeaders.CONNECTION, "keep-alive");
        httpGet.addHeader("Cookie", cookie);
        httpGet.addHeader(HttpHeaders.HOST, "d.weibo.com");
        return httpGet;
    }

    /**
     * @return request settings: 3000 ms socket, connect and connection-request timeouts,
     *         plus the proxy held in {@code httpHost}
     */
    private RequestConfig getRequestConfig() {
        return RequestConfig.custom()
                .setSocketTimeout(3000)
                .setConnectTimeout(3000)
                .setConnectionRequestTimeout(3000)
                .setProxy(httpHost)
                .build();
    }

    /**
     * Pauses the current thread.
     *
     * <p>If the thread is interrupted the method returns early and restores the interrupt
     * flag, so callers can notice the interruption and stop.
     *
     * @param time pause length in milliseconds
     */
    public void sleep(long time) {
        try {
            Thread.sleep(time);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}
src/main/java/com/zhiweidata/weiboDomain/dao/DomainDao.java
0 → 100644
View file @
158abbbc
/**
* @Title: DomainDao.java
* @Package com.zhiweidata.weiboDomain.dao
* @Description: Data-access interface for crawled WeiboDomain records.
* @author xuyimeng
* @date 2018年2月23日 下午2:34:36
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
dao
;
import
java.util.List
;
import
com.zhiweidata.weiboDomain.entity.WeiboDomain
;
/**
 * Data-access contract for crawled {@link WeiboDomain} records.
 *
 * @author xuyimeng
 */
public interface DomainDao {

    /**
     * @param uid user id to match
     * @return records whose uid equals {@code uid}
     */
    List<WeiboDomain> findByUid(String uid);

    /**
     * @param domain domain name to match
     * @return records whose domain equals {@code domain}
     */
    List<WeiboDomain> findByDomain(String domain);

    /**
     * @return every stored record
     */
    List<WeiboDomain> findAll();

    /**
     * Stores the given records.
     *
     * @param list records to store
     */
    void insert(List<WeiboDomain> list);

    /** Creates the collection that subsequent crawl results are written to. */
    void createColl();

    /**
     * @return the name of the collection currently used for reads and writes
     */
    String bestNewCollName();
}
src/main/java/com/zhiweidata/weiboDomain/dao/TagDao.java
0 → 100644
View file @
158abbbc
/**
* @Title: TagDao.java
* @Package com.zhiweidata.weiboDomain.dao
* @Description: Data-access interface for DomainTag crawl-state records.
* @author xuyimeng
* @date 2018年2月23日 下午5:17:58
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
dao
;
import
java.util.List
;
import
com.zhiweidata.weiboDomain.entity.DomainTag
;
/**
 * Data-access contract for {@link DomainTag} records, which carry the crawl state of each domain.
 *
 * @author xuyimeng
 */
public interface TagDao {

    /**
     * @return every stored tag record
     */
    List<DomainTag> findAll();

    /**
     * Finds tag records by state.
     *
     * @param state state value used as the search bound
     * @return the matching records
     */
    List<DomainTag> findByState(Integer state);

    /**
     * Sets the state of the records belonging to a domain.
     *
     * @param domain domain whose records are updated
     * @param state  new state value
     */
    void updateByState(String domain, Integer state);
}
src/main/java/com/zhiweidata/weiboDomain/dao/impl/DomainDaoImpl.java
0 → 100644
View file @
158abbbc
/**
* @Title: DomainDaoImpl.java
* @Package com.zhiweidata.weiboDomain.dao.impl
* @Description: MongoTemplate-backed DomainDao that works on date-suffixed weiboDomain collections.
* @author xuyimeng
* @date 2018年2月23日 下午2:57:24
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
dao
.
impl
;
import
java.time.LocalDate
;
import
java.time.format.DateTimeFormatter
;
import
java.util.List
;
import
java.util.Set
;
import
javax.annotation.Resource
;
import
org.springframework.data.mongodb.core.MongoTemplate
;
import
org.springframework.data.mongodb.core.query.Criteria
;
import
org.springframework.data.mongodb.core.query.Query
;
import
org.springframework.stereotype.Repository
;
import
com.zhiweidata.weiboDomain.dao.DomainDao
;
import
com.zhiweidata.weiboDomain.entity.WeiboDomain
;
/**
 * {@link DomainDao} backed by {@link MongoTemplate}.
 *
 * <p>Records live in collections named {@code weiboDomain<yyyyMMdd>}. Reads and writes
 * always target the collection returned by {@link #bestNewCollName()}.
 *
 * @author xuyimeng
 */
@Repository
public class DomainDaoImpl implements DomainDao {

    /** Text a collection name must contain to be considered a crawl collection. */
    private static final String COLLECTION_MARKER = "weiboDomain";

    /** Date suffix layout of collections created by {@link #createColl()}. */
    private static final DateTimeFormatter COLLECTION_DATE = DateTimeFormatter.ofPattern("yyyyMMdd");

    @Resource
    MongoTemplate mongoTemplate;

    /**
     * @param uid user id to match
     * @return records in the current collection whose {@code uid} field equals {@code uid}
     */
    @Override
    public List<WeiboDomain> findByUid(String uid) {
        Query query = new Query(Criteria.where("uid").is(uid));
        return mongoTemplate.find(query, WeiboDomain.class, bestNewCollName());
    }

    /**
     * @param domain domain name to match
     * @return records in the current collection whose {@code domain} field equals {@code domain}
     */
    @Override
    public List<WeiboDomain> findByDomain(String domain) {
        Query query = new Query(Criteria.where("domain").is(domain));
        return mongoTemplate.find(query, WeiboDomain.class, bestNewCollName());
    }

    /**
     * @return every record of the current collection
     */
    @Override
    public List<WeiboDomain> findAll() {
        return mongoTemplate.findAll(WeiboDomain.class, bestNewCollName());
    }

    /**
     * Saves each record into the current collection.
     *
     * @param list records to save; {@code null} or empty is a no-op
     */
    @Override
    public void insert(List<WeiboDomain> list) {
        if (list == null || list.isEmpty()) {
            // Nothing to write, so skip the collection lookup as well.
            return;
        }
        String collName = bestNewCollName();
        for (WeiboDomain weiboDomain : list) {
            mongoTemplate.save(weiboDomain, collName);
        }
    }

    /**
     * Picks the current collection: among all collection names containing
     * {@code weiboDomain}, the one that sorts last as a string. With the date suffix
     * used by {@link #createColl()} that is the most recent one.
     *
     * @return the collection name, or an empty string when no such collection exists
     */
    @Override
    public String bestNewCollName() {
        Set<String> names = mongoTemplate.getCollectionNames();
        return names.stream()
                .filter(name -> name.contains(COLLECTION_MARKER))
                .max(String::compareTo)
                .orElse("");
    }

    /**
     * Creates today's collection, {@code weiboDomain<yyyyMMdd>}.
     *
     * <p>Does nothing when the collection already exists, so calling it a second time
     * on the same day no longer fails.
     */
    @Override
    public void createColl() {
        String collName = COLLECTION_MARKER + LocalDate.now().format(COLLECTION_DATE);
        if (!mongoTemplate.collectionExists(collName)) {
            mongoTemplate.createCollection(collName);
        }
    }
}
src/main/java/com/zhiweidata/weiboDomain/dao/impl/TagDaoImpl.java
0 → 100644
View file @
158abbbc
/**
* @Title: TagDaoImpl.java
* @Package com.zhiweidata.weiboDomain.dao.impl
* @Description: MongoTemplate-backed TagDao for DomainTag crawl-state records.
* @author xuyimeng
* @date 2018年2月23日 下午5:21:35
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
dao
.
impl
;
import
java.util.List
;
import
javax.annotation.Resource
;
import
org.springframework.data.mongodb.core.MongoTemplate
;
import
org.springframework.data.mongodb.core.query.Criteria
;
import
org.springframework.data.mongodb.core.query.Query
;
import
org.springframework.data.mongodb.core.query.Update
;
import
org.springframework.stereotype.Repository
;
import
com.zhiweidata.weiboDomain.dao.TagDao
;
import
com.zhiweidata.weiboDomain.entity.DomainTag
;
/**
 * {@link TagDao} backed by {@link MongoTemplate}, operating on the collection mapped
 * for {@link DomainTag}.
 *
 * @author xuyimeng
 */
@Repository
public class TagDaoImpl implements TagDao {

    @Resource
    private MongoTemplate mongoTemplate;

    /**
     * @return every stored tag record
     */
    @Override
    public List<DomainTag> findAll() {
        return mongoTemplate.findAll(DomainTag.class);
    }

    /**
     * Sets {@code state} on every record whose {@code domain} field equals the given domain.
     *
     * @param domain domain to match
     * @param state  new state value
     */
    @Override
    public void updateByState(String domain, Integer state) {
        Query byDomain = Query.query(Criteria.where("domain").is(domain));
        Update setState = Update.update("state", state);
        mongoTemplate.updateMulti(byDomain, setState, DomainTag.class);
    }

    /**
     * Finds records whose {@code state} is less than or equal to the given value.
     * Note that this is an upper bound, not an equality match.
     *
     * @param state inclusive upper bound for the state
     * @return the matching records
     */
    @Override
    public List<DomainTag> findByState(Integer state) {
        Query upToState = Query.query(Criteria.where("state").lte(state));
        return mongoTemplate.find(upToState, DomainTag.class);
    }
}
src/main/java/com/zhiweidata/weiboDomain/entity/DomainTag.java
0 → 100644
View file @
158abbbc
/**
* @Title: DomainTag.java
* @Package com.zhiweidata.weiboDomain.entity
* @Description: Mongo document describing one domain to crawl and its crawl state.
* @author xuyimeng
* @date 2018年2月23日 下午3:23:31
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
entity
;
import
java.util.List
;
import
org.springframework.data.mongodb.core.mapping.Document
;
import
lombok.Data
;
/**
 * Mongo document in the {@code domainTag} collection: one domain to crawl together
 * with its crawl state. Accessors are generated by Lombok.
 *
 * @author xuyimeng
 */
@Data
@Document(collection = "domainTag")
public class DomainTag {

    // Document id.
    private String _id;

    // Tags attached to the domain.
    private List<String> tags;

    // Domain name.
    private String domain;

    // Identifier of the domain; presumably the path segment used in d.weibo.com URLs - confirm.
    private String domainId;

    // Crawl state of the domain.
    private Integer state;
}
src/main/java/com/zhiweidata/weiboDomain/entity/WeiboDomain.java
0 → 100644
View file @
158abbbc
/**
* @Title: WeiboDomain.java
* @Package com.zhiweidata.weiboDomain.entity
* @Description: Entity holding one Weibo user scraped from a domain listing.
* @author xuyimeng
* @date 2018年2月23日 下午2:37:27
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
entity
;
import
org.springframework.data.annotation.Id
;
import
org.springframework.data.mongodb.core.index.Indexed
;
import
lombok.Data
;
/**
 * One Weibo user record stored in MongoDB. Accessors are generated by Lombok.
 *
 * @author xuyimeng
 */
@Data
public class WeiboDomain {

    // Primary key of the document.
    @Id
    private String id;

    // User id (indexed).
    @Indexed
    private String uid;

    // Display name.
    private String name;

    // Profile URL.
    private String url;

    // Gender code.
    private String gender;

    // Location text.
    private String location;

    // Self-description text.
    private String description;

    // Domain the user was listed under (indexed).
    @Indexed
    private String domain;

    // Tag text.
    private String tag;

    // Follower count, kept as text.
    private String followers_count;

    // Number of accounts the user follows.
    private Integer friends_count;

    // Number of posts.
    private Integer statuses_count;

    // Whether the user is a VIP member.
    private boolean isVip;

    // Time the record was produced, as formatted text.
    private String updateTime;
}
\ No newline at end of file
src/main/java/com/zhiweidata/weiboDomain/excel/DBOExp.java
0 → 100644
View file @
158abbbc
package
com
.
zhiweidata
.
weiboDomain
.
excel
;
import
java.io.File
;
import
java.io.FileNotFoundException
;
import
java.io.FileOutputStream
;
import
java.io.IOException
;
import
java.io.OutputStream
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
com.mongodb.DBObject
;
/**
 * Exports lists of MongoDB {@link DBObject}s to an Excel workbook through
 * {@code SimpeExcelReport}.
 */
public class DBOExp {

    /**
     * Writes the objects to a sheet of the given workbook file.
     *
     * <p>When the file does not exist yet it is created with one sheet; otherwise the
     * sheet is added to the existing file. The column names are the keys of the first
     * object. An empty or {@code null} list writes nothing.
     *
     * @author 陈炜涛
     * @param listChai  objects to export
     * @param fliename  path of the workbook file
     * @param sheetName name of the sheet to write
     */
    public void putRun(List<DBObject> listChai, String fliename, String sheetName) {
        if (listChai == null || listChai.isEmpty()) {
            // No first element to take the header from. This used to create an empty
            // file and then fail with IndexOutOfBoundsException.
            return;
        }
        SimpeExcelReport simpe = SimpeExcelReport.getInstance();
        File excelFile = new File(fliename);
        List<String> header = headList(listChai.get(0));
        List<Map<String, Object>> dataList = bodyList(listChai);

        if (excelFile.exists()) {
            // The report class writes to the file itself, so no stream is opened on it here.
            simpe.addSheetInExcelWithFile(header, dataList, new File(fliename), sheetName);
            return;
        }
        // The stream is closed even if the report class throws, and a file that cannot
        // be opened no longer leads to a NullPointerException further down.
        try (OutputStream osOutputStream = new FileOutputStream(excelFile, true)) {
            simpe.createExcelWithStream(header, dataList, osOutputStream, sheetName, excelFile);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the column names for a sheet.
     *
     * @param dbo object whose keys become the column names
     * @return a new list with the keys of {@code dbo}
     */
    public static List<String> headList(DBObject dbo) {
        return new ArrayList<String>(dbo.keySet());
    }

    /**
     * Converts the objects into row maps.
     *
     * <p>Every row uses the keys of the FIRST object; keys that appear only in later
     * objects are not exported. Values are copied as returned by {@code DBObject.get},
     * so a key missing from an object yields a {@code null} entry.
     * NOTE(review): an earlier comment said the export class cannot handle null values,
     * yet nothing here filters them - confirm whether that is a problem.
     *
     * @param lists objects to convert
     * @return one map per object, column name to value; empty for an empty or {@code null} list
     */
    public List<Map<String, Object>> bodyList(List<DBObject> lists) {
        List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();
        if (lists == null || lists.isEmpty()) {
            return dataList;
        }
        List<String> keys = headList(lists.get(0));
        for (DBObject dbo : lists) {
            Map<String, Object> row = new HashMap<String, Object>();
            for (String key : keys) {
                row.put(key, dbo.get(key));
            }
            dataList.add(row);
        }
        return dataList;
    }
}
src/main/java/com/zhiweidata/weiboDomain/excel/SimpeExcelReport.java
0 → 100644
View file @
158abbbc
This diff is collapsed.
Click to expand it.
src/main/java/com/zhiweidata/weiboDomain/service/MongoSerivce.java
0 → 100644
View file @
158abbbc
/**
* @Title: MongoSerivce.java
* @Package com.zhiweidata.weiboDomain.service
* @Description: Service that crawls each pending domain page by page and stores the parsed users.
* @author xuyimeng
* @date 2018年2月23日 下午4:55:00
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
service
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
javax.annotation.Resource
;
import
org.springframework.stereotype.Service
;
import
com.zhiweidata.weiboDomain.crawler.JsoupHtml
;
import
com.zhiweidata.weiboDomain.crawler.WeiboDomainCrawler
;
import
com.zhiweidata.weiboDomain.dao.DomainDao
;
import
com.zhiweidata.weiboDomain.dao.TagDao
;
import
com.zhiweidata.weiboDomain.entity.DomainTag
;
import
com.zhiweidata.weiboDomain.entity.WeiboDomain
;
import
lombok.extern.slf4j.Slf4j
;
/**
 * Drives the crawl: selects the domains to process, downloads their user-list pages,
 * stores the parsed users and records progress in the tag collection.
 *
 * <p>State values written here: 0 when a domain is queued, 2 after its data has been stored.
 *
 * @author xuyimeng
 */
@Slf4j
@Service
public class MongoSerivce {

    /** Exclusive upper bound on the page index requested per domain. */
    private static final int MAX_PAGES = 300;

    /** Pause between requests, in milliseconds. */
    private static final long REQUEST_DELAY_MILLIS = 3000L;

    @Resource
    TagDao tagDao;

    @Resource
    DomainDao domainDao;

    WeiboDomainCrawler crawler = new WeiboDomainCrawler();

    JsoupHtml jsoupHtml = JsoupHtml.getInstance();

    /**
     * Crawls every queued domain and stores the result.
     *
     * <p>For each domain: download and parse all pages, insert the records, then set the
     * domain's state to 2.
     *
     * @param cookie value sent as the Cookie header on every request
     */
    public void crawlerData(String cookie) {
        for (Map.Entry<String, String> entry : groupSet().entrySet()) {
            String domain = entry.getKey();
            String domainId = entry.getValue();
            log.info("【{}】页开始爬取...............", domain);
            List<WeiboDomain> list = parse(domain, domainId, cookie);
            log.info("【{}】页所有数据爬取结束...............", domain);
            domainDao.insert(list);
            tagDao.updateByState(domain, 2);
            log.info("【{}】所有页数据存储成功,共计【{}】条数据", domain, list.size());
        }
        log.info("所有页面爬取结束,程序结束");
    }

    /**
     * Fetches the landing page until a non-zero page count can be read from it.
     *
     * @return the page count; 0 only when the thread was interrupted
     */
    private int getPageNum(String domainId, String cookie) {
        while (!Thread.currentThread().isInterrupted()) {
            String page = crawler.getPage(domainId, cookie);
            crawler.sleep(REQUEST_DELAY_MILLIS);
            int num = jsoupHtml.parsePage(page);
            if (num != 0) {
                return num;
            }
        }
        return 0;
    }

    /**
     * Downloads and parses the user-list pages of one domain, starting at page 1.
     *
     * <p>A page without usable content ends the crawl, unless far fewer pages were
     * collected than the pager announced (collected records / 10 + 2 is still below the
     * page count); in that case the same page is requested again after a pause.
     *
     * @return all records parsed for the domain
     */
    private List<WeiboDomain> parse(String domain, String domainId, String cookie) {
        List<WeiboDomain> result = new ArrayList<>();
        int num = getPageNum(domainId, cookie);
        int i = 1;
        while (i < MAX_PAGES && !Thread.currentThread().isInterrupted()) {
            String url = "https://d.weibo.com/" + domainId + "?pids=Pl_Core_F4RightUserList__4" + "&page=" + i
                    + "&ajaxpagelet=1&__ref=/" + domainId;
            String html = crawler.getHtml(url, cookie);
            if (html == null) {
                if ((result.size() / 10) + 2 < num) {
                    // Pause before asking again. The retry used to fire immediately,
                    // which hammered the server whenever a page stayed unavailable.
                    log.warn("[{}] page {} returned no usable content, retrying", domain, i);
                    crawler.sleep(REQUEST_DELAY_MILLIS);
                    continue;
                }
                break;
            }
            result.addAll(jsoupHtml.parseData(html, domain));
            log.info("【{}】:第【{}】页爬取成功", domain, i);
            i++;
            crawler.sleep(REQUEST_DELAY_MILLIS);
        }
        return result;
    }

    /**
     * Selects the domains to crawl (tag records returned by {@code findByState(1)})
     * and sets their state to 0.
     *
     * @return domain name mapped to domain id
     */
    private Map<String, String> groupSet() {
        return resetStates(tagDao.findByState(1));
    }

    /**
     * Resets every tag's state to 0 and creates a fresh result collection.
     * Call this before starting a new crawl; skip it when resuming an interrupted one.
     */
    public void initTag() {
        resetStates(tagDao.findAll());
        domainDao.createColl();
    }

    /** Collects domain to domainId from the tags, then sets state 0 once per distinct domain. */
    private Map<String, String> resetStates(List<DomainTag> tags) {
        Map<String, String> result = new HashMap<>();
        for (DomainTag domainTag : tags) {
            result.put(domainTag.getDomain(), domainTag.getDomainId());
        }
        for (String domain : result.keySet()) {
            tagDao.updateByState(domain, 0);
        }
        return result;
    }
}
src/main/java/com/zhiweidata/weiboDomain/start/Start.java
0 → 100644
View file @
158abbbc
/**
* @Title: Start.java
* @Package com.zhiweidata.weiboDomain.start
* @Description: Command-line entry point that loads the Spring context and runs the crawl.
* @author xuyimeng
* @date 2018年2月23日 下午3:09:33
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
start
;
import
org.springframework.context.ApplicationContext
;
import
org.springframework.context.support.ClassPathXmlApplicationContext
;
import
com.zhiweidata.weiboDomain.service.MongoSerivce
;
/**
 * Command-line entry point.
 *
 * <p>Usage: {@code Start [--init] [cookie]}
 * <ul>
 *   <li>{@code --init} resets all tag states and creates a new result collection before
 *       crawling. Use it for a fresh crawl; omit it to resume an interrupted one.</li>
 *   <li>{@code cookie} is the Cookie header value. When omitted, the {@code WEIBO_COOKIE}
 *       environment variable is used, then the built-in legacy value.</li>
 * </ul>
 *
 * @author xuyimeng
 */
public class Start {

    /** Flag that requests a state reset before crawling. */
    private static final String INIT_FLAG = "--init";

    /** Environment variable consulted when no cookie argument is given. */
    private static final String COOKIE_ENV = "WEIBO_COOKIE";

    // SECURITY NOTE(review): this is a real session cookie committed to source control.
    // It is kept only so that running without arguments behaves as before. It should be
    // revoked and removed; supply the cookie by argument or environment variable instead.
    private static final String LEGACY_COOKIE = "login_sid_t=2da8770fb84cdb5be026bbfcd76ef1e6; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=873655794108.0503.1519525903336; SINAGLOBAL=873655794108.0503.1519525903336; ULV=1519525903344:1:1:1:873655794108.0503.1519525903336:; SSOLoginState=1519525975; SCF=AqU8lfV6ROhTkYEEmVi2ROhtdMxlB0mT3EF2ABKenC3OfC3SeK3YfvZYWFJY8ytsaFhYcc1vO5hvhLwolzBW5ps.; SUB=_2A253llAIDeRhGeNH6VoY9C7Mzz-IHXVU4sbArDV8PUNbmtBeLUnSkW9NStghaGFgK4WPoq15L2ikM_srwT7hNvkI; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5eochNrdf3XKPD1VaPcG3T5JpX5K2hUgL.Fo-4eon4Sh57She2dJLoIEQLxK-LBKBLBo2LxKBLBo.L12zLxK-L1-BLBKqLxKML1hBLBoqEeh2ceh-t; SUHB=0mxUFkR8aaPo5m; ALF=1551061975; un=18395807152; wvr=6; YF-Page-G0=416186e6974c7d5349e42861f3303251";

    /**
     * Loads {@code spring-context.xml}, optionally resets the crawl state, runs the crawl
     * and closes the context afterwards.
     *
     * @param args optional {@code --init} flag and cookie value, in any order
     */
    public static void main(String[] args) {
        boolean init = false;
        String cookieArg = null;
        if (args != null) {
            for (String arg : args) {
                if (INIT_FLAG.equals(arg)) {
                    init = true;
                } else if (cookieArg == null) {
                    cookieArg = arg;
                }
            }
        }
        String cookie = resolveCookie(cookieArg);

        // Closing the context releases the resources it created once the crawl finishes.
        try (ClassPathXmlApplicationContext ctx = new ClassPathXmlApplicationContext("spring-context.xml")) {
            MongoSerivce service = ctx.getBean(MongoSerivce.class);
            if (init) {
                service.initTag();
            }
            service.crawlerData(cookie);
        }
    }

    /** Picks the cookie: argument first, then the environment variable, then the legacy value. */
    private static String resolveCookie(String cookieArg) {
        if (cookieArg != null && !cookieArg.trim().isEmpty()) {
            return cookieArg;
        }
        String fromEnv = System.getenv(COOKIE_ENV);
        if (fromEnv != null && !fromEnv.trim().isEmpty()) {
            return fromEnv;
        }
        return LEGACY_COOKIE;
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment