Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
W
weiboDomain
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
xuyimeng
weiboDomain
Commits
158abbbc
Commit
158abbbc
authored
Feb 28, 2018
by
win7
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
sss
parents
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
1978 additions
and
0 deletions
+1978
-0
pom.xml
+65
-0
src/main/java/com/zhiweidata/weiboDomain/crawler/HttpclientInstance.java
+96
-0
src/main/java/com/zhiweidata/weiboDomain/crawler/JsoupHtml.java
+188
-0
src/main/java/com/zhiweidata/weiboDomain/crawler/WeiboDomainCrawler.java
+238
-0
src/main/java/com/zhiweidata/weiboDomain/dao/DomainDao.java
+37
-0
src/main/java/com/zhiweidata/weiboDomain/dao/TagDao.java
+30
-0
src/main/java/com/zhiweidata/weiboDomain/dao/impl/DomainDaoImpl.java
+91
-0
src/main/java/com/zhiweidata/weiboDomain/dao/impl/TagDaoImpl.java
+57
-0
src/main/java/com/zhiweidata/weiboDomain/entity/DomainTag.java
+33
-0
src/main/java/com/zhiweidata/weiboDomain/entity/WeiboDomain.java
+44
-0
src/main/java/com/zhiweidata/weiboDomain/excel/DBOExp.java
+129
-0
src/main/java/com/zhiweidata/weiboDomain/excel/SimpeExcelReport.java
+780
-0
src/main/java/com/zhiweidata/weiboDomain/service/MongoSerivce.java
+155
-0
src/main/java/com/zhiweidata/weiboDomain/start/Start.java
+35
-0
No files found.
pom.xml
0 → 100644
View file @
158abbbc
<!-- Maven project descriptor for the weiboDomain module (com.zhiweidata.weiboDomain:weiboDomain). -->
<project
xmlns=
"http://maven.apache.org/POM/4.0.0"
xmlns:xsi=
"http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation=
"http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiweidata.weiboDomain
</groupId>
<artifactId>
weiboDomain
</artifactId>
<version>
0.0.1-SNAPSHOT
</version>
<dependencies>
<!-- NOTE(review): junit is declared without a <scope> element; confirm whether test scope was intended. -->
<dependency>
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
<version>
4.12
</version>
</dependency>
<!-- Spring Data MongoDB: provides the MongoTemplate used by the DAO implementations. -->
<dependency>
<groupId>
org.springframework.data
</groupId>
<artifactId>
spring-data-mongodb
</artifactId>
<version>
1.10.10.RELEASE
</version>
</dependency>
<!-- NOTE(review): spring-test is also declared without a <scope>; confirm whether it is needed outside tests. -->
<dependency>
<groupId>
org.springframework
</groupId>
<artifactId>
spring-test
</artifactId>
<version>
4.3.14.RELEASE
</version>
</dependency>
<!-- MongoDB Java driver (supplies com.mongodb.DBObject used by the Excel export). -->
<dependency>
<groupId>
org.mongodb
</groupId>
<artifactId>
mongo-java-driver
</artifactId>
<version>
3.2.2
</version>
</dependency>
<!-- Lombok: the entity classes use @Data and SimpeExcelReport uses @Slf4j. -->
<dependency>
<groupId>
org.projectlombok
</groupId>
<artifactId>
lombok
</artifactId>
<version>
1.16.20
</version>
</dependency>
<!-- jsoup: HTML parsing in the crawler package. -->
<dependency>
<groupId>
org.jsoup
</groupId>
<artifactId>
jsoup
</artifactId>
<version>
1.10.1
</version>
</dependency>
<!-- Apache HttpClient: HTTP transport for the crawler. -->
<dependency>
<groupId>
org.apache.httpcomponents
</groupId>
<artifactId>
httpclient
</artifactId>
<version>
4.5.5
</version>
</dependency>
<!-- Commons IO: IOUtils is used when reading response bodies. -->
<dependency>
<groupId>
commons-io
</groupId>
<artifactId>
commons-io
</artifactId>
<version>
2.5
</version>
</dependency>
<!-- Commons Lang 3: StringUtils is used by the crawler. -->
<dependency>
<groupId>
org.apache.commons
</groupId>
<artifactId>
commons-lang3
</artifactId>
<version>
3.7
</version>
</dependency>
<!-- json-lib (net.sf.json.JSONObject), requested with the jdk15 classifier. -->
<dependency>
<groupId>
net.sf.json-lib
</groupId>
<artifactId>
json-lib
</artifactId>
<version>
2.4
</version>
<classifier>
jdk15
</classifier>
</dependency>
<!-- NOTE(review): com.zhiwei:jxlzw is a SNAPSHOT artifact, presumably resolved from a private repository; confirm it is reachable from the build environment. -->
<dependency>
<groupId>
com.zhiwei
</groupId>
<artifactId>
jxlzw
</artifactId>
<version>
0.0.2-SNAPSHOT
</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
src/main/java/com/zhiweidata/weiboDomain/crawler/HttpclientInstance.java
0 → 100644
View file @
158abbbc
/**
* @Title: HttpclientInstance.java
* @Package com.zhiweidata.weiboDomain.crawler
* @Description: Factory that builds pooled Apache HttpClient instances for the crawler.
* @author xuyimeng
* @date 2018年2月23日 下午1:54:32
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
crawler
;
import
java.util.List
;
import
javax.net.ssl.SSLContext
;
import
org.apache.http.HttpHost
;
import
org.apache.http.client.CookieStore
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.config.Registry
;
import
org.apache.http.config.RegistryBuilder
;
import
org.apache.http.conn.socket.ConnectionSocketFactory
;
import
org.apache.http.conn.socket.PlainConnectionSocketFactory
;
import
org.apache.http.conn.ssl.SSLConnectionSocketFactory
;
import
org.apache.http.cookie.Cookie
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.HttpClientBuilder
;
import
org.apache.http.impl.client.HttpClients
;
import
org.apache.http.impl.conn.PoolingHttpClientConnectionManager
;
import
org.apache.http.ssl.SSLContexts
;
/**
 * Factory for the Apache {@link CloseableHttpClient} instances used by the crawler.
 *
 * <p>Every generated client gets its own pooled connection manager and a
 * User-Agent picked at random from a fixed list of browser signatures.
 *
 * @author xuyimeng
 */
public class HttpclientInstance {

    /** Pool of User-Agent strings; one of them is chosen for each generated client. */
    private static final String[] USER_AGENTS = {
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        "NokiaX2-02/2.0 (11.79) Profile/MIDP-2.1 Configuration/CLDC-1.1 Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2;.NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2) UCBrowser8.4.0.159/70/352",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    };

    /**
     * Builds a client without a proxy.
     *
     * @param cookieStore cookie store installed as the client's default; may be {@code null}
     * @return a new client backed by its own connection pool
     */
    public static CloseableHttpClient generateClient(CookieStore cookieStore) {
        HttpHost noProxy = null;
        return generateClient(noProxy, cookieStore);
    }

    /**
     * Builds a client that optionally routes its requests through a proxy.
     *
     * @param httpHost    proxy to set on the default request configuration; {@code null} for none
     * @param cookieStore cookie store installed as the client's default; may be {@code null}
     * @return a new client backed by its own connection pool
     */
    public static CloseableHttpClient generateClient(HttpHost httpHost, CookieStore cookieStore) {
        PoolingHttpClientConnectionManager pool = newConnectionManager();
        RequestConfig defaults = RequestConfig.custom()
                .setProxy(httpHost)
                .build();
        return HttpClients.custom()
                .setUserAgent(randomUserAgent())
                .setConnectionManager(pool)
                .setDefaultRequestConfig(defaults)
                .setDefaultCookieStore(cookieStore)
                .build();
    }

    /** Creates the pooled connection manager (http + https) shared by the requests of one client. */
    private static PoolingHttpClientConnectionManager newConnectionManager() {
        SSLContext systemSsl = SSLContexts.createSystemDefault();
        Registry<ConnectionSocketFactory> schemes = RegistryBuilder.<ConnectionSocketFactory>create()
                .register("http", PlainConnectionSocketFactory.INSTANCE)
                .register("https", new SSLConnectionSocketFactory(systemSsl))
                .build();
        PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager(schemes);
        // At most 200 connections in total, 20 per route.
        pool.setMaxTotal(200);
        pool.setDefaultMaxPerRoute(20);
        return pool;
    }

    /**
     * Picks one entry of the User-Agent pool at random.
     *
     * @return a User-Agent header value
     */
    public static String randomUserAgent() {
        int index = (int) (Math.random() * USER_AGENTS.length);
        return USER_AGENTS[index];
    }
}
src/main/java/com/zhiweidata/weiboDomain/crawler/JsoupHtml.java
0 → 100644
View file @
158abbbc
/**
* @Title: JsoupHtml.java
* @Package com.zhiweidata.weiboDomain.crawler
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2018年2月23日 下午3:16:06
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
crawler
;
import
java.time.LocalDateTime
;
import
java.time.format.DateTimeFormatter
;
import
java.util.ArrayList
;
import
java.util.List
;
import
javax.print.Doc
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
com.zhiweidata.weiboDomain.entity.WeiboDomain
;
import
net.sf.json.JSONObject
;
/**
 * Parses the HTML returned by Weibo domain listing pages.
 *
 * <p>The interesting markup is not in the page body itself: it is carried as the
 * {@code "html"} member of a JSON object passed to an {@code FM.view(...)} call
 * inside a {@code <script>} tag, so both public methods first unwrap that payload
 * and then parse it as HTML.
 *
 * @author xuyimeng
 */
public class JsoupHtml {

    private static final JsoupHtml jsoupHtml = new JsoupHtml();

    /** Start of the script call whose argument is the JSON payload. */
    private static final String VIEW_CALL = "FM.view(";

    /** Format of the {@code updateTime} stamped on every parsed record. */
    private static final DateTimeFormatter UPDATE_TIME_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    private JsoupHtml() {}

    /**
     * @return the shared parser instance
     */
    public static JsoupHtml getInstance() {
        return jsoupHtml;
    }

    /**
     * Converts one listing page into user records.
     *
     * <p>The payload is taken from the first {@code <script>} element of {@code html};
     * the call fails with an unchecked exception when that element is missing or does
     * not carry the expected JSON.
     *
     * @param html   raw page as returned by the crawler
     * @param domain domain identifier stored on every record and used as id prefix
     * @return one {@link WeiboDomain} per {@code <dd class="mod_info S_line1">} element
     */
    public List<WeiboDomain> parseData(String html, String domain) {
        List<WeiboDomain> result = new ArrayList<>();
        Document doc = Jsoup.parse(html);
        // Unwrap the markup that the page injects through its first script tag.
        Elements script = doc.getElementsByTag("script");
        String str = script.get(0).childNode(0).toString();
        str = getHtml(str);
        // Parse the unwrapped markup.
        doc = Jsoup.parse(str);
        for (Element element : doc.getElementsByTag("dd")) {
            if (element.attr("class").equals("mod_info S_line1")) {
                result.add(parseUser(element, domain));
            }
        }
        return result;
    }

    /** Builds one record from a single user entry ({@code <dd class="mod_info S_line1">}). */
    private WeiboDomain parseUser(Element userBlock, String domain) {
        WeiboDomain weiboDomainGroup = new WeiboDomain();
        String uid = "";
        for (Element div : userBlock.getElementsByTag("div")) {
            String cssClass = div.attr("class");
            if (cssClass.equals("info_name W_fb W_f14")) {
                Element nameLink = div.getElementsByClass("S_txt1").get(0);
                // usercard looks like "id=<uid>&..."; keep only the uid.
                uid = nameLink.attr("usercard").split("&")[0].replaceAll("id=", "");
                weiboDomainGroup.setUid(uid);
                weiboDomainGroup.setUrl(nameLink.attr("href"));
                weiboDomainGroup.setName(nameLink.attr("title"));
                applyIcons(div.getElementsByTag("i"), weiboDomainGroup);
            }
            if (cssClass.equals("info_connect")) {
                Elements em = div.getElementsByTag("em");
                weiboDomainGroup.setFriends_count(Integer.parseInt(em.get(0).text()));
                weiboDomainGroup.setFollowers_count(em.get(1).text());
                weiboDomainGroup.setStatuses_count(Integer.parseInt(em.get(2).text()));
            }
            if (cssClass.equals("info_add")) {
                weiboDomainGroup.setLocation(div.getElementsByTag("span").get(0).text());
            }
            if (cssClass.equals("info_intro")) {
                weiboDomainGroup.setDescription(div.getElementsByTag("span").get(0).text());
            }
            if (cssClass.equals("info_relation")) {
                weiboDomainGroup.setTag(div.text().split(":")[1]);
            }
        }
        weiboDomainGroup.setDomain(domain);
        weiboDomainGroup.setUpdateTime(LocalDateTime.now().format(UPDATE_TIME_FORMAT));
        weiboDomainGroup.setId(domain + "_" + uid);
        return weiboDomainGroup;
    }

    /**
     * Derives the VIP flag and the gender from the icons next to the user name.
     *
     * <p>The gender is decided once, after all icons have been seen: "m" when a male
     * icon is present, "f" otherwise. Deciding per icon would let a later non-gender
     * icon (for example the member badge) overwrite an earlier male icon with "f".
     * When there are no icons at all the gender is left unset.
     */
    private void applyIcons(Elements icons, WeiboDomain target) {
        boolean male = false;
        for (Element icon : icons) {
            String iconClass = icon.attr("class");
            if (iconClass.equals("W_icon icon_member")) {
                target.setVip(true);
            }
            if (iconClass.equals("W_icon icon_male")) {
                male = true;
            }
        }
        if (!icons.isEmpty()) {
            target.setGender(male ? "m" : "f");
        }
    }

    /**
     * Extracts the markup that the page hands to {@code FM.view(...)}.
     *
     * @param str content of the script element
     * @return value of the {@code "html"} member of the JSON argument
     */
    private String getHtml(String str) {
        JSONObject json = JSONObject.fromObject(extractViewPayload(str));
        return json.getString("html");
    }

    /**
     * Returns the text between {@code FM.view(} and the last closing parenthesis.
     *
     * <p>Only the wrapper is removed; parentheses inside the JSON payload are kept so
     * that text such as descriptions and locations is not corrupted. When the wrapper
     * is not found the input is returned unchanged.
     */
    private static String extractViewPayload(String script) {
        int start = script.indexOf(VIEW_CALL);
        int end = script.lastIndexOf(')');
        if (start < 0 || end < start) {
            return script;
        }
        return script.substring(start + VIEW_CALL.length(), end);
    }

    /**
     * Reads the highest page number offered by the pager of a domain page.
     *
     * @param page raw page as returned by the crawler
     * @return the largest page number found, or 0 when the page has no pager or
     *         cannot be parsed
     */
    public int parsePage(String page) {
        try {
            Document doc = Jsoup.parse(page);
            // Locate the script that carries the list markup (last match wins).
            String str = "";
            for (Element s : doc.getElementsByTag("script")) {
                // Script tags without inline content (e.g. src-only) have no child node.
                if (s.childNodeSize() == 0) {
                    continue;
                }
                String content = s.childNode(0).toString();
                if (content.contains("content.signInPeople.index")) {
                    str = content;
                }
            }
            JSONObject json = JSONObject.fromObject(extractViewPayload(str));
            doc = Jsoup.parse(json.getString("html"));
            int num = 0;
            for (Element e : doc.getElementsByTag("a")) {
                if ("page".equals(e.attr("bpfilter")) && "page S_txt1".equals(e.attr("class"))) {
                    int value = Integer.parseInt(e.text());
                    if (value > num) {
                        num = value;
                    }
                }
            }
            return num;
        } catch (Exception e) {
            // Best effort: any parsing problem is reported as "no pages".
            return 0;
        }
    }
}
src/main/java/com/zhiweidata/weiboDomain/crawler/WeiboDomainCrawler.java
0 → 100644
View file @
158abbbc
/**
* @Title: WeiboDomainCrawler.java
* @Package com.zhiweidata.weiboDomain.crawler
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2018年2月23日 下午1:59:46
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
crawler
;
import
java.io.ByteArrayInputStream
;
import
java.io.ByteArrayOutputStream
;
import
java.io.IOException
;
import
java.io.InputStream
;
import
java.nio.charset.Charset
;
import
java.util.function.Predicate
;
import
org.apache.commons.io.IOUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.http.Header
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.HttpHeaders
;
import
org.apache.http.HttpHost
;
import
org.apache.http.HttpStatus
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.client.entity.GzipDecompressingEntity
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpGet
;
import
org.apache.http.entity.ContentType
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
/**
 * Downloads Weibo domain listing pages over HTTP.
 *
 * <p>All instances share one {@link CloseableHttpClient}. Requests are retried until
 * a usable page is received or the server answers 200 with unusable content.
 *
 * @author xuyimeng
 */
public class WeiboDomainCrawler {

    /** Pause between two attempts of the same request, in milliseconds. */
    private static final long RETRY_DELAY_MILLIS = 3000L;

    private static final CloseableHttpClient client = HttpclientInstance.generateClient(null);

    /** Proxy applied to every request; never assigned in this class, so requests go direct. */
    private HttpHost httpHost;

    /**
     * Fetches the page at {@code url}.
     *
     * @param url    address to request
     * @param cookie value sent as the {@code Cookie} header
     * @return the page content, or {@code null} when no usable page could be obtained
     */
    public String getHtml(String url, String cookie) {
        return get(createHttpGet(url, cookie));
    }

    /**
     * Fetches the landing page of a domain ({@code https://d.weibo.com/<domainId>}).
     *
     * @param domainId path segment identifying the domain
     * @param cookie   value sent as the {@code Cookie} header
     * @return the page content, or {@code null} when no usable page could be obtained
     */
    public String getPage(String domainId, String cookie) {
        String url = "https://d.weibo.com/" + domainId;
        return get(createHttpGet(url, cookie));
    }

    /**
     * Executes the request with the default content check: a body is rejected when it
     * is empty, contains the "empty_con clearfix" marker, or lacks the
     * "follow_item S_line2" marker.
     *
     * @param httpGet prepared request
     * @return the page content, or {@code null}
     */
    private String get(HttpGet httpGet) {
        Predicate<String> predicate = s -> (s == null || "".equals(s))
                || s.contains("empty_con clearfix")
                || !s.contains("follow_item S_line2");
        return get(httpGet, predicate);
    }

    /**
     * Executes the request until it yields an acceptable body.
     *
     * <p>A body is returned as soon as {@code predicate} does not reject it. A 200
     * response with a rejected body ends the loop with {@code null}; any other outcome
     * (non-200 status or an exception) is retried after a pause. The loop also ends
     * with {@code null} when the calling thread is interrupted.
     *
     * @param httpGet   prepared request
     * @param predicate returns {@code true} for bodies that must be rejected
     * @return the accepted body, or {@code null}
     */
    private String get(HttpGet httpGet, Predicate<String> predicate) {
        while (!Thread.currentThread().isInterrupted()) {
            // try-with-resources hands the connection back to the pool after every attempt.
            try (CloseableHttpResponse response = client.execute(httpGet)) {
                String responseContent = getResponseContent(response.getEntity());
                if (!predicate.test(responseContent)) {
                    return responseContent;
                }
                if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                    // The server answered normally but the page is unusable: give up.
                    return null;
                }
            } catch (Exception e) {
                // Transport or decoding problem: fall through to the retry pause.
            }
            sleep(RETRY_DELAY_MILLIS);
        }
        return null;
    }

    /**
     * Reads the response body as text.
     *
     * <p>The charset comes from the Content-Type header; when that is absent the body
     * is decoded with the platform default charset and searched for a charset
     * declaration in its {@code <meta>} tags.
     *
     * @param httpEntity response entity; may be {@code null}
     * @return the decoded body, or {@code null} when there is no entity
     * @throws IOException when the body cannot be read
     */
    public static String getResponseContent(final HttpEntity httpEntity) throws IOException {
        if (httpEntity == null) {
            return null;
        }
        byte[] body;
        try (InputStream in = openContent(httpEntity)) {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            IOUtils.copy(in, baos);
            body = baos.toByteArray();
        }
        Charset charset = null;
        ContentType contentType = ContentType.get(httpEntity);
        if (contentType != null) {
            charset = contentType.getCharset();
        }
        if (charset == null) {
            // No charset in the headers: look for one declared inside the page.
            String content = new String(body, Charset.defaultCharset());
            charset = getHtmlCharset(content);
            if (charset == null) {
                return content;
            }
        }
        return new String(body, charset);
    }

    /** Opens the entity content, decompressing it first when it is flagged as gzip. */
    private static InputStream openContent(HttpEntity httpEntity) throws IOException {
        Header header = httpEntity.getContentEncoding();
        if (null != header && "gzip".equals(header.getValue())) {
            return new GzipDecompressingEntity(httpEntity).getContent();
        }
        return httpEntity.getContent();
    }

    /**
     * Looks for a charset declared in the {@code <meta>} tags of a page.
     *
     * <p>Both {@code <meta content="...; charset=X">} and {@code <meta charset="X">}
     * are recognised. Declarations that do not name a charset supported by this JVM
     * are skipped instead of raising an exception.
     *
     * @param html page text
     * @return the first usable declared charset, or {@code null} when there is none
     */
    public static Charset getHtmlCharset(final String html) {
        if (StringUtils.isEmpty(html)) {
            return null;
        }
        Document document = Jsoup.parse(html);
        for (Element meta : document.select("meta")) {
            String metaContent = meta.attr("content");
            String metaCharset = meta.attr("charset");
            String declared = null;
            int marker = metaContent.indexOf("charset=");
            if (marker >= 0) {
                declared = metaContent.substring(marker + "charset=".length());
            } else if (!StringUtils.isEmpty(metaCharset)) {
                declared = metaCharset;
            }
            Charset charset = lookupCharset(declared);
            if (charset != null) {
                return charset;
            }
        }
        return null;
    }

    /** Resolves a declared charset name; returns {@code null} for blank, illegal or unsupported names. */
    private static Charset lookupCharset(String declared) {
        if (declared == null) {
            return null;
        }
        String name = declared;
        int separator = name.indexOf(';');
        if (separator >= 0) {
            name = name.substring(0, separator);
        }
        name = name.trim();
        if (name.isEmpty()) {
            return null;
        }
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException e) {
            // Covers both illegal and unsupported charset names.
            return null;
        }
    }

    /**
     * Creates the GET request with browser-like headers.
     *
     * @param url    address to request
     * @param cookie value sent as the {@code Cookie} header
     * @return the prepared request
     */
    private HttpGet createHttpGet(String url, String cookie) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(getRequestConfig());
        httpGet.setHeader(HttpHeaders.USER_AGENT,
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
        httpGet.addHeader(HttpHeaders.ACCEPT,
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        // NOTE(review): "br" is advertised here, but getResponseContent only decodes gzip
        // explicitly; confirm the server never answers with Brotli.
        httpGet.addHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate, br");
        httpGet.addHeader(HttpHeaders.ACCEPT_LANGUAGE, "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7");
        httpGet.addHeader(HttpHeaders.CONNECTION, "keep-alive");
        httpGet.addHeader("Cookie", cookie);
        httpGet.addHeader(HttpHeaders.HOST, "d.weibo.com");
        return httpGet;
    }

    /**
     * Builds the per-request configuration: 3 second socket, connect and
     * connection-request timeouts, plus the proxy held in {@code httpHost}.
     *
     * @return the request configuration
     */
    private RequestConfig getRequestConfig() {
        return RequestConfig.custom()
                .setSocketTimeout(3000)
                .setConnectTimeout(3000)
                .setConnectionRequestTimeout(3000)
                .setProxy(httpHost)
                .build();
    }

    /**
     * Pauses the current thread. If the thread is interrupted while waiting, the
     * interrupt flag is restored so that callers can stop their work.
     *
     * @param time pause length in milliseconds
     */
    public void sleep(long time) {
        try {
            Thread.sleep(time);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}
src/main/java/com/zhiweidata/weiboDomain/dao/DomainDao.java
0 → 100644
View file @
158abbbc
/**
* @Title: DomainDao.java
* @Package com.zhiweidata.weiboDomain.dao
* @Description: Data-access contract for the crawled Weibo domain user records.
* @author xuyimeng
* @date 2018年2月23日 下午2:34:36
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
dao
;
import
java.util.List
;
import
com.zhiweidata.weiboDomain.entity.WeiboDomain
;
/**
 * Data-access contract for crawled {@link WeiboDomain} records.
 *
 * @author xuyimeng
 */
public interface DomainDao {

    /**
     * @param uid Weibo user id
     * @return the records whose {@code uid} equals the argument
     */
    List<WeiboDomain> findByUid(String uid);

    /**
     * @param domain domain identifier
     * @return the records whose {@code domain} equals the argument
     */
    List<WeiboDomain> findByDomain(String domain);

    /**
     * @return every stored record
     */
    List<WeiboDomain> findAll();

    /**
     * Stores the given records.
     *
     * @param list records to store
     */
    void insert(List<WeiboDomain> list);

    /** Creates the collection that receives the records of a crawl. */
    void createColl();

    /**
     * @return the name of the collection the other operations work on
     */
    String bestNewCollName();
}
src/main/java/com/zhiweidata/weiboDomain/dao/TagDao.java
0 → 100644
View file @
158abbbc
/**
* @Title: TagDao.java
* @Package com.zhiweidata.weiboDomain.dao
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2018年2月23日 下午5:17:58
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
dao
;
import
java.util.List
;
import
com.zhiweidata.weiboDomain.entity.DomainTag
;
/**
 * Data-access contract for {@link DomainTag} documents.
 *
 * @author xuyimeng
 */
public interface TagDao {

    /**
     * @return every stored tag document
     */
    List<DomainTag> findAll();

    /**
     * Looks tag documents up by their {@code state}.
     *
     * @param state state value used as the filter
     * @return the matching tag documents
     */
    List<DomainTag> findByState(Integer state);

    /**
     * Sets the {@code state} of the tag documents belonging to a domain.
     *
     * @param domain domain whose documents are updated
     * @param state  new state value
     */
    void updateByState(String domain, Integer state);
}
src/main/java/com/zhiweidata/weiboDomain/dao/impl/DomainDaoImpl.java
0 → 100644
View file @
158abbbc
/**
* @Title: DomainDaoImpl.java
* @Package com.zhiweidata.weiboDomain.dao.impl
* @Description: MongoTemplate-based implementation of DomainDao.
* @author xuyimeng
* @date 2018年2月23日 下午2:57:24
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
dao
.
impl
;
import
java.time.LocalDate
;
import
java.time.format.DateTimeFormatter
;
import
java.util.List
;
import
java.util.Set
;
import
javax.annotation.Resource
;
import
org.springframework.data.mongodb.core.MongoTemplate
;
import
org.springframework.data.mongodb.core.query.Criteria
;
import
org.springframework.data.mongodb.core.query.Query
;
import
org.springframework.stereotype.Repository
;
import
com.zhiweidata.weiboDomain.dao.DomainDao
;
import
com.zhiweidata.weiboDomain.entity.WeiboDomain
;
/**
 * {@link DomainDao} implementation on top of {@link MongoTemplate}.
 *
 * <p>Records live in dated collections named {@code weiboDomain<yyyyMMdd>}. Reads and
 * writes always target the collection whose name sorts last among all collections
 * containing "weiboDomain".
 *
 * @author xuyimeng
 */
@Repository
public class DomainDaoImpl implements DomainDao {

    /** Name fragment shared by all record collections. */
    private static final String COLLECTION_PREFIX = "weiboDomain";

    /** Date suffix appended to the prefix when a collection is created. */
    private static final DateTimeFormatter COLLECTION_DATE = DateTimeFormatter.ofPattern("yyyyMMdd");

    @Resource
    MongoTemplate mongoTemplate;

    /**
     * @param uid Weibo user id
     * @return records of the newest collection whose {@code uid} equals the argument
     */
    @Override
    public List<WeiboDomain> findByUid(String uid) {
        String collName = bestNewCollName();
        Query query = new Query().addCriteria(Criteria.where("uid").is(uid));
        return mongoTemplate.find(query, WeiboDomain.class, collName);
    }

    /**
     * @param domain domain identifier
     * @return records of the newest collection whose {@code domain} equals the argument
     */
    @Override
    public List<WeiboDomain> findByDomain(String domain) {
        String collName = bestNewCollName();
        Query query = new Query().addCriteria(Criteria.where("domain").is(domain));
        return mongoTemplate.find(query, WeiboDomain.class, collName);
    }

    /**
     * @return every record of the newest collection
     */
    @Override
    public List<WeiboDomain> findAll() {
        String collName = bestNewCollName();
        return mongoTemplate.findAll(WeiboDomain.class, collName);
    }

    /**
     * Saves the records one by one into the newest collection; a record whose id
     * already exists there is replaced.
     *
     * @param list records to store
     */
    @Override
    public void insert(List<WeiboDomain> list) {
        String collName = bestNewCollName();
        for (WeiboDomain weiboDomain : list) {
            mongoTemplate.save(weiboDomain, collName);
        }
    }

    /**
     * Finds the collection the other operations work on.
     *
     * @return the lexicographically greatest collection name containing "weiboDomain",
     *         or an empty string when there is no such collection
     */
    @Override
    public String bestNewCollName() {
        Set<String> names = mongoTemplate.getCollectionNames();
        String result = "";
        for (String name : names) {
            if (name.contains(COLLECTION_PREFIX) && name.compareTo(result) > 0) {
                result = name;
            }
        }
        return result;
    }

    /**
     * Creates today's collection ({@code weiboDomain<yyyyMMdd>}).
     *
     * <p>Nothing happens when the collection already exists, so the method can be
     * called more than once on the same day.
     */
    @Override
    public void createColl() {
        String collName = COLLECTION_PREFIX + LocalDate.now().format(COLLECTION_DATE);
        if (!mongoTemplate.collectionExists(collName)) {
            mongoTemplate.createCollection(collName);
        }
    }
}
src/main/java/com/zhiweidata/weiboDomain/dao/impl/TagDaoImpl.java
0 → 100644
View file @
158abbbc
/**
* @Title: TagDaoImpl.java
* @Package com.zhiweidata.weiboDomain.dao.impl
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2018年2月23日 下午5:21:35
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
dao
.
impl
;
import
java.util.List
;
import
javax.annotation.Resource
;
import
org.springframework.data.mongodb.core.MongoTemplate
;
import
org.springframework.data.mongodb.core.query.Criteria
;
import
org.springframework.data.mongodb.core.query.Query
;
import
org.springframework.data.mongodb.core.query.Update
;
import
org.springframework.stereotype.Repository
;
import
com.zhiweidata.weiboDomain.dao.TagDao
;
import
com.zhiweidata.weiboDomain.entity.DomainTag
;
/**
 * {@link TagDao} implementation on top of {@link MongoTemplate}; the target
 * collection is the one mapped for {@link DomainTag}.
 *
 * @author xuyimeng
 */
@Repository
public class TagDaoImpl implements TagDao {

    @Resource
    private MongoTemplate mongoTemplate;

    /**
     * @return every stored tag document
     */
    @Override
    public List<DomainTag> findAll() {
        return mongoTemplate.findAll(DomainTag.class);
    }

    /**
     * Sets {@code state} on all documents whose {@code domain} equals the argument.
     *
     * @param domain domain whose documents are updated
     * @param state  new state value
     */
    @Override
    public void updateByState(String domain, Integer state) {
        Criteria sameDomain = Criteria.where("domain").is(domain);
        Update newState = new Update();
        newState.set("state", state);
        mongoTemplate.updateMulti(new Query(sameDomain), newState, DomainTag.class);
    }

    /**
     * Returns the documents whose {@code state} is less than or equal to the argument.
     *
     * @param state upper bound (inclusive) for the state value
     * @return the matching tag documents
     */
    @Override
    public List<DomainTag> findByState(Integer state) {
        Criteria upToState = Criteria.where("state").lte(state);
        return mongoTemplate.find(new Query(upToState), DomainTag.class);
    }
}
src/main/java/com/zhiweidata/weiboDomain/entity/DomainTag.java
0 → 100644
View file @
158abbbc
/**
* @Title: DomainTag.java
* @Package com.zhiweidata.weiboDomain.entity
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2018年2月23日 下午3:23:31
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
entity
;
import
java.util.List
;
import
org.springframework.data.mongodb.core.mapping.Document
;
import
lombok.Data
;
/**
 * Document of the {@code domainTag} collection; accessors are generated by Lombok.
 *
 * @author xuyimeng
 */
@Data
@Document(collection = "domainTag")
public class DomainTag {

    /** Document id. */
    private String _id;

    /** Tags attached to the domain. */
    private List<String> tags;

    /** Domain value; the tag DAO filters on this field when updating the state. */
    private String domain;

    /** Domain identifier. NOTE(review): presumably the path segment used on d.weibo.com; confirm against the caller. */
    private String domainId;

    /** State value read and written by the tag DAO; its meaning is not defined in this class. */
    private Integer state;
}
src/main/java/com/zhiweidata/weiboDomain/entity/WeiboDomain.java
0 → 100644
View file @
158abbbc
/**
* @Title: WeiboDomain.java
* @Package com.zhiweidata.weiboDomain.entity
* @Description: Entity holding one Weibo user parsed from a domain listing page.
* @author xuyimeng
* @date 2018年2月23日 下午2:37:27
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
entity
;
import
org.springframework.data.annotation.Id
;
import
org.springframework.data.mongodb.core.index.Indexed
;
import
lombok.Data
;
/**
 * One Weibo user as listed on a domain page; accessors are generated by Lombok.
 *
 * @author xuyimeng
 */
@Data
public class WeiboDomain {

    /** Document id; the parser sets it to {@code <domain>_<uid>}. */
    @Id
    private String id;

    /** Weibo user id. */
    @Indexed
    private String uid;

    /** Display name. */
    private String name;

    /** Link to the user's profile. */
    private String url;

    /** Gender code; the parser writes "m" or "f". */
    private String gender;

    /** Location text shown on the listing. */
    private String location;

    /** Self-introduction text shown on the listing. */
    private String description;

    /** Domain the user was listed under. */
    @Indexed
    private String domain;

    /** Tag text shown on the listing. */
    private String tag;

    /** Follower count, kept as the text shown on the page. */
    private String followers_count;

    /** Number of accounts the user follows. */
    private Integer friends_count;

    /** Number of posts. */
    private Integer statuses_count;

    /** Whether the member (VIP) icon was present; the parser sets it through {@code setVip}. */
    private boolean isVip;

    /** Time the record was parsed, formatted as {@code yyyy-MM-dd HH:mm:ss}. */
    private String updateTime;
}
\ No newline at end of file
src/main/java/com/zhiweidata/weiboDomain/excel/DBOExp.java
0 → 100644
View file @
158abbbc
package
com
.
zhiweidata
.
weiboDomain
.
excel
;
import
java.io.File
;
import
java.io.FileNotFoundException
;
import
java.io.FileOutputStream
;
import
java.io.IOException
;
import
java.io.OutputStream
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
com.mongodb.DBObject
;
/**
 * Exports MongoDB {@link DBObject} records to an Excel file through
 * {@link SimpeExcelReport}.
 */
public class DBOExp {

    /**
     * Writes the records to one sheet of an Excel file.
     *
     * <p>When the file does not exist yet it is created with the sheet; otherwise the
     * sheet is added to the existing file. The column names are the keys of the first
     * record. Nothing is written (and no file is created) when there are no records.
     *
     * @author 陈炜涛
     * @param listChai  records to export
     * @param fliename  path of the Excel file
     * @param sheetName name of the sheet to write
     */
    public void putRun(List<DBObject> listChai, String fliename, String sheetName) {
        if (listChai == null || listChai.isEmpty()) {
            return;
        }
        SimpeExcelReport simpe = SimpeExcelReport.getInstance();
        File excelFile = new File(fliename);
        List<String> columns = headList(listChai.get(0));
        List<Map<String, Object>> dataList = bodyList(listChai);

        if (excelFile.exists()) {
            simpe.addSheetInExcelWithFile(columns, dataList, excelFile, sheetName);
            return;
        }
        // The stream is only needed for a new file and is closed even if the export fails.
        try (OutputStream osOutputStream = new FileOutputStream(excelFile, true)) {
            simpe.createExcelWithStream(columns, dataList, osOutputStream, sheetName, excelFile);
        } catch (IOException e) {
            // Also covers FileNotFoundException raised when the file cannot be opened.
            e.printStackTrace();
        }
    }

    /**
     * Derives the column names from a record.
     *
     * @param dbo record whose keys become the column names
     * @return the keys of {@code dbo} as a new list
     */
    public static List<String> headList(DBObject dbo) {
        return new ArrayList<String>(dbo.keySet());
    }

    /**
     * Converts the records into the row maps expected by the report writer.
     *
     * <p>Every row contains exactly the keys of the first record; a key missing from a
     * later record is stored with whatever {@code DBObject.get} returns for it.
     *
     * @param lists records to convert
     * @return one map per record, keyed by column name; empty when there are no records
     */
    public List<Map<String, Object>> bodyList(List<DBObject> lists) {
        List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();
        if (lists == null || lists.isEmpty()) {
            return dataList;
        }
        List<String> keys = new ArrayList<String>(lists.get(0).keySet());
        for (DBObject dbo : lists) {
            Map<String, Object> beanMap = new HashMap<String, Object>();
            for (String key : keys) {
                beanMap.put(key, dbo.get(key));
            }
            dataList.add(beanMap);
        }
        return dataList;
    }
}
src/main/java/com/zhiweidata/weiboDomain/excel/SimpeExcelReport.java
0 → 100644
View file @
158abbbc
package
com
.
zhiweidata
.
weiboDomain
.
excel
;
import
java.io.File
;
import
java.io.FileNotFoundException
;
import
java.io.IOException
;
import
java.io.OutputStream
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.jxlzw.report.model.HLink
;
import
jxl.Cell
;
import
jxl.Sheet
;
import
jxl.Workbook
;
import
jxl.format.Border
;
import
jxl.format.BorderLineStyle
;
import
jxl.format.Colour
;
import
jxl.format.UnderlineStyle
;
import
jxl.read.biff.BiffException
;
import
jxl.write.Label
;
import
jxl.write.WritableCellFormat
;
import
jxl.write.WritableFont
;
import
jxl.write.WritableHyperlink
;
import
jxl.write.WritableSheet
;
import
jxl.write.WritableWorkbook
;
import
jxl.write.WriteException
;
import
jxl.write.biff.RowsExceededException
;
import
lombok.extern.slf4j.Slf4j
;
/**
* 简单的 Excel报表
*
* @ClassName: SimpeExcelReport
* @Description: TODO(这里用一句话描述这个类的作用)
* @author Administrator
* @date 2015年11月20日 下午4:52:02
*/
@Slf4j
public
class
SimpeExcelReport
{
private
List
<
Map
<
String
,
Object
>>
bodyList
;
private
List
<
String
>
headList
;
private
WritableCellFormat
format
;
private
WritableCellFormat
formatColor
;
private
WritableCellFormat
headformat
;
private
WritableWorkbook
writeWorkBook
;
private
OutputStream
os
;
private
Workbook
readWordBook
;
private
WritableSheet
sheet
=
null
;
private
List
<
String
>
mergeList
;
/**
 * Creates a report helper; every call returns a new, independent instance.
 *
 * @return a fresh {@code SimpeExcelReport}
 */
public static SimpeExcelReport getInstance() {
    SimpeExcelReport report = new SimpeExcelReport();
    return report;
}
/**
 * Reads an Excel file.
 *
 * <p>For a single sheet the result holds two entries: {@code "head"} (list of the
 * header names) and {@code "body"} (list of rows, each a map from header name to cell
 * content). When no sheet name is given the result maps every sheet name to such a
 * structure.
 *
 * <p>The workbook is released through {@code closeAllObject()} whether or not reading
 * succeeds.
 *
 * @param excelFiel file to read
 * @param sheetName sheet to read; {@code null} or empty to read all sheets
 * @return the sheet data as described above
 */
public Map<String, Object> readExcel(File excelFiel, String sheetName) {
    readWordBook = getExcelFile(excelFiel);
    Map<String, Object> map;
    try {
        if (null != sheetName && !"".equals(sheetName)) {
            map = getExcelBySheet(sheetName);
        } else {
            map = getExcelAllData();
        }
    } finally {
        closeAllObject();
    }
    log.info("文件读取成功");
    return map;
}
/**
 * Builds the cell format used for title cells: centred horizontally and vertically
 * on a light blue background.
 *
 * <p>If one of the settings is rejected the problem is logged and the format is
 * returned with the settings applied so far.
 *
 * @param headFont font of the title cells
 * @return the title cell format
 */
public WritableCellFormat getTitleFormat(WritableFont headFont) {
    WritableCellFormat wcfFormat = new WritableCellFormat(headFont);
    try {
        // Horizontal centring.
        wcfFormat.setAlignment(jxl.format.Alignment.CENTRE);
        // Vertical centring.
        wcfFormat.setVerticalAlignment(jxl.format.VerticalAlignment.CENTRE);
        wcfFormat.setBackground(Colour.LIGHT_BLUE);
    } catch (WriteException e) {
        log.error("Failed to configure the title cell format", e);
    }
    return wcfFormat;
}
/**
 * Collects the data of every sheet in the workbook opened for reading.
 *
 * @return a map from sheet name to that sheet's data map (see {@code getSheetData})
 */
private Map<String, Object> getExcelAllData() {
    Sheet[] allSheets = readWordBook.getSheets();
    Map<String, Object> dataBySheetName = new HashMap<String, Object>();
    for (Sheet current : allSheets) {
        Map<String, Object> sheetData = getSheetData(current);
        dataBySheetName.put(current.getName(), sheetData);
    }
    return dataBySheetName;
}
/**
 * Reads the data of the sheet with the given name from the workbook opened for reading.
 *
 * <p>If no sheet has that name, a warning is logged and a data map with empty {@code "head"}
 * and {@code "body"} lists is returned, rather than failing with a NullPointerException.
 *
 * @param sheetName name of the sheet to read
 * @return the sheet's data map with keys {@code "head"} and {@code "body"}
 */
private Map<String, Object> getExcelBySheet(String sheetName) {
    Sheet target = readWordBook.getSheet(sheetName);
    if (target == null) {
        log.warn("未找到名为【{}】的sheet", sheetName);
        Map<String, Object> empty = new HashMap<String, Object>();
        empty.put("head", new ArrayList<String>());
        empty.put("body", new ArrayList<Map<String, Object>>());
        return empty;
    }
    return getSheetData(target);
}
/**
 * Converts one sheet into a data map.
 *
 * <p>Row 0 supplies the header names. Every following row becomes a map from header name to
 * the cell's text; a row that has fewer cells than the header gets {@code null} for the
 * missing columns. A sheet without rows yields empty head and body lists.
 *
 * @param sheet the sheet to read
 * @return a map with {@code "head"} (List of header strings) and {@code "body"} (List of row maps)
 */
private Map<String, Object> getSheetData(Sheet sheet) {
    List<String> headList = new ArrayList<String>();
    List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
    int rowCount = sheet.getRows();

    // Header row.
    if (rowCount > 0) {
        for (Cell headerCell : sheet.getRow(0)) {
            headList.add(headerCell.getContents());
        }
    }

    // Data rows.
    for (int r = 1; r < rowCount; r++) {
        Cell[] rowCells = sheet.getRow(r);
        Map<String, Object> bodyData = new HashMap<String, Object>();
        for (int c = 0; c < headList.size(); c++) {
            // Explicit bounds check instead of catching the out-of-range exception.
            Object value = null;
            if (c < rowCells.length && rowCells[c] != null) {
                value = rowCells[c].getContents();
            }
            bodyData.put(headList.get(c), value);
        }
        bodyList.add(bodyData);
    }

    Map<String, Object> sheetMap = new HashMap<String, Object>();
    sheetMap.put("head", headList);
    sheetMap.put("body", bodyList);
    return sheetMap;
}
/**
 * Creates a new workbook on the given output stream and fills it with the given data.
 *
 * <p>The sheet is named {@code sheetName}, or {@code "Sheet"} when that is null or empty.
 * The stream is closed when the method finishes. Resources are released and the instance's
 * head/body references are cleared even when building or writing throws.
 *
 * @param headList  header names, in column order
 * @param bodyList  data rows; each map is looked up by header name
 * @param os        stream the workbook is written to; closed by this method
 * @param sheetName sheet name; null or empty means {@code "Sheet"}
 * @param excelFiel not used by this method
 */
public synchronized void createExcelWithStream(List<String> headList, List<Map<String, Object>> bodyList,
        OutputStream os, String sheetName, File excelFiel) {
    this.os = os;
    this.headList = headList;
    this.bodyList = bodyList;
    try {
        getWriteWorkBookWithStream();
        String name = (sheetName == null || sheetName.isEmpty()) ? "Sheet" : sheetName;
        buildSheet(0, name);
        try {
            os.flush();
        } catch (IOException e) {
            log.error("输出流刷新失败", e);
        }
        writeWorkBookWriter();
    } finally {
        // Always release the workbook/stream and drop the data references.
        closeAllObject();
        this.headList = null;
        this.bodyList = null;
    }
    log.info("文件创建成功");
}
/**
 * Returns the header names whose columns are merged when a sheet is built.
 *
 * @return the configured merge list; may be null if none was set
 */
public List<String> getMergeList() {
    return this.mergeList;
}
/**
 * Sets the header names whose columns are merged when a sheet is built.
 *
 * @param mergeList header names to merge on
 */
public void setMergeList(List<String> mergeList) {
    this.mergeList = mergeList;
}
/**
 * Appends a new sheet, filled with the given data, to an existing Excel file.
 *
 * <p>The sheet is named {@code sheetName}, or {@code "Sheet"} when that is null or empty. In
 * both cases the name is passed through {@code existsName} so it is adjusted if the file
 * already has a sheet with that name (previously the default name skipped this check).
 * Resources are released and the instance's head/body references are cleared even when
 * building or writing throws.
 *
 * @param headList  header names, in column order
 * @param bodyList  data rows; each map is looked up by header name
 * @param excelFile the existing Excel file to extend
 * @param sheetName sheet name; null or empty means {@code "Sheet"}
 */
public synchronized void addSheetInExcelWithFile(List<String> headList, List<Map<String, Object>> bodyList,
        File excelFile, String sheetName) {
    this.headList = headList;
    this.bodyList = bodyList;
    try {
        getWorkBookWithFile(excelFile);
        String baseName = (sheetName == null || sheetName.isEmpty()) ? "Sheet" : sheetName;
        buildSheet(getSheetIndex(), existsName(baseName));
        writeWorkBookWriter();
    } finally {
        // Always release the workbooks and drop the data references.
        closeAllObject();
        this.headList = null;
        this.bodyList = null;
    }
    log.info("文件创建成功");
}
/**
 * Builds the sheet(s) for the current head/body data.
 *
 * <p>Fewer than 50000 rows go into a single sheet named {@code sheetName} at position
 * {@code index}. Otherwise the rows are split into chunks of 50000, one sheet per chunk,
 * named {@code sheetName(k)} where {@code k} is the sheet's position. Afterwards the columns
 * configured in the merge list are merged on the most recently created sheet.
 *
 * @param index     position of the first created sheet in the workbook
 * @param sheetName base name of the sheet(s)
 * @return the number of sheets in the workbook after building
 */
public int buildSheet(int index, String sheetName) {
    final int limit = 50000;
    final int size = bodyList.size();
    if (size < limit) {
        // createSheet(name, position in the workbook)
        sheet = writeWorkBook.createSheet(sheetName, index);
        fileInToSheet(sheet, bodyList, headList);
    } else {
        int count = (size + limit - 1) / limit;
        for (int i = 0; i < count; i++) {
            sheet = writeWorkBook.createSheet(sheetName + "(" + (i + index) + ")", i + index);
            int fromIndex = i * limit;
            // subList's end index is exclusive, so the last chunk must end at size
            // (ending at size - 1 silently dropped the final row).
            int toIndex = Math.min(size, fromIndex + limit);
            fileInToSheet(sheet, bodyList.subList(fromIndex, toIndex), headList);
        }
    }
    // NOTE(review): merging runs only on the last sheet created above — confirm that is intended
    // when the data was split across several sheets.
    for (Integer mergeColNum : getmergeColNumS()) {
        mergeCell(mergeColNum);
    }
    return writeWorkBook.getSheetNames().length;
}
/**
 * Resolves the configured merge header names to column indexes.
 *
 * <p>Names that are not present in the current header list are skipped with a warning;
 * previously they produced a null entry that failed when the caller unboxed it.
 *
 * @return the column indexes to merge; empty when no merge list is configured
 */
private List<Integer> getmergeColNumS() {
    List<Integer> columns = new ArrayList<Integer>();
    if (mergeList == null || mergeList.isEmpty()) {
        return columns;
    }
    // Build the header -> index map once rather than once per merge name.
    Map<String, Integer> indexByHeader = getHeadListMap();
    for (String merge : mergeList) {
        Integer column = indexByHeader.get(merge);
        if (column == null) {
            log.warn("合并列【{}】不在表头中,已忽略", merge);
            continue;
        }
        columns.add(column);
    }
    return columns;
}
/**
 * Merges runs of equal values in one column of the current sheet.
 *
 * <p>The runs come from {@code getMergeCellsList}; each entry maps a run's first row to its
 * last row. A write failure is printed and stops the remaining merges for this column.
 *
 * @param colNum zero-based column index
 */
private void mergeCell(int colNum) {
    try {
        for (Map.Entry<Integer, Integer> run : getMergeCellsList(colNum).entrySet()) {
            sheet.mergeCells(colNum, run.getKey(), colNum, run.getValue());
        }
    } catch (WriteException mergeError) {
        // RowsExceededException is a WriteException, so one handler covers both.
        mergeError.printStackTrace();
    }
}
/**
 * Finds runs of vertically adjacent cells with equal text in one column of the current sheet.
 *
 * <p>Scanning starts at row 0. Only runs of at least two rows are reported. The scan never
 * looks past the last row; previously it compared against one row beyond the end, so a run
 * ending in an empty-text cell was extended onto a non-existent row.
 *
 * @param colNum zero-based column index
 * @return a map from a run's first row to its last row (both inclusive)
 */
private Map<Integer, Integer> getMergeCellsList(Integer colNum) {
    Map<Integer, Integer> runs = new HashMap<Integer, Integer>();
    int rows = sheet.getRows();
    int start = 0;
    while (start < rows) {
        String contents = sheet.getCell(colNum, start).getContents();
        int end = start;
        // Extend the run while the next in-range row holds the same text.
        while (end + 1 < rows && contents.equals(sheet.getCell(colNum, end + 1).getContents())) {
            end++;
        }
        if (end > start) {
            runs.put(start, end);
        }
        start = end + 1;
    }
    return runs;
}
/**
 * Creates the writable workbook on the configured output stream and initialises the cell
 * formats. An I/O failure is printed; the workbook field is then left unchanged.
 */
private void getWriteWorkBookWithStream() {
    try {
        // Create the xls workbook on the stream.
        writeWorkBook = Workbook.createWorkbook(os);
        if (null == writeWorkBook) {
            return;
        }
        formatting();
    } catch (IOException ioError) {
        // FileNotFoundException is an IOException, so one handler covers both.
        ioError.printStackTrace();
    }
}
/**
 * Opens the given Excel file for reading and creates a writable workbook on the same file
 * that starts as a copy of it, then initialises the cell formats.
 *
 * <p>On an I/O failure the file is marked for deletion on JVM exit and the stack trace is
 * printed.
 *
 * @param excelFile the existing Excel file to extend
 */
private void getWorkBookWithFile(File excelFile) {
    try {
        readWordBook = getExcelFile(excelFile);
        writeWorkBook = Workbook.createWorkbook(excelFile, readWordBook);
        if (null != writeWorkBook) {
            formatting();
        }
    } catch (IOException ioError) {
        // FileNotFoundException is an IOException, so one handler covers both.
        excelFile.deleteOnExit();
        ioError.printStackTrace();
    }
}
/**
 * Opens an Excel file for reading.
 *
 * @param excelFile the file to open
 * @return the opened workbook, or {@code null} if opening failed (the stack trace is printed)
 */
private Workbook getExcelFile(File excelFile) {
    Workbook opened = null;
    try {
        opened = Workbook.getWorkbook(excelFile);
    } catch (BiffException | IOException readError) {
        readError.printStackTrace();
    }
    return opened;
}
/**
 * Computes the position at which a newly added sheet is inserted.
 *
 * @return the number of sheets in the workbook opened for reading, plus one
 */
private int getSheetIndex() {
    Sheet[] existing = readWordBook.getSheets();
    return existing.length + 1;
}
/**
 * Returns a sheet name that does not collide with any sheet of the workbook opened for
 * reading, by appending the suffix "副本" until the name is unused.
 *
 * <p>The previous single pass over the names could return a name that was still taken,
 * depending on the order of the existing sheets.
 *
 * @param sheetName the desired sheet name
 * @return {@code sheetName} itself if it is free (or no workbook is open), otherwise the
 *         name with one or more suffixes appended
 */
private String existsName(String sheetName) {
    if (readWordBook == null) {
        return sheetName;
    }
    List<String> taken = java.util.Arrays.asList(readWordBook.getSheetNames());
    String candidate = sheetName;
    while (taken.contains(candidate)) {
        candidate += "副本";
    }
    return candidate;
}
/**
 * Fills a sheet: writes the header row first, then one row per data map.
 *
 * @param sheet    the sheet to fill
 * @param list     data rows for this sheet
 * @param headList header names for row 0
 */
private void fileInToSheet(WritableSheet sheet, List<Map<String, Object>> list, List<String> headList) {
    // Row 0: header.
    builderHeader(sheet, headList);
    // Rows 1..n: data.
    parserBean(sheet, list);
}
/**
 * Writes the header row (row 0) using the header format.
 *
 * <p>A failure on one header cell is printed and the remaining cells are still written.
 *
 * @param sheet the sheet to write to
 * @param list  header names, in column order
 */
private void builderHeader(WritableSheet sheet, List<String> list) {
    int column = 0;
    for (String title : list) {
        try {
            sheet.addCell(new Label(column, 0, title, headformat));
        } catch (WriteException cellError) {
            // RowsExceededException is a WriteException, so one handler covers both.
            cellError.printStackTrace();
        }
        column++;
    }
}
/**
 * Writes the data rows: the first map goes to row 1, the next to row 2, and so on
 * (row 0 is the header).
 *
 * @param sheet the sheet to write to
 * @param list  data rows
 */
private void parserBean(WritableSheet sheet, List<Map<String, Object>> list) {
    int row = 1;
    for (Map<String, Object> record : list) {
        builderCell(sheet, record, row);
        row++;
    }
}
/**
 * Writes one data row, taking each column's value from {@code obj} by header name.
 *
 * <ul>
 *   <li>{@code Integer}, {@code Double} and {@code Long} values become numeric cells.</li>
 *   <li>{@code HLink} values become hyperlinks; an {@code HLink} without a URL becomes a text
 *       cell carrying an error marker and the link description.</li>
 *   <li>Everything else is written as text; a missing value is written as an empty string.
 *       Text cells on even rows use the coloured format, others the default format.</li>
 * </ul>
 *
 * <p>A write failure is logged and stops the rest of this row.
 *
 * @param sheet the sheet to write to
 * @param obj   the row's values keyed by header name
 * @param row   zero-based row index to write
 */
private void builderCell(WritableSheet sheet, Map<String, Object> obj, int row) {
    try {
        for (int col = 0; col < headList.size(); col++) {
            String header = headList.get(col);
            Object value = obj.get(header);
            if (value == null) {
                value = "";
            }
            try {
                if (value instanceof Integer || value instanceof Double || value instanceof Long) {
                    // Convert directly instead of round-tripping through a string.
                    double number = ((java.lang.Number) value).doubleValue();
                    sheet.addCell(new jxl.write.Number(col, row, number, format));
                } else if (value instanceof HLink) {
                    HLink hlink = (HLink) value;
                    if (hlink.getUrl() == null) {
                        sheet.addCell(new Label(col, row, "超链接出错:_" + hlink.getDescription(), format));
                    } else {
                        WritableHyperlink link = new WritableHyperlink(col, row, hlink.getUrl());
                        link.setDescription(hlink.getDescription());
                        sheet.addHyperlink(link);
                    }
                } else {
                    WritableCellFormat rowFormat = (row % 2 == 0) ? formatColor : format;
                    sheet.addCell(new Label(col, row, value.toString(), rowFormat));
                }
            } catch (NumberFormatException e) {
                // Per-cell handler kept from the original so one bad value does not abort the row.
                // The value is passed as an argument rather than concatenated into the pattern.
                log.error("第几列:{}\t列名:{}\t数据:{}", col, header, value);
            }
        }
    } catch (WriteException e) {
        // RowsExceededException is a WriteException, so one handler covers both.
        log.error("第【{}】行写入失败", row, e);
    }
}
/**
 * Initialises the three cell formats (default body, coloured body, header) for the workbook
 * currently being written.
 */
private void formatting() {
    final WritableWorkbook target = writeWorkBook;
    format = getCellFormat(target);
    formatColor = getCellSimpleFormat(target);
    headformat = getHeaderFormat(target);
}
/**
 * Closes the writable workbook, the output stream and the read workbook, whichever are set.
 *
 * <p>Each resource is closed independently, so a failure while closing one no longer
 * prevents the others from being closed. Failures are logged and not rethrown.
 */
public void closeAllObject() {
    if (writeWorkBook != null) {
        try {
            writeWorkBook.close();
        } catch (WriteException | IOException e) {
            log.error("关闭写入工作簿失败", e);
        }
    }
    if (os != null) {
        try {
            os.close();
        } catch (IOException e) {
            log.error("关闭输出流失败", e);
        }
    }
    if (readWordBook != null) {
        readWordBook.close();
    }
}
/**
 * Builds the header cell format: bold white 10pt Times font, centred both ways, thin
 * {@code BLUE2} border on all sides, {@code GRAY_50} background.
 *
 * <p>The workbook's {@code GRAY_50} palette entry is redefined to RGB (166, 166, 166). Any
 * exception during configuration is printed and the format is returned as configured so far.
 *
 * @param wb the workbook whose palette is adjusted
 * @return the header cell format
 */
public WritableCellFormat getHeaderFormat(WritableWorkbook wb) {
    final WritableFont boldWhite = new WritableFont(
            WritableFont.TIMES, 10, WritableFont.BOLD, false, UnderlineStyle.NO_UNDERLINE, Colour.WHITE);
    final WritableCellFormat headerStyle = new WritableCellFormat(boldWhite);
    try {
        wb.setColourRGB(Colour.GRAY_50, 166, 166, 166);
        // Horizontal centring.
        headerStyle.setAlignment(jxl.format.Alignment.CENTRE);
        // Vertical centring.
        headerStyle.setVerticalAlignment(jxl.format.VerticalAlignment.CENTRE);
        // Thin border on every side.
        headerStyle.setBorder(Border.ALL, BorderLineStyle.THIN, Colour.BLUE2);
        headerStyle.setBackground(Colour.GRAY_50);
    } catch (Exception configError) {
        configError.printStackTrace();
    }
    return headerStyle;
}
/**
 * Builds the coloured body cell format: 9pt non-bold "微软雅黑" font, centred both ways, thin
 * {@code BLUE2} border on all sides, {@code GRAY_80} background.
 *
 * <p>The workbook's {@code GRAY_80} palette entry is redefined to RGB (242, 242, 242). Any
 * exception during configuration is printed and the format is returned as configured so far.
 *
 * @param wb the workbook whose palette is adjusted
 * @return the coloured body cell format
 */
public WritableCellFormat getCellSimpleFormat(WritableWorkbook wb) {
    final WritableFont bodyFont = new WritableFont(
            WritableFont.createFont("微软雅黑"), 9, WritableFont.NO_BOLD, false);
    final WritableCellFormat shadedStyle = new WritableCellFormat(bodyFont);
    try {
        // Redefine the palette entry used as background.
        wb.setColourRGB(Colour.GRAY_80, 242, 242, 242);
        // Horizontal centring.
        shadedStyle.setAlignment(jxl.format.Alignment.CENTRE);
        // Vertical centring.
        shadedStyle.setVerticalAlignment(jxl.format.VerticalAlignment.CENTRE);
        // Thin border on every side.
        shadedStyle.setBorder(Border.ALL, BorderLineStyle.THIN, Colour.BLUE2);
        shadedStyle.setBackground(Colour.GRAY_80);
    } catch (Exception configError) {
        configError.printStackTrace();
    }
    return shadedStyle;
}
/**
 * Builds the default body cell format: 9pt non-bold "微软雅黑" font, centred both ways, thin
 * {@code BLUE2} border on all sides, no background.
 *
 * <p>The workbook's {@code BLUE2} palette entry is redefined to RGB (0, 176, 240). Any
 * exception during configuration is printed and the format is returned as configured so far.
 *
 * @param wb the workbook whose palette is adjusted
 * @return the default body cell format
 */
public WritableCellFormat getCellFormat(WritableWorkbook wb) {
    final WritableFont bodyFont = new WritableFont(
            WritableFont.createFont("微软雅黑"), 9, WritableFont.NO_BOLD, false);
    final WritableCellFormat plainStyle = new WritableCellFormat(bodyFont);
    try {
        // Redefine the palette entry used for the border.
        wb.setColourRGB(Colour.BLUE2, 0, 176, 240);
        // Horizontal centring.
        plainStyle.setAlignment(jxl.format.Alignment.CENTRE);
        // Vertical centring.
        plainStyle.setVerticalAlignment(jxl.format.VerticalAlignment.CENTRE);
        // Thin border on every side.
        plainStyle.setBorder(Border.ALL, BorderLineStyle.THIN, Colour.BLUE2);
    } catch (Exception configError) {
        configError.printStackTrace();
    }
    return plainStyle;
}
/**
 * Returns the default body cell format.
 *
 * @return the current default format; may be null before a workbook was created
 */
public WritableCellFormat getFormat() {
    return this.format;
}
/**
 * Returns the header cell format.
 *
 * @return the current header format; may be null before a workbook was created
 */
public WritableCellFormat getHeadformat() {
    return this.headformat;
}
/**
 * Replaces the default body cell format.
 *
 * @param format the format to use for number cells and odd-row text cells
 */
public void setFormat(WritableCellFormat format) {
    this.format = format;
}
/**
 * Replaces the coloured body cell format.
 *
 * @param formatColor the format to use for even-row text cells
 */
public void setFormatColor(WritableCellFormat formatColor) {
    this.formatColor = formatColor;
}
/**
 * Replaces the header cell format.
 *
 * @param headformat the format to use for the header row
 */
public void setHeadformat(WritableCellFormat headformat) {
    this.headformat = headformat;
}
/**
 * Writes the workbook contents to its destination.
 *
 * <p>An I/O failure is logged (it was previously swallowed without any trace) and not
 * rethrown, so callers continue with their cleanup.
 */
private void writeWorkBookWriter() {
    try {
        writeWorkBook.write();
    } catch (IOException e) {
        log.error("Excel写入失败", e);
    }
}
/**
 * Maps each header name to its zero-based column index. If a name occurs more than once,
 * the last occurrence wins.
 *
 * @return header name to column index; empty when there is no header list
 */
private Map<String, Integer> getHeadListMap() {
    Map<String, Integer> indexByHeader = new HashMap<String, Integer>();
    if (headList == null || headList.isEmpty()) {
        return indexByHeader;
    }
    for (int column = 0; column < headList.size(); column++) {
        indexByHeader.put(headList.get(column), column);
    }
    return indexByHeader;
}
}
src/main/java/com/zhiweidata/weiboDomain/service/MongoSerivce.java
0 → 100644
View file @
158abbbc
/**
* @Title: Serivce.java
* @Package com.zhiweidata.weiboDomain.service
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2018年2月23日 下午4:55:00
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
service
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
javax.annotation.Resource
;
import
org.springframework.stereotype.Service
;
import
com.zhiweidata.weiboDomain.crawler.JsoupHtml
;
import
com.zhiweidata.weiboDomain.crawler.WeiboDomainCrawler
;
import
com.zhiweidata.weiboDomain.dao.DomainDao
;
import
com.zhiweidata.weiboDomain.dao.TagDao
;
import
com.zhiweidata.weiboDomain.entity.DomainTag
;
import
com.zhiweidata.weiboDomain.entity.WeiboDomain
;
import
lombok.extern.slf4j.Slf4j
;
/**
* @ClassName: Serivce
* @Description: TODO(这里用一句话描述这个类的作用)
* @author xuyimeng
* @date 2018年2月23日 下午4:55:00
*/
@Slf4j
@Service
public
class
MongoSerivce
{
/** Injected tag DAO; used in this class to look up tags (by state / all) and to update a tag's state. */
@Resource
TagDao
tagDao
;
/** Injected domain DAO; used in this class to insert crawled records and to create the collection. */
@Resource
DomainDao
domainDao
;
/** Crawler used to fetch pages and to pause between requests. */
WeiboDomainCrawler
crawler
=
new
WeiboDomainCrawler
();
/** HTML parser used to extract the page count and the records from fetched pages. */
JsoupHtml
jsoupHtml
=
JsoupHtml
.
getInstance
();
/**
 * Crawls every domain returned by {@code groupSet()}, stores the crawled records, and then
 * sets that domain's tag state to 2.
 *
 * @param cookie cookie string sent with every request
 */
public void crawlerData(String cookie) {
    Map<String, String> pendingDomains = groupSet();
    for (Map.Entry<String, String> pending : pendingDomains.entrySet()) {
        String domain = pending.getKey();
        String domainId = pending.getValue();

        log.info("【{}】页开始爬取...............", domain);
        List<WeiboDomain> crawled = parse(domain, domainId, cookie);
        log.info("【{}】页所有数据爬取结束...............", domain);

        domainDao.insert(crawled);
        tagDao.updateByState(domain, 2);
        log.info("【{}】所有页数据存储成功,共计【{}】条数据", domain, crawled.size());
    }
    log.info("所有页面爬取结束,程序结束");
}
/**
 * Fetches the domain's page and parses the page count from it, retrying until the parsed
 * count is non-zero. Each attempt is followed by a 3 second pause.
 *
 * <p>NOTE(review): there is no retry limit, so this never returns if the count stays 0.
 *
 * @param domainId id of the domain to query
 * @param cookie   cookie string sent with the request
 * @return the parsed, non-zero page count
 */
private int getPageNum(String domainId, String cookie) {
    int pageCount;
    do {
        String page = crawler.getPage(domainId, cookie);
        crawler.sleep(3000L);
        pageCount = jsoupHtml.parsePage(page);
    } while (pageCount == 0);
    return pageCount;
}
/**
 * Crawls the list pages of one domain, starting at page 1 and stopping before page 300.
 *
 * <p>When a page cannot be fetched ({@code null} HTML), the same page is retried while the
 * number of pages implied by the records collected so far ({@code size / 10 + 2}) is still
 * below the reported page count; otherwise crawling stops. A retry now waits 3 seconds first;
 * previously it re-requested immediately in a tight loop.
 *
 * @param domain   domain name attached to the parsed records
 * @param domainId id of the domain, used to build the page URL
 * @param cookie   cookie string sent with every request
 * @return all records parsed from the fetched pages
 */
private List<WeiboDomain> parse(String domain, String domainId, String cookie) {
    List<WeiboDomain> result = new ArrayList<>();
    int num = getPageNum(domainId, cookie);
    int i = 1;
    while (i < 300) {
        String url = "https://d.weibo.com/" + domainId + "?pids=Pl_Core_F4RightUserList__4" + "&page=" + i
                + "&ajaxpagelet=1&__ref=/" + domainId;
        String html = crawler.getHtml(url, cookie);
        if (html == null) {
            if ((result.size() / 10) + 2 < num) {
                // Back off before retrying the same page instead of spinning.
                crawler.sleep(3000L);
                continue;
            }
            break;
        }
        List<WeiboDomain> list = jsoupHtml.parseData(html, domain);
        result.addAll(list);
        log.info("【{}】:第【{}】页爬取成功", domain, i);
        i++;
        crawler.sleep(3000L);
    }
    return result;
}
/**
 * Loads the tags whose state is 1, collects them as domain to domain-id pairs, and then sets
 * the state of each collected domain to 0.
 *
 * @return map from domain to domain id for the tags that were in state 1
 */
private Map<String, String> groupSet() {
    Map<String, String> idByDomain = new HashMap<>();
    for (DomainTag tag : tagDao.findByState(1)) {
        String domain = tag.getDomain();
        String domainId = tag.getDomainId();
        idByDomain.put(domain, domainId);
    }
    // Second pass: update the state once per distinct domain.
    for (String domain : idByDomain.keySet()) {
        tagDao.updateByState(domain, 0);
    }
    return idByDomain;
}
/**
 * Resets every tag's state to 0 and then asks the domain DAO to create its collection.
 * Intended to be called before starting a fresh crawl.
 */
public void initTag() {
    Map<String, String> idByDomain = new HashMap<>();
    for (DomainTag tag : tagDao.findAll()) {
        String domain = tag.getDomain();
        String domainId = tag.getDomainId();
        idByDomain.put(domain, domainId);
    }
    // Update the state once per distinct domain.
    for (String domain : idByDomain.keySet()) {
        tagDao.updateByState(domain, 0);
    }
    domainDao.createColl();
}
}
src/main/java/com/zhiweidata/weiboDomain/start/Start.java
0 → 100644
View file @
158abbbc
/**
* @Title: Main.java
* @Package com.zhiweidata.weiboDomain.start
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2018年2月23日 下午3:09:33
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
weiboDomain
.
start
;
import
org.springframework.context.ApplicationContext
;
import
org.springframework.context.support.ClassPathXmlApplicationContext
;
import
com.zhiweidata.weiboDomain.service.MongoSerivce
;
/**
 * Program entry point: loads the Spring context from {@code spring-context.xml} and starts
 * the crawl through {@link MongoSerivce}.
 *
 * @author xuyimeng
 * @date 2018-02-23 15:09:33
 */
public class Start {

    private static ApplicationContext ctx = new ClassPathXmlApplicationContext("spring-context.xml");

    private static MongoSerivce serice = ctx.getBean(MongoSerivce.class);

    /**
     * Cookie used when none is supplied on the command line.
     * NOTE(review): this is a session credential committed to source control; prefer passing
     * the cookie as the first program argument and removing this default.
     */
    private static final String DEFAULT_COOKIE = "login_sid_t=2da8770fb84cdb5be026bbfcd76ef1e6; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=873655794108.0503.1519525903336; SINAGLOBAL=873655794108.0503.1519525903336; ULV=1519525903344:1:1:1:873655794108.0503.1519525903336:; SSOLoginState=1519525975; SCF=AqU8lfV6ROhTkYEEmVi2ROhtdMxlB0mT3EF2ABKenC3OfC3SeK3YfvZYWFJY8ytsaFhYcc1vO5hvhLwolzBW5ps.; SUB=_2A253llAIDeRhGeNH6VoY9C7Mzz-IHXVU4sbArDV8PUNbmtBeLUnSkW9NStghaGFgK4WPoq15L2ikM_srwT7hNvkI; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5eochNrdf3XKPD1VaPcG3T5JpX5K2hUgL.Fo-4eon4Sh57She2dJLoIEQLxK-LBKBLBo2LxKBLBo.L12zLxK-L1-BLBKqLxKML1hBLBoqEeh2ceh-t; SUHB=0mxUFkR8aaPo5m; ALF=1551061975; un=18395807152; wvr=6; YF-Page-G0=416186e6974c7d5349e42861f3303251";

    /**
     * Starts the crawl.
     *
     * @param args optional; when the first argument is non-blank it is used as the cookie,
     *             otherwise the built-in default cookie is used
     */
    public static void main(String[] args) {
        String cookie = DEFAULT_COOKIE;
        if (args != null && args.length > 0 && args[0] != null && !args[0].trim().isEmpty()) {
            cookie = args[0];
        }
        // Resets the program state; call it when starting a fresh crawl.
        // Keep it commented out when resuming an interrupted crawl.
        // serice.initTag();
        serice.crawlerData(cookie);
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment