Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
W
weiboDomain
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
xuyimeng
weiboDomain
Commits
8fbf7592
Commit
8fbf7592
authored
Jan 10, 2019
by
chenweitao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
检查性维护
parent
306c37e1
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
231 additions
and
10 deletions
+231
-10
src/main/java/com/zhiweidata/weiboDomain/service/MongoSerivce.java
+9
-5
src/main/java/com/zhiweidata/weiboDomain/start/Start.java
+1
-1
src/main/java/com/zhiweidata/weiboDomain/utils/TreatVtype.java
+119
-0
src/main/resources/mongo.properties
+9
-1
src/main/resources/mongoContext.xml
+1
-1
src/test/java/com/zhiweidata/weiboDomain/crawler/CrawlerTest.java
+46
-0
src/test/java/com/zhiweidata/weiboDomain/dao/DomainDaoTest.java
+46
-2
No files found.
src/main/java/com/zhiweidata/weiboDomain/service/MongoSerivce.java
View file @
8fbf7592
...
@@ -65,16 +65,20 @@ public class MongoSerivce {
...
@@ -65,16 +65,20 @@ public class MongoSerivce {
private
int
getPageNum
(
String
domainId
,
String
cookie
)
{
private
int
getPageNum
(
String
domainId
,
String
cookie
)
{
int
index
=
0
;
int
index
=
0
;
while
(
true
)
{
while
(
true
)
{
crawler
.
sleep
(
3000L
);
if
(
index
>
10
)
{
log
.
error
(
"【{}】未获取到页码"
,
domainId
);
return
0
;
}
String
page
=
crawler
.
getPage
(
domainId
,
cookie
);
String
page
=
crawler
.
getPage
(
domainId
,
cookie
);
if
(
page
==
null
||
""
.
equals
(
page
))
{
log
.
info
(
"获取页码失败,tag【{}】,重试【{}】"
,
domainId
,
++
index
);
continue
;
}
int
num
=
jsoupHtml
.
parsePage
(
page
);
int
num
=
jsoupHtml
.
parsePage
(
page
);
if
(
num
!=
0
)
{
if
(
num
!=
0
)
{
return
num
;
return
num
;
}
}
if
(++
index
>
10
)
{
crawler
.
sleep
(
5000
);
log
.
error
(
"【{}】未获取到页码"
);
return
0
;
}
}
}
}
}
...
...
src/main/java/com/zhiweidata/weiboDomain/start/Start.java
View file @
8fbf7592
...
@@ -31,7 +31,7 @@ public class Start {
...
@@ -31,7 +31,7 @@ public class Start {
System
.
out
.
println
(
"微博热门榜单采集开始..."
);
System
.
out
.
println
(
"微博热门榜单采集开始..."
);
//程序主体切换至com.zhiweidata.weiboDomain.quartz定时器
//程序主体切换至com.zhiweidata.weiboDomain.quartz定时器
//
String cookie = "YF-Page-G0=b98b45d9bba85e843a07e69c0880151a; SUB=_2AkMt9Wyxf8NxqwJRmP0RzWPrbI90wg3EieKbqZ1qJRMxHRl-yj83qhwMtRB6BnVCXs0VTavYQWuC4hgT0djGew9Twhmm; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_";
String
cookie
=
"YF-Page-G0=b98b45d9bba85e843a07e69c0880151a; SUB=_2AkMt9Wyxf8NxqwJRmP0RzWPrbI90wg3EieKbqZ1qJRMxHRl-yj83qhwMtRB6BnVCXs0VTavYQWuC4hgT0djGew9Twhmm; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_"
;
// 初始化程序状态,在再次爬取时调用
// 初始化程序状态,在再次爬取时调用
// 断点续传时,注释掉
// 断点续传时,注释掉
// serice.initTag();
// serice.initTag();
...
...
src/main/java/com/zhiweidata/weiboDomain/utils/TreatVtype.java
0 → 100644
View file @
8fbf7592
package
com
.
zhiweidata
.
weiboDomain
.
utils
;
/**
 * Static helpers that translate raw Weibo codes (verification type, gender,
 * original/repost flag) into their Chinese display labels.
 *
 * <p>All methods are stateless. Unrecognised codes never throw; see each
 * method for the fallback value it returns.
 *
 * @author 志伟
 */
public class TreatVtype {

    /**
     * Maps a numeric Weibo verification type to its display label.
     *
     * @param vtype numeric verification type code
     * @return the label for {@code vtype}, or an empty string when the code
     *         is not one of the known values
     */
    public static String changeVtype(int vtype) {
        switch (vtype) {
            case -2:
                return "未知";
            case -1:
                return "普通用户";
            case 0:
                return "名人";
            case 1:
                return "政府";
            case 2:
                return "企业";
            case 3:
                return "媒体";
            case 4:
                return "校园";
            case 5:
                return "网站";
            case 6:
                return "应用";
            case 7:
                return "团体";
            case 10:
                return "微博女郎";
            // Both codes share the same label.
            case 200:
            case 220:
                return "达人";
            default:
                return "";
        }
    }

    /**
     * Maps a single-letter Weibo verification type to its display label.
     *
     * @param vtype verification type letter ({@code "n"}, {@code "b"},
     *              {@code "y"} or {@code "d"}); may be {@code null}
     * @return the label for {@code vtype}, or an empty string when the value
     *         is {@code null} or not recognised
     */
    public static String changeVtype(String vtype) {
        // A switch on a null String would throw, so treat null as "unrecognised".
        if (vtype == null) {
            return "";
        }
        switch (vtype) {
            case "n":
                return "普通用户";
            case "b":
                return "蓝V";
            case "y":
                return "黄V";
            case "d":
                return "达人";
            default:
                return "";
        }
    }

    /**
     * Maps a Weibo gender code to its display label.
     *
     * @param sex gender code ({@code "f"}, {@code "m"} or {@code "n"});
     *            may be {@code null}
     * @return the label for {@code sex}; any other value, including
     *         {@code null}, is returned unchanged
     */
    public static String changeSex(String sex) {
        // Literal-first equals keeps a null input from throwing; it simply
        // falls through and is returned as-is like any other unknown value.
        if ("f".equals(sex)) {
            return "女";
        }
        if ("m".equals(sex)) {
            return "男";
        }
        if ("n".equals(sex)) {
            return "未知";
        }
        return sex;
    }

    /**
     * Maps the original/repost flag of a status to its display label.
     *
     * @param isforword {@code 0} for an original post, {@code 1} for a repost
     * @return the label for {@code isforword}, or an empty string for any
     *         other value
     */
    public static String changeIsForword(int isforword) {
        if (isforword == 0) {
            return "原创";
        }
        if (isforword == 1) {
            return "转发";
        }
        return "";
    }
}
src/main/resources/mongo.properties
View file @
8fbf7592
#mongo.serverMongoIp=127.0.0.1
#mongo.serverMongoIp=127.0.0.1
mongo.serverMongoIp
=
115.236.59.91
mongo.serverMongoIp
=
202.107.192.94
mongo.dbName
=
weiboDomain
mongo.dbName
=
weiboDomain
#mongo.ef.username=rsync
#mongo.ef.pwd=rsync1q2w3e4r
#mongo.ef.defaultDB=admin
mongo.ef.username
=
cwtno
mongo.ef.pwd
=
cwtno1q2w3e4r
mongo.ef.defaultDB
=
admin
src/main/resources/mongoContext.xml
View file @
8fbf7592
...
@@ -40,7 +40,7 @@
...
@@ -40,7 +40,7 @@
<property
name=
"typeMapper"
ref=
"defaultMongoTypeMapper"
/>
<property
name=
"typeMapper"
ref=
"defaultMongoTypeMapper"
/>
</bean>
</bean>
<!-- 配置数据库相关配置 -->
<!-- 配置数据库相关配置 -->
<mongo:mongo-client
id=
"Mongo"
host=
"${mongo.serverMongoIp}"
port=
"
27017
"
/>
<mongo:mongo-client
id=
"Mongo"
host=
"${mongo.serverMongoIp}"
port=
"
30000"
credentials=
"${mongo.ef.username}:${mongo.ef.pwd}@${mongo.ef.defaultDB}
"
/>
<mongo:db-factory
id=
"Factory"
dbname=
"${mongo.dbName}"
<mongo:db-factory
id=
"Factory"
dbname=
"${mongo.dbName}"
mongo-ref=
"Mongo"
/>
mongo-ref=
"Mongo"
/>
<mongo:template
id=
"template"
converter-ref=
"mappingMongoConverter"
<mongo:template
id=
"template"
converter-ref=
"mappingMongoConverter"
...
...
src/test/java/com/zhiweidata/weiboDomain/crawler/CrawlerTest.java
0 → 100644
View file @
8fbf7592
package
com
.
zhiweidata
.
weiboDomain
.
crawler
;
import
javax.annotation.Resource
;
import
org.junit.Test
;
import
com.zhiweidata.weiboDomain.ObjectTest
;
import
com.zhiweidata.weiboDomain.dao.DomainDao
;
import
com.zhiweidata.weiboDomain.dao.TagDao
;
/**
 * Manual checks for the Weibo domain crawler, run against the tags stored
 * through {@link TagDao}.
 */
public class CrawlerTest extends ObjectTest {

    @Resource
    DomainDao domainDao;

    @Resource
    TagDao tagDao;

    /**
     * For every stored tag, asks the crawler for that tag's page and prints
     * either the value parsed from it or a failure line.
     *
     * <p>The URL built here is only printed for reference; the crawler is
     * given the domain id and cookie, not this URL.
     */
    @Test
    public void test2() {
        WeiboDomainCrawler crawler = new WeiboDomainCrawler();
        JsoupHtml parser = JsoupHtml.getInstance();
        // NOTE(review): this is a hard-coded login session cookie committed to
        // the repository; consider loading it from configuration instead.
        String cookie = "SINAGLOBAL=1523184237048.3672.1521084294046; UOR=,,www.baidu.com; login_sid_t=e1701f6447d260c7bdb34906162c80e8; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=6087081085858.952.1545809452672; ULV=1545809452676:23:3:1:6087081085858.952.1545809452672:1544841771587; YF-Page-G0=1ffbef18656bf02c17e45a764e3d5336; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhJTOikRJv51mSslx-nU-WA5JpX5K2hUgL.Foe0eK571h-f1K-2dJLoIEXLxKBLB.eL1-2LxK.LBKeL1KnLxKML1-2L1hBLxKqL1h.LB-zLxKqL122L122t; SSOLoginState=1547099803; ALF=1578635821; SCF=AuCAEA8HE7llKecGgDC9mXez57Y8TrXZB-bbR3xw1Rg_LDOkUG0Qj0SGSDLEvIMQSNKl6aF3mXb0ucWr77cjGH4.; SUB=_2A25xMq7_DeRhGeVN6lIR-CvJwjmIHXVSSYc3rDV8PUNbmtBeLRX2kW9NTHXVKUPFRv4an3QvDUPEjSX37t9jiiWn; SUHB=05i6gOaQs8TYWb; un=15757871020; wvr=6; wb_view_log_3310085595=1920*10801";

        tagDao.findAll().forEach(domainTag -> {
            // The trailing "+ 100" is string concatenation: it appends the
            // digits "100" after the timestamp rather than adding to it.
            String prefix = "https://d.weibo.com/" + domainTag.getDomainId()
                    + "?pids=Pl_Core_F4RightUserList__4" + "&page=" + 1;
            String link = prefix
                    + "&ajaxpagelet=1&__ref=/" + domainTag.getDomainId()
                    + "&_t=FM_" + System.currentTimeMillis() + 100;

            String html = crawler.getPage(domainTag.getDomainId(), cookie);
            if (html == null) {
                System.out.println("失效页码:" + link);
                return;
            }
            System.out.println("页码:" + parser.parsePage(html));
            System.out.println(link);
        });
    }

    /** Smoke test that only prints a marker line. */
    @Test
    public void test() {
        System.out.println("testRunning");
    }
}
src/test/java/com/zhiweidata/weiboDomain/dao/DomainDaoTest.java
View file @
8fbf7592
package
com
.
zhiweidata
.
weiboDomain
.
dao
;
package
com
.
zhiweidata
.
weiboDomain
.
dao
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
javax.annotation.Resource
;
import
javax.annotation.Resource
;
...
@@ -11,9 +13,14 @@ import org.jsoup.nodes.Element;
...
@@ -11,9 +13,14 @@ import org.jsoup.nodes.Element;
import
org.jsoup.select.Elements
;
import
org.jsoup.select.Elements
;
import
org.junit.Test
;
import
org.junit.Test
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiweidata.weiboDomain.ObjectTest
;
import
com.zhiweidata.weiboDomain.ObjectTest
;
import
com.zhiweidata.weiboDomain.crawler.WeiboDomainCrawler
;
import
com.zhiweidata.weiboDomain.crawler.WeiboDomainCrawler
;
import
com.zhiweidata.weiboDomain.entity.DomainTag
;
import
com.zhiweidata.weiboDomain.entity.DomainTag
;
import
com.zhiweidata.weiboDomain.entity.WeiboDomain
;
import
com.zhiweidata.weiboDomain.excel.DBOExp
;
import
com.zhiweidata.weiboDomain.utils.TreatVtype
;
import
net.sf.json.JSONObject
;
import
net.sf.json.JSONObject
;
...
@@ -25,7 +32,7 @@ public class DomainDaoTest extends ObjectTest {
...
@@ -25,7 +32,7 @@ public class DomainDaoTest extends ObjectTest {
@Resource
@Resource
TagDao
tagDao
;
TagDao
tagDao
;
//
@Test
//
@Test
public
void
insertTagTest
()
{
public
void
insertTagTest
()
{
DomainTag
domainTag
=
new
DomainTag
();
DomainTag
domainTag
=
new
DomainTag
();
String
id
=
"24H热门"
;
String
id
=
"24H热门"
;
...
@@ -36,6 +43,43 @@ public class DomainDaoTest extends ObjectTest {
...
@@ -36,6 +43,43 @@ public class DomainDaoTest extends ObjectTest {
tagDao
.
insert
(
domainTag
);
tagDao
.
insert
(
domainTag
);
}
}
@Test
public
void
exTest
()
{
List
<
WeiboDomain
>
doms
;
List
<
DBObject
>
dbos
;
DBObject
dbo
;
String
fatherPath
=
"E:\\出的数据\\微博栏目数据\\"
;
List
<
DomainTag
>
tsgs
=
tagDao
.
findAll
();
for
(
DomainTag
domainTag
:
tsgs
)
{
doms
=
domainDao
.
findByDomain
(
domainTag
.
getDomain
());
dbos
=
new
ArrayList
<>();
for
(
WeiboDomain
weiboDomain
:
doms
)
{
dbo
=
new
BasicDBObject
();
dbo
.
put
(
"栏目"
,
weiboDomain
.
getDomain
());
dbo
.
put
(
"标签"
,
weiboDomain
.
getTag
());
dbo
.
put
(
"uid"
,
weiboDomain
.
getUid
());
dbo
.
put
(
"用户昵称"
,
weiboDomain
.
getName
());
dbo
.
put
(
"性别"
,
TreatVtype
.
changeSex
(
weiboDomain
.
getGender
()));
dbo
.
put
(
"地域"
,
weiboDomain
.
getLocation
());
dbo
.
put
(
"简介"
,
weiboDomain
.
getDescription
());
dbo
.
put
(
"粉丝数"
,
weiboDomain
.
getFollowers_count
());
dbo
.
put
(
"关注数"
,
weiboDomain
.
getFriends_count
());
dbo
.
put
(
"微博数"
,
weiboDomain
.
getStatuses_count
());
dbo
.
put
(
"认证类别"
,
weiboDomain
.
getvType
());
dbos
.
add
(
dbo
);
}
if
(
dbos
.
size
()>
0
)
{
DBOExp
dbExp
=
new
DBOExp
();
dbExp
.
putRun
(
dbos
,
fatherPath
+
domainTag
.
getDomain
()+
"用户.xls"
,
domainTag
.
getDomain
());
}
}
}
/**
/**
*
*
* @Title: insertTagFirstTest
* @Title: insertTagFirstTest
...
@@ -46,7 +90,7 @@ public class DomainDaoTest extends ObjectTest {
...
@@ -46,7 +90,7 @@ public class DomainDaoTest extends ObjectTest {
* 陈炜涛
* 陈炜涛
* @date: 2018年3月13日 下午5:53:34
* @date: 2018年3月13日 下午5:53:34
*/
*/
//
@Test
//
@Test
public
void
insertTagFirstTest
()
{
public
void
insertTagFirstTest
()
{
WeiboDomainCrawler
crawler
=
new
WeiboDomainCrawler
();
WeiboDomainCrawler
crawler
=
new
WeiboDomainCrawler
();
String
url
=
"https://d.weibo.com/1087030002_2975_1003_0#"
;
String
url
=
"https://d.weibo.com/1087030002_2975_1003_0#"
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment