Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
6f72ce80
Commit
6f72ce80
authored
Oct 09, 2019
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修复按月分库失败bug及添加索引
parent
db96247a
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
83 additions
and
46 deletions
+83
-46
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHuatiCrawler.java
+1
-1
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchListDAO.java
+19
-0
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiboTopicDAO.java
+21
-0
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchListTest.java
+29
-32
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
+1
-1
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
+1
-2
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
+1
-2
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboHotSearchRun.java
+1
-1
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboTopicRun.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
+2
-1
src/main/resources/proxyip.properties
+5
-5
No files found.
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHuatiCrawler.java
View file @
6f72ce80
...
...
@@ -108,7 +108,7 @@ public class WeiboHuatiCrawler {
id
=
data
.
getString
(
"page_id"
);
score
=
data
.
getString
(
"score"
);
desc1
=
data
.
getString
(
"desc1"
);
fensi
=
desc1
.
replaceAll
(
"
影响力.*"
,
""
);
fensi
=
desc1
.
replaceAll
(
"
.*影响力|粉丝"
,
""
).
trim
(
);
url
=
data
.
getString
(
"link"
);
WeiboTopic
topic
=
new
WeiboTopic
(
url
,
topicName
,
toprank
,
score
,
fensi
,
type
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchListDAO.java
View file @
6f72ce80
...
...
@@ -28,6 +28,25 @@ public class HotSearchListDAO extends MongoDBTemplate{
String
month
=
time
.
substring
(
5
,
7
);
String
collName
=
Config
.
searchCollName
+
year
+
"_"
+
month
;
super
.
setCollName
(
collName
);
DBObject
countIndexDoc
=
new
BasicDBObject
();
countIndexDoc
.
put
(
"count"
,
-
1
);
DBObject
timeIndexDoc
=
new
BasicDBObject
();
timeIndexDoc
.
put
(
"time"
,
-
1
);
DBObject
rankIndexDoc
=
new
BasicDBObject
();
rankIndexDoc
.
put
(
"rank"
,
-
1
);
DBObject
nameIndexDoc
=
new
BasicDBObject
();
nameIndexDoc
.
put
(
"name"
,
-
1
);
DBObject
typeIndexDoc
=
new
BasicDBObject
();
typeIndexDoc
.
put
(
"type"
,
-
1
);
try
{
super
.
getReadColl
().
createIndex
(
countIndexDoc
,
new
BasicDBObject
(
"name"
,
"count_desc"
));
super
.
getReadColl
().
createIndex
(
timeIndexDoc
,
new
BasicDBObject
(
"name"
,
"time_desc"
));
super
.
getReadColl
().
createIndex
(
rankIndexDoc
,
new
BasicDBObject
(
"name"
,
"rank_desc"
));
super
.
getReadColl
().
createIndex
(
nameIndexDoc
,
new
BasicDBObject
(
"name"
,
"name_desc"
));
super
.
getReadColl
().
createIndex
(
typeIndexDoc
,
new
BasicDBObject
(
"name"
,
"type_desc"
));
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
}
/**
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiboTopicDAO.java
View file @
6f72ce80
...
...
@@ -7,6 +7,7 @@ import java.util.List;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
...
...
@@ -23,6 +24,26 @@ public class WeiboTopicDAO extends MongoDBTemplate{
String
month
=
time
.
substring
(
5
,
7
);
String
collName
=
Config
.
topicCollName
+
year
+
"_"
+
month
;
super
.
setCollName
(
collName
);
DBObject
countIndexDoc
=
new
BasicDBObject
();
countIndexDoc
.
put
(
"score_num"
,
-
1
);
DBObject
timeIndexDoc
=
new
BasicDBObject
();
timeIndexDoc
.
put
(
"time"
,
-
1
);
DBObject
rankIndexDoc
=
new
BasicDBObject
();
rankIndexDoc
.
put
(
"rank"
,
-
1
);
DBObject
nameIndexDoc
=
new
BasicDBObject
();
nameIndexDoc
.
put
(
"name"
,
-
1
);
DBObject
typeIndexDoc
=
new
BasicDBObject
();
typeIndexDoc
.
put
(
"type"
,
-
1
);
try
{
super
.
getReadColl
().
createIndex
(
countIndexDoc
,
new
BasicDBObject
(
"name"
,
"score_desc"
));
super
.
getReadColl
().
createIndex
(
timeIndexDoc
,
new
BasicDBObject
(
"name"
,
"time_desc"
));
super
.
getReadColl
().
createIndex
(
rankIndexDoc
,
new
BasicDBObject
(
"name"
,
"rank_desc"
));
super
.
getReadColl
().
createIndex
(
nameIndexDoc
,
new
BasicDBObject
(
"name"
,
"name_desc"
));
super
.
getReadColl
().
createIndex
(
typeIndexDoc
,
new
BasicDBObject
(
"name"
,
"type_desc"
));
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
}
/**
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearchListTest.java
View file @
6f72ce80
...
...
@@ -28,15 +28,15 @@ public class HotSearchListTest{
ServerAddress
address
=
new
ServerAddress
(
Config
.
mongoIp
,
Config
.
mongoPort
);
Mongo
mongo
=
new
MongoClient
(
address
,
Arrays
.
asList
(
credential
));
DB
db
=
mongo
.
getDB
(
"
NetWork
"
);
DBCollection
coll
=
db
.
getCollection
(
"
weibo_hotsearch2018_10
"
);
DB
db
=
mongo
.
getDB
(
"
hot_search_list
"
);
DBCollection
coll
=
db
.
getCollection
(
"
hot_search_list2019_09
"
);
MongoCredential
credentialNew
=
MongoCredential
.
createCredential
(
"datapush"
,
"admin"
,
"4d8ce5c42073c"
.
toCharArray
());
ServerAddress
addressNew
=
new
ServerAddress
(
Config
.
mongoIp
,
Config
.
mongoPort
);
Mongo
mongoNew
=
new
MongoClient
(
address
,
Arrays
.
asList
(
credentialNew
));
DB
dbNew
=
mongoNew
.
getDB
(
"hot_search_list"
);
Map
<
String
,
String
>
timLine
=
TimeParse
.
getTimeMap
(
"201
8-02-01 00:00:00"
,
"2019-04-30 23:59:59"
,
"MM
"
,
1
);
Map
<
String
,
String
>
timLine
=
TimeParse
.
getTimeMap
(
"201
9-10-02 00:00:00"
,
"2019-10-09 23:59:59"
,
"dd
"
,
1
);
timLine
.
forEach
((
start
,
end
)
->{
...
...
@@ -70,37 +70,34 @@ public class HotSearchListTest{
e
.
printStackTrace
();
}
// DBObject query = new BasicDBObject(new BasicDBObject("time",
// new BasicDBObject("$gte",startDate).append("$lte", endDate)));
// DBCursor cur = coll.find(query);
// System.out.println(query +"======="+ cur.count());
// List<DBObject> dataList = new ArrayList<>();
// int i = 0;
// while(cur.hasNext()) {
// DBObject doc = cur.next();
// DBObject zhihu = new BasicDBObject();
// zhihu.put("_id", doc.get("_id"));
// zhihu.put("name", doc.get("name"));
// zhihu.put("url", doc.get("url"));
// zhihu.put("count", doc.get("count"));
// zhihu.put("hot", doc.get("hot"));
// zhihu.put("day", doc.get("day"));
// zhihu.put("time", doc.get("time"));
// zhihu.put("changeCount", doc.get("changeCount"));
// zhihu.put("rank", doc.get("rank"));
// zhihu.put("type", HotSearchType.微博热搜.name());
//
// collNew.save(zhihu);
// dataList.add(zhihu);
// }
// System.out.println(collName +"数据量大小" +dataList.size());
// cur.close();
DBObject
query
=
new
BasicDBObject
(
new
BasicDBObject
(
"time"
,
new
BasicDBObject
(
"$gte"
,
startDate
).
append
(
"$lte"
,
endDate
)));
System
.
out
.
println
(
query
);
int
i
=
0
;
DBCursor
cur
=
coll
.
find
(
query
).
skip
(
i
);
System
.
out
.
println
(
query
+
"======="
+
cur
.
count
());
List
<
DBObject
>
dataList
=
new
ArrayList
<>();
while
(
cur
.
hasNext
())
{
DBObject
doc
=
cur
.
next
();
try
{
System
.
out
.
println
(
i
+
"===="
);
collNew
.
save
(
doc
);
i
++;
// coll.remove(doc);
}
catch
(
Exception
e2
)
{
e2
.
printStackTrace
();
}
dataList
.
add
(
doc
);
}
System
.
out
.
println
(
collName
+
"数据量大小"
+
dataList
.
size
());
cur
.
close
();
// if(!dataList.isEmpty()) {
// collNew.insert(dataList);
// try {
// collNew.insert(dataList);
// } catch (Exception e) {
// e.printStackTrace();
// }
// }
});
mongo
.
close
();
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
View file @
6f72ce80
...
...
@@ -19,7 +19,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public
class
BaiduHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaiduHotSearchRun
.
class
);
private
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
@Override
public
void
run
()
{
...
...
@@ -38,6 +37,7 @@ public class BaiduHotSearchRun extends Thread{
private
void
getHotList
()
{
logger
.
info
(
"百度风云榜采集开始........"
);
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
List
<
HotSearchList
>
list
=
BaiDuHotSearchCrawler
.
baiduHotSearch
();
logger
.
info
(
"{}, 此轮百度风云榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
saveDataList
=
new
ArrayList
<>();
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
View file @
6f72ce80
...
...
@@ -18,8 +18,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public
class
DouyinHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
DouyinHotSearchRun
.
class
);
private
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
@Override
public
void
run
()
{
...
...
@@ -42,6 +40,7 @@ public class DouyinHotSearchRun extends Thread{
*/
private
void
getHotList
()
{
logger
.
info
(
"抖音热搜榜采集开始........"
);
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
List
<
HotSearchList
>
list
=
DouyinHotSearchCrawler
.
getMobileDouyinHotList
();
logger
.
info
(
"{}, 抖音热搜榜此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<>();
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
View file @
6f72ce80
...
...
@@ -17,8 +17,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public
class
SougoHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SougoHotSearchRun
.
class
);
private
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
@Override
public
void
run
()
{
...
...
@@ -36,6 +34,7 @@ public class SougoHotSearchRun extends Thread {
private
void
getHotList
()
{
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
logger
.
info
(
"搜狗微信采集开始........"
);
List
<
HotSearchList
>
list
=
SougoHotSearchCrawler
.
sougoHotSearch
();
logger
.
info
(
"{}, 此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboHotSearchRun.java
View file @
6f72ce80
...
...
@@ -18,7 +18,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public
class
WeiboHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WeiboHotSearchRun
.
class
);
private
HotSearchListDAO
weiboHotSearchDAO
=
new
HotSearchListDAO
();
@Override
public
void
run
()
{
...
...
@@ -37,6 +36,7 @@ public class WeiboHotSearchRun extends Thread{
private
void
getHotList
()
{
logger
.
info
(
"微博话题采集开始........"
);
HotSearchListDAO
weiboHotSearchDAO
=
new
HotSearchListDAO
();
// List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearch();
List
<
HotSearchList
>
list
=
WeiboHotSearchCrawler
.
weiboHotSearchByPhone
();
logger
.
info
(
"{}, 微博此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboTopicRun.java
View file @
6f72ce80
...
...
@@ -18,7 +18,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public
class
WeiboTopicRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WeiboTopicRun
.
class
);
private
WeiboTopicDAO
weiboTopicDAO
=
new
WeiboTopicDAO
();
@Override
public
void
run
()
{
...
...
@@ -36,11 +35,13 @@ public class WeiboTopicRun extends Thread{
private
void
getTopicList
()
{
WeiboTopicDAO
weiboTopicDAO
=
new
WeiboTopicDAO
();
logger
.
info
(
"微博超话采集开始........"
);
List
<
WeiboTopic
>
list
=
WeiboHuatiCrawler
.
startCrawler
();
logger
.
info
(
"{}, 微博超话此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
WeiboTopic
topic
:
list
){
System
.
out
.
println
(
"topic::::"
+
topic
);
DBObject
doc
=
new
BasicDBObject
();
doc
.
put
(
"_id"
,
topic
.
getId
());
doc
.
put
(
"name"
,
topic
.
getTopicName
());
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
View file @
6f72ce80
...
...
@@ -17,7 +17,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public
class
ZhihuHotSearchRun
extends
Thread
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
ZhihuHotSearchRun
.
class
);
private
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
@Override
public
void
run
()
{
...
...
@@ -36,6 +35,8 @@ public class ZhihuHotSearchRun extends Thread{
private
void
getHotList
()
{
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
logger
.
info
(
"知乎话题采集开始...,当前线程名字:{}"
,
Thread
.
currentThread
().
getName
());
List
<
HotSearchList
>
list
=
ZhihuHotSearchCrawler
.
getZhihuHotList
();
List
<
HotSearchList
>
mobilelist
=
ZhihuHotSearchCrawler
.
getMobileZhihuHotList
();
...
...
src/main/resources/proxyip.properties
View file @
6f72ce80
registry
=
zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182
group
=
hangzhou
#
registry=zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182
#
group=hangzhou
########################################################
#registry=zookeeper://192.168.0.36:2181
#
group
=
local
\ No newline at end of file
registry
=
zookeeper://192.168.0.36:2181
group
=
local
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment