Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
soubao_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
soubao_crawler
Commits
b00e3d2d
Commit
b00e3d2d
authored
Jul 05, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改数据库读取位置
parent
223c421c
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
23 additions
and
20 deletions
+23
-20
pom.xml
+1
-1
src/main/java/com/zhiwei/crawler/dao/WordsDao.java
+0
-1
src/main/java/com/zhiwei/crawler/run/SoubaoCrawlerRun.java
+1
-1
src/main/java/com/zhiwei/crawler/soubao/Crawler.java
+14
-2
src/main/java/com/zhiwei/crawler/util/TreatData.java
+1
-1
src/main/resources/db.properties
+6
-4
src/main/resources/proxyip.properties
+0
-10
No files found.
pom.xml
View file @
b00e3d2d
...
...
@@ -4,7 +4,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
soubao-crawlerNew
</artifactId>
<version>
0.0.
1
-SNAPSHOT
</version>
<version>
0.0.
2
-SNAPSHOT
</version>
<name>
搜报网采集
</name>
<properties>
...
...
src/main/java/com/zhiwei/crawler/dao/WordsDao.java
View file @
b00e3d2d
...
...
@@ -32,7 +32,6 @@ private static Logger logger = LoggerFactory.getLogger(WordsDao.class);
public
BlockingQueue
<
String
>
getAllWordList
(){
try
{
BlockingQueue
<
String
>
list
=
new
LinkedBlockingQueue
<
String
>();
DBObject
query
=
new
BasicDBObject
();
DBCursor
cur
=
this
.
getReadColl
().
find
();
while
(
cur
.
hasNext
()){
DBObject
doc
=
cur
.
next
();
...
...
src/main/java/com/zhiwei/crawler/run/SoubaoCrawlerRun.java
View file @
b00e3d2d
...
...
@@ -42,7 +42,7 @@ public class SoubaoCrawlerRun implements Runnable{
//其他组数据采集关键词
BlockingQueue
<
String
>
otherWordQueue
=
wordsDao
.
getWordList
(
"-美赞臣"
);
wordesQueue
.
addAll
(
otherWordQueue
);
logger
.
info
(
"关键词总量为:::{}"
,
wordesQueue
.
size
());
SouBaoCrawlerThread
[]
souBaoCrawlerThread
=
new
SouBaoCrawlerThread
[
thread
];
ExecutorService
service
=
Executors
.
newFixedThreadPool
(
2
);
for
(
int
i
=
0
;
i
<
thread
;
i
++)
{
...
...
src/main/java/com/zhiwei/crawler/soubao/Crawler.java
View file @
b00e3d2d
...
...
@@ -6,7 +6,6 @@
*/
package
com
.
zhiwei
.
crawler
.
soubao
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.text.SimpleDateFormat
;
...
...
@@ -112,7 +111,7 @@ public class Crawler {
}
}
count
++;
logger
.
info
(
"关键词 {} 翻页页数: {} 访问成功
"
,
keyword
,
i
);
logger
.
info
(
"关键词 {} 翻页页数: {} 访问成功
, 页面长度:{}"
,
keyword
,
i
,
body
.
length
()
);
// 解析翻页
parse
(
client
,
request
.
headers
(),
html
);
TimeUnit
.
SECONDS
.
sleep
(
2
);
...
...
@@ -133,6 +132,7 @@ public class Crawler {
private
static
void
parse
(
OkHttpClient
client
,
Headers
headers
,
Document
html
)
{
try
{
Elements
elements
=
html
.
select
(
"ul.newList"
).
select
(
"li"
);
logger
.
info
(
"数据大小:::{}"
,
elements
.
size
());
for
(
Element
element
:
elements
)
{
try
{
String
link
=
"http://www.soubao.net"
+
element
.
select
(
"h2"
).
select
(
"a"
).
attr
(
"href"
);
...
...
@@ -182,4 +182,16 @@ public class Crawler {
}
return
realUrl
;
}
public
static
void
main
(
String
[]
args
)
{
try
{
start
(
1
,
"京东"
,
null
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
}
}
src/main/java/com/zhiwei/crawler/util/TreatData.java
View file @
b00e3d2d
...
...
@@ -33,7 +33,7 @@ public class TreatData {
public
static
void
treatDataAccount
(
Map
<
String
,
Object
>
dataMap
)
{
if
(!(
dataMap
.
get
(
"_id"
)
==
null
||
dataMap
.
get
(
"_id"
).
equals
(
""
)
||
dataMap
.
get
(
"source"
)
==
null
||
dataMap
.
get
(
"source"
).
equals
(
""
))
||
dataMap
.
get
(
"time"
)!=
null
)
{
Date
now
=
new
Date
(
new
Date
().
getTime
()
-
24
*
60
*
60
*
1000
);
Date
now
=
new
Date
(
new
Date
().
getTime
()
-
49
*
60
*
60
*
1000
);
//避免时间为:2018-06-22 00:00:00 时间格式的数据丢掉
Date
date
=
TimeParse
.
stringFormartDate
(
dataMap
.
get
(
"time"
).
toString
());
if
(
date
.
after
(
now
))
{
logger
.
info
(
"去重的链接为:{}"
,
dataMap
.
get
(
"_id"
));
...
...
src/main/resources/db.properties
View file @
b00e3d2d
#####################生产环境#################################
mongoIp
=
192.168.0.101
mongoPort
=
27017
db.username
=
zzwno
db.paasword
=
zzwno1q2w3e4r
mongoIp
=
192.168.0.108
mongoPort
=
30000
#db.username=zzwno
#db.paasword=zzwno1q2w3e4r
db.username
=
rsync
db.paasword
=
rsync1q2w3e4r
db.certifiedDB
=
admin
##save data dbInfo
savedbName
=
mediaspider
...
...
src/main/resources/proxyip.properties
deleted
100644 → 0
View file @
223c421c
registry
=
zookeeper://192.168.0.203:2181
group
=
hangzhou
minCount
=
20
maxCount
=
40
########################################################
#registry=zookeeper://192.168.0.36:2181
#group=testGroup
#minCount=10
#
maxCount
=
20
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment