Commit b00e3d2d by zhiwei

修改数据库读取位置

parent 223c421c
......@@ -4,7 +4,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>soubao-crawlerNew</artifactId>
<version>0.0.1-SNAPSHOT</version>
<version>0.0.2-SNAPSHOT</version>
<name>搜报网采集</name>
<properties>
......
......@@ -32,7 +32,6 @@ private static Logger logger = LoggerFactory.getLogger(WordsDao.class);
public BlockingQueue<String> getAllWordList(){
try {
BlockingQueue<String> list = new LinkedBlockingQueue<String>();
DBObject query = new BasicDBObject();
DBCursor cur = this.getReadColl().find();
while(cur.hasNext()){
DBObject doc = cur.next();
......
......@@ -42,7 +42,7 @@ public class SoubaoCrawlerRun implements Runnable{
//其他组数据采集关键词
BlockingQueue<String> otherWordQueue = wordsDao.getWordList("-美赞臣");
wordesQueue.addAll(otherWordQueue);
logger.info("关键词总量为:::{}", wordesQueue.size());
SouBaoCrawlerThread[] souBaoCrawlerThread = new SouBaoCrawlerThread[thread];
ExecutorService service = Executors.newFixedThreadPool(2);
for (int i = 0; i < thread; i++) {
......
......@@ -6,7 +6,6 @@
*/
package com.zhiwei.crawler.soubao;
import java.io.IOException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
......@@ -112,7 +111,7 @@ public class Crawler {
}
}
count ++;
logger.info("关键词 {} 翻页页数: {} 访问成功", keyword, i);
logger.info("关键词 {} 翻页页数: {} 访问成功, 页面长度:{}", keyword, i, body.length());
// 解析翻页
parse(client, request.headers(), html);
TimeUnit.SECONDS.sleep(2);
......@@ -133,6 +132,7 @@ public class Crawler {
private static void parse(OkHttpClient client, Headers headers, Document html) {
try {
Elements elements = html.select("ul.newList").select("li");
logger.info("数据大小:::{}", elements.size());
for (Element element : elements) {
try {
String link = "http://www.soubao.net" + element.select("h2").select("a").attr("href");
......@@ -182,4 +182,16 @@ public class Crawler {
}
return realUrl;
}
/**
 * Manual entry point for a one-off local run: invokes
 * {@code start(1, "京东", null)} and reports any failure through the class logger.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    try {
        start(1, "京东", null);
    } catch (Exception e) {
        // Log with the exception attached rather than printStackTrace(), so the
        // failure lands in the same log output as the rest of the crawl messages.
        logger.error("Crawler manual run failed", e);
    }
}
}
......@@ -33,7 +33,7 @@ public class TreatData {
public static void treatDataAccount(Map<String,Object> dataMap) {
if (!(dataMap.get("_id") == null || dataMap.get("_id").equals("") || dataMap.get("source") == null
|| dataMap.get("source").equals("")) || dataMap.get("time")!=null) {
Date now = new Date(new Date().getTime() - 24 * 60 * 60 * 1000);
Date now = new Date(new Date().getTime() - 49 * 60 * 60 * 1000); //避免时间为:2018-06-22 00:00:00 时间格式的数据丢掉
Date date = TimeParse.stringFormartDate(dataMap.get("time").toString());
if (date.after(now)) {
logger.info("去重的链接为:{}", dataMap.get("_id"));
......
#####################生产环境#################################
mongoIp=192.168.0.101
mongoPort=27017
db.username=zzwno
db.paasword=zzwno1q2w3e4r
mongoIp=192.168.0.108
mongoPort=30000
#db.username=zzwno
#db.paasword=zzwno1q2w3e4r
db.username=rsync
db.paasword=rsync1q2w3e4r
db.certifiedDB=admin
##save data dbInfo
savedbName=mediaspider
......
registry=zookeeper://192.168.0.203:2181
group=hangzhou
minCount=20
maxCount=40
########################################################
#registry=zookeeper://192.168.0.36:2181
#group=testGroup
#minCount=10
#maxCount=20
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment