Commit b00e3d2d by zhiwei

修改数据库读取位置

parent 223c421c
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>soubao-crawlerNew</artifactId> <artifactId>soubao-crawlerNew</artifactId>
<version>0.0.1-SNAPSHOT</version> <version>0.0.2-SNAPSHOT</version>
<name>搜报网采集</name> <name>搜报网采集</name>
<properties> <properties>
......
...@@ -32,7 +32,6 @@ private static Logger logger = LoggerFactory.getLogger(WordsDao.class); ...@@ -32,7 +32,6 @@ private static Logger logger = LoggerFactory.getLogger(WordsDao.class);
public BlockingQueue<String> getAllWordList(){ public BlockingQueue<String> getAllWordList(){
try { try {
BlockingQueue<String> list = new LinkedBlockingQueue<String>(); BlockingQueue<String> list = new LinkedBlockingQueue<String>();
DBObject query = new BasicDBObject();
DBCursor cur = this.getReadColl().find(); DBCursor cur = this.getReadColl().find();
while(cur.hasNext()){ while(cur.hasNext()){
DBObject doc = cur.next(); DBObject doc = cur.next();
......
...@@ -42,7 +42,7 @@ public class SoubaoCrawlerRun implements Runnable{ ...@@ -42,7 +42,7 @@ public class SoubaoCrawlerRun implements Runnable{
//其他组数据采集关键词 //其他组数据采集关键词
BlockingQueue<String> otherWordQueue = wordsDao.getWordList("-美赞臣"); BlockingQueue<String> otherWordQueue = wordsDao.getWordList("-美赞臣");
wordesQueue.addAll(otherWordQueue); wordesQueue.addAll(otherWordQueue);
logger.info("关键词总量为:::{}", wordesQueue.size());
SouBaoCrawlerThread[] souBaoCrawlerThread = new SouBaoCrawlerThread[thread]; SouBaoCrawlerThread[] souBaoCrawlerThread = new SouBaoCrawlerThread[thread];
ExecutorService service = Executors.newFixedThreadPool(2); ExecutorService service = Executors.newFixedThreadPool(2);
for (int i = 0; i < thread; i++) { for (int i = 0; i < thread; i++) {
......
...@@ -6,7 +6,6 @@ ...@@ -6,7 +6,6 @@
*/ */
package com.zhiwei.crawler.soubao; package com.zhiwei.crawler.soubao;
import java.io.IOException;
import java.net.Proxy; import java.net.Proxy;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
...@@ -112,7 +111,7 @@ public class Crawler { ...@@ -112,7 +111,7 @@ public class Crawler {
} }
} }
count ++; count ++;
logger.info("关键词 {} 翻页页数: {} 访问成功", keyword, i); logger.info("关键词 {} 翻页页数: {} 访问成功, 页面长度:{}", keyword, i, body.length());
// 解析翻页 // 解析翻页
parse(client, request.headers(), html); parse(client, request.headers(), html);
TimeUnit.SECONDS.sleep(2); TimeUnit.SECONDS.sleep(2);
...@@ -133,6 +132,7 @@ public class Crawler { ...@@ -133,6 +132,7 @@ public class Crawler {
private static void parse(OkHttpClient client, Headers headers, Document html) { private static void parse(OkHttpClient client, Headers headers, Document html) {
try { try {
Elements elements = html.select("ul.newList").select("li"); Elements elements = html.select("ul.newList").select("li");
logger.info("数据大小:::{}", elements.size());
for (Element element : elements) { for (Element element : elements) {
try { try {
String link = "http://www.soubao.net" + element.select("h2").select("a").attr("href"); String link = "http://www.soubao.net" + element.select("h2").select("a").attr("href");
...@@ -182,4 +182,16 @@ public class Crawler { ...@@ -182,4 +182,16 @@ public class Crawler {
} }
return realUrl; return realUrl;
} }
/**
 * Manual entry point for running the crawler once from the command line.
 *
 * <p>Invokes {@code start} with fixed sample arguments and prints the stack
 * trace of any exception instead of propagating it.
 * NOTE(review): the second argument looks like a search keyword - confirm
 * against the signature of {@code start}.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    final String sampleTerm = "京东";
    try {
        start(1, sampleTerm, null);
    } catch (Exception failure) {
        // Best-effort debug run: report the failure and exit normally.
        failure.printStackTrace();
    }
}
} }
...@@ -33,7 +33,7 @@ public class TreatData { ...@@ -33,7 +33,7 @@ public class TreatData {
public static void treatDataAccount(Map<String,Object> dataMap) { public static void treatDataAccount(Map<String,Object> dataMap) {
if (!(dataMap.get("_id") == null || dataMap.get("_id").equals("") || dataMap.get("source") == null if (!(dataMap.get("_id") == null || dataMap.get("_id").equals("") || dataMap.get("source") == null
|| dataMap.get("source").equals("")) || dataMap.get("time")!=null) { || dataMap.get("source").equals("")) || dataMap.get("time")!=null) {
Date now = new Date(new Date().getTime() - 24 * 60 * 60 * 1000); Date now = new Date(new Date().getTime() - 49 * 60 * 60 * 1000); //避免时间为:2018-06-22 00:00:00 时间格式的数据丢掉
Date date = TimeParse.stringFormartDate(dataMap.get("time").toString()); Date date = TimeParse.stringFormartDate(dataMap.get("time").toString());
if (date.after(now)) { if (date.after(now)) {
logger.info("去重的链接为:{}", dataMap.get("_id")); logger.info("去重的链接为:{}", dataMap.get("_id"));
......
#####################生产环境################################# #####################生产环境#################################
mongoIp=192.168.0.101 mongoIp=192.168.0.108
mongoPort=27017 mongoPort=30000
db.username=zzwno #db.username=zzwno
db.paasword=zzwno1q2w3e4r #db.paasword=zzwno1q2w3e4r
db.username=rsync
db.paasword=rsync1q2w3e4r
db.certifiedDB=admin db.certifiedDB=admin
##save data dbInfo ##save data dbInfo
savedbName=mediaspider savedbName=mediaspider
......
registry=zookeeper://192.168.0.203:2181
group=hangzhou
minCount=20
maxCount=40
########################################################
#registry=zookeeper://192.168.0.36:2181
#group=testGroup
#minCount=10
#maxCount=20
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment