Commit cb82a2d5 by zhiwei

新版字段相应的配置及相应jar

parent c1281636
...@@ -29,14 +29,9 @@ ...@@ -29,14 +29,9 @@
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.1.3-SNAPSHOT</version> <version>0.1.4-SNAPSHOT</version>
</dependency> </dependency>
<dependency>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>cleaner-unified-urlfilter</artifactId>
<version>1.0.6.RELEASE</version>
</dependency>
<dependency> <dependency>
<groupId>org.apache.kafka</groupId> <groupId>org.apache.kafka</groupId>
...@@ -53,7 +48,17 @@ ...@@ -53,7 +48,17 @@
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.3.6-RELEASE</version> <version>0.5.2-RELEASE</version>
</dependency>
<dependency>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>cleaner-unified-filter</artifactId>
<version>0.0.7-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.base</groupId>
<artifactId>base-objects</artifactId>
<version>0.2.7-SNAPSHOT</version>
</dependency> </dependency>
</dependencies> </dependencies>
......
package com.zhiwei.crawler.config;
import java.io.InputStream;
import java.util.Properties;
/**
 * Holds the {@code rsid.zookeeper.*} settings read from the {@code redis.properties}
 * classpath resource.
 *
 * <p>The resource is resolved through the current thread's context class loader and is read
 * exactly once, when this class is initialized. Loading is best-effort: if the resource is
 * missing or cannot be read, the problem is reported on standard error and both fields are
 * left {@code null}; class initialization itself never fails because of it.
 */
public class CleanerFilterConfig {

    /** Name of the classpath resource the settings are read from. */
    private static final String RESOURCE_NAME = "redis.properties";

    /** Value of the {@code rsid.zookeeper.url} property, or {@code null} if it was not loaded. */
    public static String rsidUrl;

    /** Value of the {@code rsid.zookeeper.group} property, or {@code null} if it was not loaded. */
    public static String rsidGroup;

    static {
        // try-with-resources guarantees the stream is closed even when load() throws;
        // a null resource (file not on the classpath) is simply skipped on close.
        try (InputStream is = Thread.currentThread().getContextClassLoader()
                .getResourceAsStream(RESOURCE_NAME)) {
            if (is == null) {
                // Report the real cause instead of relying on an incidental NullPointerException.
                System.err.println("CleanerFilterConfig: resource '" + RESOURCE_NAME
                        + "' not found on the classpath");
            } else {
                Properties conf = new Properties();
                conf.load(is);
                rsidUrl = conf.getProperty("rsid.zookeeper.url");
                rsidGroup = conf.getProperty("rsid.zookeeper.group");
            }
        } catch (Exception e) {
            // Deliberately best-effort: keep the class usable and surface the failure.
            e.printStackTrace();
        }
    }
}
...@@ -3,7 +3,7 @@ package com.zhiwei.crawler.config; ...@@ -3,7 +3,7 @@ package com.zhiwei.crawler.config;
import java.io.InputStream; import java.io.InputStream;
import java.util.Properties; import java.util.Properties;
public class Config { public class DBConfig {
static { static {
Properties conf = null; Properties conf = null;
try { try {
...@@ -17,13 +17,8 @@ public class Config { ...@@ -17,13 +17,8 @@ public class Config {
userName = conf.getProperty("db.username"); userName = conf.getProperty("db.username");
userPwd = conf.getProperty("db.paasword"); userPwd = conf.getProperty("db.paasword");
authDB = conf.getProperty("db.certifiedDB"); authDB = conf.getProperty("db.certifiedDB");
savedbName = conf.getProperty("savedbName");
saveCollName = conf.getProperty("saveCollName");
crawlerdbName = conf.getProperty("crawlerdbName"); crawlerdbName = conf.getProperty("crawlerdbName");
crawlerCollName = conf.getProperty("crawlerCollName"); crawlerCollName = conf.getProperty("crawlerCollName");
redisKey = conf.getProperty("redisKey");
rsidUrl = conf.getProperty("rsid.zookeeper.url");
rsidGroup = conf.getProperty("rsid.zookeeper.group");
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
} }
...@@ -35,15 +30,7 @@ public class Config { ...@@ -35,15 +30,7 @@ public class Config {
public static String userName; public static String userName;
public static String userPwd; public static String userPwd;
public static String authDB; public static String authDB;
public static String savedbName;
public static String saveCollName;
public static String crawlerdbName; public static String crawlerdbName;
public static String crawlerCollName; public static String crawlerCollName;
public static String redisKey;
public static String rsidUrl;
public static String rsidGroup;
public static String filePath;
public static int saveCount;
public static int queueCount;
} }
...@@ -11,7 +11,7 @@ import org.apache.logging.log4j.LogManager; ...@@ -11,7 +11,7 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.mongodb.DBObject; import com.zhiwei.base.entity.subclass.CompleteText;
import com.zhiwei.crawler.config.KafkaConfig; import com.zhiwei.crawler.config.KafkaConfig;
public class ProducerKafka { public class ProducerKafka {
...@@ -31,7 +31,7 @@ public class ProducerKafka { ...@@ -31,7 +31,7 @@ public class ProducerKafka {
} }
} }
public static void add(String key ,DBObject doc) { public static void add(String key ,CompleteText doc) {
String data = JSONObject.toJSONString(doc); String data = JSONObject.toJSONString(doc);
Future<RecordMetadata> future = producer.send(new ProducerRecord<String, String>(KafkaConfig.kafkaTopic, key, data)); Future<RecordMetadata> future = producer.send(new ProducerRecord<String, String>(KafkaConfig.kafkaTopic, key, data));
try { try {
......
...@@ -10,15 +10,15 @@ import org.apache.logging.log4j.Logger; ...@@ -10,15 +10,15 @@ import org.apache.logging.log4j.Logger;
import com.mongodb.BasicDBObject; import com.mongodb.BasicDBObject;
import com.mongodb.DBCursor; import com.mongodb.DBCursor;
import com.mongodb.DBObject; import com.mongodb.DBObject;
import com.zhiwei.crawler.config.Config; import com.zhiwei.crawler.config.DBConfig;
import com.zhiwei.crawler.dbtemplate.MongoDBTemplate; import com.zhiwei.crawler.dbtemplate.MongoDBTemplate;
public class WordsDao extends MongoDBTemplate{ public class WordsDao extends MongoDBTemplate{
private static Logger logger = LogManager.getLogger(WordsDao.class); private static Logger logger = LogManager.getLogger(WordsDao.class);
public WordsDao(){ public WordsDao(){
super.setDbName(Config.crawlerdbName); super.setDbName(DBConfig.crawlerdbName);
super.setCollName(Config.crawlerCollName); super.setCollName(DBConfig.crawlerCollName);
} }
......
...@@ -10,7 +10,7 @@ import com.mongodb.MongoClient; ...@@ -10,7 +10,7 @@ import com.mongodb.MongoClient;
import com.mongodb.MongoCredential; import com.mongodb.MongoCredential;
import com.mongodb.MongoException; import com.mongodb.MongoException;
import com.mongodb.ServerAddress; import com.mongodb.ServerAddress;
import com.zhiwei.crawler.config.Config; import com.zhiwei.crawler.config.DBConfig;
/** /**
* *
* @author LihuaTang * @author LihuaTang
...@@ -25,8 +25,8 @@ public class MongoDBTemplate ...@@ -25,8 +25,8 @@ public class MongoDBTemplate
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
public MongoDBTemplate() { public MongoDBTemplate() {
MongoCredential credential = MongoCredential.createCredential(Config.userName, Config.authDB, Config.userPwd.toCharArray()); MongoCredential credential = MongoCredential.createCredential(DBConfig.userName, DBConfig.authDB, DBConfig.userPwd.toCharArray());
ServerAddress address = new ServerAddress(Config.mongoIp, Config.mongoPort); ServerAddress address = new ServerAddress(DBConfig.mongoIp, DBConfig.mongoPort);
try { try {
if(reader==null) if(reader==null)
{ {
......
package com.zhiwei.crawler.dbtemplate; package com.zhiwei.crawler.dbtemplate;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import com.zhiwei.base.entity.subclass.CompleteText;
import com.zhiwei.common.config.GroupType; import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.config.Config; import com.zhiwei.crawler.config.CleanerFilterConfig;
import com.zhiwei.middleware.cleaner.ptenum.PTENUM; import com.zhiwei.middleware.cleaner.filter.UnifiedFilterClient;
import com.zhiwei.middleware.cleaner.urlfilter.UnifiedUrlFilterClient;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
/** /**
...@@ -23,15 +19,15 @@ import com.zhiwei.tools.tools.ZhiWeiTools; ...@@ -23,15 +19,15 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public class RsidClientTemplate { public class RsidClientTemplate {
private static Logger logger = LogManager.getLogger(RsidClientTemplate.class); private static Logger logger = LogManager.getLogger(RsidClientTemplate.class);
private static UnifiedUrlFilterClient client; private static UnifiedFilterClient client;
static{ static{
if(client==null){ if(client==null){
synchronized (RsidClientTemplate.class) { synchronized (RsidClientTemplate.class) {
if(client==null) { if(client==null) {
try { try {
client = UnifiedUrlFilterClient.getClient(Config.rsidUrl, client = UnifiedFilterClient.getClient(CleanerFilterConfig.rsidUrl,
Config.rsidGroup, GroupType.PROVIDER); CleanerFilterConfig.rsidGroup, GroupType.PROVIDER);
} catch (Exception e) { } catch (Exception e) {
logger.error("链接清洗中间件时出现错误,错误为:::{}", e); logger.error("链接清洗中间件时出现错误,错误为:::{}", e);
} }
...@@ -49,42 +45,17 @@ public class RsidClientTemplate { ...@@ -49,42 +45,17 @@ public class RsidClientTemplate {
* @param @return 设定文件 * @param @return 设定文件
* @return boolean 返回类型 * @return boolean 返回类型
*/ */
public static boolean addFilterUrl(String url,String title, String source, Date date,String pt){ public static boolean addFilterUrl(CompleteText completeText){
for(int i=0; i<3; i++){ for(int i=0; i<3; i++){
try { try {
Map<String,Object> filterMap = new HashMap<String,Object>(); return client.contains(completeText.filterInfo());
filterMap.put("_id", url);
filterMap.put("url", url);
filterMap.put("title", title);
filterMap.put("source", source);
filterMap.put("time", String.valueOf(date.getTime()));
filterMap.put("pt", pt);
if(client==null){
System.out.println("client is null");
}
return client.contains(filterMap, PTENUM.COMMON);
} catch (Exception e) { } catch (Exception e) {
logger.error("判断此条网媒消息是否存在出现问题", e); logger.error("判断此条网媒消息是否存在出现问题", e);
ZhiWeiTools.sleep(500); ZhiWeiTools.sleep(500);
continue;
} }
} }
return false; return false;
} }
public static boolean addFilterUrl(String url){
for(int i=0; i<3; i++){
try{
return client.contains(url, PTENUM.COMMON);
} catch (Exception e) {
logger.error("判断此条网媒消息是否存在出现问题", e);
ZhiWeiTools.sleep(500);
continue;
}
}
return false;
}
} }
...@@ -6,8 +6,13 @@ import java.util.Map; ...@@ -6,8 +6,13 @@ import java.util.Map;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject; import com.zhiwei.base.category.ClassA;
import com.mongodb.DBObject; import com.zhiwei.base.category.ClassA.TypeA;
import com.zhiwei.base.category.ClassB.TypeB;
import com.zhiwei.base.category.ClassC.TypeC;
import com.zhiwei.base.category.ClassD;
import com.zhiwei.base.category.ClassD.TypeD;
import com.zhiwei.base.entity.subclass.CompleteText;
import com.zhiwei.crawler.dao.ProducerKafka; import com.zhiwei.crawler.dao.ProducerKafka;
import com.zhiwei.crawler.dbtemplate.RsidClientTemplate; import com.zhiwei.crawler.dbtemplate.RsidClientTemplate;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
...@@ -31,7 +36,7 @@ public class TreatData { ...@@ -31,7 +36,7 @@ public class TreatData {
* @return void 返回类型 * @return void 返回类型
*/ */
public static void treatDataAccount(Map<String,Object> dataMap) { public static void treatDataAccount(Map<String,Object> dataMap) {
if (!(dataMap.get("_id") == null || dataMap.get("_id").equals("") if (!(dataMap.get("url") == null || dataMap.get("url").equals("")
|| dataMap.get("source") == null || dataMap.get("source").equals("") || dataMap.get("source") == null || dataMap.get("source").equals("")
|| dataMap.get("time")==null || dataMap.get("time").equals("") || dataMap.get("time")==null || dataMap.get("time").equals("")
|| dataMap.get("title").equals("")|| dataMap.get("title")==null) || dataMap.get("title").equals("")|| dataMap.get("title")==null)
...@@ -39,23 +44,22 @@ public class TreatData { ...@@ -39,23 +44,22 @@ public class TreatData {
Date now = new Date(new Date().getTime() - 49 * 60 * 60 * 1000); //避免时间为:2018-06-22 00:00:00 时间格式的数据丢掉 Date now = new Date(new Date().getTime() - 49 * 60 * 60 * 1000); //避免时间为:2018-06-22 00:00:00 时间格式的数据丢掉
Date date = TimeParse.stringFormartDate(dataMap.get("time").toString()); Date date = TimeParse.stringFormartDate(dataMap.get("time").toString());
if (date.after(now)) { if (date.after(now)) {
logger.info("去重的链接为:{}", dataMap.get("_id")); logger.info("去重的链接为:{}", dataMap.get("url"));
String url = dataMap.get("_id").toString(); String url = dataMap.get("url").toString();
String title = dataMap.get("title").toString(); String title = dataMap.get("title").toString();
String source = dataMap.get("source").toString(); String source = dataMap.get("source").toString();
if(!RsidClientTemplate.addFilterUrl(url, title, source, date ,"平媒")){ String content = dataMap.get("content")+"".replaceAll("<[^>]*>", "");
DBObject doc = new BasicDBObject(); ClassD cd = ClassA.selectA(TypeA.TEXT).selectB(TypeB.COMPLETE).selectC(TypeC.PGC).selectD(TypeD.平媒);
doc.put("_id", url); long ctime = System.currentTimeMillis();
doc.put("url", url); CompleteText completeText = new CompleteText.Builder(date.getTime(), ctime, source.trim()
doc.put("title", dataMap.get("title")); , cd.encodeA(), cd.encodeB(), cd.encodeC(), cd.encodeD(), cd.combineEncode(), "PC", null, 100010005L)
doc.put("time", String.valueOf(date.getTime())); .url(url)
doc.put("source", dataMap.get("source")); .title(title)
doc.put("content", dataMap.get("content")+"".replaceAll("<[^>]*>", "")); .content(content)
doc.put("pt", "平媒"); .build();
doc.put("type", "平媒");
doc.put("savetime", System.currentTimeMillis());
ProducerKafka.add("搜报网", doc); if(!RsidClientTemplate.addFilterUrl(completeText)){
ProducerKafka.add("搜报网采集程序", completeText);
}else{ }else{
logger.info("搜报网地址为:{},此条数据重复", dataMap.get("_id")); logger.info("搜报网地址为:{},此条数据重复", dataMap.get("_id"));
} }
......
rsid.zookeeper.url=zookeeper://192.168.0.203:2182;zookeeper://192.168.0.104:2182;zookeeper://192.168.0.105:2182
rsid.zookeeper.group=zhiwei-bloom-filter
########################local##############################
#rsid.zookeeper.url=zookeeper://192.168.0.36:2181
#rsid.zookeeper.group=filter-test_sjj
\ No newline at end of file
#####################service################################# #####################service#################################
mongoIp=192.168.0.101 mongoIp=192.168.0.101
mongoPort=30000 mongoPort=30000
db.username=zzwno db.username=zzwno
db.paasword=zzwno1q2w3e4r db.paasword=zzwno1q2w3e4r
db.certifiedDB=admin db.certifiedDB=admin
##save data dbInfo
savedbName=mediaspider
saveCollName=net_media
##crawler word dbInfo ##crawler word dbInfo
crawlerdbName=qbjcPhoenix crawlerdbName=qbjcPhoenix
crawlerCollName=qbjc_crawlerword crawlerCollName=qbjc_crawlerword
rsid.zookeeper.url = zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181 #####################local#################################
rsid.zookeeper.group=crawler-filter
redisKey=media
#####################local#################################
#mongoIp=192.168.0.233 #mongoIp=192.168.0.233
#mongoPort=27017 #mongoPort=27017
#db.username=zzwno #db.username=zzwno
#db.paasword=zzwno1q2w3e4r #db.paasword=zzwno1q2w3e4r
#db.certifiedDB=admin #db.certifiedDB=admin
###save data dbInfo ###save data dbInfo
#savedbName=mediaspider
#saveCollName=net_media
###crawler word dbInfo ###crawler word dbInfo
#crawlerdbName=qbjcPhoenix #crawlerdbName=qbjcPhoenix
#crawlerCollName=qbjc_crawlerword #crawlerCollName=qbjc_crawlerword
\ No newline at end of file
#rsid.zookeeper.url = zookeeper://192.168.0.36:2181;
#rsid.zookeeper.group=local
#redisKey=media
\ No newline at end of file
##########################local############################## ##########################local##############################,kafka1.irybd.com:9093,kafka1.irybd.com:9094
#kafka.ip=kafka1.irybd.com:9092,kafka1.irybd.com:9093,kafka1.irybd.com:9094 #kafka.ip=kafka1.irybd.com:9092
#kafka.topic=crawler-test #kafka.topic=crawler-media
##########################service############################## ##########################service##############################
kafka.ip=192.168.0.203:9092,192.168.0.203:9093,192.168.0.203:9094 kafka.ip=192.168.0.203:9095,192.168.0.104:9093,192.168.0.105:9093
kafka.topic=crawler-media kafka.topic=crawler-media
\ No newline at end of file
registry=zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181 registry=zookeeper://192.168.0.203:2182;zookeeper://192.168.0.104:2182;zookeeper://192.168.0.105:2182
group=hangzhou group=hangzhou
######################################################## ########################################################
#registry=zookeeper://192.168.0.36:2181 #registry=zookeeper://192.168.0.36:2181
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment