Commit cb82a2d5 by zhiwei

新版字段相应的配置及相应jar

parent c1281636
......@@ -29,14 +29,9 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.3-SNAPSHOT</version>
<version>0.1.4-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>cleaner-unified-urlfilter</artifactId>
<version>1.0.6.RELEASE</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
......@@ -53,7 +48,17 @@
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.3.6-RELEASE</version>
<version>0.5.2-RELEASE</version>
</dependency>
<dependency>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>cleaner-unified-filter</artifactId>
<version>0.0.7-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.base</groupId>
<artifactId>base-objects</artifactId>
<version>0.2.7-SNAPSHOT</version>
</dependency>
</dependencies>
......
package com.zhiwei.crawler.config;
import java.io.InputStream;
import java.util.Properties;
/**
 * Connection settings used to obtain the unified filter client.
 *
 * <p>Both values are read once, during class initialization, from the
 * {@code redis.properties} resource resolved through the current thread's context
 * class loader. If the resource is missing or cannot be read, the failure is printed
 * to standard error and both fields are left {@code null}; class initialization
 * itself never fails.
 */
public class CleanerFilterConfig {

    /** Value of the {@code rsid.zookeeper.url} property, or {@code null} if it was not loaded. */
    public static String rsidUrl;

    /** Value of the {@code rsid.zookeeper.group} property, or {@code null} if it was not loaded. */
    public static String rsidGroup;

    static {
        // try-with-resources closes the stream even when load() throws; a null
        // resource is permitted by the language and is simply not closed.
        try (InputStream is = Thread.currentThread().getContextClassLoader()
                .getResourceAsStream("redis.properties")) {
            if (is == null) {
                // Report the real cause instead of the NullPointerException that
                // Properties.load(null) would otherwise raise.
                throw new IllegalStateException(
                        "classpath resource not found: redis.properties");
            }
            Properties conf = new Properties();
            conf.load(is);
            rsidUrl = conf.getProperty("rsid.zookeeper.url");
            rsidGroup = conf.getProperty("rsid.zookeeper.group");
        } catch (Exception e) {
            // Best effort: keep the class loadable and leave the fields null.
            e.printStackTrace();
        }
    }
}
......@@ -3,7 +3,7 @@ package com.zhiwei.crawler.config;
import java.io.InputStream;
import java.util.Properties;
public class Config {
public class DBConfig {
static {
Properties conf = null;
try {
......@@ -17,13 +17,8 @@ public class Config {
userName = conf.getProperty("db.username");
userPwd = conf.getProperty("db.paasword");
authDB = conf.getProperty("db.certifiedDB");
savedbName = conf.getProperty("savedbName");
saveCollName = conf.getProperty("saveCollName");
crawlerdbName = conf.getProperty("crawlerdbName");
crawlerCollName = conf.getProperty("crawlerCollName");
redisKey = conf.getProperty("redisKey");
rsidUrl = conf.getProperty("rsid.zookeeper.url");
rsidGroup = conf.getProperty("rsid.zookeeper.group");
} catch (Exception e) {
e.printStackTrace();
}
......@@ -35,15 +30,7 @@ public class Config {
public static String userName;
public static String userPwd;
public static String authDB;
public static String savedbName;
public static String saveCollName;
public static String crawlerdbName;
public static String crawlerCollName;
public static String redisKey;
public static String rsidUrl;
public static String rsidGroup;
public static String filePath;
public static int saveCount;
public static int queueCount;
}
......@@ -11,7 +11,7 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.DBObject;
import com.zhiwei.base.entity.subclass.CompleteText;
import com.zhiwei.crawler.config.KafkaConfig;
public class ProducerKafka {
......@@ -31,7 +31,7 @@ public class ProducerKafka {
}
}
public static void add(String key ,DBObject doc) {
public static void add(String key ,CompleteText doc) {
String data = JSONObject.toJSONString(doc);
Future<RecordMetadata> future = producer.send(new ProducerRecord<String, String>(KafkaConfig.kafkaTopic, key, data));
try {
......
......@@ -10,15 +10,15 @@ import org.apache.logging.log4j.Logger;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.zhiwei.crawler.config.Config;
import com.zhiwei.crawler.config.DBConfig;
import com.zhiwei.crawler.dbtemplate.MongoDBTemplate;
public class WordsDao extends MongoDBTemplate{
private static Logger logger = LogManager.getLogger(WordsDao.class);
public WordsDao(){
super.setDbName(Config.crawlerdbName);
super.setCollName(Config.crawlerCollName);
super.setDbName(DBConfig.crawlerdbName);
super.setCollName(DBConfig.crawlerCollName);
}
......
......@@ -10,7 +10,7 @@ import com.mongodb.MongoClient;
import com.mongodb.MongoCredential;
import com.mongodb.MongoException;
import com.mongodb.ServerAddress;
import com.zhiwei.crawler.config.Config;
import com.zhiwei.crawler.config.DBConfig;
/**
*
* @author LihuaTang
......@@ -25,8 +25,8 @@ public class MongoDBTemplate
@SuppressWarnings("deprecation")
public MongoDBTemplate() {
MongoCredential credential = MongoCredential.createCredential(Config.userName, Config.authDB, Config.userPwd.toCharArray());
ServerAddress address = new ServerAddress(Config.mongoIp, Config.mongoPort);
MongoCredential credential = MongoCredential.createCredential(DBConfig.userName, DBConfig.authDB, DBConfig.userPwd.toCharArray());
ServerAddress address = new ServerAddress(DBConfig.mongoIp, DBConfig.mongoPort);
try {
if(reader==null)
{
......
package com.zhiwei.crawler.dbtemplate;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.base.entity.subclass.CompleteText;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.config.Config;
import com.zhiwei.middleware.cleaner.ptenum.PTENUM;
import com.zhiwei.middleware.cleaner.urlfilter.UnifiedUrlFilterClient;
import com.zhiwei.crawler.config.CleanerFilterConfig;
import com.zhiwei.middleware.cleaner.filter.UnifiedFilterClient;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
......@@ -23,15 +19,15 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public class RsidClientTemplate {
private static Logger logger = LogManager.getLogger(RsidClientTemplate.class);
private static UnifiedUrlFilterClient client;
private static UnifiedFilterClient client;
static{
if(client==null){
synchronized (RsidClientTemplate.class) {
if(client==null) {
try {
client = UnifiedUrlFilterClient.getClient(Config.rsidUrl,
Config.rsidGroup, GroupType.PROVIDER);
client = UnifiedFilterClient.getClient(CleanerFilterConfig.rsidUrl,
CleanerFilterConfig.rsidGroup, GroupType.PROVIDER);
} catch (Exception e) {
logger.error("链接清洗中间件时出现错误,错误为:::{}", e);
}
......@@ -49,42 +45,17 @@ public class RsidClientTemplate {
* @param @return 设定文件
* @return boolean 返回类型
*/
public static boolean addFilterUrl(String url,String title, String source, Date date,String pt){
public static boolean addFilterUrl(CompleteText completeText){
for(int i=0; i<3; i++){
try {
Map<String,Object> filterMap = new HashMap<String,Object>();
filterMap.put("_id", url);
filterMap.put("url", url);
filterMap.put("title", title);
filterMap.put("source", source);
filterMap.put("time", String.valueOf(date.getTime()));
filterMap.put("pt", pt);
if(client==null){
System.out.println("client is null");
}
return client.contains(filterMap, PTENUM.COMMON);
return client.contains(completeText.filterInfo());
} catch (Exception e) {
logger.error("判断此条网媒消息是否存在出现问题", e);
ZhiWeiTools.sleep(500);
continue;
}
}
return false;
}
/**
 * Queries the filter client for the given URL, making at most three attempts.
 *
 * <p>Each attempt calls {@code client.contains(url, PTENUM.COMMON)} and returns its
 * result immediately. When an attempt throws, the error is logged,
 * {@code ZhiWeiTools.sleep(500)} is invoked and the next attempt starts.
 *
 * @param url the URL handed to the filter client
 * @return the result of {@code client.contains}, or {@code false} when all three
 *         attempts threw an exception
 */
public static boolean addFilterUrl(String url){
    int attempt = 0;
    while (attempt < 3) {
        attempt++;
        try {
            boolean known = client.contains(url, PTENUM.COMMON);
            return known;
        } catch (Exception e) {
            logger.error("判断此条网媒消息是否存在出现问题", e);
            // Pause before the next attempt.
            ZhiWeiTools.sleep(500);
        }
    }
    // Every attempt failed.
    return false;
}
}
......@@ -6,8 +6,13 @@ import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.base.category.ClassA;
import com.zhiwei.base.category.ClassA.TypeA;
import com.zhiwei.base.category.ClassB.TypeB;
import com.zhiwei.base.category.ClassC.TypeC;
import com.zhiwei.base.category.ClassD;
import com.zhiwei.base.category.ClassD.TypeD;
import com.zhiwei.base.entity.subclass.CompleteText;
import com.zhiwei.crawler.dao.ProducerKafka;
import com.zhiwei.crawler.dbtemplate.RsidClientTemplate;
import com.zhiwei.tools.timeparse.TimeParse;
......@@ -31,7 +36,7 @@ public class TreatData {
* @return void 返回类型
*/
public static void treatDataAccount(Map<String,Object> dataMap) {
if (!(dataMap.get("_id") == null || dataMap.get("_id").equals("")
if (!(dataMap.get("url") == null || dataMap.get("url").equals("")
|| dataMap.get("source") == null || dataMap.get("source").equals("")
|| dataMap.get("time")==null || dataMap.get("time").equals("")
|| dataMap.get("title").equals("")|| dataMap.get("title")==null)
......@@ -39,23 +44,22 @@ public class TreatData {
Date now = new Date(new Date().getTime() - 49 * 60 * 60 * 1000); //避免时间为:2018-06-22 00:00:00 时间格式的数据丢掉
Date date = TimeParse.stringFormartDate(dataMap.get("time").toString());
if (date.after(now)) {
logger.info("去重的链接为:{}", dataMap.get("_id"));
String url = dataMap.get("_id").toString();
logger.info("去重的链接为:{}", dataMap.get("url"));
String url = dataMap.get("url").toString();
String title = dataMap.get("title").toString();
String source = dataMap.get("source").toString();
if(!RsidClientTemplate.addFilterUrl(url, title, source, date ,"平媒")){
DBObject doc = new BasicDBObject();
doc.put("_id", url);
doc.put("url", url);
doc.put("title", dataMap.get("title"));
doc.put("time", String.valueOf(date.getTime()));
doc.put("source", dataMap.get("source"));
doc.put("content", dataMap.get("content")+"".replaceAll("<[^>]*>", ""));
doc.put("pt", "平媒");
doc.put("type", "平媒");
doc.put("savetime", System.currentTimeMillis());
String content = dataMap.get("content")+"".replaceAll("<[^>]*>", "");
ClassD cd = ClassA.selectA(TypeA.TEXT).selectB(TypeB.COMPLETE).selectC(TypeC.PGC).selectD(TypeD.平媒);
long ctime = System.currentTimeMillis();
CompleteText completeText = new CompleteText.Builder(date.getTime(), ctime, source.trim()
, cd.encodeA(), cd.encodeB(), cd.encodeC(), cd.encodeD(), cd.combineEncode(), "PC", null, 100010005L)
.url(url)
.title(title)
.content(content)
.build();
ProducerKafka.add("搜报网", doc);
if(!RsidClientTemplate.addFilterUrl(completeText)){
ProducerKafka.add("搜报网采集程序", completeText);
}else{
logger.info("搜报网地址为:{},此条数据重复", dataMap.get("_id"));
}
......
rsid.zookeeper.url=zookeeper://192.168.0.203:2182;zookeeper://192.168.0.104:2182;zookeeper://192.168.0.105:2182
rsid.zookeeper.group=zhiwei-bloom-filter
########################local##############################
#rsid.zookeeper.url=zookeeper://192.168.0.36:2181
#rsid.zookeeper.group=filter-test_sjj
\ No newline at end of file
#####################service#################################
#####################service#################################
mongoIp=192.168.0.101
mongoPort=30000
db.username=zzwno
db.paasword=zzwno1q2w3e4r
db.certifiedDB=admin
##save data dbInfo
savedbName=mediaspider
saveCollName=net_media
##crawler word dbInfo
crawlerdbName=qbjcPhoenix
crawlerCollName=qbjc_crawlerword
rsid.zookeeper.url = zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181
rsid.zookeeper.group=crawler-filter
redisKey=media
#####################local#################################
#####################local#################################
#mongoIp=192.168.0.233
#mongoPort=27017
#db.username=zzwno
#db.paasword=zzwno1q2w3e4r
#db.certifiedDB=admin
###save data dbInfo
#savedbName=mediaspider
#saveCollName=net_media
###crawler word dbInfo
#crawlerdbName=qbjcPhoenix
#crawlerCollName=qbjc_crawlerword
\ No newline at end of file
#rsid.zookeeper.url = zookeeper://192.168.0.36:2181;
#rsid.zookeeper.group=local
#redisKey=media
\ No newline at end of file
##########################local##############################
#kafka.ip=kafka1.irybd.com:9092,kafka1.irybd.com:9093,kafka1.irybd.com:9094
#kafka.topic=crawler-test
##########################local##############################,kafka1.irybd.com:9093,kafka1.irybd.com:9094
#kafka.ip=kafka1.irybd.com:9092
#kafka.topic=crawler-media
##########################service##############################
kafka.ip=192.168.0.203:9092,192.168.0.203:9093,192.168.0.203:9094
kafka.ip=192.168.0.203:9095,192.168.0.104:9093,192.168.0.105:9093
kafka.topic=crawler-media
\ No newline at end of file
registry=zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181
registry=zookeeper://192.168.0.203:2182;zookeeper://192.168.0.104:2182;zookeeper://192.168.0.105:2182
group=hangzhou
########################################################
#registry=zookeeper://192.168.0.36:2181
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment