Commit 53c0e739 by shenjunjie

词云图由hanLp调整为ansj 2

parent 896274b1
......@@ -10,6 +10,7 @@ import org.ansj.recognition.impl.NatureRecognition;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.HashMap;
......@@ -17,59 +18,56 @@ import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
public class AnsjSeg
{
@Component
public class AnsjSeg {
private static final Logger logger = LogManager.getLogger(AnsjSeg.class);

/** Dictionary bean handed to the constructor. */
private final MyDic myDic;
/** Stop words; removed from the ranked word-frequency results. */
private final List<String> stopWords;
/** Positive words; used to pick out positive keywords from the ranked results. */
private final List<String> positiveWords;
/** Negative words; used to pick out negative keywords from the ranked results. */
private final List<String> negativeWords;
/** Custom words; inserted into the ansj user library before segmenting. */
private final List<String> customWords;
/**
 * Creates the segmenter and captures the word lists exposed by {@link MyDic}.
 *
 * @param myDic dictionary bean; stored in a field, while the word lists themselves
 *              are read through MyDic's static getters
 */
public AnsjSeg(MyDic myDic) {
    // NOTE(review): presumably taking the bean as a parameter is what guarantees the
    // dictionaries are loaded before the static getters are read - confirm.
    this.customWords = MyDic.getCustomWords();
    this.negativeWords = MyDic.getNegativeWords();
    this.positiveWords = MyDic.getPositiveWords();
    this.stopWords = MyDic.getStopWords();
    this.myDic = myDic;
}
public void addAnsjSeg(List<String> newStopWords,
List<String> newCustomWords)
{
if (newStopWords != null)
{
List<String> newCustomWords) {
if (newStopWords != null) {
this.stopWords.addAll(newStopWords);
}
if (newCustomWords != null)
{
if (newCustomWords != null) {
this.customWords.addAll(newCustomWords);
}
}
/**
* @param dataList 设定文件
* @return HashMap<String, Object> 返回类型
* @Title: getFenCi
* @Description: TODO(针对集合分词统计,并输出正负面词汇)
* @param dataList
* 设定文件
* @return HashMap<String,Object> 返回类型
* @Description: TODO(针对集合分词统计, 并输出正负面词汇)
*/
public Map<String, Object> getFenCi(List<String> dataList)
{
public Map<String, Object> getFenCi(List<String> dataList) {
Map<String, Object> map = new HashMap<String, Object>(); // 分词总结果
Map<String, Integer> hash = new HashMap<String, Integer>(); // 初步分词结果
// 统计分词
for (String txt : dataList)
{
for (String txt : dataList) {
List<Term> termList = getFenci(txt);
if (termList != null)
{
for (Term term : termList)
{
if (termList != null) {
for (Term term : termList) {
String words = term.getName();
// 去除停用词和单词
if (words.length() > 1)
{
if (hash.containsKey(words))
{
if (words.length() > 1) {
if (hash.containsKey(words)) {
hash.put(words, hash.get(words) + 1);
}
else
{
} else {
hash.put(words, 1);
}
}
......@@ -82,44 +80,34 @@ public class AnsjSeg
// 对分词结果排序
List<Entry<String, Integer>> resultList = TreatOrder
.treatOrderByCountDesc(hash);
try
{
try {
// 统计正负面关键词
for (Entry<String, Integer> entry : resultList)
{
for (Entry<String, Integer> entry : resultList) {
String word = entry.getKey();
if (posivtiveWords.contains(word))
{
if (positiveWords.contains(word)) {
HashMap<String, Object> goodmap = new HashMap<String, Object>();
goodmap.put("key", entry.getKey());
goodmap.put("value", entry.getValue());
goodResultList.add(goodmap);
}
else if (negativeWords.contains(word))
{
} else if (negativeWords.contains(word)) {
HashMap<String, Object> badmap = new HashMap<String, Object>();
badmap.put("key", entry.getKey());
badmap.put("value", entry.getValue());
badResultList.add(badmap);
}
}
}
catch (Exception e)
{
} catch (Exception e) {
e.printStackTrace();
}
// 输出正负面关键词
if (goodResultList.size() >= 10)
{
if (goodResultList.size() >= 10) {
goodResultList = goodResultList.subList(0, 10);
}
if (badResultList.size() >= 10)
{
if (badResultList.size() >= 10) {
badResultList = badResultList.subList(0, 10);
}
if (resultList.size() >= 20)
{
if (resultList.size() >= 20) {
resultList = resultList.subList(0, 20);
}
......@@ -131,99 +119,79 @@ public class AnsjSeg
}
/**
*
* @TODO (统计分词结果,按频次取前maxNum)
* @author 陈炜涛
* @param dataList
* @param maxNum
* @return
* @return Map<String, Object>
* @TODO (统计分词结果 , 按频次取前maxNum)
* @author 陈炜涛
* @time 2016年11月16日上午11:06:10
* @return Map<String,Object>
*/
public List<Entry<String, Integer>> getFenCi(List<String> dataList, int maxNum)
{
public List<Entry<String, Integer>> getFenCi(List<String> dataList, int maxNum) {
Map<String, Object> map = new HashMap<>(); // 分词总结果
Map<String, Integer> hash = new HashMap<>(); // 初步分词结果
// 统计分词
for (String txt : dataList)
{
for (String txt : dataList) {
List<Term> termList = getFenci(txt);
if (termList != null)
{
for (Term term : termList)
{
if (termList != null) {
for (Term term : termList) {
String words = Tools.filterSpecialCharacter(term.getName());
String wordsPro = term.getNatureStr();
// 去除停用词和单词
if (words.length() > 1)
{
switch (wordsPro)
{
case "w":
break;
case "r":
break;
case "p":
break;
default:
if (hash.containsKey(words))
{
hash.put(words, hash.get(words) + 1);
}
else
{
hash.put(words, 1);
}
break;
if (words.length() > 1) {
switch (wordsPro) {
case "w":
break;
case "r":
break;
case "p":
break;
default:
if (hash.containsKey(words)) {
hash.put(words, hash.get(words) + 1);
} else {
hash.put(words, 1);
}
break;
}
}
}
}
}
Map<String, Integer> stopResults = new HashMap<>();
for (Entry<String, Integer> en : hash.entrySet())
{
if (!stopWords.contains(en.getKey()))
{
for (Entry<String, Integer> en : hash.entrySet()) {
if (!stopWords.contains(en.getKey())) {
stopResults.put(en.getKey(), en.getValue());
}
}
// 对分词结果排序
List<Entry<String, Integer>> resultList = treatOrderByCountDesc(stopResults);
if (resultList.size() >= maxNum)
{
if (resultList.size() >= maxNum) {
resultList = resultList.subList(0, maxNum);
}
return resultList;
}
/**
* @param name 设定文件
* @return HashMap<String, Integer> 返回类型
* @Title: getFenCi
* @Description: TODO(针对真文本分词并统计)
* @param name
* 设定文件
* @return HashMap<String,Integer> 返回类型
*/
public HashMap<String, Integer> getFenCi(String name)
{
public HashMap<String, Integer> getFenCi(String name) {
HashMap<String, Integer> hash = new HashMap<String, Integer>(); // 初步分词结果
List<Term> termList = getFenci(name);
if (termList != null)
{
for (Term term : termList)
{
if (termList != null) {
for (Term term : termList) {
String word = term.getName();
if (hash.containsKey(word))
{
if (hash.containsKey(word)) {
hash.put(word, hash.get(word) + 1);
}
else
{
} else {
hash.put(word, 1);
}
}
......@@ -233,23 +201,20 @@ public class AnsjSeg
}
/**
* @return void 返回类型
* @Title: removeStopWord
* @Description: TODO(分词)
* @ 设定文件
* @return void 返回类型
*/
public List<Term> getFenci(String text)
{
try
{
public List<Term> getFenci(String text) {
try {
// 去重停用词(看源码应该是删除分词关键词)
// for (String word : stopWords)
// {
// UserDefineLibrary.removeWord(word);
// }
// 添加自定义词
for (String word : customWords)
{
for (String word : customWords) {
UserDefineLibrary.insertWord(word);
}
// 分词
......@@ -258,57 +223,44 @@ public class AnsjSeg
Result result = nlp.parseStr(text);
new NatureRecognition().recognition(result);
return result.getTerms();
}
catch (Exception e)
{
logger.error("分词出现问题");
} catch (Exception e) {
logger.error("分词出现问题", e);
return null;
}
}
/**
*
* @param dataMap 设定文件
* @return List<Entry < String, Integer>> 返回类型
* @Title: treatOrderByCountDesc
* @Description: TODO(根据数量降序)
* @param dataMap
* 设定文件
* @return List<Entry<String,Integer>> 返回类型
*/
public static List<Entry<String, Integer>> treatOrderByCountDesc(
Map<String, Integer> dataMap)
{
Map<String, Integer> dataMap) {
List<Entry<String, Integer>> list = new ArrayList<>(dataMap.entrySet());
list.sort((o1, o2) -> getCompareResult(o1.getValue(), o2.getValue(), false));
return list;
}
/**
* @Title: getCompareResult
* @Description: TODO(排序比较)
* @param time1
* @param time2
* @param asc
* 设定文件
* @param asc 设定文件
* @return int 返回类型
* @Title: getCompareResult
* @Description: TODO(排序比较)
*/
private static int getCompareResult(long time1, long time2, boolean asc)
{
private static int getCompareResult(long time1, long time2, boolean asc) {
long result;
if (asc)
{
if (asc) {
result = time1 - time2;
}
else
{
} else {
result = time2 - time1;
}
if (result > 0)
{
if (result > 0) {
result = 1;
}
else if (result < 0)
{
} else if (result < 0) {
result = -1;
}
return (int) result;
......
......@@ -39,13 +39,11 @@ public class MyDic {
public void init() {
try {
// 读取词典
List<String> customDics = Tools.readListFile(customDic.getInputStream());
List<String> stopDics = Tools.readListFile(stopDic.getInputStream());
InputStream inputStream = negativeDic.getInputStream();
List<String> negativeDic = Tools.readListFile(inputStream);
InputStream inputStream2 = positiveDic.getInputStream();
List<String> positiveDic = Tools.readListFile(inputStream2);
log.info("ansj自定义词典加载:{}条,停用词加载:{}条,负面词典加载:{}条,正面词典加载:{}条", customDics.size(), stopDics.size(), negativeDic.size(), positiveDic.size());
customWords = Tools.readListFile(customDic.getInputStream());
stopWords = Tools.readListFile(stopDic.getInputStream());
negativeWords = Tools.readListFile(negativeDic.getInputStream());
positiveWords= Tools.readListFile(positiveDic.getInputStream());
log.info("ansj自定义词典加载:{}条,停用词加载:{}条,负面词典加载:{}条,正面词典加载:{}条", customWords.size(), stopWords.size(), negativeWords.size(), positiveWords.size());
} catch (Exception e) {
log.info("MyDic-init 异常", e);
}
......@@ -78,8 +76,8 @@ public class MyDic {
* @Description: TODO(获取正面词)
* 设定文件
*/
public static List<String> getPosivtiveWords() {
return negativeWords;
public static List<String> getPositiveWords() {
return positiveWords;
}
/**
......
......@@ -33,6 +33,7 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.joda.time.Period;
import org.joda.time.PeriodType;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.mongodb.core.query.Criteria;
import org.springframework.data.mongodb.core.query.Query;
import org.springframework.data.mongodb.core.query.Update;
......@@ -81,6 +82,9 @@ public class CustomEventServiceImpl implements CustomEventService {
@Resource
MongoUtil mongoUtil;
@Autowired
TextUtil textUtil;
@Override
public List<JSONObject> getCustomEventRankList(Long startTime, Long endTime) {
List<JSONObject> resultList = null;
......@@ -375,7 +379,7 @@ public class CustomEventServiceImpl implements CustomEventService {
/**
 * Extracts the hot keywords from a list of articles.
 *
 * @param articleList articles whose title and content are concatenated into one text each
 * @return the result of {@code textUtil.getHighWordsJson} for those texts, limited to 30
 */
private List<JSONObject> getHotKeyword(List<BaseMap> articleList) {
    List<String> texts = articleList.stream()
            .map(article -> article.getTitle() + article.getContent())
            .collect(Collectors.toList());
    // Goes through the injected TextUtil bean; getHighWordsJson is an instance method.
    return textUtil.getHighWordsJson(texts, 30);
}
/**
......
......@@ -137,6 +137,9 @@ public class MarkDataServiceImpl implements MarkDataService {
@Resource(name = "mongoUtil")
MongoUtil mongoUtil;
@Autowired
TextUtil textUtil;
@Override
public PageVO<MarkFlowEntity> getYuqingMarkList(MarkSearchDTO markSearchDTO) {
try {
......@@ -504,12 +507,13 @@ public class MarkDataServiceImpl implements MarkDataService {
log.info("es查询size:{},耗时:{}", texts.size(), System.currentTimeMillis() - s);
long s1 = System.currentTimeMillis();
// 分析高频词
List<JSONObject> highWords = TextUtil.getHighWordsJson(texts, 30);
List<JSONObject> highWords = textUtil.getHighWordsJson(texts, 30);
log.info("分析高频词耗时:{}", (System.currentTimeMillis() - s1));
redisUtil.setExpire(redisKey, JSON.toJSONString(highWords));
return highWords;
} catch (IOException e) {
ExceptionCast.cast(CommonCodeEnum.FAIL, "es查询异常", e);
log.error("getMarkHighWord-",e);
ExceptionCast.cast(CommonCodeEnum.FAIL);
}
return null;
}
......
......@@ -15,6 +15,7 @@ import com.zhiwei.nlp.vo.KResult;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.search.SearchHit;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
import org.springframework.stereotype.Component;
......@@ -39,7 +40,8 @@ public class TextUtil {
private Resource customDic;
@Value("classpath:wordDic/stopWordDictionary.txt")
private Resource stopDic;
private static AnsjSeg ansjSeg;
@Autowired
private AnsjSeg ansjSeg;
@PostConstruct
public void init() {
......@@ -105,7 +107,7 @@ public class TextUtil {
return Tools.sortMap(result, maxSize);
}
public static List<JSONObject> getHighWordsJson(List<String> texts, Integer maxSize) {
public List<JSONObject> getHighWordsJson(List<String> texts, Integer maxSize) {
// Map<String, Integer> highWords = getHighWords(texts, maxSize);
List<Map.Entry<String, Integer>> wordRate = ansjSeg.getFenCi(texts, maxSize);
List<JSONObject> result = new ArrayList<>(wordRate.size());
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment