Commit 53c0e739 by shenjunjie

词云图由hanLp调整为ansj 2

parent 896274b1
...@@ -10,6 +10,7 @@ import org.ansj.recognition.impl.NatureRecognition; ...@@ -10,6 +10,7 @@ import org.ansj.recognition.impl.NatureRecognition;
import org.ansj.splitWord.analysis.NlpAnalysis; import org.ansj.splitWord.analysis.NlpAnalysis;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.springframework.stereotype.Component;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
...@@ -17,59 +18,56 @@ import java.util.List; ...@@ -17,59 +18,56 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
public class AnsjSeg @Component
{ public class AnsjSeg {
private static Logger logger = LogManager.getLogger(AnsjSeg.class); private static Logger logger = LogManager.getLogger(AnsjSeg.class);
private List<String> stopWords = MyDic.getStopWords(); // 停用词集合 private final MyDic myDic;
private List<String> posivtiveWords = MyDic.getPosivtiveWords(); // 正面词集合 private final List<String> stopWords; // 停用词集合
private List<String> negativeWords = MyDic.getNegativeWords(); // 负面词集合 private final List<String> positiveWords; // 正面词集合
private List<String> customWords = MyDic.getCustomWords(); // 自定义词集合 private final List<String> negativeWords; // 负面词集合
private final List<String> customWords; // 自定义词集合
public AnsjSeg(MyDic myDic) {
this.myDic = myDic;
this.stopWords = MyDic.getStopWords();
this.positiveWords = MyDic.getPositiveWords();
this.negativeWords = MyDic.getNegativeWords();
this.customWords = MyDic.getCustomWords();
}
public void addAnsjSeg(List<String> newStopWords, public void addAnsjSeg(List<String> newStopWords,
List<String> newCustomWords) List<String> newCustomWords) {
{ if (newStopWords != null) {
if (newStopWords != null)
{
this.stopWords.addAll(newStopWords); this.stopWords.addAll(newStopWords);
} }
if (newCustomWords != null) if (newCustomWords != null) {
{
this.customWords.addAll(newCustomWords); this.customWords.addAll(newCustomWords);
} }
} }
/** /**
* @param dataList 设定文件
* @return HashMap<String, Object> 返回类型
* @Title: getFenCi * @Title: getFenCi
* @Description: TODO(针对集合分词统计,并输出正负面词汇) * @Description: TODO(针对集合分词统计, 并输出正负面词汇)
* @param dataList
* 设定文件
* @return HashMap<String,Object> 返回类型
*/ */
public Map<String, Object> getFenCi(List<String> dataList) public Map<String, Object> getFenCi(List<String> dataList) {
{
Map<String, Object> map = new HashMap<String, Object>(); // 分词总结果 Map<String, Object> map = new HashMap<String, Object>(); // 分词总结果
Map<String, Integer> hash = new HashMap<String, Integer>(); // 初步分词结果 Map<String, Integer> hash = new HashMap<String, Integer>(); // 初步分词结果
// 统计分词 // 统计分词
for (String txt : dataList) for (String txt : dataList) {
{
List<Term> termList = getFenci(txt); List<Term> termList = getFenci(txt);
if (termList != null) if (termList != null) {
{ for (Term term : termList) {
for (Term term : termList)
{
String words = term.getName(); String words = term.getName();
// 去除停用词和单词 // 去除停用词和单词
if (words.length() > 1) if (words.length() > 1) {
{ if (hash.containsKey(words)) {
if (hash.containsKey(words))
{
hash.put(words, hash.get(words) + 1); hash.put(words, hash.get(words) + 1);
} } else {
else
{
hash.put(words, 1); hash.put(words, 1);
} }
} }
...@@ -82,44 +80,34 @@ public class AnsjSeg ...@@ -82,44 +80,34 @@ public class AnsjSeg
// 对分词结果排序 // 对分词结果排序
List<Entry<String, Integer>> resultList = TreatOrder List<Entry<String, Integer>> resultList = TreatOrder
.treatOrderByCountDesc(hash); .treatOrderByCountDesc(hash);
try try {
{
// 统计正负面关键词 // 统计正负面关键词
for (Entry<String, Integer> entry : resultList) for (Entry<String, Integer> entry : resultList) {
{
String word = entry.getKey(); String word = entry.getKey();
if (posivtiveWords.contains(word)) if (positiveWords.contains(word)) {
{
HashMap<String, Object> goodmap = new HashMap<String, Object>(); HashMap<String, Object> goodmap = new HashMap<String, Object>();
goodmap.put("key", entry.getKey()); goodmap.put("key", entry.getKey());
goodmap.put("value", entry.getValue()); goodmap.put("value", entry.getValue());
goodResultList.add(goodmap); goodResultList.add(goodmap);
} } else if (negativeWords.contains(word)) {
else if (negativeWords.contains(word))
{
HashMap<String, Object> badmap = new HashMap<String, Object>(); HashMap<String, Object> badmap = new HashMap<String, Object>();
badmap.put("key", entry.getKey()); badmap.put("key", entry.getKey());
badmap.put("value", entry.getValue()); badmap.put("value", entry.getValue());
badResultList.add(badmap); badResultList.add(badmap);
} }
} }
} } catch (Exception e) {
catch (Exception e)
{
e.printStackTrace(); e.printStackTrace();
} }
// 输出正负面关键词 // 输出正负面关键词
if (goodResultList.size() >= 10) if (goodResultList.size() >= 10) {
{
goodResultList = goodResultList.subList(0, 10); goodResultList = goodResultList.subList(0, 10);
} }
if (badResultList.size() >= 10) if (badResultList.size() >= 10) {
{
badResultList = badResultList.subList(0, 10); badResultList = badResultList.subList(0, 10);
} }
if (resultList.size() >= 20) if (resultList.size() >= 20) {
{
resultList = resultList.subList(0, 20); resultList = resultList.subList(0, 20);
} }
...@@ -131,35 +119,28 @@ public class AnsjSeg ...@@ -131,35 +119,28 @@ public class AnsjSeg
} }
/** /**
*
* @TODO (统计分词结果,按频次取前maxNum)
* @author 陈炜涛
* @param dataList * @param dataList
* @param maxNum * @param maxNum
* @return * @return
* @return Map<String, Object>
* @TODO (统计分词结果 , 按频次取前maxNum)
* @author 陈炜涛
* @time 2016年11月16日上午11:06:10 * @time 2016年11月16日上午11:06:10
* @return Map<String,Object>
*/ */
public List<Entry<String, Integer>> getFenCi(List<String> dataList, int maxNum) public List<Entry<String, Integer>> getFenCi(List<String> dataList, int maxNum) {
{
Map<String, Object> map = new HashMap<>(); // 分词总结果 Map<String, Object> map = new HashMap<>(); // 分词总结果
Map<String, Integer> hash = new HashMap<>(); // 初步分词结果 Map<String, Integer> hash = new HashMap<>(); // 初步分词结果
// 统计分词 // 统计分词
for (String txt : dataList) for (String txt : dataList) {
{
List<Term> termList = getFenci(txt); List<Term> termList = getFenci(txt);
if (termList != null) if (termList != null) {
{ for (Term term : termList) {
for (Term term : termList)
{
String words = Tools.filterSpecialCharacter(term.getName()); String words = Tools.filterSpecialCharacter(term.getName());
String wordsPro = term.getNatureStr(); String wordsPro = term.getNatureStr();
// 去除停用词和单词 // 去除停用词和单词
if (words.length() > 1) if (words.length() > 1) {
{ switch (wordsPro) {
switch (wordsPro)
{
case "w": case "w":
break; break;
case "r": case "r":
...@@ -167,12 +148,9 @@ public class AnsjSeg ...@@ -167,12 +148,9 @@ public class AnsjSeg
case "p": case "p":
break; break;
default: default:
if (hash.containsKey(words)) if (hash.containsKey(words)) {
{
hash.put(words, hash.get(words) + 1); hash.put(words, hash.get(words) + 1);
} } else {
else
{
hash.put(words, 1); hash.put(words, 1);
} }
break; break;
...@@ -184,46 +162,36 @@ public class AnsjSeg ...@@ -184,46 +162,36 @@ public class AnsjSeg
Map<String, Integer> stopResults = new HashMap<>(); Map<String, Integer> stopResults = new HashMap<>();
for (Entry<String, Integer> en : hash.entrySet()) for (Entry<String, Integer> en : hash.entrySet()) {
{ if (!stopWords.contains(en.getKey())) {
if (!stopWords.contains(en.getKey()))
{
stopResults.put(en.getKey(), en.getValue()); stopResults.put(en.getKey(), en.getValue());
} }
} }
// 对分词结果排序 // 对分词结果排序
List<Entry<String, Integer>> resultList = treatOrderByCountDesc(stopResults); List<Entry<String, Integer>> resultList = treatOrderByCountDesc(stopResults);
if (resultList.size() >= maxNum) if (resultList.size() >= maxNum) {
{
resultList = resultList.subList(0, maxNum); resultList = resultList.subList(0, maxNum);
} }
return resultList; return resultList;
} }
/** /**
* @param name 设定文件
* @return HashMap<String, Integer> 返回类型
* @Title: getFenCi * @Title: getFenCi
* @Description: TODO(针对真文本分词并统计) * @Description: TODO(针对真文本分词并统计)
* @param name
* 设定文件
* @return HashMap<String,Integer> 返回类型
*/ */
public HashMap<String, Integer> getFenCi(String name) public HashMap<String, Integer> getFenCi(String name) {
{
HashMap<String, Integer> hash = new HashMap<String, Integer>(); // 初步分词结果 HashMap<String, Integer> hash = new HashMap<String, Integer>(); // 初步分词结果
List<Term> termList = getFenci(name); List<Term> termList = getFenci(name);
if (termList != null) if (termList != null) {
{ for (Term term : termList) {
for (Term term : termList)
{
String word = term.getName(); String word = term.getName();
if (hash.containsKey(word)) if (hash.containsKey(word)) {
{
hash.put(word, hash.get(word) + 1); hash.put(word, hash.get(word) + 1);
} } else {
else
{
hash.put(word, 1); hash.put(word, 1);
} }
} }
...@@ -233,23 +201,20 @@ public class AnsjSeg ...@@ -233,23 +201,20 @@ public class AnsjSeg
} }
/** /**
* @return void 返回类型
* @Title: removeStopWord * @Title: removeStopWord
* @Description: TODO(分词) * @Description: TODO(分词)
* @ 设定文件 * @ 设定文件
* @return void 返回类型
*/ */
public List<Term> getFenci(String text) public List<Term> getFenci(String text) {
{ try {
try
{
// 去重停用词(看源码应该是删除分词关键词) // 去重停用词(看源码应该是删除分词关键词)
// for (String word : stopWords) // for (String word : stopWords)
// { // {
// UserDefineLibrary.removeWord(word); // UserDefineLibrary.removeWord(word);
// } // }
// 添加自定义词 // 添加自定义词
for (String word : customWords) for (String word : customWords) {
{
UserDefineLibrary.insertWord(word); UserDefineLibrary.insertWord(word);
} }
// 分词 // 分词
...@@ -258,57 +223,44 @@ public class AnsjSeg ...@@ -258,57 +223,44 @@ public class AnsjSeg
Result result = nlp.parseStr(text); Result result = nlp.parseStr(text);
new NatureRecognition().recognition(result); new NatureRecognition().recognition(result);
return result.getTerms(); return result.getTerms();
} } catch (Exception e) {
catch (Exception e) logger.error("分词出现问题", e);
{
logger.error("分词出现问题");
return null; return null;
} }
} }
/** /**
* * @param dataMap 设定文件
* @return List<Entry < String, Integer>> 返回类型
* @Title: treatOrderByCountDesc * @Title: treatOrderByCountDesc
* @Description: TODO(根据数量降序) * @Description: TODO(根据数量降序)
* @param dataMap
* 设定文件
* @return List<Entry<String,Integer>> 返回类型
*/ */
public static List<Entry<String, Integer>> treatOrderByCountDesc( public static List<Entry<String, Integer>> treatOrderByCountDesc(
Map<String, Integer> dataMap) Map<String, Integer> dataMap) {
{
List<Entry<String, Integer>> list = new ArrayList<>(dataMap.entrySet()); List<Entry<String, Integer>> list = new ArrayList<>(dataMap.entrySet());
list.sort((o1, o2) -> getCompareResult(o1.getValue(), o2.getValue(), false)); list.sort((o1, o2) -> getCompareResult(o1.getValue(), o2.getValue(), false));
return list; return list;
} }
/** /**
* @Title: getCompareResult
* @Description: TODO(排序比较)
* @param time1 * @param time1
* @param time2 * @param time2
* @param asc * @param asc 设定文件
* 设定文件
* @return int 返回类型 * @return int 返回类型
* @Title: getCompareResult
* @Description: TODO(排序比较)
*/ */
private static int getCompareResult(long time1, long time2, boolean asc) private static int getCompareResult(long time1, long time2, boolean asc) {
{
long result; long result;
if (asc) if (asc) {
{
result = time1 - time2; result = time1 - time2;
} } else {
else
{
result = time2 - time1; result = time2 - time1;
} }
if (result > 0) if (result > 0) {
{
result = 1; result = 1;
} } else if (result < 0) {
else if (result < 0)
{
result = -1; result = -1;
} }
return (int) result; return (int) result;
......
...@@ -39,13 +39,11 @@ public class MyDic { ...@@ -39,13 +39,11 @@ public class MyDic {
public void init() { public void init() {
try { try {
// 读取词典 // 读取词典
List<String> customDics = Tools.readListFile(customDic.getInputStream()); customWords = Tools.readListFile(customDic.getInputStream());
List<String> stopDics = Tools.readListFile(stopDic.getInputStream()); stopWords = Tools.readListFile(stopDic.getInputStream());
InputStream inputStream = negativeDic.getInputStream(); negativeWords = Tools.readListFile(negativeDic.getInputStream());
List<String> negativeDic = Tools.readListFile(inputStream); positiveWords= Tools.readListFile(positiveDic.getInputStream());
InputStream inputStream2 = positiveDic.getInputStream(); log.info("ansj自定义词典加载:{}条,停用词加载:{}条,负面词典加载:{}条,正面词典加载:{}条", customWords.size(), stopWords.size(), negativeWords.size(), positiveWords.size());
List<String> positiveDic = Tools.readListFile(inputStream2);
log.info("ansj自定义词典加载:{}条,停用词加载:{}条,负面词典加载:{}条,正面词典加载:{}条", customDics.size(), stopDics.size(), negativeDic.size(), positiveDic.size());
} catch (Exception e) { } catch (Exception e) {
log.info("MyDic-init 异常", e); log.info("MyDic-init 异常", e);
} }
...@@ -78,8 +76,8 @@ public class MyDic { ...@@ -78,8 +76,8 @@ public class MyDic {
* @Description: TODO(获取正面词) * @Description: TODO(获取正面词)
* 设定文件 * 设定文件
*/ */
public static List<String> getPosivtiveWords() { public static List<String> getPositiveWords() {
return negativeWords; return positiveWords;
} }
/** /**
......
...@@ -33,6 +33,7 @@ import org.apache.logging.log4j.LogManager; ...@@ -33,6 +33,7 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.joda.time.Period; import org.joda.time.Period;
import org.joda.time.PeriodType; import org.joda.time.PeriodType;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.mongodb.core.query.Criteria; import org.springframework.data.mongodb.core.query.Criteria;
import org.springframework.data.mongodb.core.query.Query; import org.springframework.data.mongodb.core.query.Query;
import org.springframework.data.mongodb.core.query.Update; import org.springframework.data.mongodb.core.query.Update;
...@@ -81,6 +82,9 @@ public class CustomEventServiceImpl implements CustomEventService { ...@@ -81,6 +82,9 @@ public class CustomEventServiceImpl implements CustomEventService {
@Resource @Resource
MongoUtil mongoUtil; MongoUtil mongoUtil;
@Autowired
TextUtil textUtil;
@Override @Override
public List<JSONObject> getCustomEventRankList(Long startTime, Long endTime) { public List<JSONObject> getCustomEventRankList(Long startTime, Long endTime) {
List<JSONObject> resultList = null; List<JSONObject> resultList = null;
...@@ -375,7 +379,7 @@ public class CustomEventServiceImpl implements CustomEventService { ...@@ -375,7 +379,7 @@ public class CustomEventServiceImpl implements CustomEventService {
private List<JSONObject> getHotKeyword(List<BaseMap> articleList) { private List<JSONObject> getHotKeyword(List<BaseMap> articleList) {
List<String> texts = articleList.stream().map(article -> article.getTitle() + article.getContent()).collect(Collectors.toList()); List<String> texts = articleList.stream().map(article -> article.getTitle() + article.getContent()).collect(Collectors.toList());
//分析热评词 //分析热评词
return TextUtil.getHighWordsJson(texts, 30); return textUtil.getHighWordsJson(texts, 30);
} }
/** /**
......
...@@ -137,6 +137,9 @@ public class MarkDataServiceImpl implements MarkDataService { ...@@ -137,6 +137,9 @@ public class MarkDataServiceImpl implements MarkDataService {
@Resource(name = "mongoUtil") @Resource(name = "mongoUtil")
MongoUtil mongoUtil; MongoUtil mongoUtil;
@Autowired
TextUtil textUtil;
@Override @Override
public PageVO<MarkFlowEntity> getYuqingMarkList(MarkSearchDTO markSearchDTO) { public PageVO<MarkFlowEntity> getYuqingMarkList(MarkSearchDTO markSearchDTO) {
try { try {
...@@ -504,12 +507,13 @@ public class MarkDataServiceImpl implements MarkDataService { ...@@ -504,12 +507,13 @@ public class MarkDataServiceImpl implements MarkDataService {
log.info("es查询size:{},耗时:{}", texts.size(), System.currentTimeMillis() - s); log.info("es查询size:{},耗时:{}", texts.size(), System.currentTimeMillis() - s);
long s1 = System.currentTimeMillis(); long s1 = System.currentTimeMillis();
// 分析高频词 // 分析高频词
List<JSONObject> highWords = TextUtil.getHighWordsJson(texts, 30); List<JSONObject> highWords = textUtil.getHighWordsJson(texts, 30);
log.info("分析高频词耗时:{}", (System.currentTimeMillis() - s1)); log.info("分析高频词耗时:{}", (System.currentTimeMillis() - s1));
redisUtil.setExpire(redisKey, JSON.toJSONString(highWords)); redisUtil.setExpire(redisKey, JSON.toJSONString(highWords));
return highWords; return highWords;
} catch (IOException e) { } catch (IOException e) {
ExceptionCast.cast(CommonCodeEnum.FAIL, "es查询异常", e); log.error("getMarkHighWord-",e);
ExceptionCast.cast(CommonCodeEnum.FAIL);
} }
return null; return null;
} }
......
...@@ -15,6 +15,7 @@ import com.zhiwei.nlp.vo.KResult; ...@@ -15,6 +15,7 @@ import com.zhiwei.nlp.vo.KResult;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.SearchHit;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value; import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource; import org.springframework.core.io.Resource;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
...@@ -39,7 +40,8 @@ public class TextUtil { ...@@ -39,7 +40,8 @@ public class TextUtil {
private Resource customDic; private Resource customDic;
@Value("classpath:wordDic/stopWordDictionary.txt") @Value("classpath:wordDic/stopWordDictionary.txt")
private Resource stopDic; private Resource stopDic;
private static AnsjSeg ansjSeg; @Autowired
private AnsjSeg ansjSeg;
@PostConstruct @PostConstruct
public void init() { public void init() {
...@@ -105,7 +107,7 @@ public class TextUtil { ...@@ -105,7 +107,7 @@ public class TextUtil {
return Tools.sortMap(result, maxSize); return Tools.sortMap(result, maxSize);
} }
public static List<JSONObject> getHighWordsJson(List<String> texts, Integer maxSize) { public List<JSONObject> getHighWordsJson(List<String> texts, Integer maxSize) {
// Map<String, Integer> highWords = getHighWords(texts, maxSize); // Map<String, Integer> highWords = getHighWords(texts, maxSize);
List<Map.Entry<String, Integer>> wordRate = ansjSeg.getFenCi(texts, maxSize); List<Map.Entry<String, Integer>> wordRate = ansjSeg.getFenCi(texts, maxSize);
List<JSONObject> result = new ArrayList<>(wordRate.size()); List<JSONObject> result = new ArrayList<>(wordRate.size());
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment