Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
B
brandkbs2
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
shenjunjie
brandkbs2
Commits
d300091b
Commit
d300091b
authored
Jan 05, 2023
by
shenjunjie
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'feature' into 'dev'
词云图由hanLp调整为ansj 2 See merge request
!141
parents
9928b97f
53c0e739
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
97 additions
and
137 deletions
+97
-137
src/main/java/com/zhiwei/brandkbs2/ansjSeg/AnsjSeg.java
+75
-123
src/main/java/com/zhiwei/brandkbs2/ansjSeg/MyDic.java
+7
-9
src/main/java/com/zhiwei/brandkbs2/service/impl/CustomEventServiceImpl.java
+5
-1
src/main/java/com/zhiwei/brandkbs2/service/impl/MarkDataServiceImpl.java
+6
-2
src/main/java/com/zhiwei/brandkbs2/util/TextUtil.java
+4
-2
No files found.
src/main/java/com/zhiwei/brandkbs2/ansjSeg/AnsjSeg.java
View file @
d300091b
...
...
@@ -10,6 +10,7 @@ import org.ansj.recognition.impl.NatureRecognition;
import
org.ansj.splitWord.analysis.NlpAnalysis
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.springframework.stereotype.Component
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
...
...
@@ -17,59 +18,56 @@ import java.util.List;
import
java.util.Map
;
import
java.util.Map.Entry
;
public
class
AnsjSeg
{
@Component
public
class
AnsjSeg
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
AnsjSeg
.
class
);
private
List
<
String
>
stopWords
=
MyDic
.
getStopWords
();
// 停用词集合
private
List
<
String
>
posivtiveWords
=
MyDic
.
getPosivtiveWords
();
// 正面词集合
private
List
<
String
>
negativeWords
=
MyDic
.
getNegativeWords
();
// 负面词集合
private
List
<
String
>
customWords
=
MyDic
.
getCustomWords
();
// 自定义词集合
private
final
MyDic
myDic
;
private
final
List
<
String
>
stopWords
;
// 停用词集合
private
final
List
<
String
>
positiveWords
;
// 正面词集合
private
final
List
<
String
>
negativeWords
;
// 负面词集合
private
final
List
<
String
>
customWords
;
// 自定义词集合
public
AnsjSeg
(
MyDic
myDic
)
{
this
.
myDic
=
myDic
;
this
.
stopWords
=
MyDic
.
getStopWords
();
this
.
positiveWords
=
MyDic
.
getPositiveWords
();
this
.
negativeWords
=
MyDic
.
getNegativeWords
();
this
.
customWords
=
MyDic
.
getCustomWords
();
}
public
void
addAnsjSeg
(
List
<
String
>
newStopWords
,
List
<
String
>
newCustomWords
)
{
if
(
newStopWords
!=
null
)
{
List
<
String
>
newCustomWords
)
{
if
(
newStopWords
!=
null
)
{
this
.
stopWords
.
addAll
(
newStopWords
);
}
if
(
newCustomWords
!=
null
)
{
if
(
newCustomWords
!=
null
)
{
this
.
customWords
.
addAll
(
newCustomWords
);
}
}
/**
* @param dataList 设定文件
* @return HashMap<String, Object> 返回类型
* @Title: getFenCi
* @Description: TODO(针对集合分词统计,并输出正负面词汇)
* @param dataList
* 设定文件
* @return HashMap<String,Object> 返回类型
* @Description: TODO(针对集合分词统计, 并输出正负面词汇)
*/
public
Map
<
String
,
Object
>
getFenCi
(
List
<
String
>
dataList
)
{
public
Map
<
String
,
Object
>
getFenCi
(
List
<
String
>
dataList
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
// 分词总结果
Map
<
String
,
Integer
>
hash
=
new
HashMap
<
String
,
Integer
>();
// 初步分词结果
// 统计分词
for
(
String
txt
:
dataList
)
{
for
(
String
txt
:
dataList
)
{
List
<
Term
>
termList
=
getFenci
(
txt
);
if
(
termList
!=
null
)
{
for
(
Term
term
:
termList
)
{
if
(
termList
!=
null
)
{
for
(
Term
term
:
termList
)
{
String
words
=
term
.
getName
();
// 去除停用词和单词
if
(
words
.
length
()
>
1
)
{
if
(
hash
.
containsKey
(
words
))
{
if
(
words
.
length
()
>
1
)
{
if
(
hash
.
containsKey
(
words
))
{
hash
.
put
(
words
,
hash
.
get
(
words
)
+
1
);
}
else
{
}
else
{
hash
.
put
(
words
,
1
);
}
}
...
...
@@ -82,44 +80,34 @@ public class AnsjSeg
// 对分词结果排序
List
<
Entry
<
String
,
Integer
>>
resultList
=
TreatOrder
.
treatOrderByCountDesc
(
hash
);
try
{
try
{
// 统计正负面关键词
for
(
Entry
<
String
,
Integer
>
entry
:
resultList
)
{
for
(
Entry
<
String
,
Integer
>
entry
:
resultList
)
{
String
word
=
entry
.
getKey
();
if
(
posivtiveWords
.
contains
(
word
))
{
if
(
positiveWords
.
contains
(
word
))
{
HashMap
<
String
,
Object
>
goodmap
=
new
HashMap
<
String
,
Object
>();
goodmap
.
put
(
"key"
,
entry
.
getKey
());
goodmap
.
put
(
"value"
,
entry
.
getValue
());
goodResultList
.
add
(
goodmap
);
}
else
if
(
negativeWords
.
contains
(
word
))
{
}
else
if
(
negativeWords
.
contains
(
word
))
{
HashMap
<
String
,
Object
>
badmap
=
new
HashMap
<
String
,
Object
>();
badmap
.
put
(
"key"
,
entry
.
getKey
());
badmap
.
put
(
"value"
,
entry
.
getValue
());
badResultList
.
add
(
badmap
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
// 输出正负面关键词
if
(
goodResultList
.
size
()
>=
10
)
{
if
(
goodResultList
.
size
()
>=
10
)
{
goodResultList
=
goodResultList
.
subList
(
0
,
10
);
}
if
(
badResultList
.
size
()
>=
10
)
{
if
(
badResultList
.
size
()
>=
10
)
{
badResultList
=
badResultList
.
subList
(
0
,
10
);
}
if
(
resultList
.
size
()
>=
20
)
{
if
(
resultList
.
size
()
>=
20
)
{
resultList
=
resultList
.
subList
(
0
,
20
);
}
...
...
@@ -131,35 +119,28 @@ public class AnsjSeg
}
/**
*
* @TODO (统计分词结果,按频次取前maxNum)
* @author 陈炜涛
* @param dataList
* @param maxNum
* @return
* @return Map<String, Object>
* @TODO (统计分词结果 , 按频次取前maxNum)
* @author 陈炜涛
* @time 2016年11月16日上午11:06:10
* @return Map<String,Object>
*/
public
List
<
Entry
<
String
,
Integer
>>
getFenCi
(
List
<
String
>
dataList
,
int
maxNum
)
{
public
List
<
Entry
<
String
,
Integer
>>
getFenCi
(
List
<
String
>
dataList
,
int
maxNum
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
// 分词总结果
Map
<
String
,
Integer
>
hash
=
new
HashMap
<>();
// 初步分词结果
// 统计分词
for
(
String
txt
:
dataList
)
{
for
(
String
txt
:
dataList
)
{
List
<
Term
>
termList
=
getFenci
(
txt
);
if
(
termList
!=
null
)
{
for
(
Term
term
:
termList
)
{
if
(
termList
!=
null
)
{
for
(
Term
term
:
termList
)
{
String
words
=
Tools
.
filterSpecialCharacter
(
term
.
getName
());
String
wordsPro
=
term
.
getNatureStr
();
// 去除停用词和单词
if
(
words
.
length
()
>
1
)
{
switch
(
wordsPro
)
{
if
(
words
.
length
()
>
1
)
{
switch
(
wordsPro
)
{
case
"w"
:
break
;
case
"r"
:
...
...
@@ -167,12 +148,9 @@ public class AnsjSeg
case
"p"
:
break
;
default
:
if
(
hash
.
containsKey
(
words
))
{
if
(
hash
.
containsKey
(
words
))
{
hash
.
put
(
words
,
hash
.
get
(
words
)
+
1
);
}
else
{
}
else
{
hash
.
put
(
words
,
1
);
}
break
;
...
...
@@ -184,46 +162,36 @@ public class AnsjSeg
Map
<
String
,
Integer
>
stopResults
=
new
HashMap
<>();
for
(
Entry
<
String
,
Integer
>
en
:
hash
.
entrySet
())
{
if
(!
stopWords
.
contains
(
en
.
getKey
()))
{
for
(
Entry
<
String
,
Integer
>
en
:
hash
.
entrySet
())
{
if
(!
stopWords
.
contains
(
en
.
getKey
()))
{
stopResults
.
put
(
en
.
getKey
(),
en
.
getValue
());
}
}
// 对分词结果排序
List
<
Entry
<
String
,
Integer
>>
resultList
=
treatOrderByCountDesc
(
stopResults
);
if
(
resultList
.
size
()
>=
maxNum
)
{
if
(
resultList
.
size
()
>=
maxNum
)
{
resultList
=
resultList
.
subList
(
0
,
maxNum
);
}
return
resultList
;
}
/**
* @param name 设定文件
* @return HashMap<String, Integer> 返回类型
* @Title: getFenCi
* @Description: TODO(针对真文本分词并统计)
* @param name
* 设定文件
* @return HashMap<String,Integer> 返回类型
*/
public
HashMap
<
String
,
Integer
>
getFenCi
(
String
name
)
{
public
HashMap
<
String
,
Integer
>
getFenCi
(
String
name
)
{
HashMap
<
String
,
Integer
>
hash
=
new
HashMap
<
String
,
Integer
>();
// 初步分词结果
List
<
Term
>
termList
=
getFenci
(
name
);
if
(
termList
!=
null
)
{
for
(
Term
term
:
termList
)
{
if
(
termList
!=
null
)
{
for
(
Term
term
:
termList
)
{
String
word
=
term
.
getName
();
if
(
hash
.
containsKey
(
word
))
{
if
(
hash
.
containsKey
(
word
))
{
hash
.
put
(
word
,
hash
.
get
(
word
)
+
1
);
}
else
{
}
else
{
hash
.
put
(
word
,
1
);
}
}
...
...
@@ -233,23 +201,20 @@ public class AnsjSeg
}
/**
* @return void 返回类型
* @Title: removeStopWord
* @Description: TODO(分词)
* @ 设定文件
* @return void 返回类型
*/
public
List
<
Term
>
getFenci
(
String
text
)
{
try
{
public
List
<
Term
>
getFenci
(
String
text
)
{
try
{
// 去重停用词(看源码应该是删除分词关键词)
// for (String word : stopWords)
// {
// UserDefineLibrary.removeWord(word);
// }
// 添加自定义词
for
(
String
word
:
customWords
)
{
for
(
String
word
:
customWords
)
{
UserDefineLibrary
.
insertWord
(
word
);
}
// 分词
...
...
@@ -258,57 +223,44 @@ public class AnsjSeg
Result
result
=
nlp
.
parseStr
(
text
);
new
NatureRecognition
().
recognition
(
result
);
return
result
.
getTerms
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"分词出现问题"
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"分词出现问题"
,
e
);
return
null
;
}
}
/**
*
* @param dataMap 设定文件
* @return List<Entry < String, Integer>> 返回类型
* @Title: treatOrderByCountDesc
* @Description: TODO(根据数量降序)
* @param dataMap
* 设定文件
* @return List<Entry<String,Integer>> 返回类型
*/
public
static
List
<
Entry
<
String
,
Integer
>>
treatOrderByCountDesc
(
Map
<
String
,
Integer
>
dataMap
)
{
Map
<
String
,
Integer
>
dataMap
)
{
List
<
Entry
<
String
,
Integer
>>
list
=
new
ArrayList
<>(
dataMap
.
entrySet
());
list
.
sort
((
o1
,
o2
)
->
getCompareResult
(
o1
.
getValue
(),
o2
.
getValue
(),
false
));
return
list
;
}
/**
* @Title: getCompareResult
* @Description: TODO(排序比较)
* @param time1
* @param time2
* @param asc
* 设定文件
* @param asc 设定文件
* @return int 返回类型
* @Title: getCompareResult
* @Description: TODO(排序比较)
*/
private
static
int
getCompareResult
(
long
time1
,
long
time2
,
boolean
asc
)
{
private
static
int
getCompareResult
(
long
time1
,
long
time2
,
boolean
asc
)
{
long
result
;
if
(
asc
)
{
if
(
asc
)
{
result
=
time1
-
time2
;
}
else
{
}
else
{
result
=
time2
-
time1
;
}
if
(
result
>
0
)
{
if
(
result
>
0
)
{
result
=
1
;
}
else
if
(
result
<
0
)
{
}
else
if
(
result
<
0
)
{
result
=
-
1
;
}
return
(
int
)
result
;
...
...
src/main/java/com/zhiwei/brandkbs2/ansjSeg/MyDic.java
View file @
d300091b
...
...
@@ -39,13 +39,11 @@ public class MyDic {
public
void
init
()
{
try
{
// 读取词典
List
<
String
>
customDics
=
Tools
.
readListFile
(
customDic
.
getInputStream
());
List
<
String
>
stopDics
=
Tools
.
readListFile
(
stopDic
.
getInputStream
());
InputStream
inputStream
=
negativeDic
.
getInputStream
();
List
<
String
>
negativeDic
=
Tools
.
readListFile
(
inputStream
);
InputStream
inputStream2
=
positiveDic
.
getInputStream
();
List
<
String
>
positiveDic
=
Tools
.
readListFile
(
inputStream2
);
log
.
info
(
"ansj自定义词典加载:{}条,停用词加载:{}条,负面词典加载:{}条,正面词典加载:{}条"
,
customDics
.
size
(),
stopDics
.
size
(),
negativeDic
.
size
(),
positiveDic
.
size
());
customWords
=
Tools
.
readListFile
(
customDic
.
getInputStream
());
stopWords
=
Tools
.
readListFile
(
stopDic
.
getInputStream
());
negativeWords
=
Tools
.
readListFile
(
negativeDic
.
getInputStream
());
positiveWords
=
Tools
.
readListFile
(
positiveDic
.
getInputStream
());
log
.
info
(
"ansj自定义词典加载:{}条,停用词加载:{}条,负面词典加载:{}条,正面词典加载:{}条"
,
customWords
.
size
(),
stopWords
.
size
(),
negativeWords
.
size
(),
positiveWords
.
size
());
}
catch
(
Exception
e
)
{
log
.
info
(
"MyDic-init 异常"
,
e
);
}
...
...
@@ -78,8 +76,8 @@ public class MyDic {
* @Description: TODO(获取正面词)
* 设定文件
*/
public
static
List
<
String
>
getPosi
v
tiveWords
()
{
return
nega
tiveWords
;
public
static
List
<
String
>
getPositiveWords
()
{
return
posi
tiveWords
;
}
/**
...
...
src/main/java/com/zhiwei/brandkbs2/service/impl/CustomEventServiceImpl.java
View file @
d300091b
...
...
@@ -33,6 +33,7 @@ import org.apache.logging.log4j.LogManager;
import
org.apache.logging.log4j.Logger
;
import
org.joda.time.Period
;
import
org.joda.time.PeriodType
;
import
org.springframework.beans.factory.annotation.Autowired
;
import
org.springframework.data.mongodb.core.query.Criteria
;
import
org.springframework.data.mongodb.core.query.Query
;
import
org.springframework.data.mongodb.core.query.Update
;
...
...
@@ -81,6 +82,9 @@ public class CustomEventServiceImpl implements CustomEventService {
@Resource
MongoUtil
mongoUtil
;
@Autowired
TextUtil
textUtil
;
@Override
public
List
<
JSONObject
>
getCustomEventRankList
(
Long
startTime
,
Long
endTime
)
{
List
<
JSONObject
>
resultList
=
null
;
...
...
@@ -375,7 +379,7 @@ public class CustomEventServiceImpl implements CustomEventService {
private
List
<
JSONObject
>
getHotKeyword
(
List
<
BaseMap
>
articleList
)
{
List
<
String
>
texts
=
articleList
.
stream
().
map
(
article
->
article
.
getTitle
()
+
article
.
getContent
()).
collect
(
Collectors
.
toList
());
//分析热评词
return
T
extUtil
.
getHighWordsJson
(
texts
,
30
);
return
t
extUtil
.
getHighWordsJson
(
texts
,
30
);
}
/**
...
...
src/main/java/com/zhiwei/brandkbs2/service/impl/MarkDataServiceImpl.java
View file @
d300091b
...
...
@@ -137,6 +137,9 @@ public class MarkDataServiceImpl implements MarkDataService {
@Resource
(
name
=
"mongoUtil"
)
MongoUtil
mongoUtil
;
@Autowired
TextUtil
textUtil
;
@Override
public
PageVO
<
MarkFlowEntity
>
getYuqingMarkList
(
MarkSearchDTO
markSearchDTO
)
{
try
{
...
...
@@ -504,12 +507,13 @@ public class MarkDataServiceImpl implements MarkDataService {
log
.
info
(
"es查询size:{},耗时:{}"
,
texts
.
size
(),
System
.
currentTimeMillis
()
-
s
);
long
s1
=
System
.
currentTimeMillis
();
// 分析高频词
List
<
JSONObject
>
highWords
=
T
extUtil
.
getHighWordsJson
(
texts
,
30
);
List
<
JSONObject
>
highWords
=
t
extUtil
.
getHighWordsJson
(
texts
,
30
);
log
.
info
(
"分析高频词耗时:{}"
,
(
System
.
currentTimeMillis
()
-
s1
));
redisUtil
.
setExpire
(
redisKey
,
JSON
.
toJSONString
(
highWords
));
return
highWords
;
}
catch
(
IOException
e
)
{
ExceptionCast
.
cast
(
CommonCodeEnum
.
FAIL
,
"es查询异常"
,
e
);
log
.
error
(
"getMarkHighWord-"
,
e
);
ExceptionCast
.
cast
(
CommonCodeEnum
.
FAIL
);
}
return
null
;
}
...
...
src/main/java/com/zhiwei/brandkbs2/util/TextUtil.java
View file @
d300091b
...
...
@@ -15,6 +15,7 @@ import com.zhiwei.nlp.vo.KResult;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.elasticsearch.search.SearchHit
;
import
org.springframework.beans.factory.annotation.Autowired
;
import
org.springframework.beans.factory.annotation.Value
;
import
org.springframework.core.io.Resource
;
import
org.springframework.stereotype.Component
;
...
...
@@ -39,7 +40,8 @@ public class TextUtil {
private
Resource
customDic
;
@Value
(
"classpath:wordDic/stopWordDictionary.txt"
)
private
Resource
stopDic
;
private
static
AnsjSeg
ansjSeg
;
@Autowired
private
AnsjSeg
ansjSeg
;
@PostConstruct
public
void
init
()
{
...
...
@@ -105,7 +107,7 @@ public class TextUtil {
return
Tools
.
sortMap
(
result
,
maxSize
);
}
public
static
List
<
JSONObject
>
getHighWordsJson
(
List
<
String
>
texts
,
Integer
maxSize
)
{
public
List
<
JSONObject
>
getHighWordsJson
(
List
<
String
>
texts
,
Integer
maxSize
)
{
// Map<String, Integer> highWords = getHighWords(texts, maxSize);
List
<
Map
.
Entry
<
String
,
Integer
>>
wordRate
=
ansjSeg
.
getFenCi
(
texts
,
maxSize
);
List
<
JSONObject
>
result
=
new
ArrayList
<>(
wordRate
.
size
());
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment