Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
titleAggregation
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
xuyimeng
titleAggregation
Commits
ce5b9fa0
Commit
ce5b9fa0
authored
Jan 02, 2018
by
win7
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
标题聚合工具类
parent
78033265
Show whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
1088 additions
and
103 deletions
+1088
-103
src/main/java/com/zhiweidata/titleAggregation/algorithm/Algorithm.java
+19
-0
src/main/java/com/zhiweidata/titleAggregation/algorithm/AllAlgorithm.java
+48
-0
src/main/java/com/zhiweidata/titleAggregation/algorithm/impl/CosineSimilarity.java
+157
-0
src/main/java/com/zhiweidata/titleAggregation/algorithm/impl/CutPage.java
+106
-0
src/main/java/com/zhiweidata/titleAggregation/algorithm/impl/MySimHash.java
+272
-0
src/main/java/com/zhiweidata/titleAggregation/main/ClusterResult.java
+1
-2
src/main/java/com/zhiweidata/titleAggregation/main/ClusterUtil.java
+52
-10
src/main/java/com/zhiweidata/titleAggregation/main/HCluster.java
+27
-61
src/main/java/com/zhiweidata/titleAggregation/main/Means.java
+69
-0
src/main/java/com/zhiweidata/titleAggregation/method/CosineSimilarity.java
+4
-3
src/main/java/com/zhiweidata/titleAggregation/method/CutPage.java
+1
-1
src/main/java/com/zhiweidata/titleAggregation/method/MySimHash.java
+16
-11
src/main/java/com/zhiweidata/titleAggregation/util/BasicUtil.java
+109
-2
src/main/java/com/zhiweidata/titleAggregation/util/ComputeWordsVector.java
+111
-0
src/main/java/com/zhiweidata/titleAggregation/util/ThreadPool.java
+83
-0
src/test/java/StartTest/ExcelTest.java
+4
-4
src/test/java/StartTest/MongoStart.java
+8
-9
src/test/java/StartTest/ResultTest.java
+1
-0
No files found.
src/main/java/com/zhiweidata/titleAggregation/algorithm/Algorithm.java
0 → 100644
View file @
ce5b9fa0
package
com
.
zhiweidata
.
titleAggregation
.
algorithm
;
import
java.util.List
;
/**
 * Common contract shared by the text-similarity algorithms of this package.
 *
 * <p>The three overloads compare texts at different arities: a single pair, one text
 * against a list, and every pair inside a list. The scale of the returned score is
 * defined by the implementing class.
 *
 * @author xuyimeng
 * @date 2017-12-29
 */
public interface Algorithm {

    /**
     * Computes the similarity between two texts.
     *
     * @param text1 first text
     * @param text2 second text
     * @return similarity score of the pair
     */
    double getSimilarity(String text1, String text2);

    /**
     * Computes the similarity between one text and each element of a list.
     *
     * @param text1 texts to compare against
     * @param text2 the single text compared with every list element
     * @return one score per list element
     */
    double[] getSimilarity(List<String> text1, String text2);

    /**
     * Computes pairwise similarities inside a list of texts.
     *
     * @param text texts to compare with each other
     * @return square matrix of scores indexed by list position
     */
    double[][] getSimilarity(List<String> text);
}
src/main/java/com/zhiweidata/titleAggregation/algorithm/AllAlgorithm.java
0 → 100644
View file @
ce5b9fa0
/**
* @Title: AllAlgorithm.java
* @Package com.zhiweidata.titleAggregation.algorithm
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2017年12月29日 下午2:23:02
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
titleAggregation
.
algorithm
;
import
com.zhiweidata.titleAggregation.algorithm.impl.CosineSimilarity
;
import
com.zhiweidata.titleAggregation.algorithm.impl.CutPage
;
import
com.zhiweidata.titleAggregation.algorithm.impl.MySimHash
;
/**
 * Static registry that hands out the shared {@link Algorithm} instances.
 *
 * <p>One instance of each implementation is created when the class is loaded and the
 * same object is returned on every request.
 *
 * @author xuyimeng
 * @date 2017-12-29
 */
public class AllAlgorithm {

    /**
     * Selector passed to {@link #getInstance(goal)}: {@code hash} maps to
     * {@link MySimHash}, {@code cut} to {@link CutPage}, {@code cos} to
     * {@link CosineSimilarity}.
     */
    public enum goal {
        hash, cut, cos
    }

    private static final Algorithm simHash = new MySimHash();
    private static final Algorithm cosineSimilarity = new CosineSimilarity();
    private static final Algorithm cutpage = new CutPage();

    /** Not instantiable; all access goes through {@link #getInstance(goal)}. */
    private AllAlgorithm() {}

    /**
     * Returns the shared algorithm instance for the given selector.
     *
     * @param state which algorithm to return; must not be {@code null}
     * @return the cosine instance for {@code cos}, the cut-page instance for {@code cut},
     *         and the sim-hash instance for anything else
     */
    public static Algorithm getInstance(goal state) {
        switch (state) {
            case cos:
                return cosineSimilarity;
            case cut:
                return cutpage;
            default:
                return simHash;
        }
    }
}
src/main/java/com/zhiweidata/titleAggregation/algorithm/impl/CosineSimilarity.java
0 → 100644
View file @
ce5b9fa0
package
com
.
zhiweidata
.
titleAggregation
.
algorithm
.
impl
;
import
java.io.UnsupportedEncodingException
;
import
java.util.HashMap
;
import
java.util.Iterator
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiweidata.titleAggregation.algorithm.Algorithm
;
/**
 * Character-level cosine similarity for Chinese text.
 *
 * <p>Each text is reduced to a frequency vector over the Han characters it contains
 * (keyed by the character's position in GB2312); the score is the cosine of the angle
 * between the two vectors, so values closer to 1 mean more similar texts.
 *
 * @author xuyimeng
 * @date 2017-12-26
 */
public class CosineSimilarity implements Algorithm {

    /**
     * Computes the cosine similarity of two texts from their Han-character frequencies.
     *
     * <p>Characters outside the range accepted by {@link #isHanZi(char)} or without a
     * GB2312 position are ignored. When either text contributes no countable character
     * its vector has zero length; the method then returns {@code 0.0} instead of the
     * {@code NaN} a 0/0 division would produce.
     *
     * @param doc1 first text
     * @param doc2 second text
     * @return cosine similarity, or {@code 0.0} if either text has no countable character
     * @throws NullPointerException if either text is {@code null} or blank
     */
    public static double CalculateTextSim(String doc1, String doc2) {
        boolean doc1Usable = doc1 != null && doc1.trim().length() > 0;
        boolean doc2Usable = doc2 != null && doc2.trim().length() > 0;
        if (!doc1Usable || !doc2Usable) {
            throw new NullPointerException("the Document is null or have not cahrs!!");
        }

        // key: GB2312 position of a character; value: {count in doc1, count in doc2}
        Map<Integer, int[]> frequencies = new HashMap<Integer, int[]>();
        countHanZi(doc1, 0, frequencies);
        countHanZi(doc2, 1, frequencies);

        double dotProduct = 0;
        double sqdoc1 = 0;
        double sqdoc2 = 0;
        for (int[] c : frequencies.values()) {
            // multiply as double so very large counts cannot overflow int
            dotProduct += (double) c[0] * c[1];
            sqdoc1 += (double) c[0] * c[0];
            sqdoc2 += (double) c[1] * c[1];
        }

        if (sqdoc1 == 0 || sqdoc2 == 0) {
            // at least one text had no countable Han character: avoid 0/0 == NaN
            return 0.0;
        }
        return dotProduct / Math.sqrt(sqdoc1 * sqdoc2);
    }

    /**
     * Adds the Han characters of {@code doc} to the shared frequency table.
     *
     * @param doc         text to scan
     * @param slot        index inside each count pair to increment (0 or 1)
     * @param frequencies table updated in place
     */
    private static void countHanZi(String doc, int slot, Map<Integer, int[]> frequencies) {
        for (int i = 0; i < doc.length(); i++) {
            char ch = doc.charAt(i);
            if (!isHanZi(ch)) {
                continue;
            }
            int charIndex = getGB2312Id(ch);
            if (charIndex == -1) {
                continue;
            }
            int[] fq = frequencies.get(charIndex);
            if (fq == null) {
                fq = new int[2];
                frequencies.put(charIndex, fq);
            }
            fq[slot]++;
        }
    }

    /**
     * Tells whether a character lies in the range U+4E00..U+9FA5.
     *
     * @param ch character to test
     * @return {@code true} if the character is inside that range
     */
    public static boolean isHanZi(char ch) {
        return ch >= 0x4E00 && ch <= 0x9FA5;
    }

    /**
     * Maps a character to its position in the GB2312 table.
     *
     * @param ch character to look up
     * @return {@code (b0 - 0xA1) * 94 + (b1 - 0xA1)} for the two GB2312 bytes of the
     *         character, or {@code -1} if it does not encode to exactly two bytes or
     *         the charset is unavailable
     */
    public static short getGB2312Id(char ch) {
        try {
            byte[] buffer = Character.toString(ch).getBytes("GB2312");
            if (buffer.length != 2) {
                // not a two-byte GB2312 character: report it as unknown
                return -1;
            }
            // both bytes start at 0xA1 (161)
            int b0 = (buffer[0] & 0x0FF) - 161;
            int b1 = (buffer[1] & 0x0FF) - 161;
            // each row holds 94 positions
            return (short) (b0 * 94 + b1);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return -1;
    }

    /**
     * Computes the cosine similarity of every pair of titles.
     *
     * <p>Only the entries with row index smaller than column index are filled; the
     * diagonal and the lower triangle keep their default {@code 0.0}.
     *
     * @param titles texts to compare with each other
     * @return {@code size x size} matrix of similarities
     */
    @Override
    public double[][] getSimilarity(List<String> titles) {
        int size = titles.size();
        double[][] distance = new double[size][size];
        for (int i = 0; i < size - 1; i++) {
            String doc1 = titles.get(i);
            for (int j = i + 1; j < size; j++) {
                distance[i][j] = CalculateTextSim(doc1, titles.get(j));
            }
        }
        return distance;
    }

    /**
     * Computes the cosine similarity of two texts; see
     * {@link #CalculateTextSim(String, String)}.
     */
    @Override
    public double getSimilarity(String text1, String text2) {
        return CalculateTextSim(text1, text2);
    }

    /**
     * Computes the cosine similarity between one text and each element of a list.
     *
     * @param list texts to compare against
     * @param doc1 the single text
     * @return one similarity per list element, in list order
     */
    @Override
    public double[] getSimilarity(List<String> list, String doc1) {
        double[] distance = new double[list.size()];
        for (int i = 0; i < distance.length; i++) {
            distance[i] = CalculateTextSim(doc1, list.get(i));
        }
        return distance;
    }
}
\ No newline at end of file
src/main/java/com/zhiweidata/titleAggregation/algorithm/impl/CutPage.java
0 → 100644
View file @
ce5b9fa0
/**
* @Title: CutPage.java
* @Package com.zhiweidata.titleAggregation.algorithm.impl
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2017年12月27日 下午2:24:06
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
titleAggregation
.
algorithm
.
impl
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.List
;
import
com.zhiweidata.titleAggregation.algorithm.Algorithm
;
import
com.zhiweidata.titleAggregation.algorithm.AllAlgorithm
;
import
com.zhiweidata.titleAggregation.algorithm.AllAlgorithm.goal
;
/**
 * Similarity for longer texts: both texts are cut into fragments and every fragment of
 * the first text is matched against its best counterpart in the second text using the
 * cosine algorithm.
 *
 * @author xuyimeng
 * @date 2017-12-27
 */
public class CutPage implements Algorithm {

    /**
     * Scores how well {@code goalText} is covered by {@code testText}.
     *
     * <p>For each non-blank fragment of {@code goalText} the highest cosine similarity
     * against the non-blank fragments of {@code testText} is taken; the result is the
     * mean of those maxima. Blank fragments are skipped because the cosine
     * implementation rejects blank input.
     *
     * @param goalText text whose fragments are looked up
     * @param testText text searched for matching fragments
     * @return mean best-match similarity, or {@code 0.0} when {@code goalText} yields no
     *         non-blank fragment
     */
    public static double getSemblance(String goalText, String testText) {
        List<String> goalTexts = withoutBlanks(splitString(goalText));
        List<String> textTexts = withoutBlanks(splitString(testText));
        if (goalTexts.isEmpty()) {
            // nothing to match: avoid dividing by zero below
            return 0.0;
        }

        Algorithm cos = AllAlgorithm.getInstance(goal.cos);
        double result = 0;
        for (String fragment : goalTexts) {
            double maxSim = 0;
            for (double candidate : cos.getSimilarity(textTexts, fragment)) {
                if (candidate > maxSim) {
                    maxSim = candidate;
                }
            }
            result += maxSim;
        }
        return result / goalTexts.size();
    }

    /** Returns the fragments that still contain something after trimming. */
    private static List<String> withoutBlanks(List<String> fragments) {
        List<String> kept = new ArrayList<>(fragments.size());
        for (String fragment : fragments) {
            if (fragment.trim().length() > 0) {
                kept.add(fragment);
            }
        }
        return kept;
    }

    /**
     * Cuts a text into fragments, first at every "。" and then at every ";".
     *
     * <p>A sentence containing ";" contributes only its parts; it is not added a second
     * time as a whole.
     *
     * @param text text to cut; must not be {@code null}
     * @return fragments in order of appearance
     */
    public static List<String> splitString(String text) {
        List<String> result = new ArrayList<>();
        for (String sentence : text.split("。")) {
            if (sentence.contains(";")) {
                result.addAll(Arrays.asList(sentence.split(";")));
            } else {
                result.add(sentence);
            }
        }
        return result;
    }

    /**
     * Scores each element of {@code text1} against {@code text2}.
     *
     * @param text1 texts used as the goal side of {@link #getSemblance(String, String)}
     * @param text2 text used as the test side
     * @return one score per element of {@code text1}, in list order
     */
    @Override
    public double[] getSimilarity(List<String> text1, String text2) {
        int size = text1.size();
        double[] result = new double[size];
        for (int i = 0; i < size; i++) {
            result[i] = getSemblance(text1.get(i), text2);
        }
        return result;
    }

    /**
     * Scores every pair of texts; only entries with row index smaller than column index
     * are filled, the rest keep their default {@code 0.0}.
     *
     * @param text texts to compare with each other
     * @return {@code size x size} matrix of scores
     */
    @Override
    public double[][] getSimilarity(List<String> text) {
        int size = text.size();
        double[][] result = new double[size][size];
        for (int i = 0; i < size - 1; i++) {
            for (int j = i + 1; j < size; j++) {
                result[i][j] = getSemblance(text.get(i), text.get(j));
            }
        }
        return result;
    }

    /** Scores a single pair; see {@link #getSemblance(String, String)}. */
    @Override
    public double getSimilarity(String text1, String text2) {
        return getSemblance(text1, text2);
    }
}
src/main/java/com/zhiweidata/titleAggregation/algorithm/impl/MySimHash.java
0 → 100644
View file @
ce5b9fa0
package
com
.
zhiweidata
.
titleAggregation
.
algorithm
.
impl
;
import
java.math.BigInteger
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.ansj.domain.Term
;
import
com.zhiweidata.titleAggregation.algorithm.Algorithm
;
import
com.zhiweidata.titleAggregation.util.AnsjSeg
;
/**
 * SimHash fingerprinting of segmented text, with Hamming distance and a derived
 * similarity score.
 *
 * <p>An instance holds the fingerprint of the text last passed to
 * {@link #setTokens(String)}. The list/pair methods create their own instances and do
 * not touch the state of the receiver.
 *
 * @author xuyimeng
 * @date 2017-12-26
 */
public class MySimHash implements Algorithm {

    /** Per part-of-speech weights; terms with any other tag weigh 1. */
    private static final Map<String, Integer> WEIGHT_OF_NATURE = new HashMap<String, Integer>();

    static {
        WEIGHT_OF_NATURE.put("n", 1);
        WEIGHT_OF_NATURE.put("m", 1);
    }

    /** Multiplier used by the per-term hash. */
    private static final BigInteger HASH_MULTIPLIER = BigInteger.valueOf(1000003L);

    /** Text the fingerprint was computed from. */
    private String tokens;

    /** Fingerprint of {@link #tokens}. */
    private BigInteger strSimHash;

    /** Number of bits in a fingerprint. */
    private final int hashbits = 64;

    /**
     * Stores the text and computes its fingerprint.
     *
     * @param tokens text to fingerprint
     */
    public void setTokens(String tokens) {
        this.tokens = tokens;
        strSimHash = simHash();
    }

    /**
     * Builds the fingerprint of {@link #tokens}: every term votes on each bit with its
     * weight (+ when the bit is set in the term hash, - otherwise) and the bit is set in
     * the result when the tally is not negative.
     */
    private BigInteger simHash() {
        int[] v = new int[this.hashbits];
        List<Term> termList = AnsjSeg.getInstance().getTerms(tokens);
        for (Term term : termList) {
            String word = term.getName();
            String nature = term.getNatureStr();
            BigInteger t = hash(word);

            // the weight does not depend on the bit, so look it up once per term
            Integer configured = WEIGHT_OF_NATURE.get(nature);
            int weight = configured != null ? configured : 1;

            for (int i = 0; i < this.hashbits; i++) {
                if (t.testBit(i)) {
                    v[i] += weight;
                } else {
                    v[i] -= weight;
                }
            }
        }

        BigInteger fingerprint = BigInteger.ZERO;
        for (int i = 0; i < this.hashbits; i++) {
            if (v[i] >= 0) {
                fingerprint = fingerprint.setBit(i);
            }
        }
        return fingerprint;
    }

    /**
     * Hashes a single term into a non-negative value of at most {@link #hashbits} bits.
     *
     * @param source term to hash
     * @return zero for a {@code null} or empty term, otherwise its hash
     */
    private BigInteger hash(String source) {
        if (source == null || source.length() == 0) {
            return BigInteger.ZERO;
        }

        // very short terms are padded with their first character up to three characters
        while (source.length() < 3) {
            source = source + source.charAt(0);
        }

        BigInteger mask = BigInteger.ONE.shiftLeft(this.hashbits).subtract(BigInteger.ONE);
        BigInteger x = BigInteger.valueOf(((long) source.charAt(0)) << 7);
        for (char item : source.toCharArray()) {
            x = x.multiply(HASH_MULTIPLIER).xor(BigInteger.valueOf((long) item)).and(mask);
        }
        // x is masked to a non-negative value and the length is non-negative, so the
        // result can never be -1; no special case is needed for it
        return x.xor(BigInteger.valueOf(source.length()));
    }

    /**
     * Counts the bits in which two fingerprints differ; a smaller value means more
     * similar texts.
     *
     * @param other instance whose fingerprint is compared with this one
     * @return number of differing bits
     */
    public int hammingDistance(MySimHash other) {
        BigInteger m = BigInteger.ONE.shiftLeft(this.hashbits).subtract(BigInteger.ONE);
        return this.strSimHash.xor(other.strSimHash).and(m).bitCount();
    }

    /**
     * Turns the Hamming distance into a similarity score.
     *
     * @param s2 instance to compare with
     * @return {@code 1 - distance / hashbits}
     */
    public double getSemblance(MySimHash s2) {
        double i = (double) this.hammingDistance(s2);
        return 1 - i / this.hashbits;
    }

    /** Creates an instance holding the fingerprint of {@code text}. */
    private static MySimHash fingerprintOf(String text) {
        MySimHash mySimHash = new MySimHash();
        mySimHash.setTokens(text);
        return mySimHash;
    }

    /** Fingerprints every text of the list, keeping list order. */
    private static List<MySimHash> fingerprintAll(List<String> texts) {
        int size = texts.size();
        List<MySimHash> listHash = new ArrayList<>(size);
        for (int i = 0; i < size; i++) {
            listHash.add(fingerprintOf(texts.get(i)));
        }
        return listHash;
    }

    /**
     * Computes the Hamming distance of every pair of titles; only entries with row index
     * smaller than column index are filled.
     *
     * @param titles texts to compare with each other
     * @return {@code size x size} matrix of distances
     */
    public int[][] getDistance(List<String> titles) {
        int size = titles.size();
        List<MySimHash> listHash = fingerprintAll(titles);
        int[][] distance = new int[size][size];
        for (int i = 0; i < size - 1; i++) {
            for (int j = i + 1; j < size; j++) {
                distance[i][j] = listHash.get(i).hammingDistance(listHash.get(j));
            }
        }
        return distance;
    }

    /**
     * Computes the Hamming distance between one text and each title.
     *
     * @param titles texts to compare against
     * @param text   the single text
     * @return one distance per title, in list order
     */
    public int[] getDistance(List<String> titles, String text) {
        int size = titles.size();
        List<MySimHash> listHash = fingerprintAll(titles);
        int[] distance = new int[size];
        MySimHash mySimHash = fingerprintOf(text);
        for (int i = 0; i < size; i++) {
            distance[i] = mySimHash.hammingDistance(listHash.get(i));
        }
        return distance;
    }

    /**
     * Computes the Hamming distance between two texts.
     *
     * @param text1 first text
     * @param text2 second text
     * @return number of differing fingerprint bits
     */
    public int getDistance(String text1, String text2) {
        MySimHash hash1 = fingerprintOf(text1);
        MySimHash hash2 = fingerprintOf(text2);
        return hash1.hammingDistance(hash2);
    }

    /**
     * Computes the similarity between one text and each title.
     *
     * @param titles texts to compare against
     * @param text   the single text
     * @return one score per title, in list order
     */
    @Override
    public double[] getSimilarity(List<String> titles, String text) {
        int size = titles.size();
        List<MySimHash> listHash = fingerprintAll(titles);
        double[] distance = new double[size];
        MySimHash mySimHash = fingerprintOf(text);
        for (int i = 0; i < size; i++) {
            distance[i] = mySimHash.getSemblance(listHash.get(i));
        }
        return distance;
    }

    /**
     * Computes the similarity between two texts.
     *
     * @param text1 first text
     * @param text2 second text
     * @return {@code 1 - distance / hashbits}
     */
    @Override
    public double getSimilarity(String text1, String text2) {
        MySimHash hash1 = fingerprintOf(text1);
        MySimHash hash2 = fingerprintOf(text2);
        return hash1.getSemblance(hash2);
    }

    /**
     * Computes the similarity of every pair of texts; only entries with row index
     * smaller than column index are filled.
     *
     * @param text texts to compare with each other
     * @return {@code size x size} matrix of scores
     */
    @Override
    public double[][] getSimilarity(List<String> text) {
        int size = text.size();
        List<MySimHash> listHash = fingerprintAll(text);
        double[][] distance = new double[size][size];
        for (int i = 0; i < size - 1; i++) {
            for (int j = i + 1; j < size; j++) {
                distance[i][j] = listHash.get(i).getSemblance(listHash.get(j));
            }
        }
        return distance;
    }
}
src/main/java/com/zhiweidata/titleAggregation/main/ClusterResult.java
View file @
ce5b9fa0
...
...
@@ -67,8 +67,7 @@ public class ClusterResult {
{
cosFreq
=
0.93
;
}
HCluster
hCluster
=
new
HCluster
();
Map
<
Integer
,
String
>
map
=
BasicUtil
.
toMap
(
list
);
return
hCluster
.
DataToResult
(
map
,
freq
,
cosFreq
);
return
Means
.
changeMeans
(
map
,
freq
,
cosFreq
);
}
}
src/main/java/com/zhiweidata/titleAggregation/main/ClusterUtil.java
View file @
ce5b9fa0
...
...
@@ -6,9 +6,11 @@ import java.util.Iterator;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiweidata.titleAggregation.algorithm.Algorithm
;
import
com.zhiweidata.titleAggregation.algorithm.AllAlgorithm
;
import
com.zhiweidata.titleAggregation.algorithm.AllAlgorithm.goal
;
import
com.zhiweidata.titleAggregation.bean.Cluster
;
import
com.zhiweidata.titleAggregation.bean.DataPoint
;
import
com.zhiweidata.titleAggregation.method.CosineSimilarity
;
/**
...
...
@@ -18,22 +20,31 @@ import com.zhiweidata.titleAggregation.method.CosineSimilarity;
* @date 2017年12月26日 上午9:19:41
*/
public
class
ClusterUtil
{
static
Algorithm
cos
=
AllAlgorithm
.
getInstance
(
goal
.
cos
);
/**
* @Title: mergeLikeCluster
* @Description: TODO(合并相似类簇)
* @param clusters
* @param freq
* void 返回类型
*/
public
void
mergeLikeCluster
(
List
<
Cluster
>
clusters
,
double
freq
)
{
List
<
String
>
texts
=
new
ArrayList
<>();
for
(
Cluster
cluster
:
clusters
)
{
texts
.
add
(
cluster
.
getClusterName
());
}
CosineSimilarity
cos
=
new
CosineSimilarity
();
double
[][]
distance
=
cos
.
getDistance
(
texts
);
for
(
int
i
=
0
;
i
<
clusters
.
size
()-
1
;
i
++)
double
[][]
distance
=
cos
.
getSimilarity
(
texts
);
int
size
=
clusters
.
size
();
for
(
int
i
=
0
;
i
<
size
-
1
;
i
++)
{
if
(
clusters
.
get
(
i
).
getDataPoints
().
size
()
==
0
)
{
continue
;
}
for
(
int
j
=
i
+
1
;
j
<
clusters
.
size
()
;
j
++)
for
(
int
j
=
i
+
1
;
j
<
size
;
j
++)
{
if
(
clusters
.
get
(
j
).
getDataPoints
().
size
()
==
0
)
{
...
...
@@ -55,7 +66,6 @@ public class ClusterUtil {
*/
public
void
alertLikeData
(
List
<
Cluster
>
clusters
,
double
freq
)
{
CosineSimilarity
cos
=
new
CosineSimilarity
();
//存储从类簇中被删除dataPoint的集合
List
<
DataPoint
>
list
=
new
ArrayList
<>();
//存储类簇名的集合
...
...
@@ -78,7 +88,7 @@ public class ClusterUtil {
dataPointsNames
.
add
(
text
);
}
double
[]
distances
=
cos
.
get
Distance
(
dataPointsNames
,
cluster
.
getClusterName
());
double
[]
distances
=
cos
.
get
Similarity
(
dataPointsNames
,
cluster
.
getClusterName
());
Iterator
<
DataPoint
>
it
=
dataPoints
.
iterator
();
int
i
=
0
;
...
...
@@ -100,11 +110,12 @@ public class ClusterUtil {
List
<
DataPoint
>
listNew
=
new
ArrayList
<>();
//遍历判断被删除的节点是否与其它类簇相似,相似就添加到这个类簇中
int
size
=
clusterNames
.
size
();
for
(
DataPoint
dataPoint
:
list
)
{
double
[]
distances
=
cos
.
get
Distance
(
clusterNames
,
dataPoint
.
getDataPointName
());
double
[]
distances
=
cos
.
get
Similarity
(
clusterNames
,
dataPoint
.
getDataPointName
());
for
(
int
i
=
0
;
i
<
clusterNames
.
size
()
;
i
++)
for
(
int
i
=
0
;
i
<
size
;
i
++)
{
//相似度大于0.93就将节点加入类簇
if
(
distances
[
i
]
>
freq
)
...
...
@@ -117,7 +128,7 @@ public class ClusterUtil {
break
;
}
if
(
i
==
clusterNames
.
size
()
-
1
)
if
(
i
==
size
-
1
)
{
listNew
.
add
(
dataPoint
);
}
...
...
@@ -191,6 +202,26 @@ public class ClusterUtil {
}
return
finalClusters
;
}
/**
 * Merges the cluster at {@code index} with the first later, non-empty cluster whose
 * similarity to {@code name} exceeds the threshold.
 *
 * <p>At most one merge is performed; the actual merge is delegated to
 * {@code mergeCluster(clusterList, index, i)}.
 *
 * @param names       texts scored against {@code name}; presumably parallel to
 *                    {@code clusterList} (same order and length) - confirm with callers
 * @param clusterList clusters to search
 * @param index       position of the cluster being merged into; the search starts
 *                    right after it
 * @param name        text compared with every entry of {@code names}
 * @param cosFreq     similarity a candidate must exceed
 */
public void mergeCluster(List<String> names, List<Cluster> clusterList, int index, String name, double cosFreq)
{
    final int clusterCount = clusterList.size();
    final double[] similarities = cos.getSimilarity(names, name);
    int candidate = index + 1;
    while (candidate < clusterCount)
    {
        boolean populated = clusterList.get(candidate).getDataPoints().size() != 0;
        if (populated && similarities[candidate] > cosFreq)
        {
            mergeCluster(clusterList, index, candidate);
            return;
        }
        candidate++;
    }
}
/**
* 选择次数最多的作为类簇名,若次数一样,选择title最短的为类簇名
* @Title: changeMaxDataPoint
...
...
@@ -274,6 +305,16 @@ public class ClusterUtil {
* void 返回类型
*/
public
void
completedData
(
List
<
DataPoint
>
texts
,
List
<
Cluster
>
clusters
)
{
Iterator
<
Cluster
>
it
=
clusters
.
iterator
();
while
(
it
.
hasNext
())
{
Cluster
cluster
=
it
.
next
();
if
(
cluster
.
getDataPoints
().
size
()
==
0
)
{
it
.
remove
();
}
}
for
(
DataPoint
dataPoint
:
texts
)
{
List
<
DataPoint
>
list
=
new
ArrayList
<>();
...
...
@@ -286,6 +327,7 @@ public class ClusterUtil {
clusters
.
add
(
cluster
);
}
}
}
...
...
src/main/java/com/zhiweidata/titleAggregation/main/HCluster.java
View file @
ce5b9fa0
package
com
.
zhiweidata
.
titleAggregation
.
main
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Comparator
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiweidata.titleAggregation.algorithm.AllAlgorithm
;
import
com.zhiweidata.titleAggregation.algorithm.impl.MySimHash
;
import
com.zhiweidata.titleAggregation.bean.Cluster
;
import
com.zhiweidata.titleAggregation.bean.DataPoint
;
import
com.zhiweidata.titleAggregation.bean.Result
;
import
com.zhiweidata.titleAggregation.method.ComputeWordsVector
;
import
com.zhiweidata.titleAggregation.method.CosineSimilarity
;
import
com.zhiweidata.titleAggregation.method.MySimHash
;
import
com.zhiweidata.titleAggregation.util.ChineseTranslate
;
import
com.zhiweidata.titleAggregation.util.ComputeWordsVector
;
import
com.zhiweidata.titleAggregation.util.BasicUtil
;
import
com.zhiweidata.titleAggregation.util.ChineseTranslate.goal
;
...
...
@@ -25,59 +23,29 @@ import com.zhiweidata.titleAggregation.util.ChineseTranslate.goal;
* @date 2017年12月26日 上午9:47:58
*/
public
class
HCluster
{
static
MySimHash
hash
=
(
MySimHash
)
AllAlgorithm
.
getInstance
(
com
.
zhiweidata
.
titleAggregation
.
algorithm
.
AllAlgorithm
.
goal
.
hash
);
//简繁体翻译
static
ChineseTranslate
simplifiedTrans
=
ChineseTranslate
.
getInstance
(
goal
.
简体
);
public
List
<
Result
>
DataToResult
(
Map
<
Integer
,
String
>
texts
,
int
freq
,
double
cosFreq
){
List
<
Cluster
>
clusters
=
changeData
(
texts
,
freq
,
cosFreq
);
static
ClusterUtil
util
=
new
ClusterUtil
();
MySimHash
hash
=
new
MySimHash
();
CosineSimilarity
cos
=
new
CosineSimilarity
();
List
<
Result
>
list
=
new
ArrayList
<>();
for
(
Cluster
cluster
:
clusters
)
{
if
(
cluster
.
getDataPoints
().
size
()
==
0
)
{
continue
;
}
String
clusterName
=
cluster
.
getClusterName
();
for
(
DataPoint
dataPoint
:
cluster
.
getDataPoints
())
{
String
dataPointName
=
dataPoint
.
getDataPointName
();
Result
result
=
new
Result
();
result
.
setClusterName
(
clusterName
);
result
.
setDatapointName
(
dataPointName
);
result
.
setI
(
dataPoint
.
getI
());
result
.
setSimhash
(
hash
.
getDistance
(
clusterName
,
dataPointName
));
result
.
setCosSimilarity
(
cos
.
CalculateTextSim
(
clusterName
,
dataPointName
));
list
.
add
(
result
);
}
}
Collections
.
sort
(
list
,
new
Comparator
<
Result
>()
{
@Override
public
int
compare
(
Result
o1
,
Result
o2
)
{
return
o1
.
getI
()
-
o2
.
getI
();
/**
 * Post-processes a cluster list with the shared {@code util}: first
 * {@code mergeLikeCluster}, then {@code alertLikeData}, both with the same threshold.
 *
 * @param clusters clusters handed to both steps
 * @param cosFreq  threshold handed to both steps
 */
public void sumCluster(List<Cluster> clusters, double cosFreq) {
    final ClusterUtil helper = util;
    helper.mergeLikeCluster(clusters, cosFreq);
    helper.alertLikeData(clusters, cosFreq);
}
});
return
list
;
}
/**
* 将标题以长度分组选择不同的相似度
* @Title: changeData
* @Description: TODO(将标题以长度分组选择不同的相似度)
* @param texts
* @param freq simhash距离,默认(推荐)为9
* @param cosFreq 余弦的相似度 默认(推荐)为0.93
* @return
* Map<Integer,Map<Integer,String>> 返回类型
*/
public
List
<
Cluster
>
changeData
(
Map
<
Integer
,
String
>
texts
,
int
freq
,
double
cosFreq
)
{
//简繁体翻译
ChineseTranslate
simplifiedTrans
=
ChineseTranslate
.
getInstance
(
goal
.
简体
);
public
List
<
Cluster
>
changeData
(
Map
<
Integer
,
String
>
texts
,
int
freq
)
{
//按标题长度分组
Map
<
Integer
,
String
>
shortText
=
new
HashMap
<>();
Map
<
Integer
,
String
>
middleText
=
new
HashMap
<>();
...
...
@@ -109,19 +77,11 @@ public class HCluster {
freq
+=
2
;
clusters
.
addAll
(
startCluster
(
longText
,
freq
));
ClusterUtil
util
=
new
ClusterUtil
();
util
.
mergeLikeCluster
(
clusters
,
cosFreq
);
util
.
alertLikeData
(
clusters
,
cosFreq
);
return
clusters
;
}
/** 聚类的主方法*/
private
List
<
Cluster
>
startCluster
(
Map
<
Integer
,
String
>
titles
,
int
freq
)
{
ClusterUtil
util
=
new
ClusterUtil
();
MySimHash
hash
=
new
MySimHash
();
private
static
List
<
Cluster
>
startCluster
(
Map
<
Integer
,
String
>
titles
,
int
freq
)
{
List
<
DataPoint
>
dp
=
readData
(
titles
);
// 声明cluster类,存放类名和类簇中含有的样本
...
...
@@ -132,7 +92,9 @@ public class HCluster {
// flag为判断标志
boolean
flag
=
true
;
int
it
=
0
;
//hash距离
int
[][]
distances
=
hash
.
getDistance
(
BasicUtil
.
toList
(
titles
));
while
(
flag
)
{
// mergeIndexA和mergeIndexB表示每一次迭代聚类最小的两个类簇,也就是每一次迭代要合并的两个类簇
int
mergeIndexA
=
0
;
...
...
@@ -140,7 +102,8 @@ public class HCluster {
/*
* 迭代开始,分别去计算每个类簇之间的距离,将距离小的类簇合并
*/
for
(
int
i
=
0
;
i
<
finalClusters
.
size
()
-
1
;
i
++)
int
size
=
finalClusters
.
size
();
for
(
int
i
=
0
;
i
<
size
-
1
;
i
++)
{
if
(
finalClusters
.
get
(
i
).
getDataPoints
().
size
()
==
0
)
{
...
...
@@ -149,7 +112,7 @@ public class HCluster {
int
min
=
freq
;
for
(
int
j
=
i
+
1
;
j
<
finalClusters
.
size
()
;
j
++)
for
(
int
j
=
i
+
1
;
j
<
size
;
j
++)
{
if
(
finalClusters
.
get
(
j
).
getDataPoints
().
size
()
==
0
)
{
...
...
@@ -178,22 +141,23 @@ public class HCluster {
{
it
++;
}
//持续5次,都为0,判断算法结束
if
(
it
>
5
)
{
flag
=
false
;
}
}
return
finalClusters
;
}
/**初始化类簇*/
private
List
<
Cluster
>
initialCluster
(
List
<
DataPoint
>
dpoints
)
{
private
static
List
<
Cluster
>
initialCluster
(
List
<
DataPoint
>
dpoints
)
{
// 声明存放初始化类簇的链表
List
<
Cluster
>
originalClusters
=
new
ArrayList
<>();
// 声明一个临时的用于存放样本点的链表
List
<
DataPoint
>
tempDataPoints
;
for
(
int
i
=
0
;
i
<
dpoints
.
size
();
i
++)
{
int
size
=
dpoints
.
size
();
for
(
int
i
=
0
;
i
<
size
;
i
++)
{
tempDataPoints
=
new
ArrayList
<>();
// 得到每一个样本点
DataPoint
tempDataPoint
=
dpoints
.
get
(
i
);
...
...
@@ -216,7 +180,8 @@ public class HCluster {
* @param titles
* @return
*/
private
List
<
DataPoint
>
readData
(
Map
<
Integer
,
String
>
titles
)
{
return
new
ComputeWordsVector
()
.
computeTFMultiIDF
(
titles
);
private static List<DataPoint> readData(Map<Integer, String> titles)
{
    // Pure delegation to the static ComputeWordsVector.computeTFMultiIDF helper.
    List<DataPoint> dataPoints = ComputeWordsVector.computeTFMultiIDF(titles);
    return dataPoints;
}
}
\ No newline at end of file
src/main/java/com/zhiweidata/titleAggregation/main/Means.java
0 → 100644
View file @
ce5b9fa0
/**
* @Title: Means.java
* @Package com.zhiweidata.titleAggregation.main
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2017年12月29日 上午10:22:18
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
titleAggregation
.
main
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiweidata.titleAggregation.bean.Cluster
;
import
com.zhiweidata.titleAggregation.bean.Result
;
import
com.zhiweidata.titleAggregation.util.BasicUtil
;
import
com.zhiweidata.titleAggregation.util.ThreadPool
;
/**
 * Entry point that cuts the input texts into fixed-size chunks, hands the chunks to
 * {@link ThreadPool#ClusterRun} and converts the returned clusters into results.
 *
 * @author xuyimeng
 * @date 2017-12-29
 */
public class Means {

    /** Maximum number of texts per chunk. */
    private static final int CHUNK_SIZE = 350;

    /**
     * Clusters the given texts and returns the flattened results.
     *
     * @param texts   texts to cluster, keyed by an integer id
     * @param freq    threshold passed through to {@code ThreadPool.ClusterRun}
     * @param cosFreq threshold passed through to {@code ThreadPool.ClusterRun}
     * @return results produced by {@code BasicUtil.toResult} for the computed clusters
     */
    public static List<Result> changeMeans(Map<Integer, String> texts, int freq, double cosFreq) {
        return cutTexts(texts, freq, cosFreq);
    }

    /**
     * Runs the three stages (chunking, clustering, result conversion) and prints the
     * elapsed milliseconds of each one to standard output.
     */
    private static List<Result> cutTexts(Map<Integer, String> texts, int freq, double cosFreq) {
        long t = System.currentTimeMillis();
        List<Map<Integer, String>> list = splitIntoChunks(texts);
        System.out.println("切分list,共切出:" + list.size() + "个,用时:" + (System.currentTimeMillis() - t));

        t = System.currentTimeMillis();
        List<Cluster> clusters = ThreadPool.ClusterRun(list, freq, cosFreq);
        System.out.println("运算成功,用时" + (System.currentTimeMillis() - t));

        t = System.currentTimeMillis();
        List<Result> results = BasicUtil.toResult(clusters);
        System.out.println("转化结果成功,用时" + (System.currentTimeMillis() - t));
        return results;
    }

    /**
     * Distributes the entries over maps of at most {@link #CHUNK_SIZE} entries.
     *
     * <p>Every entry ends up in exactly one chunk, including when the number of texts is
     * an exact multiple of the chunk size. No empty chunk is appended after a full one;
     * an empty input still yields a single empty chunk.
     */
    private static List<Map<Integer, String>> splitIntoChunks(Map<Integer, String> texts) {
        List<Map<Integer, String>> chunks = new ArrayList<>();
        Map<Integer, String> current = new HashMap<>(CHUNK_SIZE);
        for (Map.Entry<Integer, String> entry : texts.entrySet()) {
            current.put(entry.getKey(), entry.getValue());
            if (current.size() == CHUNK_SIZE) {
                chunks.add(current);
                current = new HashMap<>(CHUNK_SIZE);
            }
        }
        // flush the partially filled tail; keep one (empty) chunk for empty input
        if (!current.isEmpty() || chunks.isEmpty()) {
            chunks.add(current);
        }
        return chunks;
    }
}
src/main/java/com/zhiweidata/titleAggregation/method/CosineSimilarity.java
View file @
ce5b9fa0
...
...
@@ -117,14 +117,15 @@ public class CosineSimilarity {
*/
public
double
[][]
getDistance
(
List
<
String
>
titles
)
{
CosineSimilarity
cosineSimilarity
=
new
CosineSimilarity
();
double
[][]
distance
=
new
double
[
titles
.
size
()][
titles
.
size
()];
int
size
=
titles
.
size
();
double
[][]
distance
=
new
double
[
size
][
size
];
String
doc1
=
""
;
String
doc2
=
""
;
for
(
int
i
=
0
;
i
<
titles
.
size
()
-
1
;
i
++)
for
(
int
i
=
0
;
i
<
size
-
1
;
i
++)
{
doc1
=
titles
.
get
(
i
);
for
(
int
j
=
i
+
1
;
j
<
titles
.
size
()
;
j
++)
for
(
int
j
=
i
+
1
;
j
<
size
;
j
++)
{
doc2
=
titles
.
get
(
j
);
distance
[
i
][
j
]
=
cosineSimilarity
.
CalculateTextSim
(
doc1
,
doc2
);
...
...
src/main/java/com/zhiweidata/titleAggregation/method/CutPage.java
View file @
ce5b9fa0
...
...
@@ -47,7 +47,7 @@ public class CutPage {
return
result
/
goalTexts
.
size
();
}
public
List
<
String
>
splitString
(
String
text
){
public
static
List
<
String
>
splitString
(
String
text
){
List
<
String
>
result
=
new
ArrayList
<>();
List
<
String
>
list
=
Arrays
.
asList
(
text
.
split
(
"。"
));
...
...
src/main/java/com/zhiweidata/titleAggregation/method/MySimHash.java
View file @
ce5b9fa0
...
...
@@ -6,6 +6,8 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
javax.rmi.CORBA.Tie
;
import
org.ansj.domain.Term
;
import
com.zhiweidata.titleAggregation.util.AnsjSeg
;
...
...
@@ -144,19 +146,20 @@ public class MySimHash {
* int[][] 返回类型
*/
public
int
[][]
getDistance
(
List
<
String
>
titles
)
{
List
<
MySimHash
>
listHash
=
new
ArrayList
<>();
for
(
int
i
=
0
;
i
<
titles
.
size
();
i
++)
int
size
=
titles
.
size
();
List
<
MySimHash
>
listHash
=
new
ArrayList
<>(
size
);
for
(
int
i
=
0
;
i
<
size
;
i
++)
{
MySimHash
mySimHash
=
new
MySimHash
();
mySimHash
.
setTokens
(
titles
.
get
(
i
));
listHash
.
add
(
mySimHash
);
}
int
[][]
distance
=
new
int
[
titles
.
size
()][
titles
.
size
()
];
int
[][]
distance
=
new
int
[
size
][
size
];
int
temp
;
for
(
int
i
=
0
;
i
<
titles
.
size
()
-
1
;
i
++)
for
(
int
i
=
0
;
i
<
size
-
1
;
i
++)
{
for
(
int
j
=
i
+
1
;
j
<
titles
.
size
()
;
j
++)
for
(
int
j
=
i
+
1
;
j
<
size
;
j
++)
{
temp
=
listHash
.
get
(
i
).
hammingDistance
(
listHash
.
get
(
j
));
distance
[
i
][
j
]
=
temp
;
...
...
@@ -174,8 +177,9 @@ public class MySimHash {
* int[] 返回类型
*/
public
int
[]
getDistance
(
List
<
String
>
titles
,
String
text
)
{
List
<
MySimHash
>
listHash
=
new
ArrayList
<>();
for
(
int
i
=
0
;
i
<
titles
.
size
();
i
++)
int
size
=
titles
.
size
();
List
<
MySimHash
>
listHash
=
new
ArrayList
<>(
size
);
for
(
int
i
=
0
;
i
<
size
;
i
++)
{
MySimHash
mySimHash
=
new
MySimHash
();
mySimHash
.
setTokens
(
titles
.
get
(
i
));
...
...
@@ -186,7 +190,7 @@ public class MySimHash {
int
temp
;
MySimHash
mySimHash
=
new
MySimHash
();
mySimHash
.
setTokens
(
text
);
for
(
int
i
=
0
;
i
<
titles
.
size
()
;
i
++)
for
(
int
i
=
0
;
i
<
size
;
i
++)
{
temp
=
mySimHash
.
hammingDistance
(
listHash
.
get
(
i
));
distance
[
i
]
=
temp
;
...
...
@@ -211,8 +215,9 @@ public class MySimHash {
* double[] 返回类型
*/
public
double
[]
getSimilarity
(
List
<
String
>
titles
,
String
text
)
{
List
<
MySimHash
>
listHash
=
new
ArrayList
<>();
for
(
int
i
=
0
;
i
<
titles
.
size
();
i
++)
int
size
=
titles
.
size
();
List
<
MySimHash
>
listHash
=
new
ArrayList
<>(
size
);
for
(
int
i
=
0
;
i
<
size
;
i
++)
{
MySimHash
mySimHash
=
new
MySimHash
();
mySimHash
.
setTokens
(
titles
.
get
(
i
));
...
...
@@ -223,7 +228,7 @@ public class MySimHash {
double
temp
;
MySimHash
mySimHash
=
new
MySimHash
();
mySimHash
.
setTokens
(
text
);
for
(
int
i
=
0
;
i
<
titles
.
size
()
;
i
++)
for
(
int
i
=
0
;
i
<
size
;
i
++)
{
temp
=
mySimHash
.
getSemblance
(
listHash
.
get
(
i
));
distance
[
i
]
=
temp
;
...
...
src/main/java/com/zhiweidata/titleAggregation/util/BasicUtil.java
View file @
ce5b9fa0
...
...
@@ -2,9 +2,18 @@ package com.zhiweidata.titleAggregation.util;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.
HashMap
;
import
java.util.
Comparator
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.TreeMap
;
import
com.zhiweidata.titleAggregation.algorithm.Algorithm
;
import
com.zhiweidata.titleAggregation.algorithm.AllAlgorithm
;
import
com.zhiweidata.titleAggregation.algorithm.AllAlgorithm.goal
;
import
com.zhiweidata.titleAggregation.algorithm.impl.MySimHash
;
import
com.zhiweidata.titleAggregation.bean.Cluster
;
import
com.zhiweidata.titleAggregation.bean.DataPoint
;
import
com.zhiweidata.titleAggregation.bean.Result
;
/**
* @ClassName: Util
...
...
@@ -13,9 +22,81 @@ import java.util.Map;
* @date 2017年12月26日 上午9:15:29
*/
public
class
BasicUtil
{
/**
 * Flattens clusters into one {@link Result} per data point, ordered by data point index.
 *
 * <p>Clusters without any data point are skipped. Every produced result carries the cluster
 * name, the data point name, the data point index ({@code getI()}), the SimHash distance and
 * the cosine similarity, both computed between the cluster name and the data point name.
 *
 * @param clusters clusters to convert
 * @return one result per data point, sorted ascending by {@code getI()}
 */
public static List<Result> toResult(List<Cluster> clusters) {
    // getDistance(String, String) is invoked through the MySimHash type, hence the cast.
    MySimHash hash = (MySimHash) AllAlgorithm.getInstance(goal.hash);
    Algorithm cos = AllAlgorithm.getInstance(goal.cos);
    List<Result> list = new ArrayList<>();
    for (Cluster cluster : clusters) {
        if (cluster.getDataPoints().size() == 0) {
            continue;
        }
        String clusterName = cluster.getClusterName();
        for (DataPoint dataPoint : cluster.getDataPoints()) {
            String dataPointName = dataPoint.getDataPointName();
            Result result = new Result();
            result.setClusterName(clusterName);
            result.setDatapointName(dataPointName);
            result.setI(dataPoint.getI());
            result.setSimhash(hash.getDistance(clusterName, dataPointName));
            result.setCosSimilarity(cos.getSimilarity(clusterName, dataPointName));
            list.add(result);
        }
    }
    // Sort by index. Integer.compare is used instead of subtraction, which can overflow
    // and report the wrong ordering for operands of opposite sign.
    Collections.sort(list, new Comparator<Result>() {
        @Override
        public int compare(Result o1, Result o2) {
            return Integer.compare(o1.getI(), o2.getI());
        }
    });
    return list;
}
/**
 * Collects the names of all non-empty clusters.
 *
 * @param list clusters to inspect
 * @return the cluster name of every cluster that holds at least one data point,
 *         in the order the clusters appear in {@code list}
 */
public static List<String> toString(List<Cluster> list) {
    List<String> names = new ArrayList<>();
    for (Cluster current : list) {
        // Only clusters that still contain data points contribute a name.
        if (current.getDataPoints().size() != 0) {
            names.add(current.getClusterName());
        }
    }
    return names;
}
/**
*
* @Title: toMap
* @Description: TODO(list转为map)
* @param list
* @return
* Map<Integer,String> 返回类型
*/
public
static
Map
<
Integer
,
String
>
toMap
(
List
<
String
>
list
)
{
Map
<
Integer
,
String
>
texts
=
new
Hash
Map
<>();
Map
<
Integer
,
String
>
texts
=
new
Tree
Map
<>();
int
i
=
0
;
for
(
String
text
:
list
)
{
...
...
@@ -42,7 +123,33 @@ public class BasicUtil {
}
return
list
;
}
/**
 * Splits a list into consecutive chunks of at most {@code len} elements.
 *
 * <p>Each chunk is obtained with {@link List#subList(int, int)}, so the chunks are views
 * backed by {@code list} rather than copies. The last chunk may be shorter than {@code len}.
 *
 * @param list the list to split
 * @param len  maximum number of elements per chunk; must be at least 1
 * @return the chunks in order, or {@code null} when {@code list} is {@code null} or empty
 *         or {@code len < 1} (kept for backward compatibility with existing callers)
 */
public static List<List<?>> splitList(List<?> list, int len) {
    if (list == null || list.isEmpty() || len < 1) {
        return null;
    }
    int size = list.size();
    List<List<?>> result = new ArrayList<List<?>>();
    int start = 0;
    while (start < size) {
        // Compare against the remaining element count instead of computing start + len
        // first, so a very large len (e.g. Integer.MAX_VALUE) cannot overflow.
        int end = (size - start > len) ? start + len : size;
        result.add(list.subList(start, end));
        start = end;
    }
    return result;
}
/**
* 去除集合空的元素
*/
...
...
src/main/java/com/zhiweidata/titleAggregation/util/ComputeWordsVector.java
0 → 100644
View file @
ce5b9fa0
package
com
.
zhiweidata
.
titleAggregation
.
util
;
import
java.util.*
;
import
org.ansj.domain.Term
;
import
com.zhiweidata.titleAggregation.bean.DataPoint
;
/**
 * Computes the vector attributes of documents, turning every document into a weighted
 * term vector.
 *
 * @author xuyimeng
 * @date 2017-12-26 09:22:06
 */
public class ComputeWordsVector {

    /**
     * Builds one {@link DataPoint} per sample, holding the sample text, its key and a
     * term-weight map.
     *
     * <p>For each sample the term map is filled by {@code AnsjSeg.getString}; every term that
     * also appears in the map returned by {@link #computeIDF(List)} is then re-weighted as
     * {@code (value / distinctTermCount) * ln(sampleCount / corpusCount)}. Terms missing from
     * that map keep the value written by {@code AnsjSeg.getString}.
     *
     * @param testSampleDir samples to vectorize, keyed by their index
     * @return one data point per entry of {@code testSampleDir}, in the map's iteration order
     */
    public static List<DataPoint> computeTFMultiIDF(Map<Integer, String> testSampleDir) {
        List<DataPoint> dataPoints = new ArrayList<>();
        Map<String, Double> idfPerWordMap = computeIDF(BasicUtil.toList(testSampleDir));
        AnsjSeg ansj = AnsjSeg.getInstance();
        int sampleCount = testSampleDir.size();

        for (Map.Entry<Integer, String> sample : testSampleDir.entrySet()) {
            String text = sample.getValue();

            // A fresh map per document: the previous code cleared and reused a single map
            // instance, so every DataPoint that kept the reference shared the same vector.
            Map<String, Double> tfPerDocMap = new TreeMap<String, Double>();
            ansj.getString(tfPerDocMap, text);

            // NOTE(review): this is the number of distinct terms in the map, not the total
            // number of words in the document - confirm which normalization is intended.
            double wordSumPerDoc = tfPerDocMap.size();

            for (Map.Entry<String, Double> me : tfPerDocMap.entrySet()) {
                Double corpusCount = idfPerWordMap.get(me.getKey());
                if (corpusCount != null) {
                    double idf = Math.log(sampleCount / corpusCount);
                    // setValue updates the entry in place without a structural modification.
                    me.setValue((me.getValue() / wordSumPerDoc) * idf);
                }
            }

            DataPoint dataPoint = new DataPoint();
            dataPoint.setDataPointName(text);
            dataPoint.setDimensioin(tfPerDocMap);
            dataPoint.setI(sample.getKey());
            dataPoints.add(dataPoint);
        }
        return dataPoints;
    }

    /**
     * Counts how often each segmented term occurs in the samples.
     *
     * <p>The whole list is segmented at once via {@code testSample.toString()}, so the text
     * handed to the segmenter is the {@code List.toString()} rendering of all samples.
     * NOTE(review): despite the method name, the values are occurrence counts over that
     * combined text, not per-document frequencies - confirm this is intended.
     *
     * @param testSample sample texts
     * @return map from term to its number of occurrences
     */
    public static Map<String, Double> computeIDF(List<String> testSample) {
        Map<String, Double> idfPerWordMap = new TreeMap<String, Double>();
        AnsjSeg ansj = AnsjSeg.getInstance();
        for (Term term : ansj.getTerms(testSample.toString())) {
            idfPerWordMap.merge(term.getName(), 1.0, Double::sum);
        }
        return idfPerWordMap;
    }
}
\ No newline at end of file
src/main/java/com/zhiweidata/titleAggregation/util/ThreadPool.java
0 → 100644
View file @
ce5b9fa0
/**
* @Title: ThreadPool.java
* @Package com.zhiweidata.titleAggregation.util
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2017年12月29日 下午3:26:49
* @version V1.0
*/
package
com
.
zhiweidata
.
titleAggregation
.
util
;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import com.zhiweidata.titleAggregation.bean.Cluster;
import com.zhiweidata.titleAggregation.main.ClusterUtil;
import com.zhiweidata.titleAggregation.main.HCluster;
/**
 * Runs clustering and cluster merging on fixed-size thread pools.
 *
 * @author xuyimeng
 * @date 2017-12-29 15:26:49
 */
public class ThreadPool {

    /** Number of worker threads in each pool created by this class. */
    private static final int POOL_SIZE = 8;

    // NOTE(review): this single instance is used from all worker threads at once;
    // verify that HCluster.changeData and HCluster.sumCluster are safe to call concurrently.
    private static final HCluster hCluster = new HCluster();

    /**
     * Clusters every text batch on a worker thread, then merges the collected clusters.
     *
     * <p>Each batch is passed to {@code HCluster.changeData} and {@code HCluster.sumCluster};
     * the resulting clusters of all batches are gathered into one list, which is handed to
     * {@link #mergeCluster(List, double)} once every task has finished. The order in which
     * batches are appended depends on task completion order.
     *
     * @param list    text batches, each mapping an index to a text
     * @param freq    threshold forwarded to {@code HCluster.changeData}
     * @param cosFreq threshold forwarded to {@code HCluster.sumCluster} and to the merge step
     * @return the list returned by {@link #mergeCluster(List, double)}
     */
    public static List<Cluster> ClusterRun(List<Map<Integer, String>> list, int freq, double cosFreq) {
        ExecutorService fixedThreadPool = Executors.newFixedThreadPool(POOL_SIZE);
        List<Cluster> clusterList = new ArrayList<>();
        for (Map<Integer, String> texts : list) {
            fixedThreadPool.execute(() -> {
                List<Cluster> clusters = hCluster.changeData(texts, freq);
                hCluster.sumCluster(clusters, cosFreq);
                // ArrayList is not thread-safe; serialize appends from the workers.
                synchronized (clusterList) {
                    clusterList.addAll(clusters);
                }
            });
        }
        fixedThreadPool.shutdown();
        awaitCompletion(fixedThreadPool);
        return mergeCluster(clusterList, cosFreq);
    }

    /**
     * Submits one {@code ClusterUtil.mergeCluster} task per non-empty cluster (the last list
     * element is never used as a merge source) and waits for all tasks to finish.
     *
     * <p>NOTE(review): all tasks receive the same {@code clusterList} and {@code names}
     * instances, and the emptiness check below runs while earlier tasks may already be
     * executing - confirm that {@code ClusterUtil.mergeCluster} tolerates this.
     *
     * @param clusterList clusters to merge; the same instance is returned
     * @param cosFreq     threshold forwarded to {@code ClusterUtil.mergeCluster}
     * @return {@code clusterList} after all merge tasks have completed
     */
    public static List<Cluster> mergeCluster(List<Cluster> clusterList, double cosFreq) {
        ExecutorService fixedThreadPool = Executors.newFixedThreadPool(POOL_SIZE);
        ClusterUtil util = new ClusterUtil();
        List<String> names = BasicUtil.toString(clusterList);
        int size = clusterList.size();
        for (int i = 0; i < size - 1; i++) {
            if (clusterList.get(i).getDataPoints().size() == 0) {
                continue;
            }
            String name = clusterList.get(i).getClusterName();
            int index = i;
            fixedThreadPool.execute(() -> util.mergeCluster(names, clusterList, index, name, cosFreq));
        }
        fixedThreadPool.shutdown();
        awaitCompletion(fixedThreadPool);
        return clusterList;
    }

    /**
     * Blocks until the given, already shut down, pool has terminated.
     *
     * <p>Replaces a busy-wait on {@code isTerminated()}. An interrupt does not abort the wait
     * (callers rely on all tasks having finished); the interrupt status is restored before
     * returning.
     */
    private static void awaitCompletion(ExecutorService pool) {
        boolean interrupted = false;
        while (true) {
            try {
                if (pool.awaitTermination(1, TimeUnit.SECONDS)) {
                    break;
                }
            } catch (InterruptedException e) {
                interrupted = true;
            }
        }
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }
}
src/test/java/StartTest/ExcelTest.java
View file @
ce5b9fa0
...
...
@@ -59,10 +59,10 @@ public class ExcelTest {
@SuppressWarnings
(
"unchecked"
)
List
<
Map
<
String
,
Object
>>
body
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
List
<
String
>
titles
=
exportTitleData
(
body
);
long
time
=
System
.
currentTimeMillis
();
//调用算法
ClusterResult
cr
=
new
ClusterResult
();
List
<
Result
>
list
=
cr
.
getResult
(
titles
);
List
<
Result
>
list
=
ClusterResult
.
getResult
(
titles
);
System
.
out
.
println
(
"时间:"
+(
System
.
currentTimeMillis
()
-
time
)+
",量级:"
+
list
.
size
());
DBOExp
dbo
=
new
DBOExp
();
dbo
.
putRun
(
GroupSheet
(
list
,
body
),
goalPath
,
"聚合"
);
dbo
.
putRun
(
AllSheet
(
body
),
goalPath
,
"全部"
);
...
...
@@ -81,7 +81,7 @@ public class ExcelTest {
{
for
(
String
key
:
map
.
keySet
())
{
if
(
key
.
equals
(
"标题"
))
if
(
key
.
equals
(
"标题"
)
&&
key
!=
null
)
{
titles
.
add
(
map
.
get
(
key
).
toString
());
}
...
...
src/test/java/StartTest/MongoStart.java
View file @
ce5b9fa0
...
...
@@ -12,6 +12,7 @@ import org.junit.Test;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiweidata.titleAggregation.bean.Result
;
import
com.zhiweidata.titleAggregation.main.ClusterResult
;
import
com.zhiweidata.titleAggregation.main.HCluster
;
import
com.zhiweidata.titleAggregation.util.ChineseTranslate
;
import
com.zhiweidata.titleAggregation.util.ChineseTranslate.goal
;
...
...
@@ -87,27 +88,25 @@ public class MongoStart {
List
<
MediaAndWechatEvent
>
listEvent
=
util
.
getListData
(
pt
,
eventId
);
Map
<
Integer
,
String
>
texts
=
new
HashMap
<>();
int
i
=
0
;
List
<
String
>
texts
=
new
ArrayList
<>();
for
(
MediaAndWechatEvent
event
:
listEvent
)
{
String
text
=
event
.
getTitle
().
replaceAll
(
"\\."
,
"-"
);
texts
.
put
(
i
,
text
);
i
++;
texts
.
add
(
text
);
}
System
.
out
.
println
(
"start"
);
//调用算法
HCluster
hc
=
new
HCluster
();
long
time
=
System
.
currentTimeMillis
();
List
<
Result
>
list
=
hc
.
DataTo
Result
(
texts
,
9
,
0.93
);
List
<
Result
>
list
=
ClusterResult
.
get
Result
(
texts
,
9
,
0.93
);
long
t
=
System
.
currentTimeMillis
()-
time
;
System
.
out
.
println
(
"事件名:"
+
name
+
"——"
+
pt
+
"数据,数据量:"
+
texts
.
size
()+
" ,输出时间:"
+
t
);
DBOExp
dbo
=
new
DBOExp
();
dbo
.
putRun
(
GroupSheet
(
list
,
listEvent
),
path
,
"聚合"
);
dbo
.
putRun
(
AllSheet
(
listEvent
),
path
,
"全部"
);
//
DBOExp dbo = new DBOExp();
//
dbo.putRun(GroupSheet(list,listEvent),path,"聚合");
//
dbo.putRun(AllSheet(listEvent), path, "全部");
}
/**
...
...
src/test/java/StartTest/ResultTest.java
View file @
ce5b9fa0
...
...
@@ -14,6 +14,7 @@ import com.zhiweidata.titleAggregation.method.MySimHash;
import
com.zhiweidata.titleAggregation.util.AnsjSeg
;
public
class
ResultTest
{
@Test
public
void
test5
()
{
List
<
String
>
texts
=
new
ArrayList
<>();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment