Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
titleAggregation
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
xuyimeng
titleAggregation
Commits
6bf229eb
Commit
6bf229eb
authored
Dec 28, 2017
by
win7
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
短文本聚合,长文本相似度
parent
98076b77
Show whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
581 additions
and
304 deletions
+581
-304
pom.xml
+59
-0
src/main/java/com/zhiweidata/titleAggregation/bean/DataPoint.java
+2
-2
src/main/java/com/zhiweidata/titleAggregation/bean/Result.java
+99
-0
src/main/java/com/zhiweidata/titleAggregation/main/ClusterResult.java
+74
-0
src/main/java/com/zhiweidata/titleAggregation/main/ClusterUtil.java
+34
-43
src/main/java/com/zhiweidata/titleAggregation/main/HCluster.java
+44
-87
src/main/java/com/zhiweidata/titleAggregation/method/ComputeWordsVector.java
+3
-3
src/main/java/com/zhiweidata/titleAggregation/method/CosineSimilarity.java
+44
-1
src/main/java/com/zhiweidata/titleAggregation/method/CutPage.java
+65
-0
src/main/java/com/zhiweidata/titleAggregation/method/MySimHash.java
+96
-1
src/main/java/com/zhiweidata/titleAggregation/start/Start.java
+0
-26
src/main/java/com/zhiweidata/titleAggregation/util/BasicUtil.java
+15
-1
src/main/java/com/zhiweidata/titleAggregation/util/ChineseTranslate.java
+1
-2
src/test/java/StartTest/ExcelTest.java
+16
-64
src/test/java/StartTest/MongoStart.java
+9
-61
src/test/java/StartTest/ResultTest.java
+5
-4
使用说明.txt
+15
-9
No files found.
pom.xml
View file @
6bf229eb
...
@@ -3,6 +3,9 @@
...
@@ -3,6 +3,9 @@
<groupId>
com.zhiweidata
</groupId>
<groupId>
com.zhiweidata
</groupId>
<artifactId>
titleAggregation
</artifactId>
<artifactId>
titleAggregation
</artifactId>
<version>
0.0.1-SNAPSHOT
</version>
<version>
0.0.1-SNAPSHOT
</version>
<name>
titleAggregation
</name>
<description>
标题聚合工具类
</description>
<dependencies>
<dependencies>
<!-- 测试用jar包 -->
<!-- 测试用jar包 -->
...
@@ -46,4 +49,59 @@
...
@@ -46,4 +49,59 @@
<version>
5.0.2
</version>
<version>
5.0.2
</version>
</dependency>
</dependency>
</dependencies>
</dependencies>
<!-- 打包管理 -->
<build>
<plugins>
<!-- 发布源码 -->
<plugin>
<artifactId>
maven-source-plugin
</artifactId>
<version>
2.4
</version>
<configuration>
<attach>
true
</attach>
</configuration>
<executions>
<execution>
<phase>
compile
</phase>
<goals>
<goal>
jar
</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>
org.apache.maven.plugins
</groupId>
<artifactId>
maven-javadoc-plugin
</artifactId>
<version>
2.10.4
</version>
</plugin>
<!-- Fixes garbled Chinese characters in the console when running "mvn test" -->
<plugin>
<groupId>
org.apache.maven.plugins
</groupId>
<artifactId>
maven-surefire-plugin
</artifactId>
<version>
2.19.1
</version>
<configuration>
<forkMode>
once
</forkMode>
<argLine>
-Dfile.encoding=UTF-8
</argLine>
<skipTests>
true
</skipTests>
</configuration>
</plugin>
</plugins>
</build>
<!-- 分发管理:管理distribution和supporting files -->
<distributionManagement>
<snapshotRepository>
<id>
nexus-releases
</id>
<name>
User Project Snapshot
</name>
<url>
http://192.168.0.30:8081/nexus/content/repositories/snapshots/
</url>
<uniqueVersion>
true
</uniqueVersion>
</snapshotRepository>
<repository>
<id>
nexus-releases
</id>
<name>
User Project Release
</name>
<url>
http://192.168.0.30:8081/nexus/content/repositories/releases/
</url>
</repository>
</distributionManagement>
</project>
</project>
\ No newline at end of file
src/main/java/com/zhiweidata/titleAggregation/bean/DataPoint.java
View file @
6bf229eb
...
@@ -16,10 +16,11 @@ public class DataPoint {
...
@@ -16,10 +16,11 @@ public class DataPoint {
}
}
public
DataPoint
(
String
dataPointName
,
Map
<
String
,
Double
>
dimensioin
)
{
public
DataPoint
(
String
dataPointName
,
Map
<
String
,
Double
>
dimensioin
,
Integer
i
)
{
super
();
super
();
this
.
dataPointName
=
dataPointName
;
this
.
dataPointName
=
dataPointName
;
this
.
dimensioin
=
dimensioin
;
this
.
dimensioin
=
dimensioin
;
this
.
i
=
i
;
}
}
public
String
getDataPointName
()
{
public
String
getDataPointName
()
{
...
@@ -51,5 +52,4 @@ public class DataPoint {
...
@@ -51,5 +52,4 @@ public class DataPoint {
return
"DataPoint [dataPointName="
+
dataPointName
+
", dimensioin="
+
dimensioin
+
", i="
+
i
+
"]"
;
return
"DataPoint [dataPointName="
+
dataPointName
+
", dimensioin="
+
dimensioin
+
", i="
+
i
+
"]"
;
}
}
}
}
src/main/java/com/zhiweidata/titleAggregation/bean/Result.java
0 → 100644
View file @
6bf229eb
/**
* @Title: Result.java
* @Package com.zhiweidata.titleAggregation.bean
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2017年12月28日 上午11:52:42
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
titleAggregation
.
bean
;
/**
 * Bean that stores one result entry: a cluster name, a data point name, the
 * data point's index, and three similarity figures.
 *
 * @author xuyimeng
 * @date 2017-12-28 11:52:42
 */
public class Result {

    /** Cluster name. */
    private String clusterName;

    /** Data point (node) name. */
    private String datapointName;

    /** Index position. */
    private Integer i;

    /** Simhash distance. */
    private int simhash;

    /** Cosine-algorithm similarity. */
    private double cosSimilarity;

    /** Cut (segmentation) algorithm similarity. */
    private double cut;

    /** Creates an empty result; every field keeps its Java default value. */
    public Result() {
    }

    /**
     * Creates a fully populated result.
     *
     * @param clusterName   cluster name
     * @param datapointName data point name
     * @param simhash       simhash distance
     * @param cosSimilarity cosine similarity
     * @param cut           cut-algorithm similarity
     * @param i             index position
     */
    public Result(String clusterName, String datapointName, int simhash,
            double cosSimilarity, double cut, Integer i) {
        this.clusterName = clusterName;
        this.datapointName = datapointName;
        this.i = i;
        this.simhash = simhash;
        this.cosSimilarity = cosSimilarity;
        this.cut = cut;
    }

    /** @return the index position */
    public Integer getI() {
        return i;
    }

    /** @param i the index position */
    public void setI(Integer i) {
        this.i = i;
    }

    /** @return the cluster name */
    public String getClusterName() {
        return clusterName;
    }

    /** @param clusterName the cluster name */
    public void setClusterName(String clusterName) {
        this.clusterName = clusterName;
    }

    /** @return the data point name */
    public String getDatapointName() {
        return datapointName;
    }

    /** @param datapointName the data point name */
    public void setDatapointName(String datapointName) {
        this.datapointName = datapointName;
    }

    /** @return the simhash distance */
    public int getSimhash() {
        return simhash;
    }

    /** @param simhash the simhash distance */
    public void setSimhash(int simhash) {
        this.simhash = simhash;
    }

    /** @return the cosine similarity */
    public double getCosSimilarity() {
        return cosSimilarity;
    }

    /** @param cosSimilarity the cosine similarity */
    public void setCosSimilarity(double cosSimilarity) {
        this.cosSimilarity = cosSimilarity;
    }

    /** @return the cut-algorithm similarity */
    public double getCut() {
        return cut;
    }

    /** @param cut the cut-algorithm similarity */
    public void setCut(double cut) {
        this.cut = cut;
    }

    /** Renders all six fields in a single bracketed line. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Result [clusterName=");
        text.append(clusterName);
        text.append(", datapointName=").append(datapointName);
        text.append(", i=").append(i);
        text.append(", simhash=").append(simhash);
        text.append(", cosSimilarity=").append(cosSimilarity);
        text.append(", cut=").append(cut);
        text.append("]");
        return text.toString();
    }
}
src/main/java/com/zhiweidata/titleAggregation/main/ClusterResult.java
0 → 100644
View file @
6bf229eb
/**
* @Title: ClusterResult.java
* @Package com.zhiweidata.titleAggregation.main
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2017年12月28日 下午12:14:54
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
titleAggregation
.
main
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiweidata.titleAggregation.bean.Result
;
import
com.zhiweidata.titleAggregation.util.BasicUtil
;
/**
 * Static entry points that turn a list of texts into a list of {@link Result}
 * entries by delegating to {@link HCluster#DataToResult}.
 *
 * @author xuyimeng
 * @date 2017-12-28 12:14:54
 */
public class ClusterResult {

    /** Simhash distance threshold used as the default (recommended value). */
    private static final int DEFAULT_SIMHASH_THRESHOLD = 9;

    /** Cosine similarity threshold used as the default (recommended value). */
    private static final double DEFAULT_COSINE_THRESHOLD = 0.93;

    /**
     * Clusters the texts with the default thresholds (9 and 0.93).
     *
     * @param list collection of titles
     * @return the results produced by {@link HCluster#DataToResult}
     */
    public static List<Result> getResult(List<String> list) {
        return getResult(list, DEFAULT_SIMHASH_THRESHOLD, DEFAULT_COSINE_THRESHOLD);
    }

    /**
     * Clusters the texts with a caller-supplied simhash threshold and the
     * default cosine threshold (0.93).
     *
     * @param list collection of texts
     * @param freq simhash distance threshold, 9 recommended; a negative value
     *             is replaced by 9
     * @return the results produced by {@link HCluster#DataToResult}
     */
    public static List<Result> getResult(List<String> list, int freq) {
        // The three-argument overload performs the negative-value fallback.
        return getResult(list, freq, DEFAULT_COSINE_THRESHOLD);
    }

    /**
     * Clusters the texts with caller-supplied thresholds.
     *
     * @param list    collection of texts; converted to an index-keyed map via
     *                {@code BasicUtil.toMap}
     * @param freq    simhash distance threshold, 9 recommended; a negative
     *                value is replaced by 9
     * @param cosFreq cosine similarity threshold, 0.93 recommended; a negative
     *                value is replaced by 0.93
     * @return the results produced by {@link HCluster#DataToResult}
     */
    public static List<Result> getResult(List<String> list, int freq, double cosFreq) {
        int simhashThreshold = freq < 0 ? DEFAULT_SIMHASH_THRESHOLD : freq;
        double cosineThreshold = cosFreq < 0 ? DEFAULT_COSINE_THRESHOLD : cosFreq;

        HCluster clusterer = new HCluster();
        Map<Integer, String> indexedTexts = BasicUtil.toMap(list);
        return clusterer.DataToResult(indexedTexts, simhashThreshold, cosineThreshold);
    }
}
src/main/java/com/zhiweidata/titleAggregation/main/ClusterUtil.java
View file @
6bf229eb
...
@@ -8,6 +8,7 @@ import java.util.Map;
...
@@ -8,6 +8,7 @@ import java.util.Map;
import
com.zhiweidata.titleAggregation.bean.Cluster
;
import
com.zhiweidata.titleAggregation.bean.Cluster
;
import
com.zhiweidata.titleAggregation.bean.DataPoint
;
import
com.zhiweidata.titleAggregation.bean.DataPoint
;
import
com.zhiweidata.titleAggregation.method.CosineSimilarity
;
/**
/**
...
@@ -17,6 +18,35 @@ import com.zhiweidata.titleAggregation.bean.DataPoint;
...
@@ -17,6 +18,35 @@ import com.zhiweidata.titleAggregation.bean.DataPoint;
* @date 2017年12月26日 上午9:19:41
* @date 2017年12月26日 上午9:19:41
*/
*/
public
class
ClusterUtil
{
public
class
ClusterUtil
{
/**
 * Merges clusters whose names are similar to each other.
 * <p>
 * The cosine similarity between every pair of cluster names is computed once
 * up front. For each pair (i, j) with i &lt; j in which both clusters still
 * contain data points and the similarity is strictly greater than
 * {@code freq}, {@code mergeCluster(clusters, i, j)} is called.
 * <p>
 * NOTE(review): mergeCluster is defined elsewhere in this class; presumably it
 * moves the data points of cluster j into cluster i. Confirm against its
 * implementation.
 *
 * @param clusters the clusters to examine
 * @param freq     cosine similarity threshold; previously this argument was
 *                 ignored and 0.93 was always used
 */
public void mergeLikeCluster(List<Cluster> clusters, double freq) {
    List<String> texts = new ArrayList<>();
    for (Cluster cluster : clusters) {
        texts.add(cluster.getClusterName());
    }
    CosineSimilarity cos = new CosineSimilarity();
    // Only the upper triangle (j > i) of this matrix is filled in.
    double[][] distance = cos.getDistance(texts);
    for (int i = 0; i < clusters.size() - 1; i++) {
        if (clusters.get(i).getDataPoints().size() == 0) {
            continue;
        }
        for (int j = i + 1; j < clusters.size(); j++) {
            if (clusters.get(j).getDataPoints().size() == 0) {
                continue;
            }
            // Honour the caller's threshold instead of a hard-coded 0.93.
            if (distance[i][j] > freq) {
                mergeCluster(clusters, i, j);
            }
        }
    }
}
/**
/**
* @Title: alertLikeData
* @Title: alertLikeData
* @Description: TODO(检测类簇与各个个体之间的相似度)
* @Description: TODO(检测类簇与各个个体之间的相似度)
...
@@ -25,6 +55,7 @@ public class ClusterUtil {
...
@@ -25,6 +55,7 @@ public class ClusterUtil {
*/
*/
public
void
alertLikeData
(
List
<
Cluster
>
clusters
,
double
freq
)
public
void
alertLikeData
(
List
<
Cluster
>
clusters
,
double
freq
)
{
{
CosineSimilarity
cos
=
new
CosineSimilarity
();
//存储从类簇中被删除dataPoint的集合
//存储从类簇中被删除dataPoint的集合
List
<
DataPoint
>
list
=
new
ArrayList
<>();
List
<
DataPoint
>
list
=
new
ArrayList
<>();
//存储类簇名的集合
//存储类簇名的集合
...
@@ -47,7 +78,7 @@ public class ClusterUtil {
...
@@ -47,7 +78,7 @@ public class ClusterUtil {
dataPointsNames
.
add
(
text
);
dataPointsNames
.
add
(
text
);
}
}
double
[]
distances
=
getDistance
(
dataPointsNames
,
cluster
.
getClusterName
());
double
[]
distances
=
cos
.
getDistance
(
dataPointsNames
,
cluster
.
getClusterName
());
Iterator
<
DataPoint
>
it
=
dataPoints
.
iterator
();
Iterator
<
DataPoint
>
it
=
dataPoints
.
iterator
();
int
i
=
0
;
int
i
=
0
;
...
@@ -64,14 +95,14 @@ public class ClusterUtil {
...
@@ -64,14 +95,14 @@ public class ClusterUtil {
cluster
.
setDataPoints
(
dataPoints
);
cluster
.
setDataPoints
(
dataPoints
);
clusters
.
set
(
clusters
.
indexOf
(
cluster
),
cluster
);
clusters
.
set
(
clusters
.
indexOf
(
cluster
),
cluster
);
}
}
List
<
DataPoint
>
listNew
=
new
ArrayList
<>();
List
<
DataPoint
>
listNew
=
new
ArrayList
<>();
//遍历判断被删除的节点是否与其它类簇相似,相似就添加到这个类簇中
//遍历判断被删除的节点是否与其它类簇相似,相似就添加到这个类簇中
for
(
DataPoint
dataPoint
:
list
)
for
(
DataPoint
dataPoint
:
list
)
{
{
double
[]
distances
=
getDistance
(
clusterNames
,
dataPoint
.
getDataPointName
());
double
[]
distances
=
cos
.
getDistance
(
clusterNames
,
dataPoint
.
getDataPointName
());
for
(
int
i
=
0
;
i
<
clusterNames
.
size
();
i
++)
for
(
int
i
=
0
;
i
<
clusterNames
.
size
();
i
++)
{
{
...
@@ -96,46 +127,6 @@ public class ClusterUtil {
...
@@ -96,46 +127,6 @@ public class ClusterUtil {
completedData
(
listNew
,
clusters
);
completedData
(
listNew
,
clusters
);
}
}
/**
* 用余弦算法 遍历计算相似度
* 越接近1, 越相近
*/
public
double
[][]
getDistance
(
List
<
String
>
titles
)
{
CosineSimilarity
cosineSimilarity
=
new
CosineSimilarity
();
double
[][]
distance
=
new
double
[
titles
.
size
()][
titles
.
size
()];
String
doc1
=
""
;
String
doc2
=
""
;
for
(
int
i
=
0
;
i
<
titles
.
size
()-
1
;
i
++)
{
doc1
=
titles
.
get
(
i
);
for
(
int
j
=
i
+
1
;
j
<
titles
.
size
();
j
++)
{
doc2
=
titles
.
get
(
j
);
distance
[
i
][
j
]
=
cosineSimilarity
.
CalculateTextSim
(
doc1
,
doc2
);
}
}
return
distance
;
}
/**
* 用余弦算法 计算类簇内各个个体与类簇名的相似度
* 越接近1, 越相近
* @param list
* @param str
* @return
*/
public
double
[]
getDistance
(
List
<
String
>
list
,
String
doc1
){
CosineSimilarity
cosineSimilarity
=
new
CosineSimilarity
();
double
[]
distance
=
new
double
[
list
.
size
()];
for
(
int
i
=
0
;
i
<
list
.
size
();
i
++)
{
String
doc2
=
list
.
get
(
i
);
distance
[
i
]
=
cosineSimilarity
.
CalculateTextSim
(
doc1
,
doc2
);
}
return
distance
;
}
// /**计算两个文本的相似度
// /**计算两个文本的相似度
// * 注释部分是 向量夹角余弦计算,目前采用的是向量内积计算
// * 注释部分是 向量夹角余弦计算,目前采用的是向量内积计算
...
...
src/main/java/com/zhiweidata/titleAggregation/main/HCluster.java
View file @
6bf229eb
...
@@ -7,37 +7,64 @@ import java.util.Map;
...
@@ -7,37 +7,64 @@ import java.util.Map;
import
com.zhiweidata.titleAggregation.bean.Cluster
;
import
com.zhiweidata.titleAggregation.bean.Cluster
;
import
com.zhiweidata.titleAggregation.bean.DataPoint
;
import
com.zhiweidata.titleAggregation.bean.DataPoint
;
import
com.zhiweidata.titleAggregation.bean.Result
;
import
com.zhiweidata.titleAggregation.method.ComputeWordsVector
;
import
com.zhiweidata.titleAggregation.method.CosineSimilarity
;
import
com.zhiweidata.titleAggregation.method.MySimHash
;
import
com.zhiweidata.titleAggregation.util.ChineseTranslate
;
import
com.zhiweidata.titleAggregation.util.ChineseTranslate
;
import
com.zhiweidata.titleAggregation.util.Util
;
import
com.zhiweidata.titleAggregation.util.
Basic
Util
;
import
com.zhiweidata.titleAggregation.util.ChineseTranslate.goal
;
import
com.zhiweidata.titleAggregation.util.ChineseTranslate.goal
;
/**
/**
* @ClassName: HCluster
* @ClassName: HCluster
* @Description: TODO(算法主程序)
* @Description: TODO(
标题
算法主程序)
* @author xuyimeng
* @author xuyimeng
* @date 2017年12月26日 上午9:47:58
* @date 2017年12月26日 上午9:47:58
*/
*/
public
class
HCluster
{
public
class
HCluster
{
/**
 * Clusters the texts via {@code changeData} and flattens the clusters into one
 * {@link Result} per data point.
 * <p>
 * Clusters without data points are skipped. Each result carries the cluster
 * name, the data point name and index, the simhash distance and the cosine
 * similarity between the cluster name and the data point name. The
 * {@code cut} field is not set here and keeps its default value.
 *
 * @param texts   texts keyed by index
 * @param freq    simhash distance threshold, forwarded to {@code changeData}
 * @param cosFreq cosine similarity threshold, forwarded to {@code changeData}
 * @return one result per data point of every non-empty cluster
 */
public List<Result> DataToResult(Map<Integer, String> texts, int freq, double cosFreq) {
    List<Cluster> clusters = changeData(texts, freq, cosFreq);
    MySimHash simHash = new MySimHash();
    CosineSimilarity cosine = new CosineSimilarity();
    List<Result> results = new ArrayList<>();

    for (Cluster cluster : clusters) {
        if (cluster.getDataPoints().size() != 0) {
            String clusterName = cluster.getClusterName();
            for (DataPoint member : cluster.getDataPoints()) {
                String memberName = member.getDataPointName();
                Integer memberIndex = member.getI();
                int hammingDistance = simHash.getDistance(clusterName, memberName);
                double cosineScore = cosine.CalculateTextSim(clusterName, memberName);

                Result entry = new Result();
                entry.setClusterName(clusterName);
                entry.setDatapointName(memberName);
                entry.setI(memberIndex);
                entry.setSimhash(hammingDistance);
                entry.setCosSimilarity(cosineScore);
                results.add(entry);
            }
        }
    }
    return results;
}
/**
/**
* 将标题以长度分组选择不同的相似度
* @Title: changeData
* @Title: changeData
* @Description: TODO()
* @Description: TODO(
将标题以长度分组选择不同的相似度
)
* @param texts
* @param texts
* @param freq simhash距离,默认(推荐)为9
* @param freq simhash距离,默认(推荐)为9
* @param cosFreq 余弦的相似度 默认(推荐)为0.93
* @param cosFreq 余弦的相似度 默认(推荐)为0.93
* @return
* @return
* Map<Integer,Map<Integer,String>> 返回类型
* Map<Integer,Map<Integer,String>> 返回类型
*/
*/
public
Map
<
Integer
,
Map
<
Integer
,
String
>>
changeData
(
Map
<
Integer
,
String
>
texts
,
int
freq
,
double
cosFreq
)
{
public
List
<
Cluster
>
changeData
(
Map
<
Integer
,
String
>
texts
,
int
freq
,
double
cosFreq
)
{
if
(
freq
<=
0
)
{
freq
=
9
;
}
if
(
cosFreq
<=
0.0
)
{
cosFreq
=
0.93
;
}
//简繁体翻译
//简繁体翻译
ChineseTranslate
simplifiedTrans
=
ChineseTranslate
.
getInstance
(
goal
.
简体
);
ChineseTranslate
simplifiedTrans
=
ChineseTranslate
.
getInstance
(
goal
.
简体
);
...
@@ -73,23 +100,10 @@ public class HCluster {
...
@@ -73,23 +100,10 @@ public class HCluster {
clusters
.
addAll
(
startCluster
(
longText
,
freq
));
clusters
.
addAll
(
startCluster
(
longText
,
freq
));
ClusterUtil
util
=
new
ClusterUtil
();
ClusterUtil
util
=
new
ClusterUtil
();
util
.
mergeLikeCluster
(
clusters
,
cosFreq
);
util
.
alertLikeData
(
clusters
,
cosFreq
);
util
.
alertLikeData
(
clusters
,
cosFreq
);
Map
<
Integer
,
Map
<
Integer
,
String
>>
map
=
new
HashMap
<>();
return
clusters
;
for
(
Cluster
cl
:
clusters
)
{
List
<
DataPoint
>
tempDps
=
cl
.
getDataPoints
();
if
(
tempDps
.
size
()
>=
1
)
{
Integer
key
=
cl
.
getI
();
Map
<
Integer
,
String
>
mapDataPoint
=
new
HashMap
<>();
for
(
DataPoint
tempdp
:
tempDps
)
{
mapDataPoint
.
put
(
tempdp
.
getI
(),
tempdp
.
getDataPointName
());
}
map
.
put
(
key
,
mapDataPoint
);
}
}
return
map
;
}
}
...
@@ -97,6 +111,8 @@ public class HCluster {
...
@@ -97,6 +111,8 @@ public class HCluster {
/** 聚类的主方法*/
/** 聚类的主方法*/
private
List
<
Cluster
>
startCluster
(
Map
<
Integer
,
String
>
titles
,
int
freq
)
{
private
List
<
Cluster
>
startCluster
(
Map
<
Integer
,
String
>
titles
,
int
freq
)
{
ClusterUtil
util
=
new
ClusterUtil
();
ClusterUtil
util
=
new
ClusterUtil
();
MySimHash
hash
=
new
MySimHash
();
List
<
DataPoint
>
dp
=
readData
(
titles
);
List
<
DataPoint
>
dp
=
readData
(
titles
);
// 声明cluster类,存放类名和类簇中含有的样本
// 声明cluster类,存放类名和类簇中含有的样本
List
<
Cluster
>
finalClusters
=
new
ArrayList
<>();
List
<
Cluster
>
finalClusters
=
new
ArrayList
<>();
...
@@ -106,7 +122,7 @@ public class HCluster {
...
@@ -106,7 +122,7 @@ public class HCluster {
// flag为判断标志
// flag为判断标志
boolean
flag
=
true
;
boolean
flag
=
true
;
int
it
=
0
;
int
it
=
0
;
int
[][]
distances
=
getDistance
(
Util
.
toList
(
titles
));
int
[][]
distances
=
hash
.
getDistance
(
Basic
Util
.
toList
(
titles
));
while
(
flag
)
{
while
(
flag
)
{
// mergeIndexA和mergeIndexB表示每一次迭代聚类最小的两个类簇,也就是每一次迭代要合并的两个类簇
// mergeIndexA和mergeIndexB表示每一次迭代聚类最小的两个类簇,也就是每一次迭代要合并的两个类簇
int
mergeIndexA
=
0
;
int
mergeIndexA
=
0
;
...
@@ -160,64 +176,6 @@ public class HCluster {
...
@@ -160,64 +176,6 @@ public class HCluster {
return
finalClusters
;
return
finalClusters
;
}
}
/**
* @Title: getDistance
* @Description: TODO(simhash计算相似度,一个集合内各个元素的)
* @param titles
* @return
* int[][] 返回类型
*/
public
int
[][]
getDistance
(
List
<
String
>
titles
)
{
List
<
MySimHash
>
listHash
=
new
ArrayList
<>();
for
(
int
i
=
0
;
i
<
titles
.
size
();
i
++)
{
MySimHash
mySimHash
=
new
MySimHash
();
mySimHash
.
setTokens
(
titles
.
get
(
i
));
listHash
.
add
(
mySimHash
);
}
int
[][]
distance
=
new
int
[
titles
.
size
()][
titles
.
size
()];
int
temp
;
for
(
int
i
=
0
;
i
<
titles
.
size
()-
1
;
i
++)
{
for
(
int
j
=
i
+
1
;
j
<
titles
.
size
();
j
++)
{
temp
=
listHash
.
get
(
i
).
hammingDistance
(
listHash
.
get
(
j
));
distance
[
i
][
j
]
=
temp
;
}
}
return
distance
;
}
/**
* @Title: getDistance
* @Description: TODO(simhash计算相似度,一对多)
* @param titles
* @param text
* @return
* int[] 返回类型
*/
public
int
[]
getDistance
(
List
<
String
>
titles
,
String
text
)
{
List
<
MySimHash
>
listHash
=
new
ArrayList
<>();
for
(
int
i
=
0
;
i
<
titles
.
size
();
i
++)
{
MySimHash
mySimHash
=
new
MySimHash
();
mySimHash
.
setTokens
(
titles
.
get
(
i
));
listHash
.
add
(
mySimHash
);
}
int
[]
distance
=
new
int
[
titles
.
size
()];
int
temp
;
MySimHash
mySimHash
=
new
MySimHash
();
mySimHash
.
setTokens
(
text
);
for
(
int
i
=
0
;
i
<
titles
.
size
();
i
++)
{
temp
=
mySimHash
.
hammingDistance
(
listHash
.
get
(
i
));
distance
[
i
]
=
temp
;
}
return
distance
;
}
/**初始化类簇*/
/**初始化类簇*/
private
List
<
Cluster
>
initialCluster
(
List
<
DataPoint
>
dpoints
)
{
private
List
<
Cluster
>
initialCluster
(
List
<
DataPoint
>
dpoints
)
{
// 声明存放初始化类簇的链表
// 声明存放初始化类簇的链表
...
@@ -238,7 +196,6 @@ public class HCluster {
...
@@ -238,7 +196,6 @@ public class HCluster {
tempCluster
.
setI
(
tempDataPoint
.
getI
());
tempCluster
.
setI
(
tempDataPoint
.
getI
());
// 将新的类簇加入到初始化类簇链表中
// 将新的类簇加入到初始化类簇链表中
originalClusters
.
add
(
tempCluster
);
originalClusters
.
add
(
tempCluster
);
}
}
return
originalClusters
;
return
originalClusters
;
...
...
src/main/java/com/zhiweidata/titleAggregation/m
ain
/ComputeWordsVector.java
→
src/main/java/com/zhiweidata/titleAggregation/m
ethod
/ComputeWordsVector.java
View file @
6bf229eb
package
com
.
zhiweidata
.
titleAggregation
.
m
ain
;
package
com
.
zhiweidata
.
titleAggregation
.
m
ethod
;
import
java.util.*
;
import
java.util.*
;
...
@@ -6,7 +6,7 @@ import org.ansj.domain.Term;
...
@@ -6,7 +6,7 @@ import org.ansj.domain.Term;
import
com.zhiweidata.titleAggregation.bean.DataPoint
;
import
com.zhiweidata.titleAggregation.bean.DataPoint
;
import
com.zhiweidata.titleAggregation.util.AnsjSeg
;
import
com.zhiweidata.titleAggregation.util.AnsjSeg
;
import
com.zhiweidata.titleAggregation.util.Util
;
import
com.zhiweidata.titleAggregation.util.
Basic
Util
;
/**
/**
* @ClassName: ComputeWordsVector
* @ClassName: ComputeWordsVector
...
@@ -24,7 +24,7 @@ public class ComputeWordsVector {
...
@@ -24,7 +24,7 @@ public class ComputeWordsVector {
public
List
<
DataPoint
>
computeTFMultiIDF
(
Map
<
Integer
,
String
>
testSampleDir
){
public
List
<
DataPoint
>
computeTFMultiIDF
(
Map
<
Integer
,
String
>
testSampleDir
){
List
<
DataPoint
>
dataPoints
=
new
ArrayList
<>();
List
<
DataPoint
>
dataPoints
=
new
ArrayList
<>();
Map
<
String
,
Double
>
idfPerWordMap
=
computeIDF
(
Util
.
toList
(
testSampleDir
));
Map
<
String
,
Double
>
idfPerWordMap
=
computeIDF
(
Basic
Util
.
toList
(
testSampleDir
));
Map
<
String
,
Double
>
tfPerDocMap
=
new
TreeMap
<
String
,
Double
>();
Map
<
String
,
Double
>
tfPerDocMap
=
new
TreeMap
<
String
,
Double
>();
AnsjSeg
ansj
=
AnsjSeg
.
getInstance
();
AnsjSeg
ansj
=
AnsjSeg
.
getInstance
();
...
...
src/main/java/com/zhiweidata/titleAggregation/m
ain
/CosineSimilarity.java
→
src/main/java/com/zhiweidata/titleAggregation/m
ethod
/CosineSimilarity.java
View file @
6bf229eb
package
com
.
zhiweidata
.
titleAggregation
.
m
ain
;
package
com
.
zhiweidata
.
titleAggregation
.
m
ethod
;
import
java.io.UnsupportedEncodingException
;
import
java.io.UnsupportedEncodingException
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.Iterator
;
import
java.util.Iterator
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
/**
/**
...
@@ -109,4 +110,45 @@ public class CosineSimilarity {
...
@@ -109,4 +110,45 @@ public class CosineSimilarity {
}
}
return
-
1
;
return
-
1
;
}
}
/**
 * Computes the cosine similarity of every pair of texts in the list.
 * <p>
 * Only entries {@code [i][j]} with {@code i < j} are filled in; all other
 * entries keep the array default of 0.0. Per the original note, the closer a
 * value is to 1, the more similar the two texts are.
 *
 * @param titles the texts to compare
 * @return a square matrix whose upper triangle holds the pairwise results of
 *         {@code CalculateTextSim}
 */
public double[][] getDistance(List<String> titles) {
    CosineSimilarity calculator = new CosineSimilarity();
    int count = titles.size();
    double[][] similarity = new double[count][count];
    for (int row = 0; row < count - 1; row++) {
        String left = titles.get(row);
        for (int col = row + 1; col < count; col++) {
            String right = titles.get(col);
            similarity[row][col] = calculator.CalculateTextSim(left, right);
        }
    }
    return similarity;
}
/**
 * Computes the cosine similarity between one reference text and each text in
 * a list (used for comparing a cluster name with the members of the cluster).
 * Per the original note, the closer a value is to 1, the more similar.
 *
 * @param list the texts to compare against the reference
 * @param doc1 the reference text, passed as the first argument to
 *             {@code CalculateTextSim}
 * @return an array where element {@code i} is the similarity between
 *         {@code doc1} and {@code list.get(i)}
 */
public double[] getDistance(List<String> list, String doc1) {
    CosineSimilarity calculator = new CosineSimilarity();
    int count = list.size();
    double[] similarity = new double[count];
    int index = 0;
    while (index < count) {
        similarity[index] = calculator.CalculateTextSim(doc1, list.get(index));
        index++;
    }
    return similarity;
}
}
}
\ No newline at end of file
src/main/java/com/zhiweidata/titleAggregation/method/CutPage.java
0 → 100644
View file @
6bf229eb
/**
* @Title: CutPage.java
* @Package util
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2017年12月27日 下午2:24:06
* @version V1.0
*/
/**
*
*/
package
com
.
zhiweidata
.
titleAggregation
.
method
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.List
;
import
com.zhiweidata.titleAggregation.method.MySimHash
;
/**
 * Segment-based ("cut") similarity intended for long texts.
 * <p>
 * Both texts are cut into segments; every segment of the goal text is scored
 * against all segments of the test text with {@code MySimHash.getSimilarity},
 * and the best score per goal segment is averaged.
 *
 * @author xuyimeng
 * @date 2017-12-27 14:24:06
 */
public class CutPage {

    /**
     * Averages, over all segments of {@code goalText}, the highest similarity
     * that segment reaches against any segment of {@code testText}.
     *
     * @param goalText the text whose segments are scored
     * @param testText the text whose segments are searched for the best match
     * @return the mean of the per-segment maxima, or 0.0 when
     *         {@code goalText} yields no segments (previously NaN from a
     *         division by zero)
     */
    public double getDistance(String goalText, String testText) {
        List<String> goalTexts = splitString(goalText);
        List<String> textTexts = splitString(testText);
        // String.split drops trailing empty strings, so a text made only of
        // separators produces no segments; avoid 0.0 / 0 == NaN.
        if (goalTexts.isEmpty()) {
            return 0.0;
        }
        MySimHash hash = new MySimHash();
        double result = 0;
        for (String goal : goalTexts) {
            double[] sim = hash.getSimilarity(textTexts, goal);
            // Scores at or below 0 never replace the initial maximum of 0.
            double maxSim = 0;
            for (double score : sim) {
                if (score > maxSim) {
                    maxSim = score;
                }
            }
            result += maxSim;
        }
        return result / goalTexts.size();
    }

    /**
     * Cuts a text into segments: first on the Chinese full stop, then each
     * sentence that contains a semicolon is cut again on the semicolon.
     * <p>
     * A sentence that was cut on semicolons is represented only by its pieces;
     * the earlier version also appended the uncut sentence, so its content was
     * counted twice.
     *
     * @param text the text to cut; must not be null
     * @return the segments in their original order
     */
    public List<String> splitString(String text) {
        List<String> result = new ArrayList<>();
        for (String sentence : text.split("。")) {
            if (sentence.contains(";")) {
                result.addAll(Arrays.asList(sentence.split(";")));
            } else {
                result.add(sentence);
            }
        }
        return result;
    }
}
src/main/java/com/zhiweidata/titleAggregation/m
ain
/MySimHash.java
→
src/main/java/com/zhiweidata/titleAggregation/m
ethod
/MySimHash.java
View file @
6bf229eb
package
com
.
zhiweidata
.
titleAggregation
.
m
ain
;
package
com
.
zhiweidata
.
titleAggregation
.
m
ethod
;
import
java.math.BigInteger
;
import
java.math.BigInteger
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
...
@@ -135,4 +136,98 @@ public class MySimHash {
...
@@ -135,4 +136,98 @@ public class MySimHash {
return
1
-
i
/
this
.
hashbits
;
return
1
-
i
/
this
.
hashbits
;
}
}
/**
 * Computes the simhash distance between every pair of texts in a list.
 * <p>
 * A separate {@code MySimHash} is built for each text via {@code setTokens};
 * entry {@code [i][j]} with {@code i < j} receives the value returned by
 * {@code hammingDistance}. All other entries keep the array default of 0.
 *
 * @param titles the texts to compare
 * @return a square matrix whose upper triangle holds the pairwise distances
 */
public int[][] getDistance(List<String> titles) {
    int count = titles.size();
    List<MySimHash> hashes = new ArrayList<>();
    for (int k = 0; k < count; k++) {
        MySimHash hashed = new MySimHash();
        hashed.setTokens(titles.get(k));
        hashes.add(hashed);
    }

    int[][] matrix = new int[count][count];
    for (int row = 0; row < count - 1; row++) {
        MySimHash left = hashes.get(row);
        for (int col = row + 1; col < count; col++) {
            matrix[row][col] = left.hammingDistance(hashes.get(col));
        }
    }
    return matrix;
}
/**
 * Computes the simhash distance between one text and each text in a list
 * (one-to-many).
 *
 * @param titles the texts to compare against {@code text}
 * @param text   the reference text
 * @return an array where element {@code i} is the value returned by
 *         {@code hammingDistance} for {@code text} and {@code titles.get(i)}
 */
public int[] getDistance(List<String> titles, String text) {
    int count = titles.size();
    // The list entries are tokenised first, the reference text afterwards.
    List<MySimHash> hashes = new ArrayList<>();
    for (int k = 0; k < count; k++) {
        MySimHash hashed = new MySimHash();
        hashed.setTokens(titles.get(k));
        hashes.add(hashed);
    }

    int[] distances = new int[count];
    MySimHash reference = new MySimHash();
    reference.setTokens(text);
    int index = 0;
    while (index < count) {
        distances[index] = reference.hammingDistance(hashes.get(index));
        index++;
    }
    return distances;
}
/**
 * Computes the simhash distance between two texts.
 *
 * @param text1 the first text
 * @param text2 the second text
 * @return the value returned by {@code hammingDistance} for the two texts
 */
public int getDistance(String text1, String text2) {
    MySimHash first = new MySimHash();
    MySimHash second = new MySimHash();
    first.setTokens(text1);
    second.setTokens(text2);
    int hamming = first.hammingDistance(second);
    return hamming;
}
/**
 * Computes the simhash similarity between one text and each text in a list.
 *
 * @param titles the texts to compare against {@code text}
 * @param text   the reference text
 * @return an array where element {@code i} is the value returned by
 *         {@code getSemblance} for {@code text} and {@code titles.get(i)}
 */
public double[] getSimilarity(List<String> titles, String text) {
    int count = titles.size();
    // The list entries are tokenised first, the reference text afterwards.
    List<MySimHash> hashes = new ArrayList<>();
    for (int k = 0; k < count; k++) {
        MySimHash hashed = new MySimHash();
        hashed.setTokens(titles.get(k));
        hashes.add(hashed);
    }

    double[] scores = new double[count];
    MySimHash reference = new MySimHash();
    reference.setTokens(text);
    int index = 0;
    while (index < count) {
        scores[index] = reference.getSemblance(hashes.get(index));
        index++;
    }
    return scores;
}
}
}
src/main/java/com/zhiweidata/titleAggregation/start/Start.java
deleted
100644 → 0
View file @
98076b77
package
com
.
zhiweidata
.
titleAggregation
.
start
;
public
class
Start
{
public
static
void
main
(
String
[]
args
)
{
// 使用说明
//
// 标题相似度算法,是根据simhash、余弦判断辅以空间向量乘积计算相似度,用层次聚类算法选择聚类
// 中心。
//
// 在src/test/java/startTest中
// MongoStart 是对mongo数据库进行使用测试,将结果导出成excel表格
// ExcelTest 是对excel表格数据进行使用测试,将结果导出成excel表格
// ResultTest 有对各个算法的效果进行模拟测试
// ChineseTest 是对繁简体结果转换的测试
//
// 在com.zhiweidata.titleAggregation.main中
// HCluster 聚类算法的实现类
// ClusterUtil 是对算法结果的优化,一般在此对结果进行迭代
// ComputeWordsVector 计算文本向量的工具类
// CosineSimilarity 余弦算法的工具类
// MySimHash simhash算法的工具类
//
// 使用方法
// 在startTest中,给出了事例,自行参考
}
}
src/main/java/com/zhiweidata/titleAggregation/util/Util.java
→
src/main/java/com/zhiweidata/titleAggregation/util/
Basic
Util.java
View file @
6bf229eb
...
@@ -2,15 +2,29 @@ package com.zhiweidata.titleAggregation.util;
...
@@ -2,15 +2,29 @@ package com.zhiweidata.titleAggregation.util;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
/**
/**
* @ClassName: Util
* @ClassName: Util
* @Description: TODO(封装对集合处理的工具类)
* @Description: TODO(封装对集合处理的工具类)
* @author xuyimeng
* @author xuyimeng
* @date 2017年12月26日 上午9:15:29
* @date 2017年12月26日 上午9:15:29
*/
*/
public
class
Util
{
public
class
BasicUtil
{
/**
 * Converts a list into a map keyed by each element's zero-based position.
 *
 * @param list the texts to index
 * @return a new {@code HashMap} mapping 0, 1, 2, ... to the list elements in
 *         list order
 */
public static Map<Integer, String> toMap(List<String> list) {
    Map<Integer, String> indexed = new HashMap<>();
    for (String entry : list) {
        // Keys are consecutive from 0, so the current size is the next index.
        indexed.put(indexed.size(), entry);
    }
    return indexed;
}
/**
/**
* @Title: toList
* @Title: toList
* @Description: TODO(将map转为list)
* @Description: TODO(将map转为list)
...
...
src/main/java/com/zhiweidata/titleAggregation/util/ChineseTranslate.java
View file @
6bf229eb
...
@@ -34,11 +34,10 @@ public class ChineseTranslate {
...
@@ -34,11 +34,10 @@ public class ChineseTranslate {
private
ChineseTranslate
()
{
}
private
ChineseTranslate
()
{
}
/**
/**
* 不需自行创建转换器即可转换. 内部调用
{@link #转换(String) 转换}
方法.
* 不需自行创建转换器即可转换. 内部调用
转换
方法.
* @param 文本 任意长度
* @param 文本 任意长度
* @param 简繁 goal格式
* @param 简繁 goal格式
* @return 转换为goal格式的文本
* @return 转换为goal格式的文本
* @throws IllegalArgumentException 文本为null时
*/
*/
public
static
String
trans
(
String
text
,
goal
简繁
)
{
public
static
String
trans
(
String
text
,
goal
简繁
)
{
return
getInstance
(
简繁
).
trans
(
text
);
return
getInstance
(
简繁
).
trans
(
text
);
...
...
src/test/java/StartTest/ExcelTest.java
View file @
6bf229eb
...
@@ -18,6 +18,9 @@ import org.junit.Test;
...
@@ -18,6 +18,9 @@ import org.junit.Test;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.mongodb.DBObject
;
import
com.zhiweidata.titleAggregation.bean.Cluster
;
import
com.zhiweidata.titleAggregation.bean.Result
;
import
com.zhiweidata.titleAggregation.main.ClusterResult
;
import
com.zhiweidata.titleAggregation.main.HCluster
;
import
com.zhiweidata.titleAggregation.main.HCluster
;
import
com.zhiweidata.titleAggregation.util.ChineseTranslate
;
import
com.zhiweidata.titleAggregation.util.ChineseTranslate
;
import
com.zhiweidata.titleAggregation.util.ChineseTranslate.goal
;
import
com.zhiweidata.titleAggregation.util.ChineseTranslate.goal
;
...
@@ -55,14 +58,13 @@ public class ExcelTest {
...
@@ -55,14 +58,13 @@ public class ExcelTest {
@SuppressWarnings
(
"unchecked"
)
@SuppressWarnings
(
"unchecked"
)
List
<
Map
<
String
,
Object
>>
body
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
List
<
Map
<
String
,
Object
>>
body
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
Map
<
Integer
,
String
>
titles
=
exportTitleData
(
body
);
List
<
String
>
titles
=
exportTitleData
(
body
);
//调用算法
//调用算法
HCluster
hc
=
new
HCluster
();
ClusterResult
cr
=
new
ClusterResult
();
Map
<
Integer
,
Map
<
Integer
,
String
>>
mapResult
=
hc
.
changeData
(
titles
,
9
,
0.93
);
List
<
Result
>
list
=
cr
.
getResult
(
titles
);
DBOExp
dbo
=
new
DBOExp
();
DBOExp
dbo
=
new
DBOExp
();
dbo
.
putRun
(
noGroupSheet
(
mapResult
,
body
),
goalPath
,
"未聚合"
);
dbo
.
putRun
(
GroupSheet
(
list
,
body
),
goalPath
,
"聚合"
);
dbo
.
putRun
(
GroupSheet
(
mapResult
,
body
),
goalPath
,
"聚合"
);
dbo
.
putRun
(
AllSheet
(
body
),
goalPath
,
"全部"
);
dbo
.
putRun
(
AllSheet
(
body
),
goalPath
,
"全部"
);
}
}
/**
/**
...
@@ -73,17 +75,15 @@ public class ExcelTest {
...
@@ -73,17 +75,15 @@ public class ExcelTest {
* @return
* @return
* Map<Integer,String> 返回类型
* Map<Integer,String> 返回类型
*/
*/
public
Map
<
Integer
,
String
>
exportTitleData
(
List
<
Map
<
String
,
Object
>>
body
){
public
List
<
String
>
exportTitleData
(
List
<
Map
<
String
,
Object
>>
body
){
Map
<
Integer
,
String
>
titles
=
new
HashMap
<>();
List
<
String
>
titles
=
new
ArrayList
<>();
int
i
=
0
;
for
(
Map
<
String
,
Object
>
map
:
body
)
for
(
Map
<
String
,
Object
>
map
:
body
)
{
{
for
(
String
key
:
map
.
keySet
())
for
(
String
key
:
map
.
keySet
())
{
{
if
(
key
.
equals
(
"标题"
))
if
(
key
.
equals
(
"标题"
))
{
{
titles
.
put
(
i
,
map
.
get
(
key
).
toString
());
titles
.
add
(
map
.
get
(
key
).
toString
());
i
++;
}
}
}
}
}
}
...
@@ -92,47 +92,6 @@ public class ExcelTest {
...
@@ -92,47 +92,6 @@ public class ExcelTest {
/**
/**
*
*
* @Title: noGroupSheet
* @Description: TODO(未聚合的个体的sheet)
* @param map
* @param body
* @return
* List<DBObject> 返回类型
*/
public
static
List
<
DBObject
>
noGroupSheet
(
Map
<
Integer
,
Map
<
Integer
,
String
>>
map
,
List
<
Map
<
String
,
Object
>>
body
)
{
List
<
DBObject
>
listDB
=
new
ArrayList
<>();
for
(
Integer
str
:
map
.
keySet
())
{
Map
<
Integer
,
String
>
titles
=
map
.
get
(
str
);
if
(
titles
.
size
()
>
1
)
{
continue
;
}
for
(
Integer
key
:
titles
.
keySet
())
{
DBObject
obj
=
new
BasicDBObject
();
Map
<
String
,
Object
>
bean
=
body
.
get
(
key
);
for
(
String
keyObj
:
bean
.
keySet
())
{
if
(
keyObj
==
null
||
""
.
equals
(
keyObj
))
{
continue
;
}
obj
.
put
(
keyObj
,
bean
.
get
(
keyObj
));
}
listDB
.
add
(
obj
);
}
}
return
listDB
;
}
/**
*
* @Title: GroupSheet
* @Title: GroupSheet
* @Description: TODO(聚合的个体的sheet)
* @Description: TODO(聚合的个体的sheet)
* @param map
* @param map
...
@@ -140,27 +99,20 @@ public class ExcelTest {
...
@@ -140,27 +99,20 @@ public class ExcelTest {
* @return 设定文件
* @return 设定文件
* @return List<DBObject> 返回类型
* @return List<DBObject> 返回类型
*/
*/
public
static
List
<
DBObject
>
GroupSheet
(
Map
<
Integer
,
Map
<
Integer
,
String
>>
map
,
public
static
List
<
DBObject
>
GroupSheet
(
List
<
Result
>
list
,
List
<
Map
<
String
,
Object
>>
body
)
List
<
Map
<
String
,
Object
>>
body
)
{
{
//简繁体翻译
//简繁体翻译
ChineseTranslate
simplifiedTrans
=
ChineseTranslate
.
getInstance
(
goal
.
简体
);
ChineseTranslate
simplifiedTrans
=
ChineseTranslate
.
getInstance
(
goal
.
简体
);
List
<
DBObject
>
listDB
=
new
ArrayList
<>();
List
<
DBObject
>
listDB
=
new
ArrayList
<>();
for
(
Integer
str
:
map
.
keySet
())
int
i
=
0
;
{
for
(
Result
result
:
list
)
Map
<
Integer
,
String
>
titles
=
map
.
get
(
str
);
if
(
titles
.
size
()
<=
1
)
{
continue
;
}
for
(
Integer
key
:
titles
.
keySet
())
{
{
DBObject
obj
=
new
BasicDBObject
();
DBObject
obj
=
new
BasicDBObject
();
Map
<
String
,
Object
>
bean
=
body
.
get
(
key
);
Map
<
String
,
Object
>
bean
=
body
.
get
(
i
);
String
title
=
simplifiedTrans
.
trans
(
body
.
get
(
str
).
get
(
"标题"
).
toString
());
String
title
=
simplifiedTrans
.
trans
(
result
.
getClusterName
());
obj
.
put
(
"聚合标签"
,
title
);
obj
.
put
(
"聚合标签"
,
title
);
for
(
String
keyObj
:
bean
.
keySet
())
for
(
String
keyObj
:
bean
.
keySet
())
{
{
...
@@ -171,7 +123,7 @@ public class ExcelTest {
...
@@ -171,7 +123,7 @@ public class ExcelTest {
obj
.
put
(
keyObj
,
bean
.
get
(
keyObj
));
obj
.
put
(
keyObj
,
bean
.
get
(
keyObj
));
}
}
listDB
.
add
(
obj
);
listDB
.
add
(
obj
);
}
i
++;
}
}
return
listDB
;
return
listDB
;
}
}
...
...
src/test/java/StartTest/MongoStart.java
View file @
6bf229eb
...
@@ -11,6 +11,7 @@ import org.junit.Test;
...
@@ -11,6 +11,7 @@ import org.junit.Test;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.mongodb.DBObject
;
import
com.zhiweidata.titleAggregation.bean.Result
;
import
com.zhiweidata.titleAggregation.main.HCluster
;
import
com.zhiweidata.titleAggregation.main.HCluster
;
import
com.zhiweidata.titleAggregation.util.ChineseTranslate
;
import
com.zhiweidata.titleAggregation.util.ChineseTranslate
;
import
com.zhiweidata.titleAggregation.util.ChineseTranslate.goal
;
import
com.zhiweidata.titleAggregation.util.ChineseTranslate.goal
;
...
@@ -99,60 +100,15 @@ public class MongoStart {
...
@@ -99,60 +100,15 @@ public class MongoStart {
//调用算法
//调用算法
HCluster
hc
=
new
HCluster
();
HCluster
hc
=
new
HCluster
();
long
time
=
System
.
currentTimeMillis
();
long
time
=
System
.
currentTimeMillis
();
Map
<
Integer
,
Map
<
Integer
,
String
>>
mapResult
=
hc
.
changeData
(
texts
,
9
,
0.93
);
List
<
Result
>
list
=
hc
.
DataToResult
(
texts
,
9
,
0.93
);
long
t
=
System
.
currentTimeMillis
()-
time
;
long
t
=
System
.
currentTimeMillis
()-
time
;
System
.
out
.
println
(
"事件名:"
+
name
+
"——"
+
pt
+
"数据,数据量:"
+
texts
.
size
()+
" ,输出时间:"
+
t
);
System
.
out
.
println
(
"事件名:"
+
name
+
"——"
+
pt
+
"数据,数据量:"
+
texts
.
size
()+
" ,输出时间:"
+
t
);
DBOExp
dbo
=
new
DBOExp
();
DBOExp
dbo
=
new
DBOExp
();
dbo
.
putRun
(
noGroupSheet
(
mapResult
,
listEvent
),
path
,
"未聚合"
);
dbo
.
putRun
(
GroupSheet
(
list
,
listEvent
),
path
,
"聚合"
);
dbo
.
putRun
(
GroupSheet
(
mapResult
,
listEvent
),
path
,
"聚合"
);
dbo
.
putRun
(
AllSheet
(
listEvent
),
path
,
"全部"
);
dbo
.
putRun
(
AllSheet
(
listEvent
),
path
,
"全部"
);
}
}
/**
*
* @Title: noGroupSheet
* @Description: TODO(未聚合的个体的sheet)
* @param map
* @param listEvent
* @return
* List<DBObject> 返回类型
*/
public
static
List
<
DBObject
>
noGroupSheet
(
Map
<
Integer
,
Map
<
Integer
,
String
>>
map
,
List
<
MediaAndWechatEvent
>
listEvent
)
{
SimpleDateFormat
sdf
=
new
SimpleDateFormat
(
"yyyy-MM-dd HH:mm:ss"
);
List
<
DBObject
>
listDB
=
new
ArrayList
<>();
for
(
Integer
str
:
map
.
keySet
())
{
Map
<
Integer
,
String
>
titles
=
map
.
get
(
str
);
if
(
titles
.
size
()
>
1
)
{
continue
;
}
for
(
Integer
key
:
titles
.
keySet
())
{
MediaAndWechatEvent
event
=
listEvent
.
get
(
key
);
DBObject
obj
=
new
BasicDBObject
();
String
saveTime
=
sdf
.
format
((
new
Date
(
event
.
getSavetime
())));
obj
.
put
(
"标题"
,
event
.
getTitle
());
obj
.
put
(
"url"
,
event
.
getUrl
());
obj
.
put
(
"发布时间"
,
sdf
.
format
(
event
.
getTime
()));
obj
.
put
(
"来源"
,
event
.
getSource
());
obj
.
put
(
"类型"
,
event
.
getType
());
obj
.
put
(
"保存时间"
,
saveTime
);
obj
.
put
(
"平台"
,
event
.
getPt
());
obj
.
put
(
"事件id"
,
event
.
getEventId
());
obj
.
put
(
"H因子"
,
event
.
getH
());
listDB
.
add
(
obj
);
}
}
return
listDB
;
}
/**
/**
*
*
...
@@ -163,7 +119,7 @@ public class MongoStart {
...
@@ -163,7 +119,7 @@ public class MongoStart {
* @ @return 设定文件
* @ @return 设定文件
* @return List<DBObject> 返回类型
* @return List<DBObject> 返回类型
*/
*/
public
static
List
<
DBObject
>
GroupSheet
(
Map
<
Integer
,
Map
<
Integer
,
String
>>
map
,
public
static
List
<
DBObject
>
GroupSheet
(
List
<
Result
>
list
,
List
<
MediaAndWechatEvent
>
listEvent
)
List
<
MediaAndWechatEvent
>
listEvent
)
{
{
//简繁体翻译
//简繁体翻译
...
@@ -171,21 +127,14 @@ public class MongoStart {
...
@@ -171,21 +127,14 @@ public class MongoStart {
SimpleDateFormat
sdf
=
new
SimpleDateFormat
(
"yyyy-MM-dd HH:mm:ss"
);
SimpleDateFormat
sdf
=
new
SimpleDateFormat
(
"yyyy-MM-dd HH:mm:ss"
);
List
<
DBObject
>
listDB
=
new
ArrayList
<>();
List
<
DBObject
>
listDB
=
new
ArrayList
<>();
for
(
Integer
str
:
map
.
keySet
())
for
(
Result
result
:
list
)
{
Map
<
Integer
,
String
>
titles
=
map
.
get
(
str
);
if
(
titles
.
size
()
<=
1
)
{
continue
;
}
for
(
Integer
key
:
titles
.
keySet
())
{
{
MediaAndWechatEvent
event
=
listEvent
.
get
(
key
);
DBObject
obj
=
new
BasicDBObject
();
DBObject
obj
=
new
BasicDBObject
();
MediaAndWechatEvent
event
=
listEvent
.
get
(
result
.
getI
());
String
saveTime
=
sdf
.
format
((
new
Date
(
event
.
getSavetime
())));
String
saveTime
=
sdf
.
format
((
new
Date
(
event
.
getSavetime
())));
String
titleGroup
=
simplifiedTrans
.
trans
(
listEvent
.
get
(
str
).
getTitle
());
String
text
=
simplifiedTrans
.
trans
(
result
.
getClusterName
());
obj
.
put
(
"聚合标题"
,
titleGroup
);
obj
.
put
(
"聚合标题"
,
text
);
obj
.
put
(
"标题"
,
event
.
getTitle
());
obj
.
put
(
"标题"
,
event
.
getTitle
());
obj
.
put
(
"url"
,
event
.
getUrl
());
obj
.
put
(
"url"
,
event
.
getUrl
());
obj
.
put
(
"发布时间"
,
sdf
.
format
(
event
.
getTime
()));
obj
.
put
(
"发布时间"
,
sdf
.
format
(
event
.
getTime
()));
...
@@ -198,7 +147,6 @@ public class MongoStart {
...
@@ -198,7 +147,6 @@ public class MongoStart {
listDB
.
add
(
obj
);
listDB
.
add
(
obj
);
}
}
}
return
listDB
;
return
listDB
;
}
}
/**
/**
...
...
src/test/java/StartTest/ResultTest.java
View file @
6bf229eb
...
@@ -9,8 +9,8 @@ import org.junit.Test;
...
@@ -9,8 +9,8 @@ import org.junit.Test;
import
org.nlpcn.commons.lang.util.AnsjArrays
;
import
org.nlpcn.commons.lang.util.AnsjArrays
;
import
com.zhiweidata.titleAggregation.main.ClusterUtil
;
import
com.zhiweidata.titleAggregation.main.ClusterUtil
;
import
com.zhiweidata.titleAggregation.m
ain
.CosineSimilarity
;
import
com.zhiweidata.titleAggregation.m
ethod
.CosineSimilarity
;
import
com.zhiweidata.titleAggregation.m
ain
.MySimHash
;
import
com.zhiweidata.titleAggregation.m
ethod
.MySimHash
;
import
com.zhiweidata.titleAggregation.util.AnsjSeg
;
import
com.zhiweidata.titleAggregation.util.AnsjSeg
;
public
class
ResultTest
{
public
class
ResultTest
{
...
@@ -28,11 +28,12 @@ public class ResultTest {
...
@@ -28,11 +28,12 @@ public class ResultTest {
@Test
@Test
public
void
test3
()
{
public
void
test3
()
{
String
s1
=
"大学生娶同学妈妈?传了几年的假新闻"
;
String
s1
=
"大学生娶同学妈妈?传了几年的假新闻"
;
//文本的集合
List
<
String
>
s2
=
new
ArrayList
<>();
List
<
String
>
s2
=
new
ArrayList
<>();
s2
.
add
(
s1
);
s2
.
add
(
s1
);
C
lusterUtil
clusterUtil
=
new
ClusterUtil
();
C
osineSimilarity
cos
=
new
CosineSimilarity
();
System
.
out
.
println
(
c
lusterUtil
.
getDistance
(
s2
,
s1
)[
0
]);
System
.
out
.
println
(
c
os
.
getDistance
(
s2
,
s1
)[
0
]);
}
}
...
...
使用说明.txt
View file @
6bf229eb
使用说明
使用说明
...
@@ -15,13 +15,19 @@ ClusterUtil 是对算法结果的优化,一般在此对结果进行迭代
...
@@ -15,13 +15,19 @@ ClusterUtil 是对算法结果的优化,一般在此对结果进行迭代
ComputeWordsVector 计算文本向量的工具类
ComputeWordsVector 计算文本向量的工具类
CosineSimilarity 余弦算法的工具类
CosineSimilarity 余弦算法的工具类
MySimHash simhash算法的工具类
MySimHash simhash算法的工具类
CutPage 切割算法的工具类
使用:
直接调用 ClusterResult.getResult(List<String> texts);
返回的是List<Result>
Result是结果集的对象,里面有:
clusterName:类簇名
dataPointName:节点名
i:List中的索引
simhash:simhash距离(越小越好)
cosSimilarity:余弦算法相似度(越接近1越相似)
cut:切割算法相似度(越接近1越相似)
在各个算法的工具类中,封装了各个算法的相似度计算
使用方法
在StartTest中,给出了示例,自行参考
注意
算法支持繁简体的聚类,但不支持对聚类标题的转换。如果对聚类标题有繁简体的要求,
可调用繁简体转换类的方法,自行转换
例:
ChineseTranslate simplifiedTrans = ChineseTranslate.getInstance(goal.简体);
String title = simplifiedTrans.trans(text);
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment