Commit 6bf229eb by win7

短文本聚合,长文本相似度

parent 98076b77
......@@ -3,7 +3,10 @@
<groupId>com.zhiweidata</groupId>
<artifactId>titleAggregation</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>titleAggregation</name>
<description>标题聚合工具类</description>
<dependencies>
<!-- 测试用jar包 -->
<dependency>
......@@ -46,4 +49,59 @@
<version>5.0.2</version>
</dependency>
</dependencies>
<!-- 打包管理 -->
<build>
<plugins>
<!-- 发布源码 -->
<plugin>
<artifactId>maven-source-plugin</artifactId>
<version>2.4</version>
<configuration>
<attach>true</attach>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version>
</plugin>
<!-- 解决执行maven test命令时console出现中文乱码的问题 -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<configuration>
<forkMode>once</forkMode>
<argLine>-Dfile.encoding=UTF-8</argLine>
<skipTests>true</skipTests>
</configuration>
</plugin>
</plugins>
</build>
<!-- 分发管理:管理distribution和supporting files -->
<distributionManagement>
<snapshotRepository>
<id>nexus-releases</id>
<name>User Project Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Project Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository>
</distributionManagement>
</project>
\ No newline at end of file
......@@ -16,10 +16,11 @@ public class DataPoint {
}
public DataPoint(String dataPointName, Map<String, Double> dimensioin) {
public DataPoint(String dataPointName, Map<String, Double> dimensioin, Integer i) {
super();
this.dataPointName = dataPointName;
this.dimensioin = dimensioin;
this.i = i;
}
public String getDataPointName() {
......@@ -51,5 +52,4 @@ public class DataPoint {
return "DataPoint [dataPointName=" + dataPointName + ", dimensioin=" + dimensioin + ", i=" + i + "]";
}
}
/**
* @Title: Result.java
* @Package com.zhiweidata.titleAggregation.bean
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2017年12月28日 上午11:52:42
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.titleAggregation.bean;
/**
 * Bean that stores one row of a clustering result: which cluster a data
 * point belongs to, plus the similarity figures recorded for it.
 *
 * @author xuyimeng
 */
public class Result {

    /** Cluster name. */
    private String clusterName;
    /** Data point (node) name. */
    private String datapointName;
    /** Index position. */
    private Integer i;
    /** SimHash distance. */
    private int simhash;
    /** Cosine similarity. */
    private double cosSimilarity;
    /** Similarity from the "cut" (text splitting) algorithm. */
    private double cut;

    /** Creates an empty result; every field keeps its Java default value. */
    public Result() {
    }

    /**
     * Creates a fully populated result.
     *
     * @param clusterName   cluster name
     * @param datapointName data point name
     * @param simhash       SimHash distance
     * @param cosSimilarity cosine similarity
     * @param cut           cut-algorithm similarity
     * @param i             index position
     */
    public Result(String clusterName, String datapointName, int simhash, double cosSimilarity, double cut,
            Integer i) {
        this.i = i;
        this.clusterName = clusterName;
        this.datapointName = datapointName;
        this.simhash = simhash;
        this.cosSimilarity = cosSimilarity;
        this.cut = cut;
    }

    public String getClusterName() {
        return this.clusterName;
    }

    public void setClusterName(String clusterName) {
        this.clusterName = clusterName;
    }

    public String getDatapointName() {
        return this.datapointName;
    }

    public void setDatapointName(String datapointName) {
        this.datapointName = datapointName;
    }

    public Integer getI() {
        return this.i;
    }

    public void setI(Integer i) {
        this.i = i;
    }

    public int getSimhash() {
        return this.simhash;
    }

    public void setSimhash(int simhash) {
        this.simhash = simhash;
    }

    public double getCosSimilarity() {
        return this.cosSimilarity;
    }

    public void setCosSimilarity(double cosSimilarity) {
        this.cosSimilarity = cosSimilarity;
    }

    public double getCut() {
        return this.cut;
    }

    public void setCut(double cut) {
        this.cut = cut;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Result [clusterName=");
        text.append(clusterName);
        text.append(", datapointName=").append(datapointName);
        text.append(", i=").append(i);
        text.append(", simhash=").append(simhash);
        text.append(", cosSimilarity=").append(cosSimilarity);
        text.append(", cut=").append(cut);
        return text.append("]").toString();
    }
}
/**
* @Title: ClusterResult.java
* @Package com.zhiweidata.titleAggregation.main
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2017年12月28日 下午12:14:54
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.titleAggregation.main;
import java.util.List;
import java.util.Map;
import com.zhiweidata.titleAggregation.bean.Result;
import com.zhiweidata.titleAggregation.util.BasicUtil;
/**
 * Entry point for clustering a list of texts: converts the list to an
 * index-keyed map and delegates to {@link HCluster#DataToResult}.
 *
 * <p>All methods are static. The implicit public constructor is kept on
 * purpose so that existing code creating an instance still compiles.
 *
 * @author xuyimeng
 */
public class ClusterResult {

    /** Recommended SimHash distance threshold, used when the caller gives none or a negative one. */
    private static final int DEFAULT_SIMHASH_THRESHOLD = 9;

    /** Recommended cosine-similarity threshold, used when the caller gives none or a negative one. */
    private static final double DEFAULT_COS_THRESHOLD = 0.93;

    /**
     * Clusters the texts with the recommended thresholds (9 and 0.93).
     *
     * @param list the texts (titles) to cluster
     * @return one {@link Result} per clustered data point; empty when {@code list} is null or empty
     */
    public static List<Result> getResult(List<String> list) {
        return getResult(list, DEFAULT_SIMHASH_THRESHOLD, DEFAULT_COS_THRESHOLD);
    }

    /**
     * Clusters the texts with a custom SimHash threshold and the recommended cosine threshold.
     *
     * @param list the texts to cluster
     * @param freq SimHash distance threshold (9 is recommended); a negative value selects the default
     * @return one {@link Result} per clustered data point; empty when {@code list} is null or empty
     */
    public static List<Result> getResult(List<String> list, int freq) {
        // Negative values are normalised by the three-argument overload.
        return getResult(list, freq, DEFAULT_COS_THRESHOLD);
    }

    /**
     * Clusters the texts with custom thresholds.
     *
     * @param list    the texts to cluster
     * @param freq    SimHash distance threshold (9 is recommended); a negative value selects the default
     * @param cosFreq cosine-similarity threshold (0.93 is recommended); a negative value selects the default
     * @return one {@link Result} per clustered data point; empty when {@code list} is null or empty
     */
    public static List<Result> getResult(List<String> list, int freq, double cosFreq) {
        // Nothing to cluster: answer directly instead of running the pipeline
        // (a null list used to fail with a NullPointerException in BasicUtil.toMap).
        if (list == null || list.isEmpty()) {
            return new java.util.ArrayList<Result>();
        }
        int simhashThreshold = freq < 0 ? DEFAULT_SIMHASH_THRESHOLD : freq;
        double cosThreshold = cosFreq < 0 ? DEFAULT_COS_THRESHOLD : cosFreq;

        HCluster hCluster = new HCluster();
        Map<Integer, String> map = BasicUtil.toMap(list);
        return hCluster.DataToResult(map, simhashThreshold, cosThreshold);
    }
}
......@@ -8,6 +8,7 @@ import java.util.Map;
import com.zhiweidata.titleAggregation.bean.Cluster;
import com.zhiweidata.titleAggregation.bean.DataPoint;
import com.zhiweidata.titleAggregation.method.CosineSimilarity;
/**
......@@ -17,6 +18,35 @@ import com.zhiweidata.titleAggregation.bean.DataPoint;
* @date 2017年12月26日 上午9:19:41
*/
public class ClusterUtil {
/**
 * Merges clusters whose names are similar to each other.
 *
 * <p>The pairwise cosine similarity of all cluster names is computed once.
 * Every pair {@code (i, j)} with {@code i < j}, where both clusters still
 * hold data points and the similarity is above {@code freq}, is passed to
 * {@code mergeCluster(clusters, i, j)}.
 *
 * @param clusters the clusters to inspect; merged through {@code mergeCluster}
 * @param freq     cosine-similarity threshold; a pair is merged only when its
 *                 similarity is strictly greater than this value
 */
public void mergeLikeCluster(List<Cluster> clusters,double freq) {
    List<String> texts = new ArrayList<>();
    for (Cluster cluster : clusters)
    {
        texts.add(cluster.getClusterName());
    }
    CosineSimilarity cos = new CosineSimilarity();
    // Only the upper triangle (i < j) of this matrix is read below.
    double[][] distance = cos.getDistance(texts);
    for (int i=0;i<clusters.size()-1;i++)
    {
        if (clusters.get(i).getDataPoints().isEmpty())
        {
            continue;
        }
        for (int j=i+1;j<clusters.size();j++)
        {
            if (clusters.get(j).getDataPoints().isEmpty())
            {
                continue;
            }
            // Use the caller-supplied threshold; it was previously ignored in
            // favour of a hard-coded 0.93.
            if (distance[i][j] > freq)
            {
                // NOTE(review): mergeCluster is defined outside this block;
                // presumably it moves cluster j's data points into cluster i - confirm.
                mergeCluster(clusters, i, j);
            }
        }
    }
}
/**
* @Title: alertLikeData
* @Description: TODO(检测类簇与各个个体之间的相似度)
......@@ -25,6 +55,7 @@ public class ClusterUtil {
*/
public void alertLikeData(List<Cluster> clusters,double freq)
{
CosineSimilarity cos = new CosineSimilarity();
//存储从类簇中被删除dataPoint的集合
List<DataPoint> list = new ArrayList<>();
//存储类簇名的集合
......@@ -47,7 +78,7 @@ public class ClusterUtil {
dataPointsNames.add(text);
}
double[] distances = getDistance(dataPointsNames,cluster.getClusterName());
double[] distances = cos.getDistance(dataPointsNames,cluster.getClusterName());
Iterator<DataPoint> it = dataPoints.iterator();
int i = 0;
......@@ -64,14 +95,14 @@ public class ClusterUtil {
cluster.setDataPoints(dataPoints);
clusters.set(clusters.indexOf(cluster), cluster);
}
List<DataPoint> listNew = new ArrayList<>();
//遍历判断被删除的节点是否与其它类簇相似,相似就添加到这个类簇中
for (DataPoint dataPoint : list)
{
double[] distances = getDistance(clusterNames,dataPoint.getDataPointName());
double[] distances = cos.getDistance(clusterNames,dataPoint.getDataPointName());
for (int i=0;i<clusterNames.size();i++)
{
......@@ -96,46 +127,6 @@ public class ClusterUtil {
completedData(listNew, clusters);
}
/**
 * Computes the cosine similarity of every pair of titles.
 * The closer a value is to 1, the more similar the two titles are.
 *
 * <p>Only cells with {@code row < column} are filled; the diagonal and the
 * lower triangle keep the default value 0.0.
 *
 * @param titles the texts to compare with each other
 * @return a square matrix sized by {@code titles.size()}
 */
public double[][] getDistance(List<String> titles) {
    CosineSimilarity similarity = new CosineSimilarity();
    final int count = titles.size();
    double[][] matrix = new double[count][count];
    for (int row = 0; row + 1 < count; row++)
    {
        String left = titles.get(row);
        for (int col = row + 1; col < count; col++)
        {
            matrix[row][col] = similarity.CalculateTextSim(left, titles.get(col));
        }
    }
    return matrix;
}
/**
 * Computes the cosine similarity between one reference text and each element
 * of a list (used for a cluster name against the members of the cluster).
 * The closer a value is to 1, the more similar the texts are.
 *
 * @param list the texts to compare against the reference
 * @param doc1 the reference text
 * @return one similarity per element of {@code list}, in list order
 */
public double[] getDistance(List<String> list,String doc1){
    CosineSimilarity similarity = new CosineSimilarity();
    double[] scores = new double[list.size()];
    int pos = 0;
    for (String candidate : list)
    {
        scores[pos] = similarity.CalculateTextSim(doc1, candidate);
        pos++;
    }
    return scores;
}
// /**计算两个文本的相似度
// * 注释部分是 向量夹角余弦计算,目前采用的是向量内积计算
......
......@@ -7,37 +7,64 @@ import java.util.Map;
import com.zhiweidata.titleAggregation.bean.Cluster;
import com.zhiweidata.titleAggregation.bean.DataPoint;
import com.zhiweidata.titleAggregation.bean.Result;
import com.zhiweidata.titleAggregation.method.ComputeWordsVector;
import com.zhiweidata.titleAggregation.method.CosineSimilarity;
import com.zhiweidata.titleAggregation.method.MySimHash;
import com.zhiweidata.titleAggregation.util.ChineseTranslate;
import com.zhiweidata.titleAggregation.util.Util;
import com.zhiweidata.titleAggregation.util.BasicUtil;
import com.zhiweidata.titleAggregation.util.ChineseTranslate.goal;
/**
* @ClassName: HCluster
* @Description: TODO(算法主程序)
* @Description: TODO(标题算法主程序)
* @author xuyimeng
* @date 2017年12月26日 上午9:47:58
*/
public class HCluster {
/**
 * Clusters the texts with {@code changeData} and flattens the clusters into
 * one {@link Result} per data point.
 *
 * <p>Each result carries the cluster name, the data point name and index, the
 * SimHash distance and the cosine similarity between the two names. The
 * {@code cut} field of the result is not set here.
 *
 * @param texts   texts keyed by their index
 * @param freq    SimHash distance threshold passed on to {@code changeData}
 * @param cosFreq cosine-similarity threshold passed on to {@code changeData}
 * @return results for all data points of all non-empty clusters
 */
public List<Result> DataToResult(Map<Integer,String> texts,int freq,double cosFreq){
    List<Cluster> clusters = changeData(texts, freq, cosFreq);
    MySimHash hash = new MySimHash();
    CosineSimilarity cos = new CosineSimilarity();
    List<Result> results = new ArrayList<>();
    for (Cluster cluster : clusters)
    {
        List<DataPoint> members = cluster.getDataPoints();
        if (members.isEmpty())
        {
            // Clusters without data points contribute nothing.
            continue;
        }
        String clusterName = cluster.getClusterName();
        for (DataPoint member : members)
        {
            String memberName = member.getDataPointName();
            Integer index = member.getI();
            int simhashDistance = hash.getDistance(clusterName, memberName);
            double cosine = cos.CalculateTextSim(clusterName, memberName);

            Result row = new Result();
            row.setClusterName(clusterName);
            row.setDatapointName(memberName);
            row.setI(index);
            row.setSimhash(simhashDistance);
            row.setCosSimilarity(cosine);
            results.add(row);
        }
    }
    return results;
}
/**
* 将标题以长度分组选择不同的相似度
* @Title: changeData
* @Description: TODO()
* @Description: TODO(将标题以长度分组选择不同的相似度)
* @param texts
* @param freq simhash距离,默认(推荐)为9
* @param cosFreq 余弦的相似度 默认(推荐)为0.93
* @return
* Map<Integer,Map<Integer,String>> 返回类型
*/
public Map<Integer,Map<Integer,String>> changeData(Map<Integer,String> texts,int freq,double cosFreq) {
if (freq <= 0)
{
freq = 9;
}
public List<Cluster> changeData(Map<Integer,String> texts,int freq,double cosFreq) {
if (cosFreq <= 0.0)
{
cosFreq = 0.93;
}
//简繁体翻译
ChineseTranslate simplifiedTrans = ChineseTranslate.getInstance(goal.简体);
......@@ -73,23 +100,10 @@ public class HCluster {
clusters.addAll(startCluster(longText, freq));
ClusterUtil util = new ClusterUtil();
util.mergeLikeCluster(clusters, cosFreq);
util.alertLikeData(clusters,cosFreq);
Map<Integer,Map<Integer,String>> map = new HashMap<>();
for (Cluster cl : clusters) {
List<DataPoint> tempDps = cl.getDataPoints();
if (tempDps.size() >= 1)
{
Integer key = cl.getI();
Map<Integer,String> mapDataPoint = new HashMap<>();
for (DataPoint tempdp : tempDps) {
mapDataPoint.put(tempdp.getI(), tempdp.getDataPointName());
}
map.put(key, mapDataPoint);
}
}
return map;
return clusters;
}
......@@ -97,6 +111,8 @@ public class HCluster {
/** 聚类的主方法*/
private List<Cluster> startCluster(Map<Integer, String> titles, int freq) {
ClusterUtil util = new ClusterUtil();
MySimHash hash = new MySimHash();
List<DataPoint> dp = readData(titles);
// 声明cluster类,存放类名和类簇中含有的样本
List<Cluster> finalClusters = new ArrayList<>();
......@@ -106,7 +122,7 @@ public class HCluster {
// flag为判断标志
boolean flag = true;
int it = 0;
int[][] distances = getDistance(Util.toList(titles));
int[][] distances = hash.getDistance(BasicUtil.toList(titles));
while (flag) {
// mergeIndexA和mergeIndexB表示每一次迭代聚类最小的两个类簇,也就是每一次迭代要合并的两个类簇
int mergeIndexA = 0;
......@@ -160,64 +176,6 @@ public class HCluster {
return finalClusters;
}
/**
 * Computes the SimHash Hamming distance of every pair of titles.
 *
 * <p>Only cells with {@code row < column} are filled; the diagonal and the
 * lower triangle keep the default value 0.
 *
 * @param titles the texts to compare with each other
 * @return a square matrix sized by {@code titles.size()}
 */
public int[][] getDistance(List<String> titles) {
    // Build one fingerprint per title up front so each text is tokenised once.
    List<MySimHash> fingerprints = new ArrayList<>();
    for (String title : titles)
    {
        MySimHash fingerprint = new MySimHash();
        fingerprint.setTokens(title);
        fingerprints.add(fingerprint);
    }
    final int count = titles.size();
    int[][] matrix = new int[count][count];
    for (int row = 0; row + 1 < count; row++)
    {
        MySimHash left = fingerprints.get(row);
        for (int col = row + 1; col < count; col++)
        {
            matrix[row][col] = left.hammingDistance(fingerprints.get(col));
        }
    }
    return matrix;
}
/**
 * Computes the SimHash Hamming distance between one text and each title
 * (one-to-many).
 *
 * @param titles the texts to compare against {@code text}
 * @param text   the reference text
 * @return one distance per element of {@code titles}, in list order
 */
public int[] getDistance(List<String> titles,String text) {
    List<MySimHash> fingerprints = new ArrayList<>();
    for (String title : titles)
    {
        MySimHash fingerprint = new MySimHash();
        fingerprint.setTokens(title);
        fingerprints.add(fingerprint);
    }
    MySimHash reference = new MySimHash();
    reference.setTokens(text);
    int[] distances = new int[titles.size()];
    for (int idx = 0; idx < distances.length; idx++)
    {
        distances[idx] = reference.hammingDistance(fingerprints.get(idx));
    }
    return distances;
}
/**初始化类簇*/
private List<Cluster> initialCluster(List<DataPoint> dpoints) {
// 声明存放初始化类簇的链表
......@@ -238,7 +196,6 @@ public class HCluster {
tempCluster.setI(tempDataPoint.getI());
// 将新的类簇加入到初始化类簇链表中
originalClusters.add(tempCluster);
}
return originalClusters;
......
package com.zhiweidata.titleAggregation.main;
package com.zhiweidata.titleAggregation.method;
import java.util.*;
......@@ -6,7 +6,7 @@ import org.ansj.domain.Term;
import com.zhiweidata.titleAggregation.bean.DataPoint;
import com.zhiweidata.titleAggregation.util.AnsjSeg;
import com.zhiweidata.titleAggregation.util.Util;
import com.zhiweidata.titleAggregation.util.BasicUtil;
/**
* @ClassName: ComputeWordsVector
......@@ -24,7 +24,7 @@ public class ComputeWordsVector {
public List<DataPoint> computeTFMultiIDF(Map<Integer, String> testSampleDir){
List<DataPoint> dataPoints = new ArrayList<>();
Map<String,Double> idfPerWordMap = computeIDF(Util.toList(testSampleDir));
Map<String,Double> idfPerWordMap = computeIDF(BasicUtil.toList(testSampleDir));
Map<String,Double> tfPerDocMap = new TreeMap<String, Double>();
AnsjSeg ansj = AnsjSeg.getInstance();
......
package com.zhiweidata.titleAggregation.main;
package com.zhiweidata.titleAggregation.method;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
/**
......@@ -109,4 +110,45 @@ public class CosineSimilarity {
}
return -1;
}
/**
 * Computes the cosine similarity of every pair of titles.
 * The closer a value is to 1, the more similar the two titles are.
 *
 * <p>Only cells with {@code row < column} are filled; the diagonal and the
 * lower triangle keep the default value 0.0.
 *
 * @param titles the texts to compare with each other
 * @return a square matrix sized by {@code titles.size()}
 */
public double[][] getDistance(List<String> titles) {
    CosineSimilarity similarity = new CosineSimilarity();
    final int count = titles.size();
    double[][] matrix = new double[count][count];
    for (int row = 0; row + 1 < count; row++)
    {
        String left = titles.get(row);
        for (int col = row + 1; col < count; col++)
        {
            matrix[row][col] = similarity.CalculateTextSim(left, titles.get(col));
        }
    }
    return matrix;
}
/**
 * Computes the cosine similarity between one reference text and each element
 * of a list (used for a cluster name against the members of the cluster).
 * The closer a value is to 1, the more similar the texts are.
 *
 * @param list the texts to compare against the reference
 * @param doc1 the reference text
 * @return one similarity per element of {@code list}, in list order
 */
public double[] getDistance(List<String> list,String doc1){
    CosineSimilarity similarity = new CosineSimilarity();
    double[] scores = new double[list.size()];
    int pos = 0;
    for (String candidate : list)
    {
        scores[pos] = similarity.CalculateTextSim(doc1, candidate);
        pos++;
    }
    return scores;
}
}
\ No newline at end of file
/**
* @Title: CutPage.java
* @Package util
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2017年12月27日 下午2:24:06
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.titleAggregation.method;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.zhiweidata.titleAggregation.method.MySimHash;
/**
 * "Cut" similarity for long texts: both texts are split into fragments and
 * the fragments are compared through SimHash similarity.
 *
 * @author xuyimeng
 */
public class CutPage {

    /**
     * Scores how well {@code testText} covers {@code goalText}.
     *
     * <p>Each fragment of the goal text is compared with all fragments of the
     * test text through {@code MySimHash.getSimilarity}; the best match per
     * goal fragment is kept and the best matches are averaged.
     *
     * @param goalText the reference text
     * @param testText the text to compare against the reference
     * @return the average best-match similarity, or 0 when the goal text
     *         yields no fragments (for example a text consisting only of "。")
     */
    public double getDistance(String goalText,String testText) {
        List<String> goalTexts = splitString(goalText);
        List<String> textTexts = splitString(testText);
        if (goalTexts.isEmpty())
        {
            // Without this guard the average below is 0.0 / 0, which is NaN.
            return 0;
        }
        MySimHash hash = new MySimHash();
        double result = 0;
        for (String goal : goalTexts)
        {
            double maxSim = 0;
            for (double sim : hash.getSimilarity(textTexts, goal))
            {
                if (sim > maxSim)
                {
                    maxSim = sim;
                }
            }
            result += maxSim;
        }
        return result / goalTexts.size();
    }

    /**
     * Splits a text into fragments: first on the full stop "。", then each
     * sentence that contains ";" is split again on ";".
     *
     * <p>A sentence that was split on ";" is represented by its parts only.
     * The unsplit sentence used to be added as well, which counted the same
     * content twice.
     *
     * @param text the text to split; must not be null
     * @return the fragments in their original order
     */
    public List<String> splitString(String text){
        List<String> result = new ArrayList<>();
        for (String sentence : text.split("。"))
        {
            if (sentence.contains(";"))
            {
                result.addAll(Arrays.asList(sentence.split(";")));
            }
            else
            {
                result.add(sentence);
            }
        }
        return result;
    }
}
package com.zhiweidata.titleAggregation.main;
package com.zhiweidata.titleAggregation.method;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
......@@ -121,7 +122,7 @@ public class MySimHash {
}
return tot;
}
/**
*
* @Title: getSemblance
......@@ -134,5 +135,99 @@ public class MySimHash {
double i = (double) this.hammingDistance(s2);
return 1 - i/this.hashbits ;
}
/**
 * Computes the SimHash Hamming distance of every pair of titles.
 *
 * <p>Only cells with {@code row < column} are filled; the diagonal and the
 * lower triangle keep the default value 0.
 *
 * @param titles the texts to compare with each other
 * @return a square matrix sized by {@code titles.size()}
 */
public int[][] getDistance(List<String> titles) {
    // Build one fingerprint per title up front so each text is tokenised once.
    List<MySimHash> fingerprints = new ArrayList<>();
    for (String title : titles)
    {
        MySimHash fingerprint = new MySimHash();
        fingerprint.setTokens(title);
        fingerprints.add(fingerprint);
    }
    final int count = titles.size();
    int[][] matrix = new int[count][count];
    for (int row = 0; row + 1 < count; row++)
    {
        MySimHash left = fingerprints.get(row);
        for (int col = row + 1; col < count; col++)
        {
            matrix[row][col] = left.hammingDistance(fingerprints.get(col));
        }
    }
    return matrix;
}
/**
 * Computes the SimHash Hamming distance between one text and each title
 * (one-to-many).
 *
 * @param titles the texts to compare against {@code text}
 * @param text   the reference text
 * @return one distance per element of {@code titles}, in list order
 */
public int[] getDistance(List<String> titles,String text) {
    List<MySimHash> fingerprints = new ArrayList<>();
    for (String title : titles)
    {
        MySimHash fingerprint = new MySimHash();
        fingerprint.setTokens(title);
        fingerprints.add(fingerprint);
    }
    MySimHash reference = new MySimHash();
    reference.setTokens(text);
    int[] distances = new int[titles.size()];
    for (int idx = 0; idx < distances.length; idx++)
    {
        distances[idx] = reference.hammingDistance(fingerprints.get(idx));
    }
    return distances;
}
/**
 * Computes the SimHash Hamming distance between two texts.
 *
 * @param text1 the first text
 * @param text2 the second text
 * @return the Hamming distance between the two fingerprints
 */
public int getDistance(String text1,String text2) {
    MySimHash first = new MySimHash();
    MySimHash second = new MySimHash();
    first.setTokens(text1);
    second.setTokens(text2);
    int distance = first.hammingDistance(second);
    return distance;
}
/**
 * Computes the SimHash similarity (via {@code getSemblance}) between one
 * text and each title.
 *
 * @param titles the texts to compare against {@code text}
 * @param text   the reference text
 * @return one similarity per element of {@code titles}, in list order
 */
public double[] getSimilarity(List<String> titles,String text) {
    List<MySimHash> fingerprints = new ArrayList<>();
    for (String title : titles)
    {
        MySimHash fingerprint = new MySimHash();
        fingerprint.setTokens(title);
        fingerprints.add(fingerprint);
    }
    MySimHash reference = new MySimHash();
    reference.setTokens(text);
    double[] similarities = new double[titles.size()];
    for (int idx = 0; idx < similarities.length; idx++)
    {
        similarities[idx] = reference.getSemblance(fingerprints.get(idx));
    }
    return similarities;
}
}
package com.zhiweidata.titleAggregation.start;
/**
 * Placeholder entry point. {@code main} contains no executable code; it only
 * carries the usage notes of the project as comments.
 */
public class Start {
/**
 * Does nothing at runtime; see the usage notes inside the method body.
 *
 * @param args ignored
 */
public static void main(String[] args) {
// Usage notes
//
// The title-similarity algorithm computes similarity from simhash and cosine
// checks, assisted by a vector-space product, and uses hierarchical
// clustering to choose the cluster centres.
//
// In src/test/java/startTest:
// MongoStart  exercises the algorithm on data from a mongo database and exports the result to an excel sheet
// ExcelTest   exercises the algorithm on excel sheet data and exports the result to an excel sheet
// ResultTest  runs simulated checks of the effect of each algorithm
// ChineseTest tests the traditional/simplified Chinese conversion
//
// In com.zhiweidata.titleAggregation.main:
// HCluster    implementation class of the clustering algorithm
// ClusterUtil refines the algorithm result; iteration over the result is normally done here
//
// In com.zhiweidata.titleAggregation.method:
// ComputeWordsVector utility class that computes text vectors
// CosineSimilarity   utility class for the cosine algorithm
// MySimHash          utility class for the simhash algorithm
//
// How to use
// Examples are given in startTest; refer to them as needed.
}
}
......@@ -2,15 +2,29 @@ package com.zhiweidata.titleAggregation.util;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* @ClassName: Util
* @Description: TODO(封装对集合处理的工具类)
* @author xuyimeng
* @date 2017年12月26日 上午9:15:29
*/
public class Util {
public class BasicUtil {
/**
 * Converts a list of texts into a map keyed by each element's 0-based
 * position in the list.
 *
 * @param list the texts to index
 * @return a new map from position to text
 */
public static Map<Integer,String> toMap(List<String> list)
{
    Map<Integer,String> indexed = new HashMap<>();
    for (String text : list)
    {
        // Keys 0..n-1 are all distinct, so the current size is always the next index.
        indexed.put(indexed.size(), text);
    }
    return indexed;
}
/**
* @Title: toList
* @Description: TODO(将map转为list)
......
......@@ -34,11 +34,10 @@ public class ChineseTranslate {
private ChineseTranslate() { }
/**
* 不需自行创建转换器即可转换. 内部调用{@link #转换(String) 转换}方法.
* 不需自行创建转换器即可转换. 内部调用转换方法.
* @param 文本 任意长度
* @param 简繁 goal格式
* @return 转换为goal格式的文本
* @throws IllegalArgumentException 文本为null时
*/
public static String trans(String text, goal 简繁) {
return getInstance(简繁).trans(text);
......
......@@ -18,6 +18,9 @@ import org.junit.Test;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiweidata.titleAggregation.bean.Cluster;
import com.zhiweidata.titleAggregation.bean.Result;
import com.zhiweidata.titleAggregation.main.ClusterResult;
import com.zhiweidata.titleAggregation.main.HCluster;
import com.zhiweidata.titleAggregation.util.ChineseTranslate;
import com.zhiweidata.titleAggregation.util.ChineseTranslate.goal;
......@@ -55,14 +58,13 @@ public class ExcelTest {
@SuppressWarnings("unchecked")
List<Map<String, Object>> body = (List<Map<String, Object>>)map.get("body");
Map<Integer,String> titles = exportTitleData(body);
List<String> titles = exportTitleData(body);
//调用算法
HCluster hc = new HCluster();
Map<Integer,Map<Integer,String>> mapResult = hc.changeData(titles,9,0.93);
ClusterResult cr = new ClusterResult();
List<Result> list = cr.getResult(titles);
DBOExp dbo = new DBOExp();
dbo.putRun(noGroupSheet(mapResult,body),goalPath,"未聚合");
dbo.putRun(GroupSheet(mapResult,body),goalPath,"聚合");
dbo.putRun(GroupSheet(list,body),goalPath,"聚合");
dbo.putRun(AllSheet(body), goalPath, "全部");
}
/**
......@@ -73,17 +75,15 @@ public class ExcelTest {
* @return
* Map<Integer,String> 返回类型
*/
public Map<Integer,String> exportTitleData(List<Map<String, Object>> body){
Map<Integer,String> titles = new HashMap<>();
int i = 0;
public List<String> exportTitleData(List<Map<String, Object>> body){
List<String> titles = new ArrayList<>();
for (Map<String, Object> map : body)
{
for (String key : map.keySet())
{
if (key.equals("标题"))
{
titles.put(i, map.get(key).toString());
i++;
titles.add(map.get(key).toString());
}
}
}
......@@ -92,47 +92,6 @@ public class ExcelTest {
/**
 * Builds the sheet rows for items that were not aggregated, i.e. items whose
 * cluster holds at most one title.
 *
 * @param map  clustering result: cluster key to (row index to title)
 * @param body the source rows; a title's key is used as the index into this list
 * @return one {@link DBObject} per non-aggregated item, copying every column
 *         of the source row whose name is neither null nor empty
 */
public static List<DBObject> noGroupSheet(Map<Integer, Map<Integer, String>> map,
        List<Map<String, Object>> body)
{
    List<DBObject> rows = new ArrayList<>();
    for (Map<Integer, String> titles : map.values())
    {
        if (titles.size() > 1)
        {
            // More than one title means the cluster was aggregated; skip it.
            continue;
        }
        for (Integer key : titles.keySet())
        {
            DBObject row = new BasicDBObject();
            Map<String,Object> source = body.get(key);
            for (Map.Entry<String, Object> column : source.entrySet())
            {
                String columnName = column.getKey();
                if (columnName != null && !"".equals(columnName))
                {
                    row.put(columnName, column.getValue());
                }
            }
            rows.add(row);
        }
    }
    return rows;
}
/**
*
* @Title: GroupSheet
* @Description: TODO(聚合的个体的sheet)
* @param map
......@@ -140,38 +99,31 @@ public class ExcelTest {
* @return 设定文件
* @return List<DBObject> 返回类型
*/
public static List<DBObject> GroupSheet(Map<Integer, Map<Integer, String>> map,
public static List<DBObject> GroupSheet(List<Result> list,
List<Map<String, Object>> body)
{
//简繁体翻译
ChineseTranslate simplifiedTrans = ChineseTranslate.getInstance(goal.简体);
List<DBObject> listDB = new ArrayList<>();
for (Integer str : map.keySet())
int i=0;
for (Result result : list)
{
Map<Integer, String> titles = map.get(str);
if (titles.size() <= 1)
{
continue;
}
DBObject obj = new BasicDBObject();
for (Integer key : titles.keySet())
Map<String,Object> bean = body.get(i);
String title = simplifiedTrans.trans(result.getClusterName());
obj.put("聚合标签", title);
for (String keyObj : bean.keySet())
{
DBObject obj = new BasicDBObject();
Map<String,Object> bean = body.get(key);
String title = simplifiedTrans.trans(body.get(str).get("标题").toString());
obj.put("聚合标签", title);
for (String keyObj : bean.keySet())
if (keyObj == null || "".equals(keyObj))
{
if (keyObj == null || "".equals(keyObj))
{
continue;
}
obj.put(keyObj, bean.get(keyObj));
continue;
}
listDB.add(obj);
obj.put(keyObj, bean.get(keyObj));
}
listDB.add(obj);
i++;
}
return listDB;
}
......
......@@ -11,6 +11,7 @@ import org.junit.Test;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiweidata.titleAggregation.bean.Result;
import com.zhiweidata.titleAggregation.main.HCluster;
import com.zhiweidata.titleAggregation.util.ChineseTranslate;
import com.zhiweidata.titleAggregation.util.ChineseTranslate.goal;
......@@ -70,7 +71,7 @@ public class MongoStart {
// path = "D:\\excel\\"+name+"——"+pt+"标题.xls";
// MediaData(pt, eventId, path,name);
}
/**
*
* @Title: MediaData
......@@ -92,68 +93,23 @@ public class MongoStart {
{
String text = event.getTitle().replaceAll("\\.", "-");
texts.put(i, text);
i++;
i++;
}
System.out.println("start");
//调用算法
HCluster hc = new HCluster();
long time = System.currentTimeMillis();
Map<Integer,Map<Integer,String>> mapResult = hc.changeData(texts,9, 0.93);
List<Result> list = hc.DataToResult(texts, 9, 0.93);
long t = System.currentTimeMillis()-time;
System.out.println("事件名:"+name+"——"+pt+"数据,数据量:"+texts.size()+" ,输出时间:"+t);
DBOExp dbo = new DBOExp();
dbo.putRun(noGroupSheet(mapResult,listEvent),path,"未聚合");
dbo.putRun(GroupSheet(mapResult,listEvent),path,"聚合");
dbo.putRun(GroupSheet(list,listEvent),path,"聚合");
dbo.putRun(AllSheet(listEvent), path, "全部");
}
/**
*
* @Title: noGroupSheet
* @Description: TODO(未聚合的个体的sheet)
* @param map
* @param listEvent
* @return
* List<DBObject> 返回类型
*/
public static List<DBObject> noGroupSheet(Map<Integer, Map<Integer, String>> map,
List<MediaAndWechatEvent> listEvent)
{
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
List<DBObject> listDB = new ArrayList<>();
for (Integer str : map.keySet())
{
Map<Integer, String> titles = map.get(str);
if (titles.size() > 1)
{
continue;
}
for (Integer key : titles.keySet())
{
MediaAndWechatEvent event = listEvent.get(key);
DBObject obj = new BasicDBObject();
String saveTime = sdf.format((new Date(event.getSavetime())));
obj.put("标题", event.getTitle());
obj.put("url", event.getUrl());
obj.put("发布时间", sdf.format(event.getTime()));
obj.put("来源", event.getSource());
obj.put("类型", event.getType());
obj.put("保存时间", saveTime);
obj.put("平台", event.getPt());
obj.put("事件id", event.getEventId());
obj.put("H因子", event.getH());
listDB.add(obj);
}
}
return listDB;
}
/**
*
* @Title: GroupSheet
......@@ -163,7 +119,7 @@ public class MongoStart {
* @ @return 设定文件
* @return List<DBObject> 返回类型
*/
public static List<DBObject> GroupSheet(Map<Integer, Map<Integer, String>> map,
public static List<DBObject> GroupSheet(List<Result> list,
List<MediaAndWechatEvent> listEvent)
{
//简繁体翻译
......@@ -171,33 +127,25 @@ public class MongoStart {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
List<DBObject> listDB = new ArrayList<>();
for (Integer str : map.keySet())
for (Result result : list)
{
Map<Integer, String> titles = map.get(str);
if (titles.size() <= 1)
{
continue;
}
DBObject obj = new BasicDBObject();
MediaAndWechatEvent event = listEvent.get(result.getI());
String saveTime = sdf.format((new Date(event.getSavetime())));
String text = simplifiedTrans.trans(result.getClusterName());
for (Integer key : titles.keySet())
{
MediaAndWechatEvent event = listEvent.get(key);
DBObject obj = new BasicDBObject();
String saveTime = sdf.format((new Date(event.getSavetime())));
String titleGroup = simplifiedTrans.trans(listEvent.get(str).getTitle());
obj.put("聚合标题", titleGroup);
obj.put("标题", event.getTitle());
obj.put("url", event.getUrl());
obj.put("发布时间", sdf.format(event.getTime()));
obj.put("来源", event.getSource());
obj.put("类型", event.getType());
obj.put("保存时间", saveTime);
obj.put("平台", event.getPt());
obj.put("事件id", event.getEventId());
obj.put("H因子", event.getH());
listDB.add(obj);
}
obj.put("聚合标题", text);
obj.put("标题", event.getTitle());
obj.put("url", event.getUrl());
obj.put("发布时间", sdf.format(event.getTime()));
obj.put("来源", event.getSource());
obj.put("类型", event.getType());
obj.put("保存时间", saveTime);
obj.put("平台", event.getPt());
obj.put("事件id", event.getEventId());
obj.put("H因子", event.getH());
listDB.add(obj);
}
return listDB;
}
......
......@@ -9,8 +9,8 @@ import org.junit.Test;
import org.nlpcn.commons.lang.util.AnsjArrays;
import com.zhiweidata.titleAggregation.main.ClusterUtil;
import com.zhiweidata.titleAggregation.main.CosineSimilarity;
import com.zhiweidata.titleAggregation.main.MySimHash;
import com.zhiweidata.titleAggregation.method.CosineSimilarity;
import com.zhiweidata.titleAggregation.method.MySimHash;
import com.zhiweidata.titleAggregation.util.AnsjSeg;
public class ResultTest {
......@@ -28,11 +28,12 @@ public class ResultTest {
// Prints the similarity of a title compared with a one-element list that
// contains the same title, i.e. the title measured against itself.
// NOTE(review): this diff view shows the ClusterUtil-based statements next to
// the CosineSimilarity-based ones - confirm which pair the file really contains.
@Test
public void test3() {
String s1 = "大学生娶同学妈妈?传了几年的假新闻";
// collection of texts
List<String> s2 = new ArrayList<>();
s2.add(s1);
ClusterUtil clusterUtil = new ClusterUtil();
System.out.println(clusterUtil.getDistance(s2, s1)[0]);
CosineSimilarity cos = new CosineSimilarity();
System.out.println(cos.getDistance(s2, s1)[0]);
}
......
使用说明
使用说明
......@@ -15,13 +15,19 @@ ClusterUtil 是对算法结果的优化,一般在此对结果进行迭代
ComputeWordsVector 计算文本向量的工具类
CosineSimilarity 余弦算法的工具类
MySimHash simhash算法的工具类
CutPage 切割算法的工具类
使用:
直接调用 ClusterResult.getResult(List<String> texts);
返回的是List<Result>
Result是结果集的对象,里面有:
clusterName:类簇名
datapointName:节点名
i:List中的索引
simhash:simhash距离(越小越好)
cosSimilarity:余弦算法相似度(越接近1越相似)
cut:切割算法相似度(越接近1越相似)
在各个算法的工具类中,封装了各个算法的相似度计算
使用方法
在startTest中给出了示例,请自行参考
注意
算法支持繁简体文本的聚类,但不会对聚类标题做繁简转换。如果对聚类标题有繁简体的要求,
请调用繁简体转换类的方法自行转换。
例:
ChineseTranslate simplifiedTrans = ChineseTranslate.getInstance(goal.简体);
String title = simplifiedTrans.trans(text);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment