Commit 78033265 by win7

短文本聚合,长文本相似度

parent 6bf229eb
...@@ -5,9 +5,10 @@ ...@@ -5,9 +5,10 @@
* @author xuyimeng * @author xuyimeng
* @date 2017年12月28日 上午11:52:42 * @date 2017年12月28日 上午11:52:42
* @version V1.0 * @version V1.0
*/ /**
*
*/ */
/**
*
*/
package com.zhiweidata.titleAggregation.bean; package com.zhiweidata.titleAggregation.bean;
/** /**
...@@ -16,23 +17,24 @@ package com.zhiweidata.titleAggregation.bean; ...@@ -16,23 +17,24 @@ package com.zhiweidata.titleAggregation.bean;
* @author xuyimeng * @author xuyimeng
* @date 2017年12月28日 上午11:52:42 * @date 2017年12月28日 上午11:52:42
*/ */
public class Result { public class Result{
/**类簇名*/ /** 类簇名 */
private String clusterName; private String clusterName;
/**节点名*/ /** 节点名 */
private String datapointName; private String datapointName;
/**索引位置*/ /** 索引位置 */
private Integer i; private Integer i;
/**simhash距离*/ /** simhash距离 */
private int simhash; private int simhash;
/**余弦算法相似度*/ /** 余弦算法相似度 */
private double cosSimilarity; private double cosSimilarity;
/**切割算法相似度*/ /** 切割算法相似度 */
private double cut; private double cut;
public Result() {} public Result() {
}
public Result(String clusterName, String datapointName, int simhash, double cosSimilarity, double cut,Integer i) { public Result(String clusterName, String datapointName, int simhash, double cosSimilarity, double cut, Integer i) {
super(); super();
this.clusterName = clusterName; this.clusterName = clusterName;
this.datapointName = datapointName; this.datapointName = datapointName;
...@@ -96,4 +98,5 @@ public class Result { ...@@ -96,4 +98,5 @@ public class Result {
+ simhash + ", cosSimilarity=" + cosSimilarity + ", cut=" + cut + "]"; + simhash + ", cosSimilarity=" + cosSimilarity + ", cut=" + cut + "]";
} }
} }
package com.zhiweidata.titleAggregation.main; package com.zhiweidata.titleAggregation.main;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
...@@ -51,6 +53,14 @@ public class HCluster { ...@@ -51,6 +53,14 @@ public class HCluster {
list.add(result); list.add(result);
} }
} }
// Sort the collected results in ascending order of getI(), which Result documents
// as the entry's index position.
Collections.sort(list, new Comparator<Result>() {
    /**
     * Orders two results by their index position.
     *
     * @param o1 first result to compare
     * @param o2 second result to compare
     * @return negative, zero or positive when o1's index is less than, equal to
     *         or greater than o2's index
     */
    @Override
    public int compare(Result o1, Result o2) {
        // Integer.compare instead of subtraction: a difference of two ints can
        // overflow and flip the sign, which breaks the Comparator contract.
        return Integer.compare(o1.getI(), o2.getI());
    }
});
return list; return list;
} }
/** /**
......
...@@ -6,15 +6,33 @@ import java.util.List; ...@@ -6,15 +6,33 @@ import java.util.List;
import org.ansj.domain.Term; import org.ansj.domain.Term;
import org.junit.Test; import org.junit.Test;
import org.nlpcn.commons.lang.util.AnsjArrays;
import com.zhiweidata.titleAggregation.main.ClusterUtil; import com.zhiweidata.titleAggregation.bean.Result;
import com.zhiweidata.titleAggregation.main.ClusterResult;
import com.zhiweidata.titleAggregation.method.CosineSimilarity; import com.zhiweidata.titleAggregation.method.CosineSimilarity;
import com.zhiweidata.titleAggregation.method.MySimHash; import com.zhiweidata.titleAggregation.method.MySimHash;
import com.zhiweidata.titleAggregation.util.AnsjSeg; import com.zhiweidata.titleAggregation.util.AnsjSeg;
public class ResultTest { public class ResultTest {
@Test @Test
/**
 * Smoke test for title aggregation: passes six near-duplicate headlines to
 * ClusterResult.getResult and prints the cluster name and data-point name of
 * every returned Result. It makes no assertions; the output is inspected manually.
 */
public void test5() {
    List<String> headlines = new ArrayList<>();
    headlines.add("慰安妇纪录片遭截图制成表情包被网友质疑 腾讯致歉");
    headlines.add("慰安妇纪录片遭截图制成QQ表情包 配文:我真委屈");
    headlines.add("慰安妇纪录片遭截图制成QQ表情包 配文:我真委屈");
    headlines.add("慰安妇纪录片被做成表情包 QQ空间道歉");
    headlines.add("慰安妇纪录片遭截图制成表情包 配文:我真委屈");
    headlines.add("慰安妇纪录片遭截图制成表情包 网友质疑");

    List<Result> clustered = ClusterResult.getResult(headlines);
    for (Result entry : clustered) {
        // One line per headline: the cluster it was assigned to, then the headline itself.
        String line = "聚合标题:" + entry.getClusterName() + "   标题:" + entry.getDatapointName();
        System.out.println(line);
    }
}
@Test
public void test4() { public void test4() {
AnsjSeg ansj = AnsjSeg.getInstance(); AnsjSeg ansj = AnsjSeg.getInstance();
String text = "愤怒!慰安妇纪录片《二十二》被截图制作表情包"; String text = "愤怒!慰安妇纪录片《二十二》被截图制作表情包";
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment