Commit 78033265 by win7

短文本聚合,长文本相似度

parent 6bf229eb
......@@ -5,9 +5,10 @@
* @author xuyimeng
* @date 2017年12月28日 上午11:52:42
* @version V1.0
*/ /**
*
*/
/**
*
*/
package com.zhiweidata.titleAggregation.bean;
/**
......@@ -16,23 +17,24 @@ package com.zhiweidata.titleAggregation.bean;
* @author xuyimeng
* @date 2017年12月28日 上午11:52:42
*/
public class Result {
/**类簇名*/
public class Result{
/** 类簇名 */
private String clusterName;
/**节点名*/
/** 节点名 */
private String datapointName;
/**索引位置*/
/** 索引位置 */
private Integer i;
/**simhash距离*/
/** simhash距离 */
private int simhash;
/**余弦算法相似度*/
/** 余弦算法相似度 */
private double cosSimilarity;
/**切割算法相似度*/
/** 切割算法相似度 */
private double cut;
public Result() {}
public Result() {
}
public Result(String clusterName, String datapointName, int simhash, double cosSimilarity, double cut,Integer i) {
public Result(String clusterName, String datapointName, int simhash, double cosSimilarity, double cut, Integer i) {
super();
this.clusterName = clusterName;
this.datapointName = datapointName;
......@@ -96,4 +98,5 @@ public class Result {
+ simhash + ", cosSimilarity=" + cosSimilarity + ", cut=" + cut + "]";
}
}
package com.zhiweidata.titleAggregation.main;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
......@@ -51,6 +53,14 @@ public class HCluster {
list.add(result);
}
}
Collections.sort(list, new Comparator<Result>() {
/**
 * Orders two results by ascending index position ({@code getI()}).
 *
 * <p>Uses {@link Integer#compare(int, int)} instead of subtracting the two values: a
 * subtraction can overflow when the operands have opposite signs and large magnitude,
 * producing a result with the wrong sign and therefore an inconsistent ordering.
 *
 * @param o1 the first result to compare
 * @param o2 the second result to compare
 * @return a negative value, zero, or a positive value as the index of {@code o1} is less
 *         than, equal to, or greater than the index of {@code o2}
 */
@Override
public int compare(Result o1, Result o2) {
    return Integer.compare(o1.getI(), o2.getI());
}
});
return list;
}
/**
......
......@@ -6,15 +6,33 @@ import java.util.List;
import org.ansj.domain.Term;
import org.junit.Test;
import org.nlpcn.commons.lang.util.AnsjArrays;
import com.zhiweidata.titleAggregation.main.ClusterUtil;
import com.zhiweidata.titleAggregation.bean.Result;
import com.zhiweidata.titleAggregation.main.ClusterResult;
import com.zhiweidata.titleAggregation.method.CosineSimilarity;
import com.zhiweidata.titleAggregation.method.MySimHash;
import com.zhiweidata.titleAggregation.util.AnsjSeg;
public class ResultTest {
/**
 * Manual check of title clustering: builds a list of six headlines (two of them
 * identical), passes it to {@code ClusterResult.getResult}, and prints the cluster name
 * and data-point name of every returned {@code Result}. Makes no assertions; the output
 * is meant to be inspected by eye.
 */
@Test
public void test5() {
    List<String> titles = new ArrayList<>();
    titles.add("慰安妇纪录片遭截图制成表情包被网友质疑 腾讯致歉");
    titles.add("慰安妇纪录片遭截图制成QQ表情包 配文:我真委屈");
    titles.add("慰安妇纪录片遭截图制成QQ表情包 配文:我真委屈");
    titles.add("慰安妇纪录片被做成表情包 QQ空间道歉");
    titles.add("慰安妇纪录片遭截图制成表情包 配文:我真委屈");
    titles.add("慰安妇纪录片遭截图制成表情包 网友质疑");

    List<Result> clustered = ClusterResult.getResult(titles);
    for (Result entry : clustered) {
        // One output line per result: cluster title first, then the member title.
        String line = "聚合标题:" + entry.getClusterName()
                + " 标题:" + entry.getDatapointName();
        System.out.println(line);
    }
}
@Test
public void test4() {
AnsjSeg ansj = AnsjSeg.getInstance();
String text = "愤怒!慰安妇纪录片《二十二》被截图制作表情包";
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment