Commit ce5b9fa0 by win7

标题聚合工具类

parent 78033265
package com.zhiweidata.titleAggregation.algorithm;
import java.util.List;
/**
* Common contract for the text-similarity algorithms of the title-aggregation tool.
* Implementations visible alongside this interface are CosineSimilarity, CutPage and MySimHash;
* in all three a larger score means the texts are more alike.
*
* @ClassName: Algorithm
* @Description: interface shared by every similarity algorithm
* @author xuyimeng
* @date 2017-12-29 11:59:54 AM
*/
public interface Algorithm {
/**
* Similarity between two texts.
*
* @param text1 first text
* @param text2 second text
* @return similarity score of the pair
*/
double getSimilarity(String text1,String text2);
/**
* One-to-many similarity: compares a single text against every element of a list.
*
* @param text1 texts to compare against
* @param text2 the single text compared with each list element
* @return array of the same length as {@code text1}; element i is the score for text1.get(i)
*/
double[] getSimilarity(List<String> text1,String text2);
/**
* Pairwise similarity inside one list.
* NOTE(review): the implementations shown with this interface only fill cells with row &lt; column
* (upper triangle) and leave the rest at 0.0 - confirm before reading [j][i] or the diagonal.
*
* @param text texts to compare with each other
* @return square matrix sized text.size() x text.size()
*/
double[][] getSimilarity(List<String> text);
}
/**
* @Title: AllAlgorithm.java
* @Package com.zhiweidata.titleAggregation.algorithm
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2017年12月29日 下午2:23:02
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.titleAggregation.algorithm;
import com.zhiweidata.titleAggregation.algorithm.impl.CosineSimilarity;
import com.zhiweidata.titleAggregation.algorithm.impl.CutPage;
import com.zhiweidata.titleAggregation.algorithm.impl.MySimHash;
/**
 * Static lookup that hands out one shared instance per similarity algorithm.
 * The class cannot be instantiated; callers go through {@link #getInstance(goal)}.
 *
 * @ClassName: AllAlgorithm
 * @Description: access point for all algorithm objects
 * @author xuyimeng
 * @date 2017-12-29 2:23:02 PM
 */
public class AllAlgorithm {

    /** Selector naming the algorithm to fetch: simhash, cut-page or cosine. */
    public enum goal {
        hash, cut, cos
    }

    // One instance of each implementation, created when this class is initialised.
    private static final Algorithm SIM_HASH = new MySimHash();
    private static final Algorithm COSINE = new CosineSimilarity();
    private static final Algorithm CUT_PAGE = new CutPage();

    private AllAlgorithm() {}

    /**
     * Returns the shared algorithm instance for the given selector.
     *
     * @param state which algorithm is wanted; {@code null} results in a NullPointerException
     * @return the cosine instance for {@code cos}, the cut-page instance for {@code cut},
     *         and the simhash instance for anything else
     */
    public static Algorithm getInstance(goal state) {
        switch (state) {
            case cos:
                return COSINE;
            case cut:
                return CUT_PAGE;
            default:
                return SIM_HASH;
        }
    }
}
package com.zhiweidata.titleAggregation.algorithm.impl;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import com.zhiweidata.titleAggregation.algorithm.Algorithm;
/**
 * Cosine similarity over per-character frequencies of Chinese text.
 * Only characters in U+4E00..U+9FA5 that encode to exactly two bytes in GB2312 are counted;
 * every other character is ignored.
 *
 * @ClassName: CosineSimilarity
 * @Description: cosine similarity algorithm
 * @author xuyimeng
 * @date 2017-12-26 10:01:07 AM
 */
public class CosineSimilarity implements Algorithm{

    /** GB2312 charset resolved once; {@code null} when the running JVM does not provide it. */
    private static final java.nio.charset.Charset GB2312 = lookupGb2312();

    /**
     * Computes the cosine of the angle between the character-frequency vectors of two texts.
     *
     * @param doc1 first text
     * @param doc2 second text
     * @return the cosine similarity; 0 when either text contains no countable character
     *         (the division used to yield NaN in that case)
     * @throws NullPointerException if either text is null or blank after trimming
     */
    public static double CalculateTextSim(String doc1, String doc2) {
        if (isBlank(doc1) || isBlank(doc2)) {
            throw new NullPointerException("the Document is null or have not cahrs!!");
        }

        // key: GB2312 position of a character; value: {count in doc1, count in doc2}
        Map<Integer, int[]> frequencies = new HashMap<Integer, int[]>();
        countHanZi(doc1, frequencies, 0);
        countHanZi(doc2, frequencies, 1);

        double dotProduct = 0;
        double squares1 = 0;
        double squares2 = 0;
        for (int[] pair : frequencies.values()) {
            // widen before multiplying so large counts cannot overflow int
            dotProduct += (double) pair[0] * pair[1];
            squares1 += (double) pair[0] * pair[0];
            squares2 += (double) pair[1] * pair[1];
        }

        double norm = Math.sqrt(squares1 * squares2);
        if (norm == 0) {
            // at least one vector is empty: nothing in common, and 0/0 would be NaN
            return 0;
        }
        return dotProduct / norm;
    }

    /** True when the text is null or contains only whitespace. */
    private static boolean isBlank(String doc) {
        return doc == null || doc.trim().length() == 0;
    }

    /** Adds the countable characters of {@code doc} to column {@code slot} (0 or 1) of the table. */
    private static void countHanZi(String doc, Map<Integer, int[]> frequencies, int slot) {
        for (int i = 0; i < doc.length(); i++) {
            char ch = doc.charAt(i);
            if (!isHanZi(ch)) {
                continue;
            }
            int id = getGB2312Id(ch);
            if (id == -1) {
                continue;
            }
            int[] pair = frequencies.get(id);
            if (pair == null) {
                pair = new int[2];
                frequencies.put(id, pair);
            }
            pair[slot]++;
        }
    }

    /** Resolves the GB2312 charset, reporting once if the JVM lacks it. */
    private static java.nio.charset.Charset lookupGb2312() {
        try {
            return java.nio.charset.Charset.forName("GB2312");
        } catch (java.nio.charset.UnsupportedCharsetException e) {
            System.err.println("GB2312 charset is not available, no character will be counted: " + e);
            return null;
        }
    }

    /**
     * Tells whether a character lies in the range U+4E00..U+9FA5.
     *
     * @param ch character to test
     * @return true when the character is inside that range
     */
    public static boolean isHanZi(char ch) {
        return (ch >= 0x4E00 && ch <= 0x9FA5);
    }

    /**
     * Maps a character to its position in the GB2312 table.
     *
     * @param ch character to look up
     * @return (first byte - 0xA1) * 94 + (second byte - 0xA1), or -1 when the character does not
     *         encode to exactly two bytes in GB2312 or the charset is unavailable
     */
    public static short getGB2312Id(char ch) {
        if (GB2312 == null) {
            return -1;
        }
        byte[] buffer = Character.toString(ch).getBytes(GB2312);
        if (buffer.length != 2) {
            // unmappable characters come back as a single replacement byte
            return -1;
        }
        int b0 = (buffer[0] & 0x0FF) - 161; // both bytes start at 0xA1 = 161
        int b1 = (buffer[1] & 0x0FF) - 161; // 94 positions per row
        return (short) (b0 * 94 + b1);
    }

    /**
     * Pairwise cosine similarity; the closer to 1, the more alike.
     * Only cells with row &lt; column are filled, the rest stay 0.
     *
     * @param titles texts to compare with each other
     * @return matrix sized titles.size() x titles.size()
     */
    @Override
    public double[][] getSimilarity(List<String> titles) {
        int size = titles.size();
        double[][] distance = new double[size][size];
        for (int i = 0; i < size - 1; i++) {
            String doc1 = titles.get(i);
            for (int j = i + 1; j < size; j++) {
                distance[i][j] = CalculateTextSim(doc1, titles.get(j));
            }
        }
        return distance;
    }

    /** Cosine similarity of two texts; see {@link #CalculateTextSim(String, String)}. */
    @Override
    public double getSimilarity(String text1, String text2) {
        return CalculateTextSim(text1, text2);
    }

    /**
     * Cosine similarity of one text against every element of a list.
     *
     * @param list texts to compare against
     * @param doc1 the single text
     * @return array where element i is the similarity between doc1 and list.get(i)
     */
    @Override
    public double[] getSimilarity(List<String> list,String doc1) {
        double[] distance = new double[list.size()];
        for (int i = 0; i < distance.length; i++) {
            distance[i] = CalculateTextSim(doc1, list.get(i));
        }
        return distance;
    }
}
\ No newline at end of file
/**
* @Title: CutPage.java
* @Package com.zhiweidata.titleAggregation.algorithm.impl
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2017年12月27日 下午2:24:06
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.titleAggregation.algorithm.impl;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.zhiweidata.titleAggregation.algorithm.Algorithm;
import com.zhiweidata.titleAggregation.algorithm.AllAlgorithm;
import com.zhiweidata.titleAggregation.algorithm.AllAlgorithm.goal;
/**
 * Similarity for long texts: both texts are cut into segments and the segments are compared
 * with the cosine algorithm obtained from {@link AllAlgorithm}.
 *
 * @ClassName: CutPage
 * @Description: segment-based comparison intended for long texts
 * @author xuyimeng
 * @date 2017-12-27 2:24:06 PM
 */
public class CutPage implements Algorithm{

    /**
     * Scores how well {@code goalText} is covered by {@code testText}: for every segment of
     * goalText the best cosine score against any segment of testText is taken, and those best
     * scores are averaged. The measure is therefore not symmetric in its arguments.
     *
     * @param goalText text whose segments are averaged over
     * @param testText text searched for the best matching segment
     * @return the averaged best score; 0 when either text yields no segment
     */
    public static double getSemblance(String goalText,String testText) {
        List<String> goalTexts = splitString(goalText);
        List<String> textTexts = splitString(testText);
        if (goalTexts.isEmpty() || textTexts.isEmpty()) {
            // nothing to compare; also avoids dividing by zero below
            return 0;
        }

        Algorithm cos = AllAlgorithm.getInstance(goal.cos);
        double total = 0;
        for (String segment : goalTexts) {
            double best = 0;
            for (double score : cos.getSimilarity(textTexts, segment)) {
                if (score > best) {
                    best = score;
                }
            }
            total += best;
        }
        return total / goalTexts.size();
    }

    /**
     * Cuts a text into segments at every "。" and every ";".
     * Blank segments are dropped because the cosine algorithm rejects blank input, and a
     * sentence that contains ";" contributes only its parts, not the unsplit sentence as well.
     *
     * @Title: splitString
     * @param text text to cut; null is treated as empty
     * @return the non-blank segments in their original order, possibly empty
     */
    public static List<String> splitString(String text){
        List<String> result = new ArrayList<>();
        if (text == null) {
            return result;
        }
        for (String sentence : text.split("。")) {
            // split returns the sentence itself when it holds no ";"
            for (String part : sentence.split(";")) {
                if (part.trim().length() > 0) {
                    result.add(part);
                }
            }
        }
        return result;
    }

    /**
     * Scores every element of a list against one text.
     *
     * @param text1 texts used as the goal side of {@link #getSemblance(String, String)}
     * @param text2 text used as the test side
     * @return array where element i is getSemblance(text1.get(i), text2)
     */
    @Override
    public double[] getSimilarity(List<String> text1, String text2) {
        int size = text1.size();
        double[] result = new double[size];
        for (int i = 0; i < size; i++) {
            result[i] = getSemblance(text1.get(i), text2);
        }
        return result;
    }

    /**
     * Pairwise scores inside one list. Only cells with row &lt; column are filled, the rest stay 0.
     *
     * @param text texts to compare with each other
     * @return matrix sized text.size() x text.size()
     */
    @Override
    public double[][] getSimilarity(List<String> text) {
        int size = text.size();
        double[][] result = new double[size][size];
        for (int i = 0; i < size - 1; i++) {
            for (int j = i + 1; j < size; j++) {
                result[i][j] = getSemblance(text.get(i), text.get(j));
            }
        }
        return result;
    }

    /** Score of text1 (goal side) against text2 (test side); see {@link #getSemblance(String, String)}. */
    @Override
    public double getSimilarity(String text1, String text2) {
        return getSemblance(text1, text2);
    }
}
package com.zhiweidata.titleAggregation.algorithm.impl;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.ansj.domain.Term;
import com.zhiweidata.titleAggregation.algorithm.Algorithm;
import com.zhiweidata.titleAggregation.util.AnsjSeg;
/**
 * SimHash fingerprinting of a text after word segmentation, plus Hamming-distance based
 * comparison of fingerprints.
 * An instance holds the fingerprint of the text last passed to {@link #setTokens(String)};
 * the list/pair methods build fresh instances internally and do not touch this one.
 *
 * @ClassName: MySimHash
 * @Description: simHash algorithm
 * @author xuyimeng
 * @date 2017-12-26 9:16:58 AM
 */
public class MySimHash implements Algorithm{

    /** Weight per part-of-speech tag; tags not listed here weigh 1. */
    private static final Map<String, Integer> NATURE_WEIGHTS = natureWeights();

    private String tokens; // text being fingerprinted
    private BigInteger strSimHash; // fingerprint of tokens
    private int hashbits = 64; // number of bits in a fingerprint

    private static Map<String, Integer> natureWeights() {
        Map<String, Integer> weights = new HashMap<String, Integer>();
        weights.put("n", 1);
        weights.put("m", 1);
        return weights;
    }

    /** Builds an instance already holding the fingerprint of {@code text}. */
    private static MySimHash fingerprintOf(String text) {
        MySimHash fingerprint = new MySimHash();
        fingerprint.setTokens(text);
        return fingerprint;
    }

    /** Fingerprints every element of {@code texts}, keeping the order. */
    private static List<MySimHash> fingerprintsOf(List<String> texts) {
        List<MySimHash> fingerprints = new ArrayList<>(texts.size());
        for (String text : texts) {
            fingerprints.add(fingerprintOf(text));
        }
        return fingerprints;
    }

    /** All-ones mask of {@code hashbits} bits. */
    private BigInteger bitMask() {
        return BigInteger.ONE.shiftLeft(this.hashbits).subtract(BigInteger.ONE);
    }

    /**
     * Stores the text and immediately computes its fingerprint.
     *
     * @param tokens text to fingerprint
     */
    public void setTokens(String tokens) {
        this.tokens = tokens;
        strSimHash = simHash();
    }

    /**
     * Computes the fingerprint of the stored text: every segmented term is hashed, each bit of
     * the hash votes +weight (bit set) or -weight (bit clear) on its position, and positions
     * whose total is not negative become 1 in the fingerprint.
     *
     * @return the fingerprint
     */
    private BigInteger simHash() {
        int[] votes = new int[this.hashbits];
        List<Term> termList = AnsjSeg.getInstance().getTerms(tokens);
        for (Term term : termList) {
            BigInteger termHash = hash(term.getName());
            Integer configured = NATURE_WEIGHTS.get(term.getNatureStr());
            int weight = configured != null ? configured : 1;
            for (int bit = 0; bit < this.hashbits; bit++) {
                if (termHash.testBit(bit)) {
                    votes[bit] += weight;
                } else {
                    votes[bit] -= weight;
                }
            }
        }

        BigInteger fingerprint = BigInteger.ZERO;
        for (int bit = 0; bit < this.hashbits; bit++) {
            if (votes[bit] >= 0) {
                fingerprint = fingerprint.setBit(bit);
            }
        }
        return fingerprint;
    }

    /**
     * Hashes a single term.
     *
     * @param source term to hash
     * @return 0 for a null or empty term, otherwise a non-negative hash value
     */
    private BigInteger hash(String source) {
        if (source == null || source.length() == 0) {
            return BigInteger.ZERO;
        }
        // very short terms hash poorly, so pad them with their first character up to length 3
        while (source.length() < 3) {
            source = source + source.charAt(0);
        }
        char[] chars = source.toCharArray();
        BigInteger multiplier = BigInteger.valueOf(1000003);
        BigInteger mask = bitMask();
        BigInteger value = BigInteger.valueOf(((long) chars[0]) << 7);
        for (char item : chars) {
            value = value.multiply(multiplier).xor(BigInteger.valueOf((long) item)).and(mask);
        }
        // the value is never negative here, so no remapping of -1 is needed
        return value.xor(BigInteger.valueOf(source.length()));
    }

    /**
     * Hamming distance between this fingerprint and another one; smaller means more alike.
     * Both instances must have received a text through {@link #setTokens(String)} first.
     *
     * @param other instance to compare with
     * @return number of differing bits
     */
    public int hammingDistance(MySimHash other) {
        return this.strSimHash.xor(other.strSimHash).and(bitMask()).bitCount();
    }

    /**
     * Similarity derived from the Hamming distance.
     *
     * @Title: getSemblance
     * @param s2 instance to compare with
     * @return 1 - distance / hashbits
     */
    public double getSemblance(MySimHash s2 ){
        double differing = (double) this.hammingDistance(s2);
        return 1 - differing / this.hashbits;
    }

    /**
     * Pairwise Hamming distances inside one list. Only cells with row &lt; column are filled.
     *
     * @Title: getDistance
     * @param titles texts to compare with each other
     * @return matrix sized titles.size() x titles.size()
     */
    public int[][] getDistance(List<String> titles) {
        List<MySimHash> hashes = fingerprintsOf(titles);
        int size = hashes.size();
        int[][] distance = new int[size][size];
        for (int i = 0; i < size - 1; i++) {
            MySimHash left = hashes.get(i);
            for (int j = i + 1; j < size; j++) {
                distance[i][j] = left.hammingDistance(hashes.get(j));
            }
        }
        return distance;
    }

    /**
     * Hamming distances between one text and every element of a list.
     *
     * @Title: getDistance
     * @param titles texts to compare against
     * @param text the single text
     * @return array where element i is the distance between text and titles.get(i)
     */
    public int[] getDistance(List<String> titles,String text) {
        List<MySimHash> hashes = fingerprintsOf(titles);
        MySimHash reference = fingerprintOf(text);
        int[] distance = new int[hashes.size()];
        for (int i = 0; i < distance.length; i++) {
            distance[i] = reference.hammingDistance(hashes.get(i));
        }
        return distance;
    }

    /**
     * Hamming distance between two texts.
     *
     * @param text1 first text
     * @param text2 second text
     * @return number of differing fingerprint bits
     */
    public int getDistance(String text1,String text2) {
        MySimHash first = fingerprintOf(text1);
        MySimHash second = fingerprintOf(text2);
        return first.hammingDistance(second);
    }

    /**
     * SimHash similarity between one text and every element of a list.
     *
     * @Title: getSimilarity
     * @param titles texts to compare against
     * @param text the single text
     * @return array where element i is the similarity between text and titles.get(i)
     */
    @Override
    public double[] getSimilarity(List<String> titles,String text) {
        List<MySimHash> hashes = fingerprintsOf(titles);
        MySimHash reference = fingerprintOf(text);
        double[] similarity = new double[hashes.size()];
        for (int i = 0; i < similarity.length; i++) {
            similarity[i] = reference.getSemblance(hashes.get(i));
        }
        return similarity;
    }

    /** SimHash similarity between two texts. */
    @Override
    public double getSimilarity(String text1, String text2) {
        MySimHash first = fingerprintOf(text1);
        MySimHash second = fingerprintOf(text2);
        return first.getSemblance(second);
    }

    /**
     * Pairwise SimHash similarity inside one list. Only cells with row &lt; column are filled.
     *
     * @param text texts to compare with each other
     * @return matrix sized text.size() x text.size()
     */
    @Override
    public double[][] getSimilarity(List<String> text) {
        List<MySimHash> hashes = fingerprintsOf(text);
        int size = hashes.size();
        double[][] similarity = new double[size][size];
        for (int i = 0; i < size - 1; i++) {
            MySimHash left = hashes.get(i);
            for (int j = i + 1; j < size; j++) {
                similarity[i][j] = left.getSemblance(hashes.get(j));
            }
        }
        return similarity;
    }
}
...@@ -67,8 +67,7 @@ public class ClusterResult { ...@@ -67,8 +67,7 @@ public class ClusterResult {
{ {
cosFreq = 0.93; cosFreq = 0.93;
} }
HCluster hCluster = new HCluster();
Map<Integer,String> map = BasicUtil.toMap(list); Map<Integer,String> map = BasicUtil.toMap(list);
return hCluster.DataToResult(map, freq, cosFreq); return Means.changeMeans(map,freq,cosFreq);
} }
} }
...@@ -6,9 +6,11 @@ import java.util.Iterator; ...@@ -6,9 +6,11 @@ import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.zhiweidata.titleAggregation.algorithm.Algorithm;
import com.zhiweidata.titleAggregation.algorithm.AllAlgorithm;
import com.zhiweidata.titleAggregation.algorithm.AllAlgorithm.goal;
import com.zhiweidata.titleAggregation.bean.Cluster; import com.zhiweidata.titleAggregation.bean.Cluster;
import com.zhiweidata.titleAggregation.bean.DataPoint; import com.zhiweidata.titleAggregation.bean.DataPoint;
import com.zhiweidata.titleAggregation.method.CosineSimilarity;
/** /**
...@@ -18,22 +20,31 @@ import com.zhiweidata.titleAggregation.method.CosineSimilarity; ...@@ -18,22 +20,31 @@ import com.zhiweidata.titleAggregation.method.CosineSimilarity;
* @date 2017年12月26日 上午9:19:41 * @date 2017年12月26日 上午9:19:41
*/ */
public class ClusterUtil { public class ClusterUtil {
static Algorithm cos = AllAlgorithm.getInstance(goal.cos);
/**
* @Title: mergeLikeCluster
* @Description: TODO(合并相似类簇)
* @param clusters
* @param freq
* void 返回类型
*/
public void mergeLikeCluster(List<Cluster> clusters,double freq) { public void mergeLikeCluster(List<Cluster> clusters,double freq) {
List<String> texts = new ArrayList<>(); List<String> texts = new ArrayList<>();
for (Cluster cluster : clusters) for (Cluster cluster : clusters)
{ {
texts.add(cluster.getClusterName()); texts.add(cluster.getClusterName());
} }
CosineSimilarity cos = new CosineSimilarity();
double[][] distance = cos.getDistance(texts);
for (int i=0;i<clusters.size()-1;i++) double[][] distance = cos.getSimilarity(texts);
int size = clusters.size();
for (int i=0;i<size-1;i++)
{ {
if (clusters.get(i).getDataPoints().size() == 0) if (clusters.get(i).getDataPoints().size() == 0)
{ {
continue; continue;
} }
for (int j=i+1;j<clusters.size();j++) for (int j=i+1;j<size;j++)
{ {
if (clusters.get(j).getDataPoints().size() == 0) if (clusters.get(j).getDataPoints().size() == 0)
{ {
...@@ -55,7 +66,6 @@ public class ClusterUtil { ...@@ -55,7 +66,6 @@ public class ClusterUtil {
*/ */
public void alertLikeData(List<Cluster> clusters,double freq) public void alertLikeData(List<Cluster> clusters,double freq)
{ {
CosineSimilarity cos = new CosineSimilarity();
//存储从类簇中被删除dataPoint的集合 //存储从类簇中被删除dataPoint的集合
List<DataPoint> list = new ArrayList<>(); List<DataPoint> list = new ArrayList<>();
//存储类簇名的集合 //存储类簇名的集合
...@@ -78,7 +88,7 @@ public class ClusterUtil { ...@@ -78,7 +88,7 @@ public class ClusterUtil {
dataPointsNames.add(text); dataPointsNames.add(text);
} }
double[] distances = cos.getDistance(dataPointsNames,cluster.getClusterName()); double[] distances = cos.getSimilarity(dataPointsNames,cluster.getClusterName());
Iterator<DataPoint> it = dataPoints.iterator(); Iterator<DataPoint> it = dataPoints.iterator();
int i = 0; int i = 0;
...@@ -100,11 +110,12 @@ public class ClusterUtil { ...@@ -100,11 +110,12 @@ public class ClusterUtil {
List<DataPoint> listNew = new ArrayList<>(); List<DataPoint> listNew = new ArrayList<>();
//遍历判断被删除的节点是否与其它类簇相似,相似就添加到这个类簇中 //遍历判断被删除的节点是否与其它类簇相似,相似就添加到这个类簇中
int size = clusterNames.size();
for (DataPoint dataPoint : list) for (DataPoint dataPoint : list)
{ {
double[] distances = cos.getDistance(clusterNames,dataPoint.getDataPointName()); double[] distances = cos.getSimilarity(clusterNames,dataPoint.getDataPointName());
for (int i=0;i<clusterNames.size();i++) for (int i=0;i<size;i++)
{ {
//相似度大于0.93就将节点加入类簇 //相似度大于0.93就将节点加入类簇
if (distances[i] > freq) if (distances[i] > freq)
...@@ -117,7 +128,7 @@ public class ClusterUtil { ...@@ -117,7 +128,7 @@ public class ClusterUtil {
break; break;
} }
if (i == clusterNames.size()-1) if (i == size-1)
{ {
listNew.add(dataPoint); listNew.add(dataPoint);
} }
...@@ -191,6 +202,26 @@ public class ClusterUtil { ...@@ -191,6 +202,26 @@ public class ClusterUtil {
} }
return finalClusters; return finalClusters;
} }
public void mergeCluster(List<String> names,List<Cluster> clusterList,int index,String name,double cosFreq) {
int size = clusterList.size();
double[] distance = cos.getSimilarity(names, name);
for (int i=index+1;i<size;i++)
{
if (clusterList.get(i).getDataPoints().size() == 0)
{
continue;
}
if (distance[i] > cosFreq)
{
mergeCluster(clusterList, index, i);
return;
}
}
}
/** /**
* 选择次数最多的作为类簇名,若次数一样,选择title最短的为类簇名 * 选择次数最多的作为类簇名,若次数一样,选择title最短的为类簇名
* @Title: changeMaxDataPoint * @Title: changeMaxDataPoint
...@@ -274,6 +305,16 @@ public class ClusterUtil { ...@@ -274,6 +305,16 @@ public class ClusterUtil {
* void 返回类型 * void 返回类型
*/ */
public void completedData(List<DataPoint> texts,List<Cluster> clusters) { public void completedData(List<DataPoint> texts,List<Cluster> clusters) {
Iterator<Cluster> it = clusters.iterator();
while(it.hasNext())
{
Cluster cluster = it.next();
if (cluster.getDataPoints().size() == 0)
{
it.remove();
}
}
for (DataPoint dataPoint : texts) for (DataPoint dataPoint : texts)
{ {
List<DataPoint> list = new ArrayList<>(); List<DataPoint> list = new ArrayList<>();
...@@ -286,6 +327,7 @@ public class ClusterUtil { ...@@ -286,6 +327,7 @@ public class ClusterUtil {
clusters.add(cluster); clusters.add(cluster);
} }
} }
} }
......
package com.zhiweidata.titleAggregation.main; package com.zhiweidata.titleAggregation.main;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.zhiweidata.titleAggregation.algorithm.AllAlgorithm;
import com.zhiweidata.titleAggregation.algorithm.impl.MySimHash;
import com.zhiweidata.titleAggregation.bean.Cluster; import com.zhiweidata.titleAggregation.bean.Cluster;
import com.zhiweidata.titleAggregation.bean.DataPoint; import com.zhiweidata.titleAggregation.bean.DataPoint;
import com.zhiweidata.titleAggregation.bean.Result;
import com.zhiweidata.titleAggregation.method.ComputeWordsVector;
import com.zhiweidata.titleAggregation.method.CosineSimilarity;
import com.zhiweidata.titleAggregation.method.MySimHash;
import com.zhiweidata.titleAggregation.util.ChineseTranslate; import com.zhiweidata.titleAggregation.util.ChineseTranslate;
import com.zhiweidata.titleAggregation.util.ComputeWordsVector;
import com.zhiweidata.titleAggregation.util.BasicUtil; import com.zhiweidata.titleAggregation.util.BasicUtil;
import com.zhiweidata.titleAggregation.util.ChineseTranslate.goal; import com.zhiweidata.titleAggregation.util.ChineseTranslate.goal;
...@@ -25,59 +23,29 @@ import com.zhiweidata.titleAggregation.util.ChineseTranslate.goal; ...@@ -25,59 +23,29 @@ import com.zhiweidata.titleAggregation.util.ChineseTranslate.goal;
* @date 2017年12月26日 上午9:47:58 * @date 2017年12月26日 上午9:47:58
*/ */
public class HCluster { public class HCluster {
static MySimHash hash = (MySimHash)AllAlgorithm.getInstance(
com.zhiweidata.titleAggregation.algorithm.AllAlgorithm.goal.hash);
//简繁体翻译
static ChineseTranslate simplifiedTrans = ChineseTranslate.getInstance(goal.简体);
public List<Result> DataToResult(Map<Integer,String> texts,int freq,double cosFreq){ static ClusterUtil util = new ClusterUtil();
List<Cluster> clusters = changeData(texts, freq, cosFreq);
MySimHash hash = new MySimHash();
CosineSimilarity cos = new CosineSimilarity();
List<Result> list = new ArrayList<>(); public void sumCluster(List<Cluster> clusters,double cosFreq){
for (Cluster cluster : clusters) util.mergeLikeCluster(clusters, cosFreq);
{ util.alertLikeData(clusters,cosFreq);
if (cluster.getDataPoints().size() == 0)
{
continue;
}
String clusterName = cluster.getClusterName();
for (DataPoint dataPoint : cluster.getDataPoints())
{
String dataPointName = dataPoint.getDataPointName();
Result result = new Result();
result.setClusterName(clusterName);
result.setDatapointName(dataPointName);
result.setI(dataPoint.getI());
result.setSimhash(hash.getDistance(clusterName, dataPointName));
result.setCosSimilarity(cos.CalculateTextSim(clusterName, dataPointName));
list.add(result);
}
}
Collections.sort(list, new Comparator<Result>() {
@Override
public int compare(Result o1, Result o2) {
return o1.getI() - o2.getI();
} }
});
return list;
}
/** /**
* 将标题以长度分组选择不同的相似度 * 将标题以长度分组选择不同的相似度
* @Title: changeData * @Title: changeData
* @Description: TODO(将标题以长度分组选择不同的相似度) * @Description: TODO(将标题以长度分组选择不同的相似度)
* @param texts * @param texts
* @param freq simhash距离,默认(推荐)为9 * @param freq simhash距离,默认(推荐)为9
* @param cosFreq 余弦的相似度 默认(推荐)为0.93
* @return * @return
* Map<Integer,Map<Integer,String>> 返回类型 * Map<Integer,Map<Integer,String>> 返回类型
*/ */
public List<Cluster> changeData(Map<Integer,String> texts,int freq,double cosFreq) { public List<Cluster> changeData(Map<Integer,String> texts,int freq) {
//简繁体翻译
ChineseTranslate simplifiedTrans = ChineseTranslate.getInstance(goal.简体);
//按标题长度分组 //按标题长度分组
Map<Integer, String> shortText = new HashMap<>(); Map<Integer, String> shortText = new HashMap<>();
Map<Integer, String> middleText = new HashMap<>(); Map<Integer, String> middleText = new HashMap<>();
...@@ -109,19 +77,11 @@ public class HCluster { ...@@ -109,19 +77,11 @@ public class HCluster {
freq += 2; freq += 2;
clusters.addAll(startCluster(longText, freq)); clusters.addAll(startCluster(longText, freq));
ClusterUtil util = new ClusterUtil();
util.mergeLikeCluster(clusters, cosFreq);
util.alertLikeData(clusters,cosFreq);
return clusters; return clusters;
} }
/** 聚类的主方法*/ /** 聚类的主方法*/
private List<Cluster> startCluster(Map<Integer, String> titles, int freq) { private static List<Cluster> startCluster(Map<Integer, String> titles, int freq) {
ClusterUtil util = new ClusterUtil();
MySimHash hash = new MySimHash();
List<DataPoint> dp = readData(titles); List<DataPoint> dp = readData(titles);
// 声明cluster类,存放类名和类簇中含有的样本 // 声明cluster类,存放类名和类簇中含有的样本
...@@ -132,7 +92,9 @@ public class HCluster { ...@@ -132,7 +92,9 @@ public class HCluster {
// flag为判断标志 // flag为判断标志
boolean flag = true; boolean flag = true;
int it = 0; int it = 0;
//hash距离
int[][] distances = hash.getDistance(BasicUtil.toList(titles)); int[][] distances = hash.getDistance(BasicUtil.toList(titles));
while (flag) { while (flag) {
// mergeIndexA和mergeIndexB表示每一次迭代聚类最小的两个类簇,也就是每一次迭代要合并的两个类簇 // mergeIndexA和mergeIndexB表示每一次迭代聚类最小的两个类簇,也就是每一次迭代要合并的两个类簇
int mergeIndexA = 0; int mergeIndexA = 0;
...@@ -140,7 +102,8 @@ public class HCluster { ...@@ -140,7 +102,8 @@ public class HCluster {
/* /*
* 迭代开始,分别去计算每个类簇之间的距离,将距离小的类簇合并 * 迭代开始,分别去计算每个类簇之间的距离,将距离小的类簇合并
*/ */
for (int i = 0; i < finalClusters.size() - 1; i++) int size = finalClusters.size();
for (int i = 0; i < size - 1; i++)
{ {
if (finalClusters.get(i).getDataPoints().size() == 0) if (finalClusters.get(i).getDataPoints().size() == 0)
{ {
...@@ -149,7 +112,7 @@ public class HCluster { ...@@ -149,7 +112,7 @@ public class HCluster {
int min = freq; int min = freq;
for (int j = i + 1; j < finalClusters.size(); j++) for (int j = i + 1; j < size; j++)
{ {
if (finalClusters.get(j).getDataPoints().size() == 0) if (finalClusters.get(j).getDataPoints().size() == 0)
{ {
...@@ -178,22 +141,23 @@ public class HCluster { ...@@ -178,22 +141,23 @@ public class HCluster {
{ {
it++; it++;
} }
//持续5次,都为0,判断算法结束
if (it > 5) { if (it > 5) {
flag = false; flag = false;
} }
} }
return finalClusters; return finalClusters;
} }
/**初始化类簇*/ /**初始化类簇*/
private List<Cluster> initialCluster(List<DataPoint> dpoints) { private static List<Cluster> initialCluster(List<DataPoint> dpoints) {
// 声明存放初始化类簇的链表 // 声明存放初始化类簇的链表
List<Cluster> originalClusters = new ArrayList<>(); List<Cluster> originalClusters = new ArrayList<>();
// 声明一个临时的用于存放样本点的链表 // 声明一个临时的用于存放样本点的链表
List<DataPoint> tempDataPoints; List<DataPoint> tempDataPoints;
for (int i = 0; i < dpoints.size(); i++) { int size = dpoints.size();
for (int i = 0; i < size; i++) {
tempDataPoints = new ArrayList<>(); tempDataPoints = new ArrayList<>();
// 得到每一个样本点 // 得到每一个样本点
DataPoint tempDataPoint = dpoints.get(i); DataPoint tempDataPoint = dpoints.get(i);
...@@ -216,7 +180,8 @@ public class HCluster { ...@@ -216,7 +180,8 @@ public class HCluster {
* @param titles * @param titles
* @return * @return
*/ */
private List<DataPoint> readData(Map<Integer, String> titles) { private static List<DataPoint> readData(Map<Integer, String> titles) {
return new ComputeWordsVector().computeTFMultiIDF(titles); return ComputeWordsVector.computeTFMultiIDF(titles);
} }
} }
\ No newline at end of file
/**
* @Title: Means.java
* @Package com.zhiweidata.titleAggregation.main
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2017年12月29日 上午10:22:18
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.titleAggregation.main;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.zhiweidata.titleAggregation.bean.Cluster;
import com.zhiweidata.titleAggregation.bean.Result;
import com.zhiweidata.titleAggregation.util.BasicUtil;
import com.zhiweidata.titleAggregation.util.ThreadPool;
/**
 * Entry point that cuts the input titles into fixed-size chunks, hands the chunks to
 * {@code ThreadPool.ClusterRun} and converts the resulting clusters into results.
 *
 * @ClassName: Means
 * @Description: chunked clustering driver
 * @author xuyimeng
 * @date 2017-12-29 10:22:18 AM
 */
public class Means {

    /** Maximum number of titles placed in one chunk. */
    private static final int CHUNK_SIZE = 350;

    /**
     * Clusters the given titles.
     *
     * @param texts titles keyed by their id
     * @param freq threshold forwarded to the clustering step
     * @param cosFreq cosine threshold forwarded to the clustering step
     * @return one result per clustered title, as produced by {@code BasicUtil.toResult}
     */
    public static List<Result>changeMeans(Map<Integer,String> texts,int freq,double cosFreq) {
        return cutTexts(texts, freq, cosFreq);
    }

    /** Chunks the input, runs the clustering and converts the outcome, printing the time of each step. */
    private static List<Result> cutTexts(Map<Integer,String> texts,int freq,double cosFreq) {
        long t = System.currentTimeMillis();
        List<Map<Integer,String>> list = partition(texts, CHUNK_SIZE);
        System.out.println("切分list,共切出:"+list.size()+"个,用时:"+(System.currentTimeMillis()-t));

        t = System.currentTimeMillis();
        List<Cluster> clusters = ThreadPool.ClusterRun(list, freq, cosFreq);
        System.out.println("运算成功,用时"+(System.currentTimeMillis() - t));

        t = System.currentTimeMillis();
        List<Result> results = BasicUtil.toResult(clusters);
        System.out.println("转化结果成功,用时"+(System.currentTimeMillis()-t));
        return results;
    }

    /**
     * Splits a map into consecutive chunks of at most {@code chunkSize} entries.
     * Every entry lands in exactly one chunk; the previous counter-based loop lost the last
     * chunk when the counter ended on exactly the chunk size and could append an empty chunk.
     * An empty input still yields a single empty chunk, as before.
     */
    private static List<Map<Integer,String>> partition(Map<Integer,String> texts, int chunkSize) {
        List<Map<Integer,String>> chunks = new ArrayList<>();
        Map<Integer, String> current = new HashMap<>();
        for (Map.Entry<Integer, String> entry : texts.entrySet()) {
            current.put(entry.getKey(), entry.getValue());
            if (current.size() >= chunkSize) {
                chunks.add(current);
                current = new HashMap<>();
            }
        }
        if (!current.isEmpty() || chunks.isEmpty()) {
            chunks.add(current);
        }
        return chunks;
    }
}
...@@ -117,14 +117,15 @@ public class CosineSimilarity { ...@@ -117,14 +117,15 @@ public class CosineSimilarity {
*/ */
public double[][] getDistance(List<String> titles) { public double[][] getDistance(List<String> titles) {
CosineSimilarity cosineSimilarity = new CosineSimilarity(); CosineSimilarity cosineSimilarity = new CosineSimilarity();
double[][] distance = new double[titles.size()][titles.size()]; int size = titles.size();
double[][] distance = new double[size][size];
String doc1 = ""; String doc1 = "";
String doc2 = ""; String doc2 = "";
for (int i = 0; i < titles.size()-1; i++) for (int i = 0; i < size-1; i++)
{ {
doc1 = titles.get(i); doc1 = titles.get(i);
for (int j = i+1; j < titles.size(); j++) for (int j = i+1; j < size; j++)
{ {
doc2 = titles.get(j); doc2 = titles.get(j);
distance[i][j] = cosineSimilarity.CalculateTextSim(doc1, doc2); distance[i][j] = cosineSimilarity.CalculateTextSim(doc1, doc2);
......
...@@ -47,7 +47,7 @@ public class CutPage { ...@@ -47,7 +47,7 @@ public class CutPage {
return result /goalTexts.size(); return result /goalTexts.size();
} }
public List<String> splitString(String text){ public static List<String> splitString(String text){
List<String> result = new ArrayList<>(); List<String> result = new ArrayList<>();
List<String> list = Arrays.asList(text.split("。")); List<String> list = Arrays.asList(text.split("。"));
......
...@@ -6,6 +6,8 @@ import java.util.HashMap; ...@@ -6,6 +6,8 @@ import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import javax.rmi.CORBA.Tie;
import org.ansj.domain.Term; import org.ansj.domain.Term;
import com.zhiweidata.titleAggregation.util.AnsjSeg; import com.zhiweidata.titleAggregation.util.AnsjSeg;
...@@ -144,19 +146,20 @@ public class MySimHash { ...@@ -144,19 +146,20 @@ public class MySimHash {
* int[][] 返回类型 * int[][] 返回类型
*/ */
public int[][] getDistance(List<String> titles) { public int[][] getDistance(List<String> titles) {
List<MySimHash> listHash = new ArrayList<>(); int size = titles.size();
for (int i = 0; i < titles.size(); i++) List<MySimHash> listHash = new ArrayList<>(size);
for (int i = 0; i < size; i++)
{ {
MySimHash mySimHash = new MySimHash(); MySimHash mySimHash = new MySimHash();
mySimHash.setTokens(titles.get(i)); mySimHash.setTokens(titles.get(i));
listHash.add(mySimHash); listHash.add(mySimHash);
} }
int[][] distance = new int[titles.size()][titles.size()]; int[][] distance = new int[size][size];
int temp; int temp;
for (int i = 0; i < titles.size()-1; i++) for (int i = 0; i < size-1; i++)
{ {
for (int j = i+1; j < titles.size(); j++) for (int j = i+1; j < size; j++)
{ {
temp = listHash.get(i).hammingDistance(listHash.get(j)); temp = listHash.get(i).hammingDistance(listHash.get(j));
distance[i][j] = temp; distance[i][j] = temp;
...@@ -174,8 +177,9 @@ public class MySimHash { ...@@ -174,8 +177,9 @@ public class MySimHash {
* int[] 返回类型 * int[] 返回类型
*/ */
public int[] getDistance(List<String> titles,String text) { public int[] getDistance(List<String> titles,String text) {
List<MySimHash> listHash = new ArrayList<>(); int size = titles.size();
for (int i = 0; i < titles.size(); i++) List<MySimHash> listHash = new ArrayList<>(size);
for (int i = 0; i < size; i++)
{ {
MySimHash mySimHash = new MySimHash(); MySimHash mySimHash = new MySimHash();
mySimHash.setTokens(titles.get(i)); mySimHash.setTokens(titles.get(i));
...@@ -186,7 +190,7 @@ public class MySimHash { ...@@ -186,7 +190,7 @@ public class MySimHash {
int temp; int temp;
MySimHash mySimHash = new MySimHash(); MySimHash mySimHash = new MySimHash();
mySimHash.setTokens(text); mySimHash.setTokens(text);
for (int i = 0; i < titles.size(); i++) for (int i = 0; i < size; i++)
{ {
temp = mySimHash.hammingDistance(listHash.get(i)); temp = mySimHash.hammingDistance(listHash.get(i));
distance[i] = temp; distance[i] = temp;
...@@ -211,8 +215,9 @@ public class MySimHash { ...@@ -211,8 +215,9 @@ public class MySimHash {
* double[] 返回类型 * double[] 返回类型
*/ */
public double[] getSimilarity(List<String> titles,String text) { public double[] getSimilarity(List<String> titles,String text) {
List<MySimHash> listHash = new ArrayList<>(); int size = titles.size();
for (int i = 0; i < titles.size(); i++) List<MySimHash> listHash = new ArrayList<>(size);
for (int i = 0; i < size; i++)
{ {
MySimHash mySimHash = new MySimHash(); MySimHash mySimHash = new MySimHash();
mySimHash.setTokens(titles.get(i)); mySimHash.setTokens(titles.get(i));
...@@ -223,7 +228,7 @@ public class MySimHash { ...@@ -223,7 +228,7 @@ public class MySimHash {
double temp; double temp;
MySimHash mySimHash = new MySimHash(); MySimHash mySimHash = new MySimHash();
mySimHash.setTokens(text); mySimHash.setTokens(text);
for (int i = 0; i < titles.size(); i++) for (int i = 0; i < size; i++)
{ {
temp = mySimHash.getSemblance(listHash.get(i)); temp = mySimHash.getSemblance(listHash.get(i));
distance[i] = temp; distance[i] = temp;
......
...@@ -2,9 +2,18 @@ package com.zhiweidata.titleAggregation.util; ...@@ -2,9 +2,18 @@ package com.zhiweidata.titleAggregation.util;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.TreeMap;
import com.zhiweidata.titleAggregation.algorithm.Algorithm;
import com.zhiweidata.titleAggregation.algorithm.AllAlgorithm;
import com.zhiweidata.titleAggregation.algorithm.AllAlgorithm.goal;
import com.zhiweidata.titleAggregation.algorithm.impl.MySimHash;
import com.zhiweidata.titleAggregation.bean.Cluster;
import com.zhiweidata.titleAggregation.bean.DataPoint;
import com.zhiweidata.titleAggregation.bean.Result;
/** /**
* @ClassName: Util * @ClassName: Util
...@@ -13,9 +22,81 @@ import java.util.Map; ...@@ -13,9 +22,81 @@ import java.util.Map;
* @date 2017年12月26日 上午9:15:29 * @date 2017年12月26日 上午9:15:29
*/ */
public class BasicUtil { public class BasicUtil {
/**
 * Flattens clusters into a list of {@link Result} rows, one row per data point.
 *
 * <p>Clusters without data points are skipped. Each row carries the cluster name,
 * the data point name, the data point's index ({@code getI()}), the SimHash distance
 * between the two names and their cosine similarity. The returned list is sorted in
 * ascending order of the index.
 *
 * @param clusters clusters to convert
 * @return the result rows ordered by {@code getI()}
 */
public static List<Result> toResult(List<Cluster> clusters)
{
    MySimHash hash = (MySimHash) AllAlgorithm.getInstance(goal.hash);
    Algorithm cos = AllAlgorithm.getInstance(goal.cos);
    List<Result> list = new ArrayList<>();
    for (Cluster cluster : clusters)
    {
        if (cluster.getDataPoints().size() == 0)
        {
            continue;
        }
        String clusterName = cluster.getClusterName();
        for (DataPoint dataPoint : cluster.getDataPoints())
        {
            String dataPointName = dataPoint.getDataPointName();
            Result result = new Result();
            result.setClusterName(clusterName);
            result.setDatapointName(dataPointName);
            result.setI(dataPoint.getI());
            result.setSimhash(hash.getDistance(clusterName, dataPointName));
            result.setCosSimilarity(cos.getSimilarity(clusterName, dataPointName));
            list.add(result);
        }
    }
    // Sort by index. Integer.compare is used instead of subtraction because
    // "a - b" can overflow and report the wrong ordering.
    Collections.sort(list, new Comparator<Result>() {
        @Override
        public int compare(Result o1, Result o2) {
            return Integer.compare(o1.getI(), o2.getI());
        }
    });
    return list;
}
/**
 * Collects the names of all clusters that contain at least one data point.
 *
 * @param list clusters to inspect
 * @return cluster names, in the order the clusters appear in {@code list}
 */
public static List<String> toString(List<Cluster> list) {
    List<String> names = new ArrayList<>();
    for (Cluster current : list) {
        // Only clusters that actually hold data points contribute a name.
        if (current.getDataPoints().size() != 0) {
            names.add(current.getClusterName());
        }
    }
    return names;
}
/**
*
* @Title: toMap
* @Description: TODO(list转为map)
* @param list
* @return
* Map<Integer,String> 返回类型
*/
public static Map<Integer,String> toMap(List<String> list) public static Map<Integer,String> toMap(List<String> list)
{ {
Map<Integer,String> texts = new HashMap<>(); Map<Integer,String> texts = new TreeMap<>();
int i = 0; int i = 0;
for (String text : list) for (String text : list)
{ {
...@@ -42,7 +123,33 @@ public class BasicUtil { ...@@ -42,7 +123,33 @@ public class BasicUtil {
} }
return list; return list;
} }
/**
 * Splits a list into consecutive chunks of at most {@code len} elements.
 *
 * <p>The chunks are {@link List#subList(int, int)} views of {@code list}, not copies.
 * The last chunk may be shorter than {@code len}.
 *
 * @param list the list to split
 * @param len  maximum number of elements per chunk
 * @return the chunks in order, or {@code null} when {@code list} is {@code null} or
 *         empty, or when {@code len} is less than 1
 */
public static List<List<?>> splitList(List<?> list, int len) {
    if (list == null || list.size() == 0 || len < 1)
    {
        return null;
    }
    List<List<?>> result = new ArrayList<List<?>>();
    int size = list.size();
    int start = 0;
    while (start < size)
    {
        // Compare against the remaining length instead of computing start + len first,
        // so a very large len cannot overflow int and produce a bogus bound.
        int end = (len >= size - start) ? size : start + len;
        result.add(list.subList(start, end));
        start = end;
    }
    return result;
}
/** /**
* 去除集合空的元素 * 去除集合空的元素
*/ */
......
package com.zhiweidata.titleAggregation.util;
import java.util.*;
import org.ansj.domain.Term;
import com.zhiweidata.titleAggregation.bean.DataPoint;
/**
 * Turns documents into weighted term vectors ({@link DataPoint}s).
 *
 * @author xuyimeng
 */
public class ComputeWordsVector {

    /**
     * Builds one {@link DataPoint} per input document.
     *
     * <p>For every document the term map is filled by {@code AnsjSeg.getString}
     * (presumably with per-term counts for that document; confirm against AnsjSeg).
     * Each term that also appears in the corpus-wide count map from
     * {@link #computeIDF(List)} is then re-weighted as
     * {@code (value / numberOfEntriesInDocMap) * ln(documentCount / corpusCount)}.
     * Terms missing from the corpus map keep the value written by the segmenter.
     *
     * @param testSampleDir documents keyed by their index
     * @return one data point per entry, carrying the document text as its name, the
     *         weighted term map as its dimension and the key as its index
     */
    public static List<DataPoint> computeTFMultiIDF(Map<Integer, String> testSampleDir) {
        List<DataPoint> dataPoints = new ArrayList<>();
        Map<String, Double> idfPerWordMap = computeIDF(BasicUtil.toList(testSampleDir));
        AnsjSeg ansj = AnsjSeg.getInstance();
        int docCount = testSampleDir.size();
        for (Map.Entry<Integer, String> doc : testSampleDir.entrySet()) {
            String text = doc.getValue();
            // A fresh map per document: reusing one shared map would make every
            // DataPoint alias the same (repeatedly cleared) instance.
            Map<String, Double> tfPerDocMap = new TreeMap<String, Double>();
            ansj.getString(tfPerDocMap, text);
            // Normaliser is the number of entries in the document's term map.
            double wordSumPerDoc = tfPerDocMap.size();
            for (Map.Entry<String, Double> termEntry : tfPerDocMap.entrySet()) {
                Double corpusCount = idfPerWordMap.get(termEntry.getKey());
                if (corpusCount != null) {
                    double idf = Math.log(docCount / corpusCount);
                    // setValue updates in place without structurally modifying the map.
                    termEntry.setValue((termEntry.getValue() / wordSumPerDoc) * idf);
                }
            }
            DataPoint dataPoint = new DataPoint();
            dataPoint.setDataPointName(text);
            dataPoint.setDimensioin(tfPerDocMap);
            dataPoint.setI(doc.getKey());
            dataPoints.add(dataPoint);
        }
        return dataPoints;
    }

    /**
     * Counts how often each term occurs in the segmented corpus.
     *
     * <p>Despite the name, the returned values are raw occurrence counts, not inverse
     * document frequencies; the logarithm is applied by the caller.
     *
     * <p>NOTE(review): the segmenter is fed {@code testSample.toString()}, i.e. the
     * list's bracketed, comma-separated string form, so the separators are part of
     * the segmented text. Confirm this is intended.
     *
     * @param testSample the documents of the corpus
     * @return map from term name to its number of occurrences
     */
    public static Map<String, Double> computeIDF(List<String> testSample) {
        Map<String, Double> IDFPerWordMap = new TreeMap<String, Double>();
        AnsjSeg ansj = AnsjSeg.getInstance();
        for (Term term : ansj.getTerms(testSample.toString())) {
            IDFPerWordMap.merge(term.getName(), 1.0, Double::sum);
        }
        return IDFPerWordMap;
    }
}
\ No newline at end of file
/**
* @Title: ThreadPool.java
* @Package com.zhiweidata.titleAggregation.util
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2017年12月29日 下午3:26:49
* @version V1.0
*/
package com.zhiweidata.titleAggregation.util;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import com.zhiweidata.titleAggregation.bean.Cluster;
import com.zhiweidata.titleAggregation.main.ClusterUtil;
import com.zhiweidata.titleAggregation.main.HCluster;
/**
 * Runs clustering and cluster merging on a fixed-size pool of 8 worker threads.
 *
 * @author xuyimeng
 */
public class ThreadPool {
    private static final HCluster hCluster = new HCluster();

    /**
     * Clusters every batch of texts in parallel and then merges the combined clusters.
     *
     * <p>Each batch is processed by {@code HCluster.changeData} followed by
     * {@code HCluster.sumCluster}. The per-batch clusters are gathered into one list,
     * whose order depends on task completion order, and passed to
     * {@link #mergeCluster(List, double)}.
     *
     * @param list    batches of texts, each keyed by text index
     * @param freq    threshold forwarded to {@code HCluster.changeData}
     * @param cosFreq threshold forwarded to {@code HCluster.sumCluster} and to the merge step
     * @return the merged cluster list
     */
    public static List<Cluster> ClusterRun(List<Map<Integer, String>> list, final int freq, final double cosFreq) {
        ExecutorService fixedThreadPool = Executors.newFixedThreadPool(8);
        // Workers append concurrently, so the collecting list must be thread-safe.
        final List<Cluster> collected = Collections.synchronizedList(new ArrayList<Cluster>());
        for (final Map<Integer, String> texts : list)
        {
            fixedThreadPool.execute(new Runnable() {
                @Override
                public void run() {
                    List<Cluster> clusters = hCluster.changeData(texts, freq);
                    hCluster.sumCluster(clusters, cosFreq);
                    collected.addAll(clusters);
                }
            });
        }
        fixedThreadPool.shutdown();
        awaitCompletion(fixedThreadPool);
        // Hand a plain ArrayList snapshot to the merge step, as before.
        return mergeCluster(new ArrayList<>(collected), cosFreq);
    }

    /**
     * Submits one merge task per non-empty cluster (the last cluster is never used as
     * a merge source) and waits until all tasks have finished.
     *
     * <p>NOTE(review): all tasks operate on the same {@code clusterList} and
     * {@code names} through {@code ClusterUtil.mergeCluster}; whether that method is
     * safe to run concurrently cannot be determined from here and should be confirmed.
     *
     * @param clusterList clusters to merge; passed to every merge task
     * @param cosFreq     threshold forwarded to {@code ClusterUtil.mergeCluster}
     * @return the same {@code clusterList} instance after all tasks have completed
     */
    public static List<Cluster> mergeCluster(final List<Cluster> clusterList, final double cosFreq) {
        ExecutorService fixedThreadPool = Executors.newFixedThreadPool(8);
        final ClusterUtil util = new ClusterUtil();
        final List<String> names = BasicUtil.toString(clusterList);
        int size = clusterList.size();
        for (int i = 0; i < size - 1; i++)
        {
            if (clusterList.get(i).getDataPoints().size() == 0)
            {
                continue;
            }
            final String name = clusterList.get(i).getClusterName();
            final int index = i;
            fixedThreadPool.execute(new Runnable() {
                @Override
                public void run() {
                    util.mergeCluster(names, clusterList, index, name, cosFreq);
                }
            });
        }
        fixedThreadPool.shutdown();
        awaitCompletion(fixedThreadPool);
        return clusterList;
    }

    /**
     * Blocks until the pool has terminated, without busy-spinning.
     *
     * <p>Interrupts do not abort the wait; the interrupt flag is restored once the
     * pool is done.
     */
    private static void awaitCompletion(ExecutorService pool) {
        boolean interrupted = false;
        while (!pool.isTerminated())
        {
            try
            {
                pool.awaitTermination(1, TimeUnit.SECONDS);
            }
            catch (InterruptedException e)
            {
                interrupted = true;
            }
        }
        if (interrupted)
        {
            Thread.currentThread().interrupt();
        }
    }
}
...@@ -59,10 +59,10 @@ public class ExcelTest { ...@@ -59,10 +59,10 @@ public class ExcelTest {
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
List<Map<String, Object>> body = (List<Map<String, Object>>)map.get("body"); List<Map<String, Object>> body = (List<Map<String, Object>>)map.get("body");
List<String> titles = exportTitleData(body); List<String> titles = exportTitleData(body);
long time = System.currentTimeMillis();
//调用算法 //调用算法
ClusterResult cr = new ClusterResult(); List<Result> list = ClusterResult.getResult(titles);
List<Result> list = cr.getResult(titles); System.out.println("时间:"+(System.currentTimeMillis() - time)+",量级:"+list.size());
DBOExp dbo = new DBOExp(); DBOExp dbo = new DBOExp();
dbo.putRun(GroupSheet(list,body),goalPath,"聚合"); dbo.putRun(GroupSheet(list,body),goalPath,"聚合");
dbo.putRun(AllSheet(body), goalPath, "全部"); dbo.putRun(AllSheet(body), goalPath, "全部");
...@@ -81,7 +81,7 @@ public class ExcelTest { ...@@ -81,7 +81,7 @@ public class ExcelTest {
{ {
for (String key : map.keySet()) for (String key : map.keySet())
{ {
if (key.equals("标题")) if (key.equals("标题") && key!=null)
{ {
titles.add(map.get(key).toString()); titles.add(map.get(key).toString());
} }
......
...@@ -12,6 +12,7 @@ import org.junit.Test; ...@@ -12,6 +12,7 @@ import org.junit.Test;
import com.mongodb.BasicDBObject; import com.mongodb.BasicDBObject;
import com.mongodb.DBObject; import com.mongodb.DBObject;
import com.zhiweidata.titleAggregation.bean.Result; import com.zhiweidata.titleAggregation.bean.Result;
import com.zhiweidata.titleAggregation.main.ClusterResult;
import com.zhiweidata.titleAggregation.main.HCluster; import com.zhiweidata.titleAggregation.main.HCluster;
import com.zhiweidata.titleAggregation.util.ChineseTranslate; import com.zhiweidata.titleAggregation.util.ChineseTranslate;
import com.zhiweidata.titleAggregation.util.ChineseTranslate.goal; import com.zhiweidata.titleAggregation.util.ChineseTranslate.goal;
...@@ -87,27 +88,25 @@ public class MongoStart { ...@@ -87,27 +88,25 @@ public class MongoStart {
List<MediaAndWechatEvent> listEvent = util.getListData(pt,eventId); List<MediaAndWechatEvent> listEvent = util.getListData(pt,eventId);
Map<Integer,String> texts = new HashMap<>(); List<String> texts = new ArrayList<>();
int i = 0;
for (MediaAndWechatEvent event : listEvent) for (MediaAndWechatEvent event : listEvent)
{ {
String text = event.getTitle().replaceAll("\\.", "-"); String text = event.getTitle().replaceAll("\\.", "-");
texts.put(i, text); texts.add(text);
i++;
} }
System.out.println("start"); System.out.println("start");
//调用算法 //调用算法
HCluster hc = new HCluster();
long time = System.currentTimeMillis(); long time = System.currentTimeMillis();
List<Result> list = hc.DataToResult(texts, 9, 0.93); List<Result> list = ClusterResult.getResult(texts, 9, 0.93);
long t = System.currentTimeMillis()-time; long t = System.currentTimeMillis()-time;
System.out.println("事件名:"+name+"——"+pt+"数据,数据量:"+texts.size()+" ,输出时间:"+t); System.out.println("事件名:"+name+"——"+pt+"数据,数据量:"+texts.size()+" ,输出时间:"+t);
DBOExp dbo = new DBOExp(); // DBOExp dbo = new DBOExp();
dbo.putRun(GroupSheet(list,listEvent),path,"聚合"); // dbo.putRun(GroupSheet(list,listEvent),path,"聚合");
dbo.putRun(AllSheet(listEvent), path, "全部"); // dbo.putRun(AllSheet(listEvent), path, "全部");
} }
/** /**
......
...@@ -14,6 +14,7 @@ import com.zhiweidata.titleAggregation.method.MySimHash; ...@@ -14,6 +14,7 @@ import com.zhiweidata.titleAggregation.method.MySimHash;
import com.zhiweidata.titleAggregation.util.AnsjSeg; import com.zhiweidata.titleAggregation.util.AnsjSeg;
public class ResultTest { public class ResultTest {
@Test @Test
public void test5() { public void test5() {
List<String> texts = new ArrayList<>(); List<String> texts = new ArrayList<>();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment