Commit 98076b77 by win7

标题聚合工具类

parents
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.zhiweidata</groupId>
  <artifactId>titleAggregation</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <dependencies>
    <!-- Unit testing; test scope keeps JUnit off the runtime classpath of consumers -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
    <!-- MongoDB connectivity -->
    <dependency>
      <groupId>org.mongodb</groupId>
      <artifactId>mongo-java-driver</artifactId>
      <version>3.2.2</version>
    </dependency>
    <!-- Logging -->
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.14</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>1.7.21</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.7.21</version>
    </dependency>
    <!-- Excel import/export -->
    <dependency>
      <groupId>com.zhiwei</groupId>
      <artifactId>jxlzw</artifactId>
      <version>0.0.2-SNAPSHOT</version>
    </dependency>
    <!-- Chinese word segmentation -->
    <dependency>
      <groupId>org.ansj</groupId>
      <artifactId>ansj_seg</artifactId>
      <version>5.0.2</version>
    </dependency>
  </dependencies>
</project>
\ No newline at end of file
package com.zhiweidata.titleAggregation.bean;
import java.util.List;
/**
 * Bean describing one cluster: a name, an integer index and the sample points
 * that currently belong to the cluster.
 *
 * @author xuyimeng
 * @date 2017-12-26 09:14:18
 */
public class Cluster {

    /** Integer index attached to the cluster. */
    private Integer i;

    /** Display name of the cluster. */
    private String clusterName;

    /** Sample points contained in the cluster. */
    private List<DataPoint> dataPoints;

    /** @return the name of this cluster */
    public String getClusterName() {
        return this.clusterName;
    }

    /** @param clusterName the new name of this cluster */
    public void setClusterName(String clusterName) {
        this.clusterName = clusterName;
    }

    /** @return the sample points of this cluster (the stored list itself, not a copy) */
    public List<DataPoint> getDataPoints() {
        return this.dataPoints;
    }

    /** @param dataPoints the list to store as this cluster's sample points */
    public void setDataPoints(List<DataPoint> dataPoints) {
        this.dataPoints = dataPoints;
    }

    /** @return the integer index of this cluster */
    public Integer getI() {
        return this.i;
    }

    /** @param i the integer index of this cluster */
    public void setI(Integer i) {
        this.i = i;
    }
}
package com.zhiweidata.titleAggregation.bean;
import java.util.Map;
/**
 * Bean describing a single sample: its name, a per-word weight map and an
 * integer index.
 *
 * @author xuyimeng
 * @date 2017-12-26 09:14:41
 */
public class DataPoint {

    /** Name of the sample point. */
    String dataPointName;

    /** Word -> weight map of the sample point. */
    private Map<String, Double> dimensioin;

    /** Integer index attached to the sample point. */
    private Integer i;

    /** Creates an empty data point; every property starts as {@code null}. */
    public DataPoint() {
    }

    /**
     * Creates a data point with a name and a weight map; the index stays {@code null}.
     *
     * @param dataPointName name of the sample point
     * @param dimensioin    word -> weight map (stored as is, not copied)
     */
    public DataPoint(String dataPointName, Map<String, Double> dimensioin) {
        this.dataPointName = dataPointName;
        this.dimensioin = dimensioin;
    }

    /** @return the integer index of this sample point */
    public Integer getI() {
        return this.i;
    }

    /** @param i the integer index of this sample point */
    public void setI(Integer i) {
        this.i = i;
    }

    /** @return the name of this sample point */
    public String getDataPointName() {
        return this.dataPointName;
    }

    /** @param dataPointName the new name of this sample point */
    public void setDataPointName(String dataPointName) {
        this.dataPointName = dataPointName;
    }

    /** @return the word -> weight map (the stored map itself, not a copy) */
    public Map<String, Double> getDimensioin() {
        return this.dimensioin;
    }

    /** @param dimensioin the word -> weight map to store */
    public void setDimensioin(Map<String, Double> dimensioin) {
        this.dimensioin = dimensioin;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("DataPoint [dataPointName=");
        text.append(dataPointName);
        text.append(", dimensioin=").append(dimensioin);
        text.append(", i=").append(i);
        text.append("]");
        return text.toString();
    }
}
package com.zhiweidata.titleAggregation.main;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import com.zhiweidata.titleAggregation.bean.Cluster;
import com.zhiweidata.titleAggregation.bean.DataPoint;
/**
 * Helper operations on clusters: pruning and re-assigning members by cosine
 * similarity, merging clusters and choosing a cluster's representative title.
 *
 * @author xuyimeng
 * @date 2017-12-26 09:19:41
 */
public class ClusterUtil {

    /**
     * Re-checks every cluster member against its cluster name.
     * <p>
     * Members whose similarity to the cluster name is below {@code freq} are
     * removed. Each removed member is then added to the first cluster whose
     * name is more similar than {@code freq}; members that fit no cluster are
     * appended to {@code clusters} as new single-member clusters.
     *
     * @param clusters clusters to adjust in place; the list must be modifiable
     * @param freq     similarity threshold (closer to 1 means more similar)
     */
    public void alertLikeData(List<Cluster> clusters, double freq) {
        // Members removed from their cluster during the first pass.
        List<DataPoint> removed = new ArrayList<>();
        // Cluster names, index-aligned with `clusters` (empty clusters included).
        List<String> clusterNames = new ArrayList<>();

        for (Cluster cluster : clusters) {
            clusterNames.add(cluster.getClusterName());
            List<DataPoint> members = cluster.getDataPoints();
            if (members.size() == 0) {
                continue;
            }
            List<String> memberNames = new ArrayList<>();
            for (DataPoint member : members) {
                memberNames.add(member.getDataPointName());
            }
            double[] similarity = getDistance(memberNames, cluster.getClusterName());
            // Removal goes through the iterator so the list can shrink while being walked.
            Iterator<DataPoint> it = members.iterator();
            for (int index = 0; it.hasNext(); index++) {
                DataPoint member = it.next();
                if (similarity[index] < freq) {
                    removed.add(member);
                    it.remove();
                }
            }
        }

        // Removed members that match no existing cluster.
        List<DataPoint> unmatched = new ArrayList<>();
        for (DataPoint dataPoint : removed) {
            double[] similarity = getDistance(clusterNames, dataPoint.getDataPointName());
            boolean placed = false;
            for (int index = 0; index < clusterNames.size(); index++) {
                if (similarity[index] > freq) {
                    clusters.get(index).getDataPoints().add(dataPoint);
                    placed = true;
                    break;
                }
            }
            if (!placed) {
                unmatched.add(dataPoint);
            }
        }
        completedData(unmatched, clusters);
    }

    /**
     * Computes the pairwise cosine similarity of the given titles.
     * <p>
     * Only the cells with {@code i < j} are filled; all other cells stay 0.
     *
     * @param titles titles to compare
     * @return a {@code titles.size() x titles.size()} matrix of similarities
     */
    public double[][] getDistance(List<String> titles) {
        CosineSimilarity cosineSimilarity = new CosineSimilarity();
        int size = titles.size();
        double[][] distance = new double[size][size];
        for (int i = 0; i < size - 1; i++) {
            String first = titles.get(i);
            for (int j = i + 1; j < size; j++) {
                distance[i][j] = cosineSimilarity.CalculateTextSim(first, titles.get(j));
            }
        }
        return distance;
    }

    /**
     * Computes the cosine similarity between one text and every entry of a list.
     *
     * @param list texts to compare against
     * @param doc1 reference text
     * @return similarities, index-aligned with {@code list}
     */
    public double[] getDistance(List<String> list, String doc1) {
        CosineSimilarity cosineSimilarity = new CosineSimilarity();
        double[] distance = new double[list.size()];
        for (int i = 0; i < list.size(); i++) {
            distance[i] = cosineSimilarity.CalculateTextSim(doc1, list.get(i));
        }
        return distance;
    }

    /**
     * Moves every data point of cluster {@code mergeIndexB} into cluster
     * {@code mergeIndexA}, re-selects the name and index of cluster A from its
     * members and leaves cluster B with a new empty list. Does nothing when
     * both indexes are equal.
     *
     * @param finalClusters clusters to operate on
     * @param mergeIndexA   index of the receiving cluster
     * @param mergeIndexB   index of the cluster to empty
     * @return the same {@code finalClusters} list
     */
    public List<Cluster> mergeCluster(List<Cluster> finalClusters, int mergeIndexA, int mergeIndexB) {
        if (mergeIndexA == mergeIndexB) {
            return finalClusters;
        }
        Cluster receiver = finalClusters.get(mergeIndexA);
        Cluster donor = finalClusters.get(mergeIndexB);
        List<DataPoint> merged = receiver.getDataPoints();
        merged.addAll(donor.getDataPoints());

        DataPoint representative = changeMaxDataPoint(merged);
        // Null only when both clusters were empty; the old name and index are kept then.
        if (representative != null) {
            receiver.setClusterName(representative.getDataPointName());
            receiver.setI(representative.getI());
        }
        donor.setDataPoints(new ArrayList<DataPoint>());
        return finalClusters;
    }

    /**
     * Chooses the data point whose title names the cluster.
     * <p>
     * The most frequent title wins; among equally frequent titles the shortest
     * one wins; remaining ties go to the data point that comes first in the list.
     *
     * @param dataPoints members of the cluster
     * @return the first data point carrying the chosen title, or {@code null}
     *         when {@code dataPoints} is empty
     */
    public DataPoint changeMaxDataPoint(List<DataPoint> dataPoints) {
        // Number of occurrences of every title.
        Map<String, Integer> countMap = new HashMap<>();
        for (DataPoint dataPoint : dataPoints) {
            String title = dataPoint.getDataPointName();
            Integer seen = countMap.get(title);
            countMap.put(title, seen == null ? 1 : seen + 1);
        }

        DataPoint best = null;
        int bestCount = 0;
        int bestLength = 0;
        // Walk the list (not the map) so the result does not depend on hash order.
        for (DataPoint dataPoint : dataPoints) {
            String title = dataPoint.getDataPointName();
            int count = countMap.get(title);
            int length = title == null ? 0 : title.length();
            boolean better = best == null
                    || count > bestCount
                    || (count == bestCount && length < bestLength);
            if (better) {
                best = dataPoint;
                bestCount = count;
                bestLength = length;
            }
        }
        return best;
    }

    /**
     * Appends one new single-member cluster per given data point; the cluster
     * takes its name and index from that data point.
     *
     * @param texts    data points that belong to no cluster yet
     * @param clusters list the new clusters are appended to
     */
    public void completedData(List<DataPoint> texts, List<Cluster> clusters) {
        for (DataPoint dataPoint : texts) {
            List<DataPoint> members = new ArrayList<>();
            members.add(dataPoint);
            Cluster cluster = new Cluster();
            cluster.setClusterName(dataPoint.getDataPointName());
            cluster.setI(dataPoint.getI());
            cluster.setDataPoints(members);
            clusters.add(cluster);
        }
    }
}
package com.zhiweidata.titleAggregation.main;
import java.util.*;
import org.ansj.domain.Term;
import com.zhiweidata.titleAggregation.bean.DataPoint;
import com.zhiweidata.titleAggregation.util.AnsjSeg;
import com.zhiweidata.titleAggregation.util.Util;
/**
 * Turns titles into weighted word vectors.
 *
 * @author xuyimeng
 * @date 2017-12-26 09:22:06
 */
public class ComputeWordsVector {

    /**
     * Builds one {@link DataPoint} per entry of {@code testSampleDir}, holding
     * the title, its map key and a word -> weight map.
     * <p>
     * The weight of a word is {@code (count in title / number of distinct words
     * in title) * ln(number of titles / count returned by computeIDF)}. Words
     * missing from the {@link #computeIDF} result keep their raw count.
     * Every data point owns its own weight map.
     *
     * @param testSampleDir titles keyed by an integer id
     * @return one data point per title, in the iteration order of the map
     */
    public List<DataPoint> computeTFMultiIDF(Map<Integer, String> testSampleDir) {
        List<DataPoint> dataPoints = new ArrayList<>();
        Map<String, Double> idfPerWordMap = computeIDF(Util.toList(testSampleDir));
        AnsjSeg ansj = AnsjSeg.getInstance();

        for (Map.Entry<Integer, String> sample : testSampleDir.entrySet()) {
            String title = sample.getValue();
            // A fresh map per title: sharing one map would leave every data
            // point pointing at the vector of the last title processed.
            Map<String, Double> tfPerDocMap = new TreeMap<String, Double>();
            ansj.getString(tfPerDocMap, title);

            // Number of distinct words of the title (the size of the count map).
            double wordSumPerDoc = tfPerDocMap.size();
            for (Map.Entry<String, Double> me : tfPerDocMap.entrySet()) {
                Double occurrences = idfPerWordMap.get(me.getKey());
                if (occurrences != null) {
                    double idf = Math.log(testSampleDir.size() / occurrences);
                    me.setValue((me.getValue() / wordSumPerDoc) * idf);
                }
            }

            DataPoint dataPoint = new DataPoint();
            dataPoint.setDataPointName(title);
            dataPoint.setDimensioin(tfPerDocMap);
            dataPoint.setI(sample.getKey());
            dataPoints.add(dataPoint);
        }
        return dataPoints;
    }

    /**
     * Counts how often each segmented term occurs in the given titles.
     * <p>
     * The list is segmented as a single text (its {@code toString()} form), so
     * the result is the total number of occurrences of a term.
     * NOTE(review): this is an occurrence count, not a per-document frequency —
     * confirm that this is the intended "IDF" input.
     *
     * @param testSample titles to analyse
     * @return term -> number of occurrences
     */
    public Map<String, Double> computeIDF(List<String> testSample) {
        Map<String, Double> occurrencesPerWord = new TreeMap<String, Double>();
        AnsjSeg ansj = AnsjSeg.getInstance();
        for (Term term : ansj.getTerms(testSample.toString())) {
            String word = term.getName();
            Double seen = occurrencesPerWord.get(word);
            occurrencesPerWord.put(word, seen == null ? 1.0 : seen + 1.0);
        }
        return occurrencesPerWord;
    }
}
\ No newline at end of file
package com.zhiweidata.titleAggregation.main;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
/**
 * Cosine similarity of two texts based on the frequency of their Chinese
 * characters.
 *
 * @author xuyimeng
 * @date 2017-12-26 10:01:07
 */
public class CosineSimilarity {

    /**
     * Computes the cosine similarity of the character-frequency vectors of two
     * texts. Only characters accepted by {@link #isHanZi(char)} that also have
     * a GB2312 code are counted.
     *
     * @param doc1 first text
     * @param doc2 second text
     * @return the similarity; {@code 0.0} when either text contains no counted
     *         character
     * @throws NullPointerException when either text is {@code null} or blank
     */
    public double CalculateTextSim(String doc1, String doc2) {
        if (doc1 == null || doc1.trim().length() == 0 || doc2 == null || doc2.trim().length() == 0) {
            throw new NullPointerException("the Document is null or have not cahrs!!");
        }
        // GB2312 id -> {occurrences in doc1, occurrences in doc2}
        Map<Integer, int[]> frequencies = new HashMap<Integer, int[]>();
        countHanZi(doc1, 0, frequencies);
        countHanZi(doc2, 1, frequencies);

        double dotProduct = 0;
        double squares1 = 0;
        double squares2 = 0;
        for (int[] pair : frequencies.values()) {
            dotProduct += pair[0] * pair[1];
            squares1 += pair[0] * pair[0];
            squares2 += pair[1] * pair[1];
        }
        double norm = Math.sqrt(squares1 * squares2);
        // A text without counted characters has a zero vector; dividing would give NaN.
        if (norm == 0) {
            return 0.0;
        }
        return dotProduct / norm;
    }

    /** Adds the counted characters of {@code doc} to slot {@code slot} of the frequency table. */
    private void countHanZi(String doc, int slot, Map<Integer, int[]> frequencies) {
        for (int i = 0; i < doc.length(); i++) {
            char ch = doc.charAt(i);
            if (!isHanZi(ch)) {
                continue;
            }
            int charIndex = getGB2312Id(ch);
            if (charIndex == -1) {
                continue;
            }
            int[] pair = frequencies.get(charIndex);
            if (pair == null) {
                pair = new int[2];
                frequencies.put(charIndex, pair);
            }
            pair[slot]++;
        }
    }

    /**
     * Tells whether a character lies in the range U+4E00..U+9FA5.
     *
     * @param ch character to test
     * @return {@code true} when the character is inside that range
     */
    public boolean isHanZi(char ch) {
        return ch >= 0x4E00 && ch <= 0x9FA5;
    }

    /**
     * Maps a character to an index derived from its two-byte GB2312 encoding:
     * {@code (byte0 - 0xA1) * 94 + (byte1 - 0xA1)}.
     *
     * @param ch character to look up
     * @return the index, or {@code -1} when the character does not encode to
     *         exactly two bytes or the GB2312 charset is unavailable
     */
    public static short getGB2312Id(char ch) {
        try {
            byte[] buffer = Character.toString(ch).getBytes("GB2312");
            if (buffer.length != 2) {
                // Anything but two bytes means the character is not a GB2312 double-byte character.
                return -1;
            }
            int b0 = (buffer[0] & 0x0FF) - 161; // codes start at 0xA1 = 161
            int b1 = (buffer[1] & 0x0FF) - 161; // each row is indexed with a stride of 94
            return (short) (b0 * 94 + b1);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return -1;
    }
}
\ No newline at end of file
package com.zhiweidata.titleAggregation.main;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.zhiweidata.titleAggregation.bean.Cluster;
import com.zhiweidata.titleAggregation.bean.DataPoint;
import com.zhiweidata.titleAggregation.util.ChineseTranslate;
import com.zhiweidata.titleAggregation.util.Util;
import com.zhiweidata.titleAggregation.util.ChineseTranslate.goal;
/**
 * Entry point of the title-aggregation algorithm: groups titles by simhash
 * distance and refines the groups by cosine similarity.
 *
 * @author xuyimeng
 * @date 2017-12-26 09:47:58
 */
public class HCluster {

    /**
     * Clusters the given titles.
     * <p>
     * Titles are converted to simplified Chinese and split by length (shorter
     * than 10, shorter than 30, the rest). Each group is clustered separately,
     * with the simhash threshold raised by 2 for every longer group. The
     * combined clusters are then adjusted with
     * {@link ClusterUtil#alertLikeData(List, double)}.
     *
     * @param texts   titles keyed by an integer id
     * @param freq    simhash distance threshold; values {@code <= 0} fall back to 9
     * @param cosFreq cosine similarity threshold; values {@code <= 0} fall back to 0.93
     * @return cluster index -> (title id -> converted title) for every non-empty cluster
     */
    public Map<Integer, Map<Integer, String>> changeData(Map<Integer, String> texts, int freq, double cosFreq) {
        if (freq <= 0) {
            freq = 9;
        }
        if (cosFreq <= 0.0) {
            cosFreq = 0.93;
        }
        // Traditional -> simplified conversion.
        ChineseTranslate simplifiedTrans = ChineseTranslate.getInstance(goal.简体);

        // Group titles by length.
        Map<Integer, String> shortText = new HashMap<>();
        Map<Integer, String> middleText = new HashMap<>();
        Map<Integer, String> longText = new HashMap<>();
        for (Map.Entry<Integer, String> entry : texts.entrySet()) {
            String title = simplifiedTrans.trans(entry.getValue());
            if (title.length() < 10) {
                shortText.put(entry.getKey(), title);
            } else if (title.length() < 30) {
                middleText.put(entry.getKey(), title);
            } else {
                longText.put(entry.getKey(), title);
            }
        }

        // Longer titles are clustered with a larger distance threshold.
        List<Cluster> clusters = new ArrayList<>();
        clusters.addAll(startCluster(shortText, freq));
        freq += 2;
        clusters.addAll(startCluster(middleText, freq));
        freq += 2;
        clusters.addAll(startCluster(longText, freq));

        new ClusterUtil().alertLikeData(clusters, cosFreq);

        Map<Integer, Map<Integer, String>> result = new HashMap<>();
        for (Cluster cluster : clusters) {
            List<DataPoint> members = cluster.getDataPoints();
            if (members.size() >= 1) {
                Map<Integer, String> titlesById = new HashMap<>();
                for (DataPoint member : members) {
                    titlesById.put(member.getI(), member.getDataPointName());
                }
                result.put(cluster.getI(), titlesById);
            }
        }
        return result;
    }

    /**
     * Clusters one group of titles.
     * <p>
     * Every title starts as its own cluster. In each pass, every non-empty
     * cluster {@code i} absorbs the non-empty cluster {@code j > i} with the
     * smallest simhash distance below {@code freq}. Distances are those between
     * the initial titles at positions {@code i} and {@code j}. Passes repeat
     * until one completes without a merge; further passes could not change
     * anything because nothing was modified.
     *
     * @param titles titles keyed by an integer id
     * @param freq   distance threshold; only distances strictly below it merge
     * @return all clusters, including the ones emptied by merging
     */
    private List<Cluster> startCluster(Map<Integer, String> titles, int freq) {
        ClusterUtil util = new ClusterUtil();
        List<Cluster> finalClusters = initialCluster(readData(titles));
        int[][] distances = getDistance(Util.toList(titles));

        boolean mergedInPass = true;
        while (mergedInPass) {
            mergedInPass = false;
            for (int i = 0; i < finalClusters.size() - 1; i++) {
                if (finalClusters.get(i).getDataPoints().size() == 0) {
                    continue;
                }
                int min = freq;
                int nearest = -1;
                for (int j = i + 1; j < finalClusters.size(); j++) {
                    if (finalClusters.get(j).getDataPoints().size() == 0) {
                        continue;
                    }
                    if (distances[i][j] < min) {
                        min = distances[i][j];
                        nearest = j;
                    }
                }
                if (nearest >= 0) {
                    finalClusters = util.mergeCluster(finalClusters, i, nearest);
                    mergedInPass = true;
                }
            }
        }
        return finalClusters;
    }

    /**
     * Computes the pairwise simhash Hamming distance of the given titles.
     * Only the cells with {@code i < j} are filled; all other cells stay 0.
     *
     * @param titles titles to compare
     * @return a {@code titles.size() x titles.size()} matrix of distances
     */
    public int[][] getDistance(List<String> titles) {
        List<MySimHash> hashes = hashAll(titles);
        int size = titles.size();
        int[][] distance = new int[size][size];
        for (int i = 0; i < size - 1; i++) {
            for (int j = i + 1; j < size; j++) {
                distance[i][j] = hashes.get(i).hammingDistance(hashes.get(j));
            }
        }
        return distance;
    }

    /**
     * Computes the simhash Hamming distance between one text and every title.
     *
     * @param titles titles to compare against
     * @param text   reference text
     * @return distances, index-aligned with {@code titles}
     */
    public int[] getDistance(List<String> titles, String text) {
        List<MySimHash> hashes = hashAll(titles);
        MySimHash reference = new MySimHash();
        reference.setTokens(text);
        int[] distance = new int[titles.size()];
        for (int i = 0; i < titles.size(); i++) {
            distance[i] = reference.hammingDistance(hashes.get(i));
        }
        return distance;
    }

    /** Builds one simhash per title, in list order. */
    private List<MySimHash> hashAll(List<String> titles) {
        List<MySimHash> hashes = new ArrayList<>();
        for (String title : titles) {
            MySimHash hash = new MySimHash();
            hash.setTokens(title);
            hashes.add(hash);
        }
        return hashes;
    }

    /**
     * Wraps every data point into its own cluster, named and indexed after it.
     *
     * @param dpoints data points to wrap
     * @return one single-member cluster per data point, in list order
     */
    private List<Cluster> initialCluster(List<DataPoint> dpoints) {
        List<Cluster> originalClusters = new ArrayList<>();
        for (DataPoint point : dpoints) {
            List<DataPoint> members = new ArrayList<>();
            members.add(point);
            Cluster cluster = new Cluster();
            cluster.setClusterName(point.getDataPointName());
            cluster.setDataPoints(members);
            cluster.setI(point.getI());
            originalClusters.add(cluster);
        }
        return originalClusters;
    }

    /**
     * Converts the titles into data points via {@link ComputeWordsVector#computeTFMultiIDF(Map)}.
     *
     * @param titles titles keyed by an integer id
     * @return one data point per title
     */
    private List<DataPoint> readData(Map<Integer, String> titles) {
        return new ComputeWordsVector().computeTFMultiIDF(titles);
    }
}
\ No newline at end of file
package com.zhiweidata.titleAggregation.main;
import java.math.BigInteger;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.ansj.domain.Term;
import com.zhiweidata.titleAggregation.util.AnsjSeg;
/**
 * Simhash fingerprint of a text, built from the hashes of its segmented terms.
 *
 * @author xuyimeng
 * @date 2017-12-26 09:16:58
 */
public class MySimHash {

    /** Text the fingerprint is computed from. */
    private String tokens;

    /** Fingerprint of {@link #tokens}. */
    private BigInteger strSimHash;

    /** Number of bits of the fingerprint. */
    private int hashbits = 64;

    /**
     * Stores the text and immediately computes its fingerprint.
     *
     * @param tokens text to fingerprint
     */
    public void setTokens(String tokens) {
        this.tokens = tokens;
        this.strSimHash = simHash();
    }

    /**
     * Computes the fingerprint of the stored text: every term votes +weight or
     * -weight on each bit depending on whether its hash has the bit set, and a
     * fingerprint bit is 1 when its vote total is not negative.
     *
     * @return the fingerprint
     */
    private BigInteger simHash() {
        int[] votes = new int[this.hashbits];
        AnsjSeg segmenter = AnsjSeg.getInstance();

        // Weight per term nature; natures not listed use weight 1 as well.
        Map<String, Integer> weightOfNature = new HashMap<String, Integer>();
        weightOfNature.put("n", 1);
        weightOfNature.put("m", 1);

        List<Term> terms = segmenter.getTerms(tokens);
        for (Term term : terms) {
            String word = term.getName();
            String nature = term.getNatureStr();
            BigInteger termHash = hash(word);
            int weight = weightOfNature.containsKey(nature) ? weightOfNature.get(nature) : 1;
            for (int bit = 0; bit < this.hashbits; bit++) {
                if (termHash.testBit(bit)) {
                    votes[bit] += weight;
                } else {
                    votes[bit] -= weight;
                }
            }
        }

        BigInteger fingerprint = BigInteger.ZERO;
        for (int bit = 0; bit < this.hashbits; bit++) {
            if (votes[bit] >= 0) {
                fingerprint = fingerprint.setBit(bit);
            }
        }
        return fingerprint;
    }

    /**
     * Hashes a single term into a value of at most {@link #hashbits} bits.
     *
     * @param source term to hash
     * @return the hash; zero for a {@code null} or empty term
     */
    private BigInteger hash(String source) {
        if (source == null || source.length() == 0) {
            return BigInteger.ZERO;
        }
        // Terms shorter than 3 characters are padded with their first character.
        while (source.length() < 3) {
            source = source + source.charAt(0);
        }
        char[] chars = source.toCharArray();
        BigInteger multiplier = BigInteger.valueOf(1000003L);
        BigInteger mask = BigInteger.ONE.shiftLeft(this.hashbits).subtract(BigInteger.ONE);
        BigInteger x = BigInteger.valueOf(((long) chars[0]) << 7);
        for (char item : chars) {
            x = x.multiply(multiplier).xor(BigInteger.valueOf((long) item)).and(mask);
        }
        x = x.xor(BigInteger.valueOf(source.length()));
        if (x.equals(BigInteger.valueOf(-1L))) {
            x = BigInteger.valueOf(-2L);
        }
        return x;
    }

    /**
     * Counts the bits in which two fingerprints differ; a smaller value means
     * more similar texts.
     *
     * @param other fingerprint to compare with
     * @return number of differing bits
     */
    public int hammingDistance(MySimHash other) {
        BigInteger mask = BigInteger.ONE.shiftLeft(this.hashbits).subtract(BigInteger.ONE);
        BigInteger differing = this.strSimHash.xor(other.strSimHash).and(mask);
        // The masked value is non-negative, so bitCount() is the number of set bits.
        return differing.bitCount();
    }

    /**
     * Converts the Hamming distance into a similarity:
     * {@code 1 - distance / hashbits}.
     *
     * @param s2 fingerprint to compare with
     * @return the similarity
     */
    public double getSemblance(MySimHash s2) {
        double distance = (double) this.hammingDistance(s2);
        return 1 - distance / this.hashbits;
    }
}
package com.zhiweidata.titleAggregation.start;
/**
 * Usage notes for the title-aggregation tool; {@link #main(String[])} itself
 * does nothing.
 * <p>
 * The title-similarity algorithm computes similarity from simhash and cosine
 * measures, supplemented by a vector-space product, and uses hierarchical
 * clustering to choose the cluster centres.
 * <p>
 * In src/test/java/startTest:
 * <ul>
 * <li>MongoStart - runs against a mongo database and exports the result to Excel</li>
 * <li>ExcelTest - runs against Excel data and exports the result to Excel</li>
 * <li>ResultTest - simulated tests of the effect of each algorithm</li>
 * <li>ChineseTest - tests of traditional/simplified conversion</li>
 * </ul>
 * In com.zhiweidata.titleAggregation.main:
 * <ul>
 * <li>HCluster - implementation of the clustering algorithm</li>
 * <li>ClusterUtil - refinement of the algorithm result; iterate on results here</li>
 * <li>ComputeWordsVector - helper that computes text vectors</li>
 * <li>CosineSimilarity - helper for the cosine algorithm</li>
 * <li>MySimHash - helper for the simhash algorithm</li>
 * </ul>
 * How to use: see the examples in startTest.
 */
public class Start {

    /**
     * Intentionally empty; the class only carries the usage notes above.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Nothing to run.
    }
}
package com.zhiweidata.titleAggregation.util;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.ansj.domain.Term;
import org.ansj.recognition.impl.FilterRecognition;
import org.ansj.splitWord.analysis.IndexAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;
/**
 * Singleton wrapper around the ansj word segmenter with a fixed filter of
 * stop natures.
 *
 * @author xuyimeng
 * @date 2017-12-26 09:15:07
 */
public class AnsjSeg {

    private static final AnsjSeg ansj = new AnsjSeg();

    /** Filter applied to every segmentation result. */
    private FilterRecognition fitler = new FilterRecognition();

    /** Registers the stop natures "w", "null" and "o" on the filter. */
    private AnsjSeg() {
        String[] stopNatures = { "w", "null", "o" };
        for (String nature : stopNatures) {
            fitler.insertStopNatures(nature);
        }
    }

    /** @return the shared instance */
    public static AnsjSeg getInstance() {
        return ansj;
    }

    /**
     * Segments the text with {@code ToAnalysis} and adds 1.0 to the count of
     * every resulting term in {@code map}.
     *
     * @param map  term -> count, updated in place
     * @param text text to segment
     */
    public void getString(Map<String, Double> map, String text) {
        for (Term term : ToAnalysis.parse(text).recognition(fitler)) {
            String word = term.getName();
            if (map.containsKey(word)) {
                map.put(word, map.get(word) + 1.0);
            } else {
                map.put(word, 1.0);
            }
        }
    }

    /**
     * Segments the text with {@code IndexAnalysis} and returns the term names.
     *
     * @param text text to segment
     * @return names of the filtered terms, in result order
     */
    public List<String> getString(String text) {
        List<String> names = new ArrayList<>();
        for (Term term : IndexAnalysis.parse(text).recognition(fitler)) {
            names.add(term.getName());
        }
        return names;
    }

    /**
     * Segments the text with {@code IndexAnalysis} and returns the filtered terms.
     *
     * @param text text to segment
     * @return the filtered terms
     */
    public List<Term> getTerms(String text) {
        return IndexAnalysis.parse(text).recognition(fitler).getTerms();
    }
}
package com.zhiweidata.titleAggregation.util;
import java.util.ResourceBundle;
/**
 * Converter between simplified and traditional Chinese.
 * <p>
 * The character table is based on https://code.google.com/archive/p/java-zhconverter/,
 * which according to that project originates from MediaWiki. The conversion
 * does no word segmentation: a text longer than one character that has an
 * exact entry in the table is replaced by that entry; otherwise the text is
 * converted character by character.
 *
 * @author xuyimeng
 * @date 2017-12-26 14:29:37
 */
public class ChineseTranslate {

    /** Target script of a conversion: 繁体 = traditional, 简体 = simplified. */
    public enum goal {
        繁体, 简体
    }

    private final static ChineseTranslate simplifiedTrans = new ChineseTranslate();
    private final static ChineseTranslate TraditionalTrans = new ChineseTranslate();

    /** Lookup table of the converter; assigned by {@link #getInstance(goal)}. */
    private ResourceBundle table = null;

    /**
     * Returns the shared converter for the requested target script, with its
     * lookup table (re)loaded from the matching resource bundle.
     *
     * @param 简繁 target script
     * @return the converter producing that script
     */
    public static ChineseTranslate getInstance(goal 简繁) {
        if (简繁.equals(goal.繁体)) {
            TraditionalTrans.table = ResourceBundle.getBundle("简到繁单字");
            return TraditionalTrans;
        } else {
            simplifiedTrans.table = ResourceBundle.getBundle("繁到简单字");
            return simplifiedTrans;
        }
    }

    private ChineseTranslate() { }

    /**
     * Converts a text without the caller creating a converter; delegates to
     * {@link #trans(String)}.
     *
     * @param text text of any length
     * @param 简繁 target script
     * @return the text converted to the target script
     * @throws IllegalArgumentException when {@code text} is {@code null}
     */
    public static String trans(String text, goal 简繁) {
        return getInstance(简繁).trans(text);
    }

    /**
     * Converts a text without word segmentation. A text longer than one
     * character with an exact table entry is replaced by that entry; otherwise
     * each character is replaced on its own.
     *
     * @param trans text of any length
     * @return the converted text
     * @throws IllegalArgumentException when {@code trans} is {@code null}
     */
    public String trans(String trans) {
        if (trans == null) {
            throw new IllegalArgumentException("字符串为null");
        }
        if (trans.length() > 1 && table.containsKey(trans)) {
            return table.getString(trans);
        }
        StringBuilder text = new StringBuilder();
        for (char c : trans.toCharArray()) {
            String single = String.valueOf(c);
            String mapped = table.containsKey(single) ? table.getString(single) : "";
            if (mapped.isEmpty()) {
                // No entry, or an entry with an empty value: keep the original character.
                text.append(c);
            } else {
                // Several candidates may be listed; the first one is used.
                text.append(mapped.charAt(0));
            }
        }
        return text.toString();
    }
}
package com.zhiweidata.titleAggregation.util;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
/**
 * Small collection helpers.
 *
 * @author xuyimeng
 * @date 2017-12-26 09:15:29
 */
public class Util {

    /**
     * Copies the values of a map into a new list, in the iteration order of
     * the map.
     *
     * @param map map whose values are copied
     * @return a new list holding the values
     */
    public static List<String> toList(Map<Integer, String> map) {
        return new ArrayList<>(map.values());
    }

    /**
     * Removes every {@code null} element from the list, in place.
     *
     * @param list list to clean
     */
    public static void removeEmpty(List<?> list) {
        Object nothing = null;
        list.removeAll(Collections.singleton(nothing));
    }
}
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
package StartTest;
import org.junit.Test;
import com.zhiweidata.titleAggregation.util.ChineseTranslate;
import com.zhiweidata.titleAggregation.util.ChineseTranslate.goal;
/**
 * Exercises the traditional/simplified conversion of {@link ChineseTranslate}.
 * <p>
 * NOTE(review): the comparisons only print "相等" / "不相等"; a mismatch does not
 * fail a test.
 *
 * @author xuyimeng
 * @date 2017-12-26 14:29:06
 */
public class ChineseTest {

    /** Converts a list of simplified/traditional pairs in both directions. */
    @Test
    public void test() {
        String[][] pairs = {
            { "简单", "簡單" },
            { "曹操", "曹操" },
            { "赵云", "趙雲" },
            { "岳飞", "岳飛" },
            // TODO: issue #1. The traditional form of this character is unknown.
            { "暰", "暰" },
            // issue #4: "机械" is not converted as a phrase, although "机" alone is.
            { "机", "機" },
            { "机械", "機械" },
            // issue #5
            { "一哄而散", "一鬨而散" },
            // https://github.com/NLPchina/nlp-lang/issues/23
            { "土著", "土著" },
            { "乾坤", "乾坤" },
            // https://github.com/NLPchina/nlp-lang/issues/24
            { "尼日利亚", "尼日利亞" },
            { "巴基斯坦", "巴基斯坦" },
            { "厄瓜多尔", "厄瓜多爾" },
            { "有背光的机械式键盘", "有背光的機械式鍵盤" },
        };
        for (String[] pair : pairs) {
            isChineseTrans(pair[0], pair[1]);
        }
    }

    /** Prints a traditional sentence before and after conversion to simplified. */
    @Test
    public void test2() {
        String text = "《二十二》截圖被做成表情包 QQ空間發布道歉聲明";
        ChineseTranslate toSimplified = ChineseTranslate.getInstance(goal.简体);
        System.out.println("翻译前的结果:" + text);
        System.out.println("翻译后的结果:" + toSimplified.trans(text));
    }

    /** Basic conversion in both directions, including already-converted input. */
    @Test
    public void basicTrans() {
        final ChineseTranslate toTraditional = ChineseTranslate.getInstance(goal.繁体);
        final ChineseTranslate toSimplified = ChineseTranslate.getInstance(goal.简体);
        isEquals("簡單", toTraditional.trans("简单"));
        isEquals("简单", toSimplified.trans("簡單"));
        // Text already in the target script is expected to stay unchanged.
        isEquals("簡單", toTraditional.trans("簡單"));
        isEquals("简单", toSimplified.trans("简单"));
    }

    /** Boundary cases: empty text and a single ASCII character. */
    @Test
    public void borderTest() {
        isChineseTrans("", "");
        isChineseTrans("a", "a");
    }

    /** A null text must be rejected when converting to traditional. */
    @Test(expected = IllegalArgumentException.class)
    public void exception_Traditional() {
        ChineseTranslate.trans(null, goal.繁体);
    }

    /** A null text must be rejected when converting to simplified. */
    @Test(expected = IllegalArgumentException.class)
    public void exception_simplified() {
        ChineseTranslate.trans(null, goal.简体);
    }

    /** Checks a text that is identical in both scripts. */
    private void isChineseTrans(String text) {
        isChineseTrans(text, text);
    }

    /** Converts the pair in both directions and compares with the counterpart. */
    private void isChineseTrans(String simplifiedText, String traditionalText) {
        String asTraditional = ChineseTranslate.trans(simplifiedText, goal.繁体);
        isEquals(traditionalText, asTraditional);
        String asSimplified = ChineseTranslate.trans(traditionalText, goal.简体);
        isEquals(simplifiedText, asSimplified);
    }

    /**
     * Prints whether two texts are equal.
     *
     * @param text1 first text
     * @param text2 second text
     */
    private void isEquals(String text1, String text2) {
        System.out.println(text1.equals(text2) ? "相等" : "不相等");
    }
}
/**
* @Title: ExcelTest.java
* @Package StartTest
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2017年12月25日 下午5:14:49
* @version V1.0
*/
package StartTest;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiweidata.titleAggregation.main.HCluster;
import com.zhiweidata.titleAggregation.util.ChineseTranslate;
import com.zhiweidata.titleAggregation.util.ChineseTranslate.goal;
import excel.DBOExp;
import excel.SimpeExcelReport;
/**
* @ClassName: ExcelTest
* @Description: TODO(测试从Excel读取数据,进行标题聚合测试)
* @author xuyimeng
* @date 2017年12月25日 下午5:14:49
*/
public class ExcelTest {
@Test
public void test() {
    // Reads one sheet of a local workbook and writes the result next to it.
    final String sheetName = "二十二";
    final String sourcePath = "D:\\标题聚类测试数据.xls";
    final String goalPath = "D:\\excel\\" + sheetName + ".xls";
    genergeExcel(sourcePath, sheetName, goalPath);
}
/**
 * Reads an Excel file, clusters its titles and writes three sheets to the
 * target file: "未聚合", "聚合" and "全部".
 *
 * @param path      path of the Excel file to read
 * @param sheetName name of the sheet to read
 * @param goalPath  path of the Excel file to write
 */
public void genergeExcel(String path,String sheetName,String goalPath) {
    Map<String,Object> workbook = new SimpeExcelReport().readExcel(new File(path), sheetName);
    @SuppressWarnings("unchecked")
    List<Map<String, Object>> body = (List<Map<String, Object>>) workbook.get("body");
    Map<Integer,String> titles = exportTitleData(body);

    // Run the clustering with simhash threshold 9 and cosine threshold 0.93.
    Map<Integer,Map<Integer,String>> mapResult = new HCluster().changeData(titles, 9, 0.93);

    DBOExp exporter = new DBOExp();
    List<DBObject> ungrouped = noGroupSheet(mapResult, body);
    exporter.putRun(ungrouped, goalPath, "未聚合");
    List<DBObject> grouped = GroupSheet(mapResult, body);
    exporter.putRun(grouped, goalPath, "聚合");
    exporter.putRun(AllSheet(body), goalPath, "全部");
}
/**
 * Collects the value of the "标题" (title) column of every row.
 * <p>
 * Each title is keyed by the index of its row in {@code body}. The sheet builders of
 * this class look rows up with {@code body.get(key)} using the keys of the clustering
 * result (presumably the keys handed in here), so key and row index must agree. Rows
 * that are null, have no "标题" entry or hold a null title are skipped; before, a null
 * title raised a NullPointerException and a row without the column shifted the keys
 * of all following rows.
 *
 * @param body rows read from the sheet, one map (column name to value) per row
 * @return row index to title text; empty when {@code body} is null or has no titles
 */
public Map<Integer,String> exportTitleData(List<Map<String, Object>> body){
    Map<Integer,String> titles = new HashMap<>();
    if (body == null) {
        return titles;
    }
    for (int row = 0; row < body.size(); row++) {
        Map<String, Object> cells = body.get(row);
        Object title = (cells == null) ? null : cells.get("标题");
        if (title != null) {
            titles.put(row, title.toString());
        }
    }
    return titles;
}
/**
 * Builds the rows of the "not clustered" sheet: every cluster that holds at most one
 * title contributes a copy of its source row.
 *
 * @param map  clustering result: cluster key to (row index to title)
 * @param body rows read from the source sheet
 * @return one object per non-clustered row; columns with a null or empty name are dropped
 */
public static List<DBObject> noGroupSheet(Map<Integer, Map<Integer, String>> map,
        List<Map<String, Object>> body)
{
    List<DBObject> rows = new ArrayList<>();
    for (Map<Integer, String> members : map.values()) {
        if (members.size() <= 1) {
            for (Integer rowIndex : members.keySet()) {
                DBObject row = new BasicDBObject();
                for (Map.Entry<String, Object> cell : body.get(rowIndex).entrySet()) {
                    String column = cell.getKey();
                    if (column != null && !column.isEmpty()) {
                        row.put(column, cell.getValue());
                    }
                }
                rows.add(row);
            }
        }
    }
    return rows;
}
/**
 * Builds the rows of the "clustered" sheet: every cluster with more than one title
 * contributes a copy of each member row, prefixed with a "聚合标签" column holding the
 * cluster label.
 * <p>
 * The label is the "标题" value of the row addressed by the cluster key, converted to
 * simplified Chinese. It is computed once per cluster rather than once per member,
 * and a missing label title yields an empty label instead of a NullPointerException.
 *
 * @param map  clustering result: cluster key to (row index to title)
 * @param body rows read from the source sheet
 * @return one object per clustered row; columns with a null or empty name are dropped
 */
public static List<DBObject> GroupSheet(Map<Integer, Map<Integer, String>> map,
        List<Map<String, Object>> body)
{
    // Converter that turns the cluster label into simplified Chinese.
    ChineseTranslate simplifiedTrans = ChineseTranslate.getInstance(goal.简体);
    List<DBObject> listDB = new ArrayList<>();
    for (Map.Entry<Integer, Map<Integer, String>> cluster : map.entrySet()) {
        Map<Integer, String> titles = cluster.getValue();
        if (titles == null || titles.size() <= 1) {
            continue;
        }
        // The label depends only on the cluster, so resolve it before the member loop.
        Object rawTitle = body.get(cluster.getKey()).get("标题");
        String label = (rawTitle == null) ? "" : simplifiedTrans.trans(rawTitle.toString());
        for (Integer key : titles.keySet()) {
            DBObject obj = new BasicDBObject();
            // Inserted first so the label column precedes the copied columns.
            obj.put("聚合标签", label);
            for (Map.Entry<String, Object> cell : body.get(key).entrySet()) {
                String column = cell.getKey();
                if (column == null || "".equals(column)) {
                    continue;
                }
                obj.put(column, cell.getValue());
            }
            listDB.add(obj);
        }
    }
    return listDB;
}
/**
 * Builds the rows of the "all data" sheet: a copy of every source row.
 *
 * @param body rows read from the source sheet
 * @return one object per row; columns with a null or empty name are dropped
 */
public static List<DBObject> AllSheet(List<Map<String, Object>> body)
{
    List<DBObject> rows = new ArrayList<>();
    for (Map<String, Object> bean : body) {
        DBObject row = new BasicDBObject();
        for (Map.Entry<String, Object> cell : bean.entrySet()) {
            String column = cell.getKey();
            if (column != null && !column.isEmpty()) {
                row.put(column, cell.getValue());
            }
        }
        rows.add(row);
    }
    return rows;
}
}
package StartTest;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiweidata.titleAggregation.main.HCluster;
import com.zhiweidata.titleAggregation.util.ChineseTranslate;
import com.zhiweidata.titleAggregation.util.ChineseTranslate.goal;
import bean.MediaAndWechatEvent;
import excel.DBOExp;
import util.DBUtil;
/**
* @ClassName: MongoStart
* @Description: TODO(用mongodb中的数据进行标题聚合算法的测试)
* @author xuyimeng
* @date 2017年12月25日 下午4:06:56
*/
public class MongoStart {
/**
 * Runs the clustering export for one event/platform pair stored in MongoDB.
 */
@Test
public void test() {
    final String eventId = "6baa33528841e05b10001567";
    final String pt = "微信";
    final String name = "华北地区“煤改气”风波";
    final String path = "D:\\excel\\" + name + "——" + pt + ".xls";
    MediaData(pt, eventId, path, name);

    // Further data sets that were run with the same call and then disabled
    // (their output path ended in pt + "标题.xls"):
    //   eventId 6baa33528841e05b10001567, pt 网媒,        name 华北地区“煤改气”风波
    //   eventId 588e7da34bb6485a10001531, pt 微信 / 网媒, name 第四届世界互联网大会在乌镇举行
    //   eventId 422e11bee732b76e10000016, pt 微信 / 网媒, name 2017全国两会
    //     (more than 90,000 records; ran out of memory)
}
/**
 * Loads the records of one event and platform from MongoDB, clusters their titles
 * and writes the result to an Excel workbook with the sheets "未聚合", "聚合" and "全部".
 *
 * @param pt      platform to query
 * @param eventId id of the event to query
 * @param path    path of the workbook to write
 * @param name    event name, used only in the console summary
 */
public static void MediaData(String pt,String eventId,String path,String name) {
    List<MediaAndWechatEvent> listEvent = new DBUtil().getListData(pt, eventId);

    // Key every title by its position in the list; dots in a title are replaced by dashes.
    Map<Integer,String> texts = new HashMap<>();
    int position = 0;
    for (MediaAndWechatEvent event : listEvent) {
        texts.put(position++, event.getTitle().replace(".", "-"));
    }
    System.out.println("start");

    // Run the clustering algorithm and measure how long it takes.
    HCluster hc = new HCluster();
    long startedAt = System.currentTimeMillis();
    Map<Integer,Map<Integer,String>> mapResult = hc.changeData(texts,9, 0.93);
    long elapsed = System.currentTimeMillis() - startedAt;
    System.out.println("事件名:"+name+"——"+pt+"数据,数据量:"+texts.size()+" ,输出时间:"+elapsed);

    DBOExp exporter = new DBOExp();
    exporter.putRun(noGroupSheet(mapResult,listEvent),path,"未聚合");
    exporter.putRun(GroupSheet(mapResult,listEvent),path,"聚合");
    exporter.putRun(AllSheet(listEvent), path, "全部");
}
/**
 * Builds the rows of the "not clustered" sheet: every cluster that holds at most one
 * title contributes the record it refers to.
 *
 * @param map       clustering result: cluster key to (list index to title)
 * @param listEvent records the clustering was run on
 * @return one object per non-clustered record
 */
public static List<DBObject> noGroupSheet(Map<Integer, Map<Integer, String>> map,
        List<MediaAndWechatEvent> listEvent)
{
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    List<DBObject> listDB = new ArrayList<>();
    for (Map<Integer, String> titles : map.values()) {
        if (titles == null || titles.size() > 1) {
            continue;
        }
        for (Integer key : titles.keySet()) {
            DBObject obj = new BasicDBObject();
            fillEventColumns(obj, listEvent.get(key), sdf);
            listDB.add(obj);
        }
    }
    return listDB;
}

/**
 * Builds the rows of the "clustered" sheet: every cluster with more than one title
 * contributes each member record, prefixed with a "聚合标题" column that holds the title
 * of the record addressed by the cluster key, converted to simplified Chinese.
 * <p>
 * The cluster title is computed once per cluster; a null title yields an empty label.
 *
 * @param map       clustering result: cluster key to (list index to title)
 * @param listEvent records the clustering was run on
 * @return one object per clustered record
 */
public static List<DBObject> GroupSheet(Map<Integer, Map<Integer, String>> map,
        List<MediaAndWechatEvent> listEvent)
{
    // Converter that turns the cluster title into simplified Chinese.
    ChineseTranslate simplifiedTrans = ChineseTranslate.getInstance(goal.简体);
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    List<DBObject> listDB = new ArrayList<>();
    for (Map.Entry<Integer, Map<Integer, String>> cluster : map.entrySet()) {
        Map<Integer, String> titles = cluster.getValue();
        if (titles == null || titles.size() <= 1) {
            continue;
        }
        String clusterTitle = listEvent.get(cluster.getKey()).getTitle();
        String titleGroup = (clusterTitle == null) ? "" : simplifiedTrans.trans(clusterTitle);
        for (Integer key : titles.keySet()) {
            DBObject obj = new BasicDBObject();
            // Inserted first so the cluster title precedes the record columns.
            obj.put("聚合标题", titleGroup);
            fillEventColumns(obj, listEvent.get(key), sdf);
            listDB.add(obj);
        }
    }
    return listDB;
}

/**
 * Builds the rows of the "all data" sheet: one object per record, in list order.
 *
 * @param listEvent records to export
 * @return one object per record
 */
public static List<DBObject> AllSheet(List<MediaAndWechatEvent> listEvent)
{
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    List<DBObject> listDB = new ArrayList<>();
    for (MediaAndWechatEvent event : listEvent) {
        DBObject obj = new BasicDBObject();
        fillEventColumns(obj, event, sdf);
        listDB.add(obj);
    }
    return listDB;
}

/**
 * Writes the export columns of one record into {@code obj}, in the column order shared
 * by all three sheets. A missing publish time or save time is exported as an empty
 * string instead of raising a NullPointerException.
 */
private static void fillEventColumns(DBObject obj, MediaAndWechatEvent event, SimpleDateFormat sdf)
{
    Date published = event.getTime();
    Long saved = event.getSavetime();
    obj.put("标题", event.getTitle());
    obj.put("url", event.getUrl());
    obj.put("发布时间", published == null ? "" : sdf.format(published));
    obj.put("来源", event.getSource());
    obj.put("类型", event.getType());
    obj.put("保存时间", saved == null ? "" : sdf.format(new Date(saved)));
    obj.put("平台", event.getPt());
    obj.put("事件id", event.getEventId());
    obj.put("H因子", event.getH());
}
}
package StartTest;
import java.util.ArrayList;
import java.util.List;
import org.ansj.domain.Term;
import org.junit.Test;
import org.nlpcn.commons.lang.util.AnsjArrays;
import com.zhiweidata.titleAggregation.main.ClusterUtil;
import com.zhiweidata.titleAggregation.main.CosineSimilarity;
import com.zhiweidata.titleAggregation.main.MySimHash;
import com.zhiweidata.titleAggregation.util.AnsjSeg;
/**
 * Manual experiments with the individual building blocks of the title clustering.
 * Every test only prints its result; nothing is asserted.
 */
public class ResultTest {
    /** Prints the nature string and the name of every term produced for a sample title. */
    @Test
    public void test4() {
        AnsjSeg ansj = AnsjSeg.getInstance();
        String text = "愤怒!慰安妇纪录片《二十二》被截图制作表情包";
        for (Term term : ansj.getTerms(text))
        {
            System.out.println(term.getNatureStr());
            System.out.println(term.getName());
        }
    }

    /** Prints the first distance value of a title measured against a list that contains only itself. */
    @Test
    public void test3() {
        String s1 = "大学生娶同学妈妈?传了几年的假新闻";
        List<String> s2 = new ArrayList<>();
        s2.add(s1);
        ClusterUtil clusterUtil = new ClusterUtil();
        System.out.println(clusterUtil.getDistance(s2, s1)[0]);
    }

    /** Prints the Hamming distance between the simhashes of two titles. */
    @Test
    public void test() {
        String s1 = "河南警方辟谣“大学生娶同学妈妈”事件:系假新闻_新闻_腾讯网";
        String s2 = "河南警方辟谣“大学生娶同学妈妈”事件:系假新闻_新浪新闻";
        MySimHash hash1 = new MySimHash();
        MySimHash hash2 = new MySimHash();
        hash1.setTokens(s1);
        hash2.setTokens(s2);
        System.out.println(hash1.hammingDistance(hash2));
    }

    /** Prints the cosine similarity of two titles. */
    @Test
    public void test1() {
        CosineSimilarity cosineSimilarity = new CosineSimilarity();
        String s1 = "河南警方辟谣“大学生娶同学妈妈”事件:系假新闻";
        String s2 = "辟谣 “河南大学生娶同学妈妈”事件";
        System.out.println(cosineSimilarity.CalculateTextSim(s1, s2));
    }

    /**
     * Prints the cosine similarity of one title against a list of other titles.
     * <p>
     * The candidate list used to be {@code null}, so the loop failed with a
     * NullPointerException before comparing anything; it now holds the sample titles
     * used by the other tests of this class.
     */
    @Test
    public void test2() {
        String doc1 = "21岁大学生娶55岁妇女 新娘是新郎同学妈妈";
        List<String> list = new ArrayList<>();
        list.add("大学生娶同学妈妈?传了几年的假新闻");
        list.add("河南警方辟谣“大学生娶同学妈妈”事件:系假新闻");
        list.add("辟谣 “河南大学生娶同学妈妈”事件");
        CosineSimilarity cosineSimilarity = new CosineSimilarity();
        for (String doc2 : list)
        {
            System.out.println("doc1: " + doc1);
            System.out.println("doc2: " + doc2);
            System.out.println(cosineSimilarity.CalculateTextSim(doc1, doc2));
        }
    }
}
package bean;
/**
* @Title: Status.java
* @Package com.zhiweidata.eventfulrun.pojo
* @Description:
* @author hero
* @date 2016年7月18日 下午3:50:20
* @version V1.0
*/
import java.io.Serializable;
import java.util.Date;
/**
 * Raw article record of an event (original description: 2015 events, web-media
 * database, used to store the raw event data).
 *
 * @author hero
 * @date 2016-07-18 15:50:20
 */
// The @Document(collection = "mediaEvent2016") mapping is disabled; the collection is selected dynamically.
public class MediaAndWechatEvent implements Serializable {

    private static final long serialVersionUID = -6799862774324060068L;

    /** Primary key. */
    private String _id;
    /** Article URL. */
    private String url;
    /** Title. */
    private String title;
    /** Publish time. */
    private Date time;
    /** Source. */
    private String source;
    /** Type. */
    private String type;
    /** Content. */
    private String content;
    /** Time the record was stored. */
    private Long savetime;
    /** Identification code. */
    private Long rsid;
    /** Platform. */
    private String pt;
    /** Event id. */
    private String eventId;
    /** H factor, obtained by the analysis. */
    private double H;
    /** Avatar image URL, used to pass values for important channels. */
    private String imgUrl;

    /** Creates an empty record; all reference fields are null and H is 0. */
    public MediaAndWechatEvent() {
    }

    /**
     * Creates a record and derives its primary key.
     * <p>
     * For the platform "微信" the key is eventId, title, source and the publish time in
     * milliseconds joined by "_", with every "." replaced by "-". For any other
     * platform it is the given {@code _id} followed by "_" and the event id. The given
     * {@code _id} is also stored as the URL, and the save time is set to the current time.
     */
    public MediaAndWechatEvent(String _id, String title, Date time,
            String source, String content, String type, String pt,
            String eventId) {
        final boolean wechat = "微信".equals(pt);
        if (wechat) {
            String composite = eventId + "_" + title + "_" + source + "_" + time.getTime();
            this._id = composite.replace(".", "-");
        } else {
            this._id = _id + "_" + eventId;
        }
        this.url = _id;
        this.title = title;
        this.time = time;
        this.source = source;
        this.content = content;
        this.type = type;
        this.savetime = System.currentTimeMillis();
        this.pt = pt;
        this.eventId = eventId;
    }

    public String get_id() {
        return this._id;
    }

    public void set_id(String _id) {
        this._id = _id;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public Date getTime() {
        return this.time;
    }

    public void setTime(Date time) {
        this.time = time;
    }

    public String getSource() {
        return this.source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public String getType() {
        return this.type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getContent() {
        return this.content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public Long getSavetime() {
        return this.savetime;
    }

    public void setSavetime(Long savetime) {
        this.savetime = savetime;
    }

    public Long getRsid() {
        return this.rsid;
    }

    public void setRsid(Long rsid) {
        this.rsid = rsid;
    }

    public String getPt() {
        return this.pt;
    }

    public void setPt(String pt) {
        this.pt = pt;
    }

    public String getEventId() {
        return this.eventId;
    }

    public void setEventId(String eventId) {
        this.eventId = eventId;
    }

    public double getH() {
        return this.H;
    }

    public void setH(double h) {
        this.H = h;
    }

    public String getImgUrl() {
        return this.imgUrl;
    }

    public void setImgUrl(String imgUrl) {
        this.imgUrl = imgUrl;
    }

    /** Lists the stored fields except H and imgUrl. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("new MediaAndWechatEvent[");
        text.append("_id =").append(_id);
        text.append(",url =").append(url);
        text.append(",title = ").append(title);
        text.append(",time = ").append(time);
        text.append(",source = ").append(source);
        text.append(",content = ").append(content);
        text.append(",type = ").append(type);
        text.append(",savetime = ").append(savetime);
        text.append(",rsid = ").append(rsid);
        text.append(",pt = ").append(pt);
        text.append(",eventId = ").append(eventId);
        text.append("]");
        return text.toString();
    }
}
package excel;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.mongodb.DBObject;
public class DBOExp
{
// private static SimpeExcelReport simpe = SimpeExcelReport.getInstance();
/**
 * Writes a list of DBObjects as one sheet of an Excel workbook. The workbook is
 * created when the file does not exist yet; otherwise the sheet is added to it.
 * The column names are taken from the first object of the list.
 * <p>
 * An empty or null list is skipped with a message on standard error, because there
 * is no first object to take the column names from (this used to fail with an
 * IndexOutOfBoundsException). The output stream is opened only when a new workbook is
 * created, and nothing is written when the file cannot be opened.
 *
 * @author 陈炜涛
 * @param listChai  objects to export, one per row
 * @param fliename  path of the workbook file
 * @param sheetName name of the sheet to write
 */
public void putRun(List<DBObject> listChai, String fliename,
        String sheetName)
{
    if (listChai == null || listChai.isEmpty())
    {
        System.err.println("DBOExp.putRun: no data for sheet '" + sheetName + "' in " + fliename + ", skipped");
        return;
    }
    SimpeExcelReport simpe = SimpeExcelReport.getInstance();
    File excelFile = new File(fliename);
    List<String> head = headList(listChai.get(0));
    List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>(bodyList(listChai));

    if (excelFile.exists())
    {
        // The add-sheet path reads and rewrites the file itself; no stream is needed.
        simpe.addSheetInExcelWithFile(head, dataList, excelFile, sheetName);
        return;
    }

    // New workbook: the stream is closed here even though the report closes it as
    // well; closing a FileOutputStream twice is harmless.
    try (OutputStream osOutputStream = new FileOutputStream(excelFile, true))
    {
        simpe.createExcelWithStream(head, dataList, osOutputStream, sheetName, excelFile);
    }
    catch (FileNotFoundException e1)
    {
        e1.printStackTrace();
    }
    catch (IOException e)
    {
        e.printStackTrace();
    }
}
/**
 * Returns the column names of the sheet: the keys of the given object, copied into a
 * new list in the key set's iteration order.
 *
 * @param dbo object whose keys become the column names
 * @return list of all column names
 */
public static List<String> headList(DBObject dbo)
{
    return new ArrayList<String>(dbo.keySet());
}
/**
 * Converts the objects into the row maps used for writing a sheet.
 * <p>
 * The keys of the first object define the columns; every object is read with exactly
 * those keys, so a value that an object lacks is stored as {@code null}. A null or
 * empty input yields an empty list instead of an IndexOutOfBoundsException.
 *
 * @param lists objects to convert, one per row
 * @return one map (column name to value) per object
 */
public List<Map<String, Object>> bodyList(List<DBObject> lists)
{
    List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();
    if (lists == null || lists.isEmpty())
    {
        return dataList;
    }
    List<String> keys = new ArrayList<String>(lists.get(0).keySet());
    for (DBObject dbo : lists)
    {
        Map<String, Object> beanMap = new HashMap<String, Object>();
        for (String key : keys)
        {
            beanMap.put(key, dbo.get(key));
        }
        dataList.add(beanMap);
    }
    return dataList;
}
}
package excel;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.jxlzw.report.model.HLink;
import jxl.Cell;
import jxl.Sheet;
import jxl.Workbook;
import jxl.format.Border;
import jxl.format.BorderLineStyle;
import jxl.format.Colour;
import jxl.format.UnderlineStyle;
import jxl.read.biff.BiffException;
import jxl.write.Label;
import jxl.write.WritableCellFormat;
import jxl.write.WritableFont;
import jxl.write.WritableHyperlink;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;
import jxl.write.biff.RowsExceededException;
/**
* 简单的 Excel报表
*
* @ClassName: SimpeExcelReport
* @Description: TODO(这里用一句话描述这个类的作用)
* @author Administrator
* @date 2015年11月20日 下午4:52:02
*/
public class SimpeExcelReport
{
// Shared SLF4J logger of this class.
private static Logger log = LoggerFactory.getLogger(SimpeExcelReport.class);
// Rows of the sheet being written: one map (column name -> value) per row.
private List<Map<String, Object>> bodyList;
// Column names of the sheet being written, in column order.
private List<String> headList;
// Format used for numeric cells, hyperlink-error labels and text cells on odd rows.
private WritableCellFormat format;
// Format with a background colour, used for text cells on even rows.
private WritableCellFormat formatColor;
// Format used for the header row.
private WritableCellFormat headformat;
// Workbook currently being written.
private WritableWorkbook writeWorkBook;
// Stream a new workbook is written to; closed by closeAllObject().
private OutputStream os;
// Workbook opened for reading (by readExcel, or the existing file when a sheet is added).
private Workbook readWordBook;
// Sheet created most recently by buildSheet(); the merge helpers work on it.
private WritableSheet sheet = null;
// Optional names of the columns whose vertically adjacent equal cells are merged.
private List<String> mergeList;
/**
 * Factory method: returns a new, independent instance on every call
 * (despite the name, no instance is cached or shared).
 */
public static SimpeExcelReport getInstance()
{
return new SimpeExcelReport();
}
/**
 * Reads an Excel file.
 * <p>
 * With a sheet name the result is the data of that sheet: a map with the keys "head"
 * (list of header fields) and "body" (list of maps from header field to cell text).
 * With a null or empty sheet name the result maps every sheet name to such a data map.
 *
 * @param excelFiel file to read
 * @param sheetName sheet to read; null or empty returns the data of all sheets
 * @return the sheet data, or a map of sheet name to sheet data
 */
public Map<String, Object> readExcel(File excelFiel, String sheetName)
{
    readWordBook = getExcelFile(excelFiel);
    boolean wholeWorkbook = (sheetName == null) || "".equals(sheetName);
    Map<String, Object> result = wholeWorkbook ? getExcelAllData() : getExcelBySheet(sheetName);
    closeAllObject();
    log.info("文件读取成功");
    return result;
}
/**
 * Creates a title cell format for the given font: centred horizontally and
 * vertically, with a light blue background. A failure to apply a setting is printed
 * and the format is returned with whatever was applied up to that point.
 *
 * @param headFont font of the title cells
 * @return the new cell format
 */
public WritableCellFormat getTitleFormat(WritableFont headFont)
{
    final WritableCellFormat titleFormat = new WritableCellFormat(headFont);
    try
    {
        // horizontal centring
        titleFormat.setAlignment(jxl.format.Alignment.CENTRE);
        // vertical centring
        titleFormat.setVerticalAlignment(jxl.format.VerticalAlignment.CENTRE);
        titleFormat.setBackground(Colour.LIGHT_BLUE);
    }
    catch (WriteException e)
    {
        e.printStackTrace();
    }
    return titleFormat;
}
/**
 * Reads the data of every sheet of the workbook opened for reading.
 *
 * @return map from sheet name to that sheet's head/body data
 */
private Map<String, Object> getExcelAllData()
{
    Map<String, Object> dataBySheet = new HashMap<String, Object>();
    for (Sheet current : readWordBook.getSheets())
    {
        Map<String, Object> data = getSheetData(current);
        dataBySheet.put(current.getName(), data);
    }
    return dataBySheet;
}
/**
 * Reads the data of one named sheet of the workbook opened for reading.
 * <p>
 * A workbook that could not be opened or a sheet name that does not exist is reported
 * with a descriptive exception; both cases used to surface as a bare
 * NullPointerException.
 *
 * @param sheetName name of the sheet to read
 * @return the sheet's head/body data
 * @throws IllegalStateException    if no workbook is open for reading
 * @throws IllegalArgumentException if the workbook has no sheet with that name
 */
private Map<String, Object> getExcelBySheet(String sheetName)
{
    if (readWordBook == null)
    {
        throw new IllegalStateException("workbook could not be opened; cannot read sheet '" + sheetName + "'");
    }
    Sheet sheet = readWordBook.getSheet(sheetName);
    if (sheet == null)
    {
        throw new IllegalArgumentException("sheet '" + sheetName + "' does not exist in the workbook");
    }
    return getSheetData(sheet);
}
/**
 * Converts one sheet into a map with two entries: "head", the list of column names
 * taken from the first row, and "body", a list with one map (column name to cell
 * text) per following row.
 * <p>
 * A row may hold fewer cells than the header; the missing columns are stored as
 * {@code null}. This is decided with a bounds check rather than by catching the
 * exception raised when indexing past the end of the row. A sheet without rows yields
 * an empty head and an empty body.
 *
 * @param sheet sheet to convert
 * @return map with the entries "head" and "body"
 */
private Map<String, Object> getSheetData(Sheet sheet)
{
    List<String> columns = new ArrayList<String>();
    List<Map<String, Object>> rows = new ArrayList<Map<String, Object>>();
    Map<String, Object> sheetMap = new HashMap<String, Object>();
    sheetMap.put("head", columns);
    sheetMap.put("body", rows);

    int rowCount = sheet.getRows();
    if (rowCount == 0)
    {
        return sheetMap;
    }
    // Header: the first row.
    for (Cell headerCell : sheet.getRow(0))
    {
        columns.add(headerCell.getContents());
    }
    // Data: every following row, read by header position.
    for (int r = 1; r < rowCount; r++)
    {
        Cell[] cells = sheet.getRow(r);
        Map<String, Object> rowData = new HashMap<String, Object>();
        for (int c = 0; c < columns.size(); c++)
        {
            Object value = null;
            if (c < cells.length && cells[c] != null)
            {
                value = cells[c].getContents();
            }
            rowData.put(columns.get(c), value);
        }
        rows.add(rowData);
    }
    return sheetMap;
}
/**
 * Creates a new workbook on the given output stream and writes the data into it,
 * starting at sheet position 0. Afterwards the workbook is written, all resources
 * are closed (including the stream) and the head/body references are cleared.
 *
 * @param headList  column names
 * @param bodyList  rows, one map (column name to value) per row
 * @param os        stream the workbook is written to
 * @param sheetName name of the sheet; null or empty uses "Sheet"
 * @param excelFiel not used by this method
 */
public synchronized void createExcelWithStream(List<String> headList, List<Map<String, Object>> bodyList,
        OutputStream os, String sheetName,File excelFiel)
{
    this.os = os;
    this.headList = headList;
    this.bodyList = bodyList;
    getWriteWorkBookWithStream();

    String effectiveName = (sheetName == null || "".equals(sheetName)) ? "Sheet" : sheetName;
    buildSheet(0, effectiveName);
    try
    {
        os.flush();
    }
    catch (IOException e)
    {
        e.printStackTrace();
    }
    writeWorkBookWriter();
    closeAllObject();

    this.headList = null;
    this.bodyList = null;
    log.info("文件创建成功");
}
/**
 * Returns the names of the columns whose equal cells are merged when a sheet is built.
 *
 * @return the merge column names, or null when none were set
 */
public List<String> getMergeList()
{
return mergeList;
}
/**
 * Sets the names of the columns whose equal cells are merged when a sheet is built.
 *
 * @param mergeList the merge column names to use
 */
public void setMergeList(List<String> mergeList)
{
this.mergeList = mergeList;
}
/**
 * Adds a sheet with the given data to an existing Excel file. Afterwards the workbook
 * is written, all resources are closed and the head/body references are cleared.
 *
 * @param headList  column names
 * @param bodyList  rows, one map (column name to value) per row
 * @param excelFile existing workbook file to extend
 * @param sheetName name of the new sheet; null or empty uses "Sheet", otherwise the
 *                  name is passed through existsName before use
 */
public synchronized void addSheetInExcelWithFile(List<String> headList, List<Map<String, Object>> bodyList,
        File excelFile, String sheetName)
{
    this.headList = headList;
    this.bodyList = bodyList;
    getWorkBookWithFile(excelFile);

    int position = getSheetIndex();
    String effectiveName;
    if (sheetName == null || "".equals(sheetName))
    {
        effectiveName = "Sheet";
    }
    else
    {
        effectiveName = existsName(sheetName);
    }
    buildSheet(position, effectiveName);

    writeWorkBookWriter();
    closeAllObject();
    log.info("文件创建成功");
    this.headList = null;
    this.bodyList = null;
}
/**
 * Builds the sheet(s) for the current body. Fewer than 50000 rows go into a single
 * sheet; otherwise the rows are split into sheets of at most 50000 rows, named
 * sheetName + "(" + position + ")".
 * <p>
 * The end index handed to {@code subList} is exclusive. The last chunk therefore
 * ends at {@code size}; the previous {@code size - 1} silently dropped the final row.
 * <p>
 * The configured merge columns are applied to the sheet created last.
 *
 * @param index     position of the (first) new sheet in the workbook
 * @param sheetName name of the sheet, or name prefix when the data is split
 * @return number of sheets in the workbook afterwards
 */
public int buildSheet(int index, String sheetName)
{
    final int limit = 50000;
    final int size = bodyList.size();
    if (size < limit)
    {
        // first argument: sheet name, second: position of the sheet in the workbook
        sheet = writeWorkBook.createSheet(sheetName, index);
        fileInToSheet(sheet, bodyList, headList);
    }
    else
    {
        int count = size % limit == 0 ? size / limit : size / limit + 1;
        for (int i = 0; i < count; i++)
        {
            sheet = writeWorkBook.createSheet(sheetName + "(" + (i + index) + ")", i + index);
            int fromIndex = i * limit;
            int toIndex = Math.min(size, fromIndex + limit);
            fileInToSheet(sheet, bodyList.subList(fromIndex, toIndex), headList);
        }
    }
    for (Integer mergeColNum : getmergeColNumS())
    {
        // A merge column that is not part of the header has no index; skip it.
        if (mergeColNum != null)
        {
            mergeCell(mergeColNum);
        }
    }
    return writeWorkBook.getSheetNames().length;
}
/**
 * Resolves the configured merge column names to column indexes.
 * <p>
 * The name-to-index map is built once instead of once per name. Names that are not
 * part of the header are logged and skipped, so the result never contains null.
 *
 * @return indexes of the columns to merge; empty when no merge columns are set
 */
private List<Integer> getmergeColNumS()
{
    List<Integer> mergeColumns = new ArrayList<Integer>();
    if (null == mergeList || mergeList.isEmpty())
    {
        return mergeColumns;
    }
    Map<String, Integer> indexByName = getHeadListMap();
    for (String merge : mergeList)
    {
        Integer column = indexByName.get(merge);
        if (column == null)
        {
            log.warn("merge column {} is not part of the header, skipped", merge);
            continue;
        }
        mergeColumns.add(column);
    }
    return mergeColumns;
}
/**
 * Merges, within one column of the current sheet, every run of cells reported by
 * getMergeCellsList. A write failure is printed and ends the merging of that column.
 *
 * @param colNum index of the column; the first column is 0
 */
private void mergeCell(int colNum)
{
    try
    {
        for (Map.Entry<Integer, Integer> span : getMergeCellsList(colNum).entrySet())
        {
            sheet.mergeCells(colNum, span.getKey(), colNum, span.getValue());
        }
    }
    catch (WriteException e)
    {
        // RowsExceededException is a WriteException, so this handler covers both.
        e.printStackTrace();
    }
}
/**
 * Finds the runs of vertically adjacent cells with equal contents in one column of
 * the current sheet.
 * <p>
 * Only rows inside the sheet are compared. The previous loop also compared the last
 * row with the cell one row past the end of the sheet, so a trailing run could be
 * extended beyond the data (presumably when the last cell was empty).
 *
 * @param colNum index of the column to scan
 * @return map from the first row of each run of two or more equal cells to its last row
 */
private Map<Integer, Integer> getMergeCellsList(Integer colNum)
{
    Map<Integer, Integer> runs = new HashMap<Integer, Integer>();
    int rows = sheet.getRows();
    int start = 0;
    while (start < rows)
    {
        String contents = sheet.getCell(colNum, start).getContents();
        int end = start;
        while (end + 1 < rows && contents.equals(sheet.getCell(colNum, end + 1).getContents()))
        {
            end++;
        }
        if (end > start)
        {
            runs.put(start, end);
        }
        start = end + 1;
    }
    return runs;
}
/**
 * Creates the writable workbook on the current output stream and, when that
 * succeeds, initialises the cell formats. An I/O failure is printed.
 */
private void getWriteWorkBookWithStream()
{
    try
    {
        // creates the xls workbook
        writeWorkBook = Workbook.createWorkbook(os);
        if (writeWorkBook == null)
        {
            return;
        }
        formatting();
    }
    catch (IOException e)
    {
        // FileNotFoundException is an IOException, so this handler covers both.
        e.printStackTrace();
    }
}
/**
 * Opens the existing file for reading and creates a writable workbook over it, so a
 * new sheet can be added; the cell formats are initialised when that succeeds.
 * <p>
 * On an I/O failure the file is scheduled for deletion at JVM exit and the error is
 * printed.
 * NOTE(review): this also removes an existing workbook with its earlier sheets;
 * confirm that this is intended.
 *
 * @param excelFile existing workbook file
 */
private void getWorkBookWithFile(File excelFile)
{
    try
    {
        readWordBook = getExcelFile(excelFile);
        writeWorkBook = Workbook.createWorkbook(excelFile, readWordBook);
        if (writeWorkBook == null)
        {
            return;
        }
        formatting();
    }
    catch (IOException e)
    {
        // FileNotFoundException is an IOException, so this handler covers both.
        excelFile.deleteOnExit();
        e.printStackTrace();
    }
}
/**
 * Opens an Excel file for reading.
 *
 * @param excelFile file to open
 * @return the workbook, or null when the file cannot be read or parsed (the error is printed)
 */
private Workbook getExcelFile(File excelFile)
{
    Workbook opened = null;
    try
    {
        opened = Workbook.getWorkbook(excelFile);
    }
    catch (BiffException | IOException e)
    {
        e.printStackTrace();
    }
    return opened;
}
/**
 * Returns the position at which a new sheet is added to the existing workbook:
 * the number of sheets of the workbook opened for reading, plus one.
 * NOTE(review): the value is one past the zero-based end position; presumably the
 * library clamps it - confirm.
 *
 * @return position for the new sheet
 */
private int getSheetIndex()
{
    return readWordBook.getSheets().length + 1;
}
/**
 * Makes a sheet name unique within the workbook opened for reading by appending
 * "副本" for as long as the name is already taken.
 * <p>
 * The names are scanned repeatedly until a full pass finds no clash. A single pass
 * could return a name that still collided with a sheet listed earlier (for example
 * when both "X副本" and "X" exist, in that order).
 *
 * @param sheetName requested sheet name
 * @return the requested name, or that name with one or more "副本" suffixes
 */
private String existsName(String sheetName)
{
    if (readWordBook == null)
    {
        return sheetName;
    }
    String[] names = readWordBook.getSheetNames();
    boolean clash = true;
    while (clash)
    {
        clash = false;
        for (String name : names)
        {
            if (sheetName.equals(name))
            {
                // Every clash lengthens the name, so the loop terminates.
                sheetName += "副本";
                clash = true;
            }
        }
    }
    return sheetName;
}
/**
 * Fills a sheet with data: writes the header row first, then the data rows.
 *
 * @param sheet    sheet to fill
 * @param list     rows to write, one map (column name to value) per row
 * @param headList column names for the header row
 */
private void fileInToSheet(WritableSheet sheet, List<Map<String, Object>> list, List<String> headList)
{
builderHeader(sheet, headList);
parserBean(sheet, list);
}
/**
 * Writes the header row (row 0) of a sheet using the header format. A cell that
 * cannot be written is reported and skipped; the remaining cells are still written.
 *
 * @param sheet sheet to write to
 * @param list  column names, in column order
 */
private void builderHeader(WritableSheet sheet, List<String> list)
{
    int column = 0;
    for (String title : list)
    {
        try
        {
            sheet.addCell(new Label(column, 0, title, headformat));
        }
        catch (WriteException e)
        {
            // RowsExceededException is a WriteException, so this handler covers both.
            e.printStackTrace();
        }
        column++;
    }
}
/** Writes the data rows of a sheet, one sheet row per list element, starting at row 1. */
private void parserBean(WritableSheet sheet, List<Map<String, Object>> list)
{
    // Row 0 holds the header.
    int row = 1;
    for (Map<String, Object> record : list)
    {
        builderCell(sheet, record, row);
        row++;
    }
}
/**
 * Writes one data row, taking the value of every column from the row map by header
 * name. A missing value is written as an empty text cell.
 * <ul>
 * <li>Integer, Double and Long values become numeric cells in the default format.</li>
 * <li>HLink values become hyperlinks, or an error label when the link has no URL.</li>
 * <li>Everything else is written as text, with the coloured format on even rows.</li>
 * </ul>
 * A write failure is printed and ends the row.
 */
private void builderCell(WritableSheet sheet, Map<String, Object> obj, int row)
{
    final List<String> columns = headList;
    try
    {
        for (int col = 0; col < columns.size(); col++)
        {
            Object raw = obj.get(columns.get(col));
            Object value = (raw != null) ? raw : "";
            try
            {
                if (value instanceof Integer || value instanceof Double || value instanceof Long)
                {
                    double number = ((Number)value).doubleValue();
                    sheet.addCell(new jxl.write.Number(col, row, number, format));
                }
                else if (value instanceof HLink)
                {
                    HLink hyperlink = (HLink)value;
                    if (hyperlink.getUrl() != null)
                    {
                        WritableHyperlink link = new WritableHyperlink(col, row, hyperlink.getUrl());
                        link.setDescription(hyperlink.getDescription());
                        sheet.addHyperlink(link);
                    }
                    else
                    {
                        sheet.addCell(new Label(col, row, "超链接出错:_" + hyperlink.getDescription(), format));
                    }
                }
                else
                {
                    // Text rows alternate between the coloured and the default format.
                    WritableCellFormat textFormat = (row % 2 == 0) ? formatColor : format;
                    sheet.addCell(new Label(col, row, value.toString(), textFormat));
                }
            }
            catch (NumberFormatException e)
            {
                log.error("第几列:{}\t列名:{}\t数据:" + value, col, headList.get(col));
            }
        }
    }
    catch (WriteException e)
    {
        // RowsExceededException is a WriteException, so this handler covers both.
        e.printStackTrace();
    }
}
/**
 * Initialises the three cell formats (default cell, coloured cell, header) for the
 * workbook currently being written.
 */
private void formatting()
{
format = getCellFormat(writeWorkBook);
formatColor = getCellSimpleFormat(writeWorkBook);
headformat = getHeaderFormat(writeWorkBook);
}
/**
 * Closes the writable workbook, the output stream and the read workbook, whichever
 * are open.
 * <p>
 * Each resource is closed on its own, so a failure while closing one no longer skips
 * the others. The references are cleared afterwards, so a later call does not close
 * the same objects a second time.
 */
public void closeAllObject()
{
    if (writeWorkBook != null)
    {
        try
        {
            writeWorkBook.close();
        }
        catch (WriteException | IOException e)
        {
            e.printStackTrace();
        }
        finally
        {
            writeWorkBook = null;
        }
    }
    if (os != null)
    {
        try
        {
            os.close();
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        finally
        {
            os = null;
        }
    }
    if (readWordBook != null)
    {
        readWordBook.close();
        readWordBook = null;
    }
}
/**
 * Creates the header cell format: bold white 10pt Times font, centred horizontally
 * and vertically, thin border in the BLUE2 palette colour, background in the GRAY_50
 * palette colour, which is redefined for the workbook as RGB(166, 166, 166).
 * A failure is printed and the format is returned as far as it was configured.
 *
 * @param wb workbook whose palette is adjusted
 * @return the header format
 */
public WritableCellFormat getHeaderFormat(WritableWorkbook wb)
{
    final WritableCellFormat headerFormat = new WritableCellFormat(
            new WritableFont(WritableFont.TIMES, 10, WritableFont.BOLD, false,UnderlineStyle.NO_UNDERLINE,Colour.WHITE));
    try
    {
        wb.setColourRGB(Colour.GRAY_50, 166, 166, 166);
        // horizontal, then vertical centring
        headerFormat.setAlignment(jxl.format.Alignment.CENTRE);
        headerFormat.setVerticalAlignment(jxl.format.VerticalAlignment.CENTRE);
        headerFormat.setBorder(Border.ALL, BorderLineStyle.THIN, Colour.BLUE2);
        headerFormat.setBackground(Colour.GRAY_50);
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
    return headerFormat;
}
/**
 * Creates the coloured cell format: 9pt "微软雅黑" font, centred horizontally and
 * vertically, thin border in the BLUE2 palette colour, background in the GRAY_80
 * palette colour, which is redefined for the workbook as RGB(242, 242, 242).
 * A failure is printed and the format is returned as far as it was configured.
 *
 * @param wb workbook whose palette is adjusted
 * @return the coloured cell format
 */
public WritableCellFormat getCellSimpleFormat(WritableWorkbook wb)
{
    final WritableCellFormat stripedFormat = new WritableCellFormat(
            new WritableFont(WritableFont.createFont("微软雅黑"), 9, WritableFont.NO_BOLD, false));
    try
    {
        // redefine the palette colour for this workbook
        wb.setColourRGB(Colour.GRAY_80, 242, 242, 242);
        // horizontal, then vertical centring
        stripedFormat.setAlignment(jxl.format.Alignment.CENTRE);
        stripedFormat.setVerticalAlignment(jxl.format.VerticalAlignment.CENTRE);
        stripedFormat.setBorder(Border.ALL, BorderLineStyle.THIN, Colour.BLUE2);
        stripedFormat.setBackground(Colour.GRAY_80);
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
    return stripedFormat;
}
/**
 * Creates the default cell format: 9pt "微软雅黑" font, centred horizontally and
 * vertically, thin border in the BLUE2 palette colour, which is redefined for the
 * workbook as RGB(0, 176, 240).
 * A failure is printed and the format is returned as far as it was configured.
 *
 * @param wb workbook whose palette is adjusted
 * @return the default cell format
 */
public WritableCellFormat getCellFormat(WritableWorkbook wb)
{
    final WritableCellFormat plainFormat = new WritableCellFormat(
            new WritableFont(WritableFont.createFont("微软雅黑"), 9, WritableFont.NO_BOLD, false));
    try
    {
        // redefine the palette colour for this workbook
        wb.setColourRGB(Colour.BLUE2, 0, 176, 240);
        // horizontal, then vertical centring
        plainFormat.setAlignment(jxl.format.Alignment.CENTRE);
        plainFormat.setVerticalAlignment(jxl.format.VerticalAlignment.CENTRE);
        plainFormat.setBorder(Border.ALL, BorderLineStyle.THIN, Colour.BLUE2);
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
    return plainFormat;
}
/**
 * Returns the default cell format.
 *
 * @return the format
 */
public WritableCellFormat getFormat()
{
return format;
}
/**
 * Returns the format of the header row.
 *
 * @return the headformat
 */
public WritableCellFormat getHeadformat()
{
return headformat;
}
/**
 * Replaces the default cell format.
 *
 * @param format the format to set
 */
public void setFormat(WritableCellFormat format)
{
this.format = format;
}
/**
 * Replaces the coloured cell format (used for text cells on even rows).
 *
 * @param formatColor the coloured format to set
 */
public void setFormatColor(WritableCellFormat formatColor)
{
this.formatColor = formatColor;
}
/**
 * Replaces the format of the header row.
 *
 * @param headformat the headformat to set
 */
public void setHeadformat(WritableCellFormat headformat)
{
this.headformat = headformat;
}
/**
 * Writes the workbook that was built in memory to its output.
 * <p>
 * A failure is logged; it used to be swallowed silently, which left an incomplete
 * file without any trace of the cause.
 */
private void writeWorkBookWriter()
{
    try
    {
        writeWorkBook.write();
    }
    catch (IOException e)
    {
        log.error("failed to write the workbook", e);
    }
}
/**
 * Maps every header name of the current head list to its column index.
 *
 * @return header name to column index; empty when there is no head list
 */
private Map<String, Integer> getHeadListMap()
{
    Map<String, Integer> indexByName = new HashMap<String, Integer>();
    if (headList != null)
    {
        for (int col = 0; col < headList.size(); col++)
        {
            indexByName.put(headList.get(col), col);
        }
    }
    return indexByName;
}
}
package util;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.bson.Document;
import com.mongodb.MongoClient;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import bean.MediaAndWechatEvent;
/**
 * Helper for querying event records from MongoDB.
 * <p>
 * The client is kept so that it can be released with {@link #close()}. The no-argument
 * constructor connects to the same server, database and collection as before.
 *
 * @author xuyimeng
 * @date 2017-12-25 17:10:15
 */
public class DBUtil {
    private static final String DEFAULT_HOST = "115.236.59.91";
    private static final int DEFAULT_PORT = 27017;
    private static final String DEFAULT_DATABASE = "EventFulBeta";
    private static final String DEFAULT_COLLECTION = "mediaEvent2017";
    /** Platform value that selects WeChat records. */
    private static final String WECHAT = "微信";

    private final MongoClient mongoClient;
    private final MongoCollection<Document> coll;

    /** Connects to the default server, database and collection. */
    public DBUtil() {
        this(DEFAULT_HOST, DEFAULT_PORT, DEFAULT_DATABASE, DEFAULT_COLLECTION);
    }

    /**
     * Connects to the given server and selects the collection to query.
     *
     * @param host       MongoDB host
     * @param port       MongoDB port
     * @param database   database name
     * @param collection collection name
     */
    public DBUtil(String host, int port, String database, String collection) {
        mongoClient = new MongoClient(host, port);
        MongoDatabase db = mongoClient.getDatabase(database);
        coll = db.getCollection(collection);
    }

    /** Releases the connection to MongoDB. */
    public void close() {
        mongoClient.close();
    }

    /**
     * Queries the records of one event for a platform.
     * <p>
     * For the platform "微信" only records of that platform are returned; for any other
     * value, including {@code null}, all records whose platform is not "微信" are returned.
     * A record without an "H" value keeps the default H of 0 instead of failing with a
     * NullPointerException.
     *
     * @param pt      platform
     * @param eventID event id
     * @return the matching records
     */
    public List<MediaAndWechatEvent> getListData(String pt, String eventID){
        Document query = new Document();
        if (WECHAT.equals(pt))
        {
            query.append("pt", pt);
        }
        else
        {
            query.append("pt", new Document("$ne", WECHAT));
        }
        query.append("eventId", eventID);

        List<MediaAndWechatEvent> list = new ArrayList<>();
        FindIterable<Document> found = coll.find(query);
        Iterator<Document> iterator = found.iterator();
        while (iterator.hasNext())
        {
            Document doc = iterator.next();
            MediaAndWechatEvent event = new MediaAndWechatEvent();
            event.set_id(doc.getString("_id"));
            event.setUrl(doc.getString("url"));
            event.setTitle(doc.getString("title"));
            event.setTime(doc.getDate("time"));
            event.setSource(doc.getString("source"));
            event.setType(doc.getString("type"));
            event.setSavetime(doc.getLong("savetime"));
            event.setRsid(doc.getLong("rsid"));
            event.setPt(doc.getString("pt"));
            event.setEventId(doc.getString("eventId"));
            // getDouble returns a boxed value; setH takes a primitive, so guard the unboxing.
            Double h = doc.getDouble("H");
            if (h != null)
            {
                event.setH(h);
            }
            list.add(event);
        }
        return list;
    }
}
使用说明
使用说明
标题相似度算法根据 simhash、余弦相似度进行判断,辅以空间向量乘积计算相似度,并用层次聚类算法选择聚类中心。
在 src/test/java/StartTest 中:
MongoStart 是对mongo数据库进行使用测试,将结果导出成excel表格
ExcelTest 是对excel表格数据进行使用测试,将结果导出成excel表格
ResultTest 有对各个算法的效果进行模拟测试
ChineseTest 是对繁简体结果转换的测试
在com.zhiweidata.titleAggregation.main中
HCluster 聚类算法的实现类
ClusterUtil 是对算法结果的优化,一般在此对结果进行迭代
ComputeWordsVector 计算文本向量的工具类
CosineSimilarity 余弦算法的工具类
MySimHash simhash算法的工具类
使用方法
在 StartTest 中给出了示例,请自行参考。
注意
算法支持繁简体的聚类,但不支持对聚类标题的转换。如果对聚类标题有繁简体的要求,
可调用繁简体转换类的方法自行转换。
例:
ChineseTranslate simplifiedTrans = ChineseTranslate.getInstance(goal.简体);
String title = simplifiedTrans.trans(text);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment