############################################# ################################## ##### This file contains the functions: ##### 1.spectral.similarity ##### 2.spectral.similarities ##### 3.kmeans_spectral.sim ##### 4.kmeans_spectral.sims ##### ##### These functions are used to estimate the similarity measure ##### using the following clustering methods: ##### 1.Hierarchical Clustering (Single,Average and Complete Linkage, Ward) ##### 2.DIANA ##### 3.PAM ##### 4.K-Means ##### ####################################################### ################################################################### ################################################################### library(cluster) spectral.similarity <- function(data,p,nspans,cluster.sizes,clust.method) { if(!is.null(nspans)) { if(nspans==0) { nspans=NULL } } # Number of time series G <- dim(data)[1] # Number of observations for each time series N <- dim(data)[2] # The number of values disregarded in the periodogram. This should be zero!!! ntaper <- 0 cep.coefficients <- cepstral.coef(data,p,nspans)$Cepstral_Coefficients DKLFD<-matrix(NA, nrow=G-1, ncol=G-1) DEUCLCEP<-matrix(NA, nrow=G-1, ncol=G-1) DABSCEP<-matrix(NA, nrow=G-1, ncol=G-1) DLNP<-matrix(NA, nrow=G-1, ncol=G-1) DABSLNP<-matrix(NA, nrow=G-1, ncol=G-1) nperiodogram<-matrix(NA, nrow=G, ncol=floor(N/2)-1) periodogramma<-matrix(NA, nrow=G, ncol=floor(N/2)-1) for(i in 1:G) { periodogramma[i,]<-spectrum(data[i,],spans=nspans,taper=ntaper,plot=F, fast=F)$spec[-floor(N/2)] periodogramma[i,]<-correct4zeros(periodogramma[i,]) nperiodogram[i,]<-( periodogramma[i,] )/var(data[i,]) } for(i in 1:(G-1)) { for (j in 1:i) { # Calculate Kullback-Leibler Information Distance DKLFD[i,j]<-(1/N)*sum((nperiodogram[i,]/nperiodogram[j,])+(nperiodogram[j,]/nperiodogram[i,])-2) DKLFD[j,i]<-DKLFD[i,j] # Calculate the absolute value distance of the cepstral coefficients DABSCEP[i,j]<-sum(abs(cep.coefficients[,i]-cep.coefficients[,j])) DABSCEP[j,i]<-DABSCEP[i,j] # Calculate the euclidean distance of the cepstral coefficients DEUCLCEP[i,j]<-sqrt(sum((cep.coefficients[,i]-cep.coefficients[,j])^2)) DEUCLCEP[j,i]<-DEUCLCEP[i,j] # Calculate distance of the logarithm of the Normalized Periodogram DLNP[i,j]<-sqrt(sum((log(nperiodogram[i,])-log(nperiodogram[j,]))^2)) DLNP[j,i]<-DLNP[i,j] # Calculate absolute value of the distance of the logarithm of the Normalized Periodogram DABSLNP[i,j]<-sqrt(sum(abs(log(nperiodogram[i,])-log(nperiodogram[j,])))) DABSLNP[j,i]<-DABSLNP[i,j] } } dimnames(DKLFD) <- list(c(1:(G-1)),c(1:(G-1))) dimnames(DABSCEP) <- list(c(1:(G-1)),c(1:(G-1))) dimnames(DEUCLCEP) <- list(c(1:(G-1)),c(1:(G-1))) dimnames(DLNP) <- list(c(1:(G-1)),c(1:(G-1))) dimnames(DABSLNP) <- list(c(1:(G-1)),c(1:(G-1))) nclusters <- length(cluster.sizes) # Choose clustering method if(clust.method=="hclust.average") { KL.clustering <- cutree(hclust(as.dist(DKLFD),method="average"),k=nclusters ) ABSCEP.clustering <- cutree(hclust(as.dist(DABSCEP),method="average"),k=nclusters) EUCLCEP.clustering <- cutree(hclust(as.dist(DEUCLCEP),method="average"),k=nclusters) LNP.clustering <- cutree(hclust(as.dist(DLNP),method="average"),k=nclusters) ABSLNP.clustering <- cutree(hclust(as.dist(DABSLNP),method="average"),k=nclusters) } else { if(clust.method=="hclust.single") { KL.clustering <- cutree(hclust(as.dist(DKLFD),method="single"),k=nclusters ) ABSCEP.clustering <- cutree(hclust(as.dist(DABSCEP),method="single"),k=nclusters) EUCLCEP.clustering <- cutree(hclust(as.dist(DEUCLCEP),method="single"),k=nclusters) LNP.clustering <- cutree(hclust(as.dist(DLNP),method="single"),k=nclusters) ABSLNP.clustering <- cutree(hclust(as.dist(DABSLNP),method="single"),k=nclusters) } else { if(clust.method=="hclust.complete") { KL.clustering <- cutree(hclust(as.dist(DKLFD),method="complete"),k=nclusters ) ABSCEP.clustering <- cutree(hclust(as.dist(DABSCEP),method="complete"),k=nclusters) EUCLCEP.clustering <- cutree(hclust(as.dist(DEUCLCEP),method="complete"),k=nclusters) LNP.clustering <- cutree(hclust(as.dist(DLNP),method="complete"),k=nclusters) ABSLNP.clustering <- cutree(hclust(as.dist(DABSLNP),method="complete"),k=nclusters) } else { if(clust.method=="hclust.ward") { KL.clustering <- cutree(hclust(as.dist(DKLFD),method="ward"),k=nclusters ) ABSCEP.clustering <- cutree(hclust(as.dist(DABSCEP),method="ward"),k=nclusters) EUCLCEP.clustering <- cutree(hclust(as.dist(DEUCLCEP),method="ward"),k=nclusters) LNP.clustering <- cutree(hclust(as.dist(DLNP),method="ward"),k=nclusters) ABSLNP.clustering <- cutree(hclust(as.dist(DABSLNP),method="ward"),k=nclusters) } else { if(clust.method=="diana") { KL.clustering <- cutree(as.hclust(diana(DKLFD,diss=T)),k=nclusters ) ABSCEP.clustering <- cutree(as.hclust(diana(DABSCEP,diss=T)),k=nclusters ) EUCLCEP.clustering <- cutree(as.hclust(diana(DEUCLCEP,diss=T)),k=nclusters ) LNP.clustering <- cutree(as.hclust(diana(DLNP,diss=T)),k=nclusters ) ABSLNP.clustering <- cutree(as.hclust(diana(DABSLNP,diss=T)),k=nclusters ) } else { if(clust.method=="pam") { KL.clustering <- pam(DKLFD,k=nclusters,diss=T)$clustering ABSCEP.clustering <- pam(DABSCEP,k=nclusters,diss=T)$clustering EUCLCEP.clustering <- pam(DEUCLCEP,k=nclusters,diss=T)$clustering LNP.clustering <- pam(DLNP,k=nclusters,diss=T)$clustering ABSLNP.clustering <- pam(DABSLNP,k=nclusters,diss=T)$clustering } } } } } } # Store the clusters to be compared for similarity with the # real clusters KL.clusters<-matrix(NA, nrow=nclusters, ncol=G-1) ABSCEP.clusters<-matrix(NA, nrow=nclusters, ncol=G-1) EUCLCEP.clusters<-matrix(NA, nrow=nclusters, ncol=G-1) LNP.clusters<-matrix(NA, nrow=nclusters, ncol=G-1) ABSLNP.clusters<-matrix(NA, nrow=nclusters, ncol=G-1) for(i in 1:nclusters) { KL.cl.length_i <- length(rownames(DKLFD)[KL.clustering == i]) KL.clusters[i,1:KL.cl.length_i] <- rownames(DKLFD)[KL.clustering == i] ABSCEP.cl.length_i <- length(rownames(DABSCEP)[ABSCEP.clustering == i]) ABSCEP.clusters[i,1:ABSCEP.cl.length_i]<- rownames(DABSCEP)[ABSCEP.clustering == i] EUCLCEP.cl.length_i <- length(rownames(DEUCLCEP)[EUCLCEP.clustering == i]) EUCLCEP.clusters[i,1:EUCLCEP.cl.length_i]<- rownames(DEUCLCEP)[EUCLCEP.clustering == i] LNP.cl.length_i <- length(rownames(DLNP)[LNP.clustering == i]) LNP.clusters[i,1:LNP.cl.length_i]<- rownames(DLNP)[LNP.clustering == i] ABSLNP.cl.length_i <- length(rownames(DABSLNP)[ABSLNP.clustering == i]) ABSLNP.clusters[i,1:ABSLNP.cl.length_i]<- rownames(DABSLNP)[ABSLNP.clustering == i] } Sim.KL <- matrix(NA,nrow=nclusters,ncol=nclusters) Sim.ABSCEP <- matrix(NA,nrow=nclusters,ncol=nclusters) Sim.EUCLCEP <- matrix(NA,nrow=nclusters,ncol=nclusters) Sim.LNP <- matrix(NA,nrow=nclusters,ncol=nclusters) Sim.ABSLNP <- matrix(NA,nrow=nclusters,ncol=nclusters) for(i in 1:nclusters) { # Create vector corresponding to real cluster i if(i==1) { real.cluster_i <- c(1:cluster.sizes[1]) } else { real.cluster_i <- c((sum(cluster.sizes[1:(i-1)])+1):(sum(cluster.sizes[1:(i-1)])+cluster.sizes[i])) } for(j in 1:nclusters) { Sim.KL[i,j] <- (2*(length(intersect(real.cluster_i,KL.clusters[j,]))))/ (length(real.cluster_i)+length(KL.clusters[j,])-sum(is.na(as.numeric(KL.clusters[j,])))) Sim.ABSCEP[i,j] <- (2*(length(intersect(real.cluster_i,ABSCEP.clusters[j,]))))/ (length(real.cluster_i)+length(ABSCEP.clusters[j,])-sum(is.na(as.numeric(ABSCEP.clusters[j,])))) Sim.EUCLCEP[i,j] <- (2*(length(intersect(real.cluster_i,EUCLCEP.clusters[j,]))))/ (length(real.cluster_i)+length(EUCLCEP.clusters[j,])-sum(is.na(as.numeric(EUCLCEP.clusters[j,])))) Sim.LNP[i,j] <- (2*(length(intersect(real.cluster_i,LNP.clusters[j,]))))/ (length(real.cluster_i)+length(LNP.clusters[j,])-sum(is.na(as.numeric(LNP.clusters[j,])))) Sim.ABSLNP[i,j] <- (2*(length(intersect(real.cluster_i,ABSLNP.clusters[j,]))))/ (length(real.cluster_i)+length(ABSLNP.clusters[j,])-sum(is.na(as.numeric(ABSLNP.clusters[j,])))) } } Similarity.KL <- mean(apply(Sim.KL,1,max)) Similarity.ABSCEP <- mean(apply(Sim.ABSCEP,1,max)) Similarity.EUCLCEP <- mean(apply(Sim.EUCLCEP,1,max)) Similarity.LNP <- mean(apply(Sim.LNP,1,max)) Similarity.ABSLNP <- mean(apply(Sim.ABSLNP,1,max)) return(list(Kullback_Leibler_Distance_Similarity=Similarity.KL,Absolute_Value_Distance_Similarity=Similarity.ABSCEP, Euclidean_Distance_Similarity=Similarity.EUCLCEP,Normalized_Periodogram_Distance_Similarity=Similarity.LNP, Normalized_Periodogram_Absolute_Value_Distance_Similarity=Similarity.ABSLNP)) } spectral.similarities <- function(data,p,nspans,cluster.sizes,clust.method) { Similarities.EUCLCEP <- matrix(NA,nrow=length(p),ncol=length(nspans)) Similarities.ABSCEP <- matrix(NA,nrow=length(p),ncol=length(nspans)) Similarities.KL <- matrix(NA,nrow=length(p),ncol=length(nspans)) Similarities.LNP <- matrix(NA,nrow=length(p),ncol=length(nspans)) Similarities.ABSLNP <- matrix(NA,nrow=length(p),ncol=length(nspans)) for(i in 1:length(p)) { for(j in 1:length(nspans)) { SIMILARITIES <- spectral.similarity(data,p[i],nspans[j],cluster.sizes,clust.method) Similarities.EUCLCEP[i,j] <- SIMILARITIES[[3]] Similarities.ABSCEP[i,j] <- SIMILARITIES[[2]] Similarities.KL[i,j] <- SIMILARITIES[[1]] Similarities.LNP[i,j] <- SIMILARITIES[[4]] Similarities.ABSLNP[i,j] <- SIMILARITIES[[5]] } } dimnames(Similarities.EUCLCEP) <- list(p,nspans) dimnames(Similarities.ABSCEP) <- list(p,nspans) dimnames(Similarities.KL) <- list(p,nspans) dimnames(Similarities.LNP) <- list(p,nspans) dimnames(Similarities.ABSLNP) <- list(p,nspans) return(list(Euclidean_Distance_Similarities=Similarities.EUCLCEP,Absolute_Value_Distance_Similarities=Similarities.ABSCEP, Kullback_Leibler_Distance_Similarities=Similarities.KL,Log_Normalized_Periodogram_Distance_Similarities=Similarities.LNP, Log_Normalized_Periodogram_Absolute_Value_Distance_Similarities=Similarities.ABSLNP)) } kmeans_spectral.sim <- function(data,p,nspans,cluster.sizes) { if(!is.null(nspans)) { if(nspans==0) { nspans=NULL } } # Number of time series G <- dim(data)[1] # Number of observations for each time series N <- dim(data)[2] # The number of values disregarded in the periodogram. This should be zero!!! ntaper <- 0 cep.coefficients <- cepstral.coef(data,p,nspans)$Cepstral_Coefficients nclusters <- length(cluster.sizes) kmeans.clustering <- kmeans(t(cep.coefficients),centers=nclusters,algorithm="MacQueen")$cluster Dummy<-matrix(NA, nrow=G-1, ncol=G-1) dimnames(Dummy) <- list(c(1:(G-1)),c(1:(G-1))) # Store the clusters to be compared for similarity with the # real clusters kmeans.clusters<-matrix(NA, nrow=nclusters, ncol=G-1) for(i in 1:nclusters) { kmeans.cl.length_i <- length(rownames(Dummy)[kmeans.clustering == i]) kmeans.clusters[i,1:kmeans.cl.length_i] <- rownames(Dummy)[kmeans.clustering == i] } Sim.kmeans <- matrix(NA,nrow=nclusters,ncol=nclusters) for(i in 1:nclusters) { # Create vector corresponding to real cluster i if(i==1) { real.cluster_i <- c(1:cluster.sizes[1]) } else { real.cluster_i <- c((sum(cluster.sizes[1:(i-1)])+1):(sum(cluster.sizes[1:(i-1)])+cluster.sizes[i])) } for(j in 1:nclusters) { Sim.kmeans[i,j] <- (2*(length(intersect(real.cluster_i,kmeans.clusters[j,]))))/ (length(real.cluster_i)+length(kmeans.clusters[j,])-sum(is.na(as.numeric(kmeans.clusters[j,])))) } } Similarity.kmeans <- mean(apply(Sim.kmeans,1,max)) return(list(Kmeans_Similarity=Similarity.kmeans)) } kmeans_spectral.sims <- function(data,p,nspans,cluster.sizes) { Similarities.kmeans <- matrix(NA,nrow=length(p),ncol=length(nspans)) for(i in 1:length(p)) { for(j in 1:length(nspans)) { Similarities.kmeans[i,j] <- kmeans_spectral.sim(data,p[i],nspans[j],cluster.sizes)[[1]] } } dimnames(Similarities.kmeans) <- list(p,nspans) return(list(Kmeans_Similarities=Similarities.kmeans)) }