# Digits clustering with k-means --------------------------------------------
# Loads a subset of the MNIST-style digits data (one 28x28 image per row),
# runs k-means with as many clusters as distinct labels, displays the
# cluster centers as images, assigns each cluster its majority label, and
# summarises the result with a confusion matrix and a barplot.
# NOTE(review): rm(list = ls()) removed — clearing the global workspace
# inside a script is an anti-pattern (run the script in a fresh session
# instead).

# Install missing dependencies; compare against installed package *names*,
# not every cell of the installed.packages() matrix.
for (pkg in c("FactoMineR", "mclust")) {
  if (!pkg %in% rownames(installed.packages())) {
    install.packages(pkg)
  }
}

# Load the digit images and their labels (downloaded from the course site).
url <- "https://perso.univ-rennes1.fr/valerie.monbet/doc/cours/digits_extrait_images.csv"
digits <- read.csv(url, header = TRUE, sep = ",")
digits <- digits[, -1]  # first column is a row index, not pixel data
url <- "https://perso.univ-rennes1.fr/valerie.monbet/doc/cours/digits_extrait_labels.csv"
labels <- read.csv(url, header = TRUE, sep = ",")
labels <- labels[, 2]   # second column holds the digit label

n_clusters <- length(unique(labels))  # number of clusters = distinct digits

# k-means initialised with a random subset of rows as starting centers.
i.centers <- sample(seq_len(nrow(digits)), n_clusters)
centers <- digits[i.centers, ]
km <- kmeans(digits, centers, nstart = 10, iter.max = 50)
cl <- km$cluster

# Images of cluster centers (reverse the columns so digits are upright).
dev.new()
par(mfrow = c(1, n_clusters))
for (k in seq_len(n_clusters)) {
  image(matrix(km$centers[k, ], 28, 28)[, rev(1:28)])
  # BUG FIX: title("Cluster ", k) passed k as the *sub*-title argument
  # (title(main, sub)); paste() builds the intended single main title.
  title(paste("Cluster", k))
}

# Majority label of each cluster: relabel every member of cluster k with
# the most frequent true label inside that cluster.
cl_lab <- cl
for (k in seq_len(n_clusters)) {
  ii <- which(cl == k)                          # individuals in cluster k
  counts <- table(labels[ii])                   # occurrences of each label
  maj_lab <- names(counts)[which.max(counts)]   # majority label
  print(paste("Classe ", k, ", label majoritaire = ", maj_lab))
  cl_lab[ii] <- maj_lab
}

# Confusion matrix between true labels and majority-vote cluster labels.
(conf_mat <- table(labels, cl_lab))

# Barplot of label proportions within each cluster label.
dev.new()
counts <- table(cl_lab, labels)
counts <- prop.table(counts, margin = 2)  # normalise each column to sum to 1
# Derive colours and legend entries from the data instead of hard-coding
# 1:5 / 0:4, and spell out legend.text (the original `legend = 0:4` relied
# on partial argument matching).
barplot(counts, main = " ", xlab = "Labels",
        col = seq_len(n_clusters), legend.text = sort(unique(labels)))

###############################################@
# Gaussian Mixture Model
library(mclust)
library(FactoMineR)
p <- ncol(digits)
## Not run
# mod <- Mclust(digits, G = 5, modelNames = c("EEI", "EVI"))
# Too expensive in computational time...
# BUG FIX: the stray text "ok in Python" that opened this section was not R
# and made the script un-parseable; it has been removed.

# Clustering based on PCA components -----------------------------------------
nc <- 25  # number of principal components kept (try changing it)
# BUG FIX: set.seed("123") errors — set.seed() requires a numeric seed,
# not a character string.
set.seed(123)  # initialisation of the random number generator (for mclust)
# Spell out ncp: the original `nc = nc` relied on partial argument matching
# against FactoMineR::PCA's `ncp` parameter.
pca <- PCA(digits, ncp = nc, graph = FALSE)
# Fit Gaussian mixtures with every covariance model; max number of clusters = 10.
mod <- Mclust(pca$ind$coord, G = 1:10)
dev.new()
plot(mod, "BIC")  # best model = "VVV", BIC continues to increase
cl <- mod$classification
n_cl <- length(unique(cl))

# Images of cluster centers (GMM classification + empirical means).
dev.new()
par(mfrow = c(1, n_cl), mar = c(1, 1, 1, 1))
for (k in seq_len(n_cl)) {
  ii <- which(mod$classification == k)
  center <- colMeans(digits[ii, ])  # empirical mean image of cluster k
  image(matrix(center, 28, 28)[, rev(1:28)])
  # BUG FIX: title("Cluster ", k) passed k as the sub-title argument.
  title(paste("Cluster", k))
}

## Results for 5 clusters
mod <- Mclust(pca$ind$coord, G = 5, modelNames = "VVV")

# Images of cluster centers (GMM classification + empirical means).
dev.new()
par(mfrow = c(1, n_clusters))
for (k in seq_len(n_clusters)) {
  ii <- which(mod$classification == k)
  center <- colMeans(digits[ii, ])
  image(matrix(center, 28, 28)[, rev(1:28)])
  title(paste("Cluster", k))
}

# Projection on the first PCA plane; plot invisible points to set up the
# axes, then draw the true labels coloured by GMM cluster.
dev.new()
plot(pca$ind$coord, pch = 20, col = "white")
text(pca$ind$coord, labels = labels, col = mod$classification)

# Clustering based on t-SNE components ---------------------------------------
# BUG FIX: Rtsne() was called without attaching its package.
library(Rtsne)
tsne_model_1 <- Rtsne(digits, check_duplicates = FALSE, pca = TRUE,
                      perplexity = 100, theta = 0, dims = 2)
mod <- Mclust(tsne_model_1$Y, G = 1:10)
dev.new()
plot(mod, "BIC")  # best model = "VVV", BIC stops increasing after 5 clusters
# BUG FIX: `mocelNames` was a typo for `modelNames`; the misspelled argument
# was silently absorbed by Mclust's `...`, so the "VVV" restriction was
# never actually applied.
mod <- Mclust(tsne_model_1$Y, G = 5, modelNames = "VVV")

# Images of cluster centers (GMM classification + empirical means).
dev.new()
par(mfrow = c(1, n_clusters))
for (k in seq_len(n_clusters)) {
  ii <- which(mod$classification == k)
  center <- colMeans(digits[ii, ])
  image(matrix(center, 28, 28)[, rev(1:28)])
  title(paste("Cluster", k))
}

# Projection on the t-SNE plane.
dev.new()
plot(tsne_model_1$Y, pch = 20, col = "white")
text(tsne_model_1$Y, labels = labels, col = mod$classification)