# Digits clustering with k-means --------------------------------------------
# Loads a subset of the MNIST-style digits data (one 28x28 image per row),
# runs k-means with as many clusters as distinct labels, displays the
# cluster centers as images, assigns each cluster its majority label, and
# summarises the result with a confusion matrix and a barplot.
# NOTE(review): rm(list = ls()) removed — clearing the global workspace
# inside a script is an anti-pattern (run the script in a fresh session
# instead).

# Install missing dependencies; compare against installed package *names*,
# not every cell of the installed.packages() matrix.
for (pkg in c("FactoMineR", "mclust")) {
  if (!pkg %in% rownames(installed.packages())) {
    install.packages(pkg)
  }
}

# Load the digit images and their labels (downloaded from the course site).
url <- "https://perso.univ-rennes1.fr/valerie.monbet/doc/cours/digits_extrait_images.csv"
digits <- read.csv(url, header = TRUE, sep = ",")
digits <- digits[, -1]  # first column is a row index, not pixel data
url <- "https://perso.univ-rennes1.fr/valerie.monbet/doc/cours/digits_extrait_labels.csv"
labels <- read.csv(url, header = TRUE, sep = ",")
labels <- labels[, 2]   # second column holds the digit label

n_clusters <- length(unique(labels))  # number of clusters = distinct digits

# k-means initialised with a random subset of rows as starting centers.
i.centers <- sample(seq_len(nrow(digits)), n_clusters)
centers <- digits[i.centers, ]
km <- kmeans(digits, centers, nstart = 10, iter.max = 50)
cl <- km$cluster

# Images of cluster centers (reverse the columns so digits are upright).
dev.new()
par(mfrow = c(1, n_clusters))
for (k in seq_len(n_clusters)) {
  image(matrix(km$centers[k, ], 28, 28)[, rev(1:28)])
  # BUG FIX: title("Cluster ", k) passed k as the *sub*-title argument
  # (title(main, sub)); paste() builds the intended single main title.
  title(paste("Cluster", k))
}

# Majority label of each cluster: relabel every member of cluster k with
# the most frequent true label inside that cluster.
cl_lab <- cl
for (k in seq_len(n_clusters)) {
  ii <- which(cl == k)                          # individuals in cluster k
  counts <- table(labels[ii])                   # occurrences of each label
  maj_lab <- names(counts)[which.max(counts)]   # majority label
  print(paste("Classe ", k, ", label majoritaire = ", maj_lab))
  cl_lab[ii] <- maj_lab
}

# Confusion matrix between true labels and majority-vote cluster labels.
(conf_mat <- table(labels, cl_lab))

# Barplot of label proportions within each cluster label.
dev.new()
counts <- table(cl_lab, labels)
counts <- prop.table(counts, margin = 2)  # normalise each column to sum to 1
# Derive colours and legend entries from the data instead of hard-coding
# 1:5 / 0:4, and spell out legend.text (the original `legend = 0:4` relied
# on partial argument matching).
barplot(counts, main = " ", xlab = "Labels",
        col = seq_len(n_clusters), legend.text = sort(unique(labels)))

###############################################@
# Gaussian Mixture Model
library(mclust)
library(FactoMineR)
p <- ncol(digits)
## Not run
# mod <- Mclust(digits, G = 5, modelNames = c("EEI", "EVI"))
# Too expensive in computational time...
# BUG FIX: the stray text "ok in Python" that opened this section was not R
# and made the script un-parseable; it has been removed.

# Clustering based on PCA components -----------------------------------------
nc <- 25  # number of principal components kept (try changing it)
# BUG FIX: set.seed("123") errors — set.seed() requires a numeric seed,
# not a character string.
set.seed(123)  # initialisation of the random number generator (for mclust)
# Spell out ncp: the original `nc = nc` relied on partial argument matching
# against FactoMineR::PCA's `ncp` parameter.
pca <- PCA(digits, ncp = nc, graph = FALSE)
# Fit Gaussian mixtures with every covariance model; max number of clusters = 10.
mod <- Mclust(pca$ind$coord, G = 1:10)
dev.new()
plot(mod, "BIC")  # best model = "VVV", BIC continues to increase
cl <- mod$classification
n_cl <- length(unique(cl))

# Images of cluster centers (GMM classification + empirical means).
dev.new()
par(mfrow = c(1, n_cl), mar = c(1, 1, 1, 1))
for (k in seq_len(n_cl)) {
  ii <- which(mod$classification == k)
  center <- colMeans(digits[ii, ])  # empirical mean image of cluster k
  image(matrix(center, 28, 28)[, rev(1:28)])
  # BUG FIX: title("Cluster ", k) passed k as the sub-title argument.
  title(paste("Cluster", k))
}

## Results for 5 clusters
mod <- Mclust(pca$ind$coord, G = 5, modelNames = "VVV")

# Images of cluster centers (GMM classification + empirical means).
dev.new()
par(mfrow = c(1, n_clusters))
for (k in seq_len(n_clusters)) {
  ii <- which(mod$classification == k)
  center <- colMeans(digits[ii, ])
  image(matrix(center, 28, 28)[, rev(1:28)])
  title(paste("Cluster", k))
}

# Projection on the first PCA plane; plot invisible points to set up the
# axes, then draw the true labels coloured by GMM cluster.
dev.new()
plot(pca$ind$coord, pch = 20, col = "white")
text(pca$ind$coord, labels = labels, col = mod$classification)

# Clustering based on t-SNE components ---------------------------------------
# BUG FIX: Rtsne() was called without attaching its package.
library(Rtsne)
tsne_model_1 <- Rtsne(digits, check_duplicates = FALSE, pca = TRUE,
                      perplexity = 100, theta = 0, dims = 2)
mod <- Mclust(tsne_model_1$Y, G = 1:10)
dev.new()
plot(mod, "BIC")  # best model = "VVV", BIC stops increasing after 5 clusters
# BUG FIX: `mocelNames` was a typo for `modelNames`; the misspelled argument
# was silently absorbed by Mclust's `...`, so the "VVV" restriction was
# never actually applied.
mod <- Mclust(tsne_model_1$Y, G = 5, modelNames = "VVV")

# Images of cluster centers (GMM classification + empirical means).
dev.new()
par(mfrow = c(1, n_clusters))
for (k in seq_len(n_clusters)) {
  ii <- which(mod$classification == k)
  center <- colMeans(digits[ii, ])
  image(matrix(center, 28, 28)[, rev(1:28)])
  title(paste("Cluster", k))
}

# Projection on the t-SNE plane.
dev.new()
plot(tsne_model_1$Y, pch = 20, col = "white")
text(tsne_model_1$Y, labels = labels, col = mod$classification)