Data loading

Read Leukemia gene expression dataset

gene = read.csv("https://perso.univ-rennes1.fr/valerie.monbet/DATA/data_set_ALL_AML_train.csv",header=TRUE,sep="\t")
ncol(gene)
## [1] 78
nrow(gene)
## [1] 7129

As usual, this gene expression dataset is stored with the sample (patients) in columns and the gene in rows. For the statistical analysis, a transposition is needed.

X = t(gene[,seq(3,ncol(gene),by=2)])
Y = as.factor(c(rep(1,27),rep(2,11))) # class label (2 = AML, 1 = ALL)

A first view of the data

library(fields)
## Loading required package: spam
## Loading required package: grid
## Spam version 1.4-0 (2016-08-29) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
## 
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve
## Loading required package: maps
image.plot(1:38,1:ncol(X),(scale(X)),xlab="Samples",ylab="Genes Expression",zlim=range(scale(X)))
abline(v=27.5)

Now, let us focus on the genes with a high absolute correlation with the classes

rhoXY = NULL
for (k in 1:ncol(X)){
    rhoXY[k] = abs(cor(X[,k],as.numeric(Y)))
}
hist(rhoXY)

w = which(rhoXY>.6)
image.plot(1:38,1:length(w),scale(X[,w]),xlab="Samples",ylab="Genes",zlim=range(scale(X)))
abline(v=27.5)

## Principal Component Analysis PCA is run using the FactoMineR package. The plot below shows that AML and ALL may be mostly easy to separated.

library(FactoMineR)
pca = PCA(X,graph=FALSE)
plot(pca$ind$coord[,1:2],col=Y,pch=20,
   xlab=paste("PC 1,",round(pca$eig[1,2]*100)/100,"%"),
   ylab=paste("PC 2,",round(pca$eig[2,2]*100)/100,"%"))
legend("topright",col=1:2,legend=c("AML",'ALL'),pch=20)

## Multidimensional scaling

library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.4-3
dis = vegdist(X,"euclidean")
pl <- plot(cmdscale(dis), col=Y,pch=20)

Clustering

Hierarchical Clustering

dis = vegdist(X,"euclidean")
hc = hclust(dis, "ward.D2")
plot(hc)

library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.4.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:permute':
## 
##     shuffle
## The following object is masked from 'package:stats':
## 
##     cutree
dend <- as.dendrogram(hc)
plot(dend)
colored_bars(as.numeric(Y),dend)

### Kmeans Let us start with a random initialization and only 2 classes.

nbcl = 2 # with 2 classes
km = kmeans(X,centers=nbcl) # random initialization

plot(dend)
colored_bars(as.numeric(Y),dend,rowLabels="Y")
colored_bars(as.numeric(km$cluster),rowLabels="kmeans")

One could use the HC results to initialize the kmeans as follows.

cl = cutree(hc,nbcl)
centers = matrix(NA,nbcl,dim(X)[2])
for (k in 1:nbcl){
    wcl = which(cl==k)
    centers[k,] = apply(X[wcl,],2,mean)
}
km = kmeans(X,centers=centers)
plot(dend)
colored_bars(as.numeric(Y),dend,rowLabels="Y")
colored_bars(as.numeric(km$cluster),rowLabels="kmeans")

It can also be a good idea to focus on the genes with a high (absolute) correlation with the classes AML/ALL.

w = which(rhoXY>.7)

dis = vegdist(X[,w],"euclidean")
hc = hclust(dis, "ward.D2")

dend <- as.dendrogram(hc)
plot(dend)
colored_bars(as.numeric(Y),dend)

cl = cutree(hc,nbcl)
centers = matrix(NA,nbcl,dim(X[,w])[2])
for (k in 1:nbcl){
    wcl = which(cl==k)
    centers[k,] = apply(X[wcl,w],2,mean)
}
km = kmeans(X[,w],centers=centers)
plot(dend)
colored_bars(as.numeric(Y),dend,rowLabels="Y")
colored_bars(as.numeric(km$cluster),rowLabels="kmeans")

It seems that classification should be improves using 3 classes.

nbcl = 3
cl = cutree(hc,nbcl)
centers = matrix(NA,nbcl,dim(X[,w])[2])
for (k in 1:nbcl){
    wcl = which(cl==k)
    centers[k,] = apply(X[wcl,w],2,mean)
}
km = kmeans(X[,w],centers=centers)
plot(dend)
colored_bars(as.numeric(Y),dend,rowLabels="Y")
colored_bars(as.numeric(km$cluster),rowLabels="kmeans")