Read Leukemia gene expression dataset
gene = read.csv("https://perso.univ-rennes1.fr/valerie.monbet/DATA/data_set_ALL_AML_train.csv",header=TRUE,sep="\t")
ncol(gene)
## [1] 78
nrow(gene)
## [1] 7129
As usual, this gene expression dataset is stored with the sample (patients) in columns and the gene in rows. For the statistical analysis, a transposition is needed.
X = t(gene[,seq(3,ncol(gene),by=2)])
Y = as.factor(c(rep(1,27),rep(2,11))) # class label (2 = AML, 1 = ALL)
A first view of the data
library(fields)
## Loading required package: spam
## Loading required package: grid
## Spam version 1.4-0 (2016-08-29) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
## Loading required package: maps
image.plot(1:38,1:ncol(X),(scale(X)),xlab="Samples",ylab="Genes Expression",zlim=range(scale(X)))
abline(v=27.5)
Now, let us focus on the genes with a high absolute correlation with the classes
rhoXY = NULL
for (k in 1:ncol(X)){
rhoXY[k] = abs(cor(X[,k],as.numeric(Y)))
}
hist(rhoXY)
w = which(rhoXY>.6)
image.plot(1:38,1:length(w),scale(X[,w]),xlab="Samples",ylab="Genes",zlim=range(scale(X)))
abline(v=27.5)
## Principal Component Analysis PCA is run
using the FactoMineR package. The plot below shows that AML
and ALL may be mostly easy to separated.
library(FactoMineR)
pca = PCA(X,graph=FALSE)
plot(pca$ind$coord[,1:2],col=Y,pch=20,
xlab=paste("PC 1,",round(pca$eig[1,2]*100)/100,"%"),
ylab=paste("PC 2,",round(pca$eig[2,2]*100)/100,"%"))
legend("topright",col=1:2,legend=c("AML",'ALL'),pch=20)
## Multidimensional scaling
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.4-3
dis = vegdist(X,"euclidean")
pl <- plot(cmdscale(dis), col=Y,pch=20)
dis = vegdist(X,"euclidean")
hc = hclust(dis, "ward.D2")
plot(hc)
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.4.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:permute':
##
## shuffle
## The following object is masked from 'package:stats':
##
## cutree
dend <- as.dendrogram(hc)
plot(dend)
colored_bars(as.numeric(Y),dend)
### Kmeans Let us start with a random
initialization and only 2 classes.
nbcl = 2 # with 2 classes
km = kmeans(X,centers=nbcl) # random initialization
plot(dend)
colored_bars(as.numeric(Y),dend,rowLabels="Y")
colored_bars(as.numeric(km$cluster),rowLabels="kmeans")
One could use the HC results to initialize
the kmeans as follows.
cl = cutree(hc,nbcl)
centers = matrix(NA,nbcl,dim(X)[2])
for (k in 1:nbcl){
wcl = which(cl==k)
centers[k,] = apply(X[wcl,],2,mean)
}
km = kmeans(X,centers=centers)
plot(dend)
colored_bars(as.numeric(Y),dend,rowLabels="Y")
colored_bars(as.numeric(km$cluster),rowLabels="kmeans")
It can also be a good idea to focus on the
genes with a high (absolute) correlation with the classes
AML/ALL.
w = which(rhoXY>.7)
dis = vegdist(X[,w],"euclidean")
hc = hclust(dis, "ward.D2")
dend <- as.dendrogram(hc)
plot(dend)
colored_bars(as.numeric(Y),dend)
cl = cutree(hc,nbcl)
centers = matrix(NA,nbcl,dim(X[,w])[2])
for (k in 1:nbcl){
wcl = which(cl==k)
centers[k,] = apply(X[wcl,w],2,mean)
}
km = kmeans(X[,w],centers=centers)
plot(dend)
colored_bars(as.numeric(Y),dend,rowLabels="Y")
colored_bars(as.numeric(km$cluster),rowLabels="kmeans")
It seems that classification should be
improves using 3 classes.
nbcl = 3
cl = cutree(hc,nbcl)
centers = matrix(NA,nbcl,dim(X[,w])[2])
for (k in 1:nbcl){
wcl = which(cl==k)
centers[k,] = apply(X[wcl,w],2,mean)
}
km = kmeans(X[,w],centers=centers)
plot(dend)
colored_bars(as.numeric(Y),dend,rowLabels="Y")
colored_bars(as.numeric(km$cluster),rowLabels="kmeans")