Cancer relapsed within 5 years

We have collected gene expression levels for 4654 genes on 184 early-stage breast cancer samples: xtrain.txt (each row is a gene, each column a sample). After surgical removal of the tumour, some unfortunately relapsed within 5 years (label=+1), while other did not (label=-1). The labels of the the 184 samples are available in the file ytrain.txt.

X = read.table("http://members.cbio.mines-paristech.fr/~jvert/svn/tutorials/data/breastcancerwang/xtrain.txt",row.names=1)
Y = read.table("http://members.cbio.mines-paristech.fr/~jvert/svn/tutorials/data/breastcancerwang/ytrain.txt")
X = t(X)
Y = Y$V1
dim(X)
## [1]  184 4654
length(Y)
## [1] 184

Propose and test different techniques to predict the relapse from gene expression data. Check the effect of parameters, estimate the performance.

library(fields)
## Loading required package: spam
## Loading required package: grid
## Spam version 1.4-0 (2016-08-29) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
## 
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve
## Loading required package: maps
image.plot(t(X))

Clustering and visualization

Hierarchical clustering allows to highlight groups in the patients samples.

library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.4.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
d1 = dist(X)
h1 = hclust(d1, "ward.D")
dend <- as.dendrogram(h1)
plot(dend)
colored_bars(as.numeric(Y)+2,dend,rowLabels="Y")

Heatplot may highlight groups of genes and groups of patients.

A zoom can be made on some genes highly correlated with the class Y.

rhoXY = NULL
for (k in 1:ncol(X)){
    rhoXY[k] = abs(cor(X[,k],as.numeric(Y)))
}
hist(rhoXY)

keep = which(abs(rhoXY)>.2)
d1 = dist(X[,keep])
h1 = hclust(d1, "ward.D")
d2 = dist(t(X[,keep]))
h2 = hclust(d2, "ward.D")
gplots::heatmap.2(as.matrix(X[,keep]), 
                  Rowv = h1,
                  Colv = h2,
                  trace = "none"
)

Classification

n = nrow(X) ; n.train = round(n*2/3)
data = data.frame(X)
data$Y = as.factor(Y)

library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-5
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:glmnet':
## 
##     auc
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
Seed = "3241"
n = length(Y)
n.train = 64
set.seed(Seed)
test <- pred <-NULL
B = 10
for (b in 1:B){
    i.TRAIN = sample(1:n,n.train) ; 
    i.TEST = setdiff(1:n,i.TRAIN) ; 
    test = c(test,i.TEST)
    rho = NULL
      for (k in 1:ncol(X)){
          rho[k] = abs(cor(X[i.TRAIN,k],as.numeric(Y[i.TRAIN])))
      }
    w = which(abs(rho)>.2)
    TRAIN = data.frame(X=scale(data[i.TRAIN,w]),Y=data$Y[i.TRAIN])
    Xtest = scale(data[i.TEST,w],center=apply(data[i.TRAIN,w],2,mean),scale=apply(data[i.TRAIN,w],2,sd))
    TEST = data.frame(X=Xtest,Y=data$Y[i.TEST])
    cv = cv.glmnet(as.matrix(TRAIN[,-dim(TRAIN)[2]]),TRAIN$Y,family="binomial",type.measure="auc",nfolds=3)
    mod.lr = glmnet(as.matrix(TRAIN[,-dim(TRAIN)[2]]),TRAIN$Y,lambda=cv$lambda.1se,family="binomial")
    pred = c(pred,predict(mod.lr,as.matrix(TEST[,-dim(TRAIN)[2]]),type="response"))
}

roc.lr = roc(data[test,dim(data)[2]],pred) 
roc.lr$auc
## Area under the curve: 0.5505
plot(roc.lr)
grid()