We have collected gene expression levels for 4654 genes on 184 early-stage breast cancer samples: xtrain.txt (each row is a gene, each column a sample). After surgical removal of the tumour, some unfortunately relapsed within 5 years (label=+1), while other did not (label=-1). The labels of the the 184 samples are available in the file ytrain.txt.
X = read.table("http://members.cbio.mines-paristech.fr/~jvert/svn/tutorials/data/breastcancerwang/xtrain.txt",row.names=1)
Y = read.table("http://members.cbio.mines-paristech.fr/~jvert/svn/tutorials/data/breastcancerwang/ytrain.txt")
X = t(X)
Y = Y$V1
dim(X)
## [1] 184 4654
length(Y)
## [1] 184
Propose and test different techniques to predict the relapse from gene expression data. Check the effect of parameters, estimate the performance.
library(fields)
## Loading required package: spam
## Loading required package: grid
## Spam version 1.4-0 (2016-08-29) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
## Loading required package: maps
image.plot(t(X))
Hierarchical clustering allows to highlight groups in the patients samples.
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.4.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
d1 = dist(X)
h1 = hclust(d1, "ward.D")
dend <- as.dendrogram(h1)
plot(dend)
colored_bars(as.numeric(Y)+2,dend,rowLabels="Y")
Heatplot may highlight groups of genes and groups of patients.
A zoom can be made on some genes highly correlated with the class Y.
rhoXY = NULL
for (k in 1:ncol(X)){
rhoXY[k] = abs(cor(X[,k],as.numeric(Y)))
}
hist(rhoXY)
keep = which(abs(rhoXY)>.2)
d1 = dist(X[,keep])
h1 = hclust(d1, "ward.D")
d2 = dist(t(X[,keep]))
h2 = hclust(d2, "ward.D")
gplots::heatmap.2(as.matrix(X[,keep]),
Rowv = h1,
Colv = h2,
trace = "none"
)
n = nrow(X) ; n.train = round(n*2/3)
data = data.frame(X)
data$Y = as.factor(Y)
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-5
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:glmnet':
##
## auc
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
Seed = "3241"
n = length(Y)
n.train = 64
set.seed(Seed)
test <- pred <-NULL
B = 10
for (b in 1:B){
i.TRAIN = sample(1:n,n.train) ;
i.TEST = setdiff(1:n,i.TRAIN) ;
test = c(test,i.TEST)
rho = NULL
for (k in 1:ncol(X)){
rho[k] = abs(cor(X[i.TRAIN,k],as.numeric(Y[i.TRAIN])))
}
w = which(abs(rho)>.2)
TRAIN = data.frame(X=scale(data[i.TRAIN,w]),Y=data$Y[i.TRAIN])
Xtest = scale(data[i.TEST,w],center=apply(data[i.TRAIN,w],2,mean),scale=apply(data[i.TRAIN,w],2,sd))
TEST = data.frame(X=Xtest,Y=data$Y[i.TEST])
cv = cv.glmnet(as.matrix(TRAIN[,-dim(TRAIN)[2]]),TRAIN$Y,family="binomial",type.measure="auc",nfolds=3)
mod.lr = glmnet(as.matrix(TRAIN[,-dim(TRAIN)[2]]),TRAIN$Y,lambda=cv$lambda.1se,family="binomial")
pred = c(pred,predict(mod.lr,as.matrix(TEST[,-dim(TRAIN)[2]]),type="response"))
}
roc.lr = roc(data[test,dim(data)[2]],pred)
roc.lr$auc
## Area under the curve: 0.5505
plot(roc.lr)
grid()