Prediction of ER+/ER- in breast cancer

The dataset used in this challenge contains gene expression data. The data and the methods are discribed by Hoshida (2010) (see http://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0015543&type=printable).

Load data

data = read.table("https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_for_other_methods.gct",skip=2,header=TRUE,sep = "\t")
class = read.table("https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_class_labels.cls",skip=2,header=FALSE,sep = " ")

library(fields)
## Loading required package: spam
## Loading required package: grid
## Spam version 1.4-0 (2016-08-29) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
## 
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve
## Loading required package: maps
image.plot(as.matrix(data[,3:ncol(data)]),zlim=range(data[,3:ncol(data)]))

X = t(data[,3:ncol(data)])
Y = t(class)

Clustering and visualization

Hierarchical clustering allows to highlight groups in the patients samples.

library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.4.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
d1 = dist(X)
h1 = hclust(d1, "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
dend <- as.dendrogram(h1)
plot(dend)
colored_bars(as.numeric(Y),dend,rowLabels="Y")

Heatplot allows to highlight groups of genes and groups of patients.

# Tracé des hauteurs de saut
barplot(h1$height) # On peut considérer qu'il y a 2 groupes.

d2 = dist(t(X))
h2 = hclust(d2, "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
gplots::heatmap.2(as.matrix(X), 
                  Rowv = h1,
                  Colv = h2,
                  trace = "none"
)

A zoom can be made on some genes highly correlated with the class ER+/ER-.

 rhoXY = NULL
for (k in 1:ncol(X)){
    rhoXY[k] = abs(cor(X[,k],as.numeric(Y)))
}
hist(rhoXY)

keep = which(abs(rhoXY)>.6)
d1 = dist(X[,keep])
h1 = hclust(d1, "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
d2 = dist(t(X[,keep]))
h2 = hclust(d2, "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
gplots::heatmap.2(as.matrix(X[,keep]), 
                  Rowv = h1,
                  Colv = h2,
                  trace = "none"
)

Classification

n = nrow(X) ; n.train = round(n*2/3)
data = data.frame(X)
data$Y = as.factor(Y)

library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-5
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:glmnet':
## 
##     auc
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
Seed = "3241"
n = length(Y)
n.train = 64
set.seed(Seed)
test <- pred <-NULL
B = 10
for (b in 1:B){
    i.TRAIN = sample(1:n,n.train) ; 
    i.TEST = setdiff(1:n,i.TRAIN) ; 
    test = c(test,i.TEST)
    rho = NULL
    for (k in 1:ncol(X)){
        rho[k] = abs(cor(X[i.TRAIN,k],as.numeric(Y[i.TRAIN])))
    }
    w = which(abs(rho)>.5)
    TRAIN = data.frame(X=scale(data[i.TRAIN,w]),Y=data$Y[i.TRAIN])
    Xtest = scale(data[i.TEST,w],center=apply(data[i.TRAIN,w],2,mean),scale=apply(data[i.TRAIN,w],2,sd))
    TEST = data.frame(X=Xtest,Y=data$Y[i.TEST])
    cv = cv.glmnet(as.matrix(TRAIN[,-dim(TRAIN)[2]]),TRAIN$Y,family="binomial",type.measure="auc",nfolds=3)
    mod.lr = glmnet(as.matrix(TRAIN[,-dim(TRAIN)[2]]),TRAIN$Y,lambda=cv$lambda.1se,family="binomial")
    pred = c(pred,predict(mod.lr,as.matrix(TEST[,-dim(TRAIN)[2]]),type="response"))
}

roc.lr = roc(data[test,dim(data)[2]],pred) 
roc.lr$auc
## Area under the curve: 0.7367
plot(roc.lr)
grid()

Validation with the test dataset