The dataset used in this challenge contains gene expression data. The data and the methods are discribed by Hoshida (2010) (see http://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0015543&type=printable).
data = read.table("https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_for_other_methods.gct",skip=2,header=TRUE,sep = "\t")
class = read.table("https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_class_labels.cls",skip=2,header=FALSE,sep = " ")
library(fields)
## Loading required package: spam
## Loading required package: grid
## Spam version 1.4-0 (2016-08-29) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
## Loading required package: maps
image.plot(as.matrix(data[,3:ncol(data)]),zlim=range(data[,3:ncol(data)]))
X = t(data[,3:ncol(data)])
Y = t(class)
Hierarchical clustering allows to highlight groups in the patients samples.
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.4.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
d1 = dist(X)
h1 = hclust(d1, "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
dend <- as.dendrogram(h1)
plot(dend)
colored_bars(as.numeric(Y),dend,rowLabels="Y")
Heatplot allows to highlight groups of genes and groups of patients.
# Tracé des hauteurs de saut
barplot(h1$height) # On peut considérer qu'il y a 2 groupes.
d2 = dist(t(X))
h2 = hclust(d2, "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
gplots::heatmap.2(as.matrix(X),
Rowv = h1,
Colv = h2,
trace = "none"
)
A zoom can be made on some genes highly correlated with the class ER+/ER-.
rhoXY = NULL
for (k in 1:ncol(X)){
rhoXY[k] = abs(cor(X[,k],as.numeric(Y)))
}
hist(rhoXY)
keep = which(abs(rhoXY)>.6)
d1 = dist(X[,keep])
h1 = hclust(d1, "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
d2 = dist(t(X[,keep]))
h2 = hclust(d2, "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
gplots::heatmap.2(as.matrix(X[,keep]),
Rowv = h1,
Colv = h2,
trace = "none"
)
n = nrow(X) ; n.train = round(n*2/3)
data = data.frame(X)
data$Y = as.factor(Y)
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-5
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:glmnet':
##
## auc
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
Seed = "3241"
n = length(Y)
n.train = 64
set.seed(Seed)
test <- pred <-NULL
B = 10
for (b in 1:B){
i.TRAIN = sample(1:n,n.train) ;
i.TEST = setdiff(1:n,i.TRAIN) ;
test = c(test,i.TEST)
rho = NULL
for (k in 1:ncol(X)){
rho[k] = abs(cor(X[i.TRAIN,k],as.numeric(Y[i.TRAIN])))
}
w = which(abs(rho)>.5)
TRAIN = data.frame(X=scale(data[i.TRAIN,w]),Y=data$Y[i.TRAIN])
Xtest = scale(data[i.TEST,w],center=apply(data[i.TRAIN,w],2,mean),scale=apply(data[i.TRAIN,w],2,sd))
TEST = data.frame(X=Xtest,Y=data$Y[i.TEST])
cv = cv.glmnet(as.matrix(TRAIN[,-dim(TRAIN)[2]]),TRAIN$Y,family="binomial",type.measure="auc",nfolds=3)
mod.lr = glmnet(as.matrix(TRAIN[,-dim(TRAIN)[2]]),TRAIN$Y,lambda=cv$lambda.1se,family="binomial")
pred = c(pred,predict(mod.lr,as.matrix(TEST[,-dim(TRAIN)[2]]),type="response"))
}
roc.lr = roc(data[test,dim(data)[2]],pred)
roc.lr$auc
## Area under the curve: 0.7367
plot(roc.lr)
grid()