## Leukemia Gene Expression
gene = read.csv("~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/DATASET/Leukemia/data_set_ALL_AML_train.csv",header=TRUE,sep="\t")
ncol(gene)
nrow(gene)
# expression values sit in every other column from column 3 on; transpose so samples are rows
X = t(gene[,seq(3,ncol(gene),by=2)])
Y = as.factor(c(rep(1,27),rep(2,11)))  # 1 = ALL (27 samples), 2 = AML (11 samples)
# marginal correlation of each gene with the class labels
CXY = NULL
for (k in 1:ncol(X)){
  CXY[k] = abs(cor(X[,k],as.numeric(Y)))
}
hist(CXY)
library(fields)  # for image.plot()
image.plot(1:38,1:ncol(X),scale(X),xlab="Samples",ylab="Genes Expression",zlim=range(scale(X)))
abline(v=27.5)
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenes.eps",height=8,width=6)
w = which(CXY>.6)  # genes most correlated with the ALL/AML labels
image.plot(1:38,1:length(w),scale(X[,w]),xlab="Samples",ylab="Genes",zlim=range(scale(X)))
abline(v=27.5)
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenes_HighCor.eps",height=8,width=6)

# PCA
library(FactoMineR)
pca = PCA(X,graph=FALSE)
plot(pca$ind$coord[,1:2],col=Y,pch=20,
     xlab=paste("PC 1,",round(pca$eig[1,2],2),"%"),
     ylab=paste("PC 2,",round(pca$eig[2,2],2),"%"))
legend("topright",col=1:2,legend=c("ALL","AML"),pch=20)
#dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenesPCA.eps",height=6,width=6)
pca = PCA(X[,w],graph=FALSE)  # PCA on the selected genes only
plot(pca$ind$coord[,1:2],col=Y,pch=20,
     xlab=paste("PC 1,",round(pca$eig[1,2],2),"%"),
     ylab=paste("PC 2,",round(pca$eig[2,2],2),"%"))
legend("topright",col=1:2,legend=c("ALL","AML"),pch=20)

# MDS
Xs = scale(X)
# d = dist(Xs,method="euclidean")
# fit <- cmdscale(d,eig=TRUE, k=2)
# plot(fit$points[,1:2],pch=20, col=Y,xlab="Axis 1",ylab="Axis 2")
# legend("topright",col=1:2,legend=c("ALL","AML"),pch=20)
# library(kernlab)
# kpc <- kpca(~.,data=X,kernel="rbfdot",kpar=list(sigma=0.005),features=4)
# plot(kpc@rotated[,1:2],col=Y,pch=20,
#      xlab=paste("PC 1,", "%"),
#      ylab=paste("PC 2,", "%"))
# legend("bottomleft",col=1:2,legend=c("ALL","AML"),pch=20)
# plot(kpc@rotated[,3:4],col=Y,pch=20,
#      xlab=paste("PC 3,", "%"),
#      ylab=paste("PC 4,", "%"))
# legend("bottomright",col=1:2,legend=c("ALL","AML"),pch=20)
#install.packages("vegan")
library(vegan)
#dis = vegdist(X,"cao")  # alternative dissimilarity, abandoned for the euclidean one
dis = vegdist(X,"euclidean")
plot(cmdscale(dis),col=Y,pch=20)
legend("bottomright",col=1:2,legend=c("ALL","AML"),pch=20)
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenesMDS_euclidean.eps",height=6,width=6)
# Isomap
iso = isomap(dis,k=10)
plot(iso,col=Y,pch=20)  # ??? the default plot draws the neighbourhood network; see the sketch below
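# About the "???" above: plot.isomap draws the neighbourhood network rather than
# class-coloured points. A minimal sketch (assuming the vegan isomap object
# stores its ordination scores in $points, as current vegan does) to colour the
# samples by class:
plot(iso$points[,1:2],col=Y,pch=20,xlab="Dim 1",ylab="Dim 2")
legend("bottomright",col=1:2,legend=c("ALL","AML"),pch=20)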
# tSNE
# install.packages("tsne")
library(tsne)
tsne = tsne(Xs,perplexity = 3)
plot(tsne,col=Y,pch=20)
# install.packages("Rtsne")
library(Rtsne)
# Rtsne requires 3*perplexity < n-1 (n = 38 here), hence the small perplexity;
# check_duplicates is disabled instead of dropping duplicated rows, so that w
# keeps pointing at the high-correlation genes selected above
r_tsne = Rtsne(Xs,perplexity = 3,check_duplicates = FALSE)
plot(r_tsne$Y,col=Y,pch=20)

# =======================================================
# Clustering
Xs = scale(X)
#d = dist(Xs,method="euclidean")
dis = vegdist(X[,w],"euclidean")  # w = genes with |cor| > .6
hc = hclust(dis,"ward.D2")
plot(hc)
library(dendextend)
dend <- as.dendrogram(hc)
plot(dend)
colored_bars(as.numeric(Y),dend)
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenesDendogram_euclidean.eps",height=6,width=6)
nbcl = 2  # more pure clusters with 3 or 4 classes than with 2
km = kmeans(Xs[,w],centers=nbcl)
for (k in 1:nbcl){
  print(paste("Cluster",k))
  print(table(Y[which(km$cluster==k)]))
}
#plot(r_tsne$Y,col=Y,pch=20)
#points(r_tsne$Y,col=km$cluster)
plot(dend)
# pass both label vectors as a matrix so the bars share the dendrogram ordering
colored_bars(cbind(as.numeric(Y),km$cluster),dend,rowLabels=c("Y","kmeans"))
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenesDendogram_euclidean_km.eps",height=8,width=5)
# k-means initialised with the hierarchical-cluster means
cl = cutree(hc,nbcl)
centers = matrix(NA,nbcl,dim(X)[2])
for (k in 1:nbcl){
  wcl = which(cl==k)
  centers[k,] = apply(X[wcl,],2,mean)
}
km = kmeans(X,centers=centers)
plot(dend)
colored_bars(cbind(as.numeric(Y),km$cluster),dend,rowLabels=c("Y","kmeans"))
library(mclust)
gmm = Mclust(Xs[,w])
plot(gmm,"BIC")
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenesDendogram_GMM_BIC.eps",height=5,width=5)
plot(dend)
colored_bars(cbind(as.numeric(Y),gmm$classification),dend,rowLabels=c("Y","GMM"))
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenesDendogram_GMM.eps",height=8,width=5)
gmm = Mclust(X[,w])  # same model on the unscaled selected genes
plot(gmm,"BIC")
plot(dend)
colored_bars(cbind(as.numeric(Y),gmm$classification),dend,rowLabels=c("Y","GMM"))
# all genes, scaled
nbcl = 2
km = kmeans(Xs,centers=nbcl)
plot(dend)
colored_bars(cbind(as.numeric(Y),km$cluster),dend,rowLabels=c("Y","kmeans"))
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenesDendogram_all_km.eps",height=8,width=5)
gmm = Mclust(Xs)
plot(gmm,"BIC")
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenesDendogram_all_GMM_BIC.eps",height=5,width=5)
plot(dend)
colored_bars(cbind(as.numeric(Y),3-gmm$classification),dend,rowLabels=c("Y","GMM"))  # 3-. flips the two labels to match the colours
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenesDendogram_all_GMM.eps",height=8,width=5)
# all genes, unscaled (note: the file names below overwrite the figures just above)
nbcl = 2
km = kmeans(X,centers=nbcl)
plot(dend)
colored_bars(cbind(as.numeric(Y),km$cluster),dend,rowLabels=c("Y","kmeans"))
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenesDendogram_all_km.eps",height=8,width=5)
gmm = Mclust(X)
plot(gmm,"BIC")
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenesDendogram_all_GMM_BIC.eps",height=5,width=5)
plot(dend)
colored_bars(cbind(as.numeric(Y),3-gmm$classification),dend,rowLabels=c("Y","GMM"))
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenesDendogram_all_GMM.eps",height=8,width=5)
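# Sketch: quantify the agreement between the clusterings and the true labels
# with the adjusted Rand index (mclust is already loaded; 0 = chance level,
# 1 = perfect agreement). Shown for the last k-means/GMM fits and the tree:
adjustedRandIndex(Y,km$cluster)
adjustedRandIndex(Y,gmm$classification)
adjustedRandIndex(Y,cutree(hc,2))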
# =======================================================
# CLASSIFICATION
# =======================================================

# K nearest neighbors ===================================
library(class)
library(pROC)
Seed = 1223  # set.seed() needs an integer, not the string "1223"
set.seed(Seed)
n = nrow(X) ; n.train = round(n*2/3)
data = data.frame(cbind(X,Y))
data$Y = as.factor(data$Y)
# knn() reports the vote share of the *winning* class; convert it to the
# probability of class "2" (AML) so that ROC curves are meaningful
knn.prob <- function(train,test,cl,k){
  fit = knn(train,test,cl,k=k,prob=TRUE)
  p = attributes(fit)$prob
  ifelse(fit=="2",p,1-p)
}
set.seed(Seed)
test <- prob1 <- prob3 <- prob5 <- prob7 <- prob9 <- NULL
B = 100
for (b in 1:B){
  i.TRAIN = sample(1:n,n.train) ; i.TEST = setdiff(1:n,i.TRAIN) ; test = c(test,i.TEST)
  # select the genes most correlated with the labels, on the training fold only
  rho = NULL
  for (k in 1:ncol(X)){
    rho[k] = abs(cor(X[i.TRAIN,k],as.numeric(Y[i.TRAIN])))
  }
  w = which(abs(rho)>.5)
  TRAIN = data[i.TRAIN,c(w,dim(data)[2])]
  TEST = data[i.TEST,c(w,dim(data)[2])]
  prob1 = c(prob1,knn.prob(TRAIN[,-(length(w)+1)],TEST[,-(length(w)+1)],TRAIN$Y,1))
  prob3 = c(prob3,knn.prob(TRAIN[,-(length(w)+1)],TEST[,-(length(w)+1)],TRAIN$Y,3))
  prob5 = c(prob5,knn.prob(TRAIN[,-(length(w)+1)],TEST[,-(length(w)+1)],TRAIN$Y,5))
  prob7 = c(prob7,knn.prob(TRAIN[,-(length(w)+1)],TEST[,-(length(w)+1)],TRAIN$Y,7))
  prob9 = c(prob9,knn.prob(TRAIN[,-(length(w)+1)],TEST[,-(length(w)+1)],TRAIN$Y,9))  # was knn7, a copy-paste bug
}
Ytest = data[test,dim(data)[2]]
Ychar = NULL
Ychar[Ytest==levels(Ytest)[1]] = "ALL"  # level 1 is ALL, level 2 is AML
Ychar[Ytest=="2"] = "AML"
boxplot(prob7~Ychar,ylab="Predicted probability of AML (k=7)")
#dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenes_knn_boxplot.eps",height=6,width=6)
roc5 = roc(data[test,dim(data)[2]],prob5)
plot(roc5)
roc9 = roc(data[test,dim(data)[2]],prob9)
plot(roc9,add=TRUE,col="gray")
roc7 = roc(data[test,dim(data)[2]],prob7)
plot(roc7,add=TRUE,col="blue",lty=2)
legend("bottomright",legend=c("k=5","k=7","k=9"),lty=c(1,2,1),col=c("black","blue","gray"))
#dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenes_knn_ROC.eps",height=6,width=6)

# knn and PCA
set.seed(Seed)
test <- prob1 <- prob3 <- prob5 <- prob7 <- prob9 <- NULL
B = 100
for (b in 1:B){
  i.TRAIN = sample(1:n,n.train) ; i.TEST = setdiff(1:n,i.TRAIN) ; test = c(test,i.TEST)
  # no gene pre-filtering here: PCA is run on all genes, with the test fold
  # projected as supplementary individuals
  pca = PCA(data[,-dim(data)[2]],ind.sup = i.TEST,graph = FALSE,ncp=20)
  # pca$ind$coord keeps the original row order, so the training labels must be
  # taken as data$Y[-i.TEST], not in the permuted i.TRAIN order
  prob1 = c(prob1,knn.prob(pca$ind$coord,pca$ind.sup$coord,data$Y[-i.TEST],1))
  prob3 = c(prob3,knn.prob(pca$ind$coord,pca$ind.sup$coord,data$Y[-i.TEST],3))
  prob5 = c(prob5,knn.prob(pca$ind$coord,pca$ind.sup$coord,data$Y[-i.TEST],5))
  prob7 = c(prob7,knn.prob(pca$ind$coord,pca$ind.sup$coord,data$Y[-i.TEST],7))
  prob9 = c(prob9,knn.prob(pca$ind$coord,pca$ind.sup$coord,data$Y[-i.TEST],9))
}
roc5 = roc(data[test,dim(data)[2]],prob5)
plot(roc5)
roc9 = roc(data[test,dim(data)[2]],prob9)
plot(roc9,add=TRUE,col="gray")
roc7 = roc(data[test,dim(data)[2]],prob7)
plot(roc7,add=TRUE,col="blue",lty=2)
legend("bottomright",legend=c("k=5","k=7","k=9"),lty=c(1,2,1),col=c("black","blue","gray"))
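# Sketch: tabulate the test AUCs of the three k values plotted above
# (roc5/roc7/roc9 come from the PCA-based run just before this point)
data.frame(k = c(5,7,9),AUC = round(c(roc5$auc,roc7$auc,roc9$auc),3))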
# =======================================================
# Decision trees ========================================
library(rpart)
n.train = 34
set.seed(Seed)
test <- pred <- NULL
B = 100
for (b in 1:B){
  i.TRAIN = sample(1:n,n.train) ; i.TEST = setdiff(1:n,i.TRAIN) ; test = c(test,i.TEST)
  rho = NULL
  for (k in 1:ncol(X)){
    rho[k] = abs(cor(X[i.TRAIN,k],as.numeric(Y[i.TRAIN])))
  }
  w = which(abs(rho)>.7)
  TRAIN = data.frame(X=scale(data[i.TRAIN,w]),Y=data$Y[i.TRAIN])
  # scale the test fold with the training means and standard deviations
  Xtest = scale(data[i.TEST,w],center=apply(data[i.TRAIN,w],2,mean),scale=apply(data[i.TRAIN,w],2,sd))
  TEST = data.frame(X=Xtest,Y=data$Y[i.TEST])
  rp = rpart(Y~.,data=TRAIN,control=rpart.control(minsplit = 4,cp=0.0001))
  pred = c(pred,predict(rp,TEST,type="prob")[,1])
}
roc.rp = roc(data[test,dim(data)[2]],pred)
plot(roc.rp)
grid()
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenes_tree_ROC.eps",height=6,width=6)
plot(rp)  # last tree of the loop
text(rp)

# =======================================================
# LDA ===================================================
library(MASS)
n.train = 34
set.seed(Seed)
test <- pred <- NULL
B = 100
for (b in 1:B){
  i.TRAIN = sample(1:n,n.train) ; i.TEST = setdiff(1:n,i.TRAIN) ; test = c(test,i.TEST)
  rho = NULL
  for (k in 1:ncol(X)){
    rho[k] = abs(cor(X[i.TRAIN,k],as.numeric(Y[i.TRAIN])))
  }
  w = which(abs(rho)>.5)
  TRAIN = data.frame(X=scale(data[i.TRAIN,w]),Y=data$Y[i.TRAIN])
  Xtest = scale(data[i.TEST,w],center=apply(data[i.TRAIN,w],2,mean),scale=apply(data[i.TRAIN,w],2,sd))
  TEST = data.frame(X=Xtest,Y=data$Y[i.TEST])
  mod.lda = lda(Y~.,data = TRAIN)
  pred = c(pred,predict(mod.lda,TEST)$posterior[,1])
}
roc.lda = roc(data[test,dim(data)[2]],pred)
plot(roc.lda)
grid()
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenes_lda_ROC.eps",height=6,width=6)

# sparse LDA
library(sparseLDA)
lambda.seq = c(1e-4,1e-3,1e-2,1e-1,1,10)
n.train = 34
set.seed(Seed)
test <- pred <- NULL
B = 100
for (b in 1:B){
  i.TRAIN = sample(1:n,n.train) ; i.TEST = setdiff(1:n,i.TRAIN) ; test = c(test,i.TEST)
  TRAIN = data.frame(X=scale(data[i.TRAIN,-dim(data)[2]]),Y=data$Y[i.TRAIN])
  # drop the Y column before scaling (it was mistakenly kept in)
  Xtest = scale(data[i.TEST,-dim(data)[2]],center=apply(data[i.TRAIN,-dim(data)[2]],2,mean),scale=apply(data[i.TRAIN,-dim(data)[2]],2,sd))
  TEST = data.frame(X=Xtest,Y=data$Y[i.TEST])
  mod.slda = sda(TRAIN[,-dim(data)[2]],TRAIN$Y,lambda=lambda.seq[1]) # choose lambda... (a selection sketch follows)
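  # Sketch of a lambda choice at this exact step (assumption: predict.sda
  # returns a $class component alongside the $posterior used below): refit on
  # each candidate in lambda.seq and keep the value with the fewest training
  # errors. Proper cross-validation would be preferable but is costly inside
  # 100 replications.
  err = sapply(lambda.seq,function(l){
    m = sda(TRAIN[,-dim(data)[2]],TRAIN$Y,lambda=l)
    mean(predict(m,TRAIN[,-dim(data)[2]])$class != TRAIN$Y)
  })
  mod.slda = sda(TRAIN[,-dim(data)[2]],TRAIN$Y,lambda=lambda.seq[which.min(err)])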
  pred = c(pred,predict(mod.slda,TEST[,-dim(data)[2]])$posterior[,1])  # drop Y from the new data
}
roc.slda = roc(data[test,dim(data)[2]],pred)
plot(roc.slda)
grid()

# =======================================================
# Logistic regression ===================================
n.train = 34
set.seed(Seed)
test <- pred <- NULL
B = 100
for (b in 1:B){
  i.TRAIN = sample(1:n,n.train) ; i.TEST = setdiff(1:n,i.TRAIN) ; test = c(test,i.TEST)
  rho = NULL
  for (k in 1:ncol(X)){
    rho[k] = abs(cor(X[i.TRAIN,k],as.numeric(Y[i.TRAIN])))
  }
  w = which(abs(rho)>.5)
  TRAIN = data.frame(X=scale(data[i.TRAIN,w]),Y=data$Y[i.TRAIN])
  Xtest = scale(data[i.TEST,w],center=apply(data[i.TRAIN,w],2,mean),scale=apply(data[i.TRAIN,w],2,sd))
  TEST = data.frame(X=Xtest,Y=data$Y[i.TEST])
  mod.lr = glm(Y~.,data = TRAIN, family="binomial")
  pred = c(pred,predict(mod.lr,TEST,type="response"))
}
roc.lr = roc(data[test,dim(data)[2]],pred) # Area under the curve: 0.5846
plot(roc.lr)
grid()
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenes_lr_ROC.eps",height=6,width=6)

# Lasso logistic regression
library(glmnet)
n.train = 34
set.seed(Seed)
test <- pred <- NULL
B = 100
for (b in 1:B){
  i.TRAIN = sample(1:n,n.train) ; i.TEST = setdiff(1:n,i.TRAIN) ; test = c(test,i.TEST)
  rho = NULL
  for (k in 1:ncol(X)){
    rho[k] = abs(cor(X[i.TRAIN,k],as.numeric(Y[i.TRAIN])))
  }
  w = which(abs(rho)>.5)
  TRAIN = data.frame(X=scale(data[i.TRAIN,w]),Y=data$Y[i.TRAIN])
  Xtest = scale(data[i.TEST,w],center=apply(data[i.TRAIN,w],2,mean),scale=apply(data[i.TRAIN,w],2,sd))
  TEST = data.frame(X=Xtest,Y=data$Y[i.TEST])
  cv = cv.glmnet(as.matrix(TRAIN[,-dim(TRAIN)[2]]),TRAIN$Y,family="binomial",type.measure="auc",nfolds=3)
  mod.lr = glmnet(as.matrix(TRAIN[,-dim(TRAIN)[2]]),TRAIN$Y,lambda=cv$lambda.1se,family="binomial")
  pred = c(pred,predict(mod.lr,as.matrix(TEST[,-dim(TRAIN)[2]]),type="response"))
}
roc.lr = roc(data[test,dim(data)[2]],pred) # Area under the curve: 0.904
plot(roc.lr)
grid()
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenes_LASSOlr_ROC.eps",height=6,width=6)

# =======================================================
# Artificial neural network =============================
# http://www.rblog.uni-freiburg.de/2017/02/07/deep-learning-in-r/
library(neuralnet)
n.train = 34
set.seed(Seed)
test <- NULL
pred = matrix(0,100*4,9)  # 100 replications x 4 test samples, 9 architectures
name.ann = list()
B = 100
for (b in 1:B){
  i.TRAIN = sample(1:n,n.train) ; i.TEST = setdiff(1:n,i.TRAIN) ; test = c(test,i.TEST)
  rho = NULL
  for (k in 1:ncol(X)){
    rho[k] = abs(cor(X[i.TRAIN,k],as.numeric(Y[i.TRAIN])))
  }
  w = which(abs(rho)>.5)
  TRAIN = data.frame(X=scale(data[i.TRAIN,w]),Y=as.numeric(data$Y[i.TRAIN])-1)
  Xtest = scale(data[i.TEST,w],center=apply(data[i.TRAIN,w],2,mean),scale=apply(data[i.TRAIN,w],2,sd))
  TEST = data.frame(X=Xtest,Y=as.numeric(data$Y[i.TEST])-1)
  N = names(TRAIN)[-ncol(TRAIN)]
  f <- as.formula(paste("Y ~", paste(N, collapse = " + ")))
  # try all architectures with up to two hidden layers of at most 3 neurons
  cnt = 1
  for (i1 in 1:3) {
    ann = neuralnet(f,data = TRAIN,hidden=c(i1),linear.output=FALSE)
    pred[((b-1)*nrow(TEST)+1):(b*nrow(TEST)),cnt] = compute(ann,TEST[,-ncol(TEST)])$net.result
    name.ann[[cnt]] = c(i1,0)
    cnt = cnt+1
    for (i2 in 1:i1) {
      ann = neuralnet(f,data = TRAIN,hidden=c(i1,i2),linear.output=FALSE)
      pred[((b-1)*nrow(TEST)+1):(b*nrow(TEST)),cnt] = compute(ann,TEST[,-ncol(TEST)])$net.result
      name.ann[[cnt]] = c(i1,i2)
      cnt = cnt+1
    }
  }
}
auc = list()
cnt = 1
for (i1 in 1:3){
  for (j1 in 0:i1) {
    roc.ann = roc(data[test,dim(data)[2]],pred[,cnt])
    auc[[cnt]] = roc.ann$auc
    cnt = cnt+1
  }
}
auc
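# Sketch: pair each hidden-layer architecture with its test AUC for easier
# reading (name.ann and auc were filled in the loops above)
data.frame(hidden = sapply(name.ann,paste,collapse="-"),
           AUC = round(unlist(auc),3))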
roc.ann = roc(data[test,dim(data)[2]],pred[,9]) # Area under the curve: 0.904
plot(roc.ann)
grid()
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenes_ANN_ROC.eps",height=6,width=6)
roc.ann$auc

# =======================================================
# Bagging ===============================================
library(adabag)
library(caret)  # for varImp()
# Patched copy of adabag::bagging (the stock version had bugs here): uniform
# weights, and the bootstrap is redrawn whenever a resample yields an empty tree.
bagging.new <- function (formula, data, mfinal = 100, control, ...) {
  formula <- as.formula(formula)
  vardep <- data[, as.character(formula[[2]])]
  n <- length(data[, 1])
  nclases <- nlevels(vardep)
  arboles <- list()
  replicas <- array(0, c(n, mfinal))
  arboles[[1]] <- rpart(formula, data = data, control = rpart.control(minsplit = 2, cp = -1, maxdepth = 30))
  nvar <- dim(varImp(arboles[[1]], surrogates = FALSE, competes = FALSE))[1]
  imp <- array(0, c(mfinal, nvar))
  for (m in 1:mfinal) {
    k2 <- 1
    while (k2 == 1) {  # redraw until the tree has at least one split
      boostrap <- sample(1:n, replace = TRUE)
      fit <- rpart(formula, data = data[boostrap, ], control = control)
      k2 <- length(fit$frame$var)
    }
    arboles[[m]] <- fit
    replicas[, m] <- boostrap
    k <- varImp(arboles[[m]], surrogates = FALSE, competes = FALSE)
    imp[m, ] <- k[sort(row.names(k)), ]
  }
  pred <- as.data.frame(sapply(arboles, predict, newdata = data, type = "class"))  # fixed: newdata, not data
  classfinal <- array(0, c(n, nlevels(vardep)))
  for (i in 1:nlevels(vardep)) {
    classfinal[, i] <- matrix(as.numeric(pred == levels(vardep)[i]), nrow = n) %*% rep(1, mfinal)
  }
  # majority vote (replaces the unexported adabag helper used originally)
  predclass <- levels(vardep)[apply(classfinal, 1, which.max)]
  pond <- rep(1, mfinal)
  imppond <- as.vector(as.vector(pond) %*% imp)
  imppond <- imppond/sum(imppond) * 100
  names(imppond) <- sort(row.names(k))
  votosporc <- classfinal/apply(classfinal, 1, sum)
  ans <- list(formula = formula, trees = arboles, votes = classfinal, prob = votosporc,
              class = predclass, samples = replicas, importance = imppond)
  attr(ans, "vardep.summary") <- summary(vardep, maxsum = 700)
  mf <- model.frame(formula = formula, data = data)
  ans$terms <- attr(mf, "terms")
  ans$call <- match.call()
  class(ans) <- "bagging"
  ans
}
n.train = 34
set.seed(Seed)
test <- NULL
pred = NULL
B = 100
for (b in 1:B){
  i.TRAIN = sample(1:n,n.train) ; i.TEST = setdiff(1:n,i.TRAIN) ; test = c(test,i.TEST)
  rho = NULL
  for (k in 1:ncol(X)){
    rho[k] = abs(cor(X[i.TRAIN,k],as.numeric(Y[i.TRAIN])))
  }
  w = which(abs(rho)>.6)
  TRAIN = data.frame(X=scale(data[i.TRAIN,w]),Y=data$Y[i.TRAIN])
  Xtest = scale(data[i.TEST,w],center=apply(data[i.TRAIN,w],2,mean),scale=apply(data[i.TRAIN,w],2,sd))
  TEST = data.frame(X=Xtest,Y=data$Y[i.TEST])
  N = names(TRAIN)[-ncol(TRAIN)]
  f <- as.formula(paste("Y ~", paste(N, collapse = " + ")))
  bag = bagging.new(f,data=TRAIN,mfinal=2,control=rpart.control(maxdepth=5, minsplit=2,cp=0.0001))
  pred = c(pred,predict.bagging(bag,TEST)$prob[,1])  # keep P(class 1); predict.bagging has no type="response"
}
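# Sketch: evaluate the bagged trees like the other classifiers
# (pred holds P(class = 1) for every held-out sample)
roc.bag = roc(data[test,dim(data)[2]],pred)
plot(roc.bag)
grid()
roc.bag$auc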
# =======================================================
# Gradient Boosting =====================================
# https://www.analyticsvidhya.com/blog/2015/09/complete-guide-boosting-methods/
library(gbm)
n.train = 34
set.seed(Seed)
test <- NULL
n.trees.tab = c(100,250,500,1000)
pred = matrix(NA,400,length(n.trees.tab))  # 100 replications x 4 test samples
B = 100
for (b in 1:B){
  i.TRAIN = sample(1:n,n.train) ; i.TEST = setdiff(1:n,i.TRAIN) ; test = c(test,i.TEST)
  rho = NULL
  for (k in 1:ncol(X)){
    rho[k] = abs(cor(X[i.TRAIN,k],as.numeric(Y[i.TRAIN])))
  }
  w = which(abs(rho)>.5)
  TRAIN = data.frame(X=scale(data[i.TRAIN,w]),Y=as.numeric(data$Y[i.TRAIN])-1)
  Xtest = scale(data[i.TEST,w],center=apply(data[i.TRAIN,w],2,mean),scale=apply(data[i.TRAIN,w],2,sd))
  TEST = data.frame(X=Xtest,Y=as.numeric(data$Y[i.TEST])-1)
  N = names(TRAIN)[-ncol(TRAIN)]
  f <- as.formula(paste("Y ~", paste(N, collapse = " + ")))
  gbm1 = gbm(f,data=TRAIN,distribution="bernoulli",n.trees=1000,n.minobsinnode = 4,n.cores=2)
  # evaluate the same fit truncated at each ensemble size
  cnt = 0
  for (n.t in n.trees.tab){
    cnt = cnt+1
    pred[((b-1)*nrow(TEST)+1):(b*nrow(TEST)),cnt] = predict(gbm1,TEST,n.trees=n.t,type="response")
  }
}
auc = NULL
roc.gbm = roc(data[test,dim(data)[2]],pred[,1])
plot(roc.gbm,lwd=1.5)
auc[1] = roc.gbm$auc
leg = paste("n.trees =", n.trees.tab[1],", AUC =", round(auc[1]*1000)/1000)
for (k in 2:ncol(pred)){
  roc.gbm = roc(data[test,dim(data)[2]],pred[,k])
  lines(roc.gbm,col=k,lwd=1.5)
  auc[k] = roc.gbm$auc
  leg = c(leg,paste("n.trees =", n.trees.tab[k],", AUC =", round(auc[k]*1000)/1000))
}
grid()
roc.gbm$auc
#legend("bottomright",legend=leg,text.col=1:length(n.trees.tab))
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenes_GBM_ROC.eps",height=6,width=6)

# caret-based tuning of the gbm on the last training fold
library(caret)
fitControl <- trainControl(method = "repeatedcv", number = 4, repeats = 4)
set.seed(33)
gbmFit1 <- train(as.factor(Y) ~ ., data = TRAIN, method = "gbm", trControl = fitControl, verbose = FALSE)

# =======================================================
# SVM ===================================================
library(e1071)
n.train = 34
set.seed(Seed)
test <- pred <- NULL
B = 100
for (b in 1:B){
  i.TRAIN = sample(1:n,n.train) ; i.TEST = setdiff(1:n,i.TRAIN) ; test = c(test,i.TEST)
  rho = NULL
  for (k in 1:ncol(X)){
    rho[k] = abs(cor(X[i.TRAIN,k],as.numeric(Y[i.TRAIN])))
  }
  w = which(abs(rho)>.5)
  TRAIN = data.frame(X=scale(data[i.TRAIN,w]),Y=data$Y[i.TRAIN])
  Xtest = scale(data[i.TEST,w],center=apply(data[i.TRAIN,w],2,mean),scale=apply(data[i.TRAIN,w],2,sd))
  TEST = data.frame(X=Xtest,Y=data$Y[i.TEST])
  mod.svm = svm(Y~.,data = TRAIN,probability=TRUE,gamma=1e-5,cost=100)
  pr = predict(mod.svm,TEST,probability=TRUE)
  pred = c(pred,attr(pr,"probabilities")[,"2"])  # keep P(Y = "2"), not the class labels
}
obj <- tune("svm", Y~.,data = TRAIN,
            ranges = list(gamma = c(1e-5,1e-3,1e-1), cost = c(0.01,0.1,1,10,100,1000)),
            tunecontrol = tune.control(sampling = "boot"))
roc.svm = roc(data[test,dim(data)[2]],pred)
roc.svm$auc # Area under the curve: 0.9296 (gamma = 1e-3, C = 100)
plot(roc.svm)
grid()
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/Figures/LeukemiaGenes_svm_ROC.eps",height=6,width=6)

# =======================================================
# Random Forest =========================================
library(randomForest)
n.train = 34
set.seed(Seed)
test <- pred <- NULL
B = 100
for (b in 1:B){
  i.TRAIN = sample(1:n,n.train) ; i.TEST = setdiff(1:n,i.TRAIN) ; test = c(test,i.TEST)
  rho = NULL
  for (k in 1:ncol(X)){
    rho[k] = abs(cor(X[i.TRAIN,k],as.numeric(Y[i.TRAIN])))
  }
  w = which(abs(rho)>.5)
  TRAIN = data.frame(X=scale(data[i.TRAIN,w]),Y=data$Y[i.TRAIN])
  Xtest = scale(data[i.TEST,w],center=apply(data[i.TRAIN,w],2,mean),scale=apply(data[i.TRAIN,w],2,sd))
  TEST = data.frame(X=Xtest,Y=data$Y[i.TEST])
  rf = randomForest(Y~.,data = TRAIN)  # fixed: was radomForest(); probabilities come from predict(type="prob")
  pred = c(pred,predict(rf,TEST,type="prob")[,1])
}
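# Sketch: ROC for the random forest, mirroring the sections above
# (pred holds P(class = 1) from predict(..., type = "prob"))
roc.rf = roc(data[test,dim(data)[2]],pred)
plot(roc.rf)
grid()
roc.rf$auc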
# =======================================================
# Model comparison with caret ===========================
# =======================================================
library(MASS)
library(caret)
# variant of caret::twoClassSummary (defined for reference; not used in the
# train() calls below; callers must supply lev = levels(obs))
twoClassSum = function (data, lev = .5, model = NULL) {
  lvls <- levels(data$obs)
  if (length(lvls) > 2)
    stop(paste("Your outcome has", length(lvls), "levels. The twoClassSummary() function isn't appropriate."))
  #requireNamespaceQuietStop("ModelMetrics")
  if (!all(levels(data[, "pred"]) == lvls))
    stop("levels of observed and predicted data do not match")
  data$y = as.numeric(data$obs == lvls[2])
  lvls = as.numeric(lvls)
  rocAUC <- ModelMetrics::auc(ifelse(data$obs == lev[2], 0, 1), data[, lvls[1]])
  out <- c(rocAUC,
           sensitivity(data[, "pred"], data[, "obs"], lev[1]),
           specificity(data[, "pred"], data[, "obs"], lev[2]))
  names(out) <- c("ROC", "Sens", "Spec")
  out
}
ErrsCaret <- function(Model, Name, Pred) {
  data.frame(t(postResample(obs = TEST$Y, pred = Pred)),  # t() gives Accuracy/Kappa columns
             Resample = "None", model = Name)
  #rbind(Errs, data.frame(Model$resample, model = Name))
}
CaretLearn <- function (Errs, Name, Formula, Method, ...) {
  set.seed(Seed)
  Model <- train(as.formula(Formula), data = TRAIN, method = Method, ...)
  Pred <- predict(Model, newdata = TEST)
  rbind(Errs, ErrsCaret(Model, Name, Pred))
}
Seed = 1223
set.seed(Seed)
n = nrow(X) ; n.train = round(n*2/3)
data = data.frame(cbind(X[,w],Y))  # w = genes selected in the last resampling above
data$Y = as.factor(data$Y)
i.TRAIN = sample(1:n,n.train) ; TRAIN = data[i.TRAIN,]
i.TEST = setdiff(1:n,i.TRAIN) ; TEST = data[i.TEST,]
Errs = data.frame()
Errs <- CaretLearn(Errs, "Linear Discriminant Analysis", "Y ~ .", "lda")
Errs <- CaretLearn(Errs, "Logistic Regression", "Y ~ .", "glm")
Errs <- CaretLearn(Errs, "Support Vector Machine", "Y ~ .", "svmLinear")
Errs <- CaretLearn(Errs, "Support Vector Machine with polynomial kernel", "Y ~ .", "svmPoly")
Errs <- CaretLearn(Errs, "Support Vector Machine with Gaussian kernel", "Y ~ .", "svmRadial")
Errs <- CaretLearn(Errs, "Quadratic Discriminant Analysis", "Y ~ .", "qda")
Errs <- CaretLearn(Errs, "Naive Bayes with Gaussian model", "Y ~ .", "nb",
                   tuneGrid = data.frame(usekernel = FALSE, fL = 0, adjust = 1))  # caret's nb grid also needs adjust
Errs <- CaretLearn(Errs, "k Nearest Neighbor", "Y ~ .", "knn", tuneGrid = data.frame(k = c(1,3,5)))
# leftover from the ILD analysis script (LD and ILD are not defined here):
#mod_ref = glm(LD~.,data=ILD)
#boxplot(mod_ref$fitted.values~ILD$LD[which(!is.na(ILD$AGR))])
#Model <- train(LD ~ ., data = TRAIN, method = "lda")
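# Sketch: compare the hold-out accuracies collected in Errs across models
# (the Accuracy and Kappa columns come from postResample() in ErrsCaret above)
print(Errs[,c("model","Accuracy","Kappa")])
op <- par(mar = c(4, 16, 1, 1))
barplot(Errs$Accuracy, names.arg = Errs$model, horiz = TRUE, las = 1,
        xlab = "Hold-out accuracy")
par(op)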