# ===================================================
# Prediction of a categorical variable
# ===================================================

library(pROC)         # ROC curve
library(randomForest) # random forest
library(adabag)       # bagging

# Cancer Relapse dataset
X.all = read.table("https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_for_other_methods.gct",
                   skip=2,header=TRUE,sep="\t")
label = read.table("https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_class_labels.cls",
                   skip=2,header=FALSE,sep=" ")
# Local alternative to the downloads above:
#chemin = "~/Documents/ENSEIGNEMENT/MACHINE LEARNING/Master BMC/PROJETS/Ex_datasets/Ex2/"
#X.all = read.table(paste(chemin,"Ex2_breast_cancer_ER_train_set_for_other_methods.gct",sep=""),skip=2,header=TRUE,sep="\t")
#label = read.table(paste(chemin,"Ex2_breast_cancer_ER_train_set_class_labels.cls",sep=""),skip=2,header=FALSE,sep=" ")

X.desc = X.all[,1]
X = t(X.all[,-c(1,2)])
colnames(X) = X.desc
y = factor(unlist(label),levels=c(0,1)) # the labels are read as a one-row data frame, so flatten it first

# ==============================================================================
train_test_split <- function(X,y,test_size=.25,random_state=NULL){
  # Extract a train and a test dataset.
  #
  # Inputs:  X, y : data
  #          test_size : a fraction (<1) of the total set or a number of samples (integer); default 0.25
  #          random_state : if equal to an integer, it fixes the random seed; default NULL
  # Outputs: X_train, X_test, y_train, y_test
  n = nrow(X)
  if (test_size>1){test_size=test_size/n}
  if (!is.null(random_state)){set.seed(random_state)}
  itest = sample(1:n,round(n*test_size))
  itrain = setdiff(1:n,itest)
  Xtrain = X[itrain,]
  Xtest = X[itest,]
  ytrain = y[itrain]
  ytest = y[itest]
  return(list(X_train=Xtrain,X_test=Xtest,y_train=ytrain,y_test=ytest))
}

# =======================================================
# Function to find the genes with the highest SNR
genes_SNR = function(X,y){
  # Inputs:  X = dataset
  #          y = labels (a boolean or factor variable)
  # Output:  SNR = signal-to-noise ratio of each gene
  labels = unique(y)
  n = dim(X)[1]
  p = dim(X)[2]
  K = length(labels)
  means = matrix(0,K,p) # per-class mean of each gene
  std = apply(X,2,sd)
  for (k in 1:K){means[k,] = apply(X[y==labels[k],],2,mean)}
  SNR = apply(matrix(abs(apply(means,2,diff)),K-1,p),2,max)/std
  return(SNR)
}

# Random Forest =====================================================================
B = 30                        # number of train/test splits for the cross validation
n_est_list = c(20,50,100,500) # numbers of trees to compare
err_rf = matrix(0,B,length(n_est_list))
prob_rf = NULL
ytest_rf = NULL
for (b in 1:B){
  print(paste("b =",b))
  # sampling of the train and test sets
  samples = train_test_split(X,y,test_size=0.25)
  X_train = samples$X_train
  X_test = samples$X_test
  y_train = samples$y_train
  y_test = samples$y_test
  ytest_rf = c(ytest_rf,y_test)
  # standardization (means and standard deviations estimated on the train set only)
  mk = apply(X_train,2,mean)
  sk = apply(X_train,2,sd)
  X_train = scale(X_train,center=mk,scale=sk)
  X_test = scale(X_test,center=mk,scale=sk)
  data_train = data.frame(y_train,X_train)
  data_test = data.frame(y_test,X_test)
  colnames(data_train)[1] = "y"
  colnames(data_test) = colnames(data_train)
  # learning a random forest for each number of trees
  prob_tmp = NULL
  for (k in 1:length(n_est_list)){
    rf = randomForest(y~.,data=data_train,ntree=n_est_list[k],mtry=20)
    y_pred = predict(rf,data_test)
    err_rf[b,k] = mean(y_test!=y_pred) # misclassification rate
    y_prob = predict(rf,data_test,type="prob")[,2]
    prob_tmp = cbind(prob_tmp,y_prob)
  }
  prob_rf = rbind(prob_rf,prob_tmp)
}
err_rf_all = apply(err_rf,2,mean) # mean error over the B splits

par(mfrow=c(1,2))                  # two subfigures
plot(n_est_list,err_rf_all,pch=20) # error for each number of trees
grid()
k0 = which.min(err_rf_all) # number of trees with the smallest error
res_roc = roc(ytest_rf,prob_rf[,k0])
plot(res_roc)              # ROC curve
text(.5,.2,paste("AUC =",round(res_roc$auc,2)))

# Variable importance (last fitted forest)
plot(rf$importance)
print("The most important variables for prediction are")
which(rf$importance>1)
var_imp = sort(rf$importance,decreasing=TRUE,index.return=TRUE)
best_var_imp = var_imp$x[1:20]
best_var_names = rownames(rf$importance)[var_imp$ix[1:20]] # names of the 20 most important genes
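
# ==============================================================================
# Optional sketch (not part of the original analysis): randomForest also
# reports an out-of-bag (OOB) error estimate, which gives a similar error
# curve without the B repeated train/test splits above. This example reuses
# data_train from the last split and the same mtry = 20; both choices are
# illustrative assumptions, not tuned values.
rf_oob = randomForest(y~.,data=data_train,ntree=500,mtry=20)
plot(rf_oob$err.rate[,"OOB"],type="l",
     xlab="number of trees",ylab="OOB error") # OOB error as trees are added
grid()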

# ==============================================================================
# Bagging algorithm for classification
#### Computation time is LONG... ####
B = 30                        # number of train/test splits for the cross validation
n_est_list = c(20,50,100,500) # numbers of trees to compare
err_bag = matrix(0,B,length(n_est_list))
prob_bag = NULL
ytest_bag = NULL
for (b in 1:B){
  print(paste("b =",b))
  # sampling of the train and test sets
  samples = train_test_split(X,y,test_size=0.25)
  X_train = samples$X_train
  X_test = samples$X_test
  y_train = samples$y_train
  y_test = samples$y_test
  ytest_bag = c(ytest_bag,y_test)
  # standardization (means and standard deviations estimated on the train set only)
  mk = apply(X_train,2,mean)
  sk = apply(X_train,2,sd)
  X_train = scale(X_train,center=mk,scale=sk)
  X_test = scale(X_test,center=mk,scale=sk)
  data_train = data.frame(y_train,X_train)
  data_test = data.frame(y_test,X_test)
  colnames(data_train)[1] = "y"
  colnames(data_test) = colnames(data_train)
  # learning a bagging model for each number of trees
  prob_tmp = NULL
  for (k in 1:length(n_est_list)){
    bag = bagging(y~.,data=data_train,mfinal=n_est_list[k],
                  control=rpart.control(maxdepth=5,minsplit=15),par=TRUE)
    y_pred = predict(bag,data_test)
    err_bag[b,k] = mean(y_test!=y_pred$class) # misclassification rate
    prob_tmp = cbind(prob_tmp,y_pred$prob[,2])
  }
  prob_bag = rbind(prob_bag,prob_tmp)
}
err_bag_all = apply(err_bag,2,mean) # mean error over the B splits

par(mfrow=c(1,2))                   # two subfigures
plot(n_est_list,err_bag_all,pch=20) # error for each number of trees
grid()
k0 = which.min(err_bag_all) # number of trees with the smallest error
res_roc = roc(ytest_bag,prob_bag[,k0])
plot(res_roc)               # ROC curve
text(.5,.2,paste("AUC =",round(res_roc$auc,2)))
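
# ==============================================================================
# Possible extension (sketch only): genes_SNR is defined above but never
# called. It could be used to pre-filter the predictors before fitting the
# ensembles, which would speed up the bagging loop considerably. Keeping the
# 100 genes with the highest SNR is an arbitrary choice for illustration.
snr = genes_SNR(X,y)
top_genes = order(snr,decreasing=TRUE)[1:100] # indices of the 100 highest-SNR genes
X_top = X[,top_genes]                         # reduced matrix, usable in train_test_split above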