# ===================================================
# Prediction of a categorical variable
# ===================================================
library(neuralnet)
library(pROC)

# Cancer Relapse dataset (GCT expression matrix and CLS class labels)
X.all = read.table("https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_for_other_methods.gct",
                   skip=2, header=TRUE, sep="\t")
label = read.table("https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_class_labels.cls",
                   skip=2, header=FALSE, sep=" ")
X.desc = X.all[,1]                        # gene identifiers
X = t(X.all[,-c(1,2)])                    # samples in rows, genes in columns
y = factor(unlist(label), levels=c(0,1))  # flatten the single-row .cls file into a label vector

# Expression profiles by group
par(mfrow=c(2,1))
matplot(t(X[y==0,]), col="black", type="l", lty=1, ylab="expression")
title("group 0")
matplot(t(X[y==1,]), col="red", type="l", lty=1, ylab="expression")
title("group 1")

# =======================================================
train_test_split <- function(X, y, test_size=.25, random_state=NULL){
  # Extraction of a train and a test dataset.
  #
  # Inputs : X, y : data
  #          test_size : a fraction (<1) of the total set or a number of samples (integer) ; default 0.25
  #          random_state : if equal to an integer, it fixes the random seed ; default NULL
  # Outputs : X_train, X_test, y_train, y_test
  #
  n = nrow(X)
  if (test_size>1){test_size = test_size/n}
  if (!is.null(random_state)){set.seed(random_state)}
  itest = sample(1:n, round(n*test_size))
  itrain = setdiff(1:n, itest)
  Xtrain = X[itrain,]
  Xtest = X[itest,]
  ytrain = y[itrain]
  ytest = y[itest]
  return(list(X_train=Xtrain, X_test=Xtest, y_train=ytrain, y_test=ytest))
}

# =======================================================
# Function to find the genes with the highest SNR
genes_SNR = function(X, y){
  # - Inputs
  #   X = dataset
  #   y = labels (a boolean or factor variable)
  # - Output
  #   SNR
  labels = unique(y)
  n = dim(X)[1]
  p = dim(X)[2]
  K = length(labels)
  means = matrix(0, length(labels), dim(X)[2])
  std = apply(X, 2, sd)
  for (k in 1:K){means[k,] = apply(X[y==labels[k],], 2, mean)}
  SNR = apply(matrix(abs(apply(means, 2, diff)), K-1, p), 2, max)/std
  return(SNR)
}

# =======================================================
# Artificial neural network ============================
# http://www.rblog.uni-freiburg.de/2017/02/07/deep-learning-in-r/
B = 30 # number of samples for the cross validation
err_ann = NULL
prob_ann = NULL
ytest_ann = NULL
for (b in 1:B){
  print(paste("b = ", b))
  # sampling for train, test sets
  samples = train_test_split(X, y, test_size=0.25)
  X_train = samples$X_train
  X_test = samples$X_test
  y_train = samples$y_train
  y_test = samples$y_test
  ytest_ann = c(ytest_ann, as.character(y_test))  # store as character so the "0"/"1" labels are kept
  # find the "best" genes
  SNR = genes_SNR(X_train, y_train)
  keep = sort.int(SNR, index.return=TRUE, decreasing=TRUE)$ix[1:50]
  # standardization (train statistics applied to both sets)
  mk = apply(X_train, 2, mean)
  sk = apply(X_train, 2, sd)
  X_train = scale(X_train, center=mk, scale=sk)
  X_test = scale(X_test, center=mk, scale=sk)
  data_train = data.frame(y_train, X_train[,keep])
  data_test = data.frame(y_test, X_test[,keep])
  colnames(data_train)[1] = "y"
  colnames(data_test) = colnames(data_train)
  ann = neuralnet(y~., data=data_train, hidden=c(5,3), linear.output=FALSE)
  # drop the response column before compute(); column 2 of net.result is the probability of class "1"
  prob_ann = c(prob_ann, compute(ann, data_test[,-1])$net.result[,2])
  ### RESULT ???
}
roc(ytest_ann, prob_ann, plot=TRUE, ci=TRUE)
grid()
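
# -------------------------------------------------------
# A possible answer to the "RESULT ???" placeholder above
# (an illustrative sketch, not part of the original lab script;
# the 0.5 decision threshold is an assumption).
# Inside the loop, err_ann was presumably meant to collect the
# per-fold misclassification rate; here the predictions pooled
# over the B resamplings are summarized instead: overall AUC,
# confusion matrix and error rate.
# -------------------------------------------------------
print(auc(ytest_ann, prob_ann))                           # pooled AUC (pROC)
pred_ann = ifelse(prob_ann > 0.5, "1", "0")               # hard class labels at threshold 0.5
print(table(observed = ytest_ann, predicted = pred_ann))  # pooled confusion matrix
print(mean(pred_ann != ytest_ann))                        # pooled misclassification rate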
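
# -------------------------------------------------------
# Optional sanity check of the SNR filter (illustrative sketch,
# not part of the original script): the SNR computed on the full
# dataset, sorted in decreasing order, shows how sharply the 50
# genes kept inside the loop stand out from the rest.
# -------------------------------------------------------
par(mfrow=c(1,1))
SNR_all = genes_SNR(X, y)
plot(sort(SNR_all, decreasing = TRUE)[1:200], type = "l",
     xlab = "gene rank", ylab = "SNR")
abline(v = 50, lty = 2)   # the 50-gene cut-off used in the cross-validation loop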