# ===================================================
# Prediction of a categorical variable
# ===================================================
library(neuralnet)
library(pROC)

# Cancer Relapse dataset (GCT expression matrix and CLS class labels)
X.all = read.table("https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_for_other_methods.gct",
                   skip=2, header=TRUE, sep="\t")
label = read.table("https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_class_labels.cls",
                   skip=2, header=FALSE, sep=" ")
X.desc = X.all[,1]                        # gene identifiers
X = t(X.all[,-c(1,2)])                    # samples in rows, genes in columns
y = factor(unlist(label), levels=c(0,1))  # flatten the single-row .cls file into a label vector

# Expression profiles by group
par(mfrow=c(2,1))
matplot(t(X[y==0,]), col="black", type="l", lty=1, ylab="expression")
title("group 0")
matplot(t(X[y==1,]), col="red", type="l", lty=1, ylab="expression")
title("group 1")

# =======================================================
train_test_split <- function(X, y, test_size=.25, random_state=NULL){
  # Extraction of a train and a test dataset.
  #
  # Inputs : X, y : data
  #          test_size : a fraction (<1) of the total set or a number of samples (integer) ; default 0.25
  #          random_state : if equal to an integer, it fixes the random seed ; default NULL
  # Outputs : X_train, X_test, y_train, y_test
  #
  n = nrow(X)
  if (test_size>1){test_size = test_size/n}
  if (!is.null(random_state)){set.seed(random_state)}
  itest = sample(1:n, round(n*test_size))
  itrain = setdiff(1:n, itest)
  Xtrain = X[itrain,]
  Xtest = X[itest,]
  ytrain = y[itrain]
  ytest = y[itest]
  return(list(X_train=Xtrain, X_test=Xtest, y_train=ytrain, y_test=ytest))
}

# =======================================================
# Function to find the genes with the highest SNR
genes_SNR = function(X, y){
  # - Inputs
  #   X = dataset
  #   y = labels (a boolean or factor variable)
  # - Output
  #   SNR
  labels = unique(y)
  n = dim(X)[1]
  p = dim(X)[2]
  K = length(labels)
  means = matrix(0, length(labels), dim(X)[2])
  std = apply(X, 2, sd)
  for (k in 1:K){means[k,] = apply(X[y==labels[k],], 2, mean)}
  SNR = apply(matrix(abs(apply(means, 2, diff)), K-1, p), 2, max)/std
  return(SNR)
}

# =======================================================
# Artificial neural network ============================
# http://www.rblog.uni-freiburg.de/2017/02/07/deep-learning-in-r/
B = 30 # number of samples for the cross validation
err_ann = NULL
prob_ann = NULL
ytest_ann = NULL
for (b in 1:B){
  print(paste("b = ", b))
  # sampling for train, test sets
  samples = train_test_split(X, y, test_size=0.25)
  X_train = samples$X_train
  X_test = samples$X_test
  y_train = samples$y_train
  y_test = samples$y_test
  ytest_ann = c(ytest_ann, as.character(y_test))  # store as character so the "0"/"1" labels are kept
  # find the "best" genes
  SNR = genes_SNR(X_train, y_train)
  keep = sort.int(SNR, index.return=TRUE, decreasing=TRUE)$ix[1:50]
  # standardization (train statistics applied to both sets)
  mk = apply(X_train, 2, mean)
  sk = apply(X_train, 2, sd)
  X_train = scale(X_train, center=mk, scale=sk)
  X_test = scale(X_test, center=mk, scale=sk)
  data_train = data.frame(y_train, X_train[,keep])
  data_test = data.frame(y_test, X_test[,keep])
  colnames(data_train)[1] = "y"
  colnames(data_test) = colnames(data_train)
  ann = neuralnet(y~., data=data_train, hidden=c(5,3), linear.output=FALSE)
  # drop the response column before compute(); column 2 of net.result is the probability of class "1"
  prob_ann = c(prob_ann, compute(ann, data_test[,-1])$net.result[,2])
  ### RESULT ???
}
roc(ytest_ann, prob_ann, plot=TRUE, ci=TRUE)
grid()
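
# -------------------------------------------------------
# A possible answer to the "RESULT ???" placeholder above
# (an illustrative sketch, not part of the original lab script;
# the 0.5 decision threshold is an assumption).
# Inside the loop, err_ann was presumably meant to collect the
# per-fold misclassification rate; here the predictions pooled
# over the B resamplings are summarized instead: overall AUC,
# confusion matrix and error rate.
# -------------------------------------------------------
print(auc(ytest_ann, prob_ann))                           # pooled AUC (pROC)
pred_ann = ifelse(prob_ann > 0.5, "1", "0")               # hard class labels at threshold 0.5
print(table(observed = ytest_ann, predicted = pred_ann))  # pooled confusion matrix
print(mean(pred_ann != ytest_ann))                        # pooled misclassification rate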
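
# -------------------------------------------------------
# Optional sanity check of the SNR filter (illustrative sketch,
# not part of the original script): the SNR computed on the full
# dataset, sorted in decreasing order, shows how sharply the 50
# genes kept inside the loop stand out from the rest.
# -------------------------------------------------------
par(mfrow=c(1,1))
SNR_all = genes_SNR(X, y)
plot(sort(SNR_all, decreasing = TRUE)[1:200], type = "l",
     xlab = "gene rank", ylab = "SNR")
abline(v = 50, lty = 2)   # the 50-gene cut-off used in the cross-validation loop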