# Location of the Biscuits data set (semicolon-separated CSV file).
url <- "https://perso.univ-rennes1.fr/valerie.monbet/doc/cours/Biscuits.csv"
# Local copy, kept for offline use:
# url <- "~/Dropbox/ENSEIGNEMENT/RADO/Biscuits.csv"
biscuits <- read.csv(url, sep = ";")
# Explanatory variables: every column except the first one
X <- biscuits[, -1]
# Response: the fat column, stored as the first column of the file
fat <- biscuits[[1L]]

# Bagging =====================================================================
library(ipred)

# Repeated random hold-out validation of bagged regression trees.
# Each of the B rounds fits on a random 7/8 of the rows and records the
# mean squared prediction error on the remaining rows.
n <- nrow(X)
B <- 10 # number of random train/test splits
mse_bag <- numeric(B) # one test MSE per split (preallocated)
for (b in seq_len(B)) {
  itrain <- sample(seq_len(n), round(n * 7 / 8))
  itest <- setdiff(seq_len(n), itrain)
  # Standardize using training rows only; scale() stores the centers and
  # scales it used as attributes, so the test rows are transformed with
  # exactly the same statistics (no test information leaks into the fit).
  Xs_train <- scale(X[itrain, ])
  Xs_test <- scale(
    X[itest, ],
    center = attr(Xs_train, "scaled:center"),
    scale = attr(Xs_train, "scaled:scale")
  )
  # rpart.control is namespace-qualified because library(ipred) is not
  # guaranteed to put the rpart package on the search path.
  # maxdepth = maximal depth of each individual tree
  # minsplit = minimum number of observations in a node to allow a split
  bag <- ipredbagg(
    fat[itrain],
    Xs_train,
    nbagg = 20,
    control = rpart::rpart.control(maxdepth = 5, minsplit = 15)
  )
  mse_bag[b] <- mean((fat[itest] - predict(bag, Xs_test))^2)
}

# Report the root of the average test MSE, rounded to two decimals.
print(paste("Error of bagging:", round(sqrt(mean(mse_bag)) * 100) / 100))


# Random Forest =====================================================================
library(randomForest)

# Repeated random hold-out validation of a random forest.
# Each of the B rounds fits on a random 7/8 of the rows and records the
# mean squared prediction error on the remaining rows.
n <- nrow(X)
B <- 10 # number of random train/test splits
mse_rf <- numeric(B) # one test MSE per split (preallocated)
for (b in seq_len(B)) {
  itrain <- sample(seq_len(n), round(n * 7 / 8))
  itest <- setdiff(seq_len(n), itrain)
  # Standardize using training rows only, then apply the very same centers
  # and scales (stored by scale() as attributes) to the test rows.
  Xs_train <- scale(X[itrain, ])
  Xs_test <- scale(
    X[itest, ],
    center = attr(Xs_train, "scaled:center"),
    scale = attr(Xs_train, "scaled:scale")
  )
  # Response goes in a column named "y" so the formula y ~ . can be used;
  # both frames are built the same way so their column names agree.
  train <- data.frame(y = fat[itrain], Xs_train)
  test <- data.frame(y = fat[itest], Xs_test)
  # mtry = number of candidate predictors tried at each split.
  # randomForest has no rpart-style `control` argument (it would be silently
  # ignored); tree size is governed by its own `nodesize` / `maxnodes`
  # arguments, which are left at their defaults here.
  RF <- randomForest(y ~ ., data = train, mtry = 20)
  y_pred <- predict(RF, test)
  # Store the error of this split at position b so that the final summary
  # averages over all B splits rather than reflecting only the last one.
  mse_rf[b] <- mean((y_pred - fat[itest])^2)
}

# Report the root of the average test MSE, rounded to two decimals.
print(paste("Error of random forest:", round(sqrt(mean(mse_rf)) * 100) / 100))