library(gbm)
library(xgboost)

url = "https://perso.univ-rennes1.fr/valerie.monbet/doc/cours/Biscuits.csv"
#url = "~/Dropbox/ENSEIGNEMENT/RADO/Biscuits.csv"
biscuits = read.csv(url, sep = ";")

# Extraction of the fat column (response)
fat = biscuits[, 1]
# Extraction of the explanatory variables
X = biscuits[, -1]
y = fat

train_test_split <- function(X, y, test_size = .25, random_state = NULL){
  # Extraction of a train and a test dataset.
  #
  # Inputs : X, y : data
  #          test_size : a fraction (<1) of the total set or a number of samples (integer) ; default 0.25
  #          random_state : if equal to an integer, it fixes the random seed ; default NULL
  # Outputs : X_train, X_test, y_train, y_test
  #
  n = nrow(X)
  if (test_size > 1){test_size = test_size/n}
  if (!is.null(random_state)){set.seed(random_state)}
  itest = sample(1:n, round(n*test_size))
  itrain = setdiff(1:n, itest)
  Xtrain = X[itrain, ]
  Xtest = X[itest, ]
  ytrain = y[itrain]
  ytest = y[itest]
  return(list(X_train = Xtrain, X_test = Xtest, y_train = ytrain, y_test = ytest))
}

# Gradient Boosting =====================================================================
B = 30 # number of random train/test splits for the cross validation
err_gbm = NULL
ytest_gbm = NULL
for (b in 1:B){
  # sampling for train, test sets
  samples = train_test_split(X, y, test_size = 0.25)
  X_train = samples$X_train
  X_test = samples$X_test
  y_train = samples$y_train
  y_test = samples$y_test
  ytest_gbm = c(ytest_gbm, y_test)
  # standardization (test set scaled with the train means and standard deviations)
  mk = apply(X_train, 2, mean)
  sk = apply(X_train, 2, sd)
  X_train = scale(X_train, center = mk, scale = sk)
  X_test = scale(X_test, center = mk, scale = sk)
  data_train = data.frame(y_train, X_train)
  data_test = data.frame(y_test, X_test)
  colnames(data_train)[1] = "y"
  colnames(data_test) = colnames(data_train)
  # learning of the boosted regression trees model
  gbm_fit = gbm(y ~ ., data = data_train, distribution = "gaussian", n.trees = 200, n.minobsinnode = 3)
  y_pred = predict(gbm_fit, data_test, type = "response", n.trees = 200)
  err_gbm[b] = mean((y_test - y_pred)^2)
}
err_gbm_all = sqrt(mean(err_gbm)) # global RMSE over the B splits
print(paste("RMSE gbm = ", round(err_gbm_all, 2)))

# eXtreme Gradient Boosting =====================================================================
err_xgb = NULL
ytest_xgb = NULL
for (b in 1:B){
  # sampling for train, test sets
  samples = train_test_split(X, y, test_size = 0.25)
  X_train = samples$X_train
  X_test = samples$X_test
  y_train = samples$y_train
  y_test = samples$y_test
  ytest_xgb = c(ytest_xgb, y_test)
  # standardization (test set scaled with the train means and standard deviations)
  mk = apply(X_train, 2, mean)
  sk = apply(X_train, 2, sd)
  X_train = scale(X_train, center = mk, scale = sk)
  X_test = scale(X_test, center = mk, scale = sk)
  dtrain <- xgb.DMatrix(as.matrix(X_train), label = y_train)
  dtest <- xgb.DMatrix(as.matrix(X_test), label = y_test)
  param <- list(max_depth = 2, eta = 1, nthread = 2, alpha = .5,
                objective = "reg:squarederror") # "reg:linear" is the deprecated name of this objective
  bst <- xgb.train(param, dtrain, nrounds = 30, verbose = 0)
  y_pred = predict(bst, dtest)
  err_xgb[b] = mean((y_test - y_pred)^2)
}
err_xgb_all = sqrt(mean(err_xgb)) # global RMSE over the B splits
print(paste("RMSE xgboost = ", round(err_xgb_all, 2)))
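# Optional sketch, not in the original script : compare the distributions of the
# B test errors of the two boosting methods (assumes err_gbm and err_xgb computed above).
boxplot(list(gbm = sqrt(err_gbm), xgboost = sqrt(err_xgb)),
        ylab = "RMSE", main = "Test RMSE over the B random splits")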
# Tuning of the hyperparameters with the library mlr ====================================
library(mlr)

samples = train_test_split(X, y, test_size = 0.25)
X_train = samples$X_train
X_test = samples$X_test
y_train = samples$y_train
y_test = samples$y_test
train = data.frame(y_train, X_train)
colnames(train)[1] = "y"
test = data.frame(y_test, X_test)
colnames(test) = colnames(train)
#head(train)

trainTask <- makeRegrTask(data = train, target = "y")
testTask <- makeRegrTask(data = test, target = "y")

# Create an xgboost learner
xgb_learner <- makeLearner(
  "regr.xgboost",
  predict.type = "response",
  par.vals = list(
    objective = "reg:squarederror", # "reg:linear" is the deprecated name of this objective
    nrounds = 200
  )
)

# Fit a model with the default hyperparameters
xgb_model <- train(xgb_learner, task = trainTask)
result <- predict(xgb_model, testTask)
print(paste("Rmse (default hyperparameters):",
            round(sqrt(mean((result$data$truth - result$data$response)^2)), 2)))

# Search space for the hyperparameters
xgb_params <- makeParamSet(
  # The number of trees in the model (each one built sequentially)
  makeIntegerParam("nrounds", lower = 10, upper = 500),
  # The maximum depth of each tree
  makeIntegerParam("max_depth", lower = 1, upper = 10),
  # "shrinkage" - prevents overfitting
  makeNumericParam("eta", lower = .001, upper = .5),
  # L2 regularization - prevents overfitting
  makeNumericParam("lambda", lower = -5, upper = 0, trafo = function(x) 10^x)
)
# Random search with a single draw ; increase maxit for a more thorough search
control <- makeTuneControlRandom(maxit = 1)
# 4-fold cross validation to evaluate each candidate
resample_desc <- makeResampleDesc("CV", iters = 4)
tuned_params <- tuneParams(
  learner = xgb_learner,
  task = trainTask,
  resampling = resample_desc,
  par.set = xgb_params,
  control = control
)

# Learner with the tuned hyperparameters
xgb_tuned_learner <- setHyperPars(
  learner = xgb_learner,
  par.vals = tuned_params$x
)
# Re-train the model using the tuned hyperparameters (and the full training set)
xgb_model <- train(xgb_tuned_learner, trainTask)
# Make a new prediction
result <- predict(xgb_model, testTask)
print(paste("Rmse (tuned hyperparameters):",
            round(sqrt(mean((result$data$truth - result$data$response)^2)), 2)))
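# Optional sketch, not in the original script : inspect the hyperparameters selected
# by the random search and compare predicted and observed fat contents
# (assumes tuned_params and result computed above).
print(tuned_params$x) # selected nrounds, max_depth, eta and lambda
plot(result$data$truth, result$data$response,
     xlab = "observed fat", ylab = "predicted fat",
     main = "xgboost with tuned hyperparameters")
abline(0, 1, lty = 2) # y = x reference line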