# Data import ----
# Read the Ozone table; column 1 is the response, the rest are predictors.
# (The original header comment described manual dummy-variable construction
# for categorical columns, but no such step exists in this script.)
url <- "https://perso.univ-rennes1.fr/bernard.delyon/tp/ozone.txt"
Ozone <- read.table(url, header = TRUE)
y <- Ozone[, 1]
X <- Ozone[, -1]

train_test_split <- function(X, y, test_size = 0.25, random_state = NULL) {
  # Split (X, y) into random train and test subsets.
  #
  # Args:
  #   X: data frame (or matrix) of predictors.
  #   y: response vector, same number of rows as X.
  #   test_size: fraction (< 1) of the data to hold out, or an absolute
  #     number of test samples (> 1); default 0.25.
  #   random_state: integer seed for reproducibility; default NULL (no seed).
  #
  # Returns:
  #   list(X_train, X_test, y_train, y_test)
  n <- nrow(X)
  if (test_size > 1) {
    test_size <- test_size / n  # absolute count -> fraction
  }
  if (!is.null(random_state)) {
    set.seed(random_state)
  }
  itest <- sample(seq_len(n), round(n * test_size))
  itrain <- setdiff(seq_len(n), itest)
  list(
    X_train = X[itrain, ],
    X_test = X[itest, ],
    y_train = y[itrain],
    y_test = y[itest]
  )
}

# Monte-Carlo cross-validation ----
B <- 30                  # number of random train/test splits
mse_lr <- numeric(B)     # test MSE of the linear regression, one per split
mse_naive <- numeric(B)  # test MSE of the mean-only (naive) predictor

for (b in seq_len(B)) {
  # BUG FIX: the original passed random_state = 1 on every iteration, so
  # set.seed(1) made all B splits identical and the loop averaged B copies
  # of the same experiment. Seeding with b gives distinct splits while
  # keeping the whole experiment reproducible.
  samples <- train_test_split(X, y, test_size = 0.25, random_state = b)
  X_train <- samples$X_train
  X_test <- samples$X_test
  y_train <- samples$y_train
  y_test <- samples$y_test

  # Standardize the quantitative variables using TRAIN statistics only,
  # so no information from the test set leaks into the scaling.
  numvar <- 1:10  # columns holding the quantitative variables
  mk <- apply(X_train[, numvar], 2, mean)
  sk <- apply(X_train[, numvar], 2, sd)
  X_train[, numvar] <- scale(X_train[, numvar], center = mk, scale = sk)
  X_test[, numvar] <- scale(X_test[, numvar], center = mk, scale = sk)

  data_train <- data.frame(X_train, y_train)
  data_test <- data.frame(X_test, y_test)

  # Linear model on all predictors; evaluate on the held-out split.
  lr <- lm(y_train ~ ., data = data_train)
  y_pred <- predict(lr, data_test)
  mse_lr[b] <- mean((y_test - y_pred)^2)

  # Naive baseline: predict the train mean for every test point.
  # BUG FIX: the original assigned a scalar to mse_naive each iteration
  # (clobbering the preallocated vector) instead of storing mse_naive[b],
  # so only the last split's error reached the final average.
  y_pred_naive <- mean(y_train)
  mse_naive[b] <- mean((y_test - y_pred_naive)^2)
}

# Report the root of the mean MSE over the B splits, rounded to 2 decimals.
print(paste("Error of linear regression: ", round(sqrt(mean(mse_lr)), 2)))
print(paste("Error of naive prediction: ",
            round(sqrt(mean(mse_naive)), 2)))