BW = read.table(file="~/Dropbox/ENSEIGNEMENT/TECHNICOLOR/BirthWeights.dat", header=TRUE)
BW$smoke = as.factor(BW$smoke)
BW$parity = as.factor(BW$parity)
summary(BW)
# Recode the missing-value codes as NA
BW$smoke[which(BW$smoke==9)] = NA
BW$gestation[which(BW$gestation>360)] = NA
BW$age[which(BW$age==99)] = NA
BW$height[which(BW$height==99)] = NA
BW$weight[which(BW$weight==999)] = NA
pairs(BW, panel=panel.smooth, main="Birth Weights", col=3+(BW$smoke==1), pch=20)

D = read.table("~/Dropbox/ENSEIGNEMENT/TECHNICOLOR/LifeExp.dat", header=TRUE)
pairs(D[,-1], panel=panel.smooth, main="Life expectancy", pch=20)

Ozone = read.table("~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/TECHNICOLOR/Ozone.dat", header=TRUE, row.names=1)
pairs(Ozone[,c(1,2,5,8,11)], panel=panel.smooth, main="Ozone", pch=20)
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/TECHNICOLOR/Ozone_pairs.eps", width=6, height=6)

# Loess fit of maxO3 on T9, with +/- 2 standard-error bands
L = loess(maxO3 ~ T9, data=Ozone)
PL = predict(L, data.frame(T9=seq(8, 30, 1)), se=TRUE)
plot(Ozone$T9, Ozone$maxO3, pch=20, xlab="T9", ylab="maxO3")
lines(seq(8, 30, 1), PL$fit, col="red", lwd=2)
lines(seq(8, 30, 1), PL$fit + 2*PL$se.fit, col="red", lwd=2, lty=2)
lines(seq(8, 30, 1), PL$fit - 2*PL$se.fit, col="red", lwd=2, lty=2)
#dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/CONF_DS/Figures/Ozone_loess.eps",width=6,height=6)

# Same thing with the previous day's maximum (maxO3v)
L = loess(maxO3 ~ maxO3v, data=Ozone)
PL = predict(L, data.frame(maxO3v=seq(40, 160, 1)), se=TRUE)
plot(Ozone$maxO3v, Ozone$maxO3, pch=20, xlab="maxO3v", ylab="maxO3")
lines(seq(40, 160, 1), PL$fit, col="red", lwd=2)
lines(seq(40, 160, 1), PL$fit + 2*PL$se.fit, col="red", lwd=2, lty=2)
lines(seq(40, 160, 1), PL$fit - 2*PL$se.fit, col="red", lwd=2, lty=2)

# Simple linear regression of maxO3 on T9
plot(Ozone$T9, Ozone$maxO3, pch=20, xlab="Temperature t-9h", ylab="Ozone, max. concentration")
mod = lm(maxO3 ~ T9, data=Ozone)
lines(Ozone$T9, mod$fitted.values, col="red")
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/TECHNICOLOR/Ozone_line.eps", width=5, height=5)

# Normal QQ-plots of the standardized residuals
qqnorm((mod$residuals - mean(mod$residuals))/sd(mod$residuals),
       xlab="Theoretical quantiles", ylab="Residual quantiles", pch=20)
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/TECHNICOLOR/Ozone_line_qq.eps", width=4, height=4)
mod.comp = lm(maxO3 ~ ., data=Ozone)
qqnorm((mod.comp$residuals - mean(mod.comp$residuals))/sd(mod.comp$residuals),
       xlab="Theoretical quantiles", ylab="Residual quantiles", pch=20)
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/TECHNICOLOR/Ozone_qq.eps", width=4, height=4)

# Residuals vs fitted values
#mod = lm(maxO3~Vx9,data=Ozone)
plot(mod$fitted.values, mod$residuals, pch=20, xlab="Fitted values", ylab="Residuals")
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/TECHNICOLOR/Ozone_line_res_fit.eps", width=4, height=4)
plot(mod.comp$fitted.values, mod.comp$residuals, pch=20, xlab="Fitted values", ylab="Residuals")
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/TECHNICOLOR/Ozone_res_fit.eps", width=4, height=4)
mod = lm(maxO3 ~ Vx9, data=Ozone)
plot(mod$fitted.values, mod$residuals, pch=20, xlab="Fitted values", ylab="Residuals")
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/TECHNICOLOR/Ozone_Vx9_res_fit.eps", width=4, height=4)
mod = lm(log(maxO3) ~ log(Vx9), data=Ozone)
plot(mod$fitted.values, mod$residuals, pch=20, xlab="Fitted values", ylab="Residuals")
#dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/TECHNICOLOR/Ozone_Vx9_res_fit.eps",width=4,height=4)

# Conference lecture material
m = lm(maxO3 ~ T12, data=Ozone)
summary(m)
plot(Ozone$maxO3, m$fitted.values, pch=20, xlab="Observed y", ylab="Predicted y")
grid()
# dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/CONF_DS/Figures/Ozone_lmT12.eps",width=6,height=6)
m1 = lm(maxO3 ~ ., data=Ozone)
plot(Ozone$maxO3, m1$fitted.values, pch=20, xlab="Observed y", ylab="Predicted y")
grid()
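# Not in the original script: a quick hold-out comparison of the two fits above
# (maxO3 ~ T12 vs. the full linear model). A minimal sketch; the split
# proportion and the seed are arbitrary choices.
set.seed(1)
n.oz = nrow(Ozone)
tr = sample(1:n.oz, round(2/3*n.oz))
te = setdiff(1:n.oz, tr)
m.T12 = lm(maxO3 ~ T12, data=Ozone, subset=tr)
m.full = lm(maxO3 ~ ., data=Ozone, subset=tr)
rmse = function(obs, pred) sqrt(mean((obs - pred)^2))
c(T12 = rmse(Ozone$maxO3[te], predict(m.T12, Ozone[te,])),
  full = rmse(Ozone$maxO3[te], predict(m.full, Ozone[te,])))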
# Best subset selection (Mallows' Cp) on repeated train/test splits
library(leaps)
n = dim(Ozone)[1]
B = 50
best.subsets = vector("list", B)
for (b in 1:B){
  train = sample(1:n, round(n*2/3))
  X = Ozone[train, -c(1,12,13)]   # numeric predictors only
  y = Ozone[train, 1]             # response: maxO3
  res = leaps(X, y, method="Cp")
  best.subsets[[b]] = colnames(X)[res$which[which.min(res$Cp), ]]
}

# Ridge regression with lm.ridge (MASS): coefficient paths and GCV
library(MASS)
mod.ridge = lm.ridge(maxO3 ~ ., data=Ozone, lambda=seq(0, 1000, 1))
w = which(abs(mod.ridge$coef[,100]) > 2.5)   # highlight the largest coefficients
matplot(log(mod.ridge$lambda), t(mod.ridge$coef[w,]), xlab="log(lambda)", ylab="coefficients",
        typ="l", lty=1, col=1:5)
matplot(log(mod.ridge$lambda), t(mod.ridge$coef[-c(w),]), typ="l", col="gray", lty=1, add=TRUE)
legend("topright", rownames(mod.ridge$coef[w,]), col=1:5, lty=1)
abline(v=log(mod.ridge$lambda[which.min(mod.ridge$GCV)]))
# dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/CONF_DS/Figures/Ridge_ozone_coef.eps",width=6,height=6)

# Rough Mallows' Cp along (part of) the ridge path, reusing the lm object for prediction
Cp.ridge = NULL
mod.tmp = mod.comp
x.new = cbind(matrix(1, length(Ozone[,1]), 1), Ozone[,-1])
for (k in 1:101){
  beta = coef(mod.ridge)[k,]
  mod.tmp$coefficients = beta
  Cp.ridge[k] = sum((Ozone$maxO3 - predict(mod.tmp, x.new))^2) + 2*length(beta)*var(mod.comp$residuals)
}

# Ridge "by hand" on the standardized numeric predictors
x = Ozone[,-c(1)]
y = Ozone[,1]
n = length(y)
p = dim(x)[2]
Z = matrix(NA, n, p-2)
for (j in 1:(p-2)) {Z[,j] = as.numeric(x[,j])}
colnames(Z) = colnames(x)[1:(p-2)]
Z = scale(Z)
pZ = ncol(Z)
y = y - mean(y)
lambda = seq(1, 60, 5)

# CV over random train/test splits
B = 50
napp = round(n*2/3)
err = matrix(0, B, length(lambda))
df = NULL
R = solve(t(Z)%*%Z)
d = svd(Z)
for (b in 1:B){
  train = sample(1:n, napp)
  test = setdiff(1:n, train)
  Zv = as.matrix(Z[train,])
  for (k in 1:length(lambda)){
    beta = solve(t(Zv)%*%Zv + lambda[k]*diag(1, pZ)) %*% (t(Zv) %*% matrix(y[train], length(train), 1))
    # effective degrees of freedom
    df[k] = sum(d$d^2/(d$d^2 + lambda[k]))
    yhat.test = Z[test,] %*% beta
    err[b,k] = mean((y[test] - yhat.test)^2)
  }
}
plot(df[1:length(lambda)], apply(sqrt(err), 2, mean), pch=20, xlab="Nb degrees of freedom", ylab="RMSE")
grid()
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/CONF_DS/Figures/Ozone_Ridge_RMSE.eps",width=6,height=6)

# Full-data coefficient paths (direct formula, and shrinkage of the OLS coefficients)
b <- bp <- matrix(NA, p-2, length(lambda))
yhat = matrix(0, n, length(lambda))
df <- err <- NULL
R = solve(t(Z)%*%Z)
d = svd(Z)
for (k in 1:length(lambda)){
  b[,k] = solve(t(Z)%*%Z + lambda[k]*diag(1, p-2)) %*% (t(Z) %*% matrix(y, n, 1))
  bp[,k] = solve(diag(1, p-2) + lambda[k]*R) %*% matrix(mod.comp$coefficients[2:(p-1)], p-2, 1)
  # effective degrees of freedom
  df[k] = sum(d$d^2/(d$d^2 + lambda[k]))
  yhat[,k] = Z %*% b[,k]
  err[k] = mean((y - yhat[,k])^2)
}
matplot(lambda, t(b), typ="l")
#dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/TECHNICOLOR/Figures/Ridge_ozone_coef.eps",width=6,height=6)
matplot(lambda, t(bp), typ="l")

# Cross-validation (8 folds) of ridge, evaluated on the lambda sequence of the
# lasso fit below (mod.lars): run the lasso section first
Z = as.matrix(Ozone[,2:11])
pZ = ncol(Z)
n = nrow(Z)
n.f = n/8
lambda = mod.lars$lambda
err = matrix(0, 8, length(lambda))
b = matrix(NA, pZ, length(lambda))
yhat = matrix(NA, n, length(lambda))
df = NULL
ii = sample(1:n, n)
for (v in 1:8) {
  ii.test = ii[(n.f*(v-1)) + (1:n.f)]
  ii.app = setdiff(1:n, ii.test)
  Zv = as.matrix(Z[ii.app,])
  for (k in 1:length(lambda)){
    b[,k] = solve(t(Zv)%*%Zv + lambda[k]*diag(1, pZ)) %*% (t(Zv) %*% matrix(y[ii.app], length(ii.app), 1))
    # effective degrees of freedom
    df[k] = sum(d$d^2/(d$d^2 + lambda[k]))
    yhat[ii.test,k] = Z[ii.test,] %*% b[,k]
    err[v,k] = mean((y[ii.test] - yhat[ii.test,k])^2)
  }
}
plot(df, sqrt(apply(err, 2, mean)), typ="l", ylab="CV RMSE (8 folds)", xlab="df")
#dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/TECHNICOLOR/Figures/Ridge_CV.eps",width=6,height=6)
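# Not in the original script: the same ridge path and a cross-validated lambda
# can also be obtained with the glmnet package (alpha = 0 for ridge, alpha = 1
# for the lasso fitted with lars below). A minimal sketch, assuming glmnet is
# installed; cv.glmnet chooses its own lambda grid and uses 10 folds by default.
# install.packages("glmnet")
library(glmnet)
x.gl = as.matrix(Ozone[, 2:11])
y.gl = Ozone$maxO3
cv.ridge = cv.glmnet(x.gl, y.gl, alpha=0)
plot(cv.ridge)                   # CV error against log(lambda)
coef(cv.ridge, s="lambda.min")   # coefficients at the CV optimum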
# Lasso (LARS)
library(lars)
mod.lars = lars(as.matrix(Ozone[,-c(1,12,13)]), Ozone[,1], type="lasso")
matplot(c(mod.lars$lambda, 0), mod.lars$beta, typ="l", xlab="lambda", ylab="coefficients", lty=1)
text(100, 2, col="red", labels="T15")
text(40, -2, col="blue", labels="Ne9")
text(25, 1, col="black", labels="Vx9")
# dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/TECHNICOLOR/Figures/lasso_ozone_coef.eps",width=6,height=6)
plot(c(mod.lars$lambda, 0), mod.lars$Cp, pch=20, xlab="lambda", ylab="Mallows' Cp")
#dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/TECHNICOLOR/Figures/lasso_ozone_Cp.eps",width=6,height=6)

# Lasso: test error over repeated train/test splits, one value per distinct df
B = 50
napp = round(n*2/3)
err.lars = matrix(NA, B, 10)
R = solve(t(Z)%*%Z)
d = svd(Z)
for (b in 1:B){
  train = sample(1:n, napp)
  test = setdiff(1:n, train)
  mod.f.lars = lars(Z[train,], y[train], type="lasso", intercept=FALSE)
  df.lars = mod.f.lars$df
  coef1 = mod.f.lars$beta
  j = 1
  df.ref = 0
  cnt = 0
  while (j < length(df.lars)){
    if (df.lars[j] > df.ref) {
      cnt = cnt + 1
      yhat.lars = Z[test,] %*% coef1[j,]
      err.lars[b,cnt] = mean((y[test] - yhat.lars)^2)
      df.ref = df.lars[j]
    } else {
      j = j + 1
    }
  }
}

# Baseline: ordinary least squares on the same kind of splits
B = 50
napp = round(n*2/3)
Zdf = data.frame(cbind(Z, y))
colnames(Zdf) = c(colnames(x)[1:(p-2)], "y")
err.lm = matrix(0, B, 1)
for (b in 1:B){
  train = sample(1:n, napp)
  test = setdiff(1:n, train)
  mod = lm(y ~ ., data=Zdf, subset=train)
  yhat = predict(mod, Zdf)[test]
  err.lm[b] = mean((y[test] - yhat)^2)
}

# Lasso vs ridge vs OLS: RMSE against the number of degrees of freedom
plot(1:10, apply(sqrt(err.lars), 2, mean, na.rm=TRUE), pch=20,
     xlab="Nb degrees of freedom", ylab="RMSE",
     ylim=range(apply(sqrt(err), 2, mean), apply(sqrt(err.lars), 2, mean, na.rm=TRUE)),
     xlim=c(1, 9.7))
lines(1:10, apply(sqrt(err.lars), 2, mean, na.rm=TRUE))
points(df[1:12], apply(sqrt(err), 2, mean), pch=18, col="red")
lines(df[1:12], apply(sqrt(err), 2, mean), col="red")
abline(h=mean(sqrt(err.lm)), col="gray", lwd=2)
grid()
legend("topright", c("Lasso","Ridge","LM"), col=c("black","red","gray"), lty=1, pch=c(20,18,NA))
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/CONF_DS/Figures/Ozone_Ridge_Lasso_RMSE.eps",width=6,height=6)

boxplot(sqrt(err.lars), at=1:10, boxwex=.2, col="blue", axes=FALSE)
boxplot(sqrt(err), at=rev(df[1:12]), col="red", add=TRUE, boxwex=.2, axes=FALSE,
        xlab="Nb degrees of freedom", ylab="RMSE")
axis(2)
box()
axis(1)
abline(h=mean(sqrt(err.lm)), col="gray", lwd=2)
grid()
legend("topright", c("Lasso","Ridge","LM"), col=c("blue","red","gray"), lty=1, lwd=c(1,1,2))
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/CONF_DS/Figures/Ozone_Ridge_Lasso_RMSE.eps",width=8,height=6)

# K-fold CV (8 folds) for the lasso
n = nrow(x)
n.f = n/8
yhat.lars = matrix(NA, length(y), 11)
err.lars = matrix(0, 8, 11)
for (v in 1:8) {
  ii.test = ii[(n.f*(v-1)) + (1:n.f)]
  ii.app = setdiff(1:n, ii.test)
  mod.f.lars = lars(Z[ii.app,], y[ii.app], type="lasso")
  for (j in 1:11){
    yhat.lars[ii.test,j] = Z[ii.test,] %*% t(mod.f.lars$beta)[,j]
    err.lars[v,j] = mean((y[ii.test] - yhat.lars[ii.test,j])^2)
  }
}
plot(0:10, sqrt(apply(err.lars, 2, mean)), typ="l", ylab="CV RMSE (8 folds)", xlab="df")
points(0:10, sqrt(apply(err.lars, 2, mean)), pch=20)

# Leftover comparison code: y.hat, y.hat.ridge and y.hat.lars are not defined
# anywhere in this script, so these lines are kept commented out
# sqrt(sum((y-y.hat)^2)/n)
# sqrt(apply((matrix(y,n,51)-y.hat.ridge)^2,2,sum)/n)
# sqrt(apply((matrix(y,n,dim(y.hat.lars)[2])-y.hat.lars)^2,2,sum)/n)
# plot(sqrt(apply((matrix(y,n,51)-y.hat.ridge)^2,2,sum)/n)[rev(1:51)],pch=20)
# abline(h=sqrt(sum((y-y.hat)^2)/n))
# points(sqrt(apply((matrix(y,n,dim(y.hat.lars)[2])-y.hat.lars)^2,2,sum)/n),pch=20,col="red")

# ================================================
# ANN
library(neuralnet)
n = length(Ozone[,1])
appri = sample(1:n, round(n*2/3))
testi = setdiff(1:n, appri)
mod.lin = lm(maxO3 ~ ., data=Ozone, subset=appri)
st = step(mod.lin)
mod.lin.pr = predict(mod.lin, Ozone[testi,])
err.lin = sqrt(mean((mod.lin.pr - Ozone$maxO3[testi])^2))
mod.lin.pr.st = predict(st, Ozone[testi,])
err.lin.st = sqrt(mean((mod.lin.pr.st - Ozone$maxO3[testi])^2))
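# Not in the original script: err.lin and err.lin.st above come from a single
# random split and are therefore noisy. A minimal sketch of averaging the
# full-model error over repeated splits (the neural-network loop below
# resamples in the same way); the number of repetitions is an arbitrary choice.
err.rep = replicate(20, {
  tr = sample(1:n, round(n*2/3))
  te = setdiff(1:n, tr)
  fit = lm(maxO3 ~ ., data=Ozone, subset=tr)
  sqrt(mean((predict(fit, Ozone[te,]) - Ozone$maxO3[te])^2))
})
c(mean=mean(err.rep), sd=sd(err.rep))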
Oz.st = data.frame(maxO3=Ozone$maxO3, T15=Ozone$T15, Vx9=Ozone$Vx9, Ne9=Ozone$Ne9, maxO3v=Ozone$maxO3v)
# Linear model with second-order interactions on the selected variables
mod.nlin = lm(maxO3 ~ .^2, data=Oz.st, subset=appri)
st.nlin = step(mod.nlin)
mod.nlin.pr.st = predict(st.nlin, Ozone[testi,])
err.nlin.st = sqrt(mean((mod.nlin.pr.st - Ozone$maxO3[testi])^2))

# Neural networks (neuralnet): RMSE as a function of the number of hidden units,
# with and without variable selection, over repeated train/test splits
B = 50
kmax = 10
rmse.ann <- rmse.ann.st <- matrix(NA, B, kmax)
for (b in 1:B){
  appri = sample(1:n, round(n*2/3))
  testi = setdiff(1:n, appri)
  # min-max scaling computed on the training set
  min.o = apply(Ozone[appri,1:11], 2, min)
  max.o = apply(Ozone[appri,1:11], 2, max)
  data.sc = Ozone
  for (j in 1:11){
    data.sc[,j] = (Ozone[,j] - min.o[j])/(max.o[j] - min.o[j])
  }
  ann = list()
  ann.st = list()
  data.app = data.sc[appri,]
  data.test = data.sc[testi,-c(1,12,13)]
  data.test.st = data.sc[testi,c(4,5,8,11)]
  y.test = data.sc[testi,1]
  for (k in 1:kmax){
    # all predictors; keep the best of the random restarts as starting weights
    ann[[k]] = neuralnet(maxO3 ~ T9+T12+T15+Ne9+Ne12+Ne15+Vx9+Vx12+Vx15+maxO3v,
                         data=data.app, hidden=k, rep=1) # rep=10
    wm = which.min(ann[[k]]$result.matrix[1,])
    ann[[k]] = neuralnet(maxO3 ~ T9+T12+T15+Ne9+Ne12+Ne15+Vx9+Vx12+Vx15+maxO3v,
                         data=data.app, hidden=k, startweights=ann[[k]]$weights[[wm[1]]])
    y.p.sc = compute(ann[[k]], data.test)
    y.p = y.p.sc$net.result*(max.o[1] - min.o[1]) + min.o[1]
    rmse.ann[b,k] = sqrt(mean((Ozone$maxO3[testi] - y.p)^2))
    # selected predictors only
    ann.st[[k]] = neuralnet(maxO3 ~ T15+Ne9+Vx9+maxO3v, data=data.app, hidden=k, rep=10)
    wm = which.min(ann.st[[k]]$result.matrix[1,])
    ann.st[[k]] = neuralnet(maxO3 ~ T15+Ne9+Vx9+maxO3v, data=data.app, hidden=k,
                            startweights=ann.st[[k]]$weights[[wm[1]]])
    y.p.sc = compute(ann.st[[k]], data.test.st)
    y.p = y.p.sc$net.result*(max.o[1] - min.o[1]) + min.o[1]
    rmse.ann.st[b,k] = sqrt(mean((Ozone$maxO3[testi] - y.p)^2))
  }
}
# best number of hidden units = column with the smallest average RMSE
rmse.k = apply(rmse.ann, 2, mean)
rmse.k.st = apply(rmse.ann.st, 2, mean)
k.opt = which.min(rmse.k)
k.opt.st = which.min(rmse.k.st)
affich = matrix(c("*** Root mean squared error ***",
                  paste("Linear model:", round(err.lin*100)/100),
                  paste("Linear model with variable selection:", round(err.lin.st*100)/100),
                  paste("Non-linear model with variable selection:", round(err.nlin.st*100)/100),
                  paste("Neural network (", k.opt, "):", round(rmse.k[k.opt]*100)/100),
                  paste("Neural network with variable selection (", k.opt.st, "):", round(rmse.k.st[k.opt.st]*100)/100)),
                6, 1)
write(affich, "", sep="\t")

# A network with two hidden layers (3 and 2 units)
ann32 = neuralnet(maxO3 ~ T9+T12+T15+Ne9+Ne12+Ne15+Vx9+Vx12+Vx15+maxO3v,
                  data=data.app, hidden=c(3,2), rep=10)
y.p.sc = compute(ann32, data.test)
y.p = y.p.sc$net.result*(max.o[1] - min.o[1]) + min.o[1]
rmse.ann32 = sqrt(mean((Ozone$maxO3[testi] - y.p)^2))
plot(ann[[3]])
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/CONF_DS/Figures/Ozone_ANN3.eps",width=6,height=6)

## k-nearest-neighbour regression and bagging
# install.packages("FNN")
library(FNN)
plot(Ozone$Vx9, Ozone$maxO3, pch=20, xlab="Vx9", ylab="maxO3")
n = nrow(Ozone)
B = 50
test = matrix(seq(min(Ozone$Vx9), max(Ozone$Vx9), length.out=30), 30, 1)
pred = matrix(0, 30, B)
for (b in 1:B){
  app = sample(1:n, round(2/3*n))
  M = knn.reg(Ozone$Vx9[app], y=Ozone$maxO3[app], test=test, k=7)
  lines(test, M$pred, col="gray")
  pred[,b] = M$pred
}
lines(test, apply(pred, 1, mean), col="red", lwd=2)

B = 50
test = matrix(seq(min(Ozone$Vx9), max(Ozone$Vx9), length.out=30), 30, 1)
pred = matrix(0, 30, B)
for (b in 1:B){
  app = sample(1:n, round(2/3*n))
  M = knn.reg(Ozone$Vx9[app], y=Ozone$maxO3[app], test=test, k=7)
  pred[,b] = M$pred
}
lines(test, apply(pred, 1, mean), col="blue", lwd=2)
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/CONF_DS/Figures/Ozone_bagging_knn.eps",width=6,height=6)
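# Not in the original script: k = 7 above is fixed by hand. A minimal sketch of
# choosing k by leave-one-out CV, relying on FNN::knn.reg performing LOOCV when
# no test set is supplied; the range of k values is an arbitrary choice.
ks = 1:20
rmse.loo = sapply(ks, function(k) {
  loo = knn.reg(matrix(Ozone$Vx9, ncol=1), y=Ozone$maxO3, k=k)
  sqrt(mean((Ozone$maxO3 - loo$pred)^2))
})
plot(ks, rmse.loo, type="b", pch=20, xlab="k", ylab="LOOCV RMSE")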
# Reduction of variance?
plot(Ozone$Vx9, Ozone$maxO3, pch=20, xlab="Vx9", ylab="maxO3")
B = 30
B.bag = 50
pred.bag = matrix(0, B, 30)
test = matrix(seq(min(Ozone$Vx9), max(Ozone$Vx9), length.out=30), 30, 1)
for (b in 1:B) {
  pred = matrix(0, 30, B.bag)
  for (b.bag in 1:B.bag){
    app = sample(1:n, round(2/3*n))
    M = knn.reg(Ozone$Vx9[app], y=Ozone$maxO3[app], test=test, k=7)
    lines(test, M$pred, col="gray")
    pred[,b.bag] = M$pred
  }
  pred.bag[b,] = apply(pred, 1, mean)
}
q1 = apply(pred.bag, 2, quantile, 1/4)
q2 = apply(pred.bag, 2, quantile, 3/4)
polygon(c(test, rev(test)), c(q1, rev(q2)), col="red", border="red")
legend("topleft", c("knn (7), all", "bagging, interquartile"), col=c("gray","red"), lty=1, lwd=2)
points(Ozone$Vx9, Ozone$maxO3, pch=20)
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/CONF_DS/Figures/Ozone_bagging_knn.eps",width=6,height=6)

# Bagging regression trees vs a single (deeper) tree
library(FNN)
library(rpart)
n = nrow(Ozone)
B = 100
B.bag = 40
RMSE = matrix(0, B, B.bag)
RMSEall = NULL
for (b in 1:B){
  pred = matrix(0, n - round(2/3*n), B.bag)
  app = sample(1:n, round(2/3*n)); napp = length(app)
  test = setdiff(1:n, app)
  Mall = rpart(maxO3 ~ ., subset=app, data=Ozone, control=rpart.control(maxdepth=15, cp=.1))
  predall = predict(Mall, Ozone[test,])
  RMSEall[b] = sqrt(mean((Ozone$maxO3[test] - predall)^2))
  for (b.bag in 1:B.bag){
    app.bag = sample(app, napp, replace=TRUE)
    M = rpart(maxO3 ~ ., control=rpart.control(maxdepth=2), subset=app.bag, data=Ozone)
    pred[,b.bag] = predict(M, Ozone[test,])
    if (b.bag > 1) {pred.bag = apply(pred[,1:b.bag], 1, mean)} else {pred.bag = pred[,1]}
    RMSE[b,b.bag] = sqrt(mean((Ozone$maxO3[test] - pred.bag)^2))
  }
}
boxplot(RMSE, xlab="Nb of bagging samples", ylab="RMSE")
q1 = apply(RMSE, 2, quantile, probs=1/4)
q2 = apply(RMSE, 2, quantile, probs=3/4)
q1all = quantile(RMSEall, probs=1/4)
q2all = quantile(RMSEall, probs=3/4)
plot(1:B.bag, q1, col="white", ylim=range(c(q1, q2, q1all, q2all)),
     xlab="Nb of bagging samples", ylab="RMSE")
polygon(c(1:B.bag, rev(1:B.bag)), c(q1, rev(q2)), col="red", border="red")
polygon(c(1:B.bag, rev(1:B.bag)), c(rep(q1all, B.bag), rep(q2all, B.bag)), col="gray", border="gray")
legend("topright", c("tree", "bagging"), col=c("gray", "red"), lty=1, lwd=2)
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/CONF_DS/Figures/Ozone_bagging_tree.eps",width=6,height=6)
pred.bag = apply(pred, 1, mean)
sqrt(mean((Ozone$maxO3[test] - pred.bag)^2))
sqrt(mean((Ozone$maxO3[test] - pred[,1])^2))

# Bagged knn curve overlaid on the current plot
B = 50
test = matrix(seq(min(Ozone$Vx9), max(Ozone$Vx9), length.out=30), 30, 1)
pred = matrix(0, 30, B)
for (b in 1:B){
  app = sample(1:n, round(2/3*n))
  M = knn.reg(Ozone$Vx9[app], y=Ozone$maxO3[app], test=test, k=7)
  pred[,b] = M$pred
}
lines(test, apply(pred, 1, mean), col="blue", lwd=2)
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/CONF_DS/Figures/Ozone_tree_knn.eps",width=6,height=6)

## Random forest
# data(airquality)
library(adabag)
library(randomForest)
set.seed(131)
RF = randomForest(maxO3 ~ ., data=Ozone, mtry=3, importance=TRUE)
print(RF)
barplot(RF$importance[,1])
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/CONF_DS/Figures/Ozone_RF_importance.eps",width=12,height=6)
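# Not in the original script: for regression forests, randomForest also stores
# the out-of-bag MSE after each tree (RF$mse), which gives a convergence check
# without a separate test set. A minimal sketch based on the RF fit above.
plot(sqrt(RF$mse), type="l", xlab="Number of trees", ylab="OOB RMSE")
varImpPlot(RF)   # same information as the barplot above, with both importance measures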
# Single tree vs random forests with 5, 25 and 100 trees
n = nrow(Ozone)
B = 100
RMSE.tree = matrix(0, B, 1)
RMSE.RF = matrix(0, B, 1)
RMSE.RF1 = matrix(0, B, 1)
RMSE.RF2 = matrix(0, B, 1)
mtry = 10
for (b in 1:B){
  app = sample(1:n, round(2/3*n)); napp = length(app)
  test = setdiff(1:n, app)
  Mall = rpart(maxO3 ~ ., subset=app, data=Ozone, control=rpart.control(maxdepth=15, cp=.1))
  predall = predict(Mall, Ozone[test,])
  RMSE.tree[b] = sqrt(mean((Ozone$maxO3[test] - predall)^2))
  RF = randomForest(maxO3 ~ ., data=Ozone, mtry=mtry, subset=app, ntree=100)
  pred = predict(RF, Ozone[test,])
  RMSE.RF2[b] = sqrt(mean((Ozone$maxO3[test] - pred)^2))
  RF = randomForest(maxO3 ~ ., data=Ozone, mtry=mtry, subset=app, ntree=25)
  pred = predict(RF, Ozone[test,])
  RMSE.RF1[b] = sqrt(mean((Ozone$maxO3[test] - pred)^2))
  RF = randomForest(maxO3 ~ ., data=Ozone, mtry=mtry, subset=app, ntree=5)
  pred = predict(RF, Ozone[test,])
  RMSE.RF[b] = sqrt(mean((Ozone$maxO3[test] - pred)^2))
}
D = data.frame(cbind(RMSE.tree, RMSE.RF, RMSE.RF1, RMSE.RF2))
colnames(D) = c("tree", "forest (5 trees)", "forest (25 trees)", "forest (100 trees)")
boxplot(D, ylab="RMSE")
dev.copy2eps(file="~/Dropbox/ENSEIGNEMENT/MACHINE LEARNING/CONF_DS/Figures/Ozone_RF.eps",width=10,height=6)

# Boosting (gbm)
library(gbm)
n = nrow(Ozone)
boost = gbm.fit(Ozone[,-c(1,2)], Ozone$maxO3, distribution="gaussian", shrinkage=0.01,
                nTrain=round(2/3*n), n.trees=2000)
best.iter <- gbm.perf(boost, method="OOB")
pred = predict(boost, Ozone[test,-c(1,2)], n.trees=best.iter)

# RMSE as a function of the number of boosting iterations, over repeated splits
B = 100
n.trees = seq(1, 500, by=50)
RMSE.boost = matrix(0, B, length(n.trees))
for (b in 1:B){
  app = sample(1:n, round(2/3*n)); napp = length(app)
  test = setdiff(1:n, app)
  boost = gbm.fit(Ozone[app,-c(1,2)], Ozone$maxO3[app], distribution="gaussian",
                  shrinkage=0.05, n.trees=max(n.trees), interaction.depth=2)
  pred = predict(boost, Ozone[test,-c(1,2)], n.trees=n.trees)
  RMSE.boost[b,] = sqrt(apply((pred - matrix(Ozone$maxO3[test], length(test), length(n.trees)))^2, 2, mean))
}
# interquartile band of the RMSE against the number of trees
q1 = apply(RMSE.boost, 2, quantile, probs=1/4)
q2 = apply(RMSE.boost, 2, quantile, probs=3/4)
plot(n.trees, q1, col="white", ylim=range(c(q1, q2)), xlab="Nb of boosting iterations", ylab="RMSE")
polygon(c(n.trees, rev(n.trees)), c(q1, rev(q2)), col="red", border="red")
lines(n.trees, apply(RMSE.boost, 2, median), lwd=2)
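# Not in the original script: gbm also reports the relative influence of each
# predictor, analogous to the random-forest importances above. A minimal sketch,
# assuming the last boost fit from the loop above is still in memory.
summary(boost, n.trees=max(n.trees), plotit=TRUE)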