library(MASS) library(ROCR) library(rpart) library(randomForest) library(adabag) library(class) library(FactoMineR) library(knncat) load("~/Dropbox/ENSEIGNEMENT/AnDonnees/DONNEES/panel.Rdata") summary(panel) panel$THISYR = as.factor(panel$THISYR) panel$LASTYR = as.factor(panel$LASTYR) panel$BLACK = as.factor(panel$BLACK) panel$CHILD1 = as.factor(panel$CHILD1) panel$CHILD2 = as.factor(panel$CHILD2) #panel = panel[,-2] m_mod = 8 m_var = 4 acm = MCA(panel,quanti.sup=c(3,4,5),quali.sup=1) #acm = MCA(panel,quanti.sup=c(2,3,4),quali.sup=1) panel_num = matrix(0,nrow(panel),ncol(panel)) panel_num[,1] = panel[,1] for (k in 2:ncol(panel)){ panel_num[,k] = scale(as.numeric(panel[,k])) } colnames(panel_num) = colnames(panel) panel_acm = cbind(panel[,c(1,3,4,5)],acm$ind$coord) #panel_acm = cbind(panel[,c(1,2,3,4)],acm$ind$coord) B = 100 ; kmax = 11 ; err = matrix(0,B,kmax) for (b in 1:B){ appri = sample(1:n,round(4/5*n)) testi = setdiff(1:n,appri) for (k in 1:kmax){ Knn = knn(panel_num[appri,-1], panel_num[testi,-1], panel_num[appri,1], k = k) tab = table(panel[testi,1],Knn) err[b,k] = sum(tab)-sum(diag(tab)) } } boxplot(err/length(testi)) B = 100 ; kmax = 11 ; err = matrix(0,B,kmax) for (b in 1:B){ appri = sample(1:n,round(4/5*n)) testi = setdiff(1:n,appri) for (k in 1:kmax){ Knn = knn(panel_acm[appri,-1], panel_acm[testi,-1], panel_acm[appri,1], k = k) tab = table(panel[testi,1],Knn) err[b,k] = sum(tab)-sum(diag(tab)) } } boxplot(err/length(testi)) B = 30 ; kmax = 5 ; err = matrix(0,B,kmax) for (b in 1:B){ appri = sample(1:n,round(4/5*n)) testi = setdiff(1:n,appri) X = panel[appri,-1] ; y = panel[appri,1] for (k in 1:kmax) { m.rf = randomForest(X,y,mtry=k,ntree=100) tab = table(panel[testi,1],predict(m.rf,panel)[testi]) err[b,k] = sum(tab)-sum(diag(tab)) } } boxplot(err/length(testi)) B = 30 ; kmax = 20 ; err = matrix(0,B,kmax) for (b in 1:B){ appri = sample(1:n,round(4/5*n)) testi = setdiff(1:n,appri) X = panel[appri,-1] ; y = panel[appri,1] for (k in 1:kmax) { m.rf = randomForest(X,y,mtry=2,ntree=k*10) tab = table(panel[testi,1],predict(m.rf,panel)[testi]) err[b,k] = sum(tab)-sum(diag(tab)) } } boxplot(err/length(testi)) colnames(panel_acm) = c("THISYR",'HUBINC',"AGE","EDUC","ACM1","ACM2","ACM3","ACM4") syncat <- knncat (panel[appri,], panel[testi,],classcol=1) table (synpred, panel$THISYR[testi]) B = 10 ; m_max = 50 err = matrix(0,B,m_max) for (b in 1:B){ appri = sample(1:n,round(4/5*n)) testi = setdiff(1:n,appri) panel.app = panel[appri,] panel.test = panel[testi,] m.adaboost <- boosting(THISYR~.,data = panel.app,boos=FALSE, mfinal=m_max) #errorevol(m.adaboost,newdata=panel.app)->evol.train errorevol(m.adaboost,newdata=panel.test)->evol.test err[b,] = evol.test$error } boxplot(err) B = 10 ; m_max = 30 err = matrix(0,B,m_max) for (b in 1:B){ appri = sample(1:n,round(4/5*n)) testi = setdiff(1:n,appri) panel.app = panel[appri,] panel.test = panel[testi,] m.bag <- bagging(THISYR~.,data = panel.app,mfinal=m_max) #errorevol(m.adaboost,newdata=panel.app)->evol.train errorevol(m.bag,newdata=panel.test)->evol.test err[b,] = evol.test$error } boxplot(err) pltroc = function(prob,yobs,add=FALSE,col="black",lty=1){ pred <- prediction(prob, yobs) perf <- performance(pred,"tpr","fpr") plot(perf,lwd=2,add=add,col=col,lty=lty) auc = performance(pred,"auc")@y.values[[1]] auc } comp.err = function(prob,yobs){ pred <- prediction(prob, yobs) perf <- performance(pred,"err") i = which.min(perf@y.values[[1]]) threshold = perf@x.values[[1]][i] return(list(threshold=threshold,err = perf@y.values[[1]][i])) } set.seed(123) n = nrow(panel) B = 10 p.lda = p.lda.num = p.qda = p.glm = p.glm.st = p.glm2 = p.glm.st2 = p.tre = p.rf = p.adaboost = p.knn = p.glm.acm = p.bag = NULL e.lda = e.lda.num = e.qda = e.glm = e.glm.st = e.glm2 = e.glm.st2 = e.tre = e.rf = e.adaboost = e.knn = e.glm.acm = e.bag = NULL y_test = NULL for (b in 1:B){ appri = sample(1:n,round(4/5*n)) testi = setdiff(1:n,appri) ytmp = panel$THISYR[testi] y_test = c(y_test,ytmp) #table(panel$THISYR) #table(panel$THISYR[appri]) m.lda = lda(THISYR~.,data=panel,subset=appri) p.lda = c(p.lda,predict(m.lda,panel)$posterior[testi,2]) tmp = comp.err(predict(m.lda,panel)$posterior[testi,2],ytmp) e.lda = c(e.lda,tmp$err) # m.lda.num = lda(THISYR~.,data=as.data.frame(panel_num),subset=appri) # p.lda.num = c(p.lda.num,predict(m.lda.num,as.data.frame(panel_num))$posterior[testi,2]) # tmp = comp.err(predict(m.lda.num,as.data.frame(panel_num))$posterior[testi,2],ytmp) # e.lda.num = c(e.lda.num,tmp$err) # m.qda = qda(THISYR~.,data=panel,subset=appri) # p.qda = c(p.qda,predict(m.qda,panel)$posterior[testi,2]) m.glm.acm = glm(THISYR~.,data=panel_acm,subset=appri,family="binomial") p.glm.acm = c(p.glm.acm,predict(m.glm.acm,panel_acm,type="respons")[testi]) tmp = comp.err(predict(m.glm.acm,panel_acm,type="respons")[testi],ytmp) e.glm.acm = c(e.glm.acm,tmp$err) m.glm = glm(THISYR~.,data=panel,subset=appri,family="binomial") p.glm = c(p.glm,predict(m.glm,panel,type="respons")[testi]) tmp = comp.err(predict(m.glm,panel,type="respons")[testi],ytmp) e.glm = c(e.glm,tmp$err) # m.glm.st = step(m.glm) # p.glm.st = c(p.glm.st,predict(m.glm.st,panel,type="respons")[testi]) m.glm2 = glm(THISYR~.^2,data=panel,subset=appri,family="binomial") p.glm2 = c(p.glm2,predict(m.glm2,panel,type="respons")[testi]) tmp = comp.err(predict(m.glm2,panel,type="respons")[testi],ytmp) e.glm2 = c(e.glm2,tmp$err) # m.glm.st2 = step(m.glm2) # p.glm.st2 = c(p.glm.st2,predict(m.glm.st2,panel,type="respons")[testi]) m.tre = rpart(THISYR~.,data=panel,subset=appri) p.tre = c(p.tre,predict(m.tre,panel,typ="prob")[testi,2]) tmp = comp.err(predict(m.tre,panel,typ="prob")[testi,2],ytmp) e.tre = c(e.tre,tmp$err) X = panel[appri,-1] ; y = panel[appri,1] m.rf = randomForest(X,y,mtry=2,ntree=160) p.rf = c(p.rf,predict(m.rf,panel,typ="prob")[testi,2]) tmp = comp.err(predict(m.rf,panel,typ="prob")[testi,2],ytmp) e.rf = c(e.rf,tmp$err) panel.app = panel[appri,] panel.test = panel[testi,] m.adaboost <- boosting(THISYR~.,data = panel.app,boos=FALSE, mfinal=4) p.adaboost = c(p.adaboost,predict(m.adaboost,panel.test)$prob[,2]) tmp = comp.err(predict(m.adaboost,panel.test)$prob[,2],ytmp) e.adaboost = c(e.adaboost,tmp$err) Knn = knn(panel_num[appri,-1], panel_num[testi,-1], panel_num[appri,1], k = 7,prob=TRUE) p.knn = c(p.knn,attributes(Knn)$prob) tmp = comp.err(attributes(Knn)$prob,ytmp) e.knn = c(e.knn,tmp$err) m.bag <- bagging(THISYR~.,data = panel.app,mfinal=13) p.bag = c(p.bag,predict(m.bag,panel.test)$prob[,2]) tmp = comp.err(predict(m.bag,panel.test)$prob[,2],ytmp) e.bag = c(e.bag,tmp$err) } auc.lda = pltroc(p.lda, y_test) #auc.lda.num = pltroc(p.lda.num, y_test,col="darkgray") #pltroc(p.qda, y_test,add=TRUE,col="gray") auc.glm = pltroc(p.glm, y_test,add=TRUE,col="red") auc.glm2 = pltroc(p.glm2, y_test,add=TRUE,col="orange") # pltroc(p.glm.st2, y_test,add=TRUE,col="yellow") auc.tree = pltroc(p.tre,y_test,add=TRUE,col="blue") auc.rf = pltroc(p.rf,y_test,add=TRUE,col="cyan") auc.adaboost = pltroc(p.adaboost,y_test,add=TRUE,col="magenta") auc.bag = pltroc(p.bag,y_test,add=TRUE,col="purple") auc.knn = pltroc(p.knn,y_test,add=TRUE,col="grey") legend("bottomright",legend=c(paste("lda ",round(auc.lda*100)/100),paste("glm ",round(auc.glm*100)/100),paste("glm2 ",round(auc.glm2*100)/100),paste("tree ",round(auc.tree*100)/100),paste("RF ",round(auc.rf*100)/100),paste("adaboost",round(auc.adaboost*100)/100),paste("bagging",round(auc.bag*100)/100),paste("knn ",round(auc.knn*100)/100)),text.col=c("black","red","orange","blue","cyan","magenta","purple","grey"),xjust=1) dev.new() E = cbind(e.lda,e.glm,e.glm.acm,e.glm2,e.tre,e.rf,e.adaboost,e.bag,e.knn) colnames(E) = c("lda","glm","glm.acm","glm2","tree","RF","adaboost","bagging","knn") boxplot(E)