#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 27 11:51:27 2018

@author: valerie
"""
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import urlopen
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
import xgboost as xgb

# Load the NIR biscuit data set
url = "https://perso.univ-rennes1.fr/valerie.monbet/doc/cours/Biscuits.csv"
biscuits = np.loadtxt(urlopen(url), skiprows=1, delimiter=";")

# Extract the fat column (response variable)
fat = biscuits[:, 0]
# Extract the explanatory variables (NIR spectra)
X = biscuits[:, 1:]

# Plot one spectrum
plt.figure(1)
plt.plot(X[1, :])
plt.title("An example of a spectrum")

# In the following plot, the color varies with the fat percentage
fatn = (fat - np.min(fat)) / (np.max(fat) - np.min(fat))
colors = plt.cm.inferno(fatn)
plt.figure(2)
for i in range(len(fat)):
    plt.plot(X[i, :], color=colors[i])
plt.title("NIR spectra")
plt.ylabel("Absorbances")
plt.show()

#%%
# AdaBoost: estimate the test error over B random train/test splits
B = 100
n_test = 4
mse_AdaBoost = np.zeros(B)
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, fat, test_size=n_test)
    # Standardize with the training mean and standard deviation
    mk = np.mean(X_train, axis=0)
    sk = np.maximum(np.std(X_train, axis=0), 10 * np.finfo(float).eps)
    X_train, X_test = X_train - mk, X_test - mk
    X_train, X_test = X_train / sk, X_test / sk
    clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=2), n_estimators=20)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse_AdaBoost[b] = np.mean((y_pred - y_test.ravel()) ** 2)
print("Error of AdaBoost: ", np.sqrt(np.mean(mse_AdaBoost)))

#%%
# Gradient Boosting
mse_gb = np.zeros(B)
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, fat, test_size=n_test)
    mk = np.mean(X_train, axis=0)
    sk = np.maximum(np.std(X_train, axis=0), 10 * np.finfo(float).eps)
    X_train, X_test = X_train - mk, X_test - mk
    X_train, X_test = X_train / sk, X_test / sk
    clf = GradientBoostingRegressor(n_estimators=200, max_features=5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse_gb[b] = np.mean((y_pred - y_test.ravel()) ** 2)
print("Error of gradient boosting: ", np.sqrt(np.mean(mse_gb)))

#%%
# eXtreme Gradient Boosting
mse_xgb = np.zeros(B)
ncol = X.shape[1]
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, fat, test_size=n_test)
    mk = np.mean(X_train, axis=0)
    sk = np.maximum(np.std(X_train, axis=0), 10 * np.finfo(float).eps)
    X_train, X_test = X_train - mk, X_test - mk
    X_train, X_test = X_train / sk, X_test / sk
    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dtest = xgb.DMatrix(data=X_test, label=y_test)
    # Booster parameters; the number of boosting rounds is set by num_round below
    params = {'max_depth': 2, 'eta': 1, 'objective': 'reg:squarederror', 'alpha': 0.2}
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]
    num_round = 30
    bst = xgb.train(params, dtrain, num_round, evals=watchlist,
                    early_stopping_rounds=3, verbose_eval=False)
    y_pred = bst.predict(dtest)
    mse_xgb[b] = np.mean((y_pred - y_test.ravel()) ** 2)
print("Error of eXtreme gradient boosting: ", np.sqrt(np.mean(mse_xgb)))
print("Error of gradient boosting: ", np.sqrt(np.mean(mse_gb)))
print("Error of AdaBoost: ", np.sqrt(np.mean(mse_AdaBoost)))

#%%
# Another way to select the hyperparameters:
# - more efficient in terms of computation time
# - better exploration of the various possibilities
params = {
    'max_depth': [2, 5],
    'subsample': [0.4],
    'colsample_bytree': [.05, .1, .2],
    'n_estimators': [10, 100, 1000],
    'reg_alpha': [0.05, .1, 1]
}
xgb_reg = xgb.XGBRegressor()
rs = GridSearchCV(xgb_reg, params,
                  cv=ShuffleSplit(test_size=.25, n_splits=30, random_state=1),
                  scoring="neg_mean_squared_error", refit=False, n_jobs=-1)
rs.fit(X, fat)
print("Best parameters :", rs.best_params_)
print("Best RMSE :", np.round(np.sqrt(-rs.best_score_), 2))
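#%%
# A minimal follow-up sketch: refit an XGBRegressor with the parameters selected
# by the grid search on a fresh train/test split and report the test RMSE,
# mirroring the repeated-split loops above. The split size and random_state are
# illustrative assumptions, not values from the original experiment.
X_train, X_test, y_train, y_test = train_test_split(X, fat, test_size=n_test, random_state=1)
best_xgb = xgb.XGBRegressor(**rs.best_params_)  # reuse the selected hyperparameters
best_xgb.fit(X_train, y_train)
y_pred = best_xgb.predict(X_test)
print("Test RMSE with the selected parameters:",
      np.round(np.sqrt(np.mean((y_pred - y_test.ravel()) ** 2)), 2))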