#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 27 11:51:27 2018

@author: valerie
"""
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import urlopen

url = "https://perso.univ-rennes1.fr/valerie.monbet/doc/cours/Biscuits.csv"
biscuits = np.loadtxt(urlopen(url), skiprows=1, delimiter=";")

# Extract the fat column
fat = biscuits[:, 0]
# Extract the explanatory variables (NIR spectra)
X = biscuits[:, 1:]

# Plot one spectrum
plt.figure(1)
plt.plot(X[1, :])
plt.title("An example spectrum")

# In the following plot, the color varies with the fat percentage
fatn = (fat - min(fat)) / (max(fat) - min(fat))
colors = plt.cm.inferno(fatn)
plt.figure(2)
for i in range(len(fat)):
    plt.plot(X[i, :], color=colors[i])
plt.title("NIR spectra")
plt.ylabel("Absorbances")
plt.show()

y = fat

# Bagging =====================================================================
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import train_test_split

B = 30       # number of random train/test splits
n_test = 4   # number of test samples per split
mse_bag = np.zeros(B)
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, fat, test_size=n_test)
    # Standardize both sets with the training statistics
    mk = np.mean(X_train, axis=0)
    sk = np.maximum(np.std(X_train, axis=0), 10 * np.finfo(float).eps)
    X_train, X_test = (X_train - mk) / sk, (X_test - mk) / sk
    clf = BaggingRegressor(n_estimators=20)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse_bag[b] = np.mean((y_pred - y_test) ** 2)
# Root mean squared error averaged over the B splits
print("Error of bagging regression:", np.sqrt(np.mean(mse_bag)))

# Random Forest ===============================================================
from sklearn.ensemble import RandomForestRegressor

B = 30
n_test = 4
mse_rf = np.zeros(B)
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, fat, test_size=n_test)
    mk = np.mean(X_train, axis=0)
    sk = np.maximum(np.std(X_train, axis=0), 10 * np.finfo(float).eps)
    X_train, X_test = (X_train - mk) / sk, (X_test - mk) / sk
    clf = RandomForestRegressor(n_estimators=20, max_depth=5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse_rf[b] = np.mean((y_pred - y_test) ** 2)
print("Error of random forest regression:", np.sqrt(np.mean(mse_rf)))

# =============================================================================
# Another way to select the hyper-parameters,
# more efficient from a computation-time point of view
# =============================================================================
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Note: with scikit-learn >= 1.2 the BaggingRegressor parameter is named
# `estimator`, so the keys below become 'estimator__max_leaf_nodes', etc.
params = {'n_estimators': [10, 100, 500],
          'base_estimator__max_leaf_nodes': [10, 15],
          'base_estimator__max_depth': [4, 5, 6]}
clf = GridSearchCV(BaggingRegressor(DecisionTreeRegressor(), max_features=5),
                   param_grid=params,
                   cv=ShuffleSplit(test_size=.25, n_splits=30, random_state=1),
                   scoring="neg_mean_squared_error",
                   refit=False, n_jobs=-1)
clf.fit(X, y)
print("Best parameters:", clf.best_params_)
print("Best score:", -np.round(clf.best_score_, 2))
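
# =============================================================================
# Sketch (not in the original script): GridSearchCV was run with refit=False,
# so no fitted model is kept. Assuming the pre-1.2 parameter names used above,
# the selected configuration can be rebuilt by hand and its error estimated
# with cross_val_score over the same ShuffleSplit scheme. The construction
# below is an illustrative assumption, not the author's code.
# =============================================================================
from sklearn.model_selection import cross_val_score

best = clf.best_params_
final_model = BaggingRegressor(
    DecisionTreeRegressor(max_depth=best['base_estimator__max_depth'],
                          max_leaf_nodes=best['base_estimator__max_leaf_nodes']),
    n_estimators=best['n_estimators'],
    max_features=5)
# Cross-validated MSE (negated by scikit-learn's sign convention)
scores = cross_val_score(final_model, X, y,
                         scoring="neg_mean_squared_error",
                         cv=ShuffleSplit(test_size=.25, n_splits=30, random_state=1))
print("RMSE of the selected bagging model:", np.sqrt(-np.mean(scores)))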
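
# =============================================================================
# Sketch (not in the original script): a fitted RandomForestRegressor exposes
# feature_importances_, which indicates which wavelengths of the NIR spectra
# contribute most to the fat prediction. Figure number 3 and the hyper-
# parameters (reused from the random forest section) are assumptions.
# =============================================================================
rf = RandomForestRegressor(n_estimators=20, max_depth=5)
rf.fit(X, y)
plt.figure(3)
plt.plot(rf.feature_importances_)
plt.title("Random forest variable importances")
plt.xlabel("Wavelength index")
plt.ylabel("Importance")
plt.show()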