#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 27 11:51:27 2018

@author: valerie
"""
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import urlopen

url = "https://perso.univ-rennes1.fr/valerie.monbet/doc/cours/Biscuits.csv"
biscuits = np.loadtxt(urlopen(url), skiprows=1, delimiter=";")

# Extract the fat column
fat = biscuits[:, 0]
# Extract the explanatory variables (NIR spectra)
X = biscuits[:, 1:]

# Plot one spectrum
plt.figure(1)
plt.plot(X[1, :])
plt.title("An example spectrum")

# In the following plot, the color varies with the fat percentage
fatn = (fat - min(fat)) / (max(fat) - min(fat))
colors = plt.cm.inferno(fatn)
plt.figure(2)
for i in range(len(fat)):
    plt.plot(X[i, :], color=colors[i])
plt.title("NIR spectra")
plt.ylabel("Absorbances")
plt.show()

y = fat

# Bagging =====================================================================
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import train_test_split

B = 30       # number of random train/test splits
n_test = 4   # number of test samples per split
mse_bag = np.zeros(B)
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, fat, test_size=n_test)
    # Standardize both sets with the training statistics
    mk = np.mean(X_train, axis=0)
    sk = np.maximum(np.std(X_train, axis=0), 10 * np.finfo(float).eps)
    X_train, X_test = (X_train - mk) / sk, (X_test - mk) / sk
    clf = BaggingRegressor(n_estimators=20)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse_bag[b] = np.mean((y_pred - y_test) ** 2)
# Root mean squared error averaged over the B splits
print("Error of bagging regression:", np.sqrt(np.mean(mse_bag)))

# Random Forest ===============================================================
from sklearn.ensemble import RandomForestRegressor

B = 30
n_test = 4
mse_rf = np.zeros(B)
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, fat, test_size=n_test)
    mk = np.mean(X_train, axis=0)
    sk = np.maximum(np.std(X_train, axis=0), 10 * np.finfo(float).eps)
    X_train, X_test = (X_train - mk) / sk, (X_test - mk) / sk
    clf = RandomForestRegressor(n_estimators=20, max_depth=5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse_rf[b] = np.mean((y_pred - y_test) ** 2)
print("Error of random forest regression:", np.sqrt(np.mean(mse_rf)))

# =============================================================================
# Another way to select the hyper-parameters,
# more efficient from a computation-time point of view
# =============================================================================
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Note: with scikit-learn >= 1.2 the BaggingRegressor parameter is named
# `estimator`, so the keys below become 'estimator__max_leaf_nodes', etc.
params = {'n_estimators': [10, 100, 500],
          'base_estimator__max_leaf_nodes': [10, 15],
          'base_estimator__max_depth': [4, 5, 6]}
clf = GridSearchCV(BaggingRegressor(DecisionTreeRegressor(), max_features=5),
                   param_grid=params,
                   cv=ShuffleSplit(test_size=.25, n_splits=30, random_state=1),
                   scoring="neg_mean_squared_error",
                   refit=False, n_jobs=-1)
clf.fit(X, y)
print("Best parameters:", clf.best_params_)
print("Best score:", -np.round(clf.best_score_, 2))
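
# =============================================================================
# Sketch (not in the original script): GridSearchCV was run with refit=False,
# so no fitted model is kept. Assuming the pre-1.2 parameter names used above,
# the selected configuration can be rebuilt by hand and its error estimated
# with cross_val_score over the same ShuffleSplit scheme. The construction
# below is an illustrative assumption, not the author's code.
# =============================================================================
from sklearn.model_selection import cross_val_score

best = clf.best_params_
final_model = BaggingRegressor(
    DecisionTreeRegressor(max_depth=best['base_estimator__max_depth'],
                          max_leaf_nodes=best['base_estimator__max_leaf_nodes']),
    n_estimators=best['n_estimators'],
    max_features=5)
# Cross-validated MSE (negated by scikit-learn's sign convention)
scores = cross_val_score(final_model, X, y,
                         scoring="neg_mean_squared_error",
                         cv=ShuffleSplit(test_size=.25, n_splits=30, random_state=1))
print("RMSE of the selected bagging model:", np.sqrt(-np.mean(scores)))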
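
# =============================================================================
# Sketch (not in the original script): a fitted RandomForestRegressor exposes
# feature_importances_, which indicates which wavelengths of the NIR spectra
# contribute most to the fat prediction. Figure number 3 and the hyper-
# parameters (reused from the random forest section) are assumptions.
# =============================================================================
rf = RandomForestRegressor(n_estimators=20, max_depth=5)
rf.fit(X, y)
plt.figure(3)
plt.plot(rf.feature_importances_)
plt.title("Random forest variable importances")
plt.xlabel("Wavelength index")
plt.ylabel("Importance")
plt.show()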