#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 27 11:51:27 2018

@author: valerie
"""
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import urlopen
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
import xgboost as xgb

# Load the NIR biscuit data set
url = "https://perso.univ-rennes1.fr/valerie.monbet/doc/cours/Biscuits.csv"
biscuits = np.loadtxt(urlopen(url), skiprows=1, delimiter=";")

# Extract the fat column (response variable)
fat = biscuits[:, 0]
# Extract the explanatory variables (NIR spectra)
X = biscuits[:, 1:]

# Plot one spectrum
plt.figure(1)
plt.plot(X[1, :])
plt.title("An example of a spectrum")

# In the following plot, the color varies with the fat percentage
fatn = (fat - np.min(fat)) / (np.max(fat) - np.min(fat))
colors = plt.cm.inferno(fatn)
plt.figure(2)
for i in range(len(fat)):
    plt.plot(X[i, :], color=colors[i])
plt.title("NIR spectra")
plt.ylabel("Absorbances")
plt.show()

#%%
# AdaBoost: estimate the test error over B random train/test splits
B = 100
n_test = 4
mse_AdaBoost = np.zeros(B)
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, fat, test_size=n_test)
    # Standardize with the training mean and standard deviation
    mk = np.mean(X_train, axis=0)
    sk = np.maximum(np.std(X_train, axis=0), 10 * np.finfo(float).eps)
    X_train, X_test = X_train - mk, X_test - mk
    X_train, X_test = X_train / sk, X_test / sk
    clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=2), n_estimators=20)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse_AdaBoost[b] = np.mean((y_pred - y_test.ravel()) ** 2)
print("Error of AdaBoost: ", np.sqrt(np.mean(mse_AdaBoost)))

#%%
# Gradient Boosting
mse_gb = np.zeros(B)
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, fat, test_size=n_test)
    mk = np.mean(X_train, axis=0)
    sk = np.maximum(np.std(X_train, axis=0), 10 * np.finfo(float).eps)
    X_train, X_test = X_train - mk, X_test - mk
    X_train, X_test = X_train / sk, X_test / sk
    clf = GradientBoostingRegressor(n_estimators=200, max_features=5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse_gb[b] = np.mean((y_pred - y_test.ravel()) ** 2)
print("Error of gradient boosting: ", np.sqrt(np.mean(mse_gb)))

#%%
# eXtreme Gradient Boosting
mse_xgb = np.zeros(B)
ncol = X.shape[1]
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, fat, test_size=n_test)
    mk = np.mean(X_train, axis=0)
    sk = np.maximum(np.std(X_train, axis=0), 10 * np.finfo(float).eps)
    X_train, X_test = X_train - mk, X_test - mk
    X_train, X_test = X_train / sk, X_test / sk
    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dtest = xgb.DMatrix(data=X_test, label=y_test)
    # Booster parameters; the number of boosting rounds is set by num_round below
    params = {'max_depth': 2, 'eta': 1, 'objective': 'reg:squarederror', 'alpha': 0.2}
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]
    num_round = 30
    bst = xgb.train(params, dtrain, num_round, evals=watchlist,
                    early_stopping_rounds=3, verbose_eval=False)
    y_pred = bst.predict(dtest)
    mse_xgb[b] = np.mean((y_pred - y_test.ravel()) ** 2)
print("Error of eXtreme gradient boosting: ", np.sqrt(np.mean(mse_xgb)))
print("Error of gradient boosting: ", np.sqrt(np.mean(mse_gb)))
print("Error of AdaBoost: ", np.sqrt(np.mean(mse_AdaBoost)))

#%%
# Another way to select the hyperparameters:
# - more efficient in terms of computation time
# - better exploration of the various possibilities
params = {
    'max_depth': [2, 5],
    'subsample': [0.4],
    'colsample_bytree': [.05, .1, .2],
    'n_estimators': [10, 100, 1000],
    'reg_alpha': [0.05, .1, 1]
}
xgb_reg = xgb.XGBRegressor()
rs = GridSearchCV(xgb_reg, params,
                  cv=ShuffleSplit(test_size=.25, n_splits=30, random_state=1),
                  scoring="neg_mean_squared_error", refit=False, n_jobs=-1)
rs.fit(X, fat)
print("Best parameters :", rs.best_params_)
print("Best RMSE :", np.round(np.sqrt(-rs.best_score_), 2))
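#%%
# A minimal follow-up sketch: refit an XGBRegressor with the parameters selected
# by the grid search on a fresh train/test split and report the test RMSE,
# mirroring the repeated-split loops above. The split size and random_state are
# illustrative assumptions, not values from the original experiment.
X_train, X_test, y_train, y_test = train_test_split(X, fat, test_size=n_test, random_state=1)
best_xgb = xgb.XGBRegressor(**rs.best_params_)  # reuse the selected hyperparameters
best_xgb.fit(X_train, y_train)
y_pred = best_xgb.predict(X_test)
print("Test RMSE with the selected parameters:",
      np.round(np.sqrt(np.mean((y_pred - y_test.ravel()) ** 2)), 2))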