#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 27 11:51:27 2018

@author: valerie
"""

import numpy as np
import matplotlib.pyplot as plt
from urllib.request import urlopen
# Load the Biscuits data set into a 2-D float array.
# The file is ';'-separated and its first line is skipped (skiprows=1,
# presumably a header row -- confirm against the CSV).
url = "https://perso.univ-rennes1.fr/valerie.monbet/doc/cours/Biscuits.csv"
fichier = "/Users/valerie/ENSEIGNEMENT/RADO/Biscuits.csv"

# Prefer the local copy when it is available; otherwise download the file.
# Previously both loads ran unconditionally, so the script crashed on any
# machine where the local path does not exist.
try:
    biscuits = np.loadtxt(fichier, skiprows=1, delimiter=";")
except OSError:
    # The context manager closes the HTTP response once the data is parsed.
    with urlopen(url) as response:
        biscuits = np.loadtxt(response, skiprows=1, delimiter=";")

# Extract the fat column (response variable): first column of the data.
fat = biscuits[:, 0]
# Extract the explanatory variables: all remaining columns (one spectrum per row).
X = biscuits[:, 1:]

# Plot a single spectrum as an example (row of index 1).
plt.figure(1)
plt.plot(X[1, :])
plt.title("Un exemple de spectre")

# In the following plot, the color varies according to the fat percentage.
# Scale fat by its maximum so the largest value maps to the top of the colormap.
fatn = fat / np.max(fat)
colors = plt.cm.inferno(fatn)
plt.figure(2)
for spectrum, color in zip(X, colors):
    plt.plot(spectrum, color=color)
plt.title("Spectres NIR")
plt.ylabel("Absorbances")
plt.show()

# knn ==========================================================
# Estimate the prediction error of k-nearest-neighbors regression for
# several values of k, using repeated random train/test splits.
from sklearn import neighbors
from sklearn.model_selection import train_test_split

nb_neighbors = [1, 5, 10, 20]  # candidate numbers of neighbors

B = 30  # number of random train/test splits for the cross validation
n_test = 9  # number of observations held out in each split
# mse_knn[b, i]: test mean squared error of split b with nb_neighbors[i].
mse_knn = np.zeros((B, len(nb_neighbors)))
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, fat, test_size=n_test)
    # Standardize both sets with statistics computed on the training set only,
    # so that no information from the test set leaks into the model.
    mk = np.mean(X_train, axis=0)
    # Lower-bound the standard deviation to avoid dividing by zero on
    # constant columns.
    sk = np.maximum(np.std(X_train, axis=0), 10 * np.finfo(float).eps)
    X_train, X_test = (X_train - mk) / sk, (X_test - mk) / sk
    for i, k in enumerate(nb_neighbors):
        knn = neighbors.KNeighborsRegressor(k)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        mse_knn[b, i] = np.mean((y_pred - y_test) ** 2)

# One box per value of k, summarizing the B test errors. A dedicated figure
# and an explicit show() are needed because the previous figures have
# already been displayed.
plt.figure(3)
plt.boxplot(mse_knn, labels=nb_neighbors)
plt.ylabel('Mean squared error')
plt.xlabel('Nb of neighbors')
plt.show()

# Root mean squared error of the best k (smallest average MSE over the splits).
print("Error of knn method: ", np.sqrt(np.min(np.mean(mse_knn, axis=0))))

# This result could probably be improved by running knn in a lower-dimensional space.

# Decision Tree ============================================================
# Estimate the prediction error of a regression tree using repeated random
# train/test splits.
from sklearn import tree
from sklearn.model_selection import train_test_split

B = 30  # number of random train/test splits
n_test = 9  # number of observations held out in each split
mse_tree = np.zeros(B)  # test mean squared error of each split
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, fat, test_size=n_test)
    # Standardize both sets with statistics computed on the training set only.
    mk = np.mean(X_train, axis=0)
    # Lower-bound the standard deviation to avoid dividing by zero on
    # constant columns.
    sk = np.maximum(np.std(X_train, axis=0), 10 * np.finfo(float).eps)
    X_train, X_test = (X_train - mk) / sk, (X_test - mk) / sk
    clf = tree.DecisionTreeRegressor()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse_tree[b] = np.mean((y_pred - y_test) ** 2)

# Root of the mean squared error averaged over the B splits.
print("Error of tree regression: ", np.sqrt(np.mean(mse_tree)))