#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 11 13:48:37 2019

@author: valerie
"""

import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from urllib.request import urlopen


# Data importation
# On lit le tableau comme un tableau de chaînes de caractères,
# on extrait les colonnes numériques, 
# on fabrique artisanalement les dummies avec les colonnes catégorielles,
# on reconstitue le tout.
url="https://perso.univ-rennes1.fr/bernard.delyon/tp/ozone.txt"
Ozone = np.loadtxt(urlopen(url),skiprows=1,dtype=np.ndarray)
nomvar=np.genfromtxt(urlopen(url),max_rows=1,dtype='str')
y=Ozone[:,[1]].astype(float)
X=Ozone[:,2:-2]
vent=Ozone[:,[-2]]
VO=(vent=='Ouest')*1   # categorical variables have to be recoded
VS=(vent=='Sud')*1
VE=(vent=='Est')*1
P=(Ozone[:,[-1]]=='Pluie')*1
X=np.concatenate((X,VO,VE,VS,P),axis=1).astype(float)
# Noms des variables explicatives
nomvar=np.concatenate((nomvar[1:-2],["Ouest"],["Sud"],["Est"],["Pluie"]))


lr = linear_model.LinearRegression(fit_intercept=True)

B = 30 # number of samples for the cross validation
mse_lr, mse_pca = np.zeros(B),np.zeros(B)
for b in range(0,B):
    # sampling for train, test sets
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)

    # standardization
    numvar = np.arange(10) # list of quantitative variables
    mk=np.mean(X_train[:,numvar],axis=0)
    sk=np.maximum(np.std(X_train[:,numvar],axis=0),10*np.finfo(float).eps)
    X_train[:,numvar], X_test[:,numvar] = np.add(X_train[:,numvar],-mk), np.add(X_test[:,numvar],-mk)
    X_train[:,numvar], X_test[:,numvar] = np.multiply(X_train[:,numvar],1/sk),np.multiply(X_test[:,numvar],1/sk)
    
    # learning linear model
    lr.fit(X_train,y_train) # fit the linear model
    y_pred = lr.predict(X_test) # prediction for the test set
    mse_lr[b] = np.mean((y_test-y_pred)**2) # compute MSE
    
    y_pred_naive = np.mean(y_train)
    mse_naive = np.mean((y_test-y_pred_naive)**2)
    
    
print("Error of linear regression: ", str(np.round(np.sqrt(np.mean(mse_lr)),2)))
print("Error of naive prediction: ", str(np.round(np.sqrt(np.mean(mse_naive)),2)))