library(FactoMineR)
library(glmnet)


# Remote copy of the biscuits data set.
url <- "https://perso.univ-rennes1.fr/valerie.monbet/doc/cours/Biscuits.csv"
# Prefer the local copy when it is present on this machine; otherwise keep the
# remote address so the script still runs elsewhere.
local_copy <- "~/Dropbox/ENSEIGNEMENT/RADO/Biscuits.csv"
if (file.exists(local_copy)) {
  url <- local_copy
}
# The file is semicolon-separated.
biscuits <- read.csv(url, sep = ";")
# Extract the fat column (first column of the file)
fat <- biscuits[, 1]
# Extract the explanatory variables (every column except the first)
X <- biscuits[, -1]
# Plot a single spectrum (row 2 of X)

plot(as.numeric(X[2, ]), type = "l")
title("Un exemple de spectre")
# In the following plot, the color varies according to the fat percent
fatn <- fat / max(fat)
# Twice as many rainbow colors as samples are generated; only the first
# length(fat) of them are used, in reversed order.
colors <- rainbow(length(fat) * 2)[rev(seq_along(fat))]
# The rank of each fat value selects its color.
i.col <- rank(fat)
dev.new()
# Common y-axis range over all spectra, so that curves drawn after the first
# one are not clipped by an axis sized for the first spectrum only.
y_range <- range(as.matrix(X), na.rm = TRUE)
plot(as.numeric(X[1, ]), col = colors[i.col[1]], type = "l",
     ylab = "Absorbances", ylim = y_range)
# seq_along(fat)[-1] is empty when there is a single sample, whereas
# 2:length(fat) would iterate over c(2, 1). lines() only adds to the existing
# plot, so it takes no axis label.
for (i in seq_along(fat)[-1]) {
  lines(as.numeric(X[i, ]), col = colors[i.col[i]])
}
title("Spectres NIR")

#=============================================================================
train_test_split <- function(X, y, test_size = .25, random_state = NULL) {
  # Randomly split a data set into a train part and a test part.
  #
  # Inputs : X : matrix or data frame, one row per sample
  #          y : response, indexed like the rows of X
  #          test_size : a fraction (<= 1) of the total set, or a number of
  #                      samples (> 1) ; default 0.25
  #          random_state : if not NULL, it is passed to set.seed() so that
  #                         the split is reproducible ; default NULL
  # Outputs : list with elements X_train, X_test, y_train, y_test.
  #           X_train and X_test always keep the two-dimensional type of X,
  #           even when they contain a single row or a single column.
  #
  n <- nrow(X)
  if (is.null(n)) {
    stop("X must be a matrix or a data frame", call. = FALSE)
  }
  stopifnot(is.numeric(test_size), length(test_size) == 1L, test_size >= 0)

  # A value above 1 is a number of samples, anything else a fraction of n.
  n_test <- if (test_size > 1) round(test_size) else round(n * test_size)
  if (n_test > n) {
    stop("test_size asks for more samples than X has rows", call. = FALSE)
  }

  if (!is.null(random_state)) {
    set.seed(random_state)
  }
  # sample.int(n, k) draws the same indices as sample(1:n, k) for n >= 1 and
  # is also well defined for n == 0.
  itest <- sample.int(n, n_test)
  itrain <- setdiff(seq_len(n), itest)

  # drop = FALSE prevents a one-row or one-column subset from collapsing to a
  # plain vector.
  list(
    X_train = X[itrain, , drop = FALSE],
    X_test = X[itest, , drop = FALSE],
    y_train = y[itrain],
    y_test = y[itest]
  )
}
#=============================================================================

# Find the wavelengths with highest correlation with fat percent ==============
p <- ncol(X)
# The threshold below is read at position p - 28 of the sorted correlations,
# which is only a valid index when X has more than 28 columns.
stopifnot(p > 28)
# Correlation of every column of X with fat, computed in a single call;
# as.vector() turns the resulting one-column matrix into a plain vector.
rho <- as.vector(cor(X, fat))

# Threshold = value at position p - 28 of the ascending sort.
# NOTE(review): positions p - 28 to p cover 29 values, so without ties or NAs
# 29 columns are kept - confirm whether 28 were intended.
rho_28 <- sort(rho)[p - 28]
keep <- which(rho >= rho_28)

# Overfitting =================================================================

# Split the selected columns and the fat values into train and test parts
# (test_size = 4, random seed fixed to 1), then unpack the returned list.
samples <- train_test_split(
  X[, keep],
  fat,
  test_size = 4,
  random_state = 1
)
X_train <- samples[["X_train"]]
X_test <- samples[["X_test"]]
y_train <- samples[["y_train"]]
y_test <- samples[["y_test"]]


# Fit a linear model ==========================================================


# Fit Ridge and Lasso regression ==============================================
# lambda values
alpha_values <- c(
  1e-6, 1e-5, 1e-4, 1e-3,
  1e-2, 1e-1, 1, 5
)