#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 30 15:55:35 2018

@author: valerie
"""

# ===================================================
#      Prediction of a categorical variable
# ===================================================
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import urlopen
# Cancer Relaps dataset
url = "https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_for_other_methods.gct"
X = np.loadtxt(urlopen(url),skiprows=3,usecols=range(2,99))
X = X.T
names = np.genfromtxt(urlopen(url),dtype='str',usecols=1,skip_header=3)
url = "https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_class_labels.cls"
label = np.loadtxt(urlopen(url),skiprows=2)

y=label
## ============================================================================
## Import utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

def ROC(y_test,y_score,methodName=" ",plot=True):

    ntest = np.size(y_test,0)
    B = np.size(y_test,1)
    fpr, tpr, _ = roc_curve(np.reshape(y_test,B*ntest), np.reshape(y_score,B*ntest))
#    if len(fpr)<3:
#        print("Problem: len(fpr) is lower than 3")
#        return
    roc_auc = auc(fpr, tpr)

    if plot:
        lw = 2
        plt.plot(fpr, tpr, color='darkorange',
            lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(methodName)
        plt.legend(loc="lower right")
        plt.show()
    return(roc_auc)

def genes_SNR(X,y):
  # - Inputs
  # X = dataset
  # y = labels (a boolean or factor variable)
  # - Output
  # SNR
  labels = np.unique(y)
  n,p = X.shape
  K = len(labels)
  
  means = np.zeros((K,p))
  sd = np.std(X,axis=0)
  
  for k in range(K):
      means[k,] = np.mean(X[y==labels[k],:],axis=0)
  SNR = np.max(np.reshape(np.abs(np.diff(means,axis=0)),(K-1,p)),axis=0)/sd
  return(SNR)


## ============================================================================
## Neural Network
from sklearn.neural_network import MLPClassifier

B = 30
n_test = 9
y_score = np.zeros([n_test,B]) 
y_test_all = np.zeros([n_test,B]) 
for b in range(B): 
    X_train, X_test, y_train, y_test = train_test_split(X,label,test_size=n_test)
    SNR = genes_SNR(X_train,y_train)
    keep = np.argsort(SNR)[-50:] 
    mk=np.mean(X_train,axis=0)
    sk=np.maximum(np.std(X_train,axis=0),10*np.finfo(float).eps)
    X_train, X_test = np.add(X_train,-mk), np.add(X_test,-mk)
    X_train, X_test = np.multiply(X_train,1/sk),np.multiply(X_test,1/sk)
    clf = MLPClassifier(solver='lbfgs', alpha=0,hidden_layer_sizes=3) # 3 neurones
    clf.fit(X_train[:,keep],y_train)
    y_score[:,b] = clf.predict_proba(X_test[:,keep])[:,1]
    y_test_all[:,b] = y_test
              
ROC(y_test_all,y_score,"MLP,3")

# =============================================================================
# Another way to do the selection of the hyper parameters
# - more efficient from a computation time point of vue
# =============================================================================
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV, cross_val_score


SNR = genes_SNR(X,y)
keep = np.argsort(SNR)[-150:] 
    
    
params = {'hidden_layer_sizes': [3,4,5,10], 
          'alpha': [.05,.1,1], 
          'learning_rate' : ['constant', 'invscaling', 'adaptive'],
          'solver' : ['lbfgs','sgd']}

clf_ann = GridSearchCV(MLPClassifier(max_iter=1000),
                   param_grid=params, 
                   cv=ShuffleSplit(test_size=.25,
                                   n_splits=30, random_state=1), 
                   scoring = "accuracy", 
                   refit=False,n_jobs=-1)
clf_ann.fit(X[:,keep], y.ravel())

print("Best parameters :",clf_ann.best_params_)
print("Best score :",np.round(clf_ann.best_score_,2))

B = 30
n_test = 9
y_score = np.zeros([n_test,B]) 
y_test_all = np.zeros([n_test,B]) 
for b in range(B): 
    X_train, X_test, y_train, y_test = train_test_split(X,label,test_size=n_test)
    SNR = genes_SNR(X_train,y_train)
    keep = np.argsort(SNR)[-150:] 
    mk=np.mean(X_train,axis=0)
    sk=np.maximum(np.std(X_train,axis=0),10*np.finfo(float).eps)
    X_train, X_test = np.add(X_train,-mk), np.add(X_test,-mk)
    X_train, X_test = np.multiply(X_train,1/sk),np.multiply(X_test,1/sk)
    clf = MLPClassifier(alpha=.05,hidden_layer_sizes=10,
                        learning_rate='adaptive',solver='sgd',
                        max_iter=1000) # 3 neurones
    clf.fit(X_train,y_train)
    y_score[:,b] = clf.predict_proba(X_test)[:,1]
    y_test_all[:,b] = y_test
              
ROC(y_test_all,y_score,"MLP")