#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 29 13:13:23 2018

@author: valerie
"""

import matplotlib.pyplot as plt
import numpy as np
from urllib.request import urlopen
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

# ===================================================
#  Prediction of a categorical variable
# ===================================================

# Breast cancer relapse dataset
url = "https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_for_other_methods.gct"
X = np.loadtxt(urlopen(url), skiprows=3, usecols=range(2, 99))
X = X.T
url = "https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_class_labels.cls"
label = np.loadtxt(urlopen(url), skiprows=2)
url = "https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_for_other_methods.gct"
names = np.genfromtxt(urlopen(url), usecols=0, skip_header=3, dtype='str')
y = label

# Plot of ROC curves ==========================================================
from sklearn.metrics import roc_curve, auc

def ROC(y_test, y_score, methodName=" ", plot=True):
    ntest = np.size(y_test, 0)
    if len(y_test.shape) > 1:
        B = np.size(y_test, 1)
    else:
        B = 1
    fpr, tpr, _ = roc_curve(np.reshape(y_test, B*ntest), np.reshape(y_score, B*ntest))
#    if len(fpr) < 3:
#        print("Problem: len(fpr) is lower than 3")
#        return
    roc_auc = auc(fpr, tpr)
    if plot:
        lw = 2
        plt.plot(fpr, tpr, color='darkorange', lw=lw,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(methodName)
        plt.legend(loc="lower right")
        plt.show()
    return(roc_auc)

# Find the genes with the best SNR ============================================
# (see the usage sketch at the end of the script)
def genes_SNR(X, y):
    # - Inputs
    #     X = dataset
    #     y = labels (a boolean or factor variable)
    # - Output
    #     SNR = signal-to-noise ratio of each gene
    labels = np.unique(y)
    n, p = X.shape
    K = len(labels)
    means = np.zeros((K, p))
    sd = np.std(X, axis=0)
    for k in range(K):
        means[k, ] = np.mean(X[y == labels[k], :], axis=0)
    SNR = np.max(np.reshape(np.abs(np.diff(means, axis=0)), (K-1, p)), axis=0)/sd
    return(SNR)

# Bagging =====================================================================
n_est_list = [20, 50, 100, 500]  # number of trees
B = 30                           # number of samples for the cross validation
err_bag = np.zeros((B, len(n_est_list)))
prob_bag = []
ytest_bag = []
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)
    ytest_bag = np.concatenate((ytest_bag, np.ravel(y_test)))
    #SNR = genes_SNR(X_train, y_train)
    #keep = np.argsort(SNR)[-50:]
    # Standardize using the training statistics only
    mk = np.mean(X_train, axis=0)
    sk = np.maximum(np.std(X_train, axis=0), 10*np.finfo(float).eps)
    X_train, X_test = (X_train - mk)/sk, (X_test - mk)/sk
    prob_tmp = []
    for i, n_est in enumerate(n_est_list):
        clf = BaggingClassifier(n_estimators=n_est, n_jobs=-1, max_features=20,
                                oob_score=True)  # n_jobs=-1: parallel computing
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        err_bag[b, i] = np.mean(y_pred != y_test)
        prob_tmp = np.concatenate((prob_tmp, clf.predict_proba(X_test)[:, 1]))
    # prob_tmp holds len(n_est_list) blocks of len(y_test) probabilities, so
    # reshape to (settings, samples) and transpose in both branches
    if b > 0:
        prob_bag = np.concatenate((prob_bag, np.reshape(prob_tmp, (len(n_est_list), len(y_test))).T), axis=0)
    else:
        prob_bag = np.reshape(prob_tmp, (len(n_est_list), len(y_test))).T
err_bag_all = np.mean(err_bag, axis=0)
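# --- Aside (not in the original script): out-of-bag error -------------------
# The bagging classifier above is built with oob_score=True, but the OOB score
# is never read. A minimal sketch, assuming one fit per n_estimators value on
# the standardized full dataset is acceptable: the OOB accuracy is computed on
# the samples left out of each bootstrap, so it estimates the test error
# without the B train/test splits used above.
Xs = (X - np.mean(X, axis=0)) / np.maximum(np.std(X, axis=0), 10*np.finfo(float).eps)
for n_est in n_est_list:
    clf_oob = BaggingClassifier(n_estimators=n_est, max_features=20,
                                oob_score=True, n_jobs=-1)
    clf_oob.fit(Xs, y)
    print("n_estimators = %3d ; OOB error = %.3f" % (n_est, 1 - clf_oob.oob_score_))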
plt.figure(1)
plt.subplot(1, 2, 1)
plt.plot(n_est_list, err_bag_all, 'ko')
plt.ylabel('Mean classification error')
plt.xlabel('Nb of trees')
plt.grid()
plt.subplot(1, 2, 2)
k0 = np.argmin(err_bag_all)
ROC(ytest_bag, prob_bag[:, k0], methodName="Bagging", plot=True)

# Random Forest ===============================================================
n_est_list = [20, 50, 100, 500]  # number of trees
B = 30                           # number of samples for the cross validation
err_rf = np.zeros((B, len(n_est_list)))
prob_rf = []
ytest_rf = []
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)
    ytest_rf = np.concatenate((ytest_rf, np.ravel(y_test)))
    # Standardize using the training statistics only
    mk = np.mean(X_train, axis=0)
    sk = np.maximum(np.std(X_train, axis=0), 10*np.finfo(float).eps)
    X_train, X_test = (X_train - mk)/sk, (X_test - mk)/sk
    prob_tmp = []
    for i, n_est in enumerate(n_est_list):
        clf = RandomForestClassifier(n_estimators=n_est, n_jobs=-1,
                                     max_features=20)  # n_jobs=-1: parallel computing
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        err_rf[b, i] = np.mean(y_pred != y_test)
        prob_tmp = np.concatenate((prob_tmp, clf.predict_proba(X_test)[:, 1]))
    # Same layout as prob_bag: rows = test samples, columns = n_estimators settings
    if b > 0:
        prob_rf = np.concatenate((prob_rf, np.reshape(prob_tmp, (len(n_est_list), len(y_test))).T), axis=0)
    else:
        prob_rf = np.reshape(prob_tmp, (len(n_est_list), len(y_test))).T
err_rf_all = np.mean(err_rf, axis=0)

plt.figure(1)
plt.subplot(1, 2, 1)
plt.plot(n_est_list, err_rf_all, 'ko')
plt.ylabel('Mean classification error')
plt.xlabel('Nb of trees')
plt.grid()
plt.subplot(1, 2, 2)
k0 = np.argmin(err_rf_all)
ROC(ytest_rf, prob_rf[:, k0], methodName="Random Forest", plot=True)

# Variable importance (last forest)
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

print("Feature ranking (first 20):")
for f in range(20):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the last forest
plt.figure(2)
plt.clf()
plt.title("Feature importances (first 20)")
plt.barh(range(20), importances[indices][:20], color="r", align="center")
plt.yticks(range(20), [names[i] for i in indices[:20]])
plt.show()

# =============================================================================
# Another way to select the hyperparameters
#  - more efficient from a computation time point of view
# =============================================================================
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Note: in scikit-learn >= 1.2 the base estimator argument of BaggingClassifier
# is named 'estimator', so the keys below become 'estimator__max_leaf_nodes', etc.
params = {'n_estimators': [40, 42],
          'base_estimator__max_leaf_nodes': [10, 15],
          'base_estimator__max_depth': [4, 5, 6]}
clf = GridSearchCV(BaggingClassifier(DecisionTreeClassifier(),
                                     n_estimators=100,  # overridden by the grid
                                     max_features=20),
                   param_grid=params,
                   cv=ShuffleSplit(test_size=.25, n_splits=30, random_state=1),
                   scoring="roc_auc", refit=False, n_jobs=-1)
clf.fit(X, y)
print("Best parameters :", clf.best_params_)
print("Best score :", np.round(clf.best_score_, 2))
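# --- Aside (not in the original script): re-scoring the selected model ------
# cross_val_score is imported above but never used. A minimal sketch, assuming
# we rebuild the bagging classifier by hand from clf.best_params_ (the key
# names match the param_grid above) and score it on fresh random splits:
best = clf.best_params_
tuned = BaggingClassifier(DecisionTreeClassifier(
                              max_leaf_nodes=best['base_estimator__max_leaf_nodes'],
                              max_depth=best['base_estimator__max_depth']),
                          n_estimators=best['n_estimators'], max_features=20)
scores = cross_val_score(tuned, X, y,
                         cv=ShuffleSplit(test_size=.25, n_splits=30, random_state=2),
                         scoring="roc_auc", n_jobs=-1)
print("AUC of the tuned bagging model: %.2f +/- %.2f" % (scores.mean(), scores.std()))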
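# --- Aside (not in the original script): using genes_SNR --------------------
# genes_SNR is defined above but its call is commented out in the bagging
# loop. A minimal sketch, assuming we keep the 50 genes with the highest SNR
# (50 is an arbitrary illustration, not a tuned value). Note that selecting
# the genes on the full dataset before cross validation is optimistic; it is
# done here only to keep the sketch short.
SNR = genes_SNR(X, y)
keep = np.argsort(SNR)[-50:]   # indices of the 50 genes with the largest SNR
print("Top 5 genes by SNR:", [names[i] for i in keep[-5:]])
scores_snr = cross_val_score(RandomForestClassifier(n_estimators=100, n_jobs=-1),
                             X[:, keep], y,
                             cv=ShuffleSplit(test_size=.25, n_splits=30, random_state=3),
                             scoring="roc_auc", n_jobs=-1)
print("AUC with the 50 best-SNR genes: %.2f" % scores_snr.mean())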