#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 29 13:13:23 2018

@author: valerie
"""
# ===================================================
# Prediction of a categorical variable
# ===================================================
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import urlopen

# Cancer relapse dataset
url = "https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_for_other_methods.gct"
X = np.loadtxt(urlopen(url), skiprows=3, usecols=range(2, 99))
X = X.T
names = np.genfromtxt(urlopen(url), dtype='str', usecols=1, skip_header=3)

url = "https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_class_labels.cls"
label = np.loadtxt(urlopen(url), skiprows=2)
y = label

## ============================================================================
## Import utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

def ROC(y_test, y_score, methodName=" ", plot=True):
    # Pool the B test folds and compute a single ROC curve and its AUC.
    ntest = np.size(y_test, 0)
    B = np.size(y_test, 1)
    fpr, tpr, _ = roc_curve(np.reshape(y_test, B*ntest), np.reshape(y_score, B*ntest))
    # if len(fpr) < 3:
    #     print("Problem: len(fpr) is lower than 3")
    #     return
    roc_auc = auc(fpr, tpr)
    if plot:
        lw = 2
        plt.plot(fpr, tpr, color='darkorange', lw=lw,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(methodName)
        plt.legend(loc="lower right")
        plt.show()
    return roc_auc

# Gene selection with best SNR ================================================
def genes_SNR(X, y):
    # - Inputs
    #   X = dataset (samples x genes)
    #   y = labels (a boolean or factor variable)
    # - Output
    #   SNR = signal-to-noise ratio of each gene
    labels = np.unique(y)
    n, p = X.shape
    K = len(labels)
    means = np.zeros((K, p))
    sd = np.std(X, axis=0)
    for k in range(K):
        means[k, ] = np.mean(X[y == labels[k], :], axis=0)
    SNR = np.max(np.reshape(np.abs(np.diff(means, axis=0)), (K-1, p)), axis=0)/sd
    return SNR
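
# genes_SNR is defined above but not applied in the experiments below.
# A minimal sketch of how it could preselect the most discriminative genes
# before fitting a classifier (n_keep is an arbitrary illustrative choice):
n_keep = 20
top_genes = np.argsort(genes_SNR(X, y))[::-1][:n_keep]  # indices of the n_keep highest-SNR genes
X_reduced = X[:, top_genes]                             # reduced design matrix (not reused below)
print("Top SNR genes:", [names[i] for i in top_genes])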

# Adaboost ====================================================================
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

B = 30
n_test = 9
y_score = np.zeros([n_test, B])
y_test_all = np.zeros([n_test, B])
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, label, test_size=n_test)
    # Standardize with the statistics of the training fold only
    mk = np.mean(X_train, axis=0)
    sk = np.maximum(np.std(X_train, axis=0), 10*np.finfo(float).eps)
    X_train, X_test = np.add(X_train, -mk), np.add(X_test, -mk)
    X_train, X_test = np.multiply(X_train, 1/sk), np.multiply(X_test, 1/sk)
    clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=20)
    clf.fit(X_train, y_train)
    y_score[:, b] = clf.predict_proba(X_test)[:, 1]
    y_test_all[:, b] = y_test

ROC(y_test_all, y_score, "AdaBoost")

importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking (20 first):")
for f in range(20):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the last fitted model
plt.figure(1)
plt.clf()
plt.title("Feature importances (20 first)")
plt.barh(range(20), importances[indices][:20], color="r", align="center")
plt.yticks(range(20), [names[i] for i in indices[:20]])
plt.show()

# Gradient Boosting ===========================================================
from sklearn.ensemble import GradientBoostingClassifier

B = 30
n_test = 9
y_score = np.zeros([n_test, B])
y_test_all = np.zeros([n_test, B])
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, label, test_size=n_test)
    mk = np.mean(X_train, axis=0)
    sk = np.maximum(np.std(X_train, axis=0), 10*np.finfo(float).eps)
    X_train, X_test = np.add(X_train, -mk), np.add(X_test, -mk)
    X_train, X_test = np.multiply(X_train, 1/sk), np.multiply(X_test, 1/sk)
    clf = GradientBoostingClassifier(loss='exponential', n_estimators=200, max_features=5)
    clf.fit(X_train, y_train)
    y_score[:, b] = clf.predict_proba(X_test)[:, 1]
    y_test_all[:, b] = y_test

ROC(y_test_all, y_score, "Gradient Boosting")

importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking (20 first):")
for f in range(20):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the last fitted model
plt.figure(1)
plt.clf()
plt.title("Feature importances (20 first)")
plt.barh(range(20), importances[indices][:20], color="r", align="center")
plt.yticks(range(20), [names[i] for i in indices[:20]])
plt.show()

# eXtreme Gradient Boosting ===================================================
import xgboost as xgb

B = 30
n_test = 9
y_pred_proba = []
ytest_xgb = []
ncol = X.shape[1]
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, label, test_size=n_test)
    ytest_xgb.append(y_test)
    mk = np.mean(X_train, axis=0)
    sk = np.maximum(np.std(X_train, axis=0), 10*np.finfo(float).eps)
    X_train, X_test = np.add(X_train, -mk), np.add(X_test, -mk)
    X_train, X_test = np.multiply(X_train, 1/sk), np.multiply(X_test, 1/sk)
    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dtest = xgb.DMatrix(data=X_test, label=y_test)
    # Native xgboost parameters; num_round below plays the role of n_estimators
    params = {'max_depth': 2, 'eta': 1, 'objective': 'reg:logistic',
              'alpha': .1, 'silent': 1}
    watchlist = [(dtest, 'eval')]
    num_round = 50
    bst = xgb.train(params, dtrain, num_round, watchlist, early_stopping_rounds=3)
    y_pred_proba.append(bst.predict(dtest))

ROC(np.array(ytest_xgb), np.array(y_pred_proba), "XGB")

## ============================================================================
print('XGBoost with grid search')
from sklearn.model_selection import GridSearchCV, ShuffleSplit

# play with these params
params = {
    'max_depth': [2, 5, 9],           # [3,4,5,6,7,8,9], 5 is good but takes too long in kaggle env
    'subsample': [.05, .1, .5],
    'colsample_bytree': [.5, .7],
    'n_estimators': [10, 100, 1000],  # [1000,2000,3000]
    'reg_alpha': [0.03, .1, .5, 1]    # [0.01, 0.02, 0.03, 0.04]
}
xgb_clf = xgb.XGBClassifier()
clf = GridSearchCV(xgb_clf, params,
                   cv=ShuffleSplit(test_size=.25, n_splits=30, random_state=1),
                   scoring="accuracy", refit=False, n_jobs=-1)
clf.fit(X, y)
print("Best parameters :", clf.best_params_)
print("Best accuracy :", np.round(clf.best_score_, 2))
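
# Since refit=False, the grid search above does not train a final model.
# A minimal sketch of how the selected configuration could be assessed with
# the same repeated hold-out ROC procedure used for the other boosting
# methods (reuses B, n_test and the standardization scheme defined above):
y_score = np.zeros([n_test, B])
y_test_all = np.zeros([n_test, B])
for b in range(B):
    X_train, X_test, y_train, y_test = train_test_split(X, label, test_size=n_test)
    mk = np.mean(X_train, axis=0)
    sk = np.maximum(np.std(X_train, axis=0), 10*np.finfo(float).eps)
    X_train, X_test = (X_train - mk)/sk, (X_test - mk)/sk
    best_xgb = xgb.XGBClassifier(**clf.best_params_)
    best_xgb.fit(X_train, y_train)
    y_score[:, b] = best_xgb.predict_proba(X_test)[:, 1]
    y_test_all[:, b] = y_test
ROC(y_test_all, y_score, "XGB (tuned)")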