#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 13 16:58:58 2017

@author: valerie
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # installation issue, to be checked

# Golub ALL/AML gene expression training data (38 samples, 7129 genes)
gene = pd.read_table('https://perso.univ-rennes1.fr/valerie.monbet/DATA/data_set_ALL_AML_train.csv')
# R equivalent: X = t(gene[,seq(3,ncol(gene),by=2)])
ncol = gene.shape[1]
X = gene[gene.columns[np.arange(2, ncol, 2)]]  # keep the expression-value columns
X = X.T                                        # rows = samples, columns = genes
ncol = X.shape[1]
nrow = X.shape[0]
labels = np.array([1, 2])                      # 1 = ALL (27 samples), 2 = AML (11 samples)
Y = np.repeat(labels, [27, 11], axis=0)

plt.pcolor(X[np.arange(100)])                  # heatmap of the first 100 genes
plt.colorbar()
plt.title('Example of gene expressions')
plt.ylabel('samples')
plt.xlabel('genes')
plt.show()

## ============================================================================
## Dimension reduction

# PCA plots
from sklearn.decomposition import PCA
pca = PCA(n_components=2)  # computed on all genes
pca.fit(X)
plt.plot(pca.explained_variance_ratio_)
plt.ylabel("Explained variance")
plt.xlabel("Number of components")
plt.show()
Coord = pca.fit_transform(X)
plt.scatter(Coord[:, 0], Coord[:, 1], c=Y)  # color by class label
plt.show()

# Keep only genes with high correlation with Y
rho = np.zeros(ncol)
for k in range(ncol):
    c = np.corrcoef(X[k], Y)
    rho[k] = c[0, 1]
w = np.nonzero(abs(rho) > .5)[0]
print(len(w))
pca = PCA(n_components=2)  # computed on the selected genes only
pca.fit(X[w])
Coord = pca.fit_transform(X[w])
plt.scatter(Coord[:, 0], Coord[:, 1], c=Y)
plt.show()

# t-SNE plots
from sklearn import manifold
tsne = manifold.TSNE(n_components=2, init='pca', random_state=0, perplexity=50,
                     verbose=1, n_iter=1500)
X_tsne = tsne.fit_transform(X)
# plot the result
vis_x = X_tsne[:, 0]
vis_y = X_tsne[:, 1]
plt.scatter(vis_x, vis_y, c=Y)
plt.show()

## ============================================================================
## Import utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

def ROC(y_test, y_score, methodName=" ", plot=True):
    """Pool the B test folds and compute (optionally plot) the ROC curve and its AUC."""
    ntest = np.size(y_test, 0)
    B = np.size(y_test, 1)
    # labels are in {1, 2}: subtract 1 to get {0, 1} as expected by roc_curve
    fpr, tpr, _ = roc_curve(np.reshape(y_test - 1, B * ntest),
                            np.reshape(y_score, B * ntest))
    roc_auc = auc(fpr, tpr)
    if plot:
        lw = 2
        plt.plot(fpr, tpr, color='darkorange', lw=lw,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(methodName)
        plt.legend(loc="lower right")
        plt.show()
    return roc_auc
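# A quick sanity check of the ROC helper on synthetic data (not part of the
# original analysis): it assumes y_test holds labels in {1, 2} with shape
# [ntest, B] and y_score holds class-2 scores of the same shape, so a score
# correlated with the label should give an AUC well above 0.5.
_rng = np.random.RandomState(0)
_y_test = _rng.choice([1, 2], size=[10, 5])                        # fake labels, 5 folds of 10
_y_score = (_y_test - 1) * 0.6 + _rng.uniform(size=[10, 5]) * 0.4  # informative scores
print("synthetic AUC:", ROC(_y_test, _y_score, "synthetic check", plot=False))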
## ============================================================================
## knn
from sklearn import neighbors

n = X.shape[0]  # number of samples
Xw = X[w]
B = 100
n_neighbors = np.arange(1, 7)
ErrClassif = np.zeros([len(n_neighbors), B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=0.25)
    Xtrain = Xw.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]  # itrain is a list, convert it for NumPy indexing
    ytest = Y[np.asarray(itest)]
    for i in n_neighbors:
        clf = neighbors.KNeighborsClassifier(i)
        clf.fit(Xtrain, ytrain)
        yhat = clf.predict(Xw.iloc[itest])
        ErrClassif[i - 1, b] = np.mean(ytest != yhat)
plt.boxplot(ErrClassif.T, labels=n_neighbors)
plt.ylim(-0.1, 1)
plt.ylabel('Mean classification error')
plt.xlabel('number of neighbors')
plt.show()

# Best result for 1 neighbor
ibest = 1
ntest = 10  # because len(itest) = 10 when test_size = 0.25
y_score = np.zeros([ntest, B])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=0.25)
    Xtrain = Xw.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    clf = neighbors.KNeighborsClassifier(ibest)
    clf.fit(Xtrain, ytrain)
    y_score[:, b] = clf.predict_proba(Xw.iloc[itest])[:, 1]
ROC(y_test, y_score, "kNN, 1 neighbor")

## ============================================================================
## trees
from sklearn import tree

n = X.shape[0]
Xw = X[w]
B = 100
ErrClassif_tree = np.zeros([B])
ntest = 10  # because len(itest) = 10 when test_size = 0.25
y_score = np.zeros([ntest, B])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=0.25)
    Xtrain = Xw.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    clf = tree.DecisionTreeClassifier()
    clf.fit(Xtrain, ytrain)
    yhat = clf.predict(Xw.iloc[itest])
    ErrClassif_tree[b] = np.mean(ytest != yhat)
    y_score[:, b] = clf.predict_proba(Xw.iloc[itest])[:, 1]
# ROC
ROC(y_test, y_score, "Tree")

## ============================================================================
## Linear Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

n = X.shape[0]
Xw = X[w]
B = 100
ntest = 10  # because len(itest) = 10 when test_size = 0.25
y_score = np.zeros([ntest, B])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=0.25)
    Xtrain = Xw.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    lda = LinearDiscriminantAnalysis()
    lda.fit(Xtrain, ytrain)
    y_score[:, b] = lda.predict_proba(Xw.iloc[itest])[:, 1]
# ROC
ROC(y_test, y_score, "LDA")
# for sparse lda look at https://github.com/ankazhao/python-sparselda
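# Before the manual grid over C below, note that scikit-learn can also select
# the regularization constant internally. This is a minimal sketch (not part of
# the original analysis) using LogisticRegressionCV with an l1 penalty on the
# preselected genes Xw; the 'liblinear' solver is used because it supports l1.
from sklearn.linear_model import LogisticRegressionCV
lr_cv = LogisticRegressionCV(Cs=(100, 50, 10, 1), penalty='l1',
                             solver='liblinear', cv=5)
lr_cv.fit(Xw, Y)
print("C selected by internal cross-validation:", lr_cv.C_[0])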
## ============================================================================
## Logistic regression with Lasso penalty
from sklearn.linear_model import LogisticRegression

n = X.shape[0]
Xw = X[w]
B = 100
C_grid = (100, 50, 10, 1)  # grid for the regularization constant
test_size = .1
ntest = round(.1 * n)  # len(itest) = 4 when test_size = .1
y_score_LR = np.zeros([ntest, B])
y_score = np.zeros([ntest, B, len(C_grid)])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=test_size)
    Xtrain = Xw.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    clf_LR = LogisticRegression()
    clf_LR.fit(Xtrain, ytrain)
    y_score_LR[:, b] = clf_LR.predict_proba(Xw.iloc[itest])[:, 1]
    for i, C in enumerate(C_grid):
        # liblinear supports the l1 penalty
        clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.001, solver='liblinear')
        clf_l1_LR.fit(Xtrain, ytrain)
        y_score[:, b, i] = clf_l1_LR.predict_proba(Xw.iloc[itest])[:, 1]
        # If we want to evaluate the sparsity
        coef_l1_LR = clf_l1_LR.coef_.ravel()
        nb_z = np.mean(coef_l1_LR == 0) * 100
        print("{0:.2f}% of coefficients are zeros for C equal to {1:.2f}".format(nb_z, C))

auc_tab = np.zeros(len(C_grid))
for i in range(len(C_grid)):
    tmp = np.reshape(y_score[:, :, i], [ntest, B])
    auc_tab[i] = ROC(y_test, tmp, plot=False)
print(auc_tab)
ROC(y_test, y_score_LR, "LR")

## ============================================================================
## Neural Network
from sklearn.neural_network import MLPClassifier

n = X.shape[0]
B = 30
layer_grid = ([2], [3], [4], [5], [2, 1], [3, 1], [3, 2])  # different architectures
alpha_grid = (1e-3, 1e-2, 1e-1, 1, 10)                     # regularization strengths
test_size = .1
ntest = round(.1 * n)  # size of the test sample
y_score = np.zeros([ntest, B, len(layer_grid) * len(alpha_grid)])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=test_size)
    Xtrain = X.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    # Correlated genes have to be selected here, inside the cross-validation
    # loop, and not outside it (otherwise the test samples leak into the selection).
    rho = np.zeros(ncol)
    for k in range(ncol):
        c = np.corrcoef(Xtrain[k], ytrain)
        rho[k] = c[0, 1]
    w = np.nonzero(abs(rho) > .5)[0]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    cnt = 0
    for i, layers in enumerate(layer_grid):
        for j, alpha in enumerate(alpha_grid):
            clf = MLPClassifier(solver='lbfgs', alpha=alpha, hidden_layer_sizes=layers)
            clf.fit(Xtrain[w], ytrain)
            y_score[:, b, cnt] = clf.predict_proba(X[w].iloc[itest])[:, 1]
            cnt += 1

auc_tab = np.zeros(len(layer_grid) * len(alpha_grid))
for i in range(cnt):
    tmp = np.reshape(y_score[:, :, i], [ntest, B])
    auc_tab[i] = ROC(y_test, tmp, plot=False)
print(auc_tab)
# print the architecture and alpha with the best AUC
cnt = 0
for i, layers in enumerate(layer_grid):
    for j, alpha in enumerate(alpha_grid):
        if np.where(auc_tab == max(auc_tab))[0][0] == cnt:
            print(cnt, layers, alpha)
        cnt += 1
tmp = np.reshape(y_score[:, :, np.where(auc_tab == max(auc_tab))[0][0]], [ntest, B])
ROC(y_test, tmp, "ANN")

## ============================================================================
## Random forest
from sklearn.ensemble import RandomForestClassifier

n = X.shape[0]
B = 30
test_size = .1
ntest = round(.1 * n)  # size of the test sample
y_score_RF = np.zeros([ntest, B])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=test_size)
    Xtrain = X.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    # Correlated genes are again selected inside the cross-validation loop,
    # not outside it.
    rho = np.zeros(ncol)
    for k in range(ncol):
        c = np.corrcoef(Xtrain[k], ytrain)
        rho[k] = c[0, 1]
    w = np.nonzero(abs(rho) > .5)[0]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    clf = RandomForestClassifier(max_depth=2)  # default number of trees is 10
    clf.fit(Xtrain[w], ytrain)
    y_score_RF[:, b] = clf.predict_proba(X[w].iloc[itest])[:, 1]
ROC(y_test, y_score_RF, "RF")
var_imp = clf.feature_importances_  # importances of the last fitted forest
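# A minimal sketch (not in the original script) to inspect var_imp: plot the
# importances of the genes selected in the last fold, sorted in decreasing
# order, to see how many genes actually drive the forest.
k = min(20, len(var_imp))
order = np.argsort(var_imp)[::-1][:k]
plt.bar(range(k), var_imp[order])
plt.xticks(range(k), w[order], rotation=90)  # gene indices from the last selection w
plt.ylabel('Importance')
plt.xlabel('gene index')
plt.title('RF: most important genes (last fold)')
plt.show()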
## ============================================================================
## Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier

# if no preselection of genes is done
n = X.shape[0]
B = 30
test_size = .1
ntest = round(.1 * n)  # size of the test sample
y_score_GB = np.zeros([ntest, B])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=test_size)
    Xtrain = X.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    clf = GradientBoostingClassifier(loss='exponential', n_estimators=200, max_features=15)
    clf.fit(Xtrain, ytrain)
    y_score_GB[:, b] = clf.predict_proba(X.iloc[itest])[:, 1]
ROC(y_test, y_score_GB, "GB")

# with preselection
n = X.shape[0]
B = 30
test_size = .1
ntest = round(.1 * n)  # size of the test sample
y_score_GB = np.zeros([ntest, B])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=test_size)
    Xtrain = X.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    # correlated genes are selected here, inside the cross-validation loop
    rho = np.zeros(ncol)
    for k in range(ncol):
        c = np.corrcoef(Xtrain[k], ytrain)
        rho[k] = c[0, 1]
    w = np.nonzero(abs(rho) > .5)[0]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    clf = GradientBoostingClassifier(loss='exponential', n_estimators=200, max_features=15)
    clf.fit(Xtrain[w], ytrain)
    y_score_GB[:, b] = clf.predict_proba(X[w].iloc[itest])[:, 1]
ROC(y_test, y_score_GB, "GB, with preselection")

## ============================================================================
## SVM
# http://www2.stat.duke.edu/~sayan/webPub/svmmicro.pdf
from sklearn.svm import SVC

# First fit an SVM with an rbf kernel
n = X.shape[0]
B = 30
test_size = .1
ntest = round(.1 * n)  # size of the test sample
C_grid = (1e-1, 1, 1e1, 1e2, 1e3, 3e3)          # different values for the margin parameter
gamma_grid = (1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1)  # variance of the rbf kernel
y_score_svc = np.zeros([ntest, B, len(C_grid) * len(gamma_grid)])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=test_size)
    Xtrain = X.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    # Keep the genes whose signal-to-noise ratio is "large".
    # SNR: (mean_class_1 - mean_class_2) / std_all
    Xt1 = Xtrain.iloc[np.where(ytrain == 1)[0]]
    Xt2 = Xtrain.iloc[np.where(ytrain == 2)[0]]
    snr = (Xt1.apply(np.mean) - Xt2.apply(np.mean)) / Xtrain.apply(np.std)
    w = np.nonzero(abs(snr) > 1)[0]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    cnt = 0
    for i, C in enumerate(C_grid):
        for j, gamma in enumerate(gamma_grid):
            clf = SVC(C=C, gamma=gamma, probability=True)
            clf.fit(Xtrain[w], ytrain)
            # probability of class 2, consistent with the other sections
            y_score_svc[:, b, cnt] = clf.predict_proba(X[w].iloc[itest])[:, 1]
            cnt += 1
# (the same loop can be run without preselection by fitting on Xtrain directly)

auc_tab = np.zeros(len(C_grid) * len(gamma_grid))
for i in range(cnt):
    tmp = np.reshape(y_score_svc[:, :, i], [ntest, B])
    auc_tab[i] = ROC(y_test, tmp, plot=False)
print(auc_tab)
# print the (C, gamma) pair with the best AUC
cnt = 0
for i, C in enumerate(C_grid):
    for j, gamma in enumerate(gamma_grid):
        if np.where(auc_tab == max(auc_tab))[0][0] == cnt:
            print(cnt, C, gamma)
        cnt += 1
tmp = np.reshape(y_score_svc[:, :, np.where(auc_tab == max(auc_tab))[0][0]], [ntest, B])
ROC(y_test, tmp, "SVC-rbf")
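# The double loop over (C, gamma) above can also be written with scikit-learn's
# GridSearchCV. This is a minimal sketch (not part of the original script),
# shown on the genes selected in the last fold for brevity.
from sklearn.model_selection import GridSearchCV
param_grid = {'C': C_grid, 'gamma': gamma_grid}
search = GridSearchCV(SVC(probability=True), param_grid, scoring='roc_auc', cv=5)
search.fit(X[w], Y)
print("best (C, gamma):", search.best_params_, "CV AUC:", search.best_score_)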
# Now fit an SVM with a linear kernel
n = X.shape[0]
B = 30
test_size = .1
ntest = round(.1 * n)  # size of the test sample
C_grid = (1e-2, 1e-1, 1, 1e1, 1e2, 1e3)  # different values for the margin parameter
y_score_svc = np.zeros([ntest, B, len(C_grid)])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=test_size)
    Xtrain = X.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    # The genes are again selected inside the cross-validation loop,
    # here with the signal-to-noise ratio.
    Xt1 = Xtrain.iloc[np.where(ytrain == 1)[0]]
    Xt2 = Xtrain.iloc[np.where(ytrain == 2)[0]]
    snr = (Xt1.apply(np.mean) - Xt2.apply(np.mean)) / Xtrain.apply(np.std)
    w = np.nonzero(abs(snr) > .5)[0]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    cnt = 0
    for i, C in enumerate(C_grid):
        clf = SVC(kernel='linear', C=C, probability=True)
        clf.fit(Xtrain[w], ytrain)
        # probability of class 2, consistent with the other sections
        y_score_svc[:, b, cnt] = clf.predict_proba(X[w].iloc[itest])[:, 1]
        cnt += 1
# (the same loop can be run without preselection by fitting on Xtrain directly)

auc_tab = np.zeros(len(C_grid))
for i in range(cnt):
    tmp = np.reshape(y_score_svc[:, :, i], [ntest, B])
    auc_tab[i] = ROC(y_test, tmp, plot=False)
print(auc_tab)
# print the value of C with the best AUC
cnt = 0
for i, C in enumerate(C_grid):
    if np.where(auc_tab == max(auc_tab))[0][0] == cnt:
        print(cnt, C)
    cnt += 1
tmp = np.reshape(y_score_svc[:, :, np.where(auc_tab == max(auc_tab))[0][0]], [ntest, B])
ROC(y_test, tmp, "SVC-linear")
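# The script repeatedly stresses that gene selection must happen inside the
# cross-validation loop. A compact way to enforce this (a sketch, not part of
# the original analysis) is a scikit-learn Pipeline whose selection step is
# refitted on each training fold; SelectKBest with the f_classif score is used
# here as a stand-in for the correlation/SNR filters above.
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import cross_val_score

pipe = Pipeline([('select', SelectKBest(f_classif, k=50)),
                 ('svc', SVC(kernel='linear', C=1))])
scores = cross_val_score(pipe, X, Y, scoring='roc_auc', cv=5)
print("leak-free CV AUC: %.3f +/- %.3f" % (scores.mean(), scores.std()))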