#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 13 16:58:58 2017

@author: valerie
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # installation issue, to be checked

# Golub ALL/AML gene expression training data (38 samples, 7129 genes)
gene = pd.read_table('https://perso.univ-rennes1.fr/valerie.monbet/DATA/data_set_ALL_AML_train.csv')
# R equivalent: X = t(gene[,seq(3,ncol(gene),by=2)])
ncol = gene.shape[1]
X = gene[gene.columns[np.arange(2, ncol, 2)]]  # keep the expression-value columns
X = X.T                                        # rows = samples, columns = genes
ncol = X.shape[1]
nrow = X.shape[0]
labels = np.array([1, 2])                      # 1 = ALL (27 samples), 2 = AML (11 samples)
Y = np.repeat(labels, [27, 11], axis=0)

plt.pcolor(X[np.arange(100)])                  # heatmap of the first 100 genes
plt.colorbar()
plt.title('Example of gene expressions')
plt.ylabel('samples')
plt.xlabel('genes')
plt.show()

## ============================================================================
## Dimension reduction

# PCA plots
from sklearn.decomposition import PCA
pca = PCA(n_components=2)  # computed on all genes
pca.fit(X)
plt.plot(pca.explained_variance_ratio_)
plt.ylabel("Explained variance")
plt.xlabel("Number of components")
plt.show()
Coord = pca.fit_transform(X)
plt.scatter(Coord[:, 0], Coord[:, 1], c=Y)  # color by class label
plt.show()

# Keep only genes with high correlation with Y
rho = np.zeros(ncol)
for k in range(ncol):
    c = np.corrcoef(X[k], Y)
    rho[k] = c[0, 1]
w = np.nonzero(abs(rho) > .5)[0]
print(len(w))
pca = PCA(n_components=2)  # computed on the selected genes only
pca.fit(X[w])
Coord = pca.fit_transform(X[w])
plt.scatter(Coord[:, 0], Coord[:, 1], c=Y)
plt.show()

# t-SNE plots
from sklearn import manifold
tsne = manifold.TSNE(n_components=2, init='pca', random_state=0, perplexity=50,
                     verbose=1, n_iter=1500)
X_tsne = tsne.fit_transform(X)
# plot the result
vis_x = X_tsne[:, 0]
vis_y = X_tsne[:, 1]
plt.scatter(vis_x, vis_y, c=Y)
plt.show()

## ============================================================================
## Import utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

def ROC(y_test, y_score, methodName=" ", plot=True):
    """Pool the B test folds and compute (optionally plot) the ROC curve and its AUC."""
    ntest = np.size(y_test, 0)
    B = np.size(y_test, 1)
    # labels are in {1, 2}: subtract 1 to get {0, 1} as expected by roc_curve
    fpr, tpr, _ = roc_curve(np.reshape(y_test - 1, B * ntest),
                            np.reshape(y_score, B * ntest))
    roc_auc = auc(fpr, tpr)
    if plot:
        lw = 2
        plt.plot(fpr, tpr, color='darkorange', lw=lw,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(methodName)
        plt.legend(loc="lower right")
        plt.show()
    return roc_auc
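# A quick sanity check of the ROC helper on synthetic data (not part of the
# original analysis): it assumes y_test holds labels in {1, 2} with shape
# [ntest, B] and y_score holds class-2 scores of the same shape, so a score
# correlated with the label should give an AUC well above 0.5.
_rng = np.random.RandomState(0)
_y_test = _rng.choice([1, 2], size=[10, 5])                        # fake labels, 5 folds of 10
_y_score = (_y_test - 1) * 0.6 + _rng.uniform(size=[10, 5]) * 0.4  # informative scores
print("synthetic AUC:", ROC(_y_test, _y_score, "synthetic check", plot=False))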
## ============================================================================
## knn
from sklearn import neighbors

n = X.shape[0]  # number of samples
Xw = X[w]
B = 100
n_neighbors = np.arange(1, 7)
ErrClassif = np.zeros([len(n_neighbors), B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=0.25)
    Xtrain = Xw.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]  # itrain is a list, convert it for NumPy indexing
    ytest = Y[np.asarray(itest)]
    for i in n_neighbors:
        clf = neighbors.KNeighborsClassifier(i)
        clf.fit(Xtrain, ytrain)
        yhat = clf.predict(Xw.iloc[itest])
        ErrClassif[i - 1, b] = np.mean(ytest != yhat)
plt.boxplot(ErrClassif.T, labels=n_neighbors)
plt.ylim(-0.1, 1)
plt.ylabel('Mean classification error')
plt.xlabel('number of neighbors')
plt.show()

# Best result for 1 neighbor
ibest = 1
ntest = 10  # because len(itest) = 10 when test_size = 0.25
y_score = np.zeros([ntest, B])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=0.25)
    Xtrain = Xw.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    clf = neighbors.KNeighborsClassifier(ibest)
    clf.fit(Xtrain, ytrain)
    y_score[:, b] = clf.predict_proba(Xw.iloc[itest])[:, 1]
ROC(y_test, y_score, "kNN, 1 neighbor")

## ============================================================================
## trees
from sklearn import tree

n = X.shape[0]
Xw = X[w]
B = 100
ErrClassif_tree = np.zeros([B])
ntest = 10  # because len(itest) = 10 when test_size = 0.25
y_score = np.zeros([ntest, B])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=0.25)
    Xtrain = Xw.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    clf = tree.DecisionTreeClassifier()
    clf.fit(Xtrain, ytrain)
    yhat = clf.predict(Xw.iloc[itest])
    ErrClassif_tree[b] = np.mean(ytest != yhat)
    y_score[:, b] = clf.predict_proba(Xw.iloc[itest])[:, 1]
# ROC
ROC(y_test, y_score, "Tree")

## ============================================================================
## Linear Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

n = X.shape[0]
Xw = X[w]
B = 100
ntest = 10  # because len(itest) = 10 when test_size = 0.25
y_score = np.zeros([ntest, B])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=0.25)
    Xtrain = Xw.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    lda = LinearDiscriminantAnalysis()
    lda.fit(Xtrain, ytrain)
    y_score[:, b] = lda.predict_proba(Xw.iloc[itest])[:, 1]
# ROC
ROC(y_test, y_score, "LDA")
# for sparse lda look at https://github.com/ankazhao/python-sparselda
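# Before the manual grid over C below, note that scikit-learn can also select
# the regularization constant internally. This is a minimal sketch (not part of
# the original analysis) using LogisticRegressionCV with an l1 penalty on the
# preselected genes Xw; the 'liblinear' solver is used because it supports l1.
from sklearn.linear_model import LogisticRegressionCV
lr_cv = LogisticRegressionCV(Cs=(100, 50, 10, 1), penalty='l1',
                             solver='liblinear', cv=5)
lr_cv.fit(Xw, Y)
print("C selected by internal cross-validation:", lr_cv.C_[0])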
## ============================================================================
## Logistic regression with Lasso penalty
from sklearn.linear_model import LogisticRegression

n = X.shape[0]
Xw = X[w]
B = 100
C_grid = (100, 50, 10, 1)  # grid for the regularization constant
test_size = .1
ntest = round(.1 * n)  # len(itest) = 4 when test_size = .1
y_score_LR = np.zeros([ntest, B])
y_score = np.zeros([ntest, B, len(C_grid)])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=test_size)
    Xtrain = Xw.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    clf_LR = LogisticRegression()
    clf_LR.fit(Xtrain, ytrain)
    y_score_LR[:, b] = clf_LR.predict_proba(Xw.iloc[itest])[:, 1]
    for i, C in enumerate(C_grid):
        # liblinear supports the l1 penalty
        clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.001, solver='liblinear')
        clf_l1_LR.fit(Xtrain, ytrain)
        y_score[:, b, i] = clf_l1_LR.predict_proba(Xw.iloc[itest])[:, 1]
        # If we want to evaluate the sparsity
        coef_l1_LR = clf_l1_LR.coef_.ravel()
        nb_z = np.mean(coef_l1_LR == 0) * 100
        print("{0:.2f}% of coefficients are zeros for C equal to {1:.2f}".format(nb_z, C))

auc_tab = np.zeros(len(C_grid))
for i in range(len(C_grid)):
    tmp = np.reshape(y_score[:, :, i], [ntest, B])
    auc_tab[i] = ROC(y_test, tmp, plot=False)
print(auc_tab)
ROC(y_test, y_score_LR, "LR")

## ============================================================================
## Neural Network
from sklearn.neural_network import MLPClassifier

n = X.shape[0]
B = 30
layer_grid = ([2], [3], [4], [5], [2, 1], [3, 1], [3, 2])  # different architectures
alpha_grid = (1e-3, 1e-2, 1e-1, 1, 10)                     # regularization strengths
test_size = .1
ntest = round(.1 * n)  # size of the test sample
y_score = np.zeros([ntest, B, len(layer_grid) * len(alpha_grid)])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=test_size)
    Xtrain = X.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    # Correlated genes have to be selected here, inside the cross-validation
    # loop, and not outside it (otherwise the test samples leak into the selection).
    rho = np.zeros(ncol)
    for k in range(ncol):
        c = np.corrcoef(Xtrain[k], ytrain)
        rho[k] = c[0, 1]
    w = np.nonzero(abs(rho) > .5)[0]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    cnt = 0
    for i, layers in enumerate(layer_grid):
        for j, alpha in enumerate(alpha_grid):
            clf = MLPClassifier(solver='lbfgs', alpha=alpha, hidden_layer_sizes=layers)
            clf.fit(Xtrain[w], ytrain)
            y_score[:, b, cnt] = clf.predict_proba(X[w].iloc[itest])[:, 1]
            cnt += 1

auc_tab = np.zeros(len(layer_grid) * len(alpha_grid))
for i in range(cnt):
    tmp = np.reshape(y_score[:, :, i], [ntest, B])
    auc_tab[i] = ROC(y_test, tmp, plot=False)
print(auc_tab)
# print the architecture and alpha with the best AUC
cnt = 0
for i, layers in enumerate(layer_grid):
    for j, alpha in enumerate(alpha_grid):
        if np.where(auc_tab == max(auc_tab))[0][0] == cnt:
            print(cnt, layers, alpha)
        cnt += 1
tmp = np.reshape(y_score[:, :, np.where(auc_tab == max(auc_tab))[0][0]], [ntest, B])
ROC(y_test, tmp, "ANN")

## ============================================================================
## Random forest
from sklearn.ensemble import RandomForestClassifier

n = X.shape[0]
B = 30
test_size = .1
ntest = round(.1 * n)  # size of the test sample
y_score_RF = np.zeros([ntest, B])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=test_size)
    Xtrain = X.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    # Correlated genes are again selected inside the cross-validation loop,
    # not outside it.
    rho = np.zeros(ncol)
    for k in range(ncol):
        c = np.corrcoef(Xtrain[k], ytrain)
        rho[k] = c[0, 1]
    w = np.nonzero(abs(rho) > .5)[0]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    clf = RandomForestClassifier(max_depth=2)  # default number of trees is 10
    clf.fit(Xtrain[w], ytrain)
    y_score_RF[:, b] = clf.predict_proba(X[w].iloc[itest])[:, 1]
ROC(y_test, y_score_RF, "RF")
var_imp = clf.feature_importances_  # importances of the last fitted forest
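# A minimal sketch (not in the original script) to inspect var_imp: plot the
# importances of the genes selected in the last fold, sorted in decreasing
# order, to see how many genes actually drive the forest.
k = min(20, len(var_imp))
order = np.argsort(var_imp)[::-1][:k]
plt.bar(range(k), var_imp[order])
plt.xticks(range(k), w[order], rotation=90)  # gene indices from the last selection w
plt.ylabel('Importance')
plt.xlabel('gene index')
plt.title('RF: most important genes (last fold)')
plt.show()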
## ============================================================================
## Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier

# if no preselection of genes is done
n = X.shape[0]
B = 30
test_size = .1
ntest = round(.1 * n)  # size of the test sample
y_score_GB = np.zeros([ntest, B])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=test_size)
    Xtrain = X.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    clf = GradientBoostingClassifier(loss='exponential', n_estimators=200, max_features=15)
    clf.fit(Xtrain, ytrain)
    y_score_GB[:, b] = clf.predict_proba(X.iloc[itest])[:, 1]
ROC(y_test, y_score_GB, "GB")

# with preselection
n = X.shape[0]
B = 30
test_size = .1
ntest = round(.1 * n)  # size of the test sample
y_score_GB = np.zeros([ntest, B])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=test_size)
    Xtrain = X.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    # correlated genes are selected here, inside the cross-validation loop
    rho = np.zeros(ncol)
    for k in range(ncol):
        c = np.corrcoef(Xtrain[k], ytrain)
        rho[k] = c[0, 1]
    w = np.nonzero(abs(rho) > .5)[0]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    clf = GradientBoostingClassifier(loss='exponential', n_estimators=200, max_features=15)
    clf.fit(Xtrain[w], ytrain)
    y_score_GB[:, b] = clf.predict_proba(X[w].iloc[itest])[:, 1]
ROC(y_test, y_score_GB, "GB, with preselection")

## ============================================================================
## SVM
# http://www2.stat.duke.edu/~sayan/webPub/svmmicro.pdf
from sklearn.svm import SVC

# First fit an SVM with an rbf kernel
n = X.shape[0]
B = 30
test_size = .1
ntest = round(.1 * n)  # size of the test sample
C_grid = (1e-1, 1, 1e1, 1e2, 1e3, 3e3)          # different values for the margin parameter
gamma_grid = (1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1)  # variance of the rbf kernel
y_score_svc = np.zeros([ntest, B, len(C_grid) * len(gamma_grid)])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=test_size)
    Xtrain = X.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    # Keep the genes whose signal-to-noise ratio is "large".
    # SNR: (mean_class_1 - mean_class_2) / std_all
    Xt1 = Xtrain.iloc[np.where(ytrain == 1)[0]]
    Xt2 = Xtrain.iloc[np.where(ytrain == 2)[0]]
    snr = (Xt1.apply(np.mean) - Xt2.apply(np.mean)) / Xtrain.apply(np.std)
    w = np.nonzero(abs(snr) > 1)[0]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    cnt = 0
    for i, C in enumerate(C_grid):
        for j, gamma in enumerate(gamma_grid):
            clf = SVC(C=C, gamma=gamma, probability=True)
            clf.fit(Xtrain[w], ytrain)
            # probability of class 2, consistent with the other sections
            y_score_svc[:, b, cnt] = clf.predict_proba(X[w].iloc[itest])[:, 1]
            cnt += 1
# (the same loop can be run without preselection by fitting on Xtrain directly)

auc_tab = np.zeros(len(C_grid) * len(gamma_grid))
for i in range(cnt):
    tmp = np.reshape(y_score_svc[:, :, i], [ntest, B])
    auc_tab[i] = ROC(y_test, tmp, plot=False)
print(auc_tab)
# print the (C, gamma) pair with the best AUC
cnt = 0
for i, C in enumerate(C_grid):
    for j, gamma in enumerate(gamma_grid):
        if np.where(auc_tab == max(auc_tab))[0][0] == cnt:
            print(cnt, C, gamma)
        cnt += 1
tmp = np.reshape(y_score_svc[:, :, np.where(auc_tab == max(auc_tab))[0][0]], [ntest, B])
ROC(y_test, tmp, "SVC-rbf")
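# The double loop over (C, gamma) above can also be written with scikit-learn's
# GridSearchCV. This is a minimal sketch (not part of the original script),
# shown on the genes selected in the last fold for brevity.
from sklearn.model_selection import GridSearchCV
param_grid = {'C': C_grid, 'gamma': gamma_grid}
search = GridSearchCV(SVC(probability=True), param_grid, scoring='roc_auc', cv=5)
search.fit(X[w], Y)
print("best (C, gamma):", search.best_params_, "CV AUC:", search.best_score_)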
# Now fit an SVM with a linear kernel
n = X.shape[0]
B = 30
test_size = .1
ntest = round(.1 * n)  # size of the test sample
C_grid = (1e-2, 1e-1, 1, 1e1, 1e2, 1e3)  # different values for the margin parameter
y_score_svc = np.zeros([ntest, B, len(C_grid)])
y_test = np.zeros([ntest, B])
for b in range(B):
    itrain, itest = train_test_split(range(n), test_size=test_size)
    Xtrain = X.iloc[itrain]
    ytrain = Y[np.asarray(itrain)]
    # The genes are again selected inside the cross-validation loop,
    # here with the signal-to-noise ratio.
    Xt1 = Xtrain.iloc[np.where(ytrain == 1)[0]]
    Xt2 = Xtrain.iloc[np.where(ytrain == 2)[0]]
    snr = (Xt1.apply(np.mean) - Xt2.apply(np.mean)) / Xtrain.apply(np.std)
    w = np.nonzero(abs(snr) > .5)[0]
    ytest = Y[np.asarray(itest)]
    y_test[:, b] = ytest
    cnt = 0
    for i, C in enumerate(C_grid):
        clf = SVC(kernel='linear', C=C, probability=True)
        clf.fit(Xtrain[w], ytrain)
        # probability of class 2, consistent with the other sections
        y_score_svc[:, b, cnt] = clf.predict_proba(X[w].iloc[itest])[:, 1]
        cnt += 1
# (the same loop can be run without preselection by fitting on Xtrain directly)

auc_tab = np.zeros(len(C_grid))
for i in range(cnt):
    tmp = np.reshape(y_score_svc[:, :, i], [ntest, B])
    auc_tab[i] = ROC(y_test, tmp, plot=False)
print(auc_tab)
# print the value of C with the best AUC
cnt = 0
for i, C in enumerate(C_grid):
    if np.where(auc_tab == max(auc_tab))[0][0] == cnt:
        print(cnt, C)
    cnt += 1
tmp = np.reshape(y_score_svc[:, :, np.where(auc_tab == max(auc_tab))[0][0]], [ntest, B])
ROC(y_test, tmp, "SVC-linear")
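# The script repeatedly stresses that gene selection must happen inside the
# cross-validation loop. A compact way to enforce this (a sketch, not part of
# the original analysis) is a scikit-learn Pipeline whose selection step is
# refitted on each training fold; SelectKBest with the f_classif score is used
# here as a stand-in for the correlation/SNR filters above.
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import cross_val_score

pipe = Pipeline([('select', SelectKBest(f_classif, k=50)),
                 ('svc', SVC(kernel='linear', C=1))])
scores = cross_val_score(pipe, X, Y, scoring='roc_auc', cv=5)
print("leak-free CV AUC: %.3f +/- %.3f" % (scores.mean(), scores.std()))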