#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Mon Jul 30 15:55:35 2018 @author: valerie """ # =================================================== # Prediction of a categorical variable # =================================================== import numpy as np import matplotlib.pyplot as plt from urllib.request import urlopen # Cancer Relaps dataset url = "https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_for_other_methods.gct" X = np.loadtxt(urlopen(url),skiprows=3,usecols=range(2,99)) X = X.T names = np.genfromtxt(urlopen(url),dtype='str',usecols=1,skip_header=3) url = "https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/Ex2_breast_cancer_ER_train_set_class_labels.cls" label = np.loadtxt(urlopen(url),skiprows=2) y=label ## ============================================================================ ## Import utilities from sklearn.model_selection import train_test_split from sklearn.metrics import roc_curve, auc def ROC(y_test,y_score,methodName=" ",plot=True): ntest = np.size(y_test,0) B = np.size(y_test,1) fpr, tpr, _ = roc_curve(np.reshape(y_test,B*ntest), np.reshape(y_score,B*ntest)) # if len(fpr)<3: # print("Problem: len(fpr) is lower than 3") # return roc_auc = auc(fpr, tpr) if plot: lw = 2 plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title(methodName) plt.legend(loc="lower right") plt.show() return(roc_auc) def genes_SNR(X,y): # - Inputs # X = dataset # y = labels (a boolean or factor variable) # - Output # SNR labels = np.unique(y) n,p = X.shape K = len(labels) means = np.zeros((K,p)) sd = np.std(X,axis=0) for k in range(K): means[k,] = np.mean(X[y==labels[k],:],axis=0) SNR = np.max(np.reshape(np.abs(np.diff(means,axis=0)),(K-1,p)),axis=0)/sd return(SNR) ## ============================================================================ ## Neural Network from sklearn.neural_network import MLPClassifier B = 30 n_test = 9 y_score = np.zeros([n_test,B]) y_test_all = np.zeros([n_test,B]) for b in range(B): X_train, X_test, y_train, y_test = train_test_split(X,label,test_size=n_test) SNR = genes_SNR(X_train,y_train) keep = np.argsort(SNR)[-50:] mk=np.mean(X_train,axis=0) sk=np.maximum(np.std(X_train,axis=0),10*np.finfo(float).eps) X_train, X_test = np.add(X_train,-mk), np.add(X_test,-mk) X_train, X_test = np.multiply(X_train,1/sk),np.multiply(X_test,1/sk) clf = MLPClassifier(solver='lbfgs', alpha=0,hidden_layer_sizes=3) # 3 neurones clf.fit(X_train[:,keep],y_train) y_score[:,b] = clf.predict_proba(X_test[:,keep])[:,1] y_test_all[:,b] = y_test ROC(y_test_all,y_score,"MLP,3") # ============================================================================= # Another way to do the selection of the hyper parameters # - more efficient from a computation time point of vue # ============================================================================= from sklearn.model_selection import ShuffleSplit from sklearn.model_selection import GridSearchCV, cross_val_score SNR = genes_SNR(X,y) keep = np.argsort(SNR)[-150:] params = {'hidden_layer_sizes': [3,4,5,10], 'alpha': [.05,.1,1], 'learning_rate' : ['constant', 'invscaling', 'adaptive'], 'solver' : ['lbfgs','sgd']} clf_ann = GridSearchCV(MLPClassifier(max_iter=1000), param_grid=params, cv=ShuffleSplit(test_size=.25, n_splits=30, random_state=1), scoring = "accuracy", refit=False,n_jobs=-1) clf_ann.fit(X[:,keep], y.ravel()) print("Best parameters :",clf_ann.best_params_) print("Best score :",np.round(clf_ann.best_score_,2)) B = 30 n_test = 9 y_score = np.zeros([n_test,B]) y_test_all = np.zeros([n_test,B]) for b in range(B): X_train, X_test, y_train, y_test = train_test_split(X,label,test_size=n_test) SNR = genes_SNR(X_train,y_train) keep = np.argsort(SNR)[-150:] mk=np.mean(X_train,axis=0) sk=np.maximum(np.std(X_train,axis=0),10*np.finfo(float).eps) X_train, X_test = np.add(X_train,-mk), np.add(X_test,-mk) X_train, X_test = np.multiply(X_train,1/sk),np.multiply(X_test,1/sk) clf = MLPClassifier(alpha=.05,hidden_layer_sizes=10, learning_rate='adaptive',solver='sgd', max_iter=1000) # 3 neurones clf.fit(X_train,y_train) y_score[:,b] = clf.predict_proba(X_test)[:,1] y_test_all[:,b] = y_test ROC(y_test_all,y_score,"MLP")