#!/usr/bin/env python2 # -*- coding: utf-8 -*- """ Created on Mon Jun 12 10:11:20 2017 @author: valerie """ from mpl_toolkits.basemap import Basemap import matplotlib.pyplot as plt import numpy as np import pandas as pd from math import ceil from scipy import linalg from sklearn.cross_validation import ShuffleSplit from sklearn.cross_validation import cross_val_score from sklearn.base import BaseEstimator from sklearn.linear_model import LinearRegression, Ridge, Lasso from sklearn.tree import DecisionTreeRegressor from sklearn.svm import SVR from sklearn.neighbors import KNeighborsRegressor from sklearn.neural_network import MLPRegressor # data importation Ozone = pd.read_csv("~/Documents/ENSEIGNEMENT/TECHNICOLOR/Ozone.dat", delim_whitespace=True, dtype=object) Ozone['maxO3']=Ozone['maxO3'].astype('float64') Ozone['maxO3v']=Ozone['maxO3v'].astype('float64') Ozone['T9']=Ozone['T9'].astype('float64') Ozone['T12']=Ozone['T12'].astype('float64') Ozone['T15']=Ozone['T15'].astype('float64') Ozone['Ne9']=Ozone['Ne9'].astype('float64') Ozone['Ne12']=Ozone['Ne12'].astype('float64') Ozone['Ne15']=Ozone['Ne15'].astype('float64') Ozone['Vx9']=Ozone['Vx9'].astype('float64') Ozone['Vx12']=Ozone['Vx12'].astype('float64') Ozone['Vx15']=Ozone['Vx15'].astype('float64') # plt.plot(Ozone['maxO3']) plt.plot(Ozone['maxO3v'],Ozone['maxO3'],'.') ## ============================================================================ ## LLR def lowess(x, y, f=2. / 3., iter=3): """lowess(x, y, f=2./3., iter=3) -> yest Lowess smoother: Robust locally weighted regression. The lowess function fits a nonparametric regression curve to a scatterplot. The arrays x and y contain an equal number of elements; each pair (x[i], y[i]) defines a data point in the scatterplot. The function returns the estimated (smooth) values of y. The smoothing span is given by f. A larger value for f will result in a smoother curve. The number of robustifying iterations is given by iter. The function will run faster with a smaller number of iterations. """ n = len(x) r = int(ceil(f * n)) h = [np.sort(np.abs(x - x[i]))[r] for i in range(n)] w = np.clip(np.abs((x[:, None] - x[None, :]) / h), 0.0, 1.0) w = (1 - w ** 3) ** 3 yest = np.zeros(n) delta = np.ones(n) for iteration in range(iter): for i in range(n): weights = delta * w[:, i] b = np.array([np.sum(weights * y), np.sum(weights * y * x)]) A = np.array([[np.sum(weights), np.sum(weights * x)], [np.sum(weights * x), np.sum(weights * x * x)]]) beta = linalg.solve(A, b) yest[i] = beta[0] + beta[1] * x[i] residuals = y - yest s = np.median(np.abs(residuals)) delta = np.clip(residuals / (6.0 * s), -1, 1) delta = (1 - delta ** 2) ** 2 return yest yest = lowess(Ozone['maxO3v'],Ozone['maxO3']) plt.clf() plt.plot(Ozone['maxO3v'],Ozone['maxO3'], '.') plt.plot(Ozone['maxO3v'], yest, label='y pred') plt.legend() plt.show() yest = lowess(Ozone['T9'],Ozone['maxO3']) plt.clf() plt.plot(Ozone['T9'],Ozone['maxO3'], '.') plt.plot(Ozone['T9'], yest,'.', label='y pred') plt.legend() plt.show() # ============================================================================= # kNN y = Ozone['maxO3'] X = Ozone.iloc[:,2:12] # keep the numerical values only X1 = Ozone.iloc[:,np.append(range(3,12,3),11)] skf = ShuffleSplit(y.shape[0], n_iter=100, test_size=.25, random_state=1) NN_max = 15 sc_m_dist_L1 = np.zeros(NN_max) sc_sd_dist_L1 = np.zeros(NN_max) sc_m_dist_L2 = np.zeros(NN_max) sc_sd_dist_L2 = np.zeros(NN_max) sc_m_unif = np.zeros(NN_max) sc_sd_unif = np.zeros(NN_max) i = 0 for n_neighbors in range(1,NN_max+1): scores_dist = np.sqrt(-cross_val_score(KNeighborsRegressor(n_neighbors=n_neighbors, weights='distance'), X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) scores_unif = np.sqrt(-cross_val_score(KNeighborsRegressor(n_neighbors=n_neighbors, weights='uniform'), X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) sc_m_dist_L1[i] = np.mean(scores_dist) sc_sd_dist_L1[i] = np.std(scores_dist) scores_dist = np.sqrt(-cross_val_score( KNeighborsRegressor(n_neighbors=n_neighbors, weights='distance',metric='euclidean'), X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) sc_m_dist_L2[i] = np.mean(scores_dist) sc_sd_dist_L2[i] = np.std(scores_dist) sc_m_unif[i] = np.mean(scores_unif) sc_sd_unif[i] = np.std(scores_unif) print([n_neighbors, np.mean(scores_dist), np.std(scores_dist), np.mean(scores_unif), np.std(scores_unif)]) i = i+1 plt.plot(range(1,NN_max+1), sc_m_unif, '-') plt.fill_between(range(1,NN_max+1), sc_m_unif-sc_sd_unif, sc_m_unif+sc_sd_unif, alpha=0.3) plt.plot(range(1,NN_max+1), sc_m_dist_L2, '-') plt.fill_between(range(1,NN_max+1), sc_m_dist_L2-sc_sd_dist_L2, sc_m_dist_L2+sc_sd_dist_L2, alpha=0.3) #plt.plot(range(1,NN_max+1), sc_m_dist_L1, '-') #plt.fill_between(range(1,NN_max+1), sc_m_dist_L1-sc_sd_dist_L1, # sc_m_dist_L1+sc_sd_dist_L1, alpha=0.3) plt.plot(range(1,NN_max+1), sc_m_unif, 'ko-') plt.plot(range(1,NN_max+1), sc_m_unif-sc_sd_unif,'k--', range(1,NN_max+1), sc_m_unif+sc_sd_unif,'k--') plt.plot(range(1,NN_max+1), sc_m_dist_L2, 'b+-') plt.plot(range(1,NN_max+1), sc_m_dist_L2-sc_sd_dist_L2,'b--', range(1,NN_max+1), sc_m_dist_L2+sc_sd_dist_L2,'b--') plt.xlabel('k') plt.ylabel('RMSE') plt.grid() #plt.fill_between(range(1,NN_max+1), sc_m_dist_L2-sc_sd_dist_L2, # sc_m_dist_L2+sc_sd_dist_L2, alpha=0.3) plt.legend(['uniform','dist L2']) plt.savefig('Ozone_kNN.eps', format='eps', dpi=300) #knn = neighbors.KNeighborsRegressor(n_neighbors, weights=weights) #y_ = knn.fit(X, y).predict(T) ## ============================================================================ ## Regression Trees X = Ozone.iloc[:,2:12] max_depth=15 sc_m_dt = np.zeros(max_depth) sc_sd_dt = np.zeros(max_depth) i = 0 for depth in range(1,max_depth+1): scores = np.sqrt(-cross_val_score(DecisionTreeRegressor(max_depth=depth), X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) print([depth, np.mean(scores), np.std(scores)]) sc_m_dt[i] = np.mean(scores) sc_sd_dt[i] = np.std(scores) i = i+1 plt.plot(range(1,max_depth+1), sc_m_dt, 'ko-') plt.xlabel('Tree depth') plt.ylabel('RMSE') plt.plot(range(1,max_depth+1), sc_m_dt-sc_sd_dt,'k--', range(1,max_depth+1), sc_m_dt+sc_sd_dt,'k--') plt.savefig('Ozone_DT.eps', format='eps', dpi=300) from sklearn.tree import export_graphviz import os Tr = DecisionTreeRegressor(max_depth=depth) Tr_fit = Tr.fit(X,y) export_graphviz(Tr_fit) os.system('dot -Tpng tree.dot -o tree.png') ## Linear model scores = np.sqrt(-cross_val_score(LinearRegression(), X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) plt.plot([1,15],[np.mean(scores),np.mean(scores)],'r') plt.plot(range(1,max_depth+1), sc_m_dt, 'ko-') plt.plot(range(1,NN_max+1), sc_m_dist_L2, 'b+-') plt.xlabel('Tree depth/kNN') plt.ylabel('RMSE') plt.legend(['Linear','kNN','Tree']) #plt.savefig('Ozone_compar1.eps', format='eps', dpi=300) # Subset selection import itertools import time import statsmodels.api as sm def processSubset(feature_set): # Fit model on feature_set and calculate RSS model = sm.OLS(y,X[list(feature_set)]) regr = model.fit() RSS = ((regr.predict(X[list(feature_set)]) - y) ** 2).sum() return {'models':regr, 'RSS':RSS} def getBest(k): tic = time.time() results = [] for combo in itertools.combinations(X.columns, k): results.append(processSubset(combo)) # Wrap everything up in a nice dataframe # Choose the model with the highest RSS best_model = models.loc[models['RSS'].argmin()] toc = time.time() print("Processed ", models.shape[0], "models on", k, "predictors in", (toc-tic), "seconds.") # Return the best model, along with some other useful information about the model return best_model models = pd.DataFrame(columns=["RSS", "model"]) tic = time.time() for i in range(1,8): models.loc[i] = getBest(i) toc = time.time() print("Total elapsed time:", (toc-tic), "seconds.") ## ============================================================================ ## ANN # One hidden layer max_nb = 15 sc_m_ann = np.zeros(max_nb) sc_sd_ann = np.zeros(max_nb) i = 0 for nb_neurons in range(2,max_nb+1): scores = np.sqrt(-cross_val_score(MLPRegressor(solver='sgd', alpha=.1, hidden_layer_sizes=(nb_neurons), learning_rate='invscaling', learning_rate_init = .001, max_iter=5000), X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) print([nb_neurons, np.mean(scores), np.std(scores)]) sc_m_ann[i] = np.mean(scores) sc_sd_ann[i] = np.std(scores) i = i+1 max_1 = 15 max_2 = 10 sc_m_ann = np.zeros(100) sc_sd_ann = np.zeros(100) i = 0 for nb_1 in range(6,max_1+1): for nb_2 in range(2,min(nb_1,max_2)+1): scores = np.sqrt(-cross_val_score(MLPRegressor(solver='lbfgs',alpha=1e-5, hidden_layer_sizes=(nb_1,nb_2), learning_rate='invscaling',max_iter=500), X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) print([nb_1, nb_2, np.mean(scores), np.std(scores)]) sc_m_ann[i] = np.mean(scores) sc_sd_ann[i] = np.std(scores) i = i+1 ## SVR svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1) svr_lin = SVR(kernel='linear', C=1e3) svr_poly = SVR(kernel='poly', C=1e3, degree=2) y_rbf = svr_rbf.fit(X, y).predict(X) y_lin = svr_lin.fit(X, y).predict(X) y_poly = svr_poly.fit(X, y).predict(X) ## ============================================================================ ### Adaboost from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import AdaBoostRegressor y = Ozone['maxO3'] X = Ozone.iloc[:,2:12] # Fit regression model skf = ShuffleSplit(y.shape[0], n_iter=100, test_size=.25, random_state=1) regr_1 = DecisionTreeRegressor(max_depth=2) scores_1 = np.sqrt(-cross_val_score(regr_1, X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) regr_10 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=2), n_estimators=10, random_state=1) scores_10 = np.sqrt(-cross_val_score(regr_10, X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) regr_50 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=2), n_estimators=50, random_state=1) scores_50 = np.sqrt(-cross_val_score(regr_50, X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) regr_100 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=2), n_estimators=100, random_state=1) scores_100 = np.sqrt(-cross_val_score(regr_100, X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) regr_300 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=2), n_estimators=300, random_state=1) scores_300 = np.sqrt(-cross_val_score(regr_300, X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) tmp= np.array([scores_1,scores_10,scores_50,scores_100,scores_300]) df = pd.DataFrame(tmp.T,columns=['1','10','50','100','300'] ) plt.boxplot(tmp.T,labels=['1','10','50','100','300']) plt.xlabel('Nb boosting iterations') plt.ylabel('RMSE') plt.savefig('Ozone_boosting.eps', format='eps', dpi=300) regr_1.fit(X, y) regr_2.fit(X, y) # Predict y_1 = regr_1.predict(X) y_2 = regr_2.predict(X) # Plot the results plt.figure() plt.plot(y, y_1, '.', c="g", label="n_estimators=1") plt.plot(y, y_2, '.', c="r", label="n_estimators=300") plt.xlabel("data") plt.ylabel("target") plt.title("Boosted Decision Tree Regression") plt.legend() plt.show() ## ============================================================================ # Gradient boosting from sklearn import ensemble from sklearn.model_selection import GridSearchCV param_grid = {'n_estimators': [1e0, 1e1, 5e1, 1e2, 3e2, 1e3], 'learning_rate': [0.001,0.01,0.1]} params = {'max_depth': 2, 'min_samples_split': 2, 'loss': 'ls'} boost = GridSearchCV(ensemble.GradientBoostingRegressor(**params), cv=skf, param_grid=param_grid) boost.fit(X,y) params = {'n_estimators': 1, 'max_depth': 2, 'min_samples_split': 2, 'learning_rate': 0.01, 'loss': 'ls'} regr_1 = ensemble.GradientBoostingRegressor(**params) scores_1 = np.sqrt(-cross_val_score(regr_1, X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) params = {'n_estimators': 10, 'max_depth': 2, 'min_samples_split': 2, 'learning_rate': 0.01, 'loss': 'ls'} regr_10 = ensemble.GradientBoostingRegressor(**params) scores_10 = np.sqrt(-cross_val_score(regr_10, X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) params = {'n_estimators': 50, 'max_depth': 2, 'min_samples_split': 2, 'learning_rate': 0.01, 'loss': 'ls'} regr_50 = ensemble.GradientBoostingRegressor(**params) scores_50 = np.sqrt(-cross_val_score(regr_50, X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) params = {'n_estimators': 100, 'max_depth': 2, 'min_samples_split': 2, 'learning_rate': 0.01, 'loss': 'ls'} regr_100 = ensemble.GradientBoostingRegressor(**params) scores_100 = np.sqrt(-cross_val_score(regr_100, X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) params = {'n_estimators': 300, 'max_depth': 2, 'min_samples_split': 2, 'learning_rate': 0.01, 'loss': 'ls'} regr_300 = ensemble.GradientBoostingRegressor(**params) scores_300 = np.sqrt(-cross_val_score(regr_300, X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) params = {'n_estimators': 1000, 'max_depth': 2, 'min_samples_split': 2, 'learning_rate': 0.01, 'loss': 'ls'} regr_1000 = ensemble.GradientBoostingRegressor(**params) scores_1000 = np.sqrt(-cross_val_score(regr_1000, X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) tmp= np.array([scores_1,scores_10,scores_50,scores_100,scores_300,scores_1000]) df = pd.DataFrame(tmp.T,columns=['1','10','50','100','300','1000'] ) plt.boxplot(tmp.T,labels=['1','10','50','100','300','1000']) plt.xlabel('Nb boosting iterations') plt.ylabel('RMSE') plt.savefig('Ozone_gradient_boosting.eps', format='eps', dpi=300) regr_300.fit(X,y) feature_importance = regr_300.feature_importances_ # make importances relative to max importance feature_importance = 100.0 * (feature_importance / feature_importance.max()) sorted_idx = np.argsort(feature_importance) pos = np.arange(sorted_idx.shape[0]) + .5 plt.subplot(1, 2, 1) plt.barh(pos, feature_importance[sorted_idx], align='center') colnames = np.array(['T9','T12','T15','Ne9','Ne12','Ne15','Vx9','Vx12','Vx15','maxO3v']) plt.yticks(pos, colnames[sorted_idx]) plt.xlabel('Relative Importance') plt.title('Variable Importance') #plt.show() plt.savefig('Ozone_gradient_boosting_importance.eps', format='eps', dpi=300) ## ============================================================================ ### Kernel Ridge Regression from sklearn.model_selection import GridSearchCV param_grid = {"alpha": [1e0, 1e-1, 1e-2, 1e-3]} kr = GridSearchCV(KernelRidge(cv=skf, param_grid=param_grid) kr.fit(X, y) kr.best_score_ kr.best_params_ scores = np.sqrt(-cross_val_score(KernelRidge(kernel='rbf',alpha=.1,gamma=0.00046), X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) np.mean(scores) ## ============================================================================ ### SVR from sklearn.svm import SVR param_grid={"C": [1e0, 1e1, 1e2, 1e3], "gamma": np.logspace(-3, 2, 5), 'epsilon':[1e1, 1e0, 1e-1, 1e-2, 1e-3]} svr = GridSearchCV(SVR(kernel='rbf'), cv=skf, param_grid=param_grid) svr.fit(X, y) svr.best_score_ svr.best_params_ scores = np.sqrt(-cross_val_score(SVR(kernel='rbf', gamma=0.001,C=100,epsilon=10), X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) np.mean(scores) param_grid={"C": [1e0, 1e1, 1e2, 1e3], 'epsilon':[1e1, 1e0, 1e-1, 1e-2, 1e-3]} svr = GridSearchCV(SVR(kernel='linear'), cv=skf, param_grid=param_grid) svr.fit(X, y) svr.best_score_ svr.best_params_ scores = np.sqrt(-cross_val_score(SVR(kernel='linear', C=1,epsilon=10), X=X, y=y, scoring='neg_mean_squared_error', cv=skf)) np.mean(scores)