# PREDICTION OF THE ENSO INDEX

## Data

The database used in this challenge comes from version 4 of the NOAA NCDC Extended Reconstructed Sea Surface Temperature (ERSST) dataset (more details here: https://www.ncdc.noaa.gov/data-access/marineocean-data/extended-reconstructed-sea-surface-temperature-ersst-v4). It consists of monthly maps of Sea Surface Temperature (SST) anomalies (with respect to a 1971-2000 climatology) in the Equatorial Pacific. It combines in situ measurements (buoys, ships) and satellite observations (available since the 1970s). Data are available between 1854 and 2017.

## Aim

The goal is to predict, 6 months ahead, the ENSO index calculated from SST anomalies averaged over the El Nino 3.4 region. More information on the ENSO index and how to calculate it from SST is available at: http://journals.ametsoc.org/doi/pdf/10.1175/1520-0477%281997%29078%3C2771%3ATDOENO%3E2.0.CO%3B2. Below, you will find an explanation of El Nino and La Nina. To compute the ENSO index, the SST anomalies in the Tropical Pacific (El Nino 3.4 region) are averaged. The El Nino/La Nina threshold is 0.5 °C.

## Evaluation procedure

To evaluate and compare the results, we propose a cross validation with the RMSE score, using the following command lines:

- skf = ShuffleSplit(n_splits=10, test_size=1/3, random_state=1)
- np.sqrt(-cross_val_score(reg, X=X, y=y, scoring='neg_mean_squared_error', cv=skf))

Then, we will test the methodology on the prediction of the last 10 years and see whether we are able to predict the big El Nino events of 2009-2010 and 2015-2016.

## Remarks

Note that the ENSO and SST anomaly time series are not stationary in time (there is a slight increasing trend in the data). You can either take this trend into account or train your predictive model on a stationary subpart of the time series.

## Import libraries
In [21]:
%matplotlib inline 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.image as mpimg
plt.rcParams['figure.figsize'] = (16, 9)

## Import and plot data

Here, we import the full file of SST anomalies in the Equatorial Pacific (January 1901 to May 2017; in the original netcdf version of the dataset, the variables are named "lon", "lat", "time" and "sst_ano"). Note that you can select a subsample of the dataset by selecting the time period (i_good_time); see the sketch after the import cell below.

In [11]:
import urllib.request
import io

# Downloading and parsing the files below may take some time
target_url = 'https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/SSTanom_011901_052017_ERSSTv4.txt'
txt = urllib.request.urlopen(target_url).read().decode('utf-8')
data = np.genfromtxt(io.StringIO(txt))  # one row per grid point, one column per month

#f = open('SSTanom_011901_052017_ERSSTv4.txt', 'r')  # Use these command lines
#                      # if you downloaded the file to your home directory
#data = np.genfromtxt(f)
#f.close()

target_url = 'https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/lonlat_Jul2017.txt'
txt = urllib.request.urlopen(target_url).read().decode('utf-8')
lonlat = np.genfromtxt(io.StringIO(txt))  # two columns: longitude and latitude of each grid point

NOI = pd.read_csv('https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/ElNino_011950_052017.tex',
                  delim_whitespace=True, header=0)  # ENSO index, 01/1950 to 05/2017

lon = np.unique(lonlat[:,0])
lat = np.unique(lonlat[:,1])
lon_mat = np.reshape(lonlat[:,0],[len(lat),len(lon)])
lat_mat = np.reshape(lonlat[:,1],[len(lat),len(lon)])

lat_mat.shape   # 33 latitudes x 84 longitudes; lon_mat.shape is identical
Out[11]:
(33, 84)
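
As mentioned above, you can restrict the dataset to a time period before going further. A minimal sketch, assuming the columns of data are consecutive months starting in January 1901 (consistent with the extraction cell further below); the name i_good_time comes from the text and the 1950 cutoff is an arbitrary choice.

In [ ]:
# Keep only the months from January 1950 onwards (sketch)
months = pd.date_range('1901-01', periods=data.shape[1], freq='MS')
i_good_time = np.where(months >= '1950-01-01')[0]
data_sub = data[:,i_good_time]
data_sub.shape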
In [22]:
# Plot an example of SST field (month index 1000)
data_mat = np.reshape(data[:,1000],[len(lat),len(lon)])
(i,j) = np.where(data_mat==-99)  # -99 flags missing values (land points)
data_mat[i,j] = -10              # set land to an off-scale value so it stands out
plt.pcolor(lon_mat,lat_mat,data_mat)
plt.clim([-2,3])
plt.colorbar()
plt.title('Example of SST')
plt.show()
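
As an alternative to overwriting the -99 flag with an artificial value, the missing cells can be masked so that matplotlib leaves them blank. A sketch under the same assumption that -99 marks land points.

In [ ]:
# Same map with missing values masked instead of overwritten
data_mat = np.reshape(data[:,1000],[len(lat),len(lon)])
data_masked = np.ma.masked_where(data_mat==-99, data_mat)
plt.pcolor(lon_mat,lat_mat,data_masked)
plt.clim([-2,3])
plt.colorbar()
plt.title('Example of SST (missing values masked)')
plt.show()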
In [23]:
# Plot the SST mean field (time average at each grid point)
data_mat = np.reshape(np.mean(data,1),[len(lat),len(lon)])
(i,j) = np.where(data_mat<-19)  # land points average to large negative values
data_mat[i,j] = -19             # clip them for display
plt.pcolor(lon_mat,lat_mat,data_mat)
plt.clim([-1,1])
plt.colorbar()
plt.title('SST mean field')
plt.show()

## Feature definition

Here, we declare the input variable $X$ (map of SST anomalies in the El Nino 3.4 region at month $m$) and the response variable $y$ (ENSO index at month $m+6$). Note that other SST anomaly features can be used (larger region, delayed times, etc.); a sketch of the "delayed times" idea follows the next cell.

In [16]:
# Extract data from 01/1950 to 05/2017
deb = (1950-1901)*12     # index of the first column of 01/1950 (data starts in 01/1901)
fin = data.shape[1]      # index after the last month (05/2017)
X = data[:,deb:(fin-6)].T      # SST map at month m (one row per month)
y = NOI['ANOM'][6:len(NOI)]    # ENSO index at month m+6

X.shape
Out[16]:
(803, 2772)
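
A hedged sketch of the "delayed times" idea mentioned above: append, for each month m, the SST map observed lag months earlier. The 3-month lag is an illustrative choice, not part of the original challenge.

In [ ]:
# Hypothetical variant of X with an extra 3-month-lagged SST map per month
lag = 3
X_lagged = np.hstack([data[:,deb:(fin-6)].T,
                      data[:,(deb-lag):(fin-6-lag)].T])
X_lagged.shape   # same number of months, twice as many columns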

## Empirical Orthogonal Functions (EOF) allow us to visualize the data

Empirical Orthogonal Functions is the usual name for Principal Component Analysis in the environmental sciences.

In [24]:
from sklearn.decomposition import PCA

pca = PCA(n_components=100)
pca.fit(X)

plt.plot(pca.explained_variance_ratio_)
plt.ylabel("Explained variance ratio")
plt.xlabel("Component index")
plt.show()
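
A natural complement to the scree plot: the number of components needed to reach a given fraction of the variance. The 90% threshold is an arbitrary choice.

In [ ]:
# Number of components explaining 90% of the variance
cumvar = np.cumsum(pca.explained_variance_ratio_)
int(np.argmax(cumvar >= 0.9) + 1)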
In [42]:
# First EOF
EOF1 = pca.components_[0,:]  # components_ is 0-indexed: row 0 is the first EOF
EOF1 = EOF1.reshape(lon_mat.shape)

plt.pcolor(EOF1); 
plt.colorbar()
Out[42]:
<matplotlib.colorbar.Colorbar at 0x1448f9ed0>
In [41]:
# Second EOF
EOF2 = pca.components_[1,:]
EOF2 = EOF2.reshape(lon_mat.shape)

plt.pcolor(EOF2); 
plt.colorbar()
Out[41]:
<matplotlib.colorbar.Colorbar at 0x127074c90>

## Example of Ridge regression: selection of the regularization parameter

In [47]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split

n = X.shape[0]   # number of months
B = 10           # number of random train/test splits
alpha = [5e-3,1e-2,5e-2,1e-1,5e-1,1]
RMSE = np.zeros([len(alpha),B])
RMSE_LM = np.zeros([1,B])
for b in range(B): 
    itrain,itest = train_test_split(np.arange(n),test_size=0.3)
    LM = LinearRegression()
    Xtrain = X[itrain,:]
    ytrain = y[itrain+6]  # shift by 6 because the Series y is indexed
                          # from 6 to len(NOI)-1
    ytest = y[itest+6]
    LM.fit(Xtrain,ytrain)
    yhat = LM.predict(X[itest,:])
    RMSE_LM[0,b] = np.sqrt(np.mean((ytest-yhat)**2))
    for i in range(len(alpha)):
        reg = Ridge(alpha=alpha[i])
        reg.fit(Xtrain,ytrain)
        yhat = reg.predict(X[itest,:])
        RMSE[i,b] = np.sqrt(np.mean((ytest-yhat)**2))

RMSE = np.append(RMSE_LM,RMSE,axis=0)
plt.boxplot(RMSE.T,labels=['0','5e-3','1e-2','5e-2','1e-1','5e-1','1'])
plt.ylim(0.3,.7)
plt.ylabel('RMSE')
plt.xlabel('alpha')
Out[47]:
<matplotlib.text.Text at 0x1456f85d0>
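
With a regularization parameter chosen from the boxplot, the model can be scored with the evaluation procedure given in the introduction. A minimal sketch; alpha=1e-1 is read off the boxplot above and is only indicative.

In [ ]:
# Challenge evaluation: RMSE over 10 random train/test splits
from sklearn.model_selection import ShuffleSplit, cross_val_score

skf = ShuffleSplit(n_splits=10, test_size=1/3, random_state=1)
rmse = np.sqrt(-cross_val_score(Ridge(alpha=1e-1), X=X, y=y,
                                scoring='neg_mean_squared_error', cv=skf))
rmse.mean()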
In [51]:
reg = Ridge(alpha=1e-1)
reg.fit(X,y)

coef = np.reshape(reg.coef_,lon_mat.shape)  # one coefficient per grid point

plt.pcolor(lon_mat,lat_mat,coef); plt.colorbar()
Out[51]:
<matplotlib.colorbar.Colorbar at 0x129199750>

The plot is hard to interpret, due to the strong correlation between the inputs. One idea to deal with this problem is to fit the model on PCA components or other latent variables, as in the sketch below. Another idea is to first cluster the months and then fit a different linear model for each class.
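
A minimal sketch of the first idea: ridge regression fitted on the leading PCA scores through a scikit-learn pipeline. The number of components (20) is an illustrative choice, not a tuned value.

In [ ]:
# Ridge regression on the first 20 PCA components (sketch)
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import ShuffleSplit, cross_val_score

pca_reg = make_pipeline(PCA(n_components=20), Ridge(alpha=1e-1))
skf = ShuffleSplit(n_splits=10, test_size=1/3, random_state=1)
np.sqrt(-cross_val_score(pca_reg, X=X, y=y,
                         scoring='neg_mean_squared_error', cv=skf)).mean()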

## Test the statistical predictor

Here, we learn one regression model on the first part of the time series and test the fit on the independent last part (10 years).

In [53]:
from sklearn.neighbors import KNeighborsRegressor

# choose a regression method
reg=KNeighborsRegressor(n_neighbors=10, weights='distance')

# train and test dataset
dt=12*10 # last 10 years
X_train=X[0:len(y)-dt,:]
y_train=y[0:len(y)-dt]
X_test=X[-dt:,:]
y_test=y[-dt:]

# apply regression
reg.fit(X_train,y_train)
y_hat=reg.predict(X_test)
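Before plotting, the challenge metric can be computed on the held-out decade:

In [ ]:
# RMSE of the k-NN predictor on the last 10 years
np.sqrt(np.mean((np.asarray(y_test)-y_hat)**2))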
In [63]:
# plot results
plt.subplot(2,1,1)
line1,= plt.plot(range(0,len(y_test)),y_test,'-k') # use the date as abscissa?
line2,= plt.plot(range(0,len(y_test)),y_hat,'*-r') # use the date as abscissa?
plt.ylim([-2.5,2.5])
plt.legend([line1, line2], ['True ENSO index', 'Estimated ENSO index'])
plt.subplot(2,1,2)
plt.plot(y_test,y_hat,'*k')   # true index on x, estimate on y
plt.plot([-2,2],[-2,2],'-r')  # perfect-prediction line
plt.xlim([-2.5,2.5])
plt.ylim([-2.5,2.5])
plt.xlabel('True ENSO index')
plt.ylabel('Estimated ENSO index')
Out[63]:
<matplotlib.text.Text at 0x12c49f4d0>
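
Regarding the "use the date as abscissa?" comments above: a sketch that rebuilds the monthly dates of the test period, assuming it covers the 120 months ending in May 2017. Dates make the 2009-2010 and 2015-2016 El Nino events easy to locate.

In [ ]:
# Top panel again, with dates on the x axis
dates = pd.date_range(end='2017-05', periods=dt, freq='MS')
plt.plot(dates,np.asarray(y_test),'-k',label='True ENSO index')
plt.plot(dates,y_hat,'*-r',label='Estimated ENSO index')
plt.ylim([-2.5,2.5])
plt.legend()
plt.show()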