%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.image as mpimg
plt.rcParams['figure.figsize'] = (16, 9)
Here, we import the full netcdf file of the 163 years of SST anomalies in the Equatorial Pacific. We will use "lon", "lat", "time" and "sst_ano" for the different variables. Note that you can select a subsample of the dataset, selecting the time period (i_good_time).
import urllib
# Computation of lines below may take some time
target_url = 'https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/SSTanom_011901_052017_ERSSTv4.txt'
txt = urllib.urlopen(target_url).read()
txt = txt.split("\n")
data = np.genfromtxt(txt)
#f = open('SSTanom_011901_052017_ERSSTv4.txt', 'r') #Use these command lines
# #if you download the file in your home directory
#data = np.genfromtxt(f)
#f.close()
target_url = 'https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/lonlat_Jul2017.txt'
txt = urllib.urlopen(target_url).read()
txt = txt.split("\n")
lonlat = np.genfromtxt(txt)
NOI = pd.read_csv('https://perso.univ-rennes1.fr/valerie.monbet/MachineLearning/ElNino_011950_052017.tex',delim_whitespace=True,header=0 ) # ENSO index
lon = np.unique(lonlat[:,0])
lat = np.unique(lonlat[:,1])
lon_mat = np.reshape(lonlat[:,0],[len(lat),len(lon)])
lat_mat = np.reshape(lonlat[:,1],[len(lat),len(lon)])
lon_mat.shape
lat_mat.shape
# Plot an example of SST field
data_mat = np.reshape(data[:,1000],[len(lat),len(lon)])
(i,j) = np.where(data_mat==-99)
data_mat[i,j] = -10
plt.pcolor(lon_mat,lat_mat,data_mat)
plt.clim([-2,3])
plt.colorbar()
plt.title('Example of SST')
plt.show()
# Plot the SST mean field
data_mat = np.reshape(np.mean(data,1),[len(lat),len(lon)])
(i,j) = np.where(data_mat<-19)
data_mat[i,j] = -19
plt.pcolor(lon_mat,lat_mat,data_mat)
plt.clim([-1,1])
plt.colorbar()
plt.title('SST mean field')
plt.show()
Here, we declare the input variable $X$ (map of SST anomalies in the El Nino 3.4 region at month $m$) and the response variable $y$ (ENSO index at month $m+6$). Note that other SST anomalies can be used (larger region, delayed times, etc...).
# Extract data from 01/1950 to 05/2017
deb = (1950-1901)*12
fin = data.shape[1]
X = data[:,deb:(fin-6)].T
y = NOI['ANOM'][6:len(NOI)]
X.shape
Empirical Orthogonal Functions is the usual name of Principal Component Analysis in environmental field.
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
pca.fit(X)
plt.plot(pca.explained_variance_ratio_)
plt.ylabel("Explained variance")
plt.xlabel("Number of components")
plt.show()
# First EOF
EOF1 = pca.components_[1,:]
EOF1 = EOF1.reshape(lon_mat.shape)
#EOF1 = np.reshape(EOF1,(nb_lat,nb_lon))
plt.pcolor(EOF1);
plt.colorbar()
# Second EOF
EOF2 = pca.components_[2,:]
EOF2 = EOF2.reshape(lon_mat.shape)
plt.pcolor(EOF2);
plt.colorbar()
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
import pandas as pd
n = len(X[:,0])
B = 10
alpha = [5e-3,1e-2,5e-2,1e-1,5e-1,1]
RMSE = np.zeros([len(alpha),B])
RMSE_LM = np.zeros([1,B])
for b in range(B):
itrain,itest=train_test_split(range(0,n-1),test_size=0.3)
LM = LinearRegression()
Xtrain = X[itrain,:]
ytrain = y[np.asarray(itrain)+6] # because itrain is a list
# and y is indexed from 6 to ...
ytest = y[np.asarray(itest)+6] # because itest is a list
LM.fit(Xtrain,ytrain)
yhat = LM.predict(X[itest,:])
RMSE_LM[0,b] = np.sqrt(np.mean((ytest-yhat)**2))
for i in range(len(alpha)):
reg = Ridge(alpha=alpha[i])
reg.fit(Xtrain,ytrain)
yhat = reg.predict(X[itest,:])
RMSE[i,b] = np.sqrt(np.mean((ytest-yhat)**2))
RMSE = np.append(RMSE_LM,RMSE,axis=0)
plt.boxplot(RMSE.T,labels=['0','5e-3','1e-2','5e-2','1e-1','5e-1','1'])
plt.ylim(0.3,.7)
plt.ylabel('RMSE')
plt.xlabel('alpha')
reg = Ridge(alpha=1e-1)
reg.fit(X,y)
reg.coef_
coef = np.reshape(reg.coef_,lon_mat.shape)
plt.pcolor(coef.T); plt.colorbar()
The plot is hard to interpret. It is due to the strong correlation between the inputs. An idea to deal with this problem is to fit a model on PCA components or other latent variables. Another idea is to first compute a classification and then fit different linear models for different classes.
Here, we learn one regression model on the first part of the time series and test the fit on the independant last part (10 years).
from sklearn.neighbors import KNeighborsRegressor
# choose a regression method
reg=KNeighborsRegressor(n_neighbors=10, weights='distance')
# train and test dataset
dt=12*10 # last 10 years
X_train=X[0:len(y)-dt,:]
y_train=y[0:len(y)-dt]
X_test=X[-dt:,:]
y_test=y[-dt:]
# apply regression
reg.fit(X_train,y_train)
y_hat=reg.predict(X_test)
# plot results
plt.subplot(2,1,1)
line1,= plt.plot(range(0,len(y_test)),y_test,'-k') # use the date as abscissa?
line2,= plt.plot(range(0,len(y_test)),y_hat,'*-r') # use the date as abscissa?
plt.ylim([-2.5,2.5])
plt.legend([line1, line2], ['True ENSO index', 'Estimated ENSO index'])
plt.subplot(2,1,2)
plt.plot(y_hat,y_test,'*k')
plt.plot([-2,2],[-2,2],'-r')
plt.xlim([-2.5,2.5])
plt.ylim([-2.5,2.5])
plt.xlabel('True ENSO index')
plt.ylabel('Estimated ENSO index')