# Advanced section 2 - Regularization¶

This notebooks shows, in code, the effects of the regularization methods. We also show the instability that can arise in OLS.

In [1]:
# Imports
import numpy as np

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D as a3d

from plotly.plotly import plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

from copy import deepcopy

%matplotlib inline
%matplotlib notebook


## Create regression problem¶

To exemplify the estimators and their behavior on different types of data, we create a basic 2D regression, as well as a 2d regression with highly colinear predictors. Let's start with a normal regression with random predictor values:

In [2]:
## Global variables for plots

# True betas:
b_true = np.array([3,7]).reshape((2,1))

# Samples and Y noise
N=100
sigma = 5

# linspaces
lsp = np.linspace(-1.5,1.5,20)
lsp_x, lsp_y = np.meshgrid(lsp,lsp)
lsp_mat = np.column_stack((lsp_x.flatten(),lsp_y.flatten()))

In [3]:
# Generate random data from the true betas
X = np.random.rand(N,2)*10-5
X = (X - X.mean(axis=0))/X.std(axis=0)
eps = np.random.normal(scale=sigma, size = (N,1))
y = np.dot(X,b_true) + eps


## Plot regression problem¶

In [4]:
# 3D plot with Axes3D. 3D plotting with this is suboptimal, as it does no actual 3d rendering.

def plot_scatter_axes3d(X,y,b_true):
# y surface from true betas
y_noiseless = np.dot(lsp_mat,b_true).reshape((20,20))
fig = plt.figure(figsize=[9,7])
ax = fig.gca(projection='3d')
ax.scatter(X[:,0],X[:,1], y, color='red', alpha =1, zorder=y)
ax.plot_surface(lsp_x, lsp_y, np.dot(lsp_mat,b_true).reshape((20,20)), color='black', alpha = 0.5, zorder = y_noiseless)
plt.show()

plot_scatter_axes3d(X,y,b_true);

In [5]:
# 3D plot with plotly. Plotly is a much better approach for 3d plots, but it requires the user to have an online account.

def plot_scatter_plotly(X,y,b_true):

# y surface from true betas
y_noiseless = np.dot(lsp_mat,b_true).reshape((20,20))

data = [
go.Scatter3d(
x=X[:,0],
y=X[:,1],
z=y,
mode='markers',
marker=dict(
size=5,
color='green',
line=dict(
color='rgba(217, 217, 217, 0.14)',
width=0.5
),
opacity=1
)
),
go.Surface(
x=lsp_x,
y=lsp_y,
z=y_noiseless,
colorscale='Greens',
opacity=0.8,
showscale=False
),

]

layout = go.Layout(
title='2D Regression',
autosize=False,
width=700,
height=700,
)

fig = go.Figure(data=data, layout=layout)

return fig, data

print(X.shape, y.shape)

fig, data = plot_scatter_plotly(X,y,b_true)
print(len(data))
iplot(fig, filename='2D Regression - normal data')

(100, 2) (100, 1)
2

/anaconda3/lib/python3.6/site-packages/IPython/core/display.py:689: UserWarning:


Out[5]:
In [6]:
# Regression types to use:
regr = ['OLS']
colors = ['Blues', 'Reds']


## Calculate regression coefficients for OLS (normal data)¶

In [7]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV

def fit_regression(X, y, regr=['OLS'], verbose=True):
betas=[]
if regr == 'all':
regr = ['OLS','Ridge','LASSO','EN']

for r in regr:
if r == 'OLS':
# OLS fit
regr_ols = LinearRegression(fit_intercept=False)
regr_ols.fit(X, y)
beta_ols = regr_ols.coef_
if verbose:
print(f'OLS coefficients: {beta_ols}')
betas.append(beta_ols)

elif r == 'Ridge':
# Ridge fit
regr_ridge = RidgeCV(fit_intercept=False)
regr_ridge.fit(X, y)
beta_ridge = regr_ridge.coef_
if verbose:
print(f'Ridge coefficients:{beta_ridge}, regularization coef: {regr_ridge.alpha_}')
betas.append(beta_ridge)

elif r == 'LASSO':
# LASSO fit
regr_lasso = LassoCV(fit_intercept=False)
regr_lasso.fit(X, y)
beta_lasso = regr_lasso.coef_
if verbose:
print(f'LASSO coefficients:{beta_lasso}, regularization coef: {regr_lasso.alpha_}')
betas.append(beta_lasso)

elif r == 'EN':
# Elastic Net fit
regr_EN = ElasticNetCV(fit_intercept=False)
regr_EN.fit(X, y)
beta_EN = regr_EN.coef_
if verbose:
print(f'ElasticNet coefficients:{beta_EN}, regularization coef: {regr_EN.alpha_}')
betas.append(beta_EN)

return betas

print('True coefficents:', b_true.ravel())
betas = fit_regression(X,y.ravel(),regr=regr);

True coefficents: [3 7]
OLS coefficients: [2.7448308  7.30976811]


## Plot fitted planes (normal data)¶

In [8]:
def plot_fitted_planes(betas, colors, names, data=[], return_traces=False):

for i,b in enumerate(betas):
y = np.dot(lsp_mat,b.reshape((2,1))).reshape((20,20))
data.append(go.Surface(
x=lsp_x,
y=lsp_y,
z=y,
colorscale=colors[i],
text = names[i],
showscale=False
))

layout = go.Layout(
title='2D Regression',
autosize=False,
width=700,
height=700,
)

if return_traces:
return data

fig = go.Figure(data=data, layout=layout)
return fig

fig = plot_fitted_planes(betas, colors=colors, names=regr, data=deepcopy(data))
iplot(fig, filename='2D Regression with different estimators - normal data')

/anaconda3/lib/python3.6/site-packages/IPython/core/display.py:689: UserWarning:


Out[8]:

## Create regression problem with colinearity¶

In [9]:
# Generate random data with high colinearity between predictors
x1 = np.random.rand(N,1)*10-5
x2 = x1 + np.random.normal(scale=0.2, size=(N,1))
X_colin = np.column_stack((x1,x2))
X_colin = (X_colin-X_colin.mean(axis=0))/X_colin.std(axis=0)
eps = np.random.normal(scale=sigma, size = (N,1))
y_colin = np.dot(X_colin,b_true)+eps


## Plot regression problem with colinearity¶

In [10]:
fig, data_colin = plot_scatter_plotly(X_colin,y_colin,b_true)
iplot(fig, filename='2D Regression - normal data')

Out[10]:

## Calculate regression coefficients (colinear data)¶

In [11]:
print('True coefficents:', b_true.ravel())
betas_colin = fit_regression(X_colin,y_colin.ravel(),regr=regr);

True coefficents: [3 7]
OLS coefficients: [10.18864184  0.20719093]


## Plot fitting planes (colinear data)¶

In [12]:
fig = plot_fitted_planes(betas_colin, colors=colors, names=regr, data=deepcopy(data_colin))
iplot(fig, filename='2D Regression with different estimators - colinear data')

Out[12]:

## Add a small perturbation to the colinear data and fit again¶

In [13]:
# Perturbation is just a bit of small uniform noise:
perturbation = np.random.rand(X_colin.shape[0],X_colin.shape[1])*0.05
X_colin_pert = X_colin+perturbation
y_colin_pert = np.dot(X_colin_pert,b_true)+eps

print('True coefficents:', b_true.ravel())
betas = fit_regression(X_colin_pert,y_colin_pert.ravel(),regr=regr);
fig, data_colin_pert = plot_scatter_plotly(X_colin_pert,y_colin_pert,b_true)
fig = plot_fitted_planes(betas, colors=colors, names=regr, data=deepcopy(data_colin_pert))

iplot(fig, filename='2D Regression with different estimators - colinear data')

True coefficents: [3 7]
OLS coefficients: [3.59464312 6.79142214]

Out[13]:

This clearly shows how unstable our estimates are in OLS. As expected, in this case, the inverse Gram Matrix $(X^TX)^{-1}$ (proportional to covariance) will present very large diagonal values:

## Condition number and eigenvalues¶

In [14]:
print('Inverse of Gram Matrix for colinear data (propto covariance matrix of betas):')
print(np.linalg.inv(np.dot(X_colin.T,X_colin)))

print('Condition number of Gram Matrix:')
print(np.linalg.cond(np.dot(X_colin.T,X_colin)))

Inverse of Gram Matrix for colinear data (propto covariance matrix of betas):
[[ 2.82282151 -2.81781707]
[-2.81781707  2.82282151]]
Condition number of Gram Matrix:
1127.1277155550463

In [15]:
eigval, eigvec = np.linalg.eig(np.dot(X_colin.T,X_colin))
print("Matrix X[:5] =")
display(X[:5])
print("Eigenvalues =")
display(eigval)
print("Eigenvectors =")
display(eigvec)
print("Max(eigenvalues)/Min(eigenvalues)")
display(max(eigval)/min(eigval))

Matrix X[:5] =

array([[ 1.75683671,  0.65680951],
[-1.60918975,  1.54938769],
[ 0.20357474, -0.48327303],
[ 0.72870872,  1.21985667],
[ 0.15826972, -1.14201238]])
Eigenvalues =

array([1.77284892e-01, 1.99822715e+02])
Eigenvectors =

array([[-0.70710678, -0.70710678],
[ 0.70710678, -0.70710678]])
Max(eigenvalues)/Min(eigenvalues)

1127.127715555074
In [16]:
print('\nCompare to non-colinear data:')
print(np.linalg.inv(np.dot(X.T,X)))

print('Condition number of Gram Matrix with non-colinear data:')
print(np.linalg.cond(np.dot(X.T,X)))

Compare to non-colinear data:
[[ 0.01033188 -0.00185174]
[-0.00185174  0.01033188]]
Condition number of Gram Matrix with non-colinear data:
1.4367237004517126

In [17]:
eigval, eigvec = np.linalg.eig(np.dot(X.T,X))
print("Matrix X[:5] =")
display(X[:5])
print("Eigenvalues =")
display(eigval)
print("Eigenvectors =")
display(eigvec)
print("Max(eigenvalues)/Min(eigenvalues)")
display(max(eigval)/min(eigval))

Matrix X[:5] =

array([[ 1.75683671,  0.65680951],
[-1.60918975,  1.54938769],
[ 0.20357474, -0.48327303],
[ 0.72870872,  1.21985667],
[ 0.15826972, -1.14201238]])
Eigenvalues =

array([117.92257778,  82.07742222])
Eigenvectors =

array([[ 0.70710678, -0.70710678],
[ 0.70710678,  0.70710678]])
Max(eigenvalues)/Min(eigenvalues)

1.4367237004517126

## Analyze loss surfaces¶

In [18]:
def OLS_loss(X, y, beta, lbda=0):
y_hat = np.dot(X,beta)
return np.sum((y_hat-y)**2,axis=0)

def Ridge_loss(X, y, beta, lbda):
y_hat = np.dot(X,beta)
return np.sum((y_hat-y)**2,axis=0) + lbda*np.sum(beta**2, axis=0)

def LASSO_loss(X, y, beta, lbda):
y_hat = np.dot(X,beta)
return (1 / (2 * len(X)))*np.sum((y_hat-y)**2,axis=0) + lbda*np.sum(np.abs(beta), axis=0)

def EN_loss(X, y, beta, lbda):
ratio=0.1
y_hat = np.dot(X,beta)
return (1 / (2 * len(X)))*np.sum((y_hat-y)**2,axis=0) + lbda*(ratio*np.sum(beta**2, axis=0) + (1-ratio)*np.sum(np.abs(beta), axis=0))

In [19]:
# linspace for loss surface
L=40
lsp_b = np.linspace(-20,20,L)
lsp_b_x, lsp_b_y = np.meshgrid(lsp_b,lsp_b)
lsp_b_mat = np.column_stack((lsp_b_x.flatten(),lsp_b_y.flatten()))

def build_surface_fig(loss_values):

data = [
go.Surface(
x=lsp_b_x,
y=lsp_b_y,
z=loss_values,
colorscale='Viridis',
opacity=0.7,
contours=dict(z=dict(show=True,
width=3,
highlight=True,
highlightcolor='orange',
project=dict(z=True),
usecolormap=True))
)
]

layout = go.Layout(
title='Loss surface',
autosize=False,
width=700,
height=700,
scene=dict(
xaxis = dict(
title='Beta 1'),
yaxis = dict(
title='Beta 2'),
zaxis = dict(
title='Loss')
)
)

fig = go.Figure(data=data, layout=layout)
display(iplot(fig, filename='2D Regression with different estimators - colinear data'))

build_surface_fig(OLS_loss(X_colin,y_colin.reshape(-1,1), lsp_b_mat.T, 100).reshape((L,L)));

In [20]:
# # OLS
# loss_values = OLS_loss(X, y, lsp_b_mat.T).reshape((L,L))
# fig, data = build_surface_fig(loss_values)
# iplot(fig, filename='Loss Surface')

# # Ridge
# loss_values = Ridge_loss(X, y, lsp_b_mat.T, 100.0).reshape((L,L))
# fig, data = build_surface_fig(loss_values)
# iplot(fig, filename='Loss Surface')

# # LASSO
# loss_values = LASSO_loss(X, y, lsp_b_mat.T, 100.0).reshape((L,L))
# fig, data = build_surface_fig(loss_values)
# iplot(fig, filename='Loss Surface')

# # Elastic Net
# loss_values = EN_loss(X, y, lsp_b_mat.T, 100.0).reshape((L,L))
# fig, data = build_surface_fig(loss_values)
# fig['layout'].update()
# iplot(fig, filename='Loss Surface')

In [21]:
from ipywidgets import interactive, HBox, VBox

def loss_3d_interactive(X, y, loss='Ridge'):
'''Uses plotly to draw an interactive 3D representation of the loss function,
with a slider to control the regularization factor.

Inputs:
X: predictor matrix for the regression problem. Has to be of dim n x 2
y: response vector

loss: string with the loss to plot. Options are 'Ridge', 'LASSO', 'EN'.
'''

if loss == 'Ridge':
loss_function = Ridge_loss
lbda_slider_min = 0
lbda_slider_max = 1000
lbda_step = 1
clf = Ridge()
elif loss == 'LASSO':
loss_function = LASSO_loss
lbda_slider_min = 1
lbda_slider_max = 150
lbda_step = 1
clf = Lasso()
elif loss == 'EN':
loss_function = EN_loss
lbda_slider_min = 1
lbda_slider_max = 150
lbda_step = 1
clf = ElasticNet()
else:
raise ValueError("Loss string not recognized. Available options are: 'Ridge', 'LASSO', 'EN'.")

# linspace for loss surface
L=20
lsp_b = np.linspace(-10,10,L)
lsp_b_x, lsp_b_y = np.meshgrid(lsp_b,lsp_b)
lsp_b_mat = np.column_stack((lsp_b_x.flatten(),lsp_b_y.flatten()))

# Get all optimal betas for current lambda range
precomp_coefs=[]
for l in range(lbda_slider_min,lbda_slider_max+1,lbda_step):
clf.set_params(alpha=l)
clf.fit(X, y)
precomp_coefs.append(clf.coef_)

f = go.FigureWidget(
data=[
go.Surface(
x=lsp_b_x,
y=lsp_b_y,
z=loss_function(X,y.reshape(-1,1), lsp_b_mat.T, 0).reshape((L,L)),
colorscale='Viridis',
colorbar = dict(len=0.75),
opacity=0.7,
name=r"Loss function",
contours=dict(z=dict(show=True,
width=4,
highlight=True,
highlightcolor='orange',
project=dict(z=True),
usecolormap=True))
),

go.Scatter3d(
x=[p[0] for p in precomp_coefs],
y=[p[1] for p in precomp_coefs],
z=np.zeros(len(precomp_coefs)),
name=r"Trajectory Beta 1 and Beta 2",
marker=dict(
size=1,
color='red',
line=dict(
color='red',
width=0
),
opacity=1
)
),
go.Scatter3d(
x=[0],
y=[0],
z=[0],
name=r"Beta 1 and Beta 2 with constraint",
marker=dict(
size=10,
color='orange',
opacity=1
),
),
go.Scatter3d(
x=[3],
y=[7],
z=[0],
name=r"True Beta 1 and Beta 2 = (3,7)",
marker=dict(
size=10,
color='blue',
opacity=1
),
),
],

layout=go.Layout(scene=go.layout.Scene(
xaxis = dict(
title='Beta 1'),
yaxis = dict(
title='Beta 2'),
zaxis = dict(
title='Loss'),
camera=go.layout.scene.Camera(
up=dict(x=0, y=0, z=1),
center=dict(x=0, y=0, z=0),
eye=dict(x=1.25, y=1.25, z=1.25))
),
width=1000,
height=700,)
)

def update_z(lbda):
f.data[0].z = loss_function(X, y.reshape(-1,1), lsp_b_mat.T, lbda).reshape((L,L))
beta_opt = precomp_coefs[(lbda-lbda_slider_min)//(lbda_step)]
f.data[-2].x = [beta_opt[0]]
f.data[-2].y = [beta_opt[1]]
f.data[-2].z = [0]

lambda_slider = interactive(update_z, lbda=(lbda_slider_min, lbda_slider_max, lbda_step))
vb = VBox((f, lambda_slider))
vb.layout.align_items = 'center'
display(vb)

In [22]:
print(X_colin.shape,y_colin.shape)
loss_3d_interactive(X_colin, y_colin.ravel(), loss='Ridge')

(100, 2) (100, 1)

In [23]:
print(X_colin.shape,y_colin.shape)
loss_3d_interactive(X_colin, y_colin.ravel(), loss='LASSO')

(100, 2) (100, 1)

In [24]:
print(X_colin.shape,y_colin.shape)
loss_3d_interactive(X_colin, y_colin.ravel(), loss='EN')

(100, 2) (100, 1)


## Bayesian Interpretations of Lasso And Ridge¶

In [25]:
from ipywidgets import interactive, HBox, VBox
import scipy.stats

def interactive_dist():
'''Uses plotly to draw an interactive 3D representation of the loss function,
with a slider to control the regularization factor.

Inputs:
X: predictor matrix for the regression problem. Has to be of dim n x 2
y: response vector

loss: string with the loss to plot. Options are 'Ridge', 'LASSO', 'EN'.
'''

# linspace for loss surface
L=20
x = np.linspace(-0.6,0.6,1000)

y =  scipy.stats.norm.pdf(x, 0)
y_2 =  scipy.stats.laplace.pdf(x, 0, 2)

f = go.FigureWidget(
data=[
go.Scatter(x=x,
y=y_2,
name=r"Normal(mean = 0, scale = σ²/λ) = RIDGE"),
go.Scatter(x=x,
y=y_2,
name=r"Laplacienne(mean = 0, scale = σ²/λ) = LASSO")

],

layout=go.Layout(
title='Normal and Laplacienne Distribution with interactive λ (assume σ²=1)',
xaxis=dict(title='Beta',),
yaxis=dict(title='pdf',range=[0, 15]),
width=1000,
height=500,)

)

def update_z(lbda):
f.data[0].y = scipy.stats.norm.pdf(x, 0, (1/lbda))
f.data[1].y = scipy.stats.laplace.pdf(x, 0, (2/lbda))
lambda_slider = interactive(update_z, lbda=(0.5, 30, 1))
vb = VBox((f, lambda_slider))
vb.layout.align_items = 'center'
display(vb)

interactive_dist()

In [ ]: