Advanced section 2 - Regularization

This notebook shows, in code, the effects of the main regularization methods (Ridge, LASSO, Elastic Net). We also show the instability that can arise in OLS when predictors are highly collinear.

In [1]:
# Imports
import numpy as np

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)  # offline mode; no online account needed

from copy import deepcopy

%matplotlib notebook

Create regression problem

To exemplify the estimators and their behavior on different types of data, we create a basic 2D regression problem, as well as one with highly collinear predictors. Let's start with a standard regression on random predictor values:

In [2]:
## Global variables for plots

# True betas:
b_true = np.array([3,7]).reshape((2,1))

# Samples and Y noise
N=100
sigma = 5

# linspaces
lsp = np.linspace(-1.5,1.5,20)
lsp_x, lsp_y = np.meshgrid(lsp,lsp)
lsp_mat = np.column_stack((lsp_x.flatten(),lsp_y.flatten()))
In [3]:
# Generate random data from the true betas
X = np.random.rand(N,2)*10-5
X = (X - X.mean(axis=0))/X.std(axis=0)
eps = np.random.normal(scale=sigma, size = (N,1))
y = np.dot(X,b_true) + eps

Plot regression problem

In [4]:
# 3D plot with Axes3D. 3D plotting with matplotlib is suboptimal, as it does no true
# 3D rendering (points and surfaces are drawn in a fixed order, not depth-sorted).

def plot_scatter_axes3d(X,y,b_true):
    # y surface from true betas
    y_noiseless = np.dot(lsp_mat,b_true).reshape((20,20))
    fig = plt.figure(figsize=[9,7])
    ax = fig.add_subplot(projection='3d')
    ax.scatter(X[:,0], X[:,1], y, color='red', alpha=1)
    ax.plot_surface(lsp_x, lsp_y, y_noiseless, color='black', alpha=0.5)
    plt.show()
    
plot_scatter_axes3d(X,y,b_true);
In [5]:
# 3D plot with plotly. Plotly is a much better approach for 3D plots; in offline mode
# (init_notebook_mode above) it does not require an online account.

# plotly.tools.set_credentials_file(username='camilofosco', api_key='AY1QtDdBCza2qZePnygz')

def plot_scatter_plotly(X,y,b_true):

    # y surface from true betas
    y_noiseless = np.dot(lsp_mat,b_true).reshape((20,20))
    
    data = [
        go.Scatter3d(
        x=X[:,0],
        y=X[:,1],
        z=y.ravel(),
        mode='markers',
        marker=dict(
            size=5,
            color='green',
            line=dict(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5
                ),
            opacity=1
            )
        ),
        go.Surface(
                x=lsp_x,
                y=lsp_y,
                z=y_noiseless,
                colorscale='Greens',
                opacity=0.8,
                showscale=False
        ),
    ]

    layout = go.Layout(
        title='2D Regression',
        autosize=False,
        width=700,
        height=700,
    )

    fig = go.Figure(data=data, layout=layout)
        
    return fig, data

print(X.shape, y.shape)

fig, data = plot_scatter_plotly(X,y,b_true)
print(len(data))
iplot(fig, filename='2D Regression - normal data')
(100, 2) (100, 1)
2
Out[5]:
In [6]:
# Regression types to use:
regr = ['OLS']
colors = ['Blues', 'Reds']

Calculate regression coefficients for OLS (normal data)

In [7]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV


def fit_regression(X, y, regr=['OLS'], verbose=True):
    betas=[]
    if regr == 'all':
        regr = ['OLS','Ridge','LASSO','EN']
        
    for r in regr:
        if r == 'OLS':
            # OLS fit
            regr_ols = LinearRegression(fit_intercept=False)
            regr_ols.fit(X, y)
            beta_ols = regr_ols.coef_ 
            if verbose:
                print(f'OLS coefficients: {beta_ols}')
            betas.append(beta_ols)
        
        elif r == 'Ridge':
            # Ridge fit
            regr_ridge = RidgeCV(fit_intercept=False)
            regr_ridge.fit(X, y)
            beta_ridge = regr_ridge.coef_
            if verbose:
                print(f'Ridge coefficients:{beta_ridge}, regularization coef: {regr_ridge.alpha_}')
            betas.append(beta_ridge)
        
        elif r == 'LASSO':
            # LASSO fit
            regr_lasso = LassoCV(fit_intercept=False)
            regr_lasso.fit(X, y)
            beta_lasso = regr_lasso.coef_ 
            if verbose:
                print(f'LASSO coefficients:{beta_lasso}, regularization coef: {regr_lasso.alpha_}')
            betas.append(beta_lasso)

        elif r == 'EN':
            # Elastic Net fit
            regr_EN = ElasticNetCV(fit_intercept=False)
            regr_EN.fit(X, y)
            beta_EN = regr_EN.coef_ 
            if verbose:
                print(f'ElasticNet coefficients:{beta_EN}, regularization coef: {regr_EN.alpha_}')
            betas.append(beta_EN)
    
    return betas

print('True coefficients:', b_true.ravel())
betas = fit_regression(X,y.ravel(),regr=regr);
True coefficients: [3 7]
OLS coefficients: [2.7448308  7.30976811]
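
The helper also accepts regr='all' to fit OLS alongside the CV-tuned regularized estimators. A minimal usage sketch (the resulting coefficients depend on the random data above, so no output is shown):

# Fit all four estimators at once; each CV variant picks its own regularization strength
betas_all = fit_regression(X, y.ravel(), regr='all')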

Plot fitted planes (normal data)

In [8]:
def plot_fitted_planes(betas, colors, names, data=None, return_traces=False):
    # Avoid the mutable default argument pitfall: start a fresh list when none is passed
    if data is None:
        data = []
    
    for i,b in enumerate(betas):
        y = np.dot(lsp_mat,b.reshape((2,1))).reshape((20,20))
        data.append(go.Surface(
                    x=lsp_x,
                    y=lsp_y,
                    z=y,
                    colorscale=colors[i],
                    text = names[i],
                    showscale=False
            ))

    layout = go.Layout(
        title='2D Regression',
        autosize=False,
        width=700,
        height=700,
    )
    
    if return_traces:
        return data
    
    fig = go.Figure(data=data, layout=layout)
    return fig
    
fig = plot_fitted_planes(betas, colors=colors, names=regr, data=deepcopy(data))
iplot(fig, filename='2D Regression with different estimators - normal data')
Out[8]:

Create regression problem with collinearity

In [9]:
# Generate random data with high collinearity between the predictors
x1 = np.random.rand(N,1)*10-5
x2 = x1 + np.random.normal(scale=0.2, size=(N,1))
X_colin = np.column_stack((x1,x2))
X_colin = (X_colin-X_colin.mean(axis=0))/X_colin.std(axis=0)
eps = np.random.normal(scale=sigma, size = (N,1))
y_colin = np.dot(X_colin,b_true)+eps
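
To quantify the collinearity, we can check the sample correlation between the two predictors. A quick sketch (with noise of scale 0.2 added on top of x1, the correlation is typically above 0.99):

# Correlation between the two predictors; values near 1 indicate strong collinearity
print(np.corrcoef(X_colin[:,0], X_colin[:,1])[0,1])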

Plot regression problem with collinearity

In [10]:
fig, data_colin = plot_scatter_plotly(X_colin,y_colin,b_true)
iplot(fig, filename='2D Regression - collinear data')
Out[10]:

Calculate regression coefficients (collinear data)

In [11]:
print('True coefficients:', b_true.ravel())
betas_colin = fit_regression(X_colin,y_colin.ravel(),regr=regr);
True coefficients: [3 7]
OLS coefficients: [10.18864184  0.20719093]

Plot fitted planes (collinear data)

In [12]:
fig = plot_fitted_planes(betas_colin, colors=colors, names=regr, data=deepcopy(data_colin))
iplot(fig, filename='2D Regression with different estimators - colinear data')
Out[12]:

Add a small perturbation to the collinear data and fit again

In [13]:
# Perturbation is just a bit of small uniform noise:
perturbation = np.random.rand(X_colin.shape[0],X_colin.shape[1])*0.05
X_colin_pert = X_colin+perturbation
y_colin_pert = np.dot(X_colin_pert,b_true)+eps

print('True coefficients:', b_true.ravel())
betas = fit_regression(X_colin_pert,y_colin_pert.ravel(),regr=regr);
fig, data_colin_pert = plot_scatter_plotly(X_colin_pert,y_colin_pert,b_true)
fig = plot_fitted_planes(betas, colors=colors, names=regr, data=deepcopy(data_colin_pert))

iplot(fig, filename='2D Regression with different estimators - perturbed collinear data')
True coefficients: [3 7]
OLS coefficients: [3.59464312 6.79142214]
Out[13]:

This clearly shows how unstable the OLS estimates are under collinearity: a tiny perturbation of the predictors moves the coefficients from roughly $(10.2,\ 0.2)$ back to near the true $(3,\ 7)$. This is expected, because $\operatorname{Var}(\hat{\beta}_{OLS}) = \sigma^2 (X^TX)^{-1}$, and with collinear predictors the inverse Gram matrix $(X^TX)^{-1}$ presents very large diagonal values:

Condition number and eigenvalues

In [14]:
print('Inverse of Gram matrix for collinear data (proportional to the covariance of the betas):')
print(np.linalg.inv(np.dot(X_colin.T,X_colin)))

print('Condition number of Gram matrix:')
print(np.linalg.cond(np.dot(X_colin.T,X_colin)))
Inverse of Gram matrix for collinear data (proportional to the covariance of the betas):
[[ 2.82282151 -2.81781707]
 [-2.81781707  2.82282151]]
Condition number of Gram matrix:
1127.1277155550463
In [15]:
# Eigen-decomposition of the Gram matrix for the collinear data.
# The eigenvalue ratio below equals the condition number computed above.
eigval, eigvec = np.linalg.eig(np.dot(X_colin.T,X_colin))
print("Eigenvalues =")
display(eigval)
print("Eigenvectors =")
display(eigvec)
print("Max(eigenvalues)/Min(eigenvalues)")
display(max(eigval)/min(eigval))
Eigenvalues =
array([1.77284892e-01, 1.99822715e+02])
Eigenvectors =
array([[-0.70710678, -0.70710678],
       [ 0.70710678, -0.70710678]])
Max(eigenvalues)/Min(eigenvalues)
1127.127715555074
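
The diagonal of $\sigma^2 (X^TX)^{-1}$ is exactly the variance of the OLS coefficient estimates, which we can verify empirically by redrawing the noise and refitting many times. A sketch under the same data-generating assumptions as above (the repetition count of 2000 is an arbitrary choice):

# Empirical check: refit OLS under fresh noise draws and compare the coefficient
# variance to the theoretical diagonal of sigma^2 * (X^T X)^-1
b_samples = np.array([
    np.linalg.lstsq(X_colin,
                    np.dot(X_colin, b_true) + np.random.normal(scale=sigma, size=(N,1)),
                    rcond=None)[0].ravel()
    for _ in range(2000)
])
print('Empirical variance of the OLS betas:', b_samples.var(axis=0))
print('Theoretical variance:', sigma**2 * np.diag(np.linalg.inv(np.dot(X_colin.T, X_colin))))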
In [16]:
print('\nCompare to the non-collinear data:')
print(np.linalg.inv(np.dot(X.T,X)))

print('Condition number of Gram matrix with non-collinear data:')
print(np.linalg.cond(np.dot(X.T,X)))
Compare to the non-collinear data:
[[ 0.01033188 -0.00185174]
 [-0.00185174  0.01033188]]
Condition number of Gram matrix with non-collinear data:
1.4367237004517126
In [17]:
eigval, eigvec = np.linalg.eig(np.dot(X.T,X))
print("Matrix X[:5] =")
display(X[:5])
print("Eigenvalues =")
display(eigval)
print("Eigenvectors =")
display(eigvec)
print("Max(eigenvalues)/Min(eigenvalues)")
display(max(eigval)/min(eigval))
Matrix X[:5] =
array([[ 1.75683671,  0.65680951],
       [-1.60918975,  1.54938769],
       [ 0.20357474, -0.48327303],
       [ 0.72870872,  1.21985667],
       [ 0.15826972, -1.14201238]])
Eigenvalues =
array([117.92257778,  82.07742222])
Eigenvectors =
array([[ 0.70710678, -0.70710678],
       [ 0.70710678,  0.70710678]])
Max(eigenvalues)/Min(eigenvalues)
1.4367237004517126
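
This is where the ridge penalty helps: it replaces $(X^TX)^{-1}$ with $(X^TX + \lambda I)^{-1}$, lifting every eigenvalue of the Gram matrix by $\lambda$ and shrinking its condition number. A quick sketch on the collinear data ($\lambda = 10$ is an arbitrary choice):

# Closed-form ridge estimate: beta = (X^T X + lambda*I)^-1 X^T y
lbda = 10
G = np.dot(X_colin.T, X_colin)
beta_ridge_cf = np.linalg.solve(G + lbda*np.eye(2), np.dot(X_colin.T, y_colin))
print('Closed-form ridge coefficients:', beta_ridge_cf.ravel())
print('Condition number of X^T X:', np.linalg.cond(G))
print('Condition number of X^T X + lambda*I:', np.linalg.cond(G + lbda*np.eye(2)))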

Analyze loss surfaces

In [18]:
# Loss functions evaluated on a grid of betas. The lbda argument of OLS_loss is
# unused; it is kept so all four losses share the same signature.
# Note: LASSO_loss and EN_loss use sklearn's 1/(2n) scaling of the squared error,
# while OLS_loss and Ridge_loss use the raw sum of squares.

def OLS_loss(X, y, beta, lbda=0):
    y_hat = np.dot(X,beta)
    return np.sum((y_hat-y)**2, axis=0)

def Ridge_loss(X, y, beta, lbda):
    y_hat = np.dot(X,beta)
    return np.sum((y_hat-y)**2, axis=0) + lbda*np.sum(beta**2, axis=0)

def LASSO_loss(X, y, beta, lbda):
    y_hat = np.dot(X,beta)
    return (1/(2*len(X)))*np.sum((y_hat-y)**2, axis=0) + lbda*np.sum(np.abs(beta), axis=0)

def EN_loss(X, y, beta, lbda):
    ratio = 0.1  # weight of the L2 term relative to the L1 term
    y_hat = np.dot(X,beta)
    return (1/(2*len(X)))*np.sum((y_hat-y)**2, axis=0) + lbda*(
        ratio*np.sum(beta**2, axis=0) + (1-ratio)*np.sum(np.abs(beta), axis=0))
In [19]:
# linspace for loss surface
L=40
lsp_b = np.linspace(-20,20,L)
lsp_b_x, lsp_b_y = np.meshgrid(lsp_b,lsp_b)
lsp_b_mat = np.column_stack((lsp_b_x.flatten(),lsp_b_y.flatten()))

def build_surface_fig(loss_values):
    
    data = [
        go.Surface(
                x=lsp_b_x,
                y=lsp_b_y,
                z=loss_values,
                colorscale='Viridis',
                opacity=0.7,
                contours=dict(z=dict(show=True,
                                     width=3,
                                     highlight=True,
                                     highlightcolor='orange',
                                     project=dict(z=True),
                                     usecolormap=True))
        )
    ]

    layout = go.Layout(
        title='Loss surface',
        autosize=False,
        width=700,
        height=700,
        scene=dict(
        xaxis = dict(
            title='Beta 1'),
        yaxis = dict(
            title='Beta 2'),
        zaxis = dict(
            title='Loss')
        )
    )

    fig = go.Figure(data=data, layout=layout)
    iplot(fig, filename='Loss surface')

build_surface_fig(OLS_loss(X_colin, y_colin.reshape(-1,1), lsp_b_mat.T).reshape((L,L)))
In [20]:
# The same helper can draw the other loss surfaces (left commented out here):

# # OLS
# build_surface_fig(OLS_loss(X, y, lsp_b_mat.T).reshape((L,L)))

# # Ridge
# build_surface_fig(Ridge_loss(X, y, lsp_b_mat.T, 100.0).reshape((L,L)))

# # LASSO
# build_surface_fig(LASSO_loss(X, y, lsp_b_mat.T, 100.0).reshape((L,L)))

# # Elastic Net
# build_surface_fig(EN_loss(X, y, lsp_b_mat.T, 100.0).reshape((L,L)))
In [21]:
from ipywidgets import interactive, HBox, VBox

def loss_3d_interactive(X, y, loss='Ridge'):
    '''Uses plotly to draw an interactive 3D representation of the loss function, 
    with a slider to control the regularization factor.
    
    Inputs:
    X: predictor matrix for the regression problem. Has to be of dim n x 2
    y: response vector 
    
    loss: string with the loss to plot. Options are 'Ridge', 'LASSO', 'EN'.
    '''

    if loss == 'Ridge':
        loss_function = Ridge_loss
        lbda_slider_min = 0
        lbda_slider_max = 1000
        lbda_step = 1
        clf = Ridge()
    elif loss == 'LASSO':
        loss_function = LASSO_loss
        lbda_slider_min = 1
        lbda_slider_max = 150
        lbda_step = 1
        clf = Lasso()
    elif loss == 'EN':
        loss_function = EN_loss
        lbda_slider_min = 1
        lbda_slider_max = 150
        lbda_step = 1
        clf = ElasticNet()
    else:
        raise ValueError("Loss string not recognized. Available options are: 'Ridge', 'LASSO', 'EN'.")
            
    # linspace for loss surface
    L=20
    lsp_b = np.linspace(-10,10,L)
    lsp_b_x, lsp_b_y = np.meshgrid(lsp_b,lsp_b)
    lsp_b_mat = np.column_stack((lsp_b_x.flatten(),lsp_b_y.flatten()))
    
    # Get all optimal betas for current lambda range
    precomp_coefs=[]
    for l in range(lbda_slider_min,lbda_slider_max+1,lbda_step):
        clf.set_params(alpha=l)
        clf.fit(X, y)
        precomp_coefs.append(clf.coef_)
                
    f = go.FigureWidget(
        data=[
            go.Surface(
                    x=lsp_b_x,
                    y=lsp_b_y,
                    z=loss_function(X,y.reshape(-1,1), lsp_b_mat.T, 0).reshape((L,L)),
                    colorscale='Viridis',
                    colorbar = dict(len=0.75),
                    opacity=0.7,
                    name=r"Loss function",
                    contours=dict(z=dict(show=True,
                                         width=4,
                                         highlight=True,
                                         highlightcolor='orange',
                                         project=dict(z=True),
                                         usecolormap=True))
            ),
            
            go.Scatter3d(
                x=[p[0] for p in precomp_coefs],
                y=[p[1] for p in precomp_coefs],
                z=np.zeros(len(precomp_coefs)),
                name=r"Trajectory Beta 1 and Beta 2",
                marker=dict(
                    size=1,
                    color='red',
                    line=dict(
                        color='red',
                        width=0
                        ),
                    opacity=1
                    )
                ),
            go.Scatter3d(
                x=[0],
                y=[0],
                z=[0],
                name=r"Beta 1 and Beta 2 with constraint",
                marker=dict(
                    size=10,
                    color='orange',
                    opacity=1
                    ),
                ),
            go.Scatter3d(
                x=[3],
                y=[7],
                z=[0],
                name=r"True Beta 1 and Beta 2 = (3,7)",
                marker=dict(
                    size=10,
                    color='blue',
                    opacity=1
                    ),
            ),
        ],

        layout=go.Layout(scene=go.layout.Scene(
                    xaxis = dict(
                        title='Beta 1'),
                    yaxis = dict(
                        title='Beta 2'),
                    zaxis = dict(
                        title='Loss'),
            camera=go.layout.scene.Camera(
                up=dict(x=0, y=0, z=1),
                center=dict(x=0, y=0, z=0),
                eye=dict(x=1.25, y=1.25, z=1.25))
        ),
            width=1000,
            height=700,)
    )

    def update_z(lbda):
        f.data[0].z = loss_function(X, y.reshape(-1,1), lsp_b_mat.T, lbda).reshape((L,L))
        beta_opt = precomp_coefs[(lbda-lbda_slider_min)//(lbda_step)]
        f.data[-2].x = [beta_opt[0]]
        f.data[-2].y = [beta_opt[1]]
        f.data[-2].z = [0]

    lambda_slider = interactive(update_z, lbda=(lbda_slider_min, lbda_slider_max, lbda_step))
    vb = VBox((f, lambda_slider))
    vb.layout.align_items = 'center'
    display(vb)
In [22]:
print(X_colin.shape,y_colin.shape)
loss_3d_interactive(X_colin, y_colin.ravel(), loss='Ridge')
(100, 2) (100, 1)
In [23]:
print(X_colin.shape,y_colin.shape)
loss_3d_interactive(X_colin, y_colin.ravel(), loss='LASSO')
(100, 2) (100, 1)
In [24]:
print(X_colin.shape,y_colin.shape)
loss_3d_interactive(X_colin, y_colin.ravel(), loss='EN')
(100, 2) (100, 1)

Bayesian Interpretations of LASSO and Ridge

In [25]:
from ipywidgets import interactive, HBox, VBox
import scipy.stats

def interactive_dist():
    '''Uses plotly to draw the Normal and Laplace prior densities corresponding to
    Ridge and LASSO regularization, with a slider to control the regularization
    factor λ.
    '''

    # linspace for the pdf x-axis
    x = np.linspace(-0.6,0.6,1000)

    y = scipy.stats.norm.pdf(x, 0)
    y_2 = scipy.stats.laplace.pdf(x, 0, 2)
                
    f = go.FigureWidget(
        data=[
            go.Scatter(x=x,
                       y=y,
                       name=r"Normal(mean = 0, scale = σ²/λ) = RIDGE"),
            go.Scatter(x=x,
                       y=y_2,
                       name=r"Laplace(mean = 0, scale = σ²/λ) = LASSO")

        ],
        
        layout=go.Layout(
            title='Normal and Laplace distributions with interactive λ (assume σ²=1)',
            xaxis=dict(title='Beta',),
            yaxis=dict(title='pdf',range=[0, 15]),
            width=1000,
            height=500,)
         
    )

    def update_z(lbda):
        f.data[0].y = scipy.stats.norm.pdf(x, 0, (1/lbda))
        f.data[1].y = scipy.stats.laplace.pdf(x, 0, (2/lbda))
    lambda_slider = interactive(update_z, lbda=(0.5, 30, 1))
    vb = VBox((f, lambda_slider))
    vb.layout.align_items = 'center'
    display(vb)
    
    
interactive_dist()
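
The connection behind these plots: the ridge estimate is the MAP estimate under a Gaussian prior on the coefficients, since minimizing $\|y - X\beta\|^2 + \lambda\|\beta\|^2$ is equivalent to maximizing the log-posterior with $\beta \sim N(0, (\sigma^2/\lambda) I)$ (and, similarly, a Laplace prior gives LASSO). A hedged numerical check of the ridge case ($\lambda = 50$ is an arbitrary choice):

from scipy.optimize import minimize

lbda = 50

def neg_log_posterior(b):
    # Gaussian likelihood plus Gaussian prior with variance sigma^2/lbda (up to constants)
    resid = y_colin.ravel() - np.dot(X_colin, b)
    return np.sum(resid**2)/(2*sigma**2) + lbda*np.sum(b**2)/(2*sigma**2)

beta_map = minimize(neg_log_posterior, x0=np.zeros(2)).x
beta_ridge_cf = np.linalg.solve(np.dot(X_colin.T, X_colin) + lbda*np.eye(2),
                                np.dot(X_colin.T, y_colin)).ravel()
print('MAP estimate (Gaussian prior):', beta_map)
print('Closed-form ridge estimate:  ', beta_ridge_cf)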
In [ ]: