Key Word(s): Regularization, Lasso, Ridge
CS-109A Introduction to Data Science
Advanced Section 3: Regularization¶
Harvard University
Fall 2018
Instructors: Pavlos Protopapas and Kevin Rader
Section Leader: Camilo Fosco
Contributors: Will Claybaugh, Pavlos Protopapas
This notebook shows, in code, the effects of the regularization methods. We also show the instability that can arise in OLS.
## RUN THIS CELL TO PROPERLY HIGHLIGHT THE EXERCISES
import requests
from IPython.core.display import HTML
styles = requests.get("https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css").text
HTML(styles)
# Imports
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D as a3d
# For account-free offline plotting, use: from plotly.offline import download_plotlyjs, plot, iplot
from plotly.plotly import plot, iplot
from plotly.offline import init_notebook_mode  # init_notebook_mode lives in plotly.offline
import plotly.graph_objs as go
init_notebook_mode(connected=True)
from copy import deepcopy
%matplotlib notebook
Create regression problem¶
To illustrate the estimators and their behavior on different types of data, we create a basic 2D regression problem, as well as a 2D regression with highly collinear predictors. Let's start with a standard regression with random predictor values:
## Global variables for plots
# True betas:
b_true = np.array([3,7]).reshape((2,1))
# Samples and Y noise
N=300
sigma = 5
# linspaces
lsp = np.linspace(-1.5,1.5,20)
lsp_x, lsp_y = np.meshgrid(lsp,lsp)
lsp_mat = np.column_stack((lsp_x.flatten(),lsp_y.flatten()))
# Generate random data from the true betas
X = np.random.rand(N,2)*10-5
X = (X - X.mean(axis=0))/X.std(axis=0)
eps = np.random.normal(scale=sigma, size = (N,1))
y = np.dot(X,b_true) + eps
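Before plotting, it can help to sanity-check the generated data against the closed-form OLS solution $\hat{\beta} = (X^TX)^{-1}X^Ty$; a minimal sketch, reusing `X` and `y` from above (the variable name `beta_hat` is ours):
# Closed-form OLS via the normal equations; should land near b_true = [3, 7]
beta_hat = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y))
print('Closed-form OLS estimate:', beta_hat.ravel())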
Plot regression problem¶
# 3D plot with Axes3D. 3D plotting with this is suboptimal, as it does no true 3D rendering.
def plot_scatter_axes3d(X, y, b_true):
    # Noiseless response surface from the true betas
    y_noiseless = np.dot(lsp_mat, b_true).reshape((20,20))
    fig = plt.figure(figsize=[9,7])
    ax = fig.gca(projection='3d')
    ax.scatter(X[:,0], X[:,1], y, color='red', alpha=1)
    ax.plot_surface(lsp_x, lsp_y, y_noiseless, color='black', alpha=0.5)
plt.show()
plot_scatter_axes3d(X,y,b_true);
# 3D plot with plotly. Plotly is a much better approach for 3D plots, but plotly.plotly requires the user to have an online account.
# plotly.tools.set_credentials_file(username='your_username', api_key='your_api_key')
def plot_scatter_plotly(X,y,b_true):
# y surface from true betas
y_noiseless = np.dot(lsp_mat,b_true).reshape((20,20))
data = [
go.Scatter3d(
x=X[:,0],
y=X[:,1],
        z=y.ravel(),
mode='markers',
marker=dict(
size=5,
color='green',
line=dict(
color='rgba(217, 217, 217, 0.14)',
width=0.5
),
opacity=1
)
),
go.Surface(
x=lsp_x,
y=lsp_y,
z=y_noiseless,
colorscale='Greens',
opacity=0.8,
showscale=False
),
]
layout = go.Layout(
title='2D Regression',
autosize=False,
width=700,
height=700,
)
fig = go.Figure(data=data, layout=layout)
return fig, data
print(X.shape, y.shape)
fig, data = plot_scatter_plotly(X,y,b_true)
print(len(data))
iplot(fig, filename='2D Regression - normal data')
# Regression types to use:
regr = ['OLS']
colors = ['Blues', 'Reds']
Calculate regression coefficients for OLS (normal data)¶
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
def fit_regression(X, y, regr=['OLS'], verbose=True):
betas=[]
if regr == 'all':
regr = ['OLS','Ridge','LASSO','EN']
for r in regr:
if r == 'OLS':
# OLS fit
regr_ols = LinearRegression(fit_intercept=False)
regr_ols.fit(X, y)
beta_ols = regr_ols.coef_
if verbose:
print(f'OLS coefficients: {beta_ols}')
betas.append(beta_ols)
elif r == 'Ridge':
# Ridge fit
regr_ridge = RidgeCV(fit_intercept=False)
regr_ridge.fit(X, y)
beta_ridge = regr_ridge.coef_
if verbose:
print(f'Ridge coefficients:{beta_ridge}, regularization coef: {regr_ridge.alpha_}')
betas.append(beta_ridge)
elif r == 'LASSO':
# LASSO fit
regr_lasso = LassoCV(fit_intercept=False)
regr_lasso.fit(X, y)
beta_lasso = regr_lasso.coef_
if verbose:
print(f'LASSO coefficients:{beta_lasso}, regularization coef: {regr_lasso.alpha_}')
betas.append(beta_lasso)
elif r == 'EN':
# Elastic Net fit
regr_EN = ElasticNetCV(fit_intercept=False)
regr_EN.fit(X, y)
beta_EN = regr_EN.coef_
if verbose:
print(f'ElasticNet coefficients:{beta_EN}, regularization coef: {regr_EN.alpha_}')
betas.append(beta_EN)
return betas
print('True coefficients:', b_true.ravel())
betas = fit_regression(X,y.ravel(),regr=regr);
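The helper above also accepts `regr='all'` to fit every estimator (OLS, Ridge, LASSO, Elastic Net) with cross-validated regularization strengths; for example:
# Fit all four estimators on the same data for comparison
betas_all = fit_regression(X, y.ravel(), regr='all')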
Plot fitted planes (normal data)¶
def plot_fitted_planes(betas, colors, names, data=None, return_traces=False):
    if data is None:  # avoid the mutable-default-argument pitfall
        data = []
for i,b in enumerate(betas):
y = np.dot(lsp_mat,b.reshape((2,1))).reshape((20,20))
data.append(go.Surface(
x=lsp_x,
y=lsp_y,
z=y,
colorscale=colors[i],
text = names[i],
showscale=False
))
layout = go.Layout(
title='2D Regression',
autosize=False,
width=700,
height=700,
)
if return_traces:
return data
fig = go.Figure(data=data, layout=layout)
return fig
fig = plot_fitted_planes(betas, colors=colors, names=regr, data=deepcopy(data))
iplot(fig, filename='2D Regression with different estimators - normal data')
Create regression problem with collinearity¶
# Generate random data with high collinearity between predictors
x1 = np.random.rand(N,1)*10-5
x2 = x1 + np.random.normal(scale=0.1, size=(N,1))
X_colin = np.column_stack((x1,x2))
X_colin = (X_colin-X_colin.mean(axis=0))/X_colin.std(axis=0)
eps = np.random.normal(scale=sigma, size = (N,1))
y_colin = np.dot(X_colin,b_true)+eps
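A quick look at the correlation matrix confirms that the two predictors are almost perfectly correlated:
# Columns are the variables; an off-diagonal entry near 1 indicates near-perfect collinearity
print(np.corrcoef(X_colin, rowvar=False))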
Plot regression problem with collinearity¶
fig, data_colin = plot_scatter_plotly(X_colin,y_colin,b_true)
iplot(fig, filename='2D Regression - collinear data')
Calculate regression coefficients (collinear data)¶
print('True coefficients:', b_true.ravel())
betas_colin = fit_regression(X_colin,y_colin.ravel(),regr=regr);
Plot fitted planes (collinear data)¶
fig = plot_fitted_planes(betas_colin, colors=colors, names=regr, data=deepcopy(data_colin))
iplot(fig, filename='2D Regression with different estimators - collinear data')
Add a small perturbation to the collinear data and fit again¶
# The perturbation is a small amount of uniform noise:
perturbation = np.random.rand(X_colin.shape[0], X_colin.shape[1])*0.05
X_colin_pert = X_colin + perturbation
y_colin_pert = np.dot(X_colin_pert, b_true) + eps  # reuse the same noise draw so only X changes
print('True coefficients:', b_true.ravel())
betas = fit_regression(X_colin_pert,y_colin_pert.ravel(),regr=regr);
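Since the response is regenerated from the perturbed X with the same noise draw, any large shift in the fitted coefficients reflects the ill-conditioning of the design. A minimal sketch quantifying the shift, reusing `betas_colin` and `betas` from the cells above (the variable name `shift` is ours):
# A tiny perturbation of X should barely move a stable estimator;
# with collinear data, the OLS coefficients can shift dramatically.
shift = np.asarray(betas[0]).ravel() - np.asarray(betas_colin[0]).ravel()
print('Change in OLS coefficients after perturbation:', shift)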
fig, data_colin_pert = plot_scatter_plotly(X_colin_pert,y_colin_pert,b_true)
fig = plot_fitted_planes(betas, colors=colors, names=regr, data=deepcopy(data_colin_pert))
iplot(fig, filename='2D Regression with different estimators - perturbed collinear data')
This clearly shows how unstable the OLS estimates are. As expected, in this case the inverse Gram matrix $(X^TX)^{-1}$ will have very large diagonal values; since $\operatorname{Var}(\hat{\beta}) = \sigma^2 (X^TX)^{-1}$, these entries are proportional to the variances of the coefficient estimates:
print('Inverse of Gram Matrix for colinear data (propto covariance matrix of betas):')
print(np.linalg.inv(np.dot(X_colin.T,X_colin)))
print('Condition number of Gram Matrix:')
print(np.linalg.cond(np.dot(X_colin.T,X_colin)))
print('\nCompare to non-colinear data:')
print(np.linalg.inv(np.dot(X.T,X)))
print('Condition number of Gram Matrix with non-colinear data:')
print(np.linalg.cond(np.dot(X.T,X)))
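The link to instability: under the standard OLS assumptions, $\operatorname{Var}(\hat{\beta}) = \sigma^2 (X^TX)^{-1}$, so large diagonal entries mean high-variance coefficient estimates. A minimal Monte Carlo sketch, resampling only the noise, makes this concrete (the helper `ols_coef_spread` is ours, purely for illustration):
# Illustrative helper (not part of the original notebook): refit OLS on many fresh
# noise draws and compare the empirical spread of the coefficients to the
# theoretical one, sigma * sqrt(diag((X^T X)^{-1})).
def ols_coef_spread(X_mat, n_trials=500):
    coefs = []
    for _ in range(n_trials):
        noise = np.random.normal(scale=sigma, size=(X_mat.shape[0], 1))
        y_sim = np.dot(X_mat, b_true) + noise
        coefs.append(np.linalg.solve(np.dot(X_mat.T, X_mat), np.dot(X_mat.T, y_sim)).ravel())
    return np.std(coefs, axis=0)

print('Empirical std of OLS betas (collinear X):    ', ols_coef_spread(X_colin))
print('Theoretical std (collinear X):               ', sigma*np.sqrt(np.diag(np.linalg.inv(np.dot(X_colin.T, X_colin)))))
print('Empirical std of OLS betas (non-collinear X):', ols_coef_spread(X))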
Analyze loss surfaces¶
def OLS_loss(X, y, beta, lbda=0):
    # Residual sum of squares; lbda is ignored, kept only for a uniform interface
    y_hat = np.dot(X, beta)
    return np.sum((y_hat - y)**2, axis=0)

def Ridge_loss(X, y, beta, lbda):
    # RSS + lbda*||beta||_2^2, matching sklearn's Ridge objective
    y_hat = np.dot(X, beta)
    return np.sum((y_hat - y)**2, axis=0) + lbda*np.sum(beta**2, axis=0)

def LASSO_loss(X, y, beta, lbda):
    # RSS/(2n) + lbda*||beta||_1, matching sklearn's Lasso objective
    y_hat = np.dot(X, beta)
    return (1 / (2 * len(X)))*np.sum((y_hat - y)**2, axis=0) + lbda*np.sum(np.abs(beta), axis=0)

def EN_loss(X, y, beta, lbda):
    # RSS/(2n) + lbda*(ratio*||beta||_1 + (1-ratio)/2 * ||beta||_2^2),
    # matching sklearn's ElasticNet objective with l1_ratio = ratio
    ratio = 0.5  # sklearn's default l1_ratio
    y_hat = np.dot(X, beta)
    return (1 / (2 * len(X)))*np.sum((y_hat - y)**2, axis=0) + lbda*(ratio*np.sum(np.abs(beta), axis=0) + 0.5*(1-ratio)*np.sum(beta**2, axis=0))
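Ridge also has a closed form, $\hat{\beta}_{ridge} = (X^TX + \lambda I)^{-1}X^Ty$, which gives a quick way to check `Ridge_loss` against sklearn; a minimal sketch (the variable names `lbda_check` and `beta_ridge_cf` are ours):
# Closed-form ridge for a fixed lambda; should match Ridge(alpha=lbda_check, fit_intercept=False)
lbda_check = 100.0
beta_ridge_cf = np.linalg.solve(np.dot(X_colin.T, X_colin) + lbda_check*np.eye(2), np.dot(X_colin.T, y_colin))
print('Closed-form ridge:', beta_ridge_cf.ravel())
print('sklearn Ridge:    ', Ridge(alpha=lbda_check, fit_intercept=False).fit(X_colin, y_colin).coef_.ravel())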
# linspace for loss surface
L=40
lsp_b = np.linspace(-20,20,L)
lsp_b_x, lsp_b_y = np.meshgrid(lsp_b,lsp_b)
lsp_b_mat = np.column_stack((lsp_b_x.flatten(),lsp_b_y.flatten()))
def build_surface_fig(loss_values):
data = [
go.Surface(
x=lsp_b_x,
y=lsp_b_y,
z=loss_values,
colorscale='Viridis',
opacity=0.7,
contours=dict(z=dict(show=True,
width=3,
highlight=True,
highlightcolor='orange',
project=dict(z=True),
usecolormap=True))
)
]
layout = go.Layout(
title='Loss surface',
autosize=False,
width=700,
height=700,
scene=dict(
xaxis = dict(
title='Beta 1'),
yaxis = dict(
title='Beta 2'),
zaxis = dict(
title='Loss')
)
)
fig = go.Figure(data=data, layout=layout)
    display(iplot(fig, filename='Loss surface'))
# Draw the OLS loss surface on the collinear data (OLS_loss ignores its lbda argument)
build_surface_fig(OLS_loss(X_colin, y_colin.reshape(-1,1), lsp_b_mat.T, 100).reshape((L,L)))
# Uncomment to draw the other loss surfaces on the non-collinear data:
# build_surface_fig(OLS_loss(X, y, lsp_b_mat.T).reshape((L,L)))
# build_surface_fig(Ridge_loss(X, y, lsp_b_mat.T, 100.0).reshape((L,L)))
# build_surface_fig(LASSO_loss(X, y, lsp_b_mat.T, 100.0).reshape((L,L)))
# build_surface_fig(EN_loss(X, y, lsp_b_mat.T, 100.0).reshape((L,L)))
from ipywidgets import interactive, HBox, VBox
def loss_3d_interactive(X, y, loss='Ridge'):
'''Uses plotly to draw an interactive 3D representation of the loss function,
with a slider to control the regularization factor.
Inputs:
X: predictor matrix for the regression problem. Has to be of dim n x 2
y: response vector
loss: string with the loss to plot. Options are 'Ridge', 'LASSO', 'EN'.
'''
if loss == 'Ridge':
loss_function = Ridge_loss
lbda_slider_min = 0
lbda_slider_max = 1000
lbda_step = 1
        clf = Ridge(fit_intercept=False)  # the plotted loss has no intercept term
elif loss == 'LASSO':
loss_function = LASSO_loss
lbda_slider_min = 1
lbda_slider_max = 150
lbda_step = 1
        clf = Lasso(fit_intercept=False)
elif loss == 'EN':
loss_function = EN_loss
lbda_slider_min = 1
lbda_slider_max = 150
lbda_step = 1
        clf = ElasticNet(fit_intercept=False)  # default l1_ratio=0.5 matches EN_loss above
else:
raise ValueError("Loss string not recognized. Available options are: 'Ridge', 'LASSO', 'EN'.")
# linspace for loss surface
L=20
lsp_b = np.linspace(-10,10,L)
lsp_b_x, lsp_b_y = np.meshgrid(lsp_b,lsp_b)
lsp_b_mat = np.column_stack((lsp_b_x.flatten(),lsp_b_y.flatten()))
# Get all optimal betas for current lambda range
precomp_coefs=[]
for l in range(lbda_slider_min,lbda_slider_max+1,lbda_step):
clf.set_params(alpha=l)
clf.fit(X, y)
precomp_coefs.append(clf.coef_)
f = go.FigureWidget(
data=[
go.Surface(
x=lsp_b_x,
y=lsp_b_y,
z=loss_function(X,y.reshape(-1,1), lsp_b_mat.T, 0).reshape((L,L)),
colorscale='Viridis',
opacity=0.7,
contours=dict(z=dict(show=True,
width=3,
highlight=True,
highlightcolor='orange',
project=dict(z=True),
usecolormap=True))
),
go.Scatter3d(
x=[p[0] for p in precomp_coefs],
y=[p[1] for p in precomp_coefs],
z=np.zeros(len(precomp_coefs)),
marker=dict(
size=1,
color='darkorange',
line=dict(
color='darkorange',
width=1
),
opacity=1
)
),
go.Scatter3d(
x=[0],
y=[0],
z=[0],
marker=dict(
size=10,
color='orange',
opacity=1
),
)
],
layout=go.Layout(scene=go.layout.Scene(
xaxis = dict(
title='Beta 1'),
yaxis = dict(
title='Beta 2'),
zaxis = dict(
title='Loss'),
camera=go.layout.scene.Camera(
up=dict(x=0, y=0, z=1),
center=dict(x=0, y=0, z=0),
eye=dict(x=1.25, y=1.25, z=1.25))
),
width=1000,
height=700,)
)
def update_z(lbda):
f.data[0].z = loss_function(X, y.reshape(-1,1), lsp_b_mat.T, lbda).reshape((L,L))
beta_opt = precomp_coefs[(lbda-lbda_slider_min)//(lbda_step)]
f.data[-1].x = [beta_opt[0]]
f.data[-1].y = [beta_opt[1]]
f.data[-1].z = [0]
lambda_slider = interactive(update_z, lbda=(lbda_slider_min, lbda_slider_max, lbda_step))
vb = VBox((f, lambda_slider))
vb.layout.align_items = 'center'
display(vb)
print(X_colin.shape,y_colin.shape)
loss_3d_interactive(X_colin, y_colin.ravel(), loss='Ridge')
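The same widget works for the other penalties; for example, to watch the LASSO minimizer travel toward the axes as $\lambda$ grows:
# Explore the LASSO loss surface and its minimizer as lambda increases
loss_3d_interactive(X_colin, y_colin.ravel(), loss='LASSO')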