Lecture 5: Multiple Linear Regression¶
Data Science 1: CS 109A/STAT 121A/AC 209A/E 109A
Instructors: Pavlos Protopapas, Kevin Rader, Rahul Dave, Margo Levine
Harvard University
Fall 2017
In [1]:
import pandas as pd
import sys
import numpy as np
import statsmodels.api as sm
from statsmodels.tools import add_constant
from statsmodels.regression.linear_model import RegressionResults
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
sns.set(style="ticks")
%matplotlib inline
pd.set_option('display.width', 1500)
pd.set_option('display.max_columns', 100)
sns.set_context('poster')
In [2]:
def find_regression_params(regression_model, samples, cols):
    # Draw a fresh random sample from the full dataset, refit the model on a
    # train split, and return the fitted intercept and coefficients as one array.
    nyc_cab_sample = nyc_cab_df.sample(n=samples)
    y = nyc_cab_sample['Fare_amount'].values
    X = nyc_cab_sample[cols].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    regression_model.fit(X_train, y_train)
    return np.hstack((np.array([regression_model.intercept_]), regression_model.coef_))
### PLOTTING ROUTINE FOR HIST
def plot_hist_se(vals, bins, title, xlabel, ax):
    # Histogram of vals with the mean and mean +/- 2 std marked.
    mean = vals.mean()
    std = vals.std()
    ax.hist(vals, bins=bins, color='blue', edgecolor='white', linewidth=1, alpha=0.2)
    ax.axvline(mean, color='red', label='mean = {0:.2f}'.format(mean))
    ax.axvline(mean - 2 * std, color='green', linestyle='--', label='std = {0:.2f}'.format(std))
    ax.axvline(mean + 2 * std, color='green', linestyle='--')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.legend(loc='best')
    return ax
Residual Analysis¶
In [8]:
nyc_cab_df = pd.read_csv('data/nyc_car_hire_data.csv', low_memory=False)
In [106]:
nyc_cab_sample = nyc_cab_df.sample(n=1000, random_state=4)
y = nyc_cab_sample['Fare_amount'].values
X = nyc_cab_sample['Trip Length (min)'].values
X_train, X_test, y_train, y_test = train_test_split(X.reshape((len(X), 1)), y, test_size=0.33, random_state=4)
regression = LinearRegression(fit_intercept=True)
regression.fit(X_train, y_train)
regression_line = lambda x: regression.intercept_ + regression.coef_ * x
print('The equation of the regression line is: {} + {} * x'.format(regression.intercept_, regression.coef_[0]))
In [107]:
train_R_sq = regression.score(X_train, y_train)
test_R_sq = regression.score(X_test, y_test)
print('The train R^2 is {}, the test R^2 is {}'.format(train_R_sq, test_R_sq))
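As a sanity check (an added sketch, not in the original lecture), sklearn's score() is just the coefficient of determination, R^2 = 1 - SS_res / SS_tot, which we can recompute by hand on the test set:
In [ ]:
# Sketch: verify that .score() matches R^2 = 1 - SS_res / SS_tot.
y_hat = regression.predict(X_test)
ss_res = np.sum((y_test - y_hat) ** 2)          # residual sum of squares
ss_tot = np.sum((y_test - y_test.mean()) ** 2)  # total sum of squares
print('R^2 by hand: {}'.format(1 - ss_res / ss_tot))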
In [108]:
fig, ax = plt.subplots(1, 2, figsize=(15, 8))
errors = y_train - regression.predict(X_train)
ax[0].scatter(X_train, errors, color='blue', alpha=0.1, label='residuals')
ax[0].axhline(y=0, color='gold', label='zero error')
ax[0].set_xlabel('Trip Length (min)')
ax[0].set_ylabel('Residual')
ax[0].set_title('NYC Car Hire Data:\n Predictor vs Residual')
ax[0].legend(loc='best')
ax[1].hist(errors, color='blue', alpha=0.1, label='residuals', bins=50, edgecolor='white', linewidth=2)
ax[1].axvline(x=0, color='gold', label='zero error')
ax[1].set_xlabel('Residual')
ax[1].set_ylabel('Frequency')
ax[1].set_title('NYC Car Hire Data:\n Residual Histogram')
ax[1].legend(loc='best')
Multiple Linear Regression¶
In [117]:
nyc_cab_sample = nyc_cab_df.sample(n=1000, random_state=6)
nyc_cab_sample['lpep_pickup_datetime'] = nyc_cab_sample['lpep_pickup_datetime'].apply(lambda dt: pd.to_datetime(dt).hour)
nyc_cab_sample['Lpep_dropoff_datetime'] = nyc_cab_sample['Lpep_dropoff_datetime'].apply(lambda dt: pd.to_datetime(dt).hour)
msk = np.random.rand(len(nyc_cab_sample)) < 0.8
train = nyc_cab_sample[msk]
test = nyc_cab_sample[~msk]
y_train = train['Fare_amount'].values
X_train = train[['Trip Length (min)', 'Type', 'TMAX']].values
y_test = test['Fare_amount'].values
X_test = test[['Trip Length (min)', 'Type', 'TMAX']].values
In [118]:
nyc_cab_sample.TMAX.describe()
In [119]:
multi_regression_model = LinearRegression(fit_intercept=True)
multi_regression_model.fit(X_train, y_train)
print('The equation of the regression plane is: {} + {}^T . x'.format(multi_regression_model.intercept_, multi_regression_model.coef_))
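As a quick check (an added sketch), the fitted plane predicts via an inner product, so intercept + X . coef should reproduce predict():
In [ ]:
# Sketch: manual prediction from the fitted intercept and coefficients.
manual_pred = multi_regression_model.intercept_ + X_test.dot(multi_regression_model.coef_)
print(np.allclose(manual_pred, multi_regression_model.predict(X_test)))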
1. Train vs Test Error¶
In [120]:
train_MSE = np.mean((y_train - multi_regression_model.predict(X_train))**2)
test_MSE = np.mean((y_test - multi_regression_model.predict(X_test))**2)
print('The train MSE is {}, the test MSE is {}'.format(train_MSE, test_MSE))
train_R_sq = multi_regression_model.score(X_train, y_train)
test_R_sq = multi_regression_model.score(X_test, y_test)
print('The train R^2 is {}, the test R^2 is {}'.format(train_R_sq, test_R_sq))
2. Uncertainty in the Model Parameter Estimates¶
In [121]:
total_draws = 500
samples = 1000
regression_params = []
for i in range(total_draws):
    if i % 10 == 0:
        out = i * 1. / total_draws * 100
        sys.stdout.write("\r%d%%" % out)
        sys.stdout.flush()
    regression_params.append(find_regression_params(multi_regression_model, samples, ['Trip Length (min)', 'Type', 'TMAX']))
sys.stdout.write("\r%d%%" % 100)
regression_params = np.array(regression_params)
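The histograms below visualize these draws; a numerical summary (an added sketch) gives the empirical standard error and a rough mean +/- 2 SE interval for each parameter:
In [ ]:
# Sketch: empirical means and standard errors from the repeated-sampling draws above.
param_names = ['Intercept', 'Trip Length (min)', 'Type', 'TMAX']
means = regression_params.mean(axis=0)
ses = regression_params.std(axis=0)
for name, m, se in zip(param_names, means, ses):
    print('{}: {:.3f} +/- {:.3f}'.format(name, m, 2 * se))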
In [122]:
fig, ax = plt.subplots(2, 2, figsize=(20, 15))
plot_hist_se(regression_params[:, 0], 50, 'Histogram of Estimates of Intercept', 'Intercept', ax[0,0])
plot_hist_se(regression_params[:, 1], 50, 'Histogram of Estimates of Slope of Trip Length', 'Slope of Trip Length', ax[0,1])
plot_hist_se(regression_params[:, 2], 50, 'Histogram of Estimates of Slope of Type', 'Slope of Ride Type', ax[1,0])
plot_hist_se(regression_params[:, 3], 50, 'Histogram of Estimates of Slope of TMax', 'Slope of TMax', ax[1,1])
plt.tight_layout()
plt.show()
Evaluating the Significance of Predictors¶
In [123]:
predictors_multiple = ['Trip Length (min)', 'Type', 'TMAX']
predictors_simple = ['Trip Length (min)']
X_train_multi = add_constant(train[predictors_multiple].values)
X_test_multi = add_constant(test[predictors_multiple].values)
X_train_simple = add_constant(train[predictors_simple].values)
X_test_simple = add_constant(test[predictors_simple].values)
1. Measuring Significance Using F-Stat, p-Values¶
In [124]:
multi_regression_model = sm.OLS(y_train, X_train_multi).fit()
print('F-stat:', multi_regression_model.fvalue)
print('p-values: {} (intercept), {} (Trip Length), {} (Type), {} (TMAX)'.format(*multi_regression_model.pvalues))
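statsmodels bundles the F-statistic, per-coefficient t-statistics, p-values, and confidence intervals into a single report; printing the fitted model's summary (a standard statsmodels call, shown here as an extra) displays them all at once:
In [ ]:
# The full regression report: coefficients, t-stats, p-values, F-stat, AIC/BIC.
print(multi_regression_model.summary())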
2. Measuring Significance Using AIC/BIC¶
In [125]:
print("AIC for ['Trip Length (min)', 'Type', 'TMAX']:", multi_regression_model.aic)
print("BIC for ['Trip Length (min)', 'Type', 'TMAX']:", multi_regression_model.bic)
In [126]:
simple_regression_model = sm.OLS(y_train, X_train_simple).fit()
print("AIC for ['Trip Length (min)']:", simple_regression_model.aic)
print("BIC for ['Trip Length (min)']:", simple_regression_model.bic)
3. Measuring Significance Using R^2¶
In [127]:
simple_model = LinearRegression(fit_intercept=False)
simple_model.fit(X_train_simple, y_train)
print("Simple Model: train R^2 = {}, test R^2 = {}".format(simple_model.score(X_train_simple, y_train), simple_model.score(X_test_simple, y_test)))
multiple_model = LinearRegression(fit_intercept=False)
multiple_model.fit(X_train_multi, y_train)
print("Multiple Predictor Model: train R^2 = {}, test R^2 = {}".format(multiple_model.score(X_train_multi, y_train), multiple_model.score(X_test_multi, y_test)))
4. The Effect of Number of Predictors on R^2¶
In [128]:
multi_regression_model = LinearRegression(fit_intercept=True)
cols = ['Trip Length (min)', 'Type', 'Trip_distance', 'TMAX', 'TMIN', 'lpep_pickup_datetime', 'Lpep_dropoff_datetime', 'Pickup_longitude', 'Pickup_latitude', 'SNOW', 'SNWD', 'PRCP']
train_R_sq = []
test_R_sq = []
for i in range(1, len(cols) + 1):
    predictors = cols[:i]
    X_train = train[predictors].values
    X_test = test[predictors].values
    multi_regression_model.fit(X_train, y_train)
    train_R_sq.append(multi_regression_model.score(X_train, y_train))
    test_R_sq.append(multi_regression_model.score(X_test, y_test))
In [129]:
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
ax.plot(range(1, len(cols) + 1), train_R_sq, color='blue', label='train R^2')
ax.plot(range(1, len(cols) + 1), test_R_sq, color='red', label='test R^2')
ax.set_title('Number of Predictors vs Model Fit')
ax.set_xlabel('Number of Predictors')
ax.set_ylabel('$R^2$')
ax.legend(loc='best')
plt.show()
Multiple Linear Regression with Interaction Terms¶
In [136]:
y_train = train['Fare_amount'].values
X_train = train[['Trip Length (min)', 'Type']].values
y_test = test['Fare_amount'].values
X_test = test[['Trip Length (min)', 'Type']].values
gen_cross_terms = PolynomialFeatures(degree=3, interaction_only=True)
cross_terms = gen_cross_terms.fit_transform(X_train)
X_train_with_cross = np.hstack((X_train, cross_terms))
# Use transform (not fit_transform) on the test set so the same terms are generated.
cross_terms = gen_cross_terms.transform(X_test)
X_test_with_cross = np.hstack((X_test, cross_terms))
X_train_with_cross.shape
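To see exactly which terms were generated (an added sketch), PolynomialFeatures exposes the exponents of each output column via its powers_ attribute; note the transform already includes a bias column and the raw features, so the hstack above repeats the two raw columns:
In [ ]:
# Sketch: each row of powers_ gives the exponents of (Trip Length, Type) in one output column.
print(gen_cross_terms.powers_)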
In [137]:
multi_regression_model = LinearRegression(fit_intercept=True)
multi_regression_model.fit(X_train_with_cross, y_train)
train_MSE = np.mean((y_train - multi_regression_model.predict(X_train_with_cross))**2)
test_MSE = np.mean((y_test - multi_regression_model.predict(X_test_with_cross))**2)
print('The train MSE with interaction terms is {}, the test MSE is {}'.format(train_MSE, test_MSE))
train_R_sq = multi_regression_model.score(X_train_with_cross, y_train)
test_R_sq = multi_regression_model.score(X_test_with_cross, y_test)
print('The train R^2 with interaction terms is {}, the test R^2 is {}'.format(train_R_sq, test_R_sq))
In [138]:
multi_regression_model.fit(X_train, y_train)
train_MSE = np.mean((y_train - multi_regression_model.predict(X_train))**2)
test_MSE = np.mean((y_test - multi_regression_model.predict(X_test))**2)
print('The train MSE without interaction terms is {}, the test MSE is {}'.format(train_MSE, test_MSE))
train_R_sq = multi_regression_model.score(X_train, y_train)
test_R_sq = multi_regression_model.score(X_test, y_test)
print('The train R^2 without interaction terms is {}, the test R^2 is {}'.format(train_R_sq, test_R_sq))
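A formal way to compare the two fits (an added sketch, refitting with statsmodels rather than sklearn) is a nested-model F-test on the single interaction term Trip Length x Type:
In [ ]:
# Sketch: F-test of the model with the interaction term against the model without it.
# compare_f_test returns (F-statistic, p-value, difference in degrees of freedom).
interaction = (X_train[:, 0] * X_train[:, 1]).reshape(-1, 1)
full = sm.OLS(y_train, add_constant(np.hstack((X_train, interaction)))).fit()
restricted = sm.OLS(y_train, add_constant(X_train)).fit()
print(full.compare_f_test(restricted))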
Polynomial Regression¶
In [35]:
y_train = train['Fare_amount'].values
X_train = train[['Trip Length (min)', 'Type', 'TMAX']].values
y_test = test['Fare_amount'].values
X_test = test[['Trip Length (min)', 'Type', 'TMAX']].values
gen_poly_terms = PolynomialFeatures(degree=2, interaction_only=False)
X_train_with_poly = gen_poly_terms.fit_transform(X_train)
# Use transform (not fit_transform) on the test set so the same terms are generated.
X_test_with_poly = gen_poly_terms.transform(X_test)
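With p features and degree d, PolynomialFeatures produces C(p + d, d) columns (bias included); for p = 3 and d = 2 that is C(5, 2) = 10. A quick check (an added sketch):
In [ ]:
# Sketch: expected vs actual number of generated polynomial features.
from scipy.special import comb
print(comb(3 + 2, 2, exact=True), X_train_with_poly.shape[1])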
In [36]:
poly_regression_model = LinearRegression(fit_intercept=True)
poly_regression_model.fit(X_train_with_poly, y_train)
train_MSE = np.mean((y_train - poly_regression_model.predict(X_train_with_poly))**2)
test_MSE = np.mean((y_test - poly_regression_model.predict(X_test_with_poly))**2)
print('The train MSE for degree 2 poly model is {}, the test MSE is {}'.format(train_MSE, test_MSE))
train_R_sq = poly_regression_model.score(X_train_with_poly, y_train)
test_R_sq = poly_regression_model.score(X_test_with_poly, y_test)
print('The train R^2 for degree 2 poly model is {}, the test R^2 is {}'.format(train_R_sq, test_R_sq))
Effect of Polynomial Degree on Model Performance¶
In [37]:
train_R_sq = []
test_R_sq = []
max_deg = 10
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
# Scale the test set with the parameters fit on the training set.
X_test = min_max_scaler.transform(X_test)
for d in range(max_deg + 1):
    out = d * 1. / max_deg * 100
    sys.stdout.write("\r%d%%" % out)
    sys.stdout.flush()
    gen_poly_terms = PolynomialFeatures(degree=d, interaction_only=False)
    X_train_with_poly = gen_poly_terms.fit_transform(X_train)
    X_test_with_poly = gen_poly_terms.transform(X_test)
    poly_regression_model = LinearRegression(fit_intercept=False)
    poly_regression_model.fit(X_train_with_poly, y_train)
    train_R_sq.append(poly_regression_model.score(X_train_with_poly, y_train))
    test_R_sq.append(poly_regression_model.score(X_test_with_poly, y_test))
sys.stdout.write("\r%d%%" % 100)
In [38]:
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
ax[0].plot(range(max_deg + 1), np.array(train_R_sq), color='blue', label='train R^2')
ax[0].set_title('Polynomial Degree vs Model Fit')
ax[0].set_xlabel('Degree of Polynomial')
ax[0].set_ylabel('Train R^2')
ax[0].legend(loc='best')
ax[1].plot(range(max_deg + 1), test_R_sq, color='red', label='test R^2')
ax[1].set_title('Polynomial Degree vs Model Fit')
ax[1].set_xlabel('Degree of Polynomial')
ax[1].set_ylabel('Test R^2')
ax[1].legend(loc='best')
plt.show()