Key Word(s): Regression, kNN, Scikit-Learn

Lecture 4: Introduction to Regression¶

Data Science 1: CS 109A/STAT 121A/AC 209A/ E 109A
Instructors: Pavlos Protopapas, Kevin Rader, Rahul Dave, Margo Levine

Harvard University
Fall 2017

In [1]:

import pandas as pd
import sys
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
sns.set(style="ticks")
%matplotlib inline

import seaborn as sns
pd.set_option('display.width', 1500)
pd.set_option('display.max_columns', 100)

sns.set_context('poster')
# import matplotlib.pylab as pylab
# params = {'legend.fontsize': 'large',
#           'figure.figsize': (15, 5),
#          'axes.labelsize': 'large',
#          'axes.titlesize':'large',
#          'xtick.labelsize':'large',
#          'ytick.labelsize':'large'}
# pylab.rcParams.update(params)}

Step #1: Load and Explore Data¶

In [2]:

# GET THE FULL DATA SET FROM 
#https://drive.google.com/file/d/0B28c493CP9GtRHFVM0U0SVI2Yms/view?usp=sharing
nyc_cab_df = pd.read_csv('data/nyc_car_hire_data.csv', low_memory=False) # DO WE NEED THE LOW MEMORY

In [3]:

nyc_cab_df.shape

Out[3]:

(1873671, 30)

In [4]:

nyc_cab_df.head()

Out[4]:

	AWND	Base	Day	Dropoff_latitude	Dropoff_longitude	Ehail_fee	Extra	Fare_amount	Lpep_dropoff_datetime	MTA_tax	Passenger_count	Payment_type	Pickup_latitude	Pickup_longitude	RateCodeID	Store_and_fwd_flag	TMAX	TMIN	Tip_amount	Tolls_amount	Total_amount	Trip_distance	Trip_type	Type	VendorID	lpep_pickup_datetime	Trip Length (min)
0	4.7	B02512	1	NaN	NaN	NaN	NaN	33.863498	2014-04-01 00:24:00	NaN	NaN	NaN	40.7690	-73.9549	NaN	NaN	60	39	NaN	NaN	NaN	4.083561	NaN	1	NaN	2014-04-01 00:11:00	13.0
1	4.7	B02512	1	NaN	NaN	NaN	NaN	19.022892	2014-04-01 00:29:00	NaN	NaN	NaN	40.7267	-74.0345	NaN	NaN	60	39	NaN	NaN	NaN	3.605694	NaN	1	NaN	2014-04-01 00:17:00	12.0
2	4.7	B02512	1	NaN	NaN	NaN	NaN	25.498981	2014-04-01 00:34:00	NaN	NaN	NaN	40.7316	-73.9873	NaN	NaN	60	39	NaN	NaN	NaN	4.221763	NaN	1	NaN	2014-04-01 00:21:00	13.0
3	4.7	B02512	1	NaN	NaN	NaN	NaN	28.024628	2014-04-01 00:39:00	NaN	NaN	NaN	40.7588	-73.9776	NaN	NaN	60	39	NaN	NaN	NaN	2.955510	NaN	1	NaN	2014-04-01 00:28:00	11.0
4	4.7	B02512	1	NaN	NaN	NaN	NaN	12.083589	2014-04-01 00:40:00	NaN	NaN	NaN	40.7594	-73.9722	NaN	NaN	60	39	NaN	NaN	NaN	1.922292	NaN	1	NaN	2014-04-01 00:33:00	7.0

In [5]:

nyc_cab_df.describe()

Out[5]:

	AWND	Day	Dropoff_latitude	Dropoff_longitude	Ehail_fee	Extra	Fare_amount	MTA_tax	PRCP	Passenger_count	Payment_type	Pickup_latitude	Pickup_longitude	RateCodeID	SNOW	SNWD	TMAX	TMIN	Tip_amount	Tolls_amount	Total_amount	Trip_distance	Trip_type	Type	VendorID	Trip Length (min)
count	1.873671e+06	1.873671e+06	1.309155e+06	1.309155e+06	0.0	1.309155e+06	1.873671e+06	1.309155e+06	1.873671e+06	1.309155e+06	1.309155e+06	1.873671e+06	1.873671e+06	1.309155e+06	1873671.0	1873671.0	1.873671e+06	1.873671e+06	1.309155e+06	1.309155e+06	1.309155e+06	1.873671e+06	1.225061e+06	1.873671e+06	1.309155e+06	1.873671e+06
mean	6.349472e+00	1.573272e+01	4.069035e+01	-7.380999e+01	NaN	3.718944e-01	1.480038e+01	4.885869e-01	3.277113e-01	1.424879e+00	1.613783e+00	4.070361e+01	-7.385354e+01	1.096059e+00	0.0	0.0	6.094607e+01	4.334624e+01	1.013024e+00	1.069806e-01	1.432070e+01	2.917737e+00	1.020388e+00	3.012888e-01	1.781165e+00	1.132344e+01
std	1.782350e+00	8.786914e+00	1.653301e+00	2.997829e+00	NaN	3.705219e-01	9.899925e+00	7.478954e-02	1.033659e+00	1.129275e+00	5.014897e-01	1.435388e+00	2.603334e+00	6.177816e-01	0.0	0.0	8.434375e+00	6.049009e+00	2.219229e+00	8.479594e-01	1.138924e+01	2.529476e+00	1.413220e-01	4.588180e-01	4.134567e-01	7.875921e+00
min	2.000000e+00	1.000000e+00	0.000000e+00	-1.219265e+02	NaN	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	1.000000e+00	-4.075827e+01	-1.219263e+02	1.000000e+00	0.0	0.0	4.700000e+01	3.100000e+01	0.000000e+00	0.000000e+00	0.000000e+00	-3.483963e-01	1.000000e+00	0.000000e+00	1.000000e+00	0.000000e+00
25%	4.900000e+00	8.000000e+00	4.071491e+01	-7.396558e+01	NaN	0.000000e+00	7.500000e+00	5.000000e-01	0.000000e+00	1.000000e+00	1.000000e+00	4.071750e+01	-7.397948e+01	1.000000e+00	0.0	0.0	5.400000e+01	4.000000e+01	0.000000e+00	0.000000e+00	7.500000e+00	1.340000e+00	1.000000e+00	0.000000e+00	2.000000e+00	6.000000e+00
50%	6.000000e+00	1.600000e+01	4.075613e+01	-7.394200e+01	NaN	5.000000e-01	1.300000e+01	5.000000e-01	0.000000e+00	1.000000e+00	2.000000e+00	4.074918e+01	-7.395300e+01	1.000000e+00	0.0	0.0	6.100000e+01	4.300000e+01	0.000000e+00	0.000000e+00	1.100000e+01	2.330000e+00	1.000000e+00	0.000000e+00	2.000000e+00	1.000000e+01
75%	8.300000e+00	2.400000e+01	4.080378e+01	-7.390412e+01	NaN	5.000000e-01	2.037272e+01	5.000000e-01	7.000000e-02	1.000000e+00	2.000000e+00	4.079704e+01	-7.392220e+01	1.000000e+00	0.0	0.0	6.700000e+01	4.600000e+01	1.500000e+00	0.000000e+00	1.750000e+01	3.640000e+00	1.000000e+00	1.000000e+00	2.000000e+00	1.400000e+01
max	9.400000e+00	3.000000e+01	4.289498e+01	0.000000e+00	NaN	3.000000e+01	3.351000e+03	1.500000e+00	4.970000e+00	9.000000e+00	5.000000e+00	4.211660e+01	7.393764e+01	9.900000e+01	0.0	0.0	7.700000e+01	5.900000e+01	5.000000e+02	1.500000e+02	3.352500e+03	2.313600e+02	2.000000e+00	1.000000e+00	2.000000e+00	5.900000e+01

In [6]:

nyc_cab_df.Passenger_count.hist()

Out[6]:

In [7]:

nyc_cab_df.dtypes

Out[7]:

AWND                     float64
Base                      object
Day                        int64
Dropoff_latitude         float64
Dropoff_longitude        float64
Ehail_fee                float64
Extra                    float64
Fare_amount              float64
Lpep_dropoff_datetime     object
MTA_tax                  float64
PRCP                     float64
Passenger_count          float64
Payment_type             float64
Pickup_latitude          float64
Pickup_longitude         float64
RateCodeID               float64
SNOW                     float64
SNWD                     float64
Store_and_fwd_flag        object
TMAX                       int64
TMIN                       int64
Tip_amount               float64
Tolls_amount             float64
Total_amount             float64
Trip_distance            float64
Trip_type                float64
Type                       int64
VendorID                 float64
lpep_pickup_datetime      object
Trip Length (min)        float64
dtype: object

In [8]:

nyc_cab_df.columns

Out[8]:

Index(['AWND', 'Base', 'Day', 'Dropoff_latitude', 'Dropoff_longitude', 'Ehail_fee', 'Extra', 'Fare_amount', 'Lpep_dropoff_datetime', 'MTA_tax', 'PRCP', 'Passenger_count', 'Payment_type', 'Pickup_latitude', 'Pickup_longitude', 'RateCodeID', 'SNOW', 'SNWD', 'Store_and_fwd_flag', 'TMAX', 'TMIN', 'Tip_amount', 'Tolls_amount', 'Total_amount', 'Trip_distance', 'Trip_type', 'Type', 'VendorID', 'lpep_pickup_datetime', 'Trip Length (min)'], dtype='object')

Clean data¶

How would you clean the data?

Extreme/outliers
Impute missing values
Drop some columns?
Drop rows with problematic values

In [9]:

nyc_cab_sample = nyc_cab_df.sample(n=1000, random_state=6)
fares = nyc_cab_sample['Fare_amount'].values
trip_lengths = nyc_cab_sample[['Trip Length (min)']].values

In [10]:

fig, ax = plt.subplots(1, 1, figsize=(10, 7))


ax.scatter(trip_lengths, fares, color='gray', alpha=0.1, s=20)

ax.set_xlabel('Trip Length (min)')
ax.set_ylabel('Trip Fare ($)')
ax.set_title('NYC Care Hire Data:\n Trip Duration vs Fare Scatter Plot')

Out[10]:

Step #2: Modeling the Data¶

In [11]:

#nyc_cab_sample = nyc_cab_df.sample(n=1000, random_state=1)

y = nyc_cab_sample['Fare_amount'].values
X = nyc_cab_sample[['Trip Length (min)']].values
X.shape

Out[11]:

(1000, 1)

In [26]:

regression = LinearRegression(fit_intercept=True)
regression.fit(X, y)



# this is the estimated statistical model (f_hat) using the predicted coefficients 
regression_line = lambda x: regression.intercept_ + regression.coef_ * x
print('The estimated equation of the regression line is: {} + {} * x'.format(regression.intercept_, regression.coef_[0]))

The estimated equation of the regression line is: 6.429017573516136 + 0.7339670486476195 * x

In [14]:

fig, ax = plt.subplots(1, 1, figsize=(10, 10))

x_vals = np.linspace(0, 70, 100)
ax.plot(x_vals, regression_line(x_vals), color='red', linewidth=1.0, label='regression line')
ax.scatter(X, y, color='gray', alpha=0.2, label='data')



ax.set_xlabel('Trip Length (min)')
ax.set_ylabel('Trip Fare ($)')
ax.set_title('NYC Care Hire Data:\n Trip Duration vs Fare Scatter Plot')
ax.legend(loc='best')

Out[14]:

Step #3: Evaluate and Interpret the Model¶

1. Train vs Test Error¶

In [15]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=6)

train_MSE= np.mean((y_train - regression.predict(X_train))**2)
test_MSE= np.mean((y_test - regression.predict(X_test))**2)
print('The train MSE is {}, the test MSE is {}'.format(train_MSE, test_MSE))

The train MSE is 48.59886786981504, the test MSE is 52.199666372395114

Why is the MSE for test higher?

2. Uncertainty in the Model Parameter Estimates¶

In [16]:

def find_regression_params(regression_model, samples):
    # randomly select number of samples from the big data
    nyc_cab_sample = nyc_cab_df.sample(n=samples)

    y = nyc_cab_sample['Fare_amount'].values
    X = nyc_cab_sample[['Trip Length (min)']].values

   

    regression_model.fit(X, y)
    
    return regression_model.intercept_, regression_model.coef_[0]

In [17]:

regression_model = LinearRegression(fit_intercept=True)

total_draws = 500
samples = 1000
regression_params = []

for i in range(total_draws):
    if i % 10 == 0:
        out = i * 1. / total_draws * 100
        sys.stdout.write("\r%d%%" % out)
        sys.stdout.flush()
        
    regression_params.append(find_regression_params(regression_model, samples))
    
sys.stdout.write("\r%d%%" % 100)
regression_params = np.array(regression_params)

100%

In [18]:

fig, ax = plt.subplots(1, 2, figsize=(10, 5))

ax[0].hist(regression_params[:, 0], bins=50, color='blue', edgecolor='white', linewidth=1, alpha=0.2)
ax[0].axvline(x=regression_params[:, 0].mean(), color='red', label='mean = {0:.2f}'.format(regression_params[:, 0].mean()))
ax[0].axvline(x=regression_params[:, 0].mean() - 2 * regression_params[:, 0].std(), color='green', linestyle='--', label='std = {0:.2f}'.format(regression_params[:, 0].std()))
ax[0].axvline(x=regression_params[:, 0].mean() + 2 * regression_params[:, 0].std(), color='green', linestyle='--')

ax[0].set_xlabel('Intercept')
ax[0].set_ylabel('Frequency')
ax[0].set_title('Histogram of Estimates of Intercept')
ax[0].legend(loc='best')


ax[1].hist(regression_params[:, 1], bins=50, color='gray', edgecolor='white', linewidth=1, alpha=0.2)
ax[1].axvline(x=regression_params[:, 1].mean(), color='red', label='mean = {0:.2f}'.format(regression_params[:, 1].mean()))
ax[1].axvline(x=regression_params[:, 1].mean() - 2 * regression_params[:, 1].std(), color='green', linestyle='--', label='std = {0:.2f}'.format(regression_params[:, 1].std()))
ax[1].axvline(x=regression_params[:, 1].mean() + 2 * regression_params[:, 1].std(), color='green', linestyle='--')

ax[1].set_xlabel('Slope')
ax[1].set_ylabel('Frequency')
ax[1].set_title('Histogram of Estimates of Slope')
ax[1].legend(loc='best')

plt.tight_layout()

3. The Effect of Sample Size on Uncertainty¶

In [19]:

regression_model = LinearRegression(fit_intercept=True)

def compute_SE(total_draws, samples, regression_model):

    regression_params = []

    for i in range(total_draws):
        regression_params.append(find_regression_params(regression_model, samples))

    regression_params = np.array(regression_params)
    return np.std(regression_params[:, 0]), np.std(regression_params[:, 1])

total_draws = 100
samples = range(100, 10001, 900)
ses = []

for i in range(len(samples)):
    out = i * 1. / len(samples) * 100
    sys.stdout.write("\r%d%%" % out)
    sys.stdout.flush()
    ses.append(compute_SE(total_draws, samples[i], regression_model))

sys.stdout.write("\r%d%%" % 100)

100%

In [20]:

ses = np.array(ses)
    
fig, ax = plt.subplots(1, 1, figsize=(10, 5))

ax.plot(samples, ses[:, 0], color='teal', label='SE of intercept')
ax.plot(samples, ses[:, 1], color='brown', label='SE of slope')

ax.set_xlabel('Number of Samples')
ax.set_ylabel('SE')
ax.set_title('Uncertainty in Parameter Estimates vs Sample Size')
ax.legend(loc='best')

Out[20]:

Step 2: Consider a Different Model for the Data - kNN¶

In [21]:

nyc_cab_sample = nyc_cab_df.sample(n=1000, random_state=1)

y = nyc_cab_sample['Fare_amount'].values
X = nyc_cab_sample[['Trip Length (min)']].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [22]:

knn_model = KNeighborsRegressor(n_neighbors=2)
knn_model.fit(X_train, y_train)

Out[22]:

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=2, p=2,
          weights='uniform')

In [23]:

fig, ax = plt.subplots(1, 2, figsize=(15, 8))
ax[0].scatter(X_train, knn_model.predict(X_train), color='red', alpha=0.1, label='knn model')
ax[0].scatter(X_train, y_train, color='gray', alpha=0.1, label='training data')


ax[0].set_xlabel('Trip Length (min)')
ax[0].set_ylabel('Trip Fare ($)')
ax[0].set_title('NYC Care Hire Data:\n Trip Duration vs Fare Scatter Plot')
ax[0].legend(loc='best')

ax[1].scatter(X_test, knn_model.predict(X_test), color='blue', alpha=0.1, label='knn model')
ax[1].scatter(X_test, y_test, color='gray', alpha=0.1, label='testing data')

ax[1].set_xlabel('Trip Length (min)')
ax[1].set_ylabel('Trip Fare ($)')
ax[1].set_title('NYC Care Hire Data:\n Trip Duration vs Fare Scatter Plot')
ax[1].legend(loc='best')

Out[23]:

In [24]:

train_MSE= np.mean((y_train - knn_model.predict(X_train))**2)
test_MSE= np.mean((y_test - knn_model.predict(X_test))**2)

print('The train MSE is {}, the test MSE is {}'.format(train_MSE, test_MSE))

The train MSE is 70.9224876379618, the test MSE is 77.62695077723178

Step #3: Evaluate and Interpret the kNN Model¶

In [25]:

train_MSEs = []
test_MSEs = []

for k in range(1, 100):
    sys.stdout.write("\r%d%%" % k)
    sys.stdout.flush()
    
    knn_model = KNeighborsRegressor(n_neighbors=k)
    knn_model.fit(X_train, y_train)
    train_MSEs.append(np.mean((y_train - knn_model.predict(X_train))**2))
    test_MSEs.append(np.mean((y_test - knn_model.predict(X_test))**2))
    
sys.stdout.write("\r%d%%" % 100)

100%

In [ ]:

fig, ax = plt.subplots(1, 1, figsize=(10, 10))

ax.plot(range(1, 100), train_MSEs, color='red', linewidth=1.0, label='train MSE')
ax.plot(range(1, 100), test_MSEs, color='blue', linewidth=1.0, label='test MSE')


ax.set_xlabel('k number of neighbors')
ax.set_ylabel('MSE')
ax.set_title('Train and Test MSE of the kNN model vs k')
ax.legend(loc='best')