Key Word(s): Regression, kNN, Scikit-Learn
Lecture 4: Introduction to Regression¶
Data Science 1: CS 109A/STAT 121A/AC 209A/ E 109A
Instructors: Pavlos Protopapas, Kevin Rader, Rahul Dave, Margo Levine
Harvard University
Fall 2017
In [1]:
import pandas as pd
import sys
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

sns.set(style="ticks")
sns.set_context('poster')
%matplotlib inline

pd.set_option('display.width', 1500)
pd.set_option('display.max_columns', 100)
Step #1: Load and Explore Data¶
In [2]:
# Get the full data set from:
# https://drive.google.com/file/d/0B28c493CP9GtRHFVM0U0SVI2Yms/view?usp=sharing
# low_memory=False reads the file in one pass so pandas infers consistent
# dtypes for columns with mixed-type values
nyc_cab_df = pd.read_csv('data/nyc_car_hire_data.csv', low_memory=False)
In [3]:
nyc_cab_df.shape
Out[3]:
In [4]:
nyc_cab_df.head()
Out[4]:
In [5]:
nyc_cab_df.describe()
Out[5]:
In [6]:
nyc_cab_df.Passenger_count.hist()
Out[6]:
In [7]:
nyc_cab_df.dtypes
Out[7]:
In [8]:
nyc_cab_df.columns
Out[8]:
Clean data¶
How would you clean the data? Some possibilities, sketched in code after the list:
- Extreme/outliers
- Impute missing values
- Drop some columns?
- Drop rows with problematic values
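A minimal sketch of these steps, using columns that appear elsewhere in this notebook (Fare_amount, Trip Length (min), Passenger_count); the specific thresholds and fill values are illustrative choices, not part of the lecture:
In [ ]:
# Illustrative cleaning pipeline; the cutoffs below are hypothetical
cleaned = nyc_cab_df.copy()

# Drop rows with problematic values: non-positive fares or trip lengths
cleaned = cleaned[(cleaned['Fare_amount'] > 0) & (cleaned['Trip Length (min)'] > 0)]

# Trim extreme outliers above the 99th percentile of fare
fare_cap = cleaned['Fare_amount'].quantile(0.99)
cleaned = cleaned[cleaned['Fare_amount'] <= fare_cap]

# Impute missing passenger counts with the median
cleaned['Passenger_count'] = cleaned['Passenger_count'].fillna(cleaned['Passenger_count'].median())

# Drop columns that are more than half missing
cleaned = cleaned.dropna(axis=1, thresh=int(0.5 * len(cleaned)))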
In [9]:
nyc_cab_sample = nyc_cab_df.sample(n=1000, random_state=6)
fares = nyc_cab_sample['Fare_amount'].values
trip_lengths = nyc_cab_sample[['Trip Length (min)']].values
In [10]:
fig, ax = plt.subplots(1, 1, figsize=(10, 7))
ax.scatter(trip_lengths, fares, color='gray', alpha=0.1, s=20)
ax.set_xlabel('Trip Length (min)')
ax.set_ylabel('Trip Fare ($)')
ax.set_title('NYC Car Hire Data:\n Trip Duration vs Fare Scatter Plot')
Out[10]:
Step #2: Modeling the Data¶
In [11]:
y = nyc_cab_sample['Fare_amount'].values
# double brackets keep X two-dimensional, the (n_samples, n_features) shape sklearn expects
X = nyc_cab_sample[['Trip Length (min)']].values
X.shape
Out[11]:
In [26]:
regression = LinearRegression(fit_intercept=True)
regression.fit(X, y)
# this is the estimated statistical model (f_hat) using the predicted coefficients
regression_line = lambda x: regression.intercept_ + regression.coef_ * x
print('The estimated equation of the regression line is: {} + {} * x'.format(regression.intercept_, regression.coef_[0]))
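As a quick cross-check (not part of the original lecture), a degree-1 least-squares fit with NumPy's polyfit should recover essentially the same intercept and slope:
In [ ]:
# np.polyfit returns coefficients highest degree first: [slope, intercept]
slope, intercept = np.polyfit(X.ravel(), y, deg=1)
print('polyfit estimate: {} + {} * x'.format(intercept, slope))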
In [14]:
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
x_vals = np.linspace(0, 70, 100)
ax.plot(x_vals, regression_line(x_vals), color='red', linewidth=1.0, label='regression line')
ax.scatter(X, y, color='gray', alpha=0.2, label='data')
ax.set_xlabel('Trip Length (min)')
ax.set_ylabel('Trip Fare ($)')
ax.set_title('NYC Car Hire Data:\n Trip Duration vs Fare Scatter Plot')
ax.legend(loc='best')
Out[14]:
Step #3: Evaluate and Interpret the Model¶
1. Train vs Test Error¶
In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=6)

# refit on the training set only, so the test set is genuinely held out
regression.fit(X_train, y_train)

train_MSE = np.mean((y_train - regression.predict(X_train))**2)
test_MSE = np.mean((y_test - regression.predict(X_test))**2)
print('The train MSE is {}, the test MSE is {}'.format(train_MSE, test_MSE))
Why is the MSE for test higher? The coefficients were chosen to minimize squared error on the training points, so the fit is partly tuned to their noise; the held-out test points were never seen during fitting and therefore typically incur larger error.
2. Uncertainty in the Model Parameter Estimates¶
In [16]:
def find_regression_params(regression_model, samples):
    # randomly draw `samples` rows from the full data set
    nyc_cab_sample = nyc_cab_df.sample(n=samples)
    y = nyc_cab_sample['Fare_amount'].values
    X = nyc_cab_sample[['Trip Length (min)']].values
    regression_model.fit(X, y)
    return regression_model.intercept_, regression_model.coef_[0]
In [17]:
regression_model = LinearRegression(fit_intercept=True)

total_draws = 500
samples = 1000

regression_params = []
for i in range(total_draws):
    if i % 10 == 0:
        out = i * 1. / total_draws * 100
        sys.stdout.write("\r%d%%" % out)
        sys.stdout.flush()
    regression_params.append(find_regression_params(regression_model, samples))
sys.stdout.write("\r%d%%" % 100)

regression_params = np.array(regression_params)
In [18]:
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
ax[0].hist(regression_params[:, 0], bins=50, color='blue', edgecolor='white', linewidth=1, alpha=0.2)
ax[0].axvline(x=regression_params[:, 0].mean(), color='red', label='mean = {0:.2f}'.format(regression_params[:, 0].mean()))
ax[0].axvline(x=regression_params[:, 0].mean() - 2 * regression_params[:, 0].std(), color='green', linestyle='--', label='mean ± 2 std (std = {0:.2f})'.format(regression_params[:, 0].std()))
ax[0].axvline(x=regression_params[:, 0].mean() + 2 * regression_params[:, 0].std(), color='green', linestyle='--')
ax[0].set_xlabel('Intercept')
ax[0].set_ylabel('Frequency')
ax[0].set_title('Histogram of Estimates of Intercept')
ax[0].legend(loc='best')
ax[1].hist(regression_params[:, 1], bins=50, color='gray', edgecolor='white', linewidth=1, alpha=0.2)
ax[1].axvline(x=regression_params[:, 1].mean(), color='red', label='mean = {0:.2f}'.format(regression_params[:, 1].mean()))
ax[1].axvline(x=regression_params[:, 1].mean() - 2 * regression_params[:, 1].std(), color='green', linestyle='--', label='mean ± 2 std (std = {0:.2f})'.format(regression_params[:, 1].std()))
ax[1].axvline(x=regression_params[:, 1].mean() + 2 * regression_params[:, 1].std(), color='green', linestyle='--')
ax[1].set_xlabel('Slope')
ax[1].set_ylabel('Frequency')
ax[1].set_title('Histogram of Estimates of Slope')
ax[1].legend(loc='best')
plt.tight_layout()
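The simulation above estimates the sampling variability of the parameters empirically. For comparison, classical OLS theory gives analytic standard errors in closed form; a sketch using statsmodels (assuming it is installed; not part of the original lecture):
In [ ]:
import statsmodels.api as sm

# Analytic OLS standard errors on one sample of n=1000, to compare
# against the simulated spread in the histograms above
sample = nyc_cab_df.sample(n=1000, random_state=6)
X_sm = sm.add_constant(sample[['Trip Length (min)']].values)
ols_fit = sm.OLS(sample['Fare_amount'].values, X_sm).fit()
print('Analytic SEs (intercept, slope):', ols_fit.bse)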
3. The Effect of Sample Size on Uncertainty¶
In [19]:
regression_model = LinearRegression(fit_intercept=True)

def compute_SE(total_draws, samples, regression_model):
    regression_params = []
    for i in range(total_draws):
        regression_params.append(find_regression_params(regression_model, samples))
    regression_params = np.array(regression_params)
    return np.std(regression_params[:, 0]), np.std(regression_params[:, 1])

total_draws = 100
samples = range(100, 10001, 900)

ses = []
for i in range(len(samples)):
    out = i * 1. / len(samples) * 100
    sys.stdout.write("\r%d%%" % out)
    sys.stdout.flush()
    ses.append(compute_SE(total_draws, samples[i], regression_model))
sys.stdout.write("\r%d%%" % 100)
In [20]:
ses = np.array(ses)
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
ax.plot(samples, ses[:, 0], color='teal', label='SE of intercept')
ax.plot(samples, ses[:, 1], color='brown', label='SE of slope')
ax.set_xlabel('Number of Samples')
ax.set_ylabel('SE')
ax.set_title('Uncertainty in Parameter Estimates vs Sample Size')
ax.legend(loc='best')
Out[20]:
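The curves should decay roughly like 1/√n, the usual rate for standard errors. A sketch overlaying that reference curve on the slope SE (the scaling constant c is fit by least squares here purely for illustration):
In [ ]:
# Overlay a c / sqrt(n) reference curve to check the expected 1/sqrt(n) decay;
# c minimizes the squared error between ses[:, 1] and c / sqrt(n)
n_vals = np.array(samples)
c = np.sum(ses[:, 1] / np.sqrt(n_vals)) / np.sum(1. / n_vals)

fig, ax = plt.subplots(1, 1, figsize=(10, 5))
ax.plot(n_vals, ses[:, 1], color='brown', label='SE of slope')
ax.plot(n_vals, c / np.sqrt(n_vals), color='black', linestyle='--', label='c / sqrt(n)')
ax.set_xlabel('Number of Samples')
ax.set_ylabel('SE')
ax.legend(loc='best')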
Step #2: Consider a Different Model for the Data - kNN¶
In [21]:
nyc_cab_sample = nyc_cab_df.sample(n=1000, random_state=1)
y = nyc_cab_sample['Fare_amount'].values
X = nyc_cab_sample[['Trip Length (min)']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
In [22]:
knn_model = KNeighborsRegressor(n_neighbors=2)
knn_model.fit(X_train, y_train)
Out[22]:
In [23]:
fig, ax = plt.subplots(1, 2, figsize=(15, 8))
ax[0].scatter(X_train, knn_model.predict(X_train), color='red', alpha=0.1, label='knn model')
ax[0].scatter(X_train, y_train, color='gray', alpha=0.1, label='training data')
ax[0].set_xlabel('Trip Length (min)')
ax[0].set_ylabel('Trip Fare ($)')
ax[0].set_title('NYC Car Hire Data:\n Trip Duration vs Fare (Train)')
ax[0].legend(loc='best')
ax[1].scatter(X_test, knn_model.predict(X_test), color='blue', alpha=0.1, label='knn model')
ax[1].scatter(X_test, y_test, color='gray', alpha=0.1, label='testing data')
ax[1].set_xlabel('Trip Length (min)')
ax[1].set_ylabel('Trip Fare ($)')
ax[1].set_title('NYC Car Hire Data:\n Trip Duration vs Fare (Test)')
ax[1].legend(loc='best')
Out[23]:
In [24]:
train_MSE = np.mean((y_train - knn_model.predict(X_train))**2)
test_MSE = np.mean((y_test - knn_model.predict(X_test))**2)
print('The train MSE is {}, the test MSE is {}'.format(train_MSE, test_MSE))
Step #3: Evaluate and Interpret the kNN Model¶
In [25]:
train_MSEs = []
test_MSEs = []
for k in range(1, 100):
    sys.stdout.write("\r%d%%" % k)
    sys.stdout.flush()
    knn_model = KNeighborsRegressor(n_neighbors=k)
    knn_model.fit(X_train, y_train)
    train_MSEs.append(np.mean((y_train - knn_model.predict(X_train))**2))
    test_MSEs.append(np.mean((y_test - knn_model.predict(X_test))**2))
sys.stdout.write("\r%d%%" % 100)
In [ ]:
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.plot(range(1, 100), train_MSEs, color='red', linewidth=1.0, label='train MSE')
ax.plot(range(1, 100), test_MSEs, color='blue', linewidth=1.0, label='test MSE')
ax.set_xlabel('k number of neighbors')
ax.set_ylabel('MSE')
ax.set_title('Train and Test MSE of the kNN model vs k')
ax.legend(loc='best')
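To read the optimum off the curve rather than the plot, one can take the k that minimizes the test MSE (a small follow-up, not in the original lecture; in practice cross-validation would be the more reliable way to choose k):
In [ ]:
# k values start at 1, so add 1 to the index of the smallest test MSE
best_k = int(np.argmin(test_MSEs)) + 1
print('k = {} minimizes the test MSE ({:.2f})'.format(best_k, test_MSEs[best_k - 1]))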