Key Word(s): Dimensionality Reduction, PCA, Lasso Regression

In [35]:

import pandas as pd
import sys
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tools import add_constant
from statsmodels.regression.linear_model import RegressionResults
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
sns.set(style="ticks")
%matplotlib inline

In [2]:

nyc_cab_df = pd.read_csv('nyc_car_hire_data.csv', low_memory=False)

In [36]:

def train_test_split(df, n_samples, validation=False):
    if validation:
        nyc_cab_sample = df.sample(n=n_samples)

        nyc_cab_sample['lpep_pickup_datetime'] = nyc_cab_sample['lpep_pickup_datetime'].apply(lambda dt: pd.to_datetime(dt).hour)
        nyc_cab_sample['Lpep_dropoff_datetime'] = nyc_cab_sample['Lpep_dropoff_datetime'].apply(lambda dt: pd.to_datetime(dt).hour)

        msk = np.random.rand(len(nyc_cab_sample)) < 0.8
        non_test = nyc_cab_sample[msk]
        test = nyc_cab_sample[~msk]
        
        msk = np.random.rand(len(non_test)) < 0.7
        train = non_test[msk]
        validation = non_test[~msk]
        
        return train, validation, test
    
    else:
        nyc_cab_sample = df.sample(n=n_samples)

        nyc_cab_sample['lpep_pickup_datetime'] = nyc_cab_sample['lpep_pickup_datetime'].apply(lambda dt: pd.to_datetime(dt).hour)
        nyc_cab_sample['Lpep_dropoff_datetime'] = nyc_cab_sample['Lpep_dropoff_datetime'].apply(lambda dt: pd.to_datetime(dt).hour)

        msk = np.random.rand(len(nyc_cab_sample)) < 0.8
        train = nyc_cab_sample[msk]
        test = nyc_cab_sample[~msk]

        return train, test

Dimensionality Reduction: PCA¶

In [55]:

train, validation, test = train_test_split(nyc_cab_df, 1000, validation=True)

y_train = train['Fare_amount'].values
y_val = validation['Fare_amount'].values
y_test = test['Fare_amount'].values

regression_model = LinearRegression(fit_intercept=True)

all_predictors = ['Trip Length (min)', 'Type', 'Trip_distance', 'TMAX', 'TMIN', 'lpep_pickup_datetime', 'Lpep_dropoff_datetime', 'Pickup_longitude', 'Pickup_latitude', 'SNOW', 'SNWD', 'PRCP']

train.describe()

Out[55]:

	AWND	Day	Dropoff_latitude	Dropoff_longitude	Ehail_fee	Extra	Fare_amount	Lpep_dropoff_datetime	MTA_tax	PRCP	...	TMIN	Tip_amount	Tolls_amount	Total_amount	Trip_distance	Trip_type	Type	VendorID	lpep_pickup_datetime	Trip Length (min)
count	555.000000	555.000000	397.000000	397.000000	0.0	397.000000	555.000000	555.000000	397.000000	555.000000	...	555.000000	397.000000	397.000000	397.000000	555.000000	370.000000	555.000000	397.000000	555.000000	555.000000
mean	6.301261	15.544144	40.657118	-73.740833	NaN	0.360202	14.840373	13.706306	0.483627	0.304072	...	43.461261	0.970907	0.103224	14.302091	2.966048	1.016216	0.284685	1.780856	13.722523	11.745946
std	1.811899	8.811880	2.046459	3.710697	NaN	0.364079	8.991866	6.822284	0.089097	0.978401	...	5.772760	1.899930	0.790127	10.318883	2.440517	0.126477	0.451671	0.414188	6.732816	8.256622
min	2.000000	1.000000	0.000000	-74.194969	NaN	0.000000	0.000000	0.000000	0.000000	0.000000	...	31.000000	0.000000	0.000000	0.000000	0.000000	1.000000	0.000000	1.000000	0.000000	0.000000
25%	4.900000	8.000000	40.722504	-73.964455	NaN	0.000000	7.725177	9.000000	0.500000	0.000000	...	40.000000	0.000000	0.000000	8.000000	1.476206	1.000000	0.000000	2.000000	9.000000	7.000000
50%	6.000000	15.000000	40.756714	-73.938911	NaN	0.500000	13.000000	15.000000	0.500000	0.000000	...	43.000000	0.000000	0.000000	11.000000	2.342590	1.000000	0.000000	2.000000	15.000000	10.000000
75%	7.800000	24.000000	40.804909	-73.891899	NaN	0.500000	20.303457	19.000000	0.500000	0.070000	...	46.000000	1.400000	0.000000	17.500000	3.595000	1.000000	1.000000	2.000000	19.000000	14.000000
max	9.400000	30.000000	40.882172	0.000000	NaN	1.000000	73.500000	23.000000	0.500000	4.970000	...	59.000000	16.600000	9.000000	99.600000	19.740000	2.000000	1.000000	2.000000	23.000000	56.000000

8 rows × 28 columns

1. Variable Selection: Backwards¶

In [38]:

def get_aic(X_train, y_train):
    X_train = add_constant(X_train)
    model = sm.OLS(y_train, X_train).fit()
    return model.aic

X_train = train[all_predictors].values
predictors = [(all_predictors, get_aic(X_train, y_train))]

for k in range(len(all_predictors), 1, -1):
    best_k_predictors = predictors[-1][0]
    aics = []
    
    for predictor in best_k_predictors:
        k_minus_1 = list(set(best_k_predictors) - set([predictor]))
        X_train = train[k_minus_1].values

        aics.append(get_aic(X_train, y_train))
    
    best_k_minus_1 = list(set(best_k_predictors) - set([best_k_predictors[np.argmin(aics)]]))
    predictors.append((best_k_minus_1, np.min(aics)))
    
best_predictor_set = sorted(predictors, key=lambda t: t[1])[-1]

In [39]:

best_predictor_set = sorted(predictors, key=lambda t: t[1])[0]

X_train = train[best_predictor_set[0]].values
X_val = validation[best_predictor_set[0]].values  
X_test = test[best_predictor_set[0]].values  
regression_model.fit(np.vstack((X_train, X_val)), np.hstack((y_train, y_val)))

print('best predictor set: {}\ntest R^2: {}'.format(best_predictor_set[0], regression_model.score(X_test, y_test)))

best predictor set: ['Type', 'SNOW', 'Lpep_dropoff_datetime', 'SNWD', 'Trip_distance', 'Trip Length (min)']
test R^2: 0.613997587194766

2. Variable Selection: LASSO Regression¶

In [40]:

X_train = train[all_predictors].values
X_val = validation[all_predictors].values
X_non_test = np.vstack((X_train, X_val))
X_test = test[all_predictors].values

y_non_test = np.hstack((y_train, y_val))

lasso_regression = Lasso(alpha=1.0, fit_intercept=True)
lasso_regression.fit(X_non_test, y_non_test)

print('Lasso regression model:\n {} + {}^T . x'.format(lasso_regression.intercept_, lasso_regression.coef_))

Lasso regression model:
 4.718051298020281 + [ 0.32545776  4.82421529  1.73898646  0.          0.         -0.
 -0.00684636  0.         -0.          0.          0.          0.        ]^T . x

Out[40]:

	AWND	Day	Dropoff_latitude	Dropoff_longitude	Ehail_fee	Extra	Fare_amount	Lpep_dropoff_datetime	MTA_tax	PRCP	...	TMIN	Tip_amount	Tolls_amount	Total_amount	Trip_distance	Trip_type	Type	VendorID	lpep_pickup_datetime	Trip Length (min)
count	563.000000	563.000000	382.000000	382.000000	0.0	382.000000	563.000000	563.000000	382.000000	563.000000	...	563.000000	382.000000	382.000000	382.000000	563.000000	353.000000	563.000000	382.000000	563.000000	563.000000
mean	6.295915	15.884547	40.758286	-73.933923	NaN	0.393979	14.486707	13.712256	0.484293	0.313375	...	43.358792	1.179607	0.145916	13.872644	2.786539	1.025496	0.321492	1.801047	13.873890	10.763766
std	1.793266	8.932875	0.057298	0.047643	NaN	0.376321	8.727789	6.738822	0.087331	1.030186	...	5.660867	2.409299	0.860186	9.865012	2.278020	0.157849	0.467464	0.399736	6.582902	7.868309
min	2.000000	1.000000	40.588928	-74.036293	NaN	0.000000	0.000000	0.000000	0.000000	0.000000	...	31.000000	0.000000	0.000000	0.000000	0.000000	1.000000	0.000000	1.000000	0.000000	0.000000
25%	4.900000	8.000000	40.718020	-73.967112	NaN	0.000000	7.000000	9.000000	0.500000	0.000000	...	40.000000	0.000000	0.000000	7.500000	1.260000	1.000000	0.000000	2.000000	9.000000	6.000000
50%	5.800000	16.000000	40.755747	-73.942505	NaN	0.500000	12.500000	15.000000	0.500000	0.000000	...	43.000000	0.000000	0.000000	10.225000	2.150000	1.000000	0.000000	2.000000	15.000000	9.000000
75%	8.300000	24.000000	40.804507	-73.906462	NaN	0.500000	20.989595	19.000000	0.500000	0.050000	...	46.000000	1.895000	0.000000	16.787500	3.600677	1.000000	1.000000	2.000000	19.000000	13.000000
max	9.400000	30.000000	40.911869	-73.783318	NaN	1.000000	52.000000	23.000000	0.500000	4.970000	...	59.000000	21.000000	5.330000	68.250000	17.500000	2.000000	1.000000	2.000000	23.000000	50.000000

8 rows × 28 columns

In [41]:

print('Test R^2: {}'.format(lasso_regression.score(X_test, y_test)))

Test R^2: 0.5640977221609269

3. Principal Component Regression¶

In [42]:

pca = PCA(n_components=4)
pca.fit(X_non_test)
X_non_test_pca = pca.transform(X_non_test)
X_test_pca = pca.transform(X_test)

print('Explained variance ratio:', pca.explained_variance_ratio_)

Explained variance ratio: [ 0.35266104  0.30974822  0.23533029  0.04614604]

In [43]:

fig, ax = plt.subplots(1, 3, figsize=(20, 5))

ax[0].scatter(X_non_test_pca[:, 0], X_non_test_pca[:, 1], color='blue', alpha=0.2, label='train R^2')

ax[0].set_title('Dimension Reduced Data')
ax[0].set_xlabel('1st Principal Component')
ax[0].set_ylabel('2nd Principal Component')

ax[1].scatter(X_non_test_pca[:, 1], X_non_test_pca[:, 2], color='red', alpha=0.2, label='train R^2')

ax[1].set_title('Dimension Reduced Data')
ax[1].set_xlabel('2nd Principal Component')
ax[1].set_ylabel('3rd Principal Component')

ax[2].scatter(X_non_test_pca[:, 2], X_non_test_pca[:, 3], color='green', alpha=0.2, label='train R^2')

ax[2].set_title('Dimension Reduced Data')
ax[2].set_xlabel('3rd Principal Component')
ax[2].set_ylabel('4th Principal Component')

plt.show()

In [44]:

print('first pca component:', pca.components_[0])
print('\nsecond pca component:', pca.components_[1])
print('\nthird pca component:', pca.components_[2])
print('\nfourth pca component:', pca.components_[3])

first pca component: [-0.00535566 -0.0024424   0.00535312  0.84837581  0.52686147 -0.0347733
 -0.02516945 -0.0151692   0.00857694 -0.         -0.         -0.02158279]

second pca component: [ -2.78996297e-02  -7.78645860e-04  -8.41287443e-04  -4.07707358e-02
  -1.39591187e-02  -6.97761616e-01  -7.14020027e-01   2.26866742e-02
  -1.23425543e-02   0.00000000e+00   0.00000000e+00  -5.39458392e-04]

third pca component: [ 0.96942319 -0.00574121  0.24273353 -0.00318404  0.00956019 -0.01553881
 -0.02365233 -0.01653702  0.00871489  0.          0.         -0.0037816 ]

fourth pca component: [ 0.00906176 -0.00097938  0.00496818  0.52618728 -0.84884713 -0.01894907
  0.00571241  0.02344169 -0.01339048  0.          0.         -0.03684045]

In [45]:

regression_model = LinearRegression(fit_intercept=True)
regression_model.fit(X_non_test_pca, y_non_test)

print('Test R^2: {}'.format(regression_model.score(X_test_pca, y_test)))

Test R^2: 0.2867990583026705

3. Principal Component Regression With Polynomial and Interaction Terms¶

In [46]:

gen_poly_terms = PolynomialFeatures(degree=6, interaction_only=False)

min_max_scaler = MinMaxScaler()
X_non_test = min_max_scaler.fit_transform(X_non_test)
X_test = min_max_scaler.fit_transform(X_test)

X_train_full_poly = gen_poly_terms.fit_transform(X_non_test)
X_test_full_poly = gen_poly_terms.fit_transform(X_test)

print('number of total predictors', X_train_full_poly.shape[1])

number of total predictors 18564

In [47]:

pca = PCA(n_components=15)
pca.fit(X_train_full_poly)
X_train_pca = pca.transform(X_train_full_poly)
X_test_pca = pca.transform(X_test_full_poly)

print('Explained variance ratio:', pca.explained_variance_ratio_)

Explained variance ratio: [ 0.35464084  0.25203523  0.14403337  0.06464052  0.03057768  0.02686149
  0.0234663   0.01698414  0.01288877  0.01112968  0.00834928  0.00610953
  0.00562714  0.00519129  0.00386799]

In [48]:

regression_model = LinearRegression(fit_intercept=True)
regression_model.fit(X_train_pca, y_non_test)

print('Test R^2: {}'.format(regression_model.score(X_test_pca, y_test)))

Test R^2: 0.5393636355874347

In [49]:

pca = PCA(n_components=45)
pca.fit(X_train_full_poly)
X_train_pca = pca.transform(X_train_full_poly)
X_test_pca = pca.transform(X_test_full_poly)

print('Explained variance ratio:', pca.explained_variance_ratio_)

Explained variance ratio: [  3.54640840e-01   2.52035233e-01   1.44033366e-01   6.46405244e-02
   3.05776837e-02   2.68614888e-02   2.34663024e-02   1.69841411e-02
   1.28887651e-02   1.11296789e-02   8.34928245e-03   6.10953399e-03
   5.62714043e-03   5.19129406e-03   3.86799024e-03   3.43133791e-03
   3.08965850e-03   2.37287925e-03   2.28753078e-03   2.18918876e-03
   2.13384153e-03   1.80170559e-03   1.59511606e-03   1.35052783e-03
   1.18886197e-03   1.09513688e-03   9.56049716e-04   8.10425592e-04
   7.81231273e-04   6.72013350e-04   6.22551371e-04   6.00057274e-04
   4.38652587e-04   4.11883879e-04   4.07211026e-04   3.54631015e-04
   3.40884616e-04   3.02147913e-04   2.72643412e-04   2.59783246e-04
   2.39834877e-04   2.12233315e-04   2.01769670e-04   2.00545384e-04
   1.86754908e-04]

In [50]:

regression_model = LinearRegression(fit_intercept=True)
regression_model.fit(X_train_pca, y_non_test)

print('Test R^2: {}'.format(regression_model.score(X_test_pca, y_test)))

Test R^2: 0.5756401842246859

Selecting the Components of PCA Using Cross Validation¶

In [51]:

regression_model = LinearRegression(fit_intercept=True)
kf = KFold(n_splits=5)

x_val_scores = []

for n in range(1, 80, 5):
    out = n * 1. / 80 * 100
    sys.stdout.write("\r%d%%" % out)
    sys.stdout.flush()
    
    pca = PCA(n_components=15)
    pca.fit(X_train_full_poly)
    
    validation_R_sqs = []
    for train_index, val_index in kf.split(X_train_pca):
        X_train, X_val = X_train_full_poly[train_index], X_train_full_poly[val_index]
        y_train, y_val = y_non_test[train_index], y_non_test[val_index]

        X_train_pca = pca.transform(X_train)
        X_val_pca = pca.transform(X_val)
        
        regression_model.fit(X_train_pca, y_train)
        validation_R_sqs.append(regression_model.score(X_val_pca, y_val))
        
    x_val_scores.append(np.mean(validation_R_sqs))
    
sys.stdout.write("\r%d%%" % 100)

100%

In [52]:

fig, ax = plt.subplots(1, 1, figsize=(5, 5))

ax.plot(range(1, 80, 5), x_val_scores, color='blue')

ax.set_title('Log Cross Validation Score vs. Number of PCA Components')
ax.set_xlabel('number of components')
ax.set_ylabel('Cross Validation Score')
ax.set_yscale('log')

plt.show()

In [54]:

best_n = range(1, 80, 5)[np.argmax(x_val_scores)]

pca = PCA(n_components=best_n)
pca.fit(X_train_full_poly)
X_train_pca = pca.transform(X_train_full_poly)
X_test_pca = pca.transform(X_test_full_poly)

regression_model.fit(X_train_pca, y_non_test)
test_R_sq = regression_model.score(X_test_pca, y_test)

print('best regularization param is:', best_n)
print('the test R^2 for PC regression with n = {} is: {}'.format(best_n, test_R_sq))

best regularization param is: 26
the test R^2 for PC regression with n = 26 is: 0.5911750507980804