Key Word(s): Principal Component Analysis (PCA), Demo
CS-109A Introduction to Data Science
Lecture 8: Principal Component Analysis Demo
Harvard University
Summer 2018
Instructors: Pavlos Protopapas and Kevin Rader
In [1]:
## RUN THIS CELL TO PROPERLY HIGHLIGHT THE EXERCISES
import requests
from IPython.core.display import HTML
styles = requests.get("https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css").text
HTML(styles)
Out[1]:
In [1]:
import pandas as pd
import sys
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tools import add_constant
from statsmodels.regression.linear_model import RegressionResults
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
sns.set(style="ticks")
%matplotlib inline
In [2]:
nyc_cab_df = pd.read_csv('nyc_car_hire_data.csv', low_memory=False)
In [11]:
print(nyc_cab_df.shape)
nyc_cab_df.head()
Out[11]:
In [3]:
def train_test_split(df, n_samples, validation=False):
    if validation:
        nyc_cab_sample = df.sample(n=n_samples)
        nyc_cab_sample['lpep_pickup_datetime'] = nyc_cab_sample['lpep_pickup_datetime'].apply(lambda dt: pd.to_datetime(dt).hour)
        nyc_cab_sample['Lpep_dropoff_datetime'] = nyc_cab_sample['Lpep_dropoff_datetime'].apply(lambda dt: pd.to_datetime(dt).hour)
        msk = np.random.rand(len(nyc_cab_sample)) < 0.8
        non_test = nyc_cab_sample[msk]
        test = nyc_cab_sample[~msk]
        msk = np.random.rand(len(non_test)) < 0.7
        train = non_test[msk]
        validation = non_test[~msk]
        return train, validation, test
    else:
        nyc_cab_sample = df.sample(n=n_samples)
        nyc_cab_sample['lpep_pickup_datetime'] = nyc_cab_sample['lpep_pickup_datetime'].apply(lambda dt: pd.to_datetime(dt).hour)
        nyc_cab_sample['Lpep_dropoff_datetime'] = nyc_cab_sample['Lpep_dropoff_datetime'].apply(lambda dt: pd.to_datetime(dt).hour)
        msk = np.random.rand(len(nyc_cab_sample)) < 0.8
        train = nyc_cab_sample[msk]
        test = nyc_cab_sample[~msk]
        return train, test
Dimensionality Reduction: PCA
In [4]:
train, validation, test = train_test_split(nyc_cab_df, 1000, validation=True)
y_train = train['Fare_amount'].values
y_val = validation['Fare_amount'].values
y_test = test['Fare_amount'].values
regression_model = LinearRegression(fit_intercept=True)
all_predictors = ['Trip Length (min)', 'Type', 'Trip_distance', 'TMAX', 'TMIN',
                  'lpep_pickup_datetime', 'Lpep_dropoff_datetime', 'Pickup_longitude',
                  'Pickup_latitude', 'SNOW', 'SNWD', 'PRCP']
train.describe()
Out[4]:
1. Variable Selection: Backwards
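Backwards selection starts from the full predictor set and, at each step, drops the single predictor whose removal yields the lowest AIC,

$$\mathrm{AIC} = 2k - 2\ln \hat{L},$$

where $k$ is the number of fitted parameters and $\hat{L}$ the maximized likelihood, so smaller values are better. The cell below records the best subset of each size; the subset with the smallest AIC overall is then kept.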
In [12]:
def get_aic(X_train, y_train):
    X_train = add_constant(X_train)
    model = sm.OLS(y_train, X_train).fit()
    return model.aic

X_train = train[all_predictors].values
predictors = [(all_predictors, get_aic(X_train, y_train))]

for k in range(len(all_predictors), 1, -1):
    best_k_predictors = predictors[-1][0]
    aics = []
    for predictor in best_k_predictors:
        k_minus_1 = list(set(best_k_predictors) - set([predictor]))
        X_train = train[k_minus_1].values
        aics.append(get_aic(X_train, y_train))
    best_k_minus_1 = list(set(best_k_predictors) - set([best_k_predictors[np.argmin(aics)]]))
    predictors.append((best_k_minus_1, np.min(aics)))
best_predictor_set = sorted(predictors, key=lambda t: t[1])[0]
In [13]:
best_predictor_set = sorted(predictors, key=lambda t: t[1])[0]
X_train = train[best_predictor_set[0]].values
X_val = validation[best_predictor_set[0]].values
X_test = test[best_predictor_set[0]].values
regression_model.fit(np.vstack((X_train, X_val)), np.hstack((y_train, y_val)))
print('best predictor set: {}\ntest R^2: {}'.format(best_predictor_set[0], regression_model.score(X_test, y_test)))
2. Variable Selection: LASSO Regression
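LASSO performs variable selection implicitly. With the settings below, sklearn's `Lasso` minimizes

$$\frac{1}{2n}\,\lVert y - X\beta \rVert_2^2 + \alpha\,\lVert \beta \rVert_1,$$

and the $\ell_1$ penalty typically drives some coefficients exactly to zero, which is why some entries of `lasso_regression.coef_` printed below can come out as 0.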
In [14]:
X_train = train[all_predictors].values
X_val = validation[all_predictors].values
X_non_test = np.vstack((X_train, X_val))
X_test = test[all_predictors].values
y_non_test = np.hstack((y_train, y_val))
lasso_regression = Lasso(alpha=1.0, fit_intercept=True)
lasso_regression.fit(X_non_test, y_non_test)
print('Lasso regression model:\n {} + {}^T . x'.format(lasso_regression.intercept_, lasso_regression.coef_))
In [15]:
print('Test R^2: {}'.format(lasso_regression.score(X_test, y_test)))
3. Principal Component Regression
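Principal component regression first runs PCA on the predictors: it finds orthonormal directions of maximal variance and projects the (centered) data onto the leading $m$ of them, $Z = \tilde{X} W_m$, where the columns of $W_m$ are the first $m$ principal components and $\tilde{X}$ is the column-centered predictor matrix. The response is then regressed on the scores $Z$ rather than on the original predictors.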
In [24]:
pca_all = PCA()
pca_all.fit(X_non_test)
print('First 4 principal components:\n', pca_all.components_[0:4])
print('Explained variance ratio:\n', pca_all.explained_variance_ratio_)
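A quick way to judge how many components are worth keeping is the cumulative explained variance. A minimal sketch, reusing the `pca_all` fit from the previous cell (the 95% threshold is only an illustrative choice):

In [ ]:
# cumulative share of variance captured by the first j components
cum_var = np.cumsum(pca_all.explained_variance_ratio_)
# smallest number of components capturing at least 95% of the variance
# (0.95 is an arbitrary illustrative threshold)
n_for_95 = int(np.argmax(cum_var >= 0.95)) + 1
print('cumulative explained variance:', cum_var)
print('components needed for 95% of the variance:', n_for_95)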
In [18]:
pca = PCA(n_components=4)
pca.fit(X_non_test)
X_non_test_pca = pca.transform(X_non_test)
X_test_pca = pca.transform(X_test)
print('Explained variance ratio:', pca.explained_variance_ratio_)
In [10]:
fig, ax = plt.subplots(1, 3, figsize=(20, 5))
ax[0].scatter(X_non_test_pca[:, 0], X_non_test_pca[:, 1], color='blue', alpha=0.2)
ax[0].set_title('Dimension Reduced Data')
ax[0].set_xlabel('1st Principal Component')
ax[0].set_ylabel('2nd Principal Component')
ax[1].scatter(X_non_test_pca[:, 1], X_non_test_pca[:, 2], color='red', alpha=0.2)
ax[1].set_title('Dimension Reduced Data')
ax[1].set_xlabel('2nd Principal Component')
ax[1].set_ylabel('3rd Principal Component')
ax[2].scatter(X_non_test_pca[:, 2], X_non_test_pca[:, 3], color='green', alpha=0.2)
ax[2].set_title('Dimension Reduced Data')
ax[2].set_xlabel('3rd Principal Component')
ax[2].set_ylabel('4th Principal Component')
plt.show()
In [11]:
print('first pca component:\n', pca.components_[0])
print('\nsecond pca component:\n', pca.components_[1])
print('\nthird pca component:\n', pca.components_[2])
print('\nfourth pca component:\n', pca.components_[3])
In [12]:
regression_model = LinearRegression(fit_intercept=True)
regression_model.fit(X_non_test_pca, y_non_test)
print('Test R^2: {}'.format(regression_model.score(X_test_pca, y_test)))
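PCA is sensitive to the scale of the predictors, so the components above are dominated by the columns with the largest raw variance. Below is a minimal sketch of the same four-component PCR fit with standardized predictors (`StandardScaler` is an assumption here and is not used elsewhere in this notebook; new variable names are used so the later cells are unaffected):

In [ ]:
# sketch: standardize the predictors before PCA (StandardScaler is an added assumption)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_non_test_std = scaler.fit_transform(X_non_test)
X_test_std = scaler.transform(X_test)

# PCA and regression on the standardized predictors
pca_std = PCA(n_components=4)
Z_non_test = pca_std.fit_transform(X_non_test_std)
Z_test = pca_std.transform(X_test_std)

pcr_std = LinearRegression(fit_intercept=True)
pcr_std.fit(Z_non_test, y_non_test)
print('Test R^2 (standardized predictors):', pcr_std.score(Z_test, y_test))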
4. Principal Component Regression With Polynomial and Interaction Terms
In [13]:
gen_poly_terms = PolynomialFeatures(degree=6, interaction_only=False)
min_max_scaler = MinMaxScaler()
X_non_test = min_max_scaler.fit_transform(X_non_test)
X_test = min_max_scaler.transform(X_test)
X_train_full_poly = gen_poly_terms.fit_transform(X_non_test)
X_test_full_poly = gen_poly_terms.transform(X_test)
print('number of total predictors', X_train_full_poly.shape[1])
In [14]:
pca = PCA(n_components=15)
pca.fit(X_train_full_poly)
X_train_pca = pca.transform(X_train_full_poly)
X_test_pca = pca.transform(X_test_full_poly)
print('Explained variance ratio:', pca.explained_variance_ratio_)
In [15]:
regression_model = LinearRegression(fit_intercept=True)
regression_model.fit(X_train_pca, y_non_test)
print('Test R^2: {}'.format(regression_model.score(X_test_pca, y_test)))
In [16]:
pca = PCA(n_components=45)
pca.fit(X_train_full_poly)
X_train_pca = pca.transform(X_train_full_poly)
X_test_pca = pca.transform(X_test_full_poly)
print('Explained variance ratio:', pca.explained_variance_ratio_)
In [17]:
regression_model = LinearRegression(fit_intercept=True)
regression_model.fit(X_train_pca, y_non_test)
print('Test R^2: {}'.format(regression_model.score(X_test_pca, y_test)))
Selecting the Components of PCA Using Cross Validation
In [18]:
regression_model = LinearRegression(fit_intercept=True)
kf = KFold(n_splits=5)
x_val_scores = []
for n in range(1, 80, 5):
    out = n * 1. / 80 * 100
    sys.stdout.write("\r%d%%" % out)
    sys.stdout.flush()

    validation_R_sqs = []
    for train_index, val_index in kf.split(X_train_full_poly):
        X_train, X_val = X_train_full_poly[train_index], X_train_full_poly[val_index]
        y_train, y_val = y_non_test[train_index], y_non_test[val_index]

        # fit the PCA on the training fold only, with the current number of components
        pca = PCA(n_components=n)
        pca.fit(X_train)

        X_train_pca = pca.transform(X_train)
        X_val_pca = pca.transform(X_val)

        regression_model.fit(X_train_pca, y_train)
        validation_R_sqs.append(regression_model.score(X_val_pca, y_val))

    x_val_scores.append(np.mean(validation_R_sqs))

sys.stdout.write("\r%d%%" % 100)
In [19]:
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.plot(range(1, 80, 5), x_val_scores, color='blue')
ax.set_title('Log Cross Validation Score vs. Number of PCA Components')
ax.set_xlabel('number of components')
ax.set_ylabel('Cross Validation Score')
ax.set_yscale('log')
plt.show()
In [20]:
best_n = range(1, 80, 5)[np.argmax(x_val_scores)]
pca = PCA(n_components=best_n)
pca.fit(X_train_full_poly)
X_train_pca = pca.transform(X_train_full_poly)
X_test_pca = pca.transform(X_test_full_poly)
regression_model.fit(X_train_pca, y_non_test)
test_R_sq = regression_model.score(X_test_pca, y_test)
print('best number of components is:', best_n)
print('the test R^2 for PC regression with n = {} is: {}'.format(best_n, test_R_sq))
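The search above can also be written with a `Pipeline` and `GridSearchCV`, which refit the PCA inside every fold automatically. A sketch of that alternative, reusing `X_train_full_poly`, `y_non_test`, `X_test_full_poly`, and `y_test` from the cells above:

In [ ]:
# sketch: cross-validate the number of components with a Pipeline instead of a manual loop
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pcr_pipeline = Pipeline([
    ('pca', PCA()),
    ('regression', LinearRegression(fit_intercept=True)),
])

param_grid = {'pca__n_components': list(range(1, 80, 5))}
grid = GridSearchCV(pcr_pipeline, param_grid, cv=5, scoring='r2')
grid.fit(X_train_full_poly, y_non_test)

print('best number of components:', grid.best_params_['pca__n_components'])
print('test R^2:', grid.score(X_test_full_poly, y_test))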
In [ ]: