CS109A Introduction to Data Science

Lecture 14 (PCA and High Dimensionality)

Harvard University
Fall 2019
Instructors: Pavlos Protopapas, Kevin Rader, and Chris Tanner


In [ ]:
import pandas as pd
import sys
import numpy as np
import scipy as sp
import sklearn as sk
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
In [ ]:
heart_df = pd.read_csv('../data/Heart.csv')
In [ ]:
print(heart_df.shape)
heart_df.head()
In [ ]:
heart_df.describe()
In [ ]:
# For pedagogical purposes, let's simplify our lives and use just 4 predictors
X = heart_df[['Age','RestBP','Chol','MaxHR']]
y = 1*(heart_df['AHD']=='Yes')
In [ ]:
# Fit the 'full' logistic model on all 4 predictors (C is huge, so effectively unregularized) and print the coefficients
logit_full = LogisticRegression(C=1000000,solver="lbfgs").fit(X,y)

beta = logit_full.coef_[0]

print(beta)
In [ ]:
# Investigate what happens when two identical (perfectly collinear) predictors are used

logit1 = LogisticRegression(C=1000000,solver="lbfgs").fit(heart_df[['Age']],y)
logit2 = LogisticRegression(C=1000000,solver="lbfgs").fit(heart_df[['Age','Age']],y)

print("The coef estimate for Age (when in the model once):",logit1.coef_)
print("The coef estimates for Age (when in the model twice):",logit2.coef_)
In [ ]:
X = heart_df[['Age','RestBP','Chol','MaxHR']]

# create/fit the 'full' pca transformation
pca = PCA().fit(X)

# apply the pca transformation to the full predictor set
pcaX = pca.transform(X)

# convert to a data frame
pcaX_df = pd.DataFrame(pcaX, columns=['PCA1', 'PCA2', 'PCA3', 'PCA4'])

# here are the weightings (eigenvectors) of the variables (the first 2, at least)
print("First PCA Component (w1):",pca.components_[0,:])
print("Second PCA Component (w2):",pca.components_[1,:])

# here is the variance explained:
print("Variance explained by each component:",pca.explained_variance_ratio_)
In [ ]:
# Plot the response over the first 2 PCA component vectors

plt.scatter(pcaX_df['PCA1'][y==0],pcaX_df['PCA2'][y==0])
plt.scatter(pcaX_df['PCA1'][y==1],pcaX_df['PCA2'][y==1])

plt.legend(["AHD = No","AHD = Yes"])
plt.xlabel("First PCA Component Vector (Z1)")
plt.ylabel("Second PCA Component Vector (Z2)");
In [ ]:
logit_pcr1 = LogisticRegression(C=1000000,solver="lbfgs").fit(pcaX_df[['PCA1']],y)

print("Intercept from simple PCR-Logistic:",logit_pcr1.intercept_)
print("'Slope' from simple PCR-Logistic:", logit_pcr1.coef_)

print("First PCA Component (w1):",pca.components_[0,:])
In [ ]:
# Fit the PCR models using the first 2, 3, and 4 principal components

logit_pcr2 = LogisticRegression(C=1000000,solver="lbfgs").fit(pcaX_df[['PCA1','PCA2']],y)
logit_pcr3 = LogisticRegression(C=1000000,solver="lbfgs").fit(pcaX_df[['PCA1','PCA2','PCA3']],y)
logit_pcr4 = LogisticRegression(C=1000000,solver="lbfgs").fit(pcaX_df[['PCA1','PCA2','PCA3','PCA4']],y)

# Back-calculate the implied coefficients on the original predictors:
# beta_j = sum over components k of (gamma_k * w_k,j)
pcr1 = (logit_pcr1.coef_*np.transpose(pca.components_[0:1,:])).sum(axis=1)
pcr2 = (logit_pcr2.coef_*np.transpose(pca.components_[0:2,:])).sum(axis=1)
pcr3 = (logit_pcr3.coef_*np.transpose(pca.components_[0:3,:])).sum(axis=1)
pcr4 = (logit_pcr4.coef_*np.transpose(pca.components_[0:4,:])).sum(axis=1)

print(pcr1)
print(pcr2)
print(pcr3)
print(pcr4)

# Stack the back-calculated coefficients alongside the full logistic fit
results = np.vstack((pcr1,pcr2,pcr3,pcr4,beta))
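Since using all 4 components is just a rotation of the predictor space, the back-calculated PCR4 coefficients should essentially match the full logistic fit (the intercept absorbs PCA's centering). A quick check, assuming the objects above:
In [ ]:
# PCR on all 4 components should recover the full logistic coefficients (difference near zero)
print("Max abs difference, PCR4 vs full logistic:", np.abs(pcr4 - beta).max())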
In [ ]:
plt.plot(['PCR1', 'PCR2', 'PCR3', 'PCR4', 'Logistic'], results)

plt.ylabel("Back-calculated Beta Coefficients");

plt.legend(X.columns);
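Unscaled PCA is driven by whichever predictors happen to have the largest raw variances, which is why we standardize before redoing the analysis below. A quick look at the raw variances (a sketch):
In [ ]:
# Raw variances of the 4 predictors: the unscaled PCA is dominated by the largest of these
print(X.var(axis=0))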
In [ ]:
# Standardize the predictors (mean 0, variance 1) before running PCA
scaler = StandardScaler()
scaler.fit(X)
Z = scaler.transform(X)
pca = PCA(n_components=4).fit(Z)
pcaZ = pca.transform(Z)
pcaZ_df = pd.DataFrame(pcaZ, columns=['PCA1', 'PCA2', 'PCA3', 'PCA4'])

print("First PCA Component (w1):",pca.components_[0,:])
print("Second PCA Component (w2):",pca.components_[1,:])
In [ ]:
# Fit the 'full' logistic model on the 4 standardized predictors and print out the coefficients
logit_full = LogisticRegression(C=1000000,solver="lbfgs").fit(Z,y)

betaZ = logit_full.coef_[0]

print("Logistic coef. on standardized predictors:",betaZ)
In [ ]:
# Fit the PCR models on the standardized predictors
logit_pcr1Z = LogisticRegression(C=1000000,solver="lbfgs").fit(pcaZ_df[['PCA1']],y)
logit_pcr2Z = LogisticRegression(C=1000000,solver="lbfgs").fit(pcaZ_df[['PCA1','PCA2']],y)
logit_pcr3Z = LogisticRegression(C=1000000,solver="lbfgs").fit(pcaZ_df[['PCA1','PCA2','PCA3']],y)
logit_pcr4Z = LogisticRegression(C=1000000,solver="lbfgs").fit(pcaZ_df[['PCA1','PCA2','PCA3','PCA4']],y)

# Back-calculate the implied coefficients on the standardized predictors
pcr1Z = (logit_pcr1Z.coef_*np.transpose(pca.components_[0:1,:])).sum(axis=1)
pcr2Z = (logit_pcr2Z.coef_*np.transpose(pca.components_[0:2,:])).sum(axis=1)
pcr3Z = (logit_pcr3Z.coef_*np.transpose(pca.components_[0:3,:])).sum(axis=1)
pcr4Z = (logit_pcr4Z.coef_*np.transpose(pca.components_[0:4,:])).sum(axis=1)

resultsZ = np.vstack((pcr1Z,pcr2Z,pcr3Z,pcr4Z,betaZ))
print(resultsZ)

plt.plot(['PCR1-Z', 'PCR2-Z', 'PCR3-Z', 'PCR4-Z', 'Logistic'], resultsZ)

plt.ylabel("Back-calculated Beta Coefficients");

plt.legend(X.columns);
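In practice the number of components would be chosen by cross-validation rather than by eye. A minimal sketch using a sklearn Pipeline (the 5-fold split and default accuracy scoring are assumptions, not part of the lecture code):
In [ ]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

# Cross-validate PCR with k = 1..4 components to pick the best k
for k in range(1, 5):
    pipe = Pipeline([('scale', StandardScaler()),
                     ('pca', PCA(n_components=k)),
                     ('logit', LogisticRegression(C=1000000, solver="lbfgs"))])
    scores = cross_val_score(pipe, X, y, cv=5)
    print("k =", k, "mean CV accuracy =", scores.mean().round(3))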