Title: Lecture Notebook
Hints:
Statsmodels
statsmodels is a Python module that provides classes and functions for the estimation of many different statistical models, as well as for conducting statistical tests, and statistical data exploration.
Basic code structure is shown below:
import statsmodels.api as sm
# X is our dataset of predictors, y is the response
# Add intercept (bias constant):
X = sm.add_constant(X)
# Fit regression model:
results = sm.OLS(y, X).fit()
# Inspect the results:
print(results.summary())
CS109A Introduction to Data Science
Principal Component Analysis
Harvard University
Fall 2021
Instructors: Pavlos Protopapas, Natesh Pillai
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import plotly
import plotly.graph_objs as go
import sklearn as sk
from sklearn.decomposition import PCA
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
pd.set_option("display.width", 500)
pd.set_option("display.max_columns", 100)
sns.set_style("darkgrid")
sns.set_palette("colorblind")
PCA
Part 0: Reading the data
In this notebook, we will be using a Heart dataset. The variables we will be using today include:

AHD: whether or not the patient presents atherosclerotic heart disease (a heart attack): Yes or No
Sex: a binary indicator for whether the patient is male (Sex=1) or female (Sex=0)
Age: age of patient, in years
MaxHR: the maximum heart rate of the patient based on exercise testing
RestBP: the resting systolic blood pressure of the patient
Chol: the HDL cholesterol level of the patient
Oldpeak: ST depression induced by exercise relative to rest (on an ECG)
Slope: the slope of the peak exercise ST segment (1 = upsloping; 2 = flat; 3 = downsloping)
Ca: number of major vessels (0-3) colored by fluoroscopy

For further information on the dataset, please see the UC Irvine Machine Learning Repository.
df_heart = pd.read_csv("Heart.csv")
# Force the response into a binary indicator:
df_heart["AHD"] = (df_heart["AHD"] == "Yes").astype("int")
print(df_heart.shape)
df_heart.head()
df_heart.drop(columns = ['Unnamed: 0'], inplace=True)
df_heart
Here are some basic summaries and EDA from last time:
df_heart.describe()
pd.crosstab(df_heart["Sex"], df_heart["AHD"])
pd.crosstab(df_heart["Thal"], df_heart["AHD"])
pd.crosstab(df_heart["ChestPain"], df_heart["AHD"])
_ = sns.histplot(data=df_heart, x="Age", hue="AHD")
_ = sns.histplot(data=df_heart, x="MaxHR", hue="AHD")
Part 1: Principal Components Analysis (PCA)
Q1.1 Just a sidebar (and a curiosity): what happens when two identical copies of a predictor are used in linear regression? Is an error raised? Should one be? Investigate by predicting AHD from two copies of Age, and compare to the simple linear regression model with Age alone.
X = sm.add_constant(df_heart[["Age"]])
y = df_heart["AHD"]
reg1 = sm.OLS(y, X).fit()
reg1.summary()
Solution:
The single coefficient for Age is distributed equally across the two predictors. This is a very reasonable approach as predictions will still be stable.
# investigating what happens when two identical predictors are used
######
# your code here
######
X = sm.add_constant(df_heart[["Age", "Age"]])
reg2 = sm.OLS(y, X).fit()
print(reg2.summary())
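As a quick check (an added sketch, not from the original notebook): even though the coefficient on Age is split across the duplicated columns, the two models should produce identical fitted values, since both design matrices span the same column space.
# The duplicated-predictor model spans the same column space as the single-Age model,
# so the fitted values should agree up to numerical precision.
print(np.allclose(reg1.fittedvalues, reg2.fittedvalues))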
We will apply PCA to the heart dataset using just 7 predictors (remember: PCA is typically used when dimensionality is high, i.e., lots of predictors, but this smaller set will help us get our heads around what is going on):
columns = ["Age", "RestBP", "Chol", "MaxHR", "Sex", "Oldpeak", "Slope"]
X = df_heart[columns]
y = df_heart["AHD"]
X.describe()
X.corr()
First, let's fit the full linear regression model to predict AHD from the 7 predictors above.
Remember: PCA is an approach to handling the predictors, so it does not matter if we are using it for a regression or classification type problem.
reg_full = sm.OLS(y, sm.add_constant(X)).fit()
reg_full.summary()
Q1.2 Is there any evidence of multicollinearity in the set of predictors? How do you know? How will PCA handle these correlations?
Solution:
# VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
vif_data
The high VIFs indicate that there is multicollinearity among these predictors. PCA will handle these correlations by replacing the original, correlated predictors with orthogonal (uncorrelated) linear combinations of them.
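As a rough screen (an added sketch; the VIF > 5 cutoff is a common rule of thumb, not something from the original notebook), we can flag the offending features directly:
# Features whose VIF exceeds the common rule-of-thumb threshold of 5
print(vif_data[vif_data["VIF"] > 5])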
Part 2: PCA in Regression (PCR)
Next we apply the PCA transformation in a few steps, and show some of the results below:
# create/fit the 'full' pca transformation
pca = PCA().fit(X)
# apply the pca transformation to the full predictor set
pcaX = pca.transform(X)
# convert to a data frame
pcr_columns = ["PCA1" , "PCA2", "PCA3", "PCA4", "PCA5", "PCA6", "PCA7"]
pcaX_df = pd.DataFrame(pcaX, columns=pcr_columns)
# here are the weighting (eigen-vectors) of the variables (first 2 at least)
print("First PCA Component (w1):", pca.components_[0,:])
print("Second PCA Component (w2):", pca.components_[1,:])
pcaX_df
# here is the variance explained:
print("Variance explained by each component:", pca.explained_variance_ratio_)
blue = sns.color_palette("colorblind")[0]
sns.barplot(y=list(range(1,8)), x=pca.explained_variance_ratio_, orient="h", color=blue)
plt.xscale("log")
_ = sns.barplot(y=list(range(1,8)), x=pca.explained_variance_ratio_, orient="h", color=blue)
# create/fit the pca transformation on the standardized predictors
Z = sk.preprocessing.StandardScaler().fit(X).transform(X)
pca_standard = PCA().fit(Z)
pcaZ = pca_standard.transform(Z)
# convert to a data frame
pcaZ_df = pd.DataFrame(pcaZ, columns=pcr_columns)
print(pca_standard.components_.shape)
print(pcaZ.shape)
pd.DataFrame.from_dict({"Variable": X.columns,
"PCA1": pca.components_[0],
"PCA2": pca.components_[1],
"PCA-Z1": pca_standard.components_[0],
"PCA-Z2": pca_standard.components_[1]})
Q2.3 Interpret the results above. What does $w_1$ represent? Why do the values make sense? What do its squared values sum up to? Why does this make sense?
Solution:
$w_1$ represents the transformation (change of basis) that converts the columns of $\mathbf{X}$ into the first PCA vector, $z_1$. Its elements, after squaring, sum to 1, so each magnitude represents the Euclidean weight of a variable in the transformation (a larger value means more weight in the transformation).
np.sum(pca.components_[0,:]**2)
It is common for a model with high-dimensional data (lots of predictors) to be plotted along the first 2 PCA components (with the classification boundaries added). Below is the scatter plot for these data (without a classification boundary, since we do not have a model yet):
# Plot the response over the first 2 PCA component vectors
sns.scatterplot(data=pcaX_df, x="PCA1", y="PCA2", hue=df_heart["AHD"], legend="full")
plt.xlabel("First PCA Component Vector (Z1)")
plt.ylabel("Second PCA Component Vector (Z2)");
Q2.4 What would a classification boundary look like if a linear regression model were fit using the first 2 principal components as the predictors? Does there appear to be good potential here?
Solution:
It would again be linear. Here, most likely the boundary would be a line with negative slope.
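A minimal sketch of that boundary (an addition, not in the original notebook): fit a linear regression of AHD on the first two components and overlay the line where the fitted value equals 0.5.
# Fit a linear (probability) regression on the first two PCA components
reg_pc2 = sm.OLS(y, sm.add_constant(pcaX_df[["PCA1", "PCA2"]])).fit()
b0, b1, b2 = reg_pc2.params

# Boundary: b0 + b1*Z1 + b2*Z2 = 0.5  =>  Z2 = (0.5 - b0 - b1*Z1) / b2
z1_grid = np.linspace(pcaX_df["PCA1"].min(), pcaX_df["PCA1"].max(), 100)
z2_boundary = (0.5 - b0 - b1 * z1_grid) / b2

sns.scatterplot(data=pcaX_df, x="PCA1", y="PCA2", hue=df_heart["AHD"], legend="full")
plt.plot(z1_grid, z2_boundary, "k--", label="fitted value = 0.5")
plt.ylim(pcaX_df["PCA2"].min(), pcaX_df["PCA2"].max())
plt.xlabel("First PCA Component Vector (Z1)")
plt.ylabel("Second PCA Component Vector (Z2)")
plt.legend();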
Below is the result of the PCR-1 (linear) model to predict AHD from the first principal component vector.
X = sm.add_constant(pcaX_df[["PCA1"]])
reg_pcr1 = sm.OLS(y, X).fit()
reg_pcr1.summary()
print("First PCA Component (w1):", pca.components_[0:1,:])
Q2.5 What does this PCR-1 model tell us about how the predictors relate to the response (aka, estimate the coefficient(s) in the original predictor space)? Is it truly a simple linear regression model in the original predictor space?
beta = reg_pcr1.params[1]
(beta*pca.components_[0:1,:])
Solution:
The estimated slope from PCR1 ($\hat{\beta} \approx 0.0009$) is distributed across the 7 actual predictors, so that the formula would be:
$$\begin{aligned}
\hat{y} &= 0.0009(Z_1) + 0.4587 = 0.0009(w^T_1\mathbf{X}) + 0.4587 \\
&= 0.0009(0.0384X_1+0.0505X_2+0.998X_3-0.00374X_4-0.0018X_5+0.00115X_6-0.0000036X_7) + 0.4587 \\
&= 3.31 \cdot 10^{-5} X_1 + 4.35 \cdot 10^{-5} X_2 + 8.6 \cdot 10^{-4} X_3 - 3.23 \cdot 10^{-6} X_4 - 1.56 \cdot 10^{-6} X_5 + 9.955 \cdot 10^{-7} X_6 - 3.1 \cdot 10^{-9} X_7 + 0.4587
\end{aligned}$$

This is how to interpret the estimated coefficients from a regression with PCA components as the predictors: some transformation back to the original space is required.
Here is the above calculation for all 7 PCR linear regressions, with the results then plotted on a pretty plot:
results_arr = []
for i in range(1, 8):
reg_pcr_tmp = sm.OLS(y, sm.add_constant(pcaX_df[pcr_columns[:i]])).fit()
pcr_tmp = np.transpose(pca.components_[:i,:]) @ reg_pcr_tmp.params[1:i+1]
results_arr.append(pcr_tmp)
betas = reg_full.params[1:]
results_arr.append(betas)
results = np.vstack(results_arr)
print(results)
plt.plot(pcr_columns + ["Linear"], results)
plt.ylabel("Back-calculated Beta Coefficients")
plt.legend(columns)  # one line per original predictor
Q2.6 Interpret the plot above. Specifically, compare how each PCA vector "contributes" to the original linear regression model using all 7 original predictors. How does PCR-7 compare to the original linear regression model (in estimated coefficients)?
Solution:
This plot shows that as more PCA vectors are included in the PCA regression, the estimated $\beta$s from the original regression model are recovered: if PCR($p$) is used (where $p$ is the number of predictors we started with), the two models are mathematically equivalent.
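A quick numerical check of that equivalence (an added sketch): the last two rows of the results array, the back-transformed PCR-7 coefficients and the original OLS coefficients, should agree up to floating-point error.
# PCR with all 7 components recovers the full OLS coefficients (should print True)
print(np.allclose(results[-2], results[-1]))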
All of this PCA work should have been done using the standardized versions of the predictors. Below is the code that does exactly that:
X = df_heart[columns]
scaler = sk.preprocessing.StandardScaler()
Z = scaler.fit_transform(X)
pca = PCA().fit(Z)
pcaZ = pca.transform(Z)
pcaZ_df = pd.DataFrame(pcaZ, columns=pcr_columns)
print("First PCA Component (w1):", pca.components_[0,:])
print("Second PCA Component (w2):", pca.components_[1,:])
regZ_full = sm.OLS(y, sm.add_constant(pd.DataFrame(Z, columns=columns))).fit()
regZ_full.summary()
# Fit the PCR
results_arr = []
for i in range(1, 8):
reg_pcrZ_tmp = sm.OLS(y, sm.add_constant(pcaZ_df[pcr_columns[:i]])).fit()
pcrZ_tmp = np.transpose(pca.components_[:i,:]) @ reg_pcrZ_tmp.params[1:i+1]
results_arr.append(pcrZ_tmp)
betasZ = regZ_full.params[1:]
results_arr.append(betasZ)
resultsZ = np.vstack(results_arr)
print(resultsZ)
plt.plot(pcr_columns + ["Linear"],resultsZ)
plt.ylabel("Back-calculated Beta Coefficients");
plt.legend(X.columns);
Q2.7 Compare this plot to the previous one. Why does this plot make sense? What does this illustrate?
Solution:
This plot shows that the components are now more evenly composed of the predictors, rather than the first component being dominated by the predictor with the most variability. The 7 lines move more similarly here than in the previous plot, where each component was essentially tied to a single predictor.
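One way to see this numerically (an added sketch): compare the proportion of variance explained before and after standardization. Without scaling, the first component is dominated by Chol, the predictor with the largest variance; with scaling, the variance is spread far more evenly across components.
# Variance explained without standardization (first component dominated by Chol)
print("Unscaled:", PCA().fit(df_heart[columns]).explained_variance_ratio_.round(3))
# Variance explained after standardization (the current pca, which was fit on Z)
print("Scaled:  ", pca.explained_variance_ratio_.round(3))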
Part 3: Underlying Math

What is PCA doing with these eigenvectors? Why does it all work? To answer these questions, it is easiest to restrict ourselves to two dimensions so that we can easily visualize what is happening. We will focus on Age and MaxHR, because there is a clear negative relationship between these two due to biology.
_ = sns.scatterplot(data=df_heart, x="Age", y="MaxHR")
Note: be careful when looking at plots with unequal axes, as relationships can be obscured. Once we set the axes to equal proportions, the strongly linear relationship reveals itself.
sns.scatterplot(data=df_heart, x="Age", y="MaxHR")
_ = plt.axis("equal")
Now, let's suppose we wanted to summarize these data. How would you do this? One way is to give the direction that explains the greatest variance. Why variance? The sample (co)variance matrix is $S = \frac{(X-\mu)^\top(X-\mu)}{n-1}$, which centers the data for us, and so the direction of greatest variance really describes the direction in which the data tend to spread. In practice, we can drop the $(n-1)$ in the denominator because the operations that follow are scale invariant.
X = df_heart[["Age", "MaxHR"]].values
mu = np.mean(X, axis=0)
S = (X - mu).T @ (X - mu)
S
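As a sanity check (an added sketch), this matrix matches numpy's covariance computation once we put back the $(n-1)$ factor we dropped:
# np.cov divides by (n - 1); multiplying it back recovers S
print(np.allclose(S, np.cov(X, rowvar=False) * (len(X) - 1)))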
As the variance is now a matrix, finding the direction of greatest variance is equivalent to solving $\max_{\lVert w \rVert = 1} w^\top S w$ (the variance of the data projected onto $w$), and the maximizer turns out to be the eigenvector of $S$ associated with its largest eigenvalue. So, the direction of largest variance is simply the leading eigenvector of $S$.
eigen_values, eigen_vectors = scipy.linalg.eig(S)
w_1 = eigen_vectors[:, np.argmax(eigen_values)]
w_1
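To ground the claim above (an added sketch): $w_1$ is a unit vector and satisfies the eigenvector equation $S w_1 = \lambda_{\max} w_1$.
lam_max = np.real(np.max(eigen_values))
print(np.isclose(np.linalg.norm(w_1), 1.0))   # unit norm
print(np.allclose(S @ w_1, lam_max * w_1))    # eigenvector equation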
To find the direction of second-greatest variance, we should remove the effect of the first direction. This is done by projecting $X$ onto $w_1$ and subtracting that projection from $X$.
X_hat = X - X @ np.outer(w_1, w_1)
mu_hat = np.mean(X_hat, axis=0)
S_hat = (X_hat - mu_hat).T @ (X_hat - mu_hat)
S_hat
By the same reasoning as before, the leading eigenvector of this new matrix will correspond to the second direction.
eigen_values, eigen_vectors = scipy.linalg.eig(S_hat)
w_2 = eigen_vectors[:, np.argmax(eigen_values)]
w_2
np.sqrt(np.max(eigen_values))
Comparing to the results from sklearn, we see that these correspond precisely (up to sign) to the components retrieved by PCA. Thus, PCA can be viewed as an iterative procedure: find the direction of greatest variance as the leading eigenvector of the sample covariance matrix, remove its effect via projection, then repeat.
pca = PCA().fit(X)
pca.components_
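As a quick check (an added sketch; eigenvectors are only defined up to sign, so we compare absolute values), our hand-computed directions match sklearn's components:
# Compare manual eigenvector directions to sklearn's PCA components, ignoring sign
print(np.allclose(np.abs(np.real(w_1)), np.abs(pca.components_[0])))
print(np.allclose(np.abs(np.real(w_2)), np.abs(pca.components_[1])))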
To gain a geometric intuition, let us plot the points in the original space (blue) and after projecting out the largest eigenvector (orange), alongside both eigenvectors. What we notice is that the direction of the first eigenvector is indeed responsible for most of the variance: just by looking, the original data have a spread of roughly 100, whereas the orange points have a spread closer to 50. The second thing to notice is that both eigenvectors together completely summarize the data. That is, after also removing the second eigenvector, the data would collapse to the origin. Thus, using all the eigenvectors (PCA components) ultimately retains the information in the original data.
sns.scatterplot(x=X[:,0], y=X[:,1])
sns.scatterplot(x=X_hat[:,0], y=X_hat[:,1])
x = np.stack([X[:,0], X_hat[:,0]]).T
y = np.stack([X[:,1], X_hat[:,1]]).T
for i in range(len(X)//20):
sns.lineplot(x=x[i], y=y[i], color="k")
x = [0, -100*w_1[0]]
y = [0, -100*w_1[1]]
sns.lineplot(x=x, y=y)
x = [0, 100*w_2[0]]
y = [0, 100*w_2[1]]
sns.lineplot(x=x, y=y)
plt.xlabel("Age")
plt.ylabel("MaxHR")
_ = plt.axis("equal")
Let us repeat this procedure for data in 3 dimensions, so that you can try to extend the visualization to higher dimensions. Here, we switch to plotly because it handles 3D much better. You will immediately notice the linear relationship between all 3 variables.
X = df_heart[["Age", "MaxHR", "RestBP"]].values
# Configure Plotly to be rendered inline in the notebook.
plotly.offline.init_notebook_mode()
# Configure the trace.
trace = go.Scatter3d(
x=X[:,0],
y=X[:,1],
z=X[:,2],
mode="markers",
marker={
"size": 10,
"opacity": 0.8,
}
)
# Configure the layout.
layout = go.Layout(
margin={"l": 0, "r": 0, "b": 0, "t": 0},
scene=go.layout.Scene(
xaxis=go.layout.scene.XAxis(title="Age"),
yaxis=go.layout.scene.YAxis(title="MaxHR"),
zaxis=go.layout.scene.ZAxis(title="RestBP")
)
)
data = [trace]
plot_figure = go.Figure(data=data, layout=layout)
# Render the plot.
plotly.offline.iplot(plot_figure)
Now, let's do the PCA procedure.
X = df_heart[["Age", "MaxHR", "RestBP"]].values
X_orig = X.copy()
# Configure Plotly to be rendered inline in the notebook.
plotly.offline.init_notebook_mode()
# Configure the trace.
trace = go.Scatter3d(
x=X[:,0],
y=X[:,1],
z=X[:,2],
mode="markers",
name="Original Data",
marker={
"size": 10,
"opacity": 0.8,
}
)
# Configure the layout.
layout = go.Layout(
margin={"l": 0, "r": 0, "b": 0, "t": 0},
scene=go.layout.Scene(
xaxis=go.layout.scene.XAxis(title="Age"),
yaxis=go.layout.scene.YAxis(title="MaxHR"),
zaxis=go.layout.scene.ZAxis(title="RestBP")
)
)
data = [trace]
plot_figure = go.Figure(data=data, layout=layout)
## First projection
mu = np.mean(X, axis=0)
S = (X - mu).T @ (X - mu)
eigen_values, eigen_vectors = scipy.linalg.eig(S)
w_1 = eigen_vectors[:, np.argmax(eigen_values)]
X_prev = X.copy()
X = X_orig - X_orig @ np.outer(w_1, w_1)
# Configure the trace.
trace = go.Scatter3d(
x=X[:,0],
y=X[:,1],
z=X[:,2],
mode="markers",
name="First Projection",
marker={
"size": 10,
"opacity": 0.8,
}
)
data.append(trace)
x_lines = []
y_lines = []
z_lines = []
#create the coordinate list for the lines
for i in range(len(X)//10):
trace = go.Scatter3d(
x=[X_prev[i,0], X[i,0]],
y=[X_prev[i,1], X[i,1]],
z=[X_prev[i,2], X[i,2]],
mode="lines",
showlegend=False,
line=go.scatter3d.Line(color="black")
)
data.append(trace)
## Second projection
mu = np.mean(X, axis=0)
S = (X - mu).T @ (X - mu)
eigen_values, eigen_vectors = scipy.linalg.eig(S)
w_2 = eigen_vectors[:, np.argmax(eigen_values)]
X_prev = X.copy()
X = X_orig - X_orig @ np.outer(w_1, w_1) - X_orig @ np.outer(w_2, w_2)
# Configure the trace.
trace = go.Scatter3d(
x=X[:,0],
y=X[:,1],
z=X[:,2],
mode="markers",
name="Second Projection",
marker={
"size": 10,
"opacity": 0.8,
}
)
data.append(trace)
#create the coordinate list for the lines
for i in range(len(X)//10):
trace = go.Scatter3d(
x=[X_prev[i,0], X[i,0]],
y=[X_prev[i,1], X[i,1]],
z=[X_prev[i,2], X[i,2]],
mode="lines",
showlegend=False,
line=go.scatter3d.Line(color="black")
)
data.append(trace)
## Third projection
mu = np.mean(X, axis=0)
S = (X - mu).T @ (X - mu)
eigen_values, eigen_vectors = scipy.linalg.eig(S)
w_3 = eigen_vectors[:, np.argmax(eigen_values)]
## Eigenvectors
trace = go.Scatter3d(
x=[0, 200*w_1[0]],
y=[0, 200*w_1[1]],
z=[0, 200*w_1[2]],
mode="lines",
name="First Eigenvector",
line=go.scatter3d.Line(color="blue")
)
data.append(trace)
trace = go.Scatter3d(
x=[0, 200*w_2[0]],
y=[0, 200*w_2[1]],
z=[0, 200*w_2[2]],
mode="lines",
name="Second Eigenvector",
line=go.scatter3d.Line(color="red")
)
data.append(trace)
trace = go.Scatter3d(
x=[0, 200*w_3[0]],
y=[0, 200*w_3[1]],
z=[0, 200*w_3[2]],
mode="lines",
name="Third Eigenvector",
line=go.scatter3d.Line(color="green")
)
data.append(trace)
plot_figure = go.Figure(data=data, layout=layout)
# Render the plot.
plotly.offline.iplot(plot_figure)
Notice again that the lines are parallel to each other. Now, the original data, a cloud of points in 3D, first gets projected to a plane (the red points), then projected to a line (the green points). Imagine first squishing a ball of Play-Doh into a pancake and then squishing the pancake from the outside to form a rope. This is exactly what PCA is doing, except that it does the squishing in the directions of maximal variance. Why is this all useful? Because, instead of projecting the points as we have done, we can perform dimensionality reduction by projecting the original data onto a subset of the eigenvectors. For example, we can take our 3D cloud of points and reduce the dimensionality by 66% by keeping only the first eigenvector, which is responsible for most of the variance and so retains most of the information in these features.
X = df_heart[["Age", "MaxHR", "RestBP"]].values
X_orig = X.copy()
# Configure Plotly to be rendered inline in the notebook.
plotly.offline.init_notebook_mode()
# Configure the trace.
trace = go.Scatter3d(
x=X[:,0],
y=X[:,1],
z=X[:,2],
mode="markers",
name="Original Data",
marker={
"size": 10,
"opacity": 0.8,
}
)
# Configure the layout.
layout = go.Layout(
margin={"l": 0, "r": 0, "b": 0, "t": 0},
scene=go.layout.Scene(
xaxis=go.layout.scene.XAxis(title="Age"),
yaxis=go.layout.scene.YAxis(title="MaxHR"),
zaxis=go.layout.scene.ZAxis(title="RestBP")
)
)
data = [trace]
plot_figure = go.Figure(data=data, layout=layout)
## Remove smallest eigenvalues
X = X_orig - X_orig @ np.outer(w_2, w_2) - X_orig @ np.outer(w_3, w_3)
# Configure the trace.
trace = go.Scatter3d(
x=X[:,0],
y=X[:,1],
z=X[:,2],
mode="markers",
name="Data Along First Eigenvector",
marker={
"size": 10,
"opacity": 0.8,
}
)
data.append(trace)
plot_figure = go.Figure(data=data, layout=layout)
# Render the plot.
plotly.offline.iplot(plot_figure)
As you can see, the red line is only 1-dimensional, but the spread between its two furthest points is almost equal to that of the original data. We can use this idea to construct curves of how many dimensions are needed to retain a certain amount of variance. For instance, suppose we want to reduce the dimensionality of our 7-dimensional dataset so that we may visualize the data more readily, and suppose furthermore that we want to keep 90% of the original variance.
columns = ["Age", "RestBP", "Chol", "MaxHR", "Sex", "Oldpeak", "Slope"]
X = df_heart[columns].values
mu = np.mean(X, axis=0)
S = (X - mu).T @ (X - mu) / (len(X) - 1)
total_variance = np.diag(S).sum()
print(f"Total variance is: {total_variance}")
n = len(columns)
var_arr = []
eigenvector_arr = []
for i in range(n):
eigen_values, eigen_vectors = scipy.linalg.eig(S)
w = np.real(eigen_vectors[:, np.argmax(eigen_values)])
eigenvector_arr.append(w)
X = X - X @ np.outer(w, w)
mu = np.mean(X, axis=0)
S = (X - mu).T @ (X - mu) / (len(X) - 1)
variance = np.diag(S).sum()
var_arr.append((total_variance-variance)/total_variance)
sns.lineplot(x=list(range(1, n+1)), y=var_arr)
plt.xlabel("Number of Components")
plt.ylabel("Proportion of Total Variance")
print(var_arr)
So we see that 2 dimensions keep almost 90% of the original variance, and jumping to 3 dimensions keeps about 98% of it. Of course, this is a very common task for PCA and so it is provided directly by many packages.
X = df_heart[columns].values
pca = PCA().fit(X)
print(pca.explained_variance_ratio_)
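Reading off the number of components needed for a given variance threshold is then a one-liner (an added sketch using the cumulative sum):
cum_var = np.cumsum(pca.explained_variance_ratio_)
print(cum_var.round(3))
# Smallest number of components whose cumulative ratio reaches 90%
print("Components needed for 90%:", np.argmax(cum_var >= 0.90) + 1)
sklearn can also do this selection for you: passing a fraction such as PCA(n_components=0.90) keeps just enough components to explain that share of the variance.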
So, we can compress our data to 2 dimensions and visualize our 7-dimensional dataset.
n_components = 2
X_hat = pca.transform(X)[:,:n_components]
sns.scatterplot(x=X_hat[:,0], y=X_hat[:,1], hue=df_heart["AHD"], legend="full")
plt.xlabel("First PCA Component Vector (Z1)")
plt.ylabel("Second PCA Component Vector (Z2)");
_ = plt.axis("equal")
To summarize, PCA is not a tool to help you make better predictions; it cannot be, because it consists only of linear transformations of the data. However, it gives us a way to compress the data and to visualize it without losing much information. Through the lens of compression, PCA can be thought of as feature engineering: the new, compressed features retain much of the information in the original dataset while using far fewer dimensions.