# import libraries

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
import statsmodels.api as sm


heart = pd.read_csv('Heart.csv')

# Force the response into a binary indicator:
heart['AHD'] = 1*(heart['AHD'] == "Yes")

heart.describe()


# Make a plot of the response (AHD) vs the predictor (Age)

plt.plot(heart[['Age']].values, heart['AHD'].values ,'o', markersize=7,color="#011DAD",label="Data")

plt.xticks(np.arange(18, 80, 4.0))
plt.xlabel("Age")
plt.ylabel("AHD")
plt.yticks((0,1), labels=('No', 'Yes'))

plt.legend()
plt.show()


# split into train and validation
heart_train, heart_val = train_test_split(heart, train_size = 0.75, random_state = 5)

# select variables for model estimation
x_train = heart_train[['Age']]
y_train = heart_train['AHD']

x_val = heart_val[['Age']]
y_val = heart_val['AHD']


# Create a linear regression model, with random state=5

regress1 = LinearRegression(fit_intercept=True).fit(x_train,y_train)

print("Linear Regression Estimated Betas:",regress1.intercept_,regress1.coef_[0])


# Plot the estimated probability for training data
dummy_x=np.linspace(np.min(x_train)-30,np.max(x_train)+30)
yhat_regress = regress1.predict(dummy_x.reshape(-1,1))
plt.plot(x_train, y_train, 'o' ,alpha=0.2, label='Data')
plt.plot(dummy_x, yhat_regress, label = "OLS")

plt.ylim(-0.2, 1.2)
plt.show()


### edTest(test_logit1) ###
# Create a logistic regression model, with random state=5 and no penalty

logit1 = ___(penalty=___, max_iter = 1000, random_state=5)

#Fit the model using the training set

logit1.fit(x_train,y_train)

# Get the coefficient estimates

print("Logistic Regression Estimated Betas (B0,B1):",logit1.intercept_,logit1.coef_)


# Confirm the probability calculation above using logit1.predict()
# Be careful as to how you define the new observation.  Hint: double brackets is one way to do it

logit1.predict_proba([[___]])


### edTest(test_accuracy) ###

# Compute the training & validation accuracy 

train_accuracy = logit1.___(x_train , y_train)
val_accuracy = logit1.___(x_val , y_val)

# Print the two accuracies below

print("Train Accuracy", train_accuracy)
print("Validation Accuracy", val_accuracy)


x=np.linspace(np.min(heart[['Age']])-10,np.max(heart[['Age']])+10,200)

yhat_class_logit = logit1.predict(x)
yhat_prob_logit = logit1.predict_proba(x)[:,1]

# plot the observed data
plt.plot(x_train, y_train, 'o' ,alpha=0.1, label='Train Data')
plt.plot(x_val, 0.94*y_val+0.03, 'o' ,alpha=0.1, label='Validation Data')

# plot the predictions
plt.plot(x, yhat_class_logit, label='logit1 Classifications')
plt.plot(x, yhat_prob_logit, label='logit1 Probabilities')

# put the lower-left part of the legend 5% to the right along the x-axis, and 45% up along the y-axis
plt.legend(loc=(0.05,0.45))

# Don't forget your axis labels!
plt.xlabel("Age")
plt.ylabel("Heart disease (AHD)")

plt.show()


### edTest(test_logit2) ###
# adding a column of ones to X
x_train_with_constant = sm.add_constant(x_train)
x_val_with_constant = sm.add_constant(x_val)

# train a new model using statsmodels package
logreg = sm.___(y_train, x_train_with_constant).fit()
print(logreg.summary())

Title :¶

Description :¶

Simple linear regression model fitting¶

What could go wrong with this linear regression model?¶

Simple logisitc regression model fitting¶

Interpret the Coefficient Estimates¶

Calculate the estimated probability that a person with age 60 will have AHD in the ICU.¶

Accuracy computation¶

Plot the predictions¶

Statistical Inference¶

What is an estimated 95% confidence interval for the coefficient corresponding to 'Age' variable?¶