Title:
Exercise: ROC & AUC
Description:
In this exercise, we will look at ways to evaluate our classifiers across all thresholds, rather than just one.
Investigate your options in sklearn.metrics for filling in the blanks.
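Before starting, it helps to see what a "threshold" does: a classifier that outputs probabilities only becomes a label predictor once we pick a cut-off, and the ROC curve simply sweeps that cut-off across its whole range. A minimal standalone sketch (toy probabilities, not the Heart data):
import numpy as np
probs = np.array([0.1, 0.4, 0.6, 0.9])           # toy predicted probabilities
for cutoff in (0.3, 0.5, 0.7):
    print(cutoff, (probs > cutoff).astype(int))  # the hard labels change with the cut-off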
In [0]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
In [0]:
# read the data
heart = pd.read_csv('Heart.csv')
# Force the response into a binary indicator:
heart['AHD'] = 1*(heart['AHD'] == "Yes")
heart_train, heart_test = train_test_split(heart, test_size=0.3, random_state = 109)
In [0]:
# Train two logistic regression models: one with an l2 penalty and one with no penalty
degree = 3
predictors = ['Age','Sex','MaxHR','RestBP','Chol']
X_train = PolynomialFeatures(degree=degree,include_bias=False).fit_transform(heart_train[predictors])
y_train = heart_train['AHD']
X_test = PolynomialFeatures(degree=degree,include_bias=False).fit_transform(heart_test[predictors])
y_test = heart_test['AHD']
# Note: scikit-learn >= 1.2 expects penalty=None (the Python None object) rather than the string 'none' for an unpenalized fit
logit = LogisticRegression(penalty='none', max_iter = 10000).fit(X_train, y_train)
logit_ridge = LogisticRegression(C=0.001, penalty='l2',solver='lbfgs', max_iter = 10000).fit(X_train, y_train)
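As an optional sanity check (not part of the graded exercise), you can confirm that the l2 penalty shrinks the fitted coefficients; in scikit-learn, C is the inverse of the regularization strength, so C=0.001 is a heavily penalized model. A small sketch, assuming the two models fitted in the cell above:
# The penalized fit should have a much smaller total coefficient magnitude.
print("sum |coef|, no penalty:", np.abs(logit.coef_).sum())
print("sum |coef|, l2 penalty:", np.abs(logit_ridge.coef_).sum())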
In [0]:
# Predict the probabilities, and then predict the labels based on threshold = 0.5
yhat_logit = logit.predict_proba(X_test)[:,1]
yhat_logit_ridge = logit_ridge.predict_proba(X_test)[:,1]
threshold = 0.5
# Print the confusion matrices
print('The confusion matrix in test for logit when cut-off is',threshold, ': \n',
sk.metrics.___(y_test, yhat_logit>threshold))
print('The confusion matrix in test for logit_ridge when cut-off is',threshold, ': \n',
sk.metrics.___(y_test, yhat_logit_ridge>threshold))
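If you are unsure which function fits the blanks above, sklearn.metrics.confusion_matrix is the natural candidate: it takes true labels and predicted labels and, for a binary problem, returns a 2x2 array laid out as [[TN, FP], [FN, TP]]. A self-contained sketch on toy labels:
from sklearn.metrics import confusion_matrix
print(confusion_matrix([0, 0, 1, 1], [0, 1, 1, 1]))
# [[1 1]
#  [0 2]]   -> 1 true negative, 1 false positive, 0 false negatives, 2 true positives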
In [0]:
### edTest(test_roc) ###
# Plot the ROC curve
yhat_logit = logit.predict_proba(X_test)[:,1]
yhat_logit_ridge = logit_ridge.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = metrics.___(y_test, yhat_logit)
fpr_ridge, tpr_ridge, thresholds_ridge = metrics.___(y_test, yhat_logit_ridge)
x=np.arange(0,100)/100
plt.plot(x,x,'--',color="gray",alpha=0.3)
plt.plot(fpr,tpr,label="logit")
plt.plot(fpr_ridge,tpr_ridge,label="logit_ridge")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.title("ROC Curve for Predicting AHD in a Logistic Regression Model")
plt.legend()
plt.show()
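The blanks in the ROC cell expect a function that returns false positive rates, true positive rates, and the thresholds at which they were computed; sklearn.metrics.roc_curve has exactly that return signature. A small standalone sketch on toy scores:
from sklearn.metrics import roc_curve
fpr_toy, tpr_toy, thr_toy = roc_curve([0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8])
print(fpr_toy)  # [0.  0.  0.5 0.5 1. ]
print(tpr_toy)  # [0.  0.5 0.5 1.  1. ]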
In [0]:
### edTest(test_auc) ###
# print the AUC scores
auc_no_reg = metrics.___(fpr,tpr)
auc_ridge = metrics.___(fpr_ridge,tpr_ridge)
print(auc_no_reg)
print(auc_ridge)
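Two metrics functions fit these last blanks: metrics.auc integrates an fpr/tpr curve you have already computed (trapezoidal rule), while metrics.roc_auc_score computes the same area directly from labels and scores. A minimal sketch on the same toy data as above:
from sklearn.metrics import auc, roc_auc_score, roc_curve
y_toy, s_toy = [0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8]
fpr_t, tpr_t, _ = roc_curve(y_toy, s_toy)
print(auc(fpr_t, tpr_t), roc_auc_score(y_toy, s_toy))  # both give 0.75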