Key Word(s): sklearn, logistic regression, multiclass, regularization, CV, cross-validation, roc, confusion matrix, metrics
CS109A Introduction to Data Science
Lecture 11 (Logistic Regression #2)¶
Harvard University
Fall 2019
Instructors: Pavlos Protopapas, Kevin Rader, and Chris Tanner
In [ ]:
%matplotlib inline
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib
import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
import sklearn.metrics as met
In [ ]:
df_heart = pd.read_csv('data/Heart.csv')
df_heart['AHD'] = 1*(df_heart['AHD'] == "Yes")                # recode AHD as 0/1
df_heart['Interaction'] = df_heart['MaxHR']*df_heart['Sex']   # MaxHR-by-Sex interaction term
df_heart.head()
In [ ]:
plt.plot(df_heart.MaxHR, df_heart.AHD, 'o', alpha=0.4)
plt.ylim(-0.1,1.1)
plt.show()
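To see what logistic regression does with these data, here is a quick sketch (the name `logit_simple` is just illustrative) that fits a simple one-predictor model of AHD on MaxHR and overlays the fitted probability curve:
In [ ]:
# Fit an (essentially unregularized) simple logistic regression of AHD on MaxHR
logit_simple = LogisticRegression(C=1000000, solver='lbfgs')
logit_simple.fit(df_heart[['MaxHR']], df_heart['AHD'])

# Overlay the fitted probability curve on the scatterplot
x_grid = np.linspace(df_heart['MaxHR'].min(), df_heart['MaxHR'].max(), 100).reshape(-1, 1)
plt.plot(df_heart.MaxHR, df_heart.AHD, 'o', alpha=0.4)
plt.plot(x_grid, logit_simple.predict_proba(x_grid)[:, 1], lw=3)
plt.ylim(-0.1, 1.1)
plt.show()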
Regularization in Logistic Regression¶
In [ ]:
beta1_l1 = []
beta1_l2 = []
Cs = []
data_x = df_heart[['MaxHR','Sex','Interaction']]
data_y = df_heart['AHD']

# Fit L1- and L2-penalized logistic regressions over a grid of C values
# (in sklearn, C is the inverse regularization strength: C = 1/lambda)
for i in range(1, 50):
    C = i/10
    logitm_l1 = LogisticRegression(C=C, penalty="l1", solver='liblinear', max_iter=200)
    logitm_l1.fit(data_x, data_y)
    logitm_l2 = LogisticRegression(C=C, penalty="l2", solver='lbfgs')
    logitm_l2.fit(data_x, data_y)
    beta1_l1.append(logitm_l1.coef_[0])
    beta1_l2.append(logitm_l2.coef_[0])
    Cs.append(C)

plt.plot(np.array(Cs), beta1_l1, color='black', lw=3)
plt.plot(np.array(Cs), beta1_l2, color='blue', lw=3)
plt.xlabel("C = 1/lambda")
plt.ylabel("coefficient estimates")
plt.show()
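Since C is the inverse regularization strength, small C means heavy shrinkage. A quick sketch to confirm this at the extremes (the variable name `logit_c` is just illustrative):
In [ ]:
# Small C (strong penalty) should shrink coefficients toward zero;
# large C (weak penalty) should approach the unregularized fit.
for C in [0.001, 1000]:
    logit_c = LogisticRegression(C=C, penalty="l1", solver='liblinear', max_iter=200)
    logit_c.fit(data_x, data_y)
    print('C =', C, ': coefficients =', logit_c.coef_[0])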
Multi-Class Logistic Regression (Multinomial)¶
In [ ]:
# Response for Multinomial Logistic Regression Example
print(df_heart.RestECG.values)
plt.hist(df_heart.RestECG.values)
plt.show()
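The histogram suggests the three RestECG categories are quite unbalanced; a simple tabulation makes the class counts explicit:
In [ ]:
# Count observations in each RestECG class (0 = normal, 1 = ST-T abnormality, 2 = hypertrophy)
print(df_heart['RestECG'].value_counts().sort_index())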
In [ ]:
# 'Multinomial' Logistic Regression Example
data_x = df_heart[['Sex']]
# 0 = normal; 1 = having ST-T; 2 = hypertrophy
data_y = df_heart['RestECG']
# very large C effectively turns off the regularization
logitm = LogisticRegression(C=10000000, solver='lbfgs', multi_class='ovr')
logitm.fit(data_x, data_y)
# The coefficients
print('Estimated beta1: \n', logitm.coef_)
print('Estimated beta0: \n', logitm.intercept_)
In [ ]:
logitm2 = LogisticRegression(C = 10000000,solver='lbfgs',multi_class='multinomial')
logitm2.fit (data_x, data_y)
# The coefficients
print('Estimated beta1: \n', logitm2.coef_)
print('Estimated beta0: \n', logitm2.intercept_)
In [ ]:
logitm3 = LogisticRegression(C = 10000000,solver='lbfgs',multi_class='ovr')
logitm3.fit(df_heart[['Sex','MaxHR','Interaction']], df_heart['RestECG'])
# The coefficients
print('Estimated beta1: \n', logitm3.coef_)
print('Estimated beta0: \n', logitm3.intercept_)
In [ ]:
X = np.arange(0,2)   # the two observed values of Sex: 0 and 1
print("For OVR Logistic Regression:")
print(logitm.predict_proba(X.reshape(-1,1)))
print(logitm.predict(X.reshape(-1,1)))
print("For Multinomial Logistic Regression:")
print(logitm2.predict_proba(X.reshape(-1,1)))
print(logitm2.predict(X.reshape(-1,1)))
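Both approaches return a full probability distribution over the three classes for each observation: OVR rescales the three separate binary fits so the probabilities sum to 1, while multinomial uses a softmax, which sums to 1 by construction. A quick check:
In [ ]:
# Each row of predict_proba should sum to 1 under both approaches
print(logitm.predict_proba(X.reshape(-1,1)).sum(axis=1))
print(logitm2.predict_proba(X.reshape(-1,1)).sum(axis=1))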
Confusion Matrices and ROC Curves¶
In [ ]:
# Refit on the binary AHD response for the confusion matrix and ROC examples
logitm.fit(df_heart[['Sex','MaxHR','Interaction']], df_heart['AHD']);
# The coefficients
print('Estimated beta1: \n', logitm.coef_)
print('Estimated beta0: \n', logitm.intercept_)
In [ ]:
#calculating confusion matrices
yhat = logitm.predict_proba(df_heart[['Sex','MaxHR','Interaction']])[:,1]
print('The average predicted probability is',np.mean(yhat))
print('The confusion matrix when cut-off is 0.5: \n',met.confusion_matrix(df_heart['AHD'], yhat>0.5))
print('The confusion matrix when cut-off is',np.mean(yhat),'\n',met.confusion_matrix(df_heart['AHD'], yhat>np.mean(yhat)))
print('The confusion matrix when cut-off is 0.75: \n',met.confusion_matrix(df_heart['AHD'], yhat>0.75))
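From any of these confusion matrices we can read off the standard classification metrics. As a sketch, here they are at the 0.5 cut-off (sklearn's convention puts true classes on the rows and predicted classes on the columns):
In [ ]:
# Unpack the 0.5 cut-off confusion matrix into TN, FP, FN, TP
tn, fp, fn, tp = met.confusion_matrix(df_heart['AHD'], yhat > 0.5).ravel()
print('Accuracy:   ', (tp + tn) / (tp + tn + fp + fn))
print('Sensitivity:', tp / (tp + fn))   # true positive rate
print('Specificity:', tn / (tn + fp))   # true negative rate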
In [ ]:
#ROC curve on the train set
fpr, tpr, thresholds = met.roc_curve(df_heart['AHD'], yhat)
x=np.arange(0,100)/100
plt.plot(x,x,'--',color="gray",alpha=0.3)
plt.plot(fpr,tpr)
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.title("ROC Curve for Predicting AHD in a Logistic Regression Model")
plt.show()
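The area under this ROC curve (AUC) summarizes the whole curve in one number: 1 is perfect discrimination and 0.5 matches the gray diagonal (random guessing):
In [ ]:
# AUC for the fitted model on the training set
print('Train AUC =', met.roc_auc_score(df_heart['AHD'], yhat))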
Regularization Example¶
In [ ]:
df_toy2 = pd.read_csv('data/toy_example2.csv')
df_toy2.head()
In [ ]:
logreg = LogisticRegression(C=100000, solver='lbfgs')   # large C: essentially unregularized
logreg.fit(df_toy2[['x1','x2']], df_toy2['y'])
beta = logreg.coef_
beta0 = logreg.intercept_
print('Estimated beta1: \n', beta)
print('Estimated beta0: \n', beta0)
In [ ]:
plt.scatter(df_toy2['x1'][df_toy2['y']==0],df_toy2['x2'][df_toy2['y']==0])
plt.scatter(df_toy2['x1'][df_toy2['y']==1],df_toy2['x2'][df_toy2['y']==1])
# Decision boundary: beta0 + beta1*x1 + beta2*x2 = 0  =>  x2 = -(beta0 + beta1*x1)/beta2
dummy_x1 = np.arange(0,1,0.01)
dummy_x2 = (beta[0,0]*dummy_x1 + beta0)/-beta[0,1]
plt.plot(dummy_x1,dummy_x2,'--',c="black",lw=5)
plt.show()
In [ ]:
X = df_toy2[['x1','x2']]
logreg2 = LogisticRegression(C=100000,solver='liblinear')
poly = sk.preprocessing.PolynomialFeatures(10)
X_poly = poly.fit_transform(X)
logreg2.fit(X_poly, df_toy2['y'])
logreg2.coef_
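A degree-10 polynomial expansion of two predictors creates far more features than the original two, which is what makes the wiggly boundary below possible. A quick look at the expansion size:
In [ ]:
# How many columns does the degree-10 polynomial expansion produce?
print('X_poly shape:', X_poly.shape)   # (n_samples, n_polynomial_features)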
In [ ]:
dummy_x1 = np.arange(0,1.001,0.001)

def find_boundary(dummy_x, model, poly):
    """For each x1 on the grid, scan over the x2 grid and record where the
    predicted class flips (assumes predictions are monotone in x2)."""
    yhat = []
    for x_i in dummy_x:
        # hold x1 fixed at x_i and vary x2 over the whole grid
        x1_fixed = np.repeat(x_i, dummy_x.size)
        df_X = pd.DataFrame(np.array([x1_fixed, dummy_x]).transpose())
        df_X2 = poly.fit_transform(df_X)
        # the count of points predicted as class 1 indexes the boundary x2
        # (clamped to stay in range if every point is predicted as class 1)
        idx = min(np.sum(model.predict(df_X2)), dummy_x.size - 1)
        yhat.append(dummy_x[idx])
    return yhat

dummy_x2 = find_boundary(dummy_x1, logreg2, poly)
In [ ]:
plt.scatter(df_toy2['x1'][df_toy2['y']==0],df_toy2['x2'][df_toy2['y']==0])
plt.scatter(df_toy2['x1'][df_toy2['y']==1],df_toy2['x2'][df_toy2['y']==1])
plt.plot(dummy_x1,dummy_x2,'--',c="black",lw=3)
plt.show()
In [ ]:
Cs = 10.0**np.arange(-5,5)   # grid of candidate C values from 1e-5 to 1e4
logregCV = LogisticRegressionCV(Cs=Cs, cv=5, solver='liblinear', penalty='l1')
logregCV.fit(X_poly, df_toy2['y']);
In [ ]:
print(logregCV.coef_)
#print(logregCV.scores_)
#print(logregCV.coefs_paths_)
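LogisticRegressionCV stores the cross-validated choice of C in its C_ attribute, and with the L1 penalty many coefficients are driven exactly to zero; counting them shows how sparse the selected model is:
In [ ]:
# Which C did 5-fold CV select, and how many coefficients survive the L1 penalty?
print('Selected C:', logregCV.C_)
print('Nonzero coefficients:', np.sum(logregCV.coef_ != 0), 'of', logregCV.coef_.size)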
In [ ]:
# Recompute all three decision boundaries on the finer grid
dummy_x2 = (beta[0,0]*dummy_x1 + beta0)/-beta[0,1]
dummy_x2poly = find_boundary(dummy_x1, logreg2, poly)
dummy_x2CV = find_boundary(dummy_x1, logregCV, poly)
In [ ]:
plt.rcParams["figure.figsize"] = [10,6]   # set figure size before drawing
plt.scatter(df_toy2['x1'][df_toy2['y']==0],df_toy2['x2'][df_toy2['y']==0])
plt.scatter(df_toy2['x1'][df_toy2['y']==1],df_toy2['x2'][df_toy2['y']==1])
plt.plot(dummy_x1,dummy_x2,c="black",lw=3)
plt.plot(dummy_x1,dummy_x2poly,'--',c="red",lw=3)
plt.plot(dummy_x1,dummy_x2CV,'-.',c="purple",lw=3)
# legend labels follow plotting order: the two scatters, then the three lines
plt.legend(['y=0','y=1','logit','poly(10) logit','regularized'])
plt.show()
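As a rough numeric summary of the three boundaries (train-set accuracies only, so the flexible unregularized poly(10) fit is expected to look best here):
In [ ]:
# Train-set accuracy for the linear, poly(10), and CV-regularized poly(10) fits
print('Linear logit:        ', logreg.score(X, df_toy2['y']))
print('Poly(10) logit:      ', logreg2.score(X_poly, df_toy2['y']))
print('Regularized poly(10):', logregCV.score(X_poly, df_toy2['y']))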