CS109A Introduction to Data Science

Lecture 11 (Logistic Regression #2)

Harvard University
Fall 2019
Instructors: Pavlos Protopapas, Kevin Rader, and Chris Tanner


In [ ]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
import sklearn.metrics as met
In [ ]:
df_heart = pd.read_csv('data/Heart.csv')

# recode the response to 0/1 and build a MaxHR-by-Sex interaction term
df_heart['AHD'] = 1*(df_heart['AHD'] == "Yes")
df_heart['Interaction'] = df_heart['MaxHR']*df_heart['Sex']

df_heart.head()
In [ ]:
plt.plot(df_heart.MaxHR, df_heart.AHD, 'o', alpha=0.4)
plt.ylim(-0.1, 1.1)
plt.xlabel("MaxHR")
plt.ylabel("AHD")
plt.show()

Regularization in Logistic Regression
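
Note: in sklearn, C is the inverse of the regularization strength (C = 1/lambda), so smaller values of C impose a stronger penalty on the coefficients. The cell below traces how all three coefficient estimates change with C under the l1 (black) and l2 (blue) penalties.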

In [ ]:
betas_l1 = []
betas_l2 = []
Cs = []

data_x = df_heart[['MaxHR','Sex','Interaction']]
data_y = df_heart['AHD']

# fit l1- and l2-penalized models over a grid of C values and
# track the full coefficient vector (3 predictors) for each fit
for i in range(1, 50):
    C = i/10
    logitm_l1 = LogisticRegression(C=C, penalty="l1", solver='liblinear', max_iter=200)
    logitm_l1.fit(data_x, data_y)
    logitm_l2 = LogisticRegression(C=C, penalty="l2", solver='lbfgs', max_iter=200)
    logitm_l2.fit(data_x, data_y)
    betas_l1.append(logitm_l1.coef_[0])
    betas_l2.append(logitm_l2.coef_[0])
    Cs.append(C)

plt.plot(np.array(Cs), betas_l1, color='black', lw=3)
plt.plot(np.array(Cs), betas_l2, color='blue', lw=3)
plt.xlabel("C = 1/lambda")
plt.ylabel("coefficient estimates")
plt.show()
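
To see the qualitative difference between the two penalties, a quick check (a minimal sketch reusing data_x and data_y from above): with a very small C (a strong penalty), the l1 fit can zero out coefficients entirely, while the l2 fit only shrinks them toward zero.

In [ ]:
# strong penalty: l1 can produce exact zeros, l2 only shrinks
logitm_l1 = LogisticRegression(C=0.01, penalty="l1", solver='liblinear', max_iter=200)
logitm_l2 = LogisticRegression(C=0.01, penalty="l2", solver='lbfgs', max_iter=200)
print('l1 coefficients:', logitm_l1.fit(data_x, data_y).coef_)
print('l2 coefficients:', logitm_l2.fit(data_x, data_y).coef_)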

Multi-Class Logistic Regression (Multinomial)

In [ ]:
# Response for Multinomial Logistic Regression Example

print(df_heart.RestECG.values)
plt.hist(df_heart.RestECG.values)
plt.show()
In [ ]:
# Multi-class logistic regression example: one-vs-rest (OVR)

data_x = df_heart[['Sex']]

# RestECG: 0 = normal; 1 = having ST-T wave abnormality; 2 = hypertrophy
data_y = df_heart['RestECG']

# a huge C effectively turns off regularization
logitm = LogisticRegression(C=10000000, solver='lbfgs', multi_class='ovr')
logitm.fit(data_x, data_y)

# the coefficients: one row (and one intercept) per class
print('Estimated beta1: \n', logitm.coef_)
print('Estimated beta0: \n', logitm.intercept_)
In [ ]:
# same model, but with a true multinomial (softmax) likelihood
logitm2 = LogisticRegression(C=10000000, solver='lbfgs', multi_class='multinomial')
logitm2.fit(data_x, data_y)

# The coefficients
print('Estimated beta1: \n', logitm2.coef_)
print('Estimated beta0: \n', logitm2.intercept_)
In [ ]:
# OVR again, now with three predictors
logitm3 = LogisticRegression(C=10000000, solver='lbfgs', multi_class='ovr')
logitm3.fit(df_heart[['Sex','MaxHR','Interaction']], df_heart['RestECG'])

# The coefficients
print('Estimated beta1: \n', logitm3.coef_)
print('Estimated beta0: \n', logitm3.intercept_)
In [ ]:
# predicted class probabilities at the two possible values of Sex (0 and 1)
X = np.arange(0, 2)

print("For OVR Logistic Regression:")
print(logitm.predict_proba(X.reshape(-1,1)))
print(logitm.predict(X.reshape(-1,1)))

print("For Multinomial Logistic Regression:")
print(logitm2.predict_proba(X.reshape(-1,1)))
print(logitm2.predict(X.reshape(-1,1)))
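
As a sanity check, the multinomial fit's probabilities can be reconstructed by hand from its coefficients via the softmax: P(y = k | x) is proportional to exp(beta0_k + beta1_k * x). A minimal sketch, assuming the logitm2 fit above:

In [ ]:
# recompute logitm2's probabilities from its coefficients via the softmax
x = np.array([[0.], [1.]])
scores = logitm2.intercept_ + x @ logitm2.coef_.T   # shape: (2 inputs, 3 classes)
probs = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
print(probs)   # should match logitm2.predict_proba(x)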

Confusion Matrices and ROC Curves

In [ ]:
# refit as a binary model: predict AHD from Sex, MaxHR, and their interaction
logitm.fit(df_heart[['Sex','MaxHR','Interaction']], df_heart['AHD']);

# The coefficients
print('Estimated beta1: \n', logitm.coef_)
print('Estimated beta0: \n', logitm.intercept_)
In [ ]:
# calculating confusion matrices at several cut-offs

yhat = logitm.predict_proba(df_heart[['Sex','MaxHR','Interaction']])[:,1]

print('The average predicted probability is', np.mean(yhat))

print('The confusion matrix when cut-off is 0.5: \n', met.confusion_matrix(df_heart['AHD'], yhat > 0.5))
print('The confusion matrix when cut-off is', np.mean(yhat), '\n', met.confusion_matrix(df_heart['AHD'], yhat > np.mean(yhat)))
print('The confusion matrix when cut-off is 0.75: \n', met.confusion_matrix(df_heart['AHD'], yhat > 0.75))
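
Summary measures follow directly from the confusion matrix. sklearn orders it as [[TN, FP], [FN, TP]], so at the 0.5 cut-off, for example:

In [ ]:
# overall accuracy, sensitivity (TPR), and specificity (TNR) at cut-off 0.5
tn, fp, fn, tp = met.confusion_matrix(df_heart['AHD'], yhat > 0.5).ravel()
print('accuracy:   ', (tp + tn) / (tp + tn + fp + fn))
print('sensitivity:', tp / (tp + fn))
print('specificity:', tn / (tn + fp))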
In [ ]:
# ROC curve on the train set

fpr, tpr, thresholds = met.roc_curve(df_heart['AHD'], yhat)

x = np.arange(0, 101)/100
plt.plot(x, x, '--', color="gray", alpha=0.3)
plt.plot(fpr, tpr)
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.title("ROC Curve for Predicting AHD in a Logistic Regression Model")
plt.show()
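
The area under this ROC curve (AUC) summarizes the plot in one number; sklearn computes it directly from the same predicted probabilities:

In [ ]:
# AUC for the train-set predictions
print('AUC:', met.roc_auc_score(df_heart['AHD'], yhat))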

Regularization Example

In [ ]:
df_toy2 = pd.read_csv('data/toy_example2.csv')
df_toy2.head()
In [ ]:
# essentially unregularized logistic fit on the toy data
logreg = LogisticRegression(C=100000, solver='lbfgs')
logreg.fit(df_toy2[['x1','x2']], df_toy2['y'])

beta = logreg.coef_
beta0 = logreg.intercept_

print('Estimated beta1: \n', beta)
print('Estimated beta0: \n', beta0)
In [ ]:
plt.scatter(df_toy2['x1'][df_toy2['y']==0], df_toy2['x2'][df_toy2['y']==0])
plt.scatter(df_toy2['x1'][df_toy2['y']==1], df_toy2['x2'][df_toy2['y']==1])

# the decision boundary is where beta0 + beta1*x1 + beta2*x2 = 0,
# i.e. x2 = -(beta0 + beta1*x1)/beta2
dummy_x1 = np.arange(0, 1, 0.01)
dummy_x2 = (beta[0,0]*dummy_x1 + beta0)/-beta[0,1]

plt.plot(dummy_x1, dummy_x2, '--', c="black", lw=5)
plt.show()
In [ ]:
X = df_toy2[['x1','x2']]

# degree-10 polynomial expansion of (x1, x2), still essentially unregularized
logreg2 = LogisticRegression(C=100000, solver='liblinear')
poly = sk.preprocessing.PolynomialFeatures(10)
X_poly = poly.fit_transform(X)

logreg2.fit(X_poly, df_toy2['y'])

logreg2.coef_
In [ ]:
dummy_x1 = np.arange(0, 1.001, 0.001)

def find_boundary(dummy_x, model, poly):
    """For each x1 on the grid, scan x2 over the same grid and record where
    the predicted class flips. This assumes class 1 occupies the low-x2 side
    of the boundary, so the count of 1-predictions indexes the first x2
    predicted as class 0."""
    yhat = []

    for x_i in dummy_x:
        x1_col = np.repeat(x_i, dummy_x.size)            # hold x1 fixed, vary x2
        df_X = pd.DataFrame(np.array([x1_col, dummy_x]).transpose())
        df_X2 = poly.fit_transform(df_X)
        n_ones = np.sum(model.predict(df_X2))
        yhat.append(dummy_x[min(n_ones, dummy_x.size - 1)])  # guard the index

    return yhat


dummy_x2 = find_boundary(dummy_x1, logreg2, poly)
In [ ]:
plt.scatter(df_toy2['x1'][df_toy2['y']==0],df_toy2['x2'][df_toy2['y']==0])
plt.scatter(df_toy2['x1'][df_toy2['y']==1],df_toy2['x2'][df_toy2['y']==1])

plt.plot(dummy_x1,dummy_x2,'--',c="black",lw=3)
plt.show()
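
Notice how the degree-10 boundary twists to chase individual points, a sign of overfitting; the cross-validated, l1-regularized fit below reins this in.
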
In [ ]:
# cross-validate the l1 penalty over C in 10^-5, ..., 10^4 (5-fold CV)
Cs = 10.0**np.arange(-5, 5)

logregCV = LogisticRegressionCV(Cs=Cs, cv=5, solver='liblinear', penalty='l1')

logregCV.fit(X_poly, df_toy2['y']);
In [ ]:
print(logregCV.coef_)
#print(logregCV.scores_)
#print(logregCV.coefs_paths_)
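
The C_ attribute holds the value of C that cross-validation selected:

In [ ]:
print('C chosen by CV:', logregCV.C_)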
In [ ]:
# recompute all three decision boundaries on the finer grid
dummy_x2 = (beta[0,0]*dummy_x1 + beta0)/-beta[0,1]
dummy_x2poly = find_boundary(dummy_x1, logreg2, poly)
dummy_x2CV = find_boundary(dummy_x1, logregCV, poly)
In [ ]:
plt.rcParams["figure.figsize"] = [10, 6]

plt.scatter(df_toy2['x1'][df_toy2['y']==0], df_toy2['x2'][df_toy2['y']==0])
plt.scatter(df_toy2['x1'][df_toy2['y']==1], df_toy2['x2'][df_toy2['y']==1])

plt.plot(dummy_x1, dummy_x2, c="black", lw=3)
plt.plot(dummy_x1, dummy_x2poly, '--', c="red", lw=3)
plt.plot(dummy_x1, dummy_x2CV, '-.', c="purple", lw=3)
# legend entries must follow the order the artists were drawn in
plt.legend(['y=0', 'y=1', 'logit', 'poly(10) logit', 'regularized'])
plt.show()