Key Word(s): sklearn, logistic regression, classification, probabilities, boundaries
CS109A Introduction to Data Science
Lecture 10 (Logistic Regression)¶
Harvard University
Fall 2019
Instructors: Pavlos Protopapas, Kevin Rader, and Chris Tanner
In [ ]:
#from sklearn import datasets
import pandas as pd
%pylab inline
import matplotlib.pylab as plt
import numpy as np
import sklearn as sk
from sklearn.neighbors import NearestNeighbors
from sklearn import neighbors
from sklearn import linear_model
In [ ]:
df_heart = pd.read_csv('data/Heart.csv')
In [ ]:
df_heart.head()
In [ ]:
df_heart['AHD'] = 1*(df_heart['AHD'] == "Yes")
df_heart.AHD.head(10)
In [ ]:
plt.plot(df_heart.MaxHR, df_heart.AHD, 'o', alpha=0.4)
plt.ylim(-0.1,1.1)
plt.show()
#uh-oh, that's not good :(
In [ ]:
fig = plt.figure()
fig.patch.set_alpha(0.0)
plt.xkcd(scale=0.1, length=0.0)
plt.gcf().subplots_adjust(bottom=0.20, left = 0.16, right=0.86)
host = fig.add_subplot(111)
par1 = host.twinx()
host.set_xlabel("MaxHR")
host.set_ylabel("Probability")
par1.set_ylabel("AHD")
color1 = plt.cm.viridis(0)
data_x = df_heart['MaxHR']
data_y = df_heart['AHD']
regr = sk.linear_model.LinearRegression(fit_intercept=True)
regr.fit(data_x.values.reshape(-1,1), data_y)
# Make predictions using the testing set
x=np.linspace(np.min(data_x)-10,np.max(data_x)+10)
y_ = regr.predict(x.reshape(-1,1))
host.plot(data_x, data_y, 'o' ,alpha=0.4, label='Data')
host.plot(x, y_, label='LinReg')
host.legend(loc=3)
labels = ['No', 'Yes']
# You can specify a rotation for the tick labels in degrees or with keywords.
par1.set_yticks( [0.061, 0.83])
par1.set_yticklabels(labels)
plt.show()
#plt.savefig('fig/FittingLR.png', dpi=300, transparent=True)
Plot for linear regression -> Log Regression¶
In [ ]:
fig, ax1 = plt.subplots()
fig.patch.set_alpha(0.0)
plt.xkcd(scale=0.1, length=0.0)
plt.gcf().subplots_adjust(bottom=0.20, left = 0.16, right=0.86)
x = np.linspace(-100, 100, 100)
y = x
yl = 1/(1+np.exp(-y))
ax1.plot(x,y, label='Y=X')
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.legend()
#plt.savefig('fig/LinR.png', dpi=300, transparent=True)
In [ ]:
fig = plt.figure()
fig.patch.set_alpha(0.0)
plt.xkcd(scale=0.1, length=0.0)
plt.gcf().subplots_adjust(bottom=0.20, left = 0.16, right=0.86)
plt.plot(x,yl, label='Y=f(x)')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.grid()
#plt.savefig('fig/LogR.png', dpi=300, transparent=True)
In [ ]:
from matplotlib.animation import FuncAnimation
fig, ax = plt.subplots()
fig.patch.set_alpha(1.0)
plt.xkcd(scale=0.1, length=0.0)
plt.gcf().subplots_adjust(bottom=0.20, left = 0.16, right=0.86)
line, = ax.plot(x,yl, label=r'$Y=\frac{1}{1+e^{-(X+\beta_0) }}$')
plt.xlabel('X')
plt.ylabel('Y')
#plt.legend(loc=5)
def update(i):
b0=2*i
label = r'$\beta_0=$ {0}'.format(b0)
print(label)
# Update the line and the axes (with a new xlabel). Return a tuple of
# "artists" that have to be redrawn for this frame.
line.set_ydata( 1/(1+np.exp(-x+b0)))
ax.set_title(label)
return line, ax
# FuncAnimation will call the 'update' function for each frame; here
# animating over 10 frames, with an interval of 200ms between frames.
anim = FuncAnimation(fig, update, frames=np.arange(-20, 20,2), interval=300, blit=False)
#anim.save('fig/LogBeta0.gif', dpi=120, writer='imagemagick', savefig_kwargs={'transparent': True, 'facecolor': '#F9F9F9'})
#plt.savefig('fig/LogRBeta.png', dpi=300, transparent=True)
In [ ]:
from matplotlib.animation import FuncAnimation
fig, ax = plt.subplots()
fig.patch.set_alpha(1.0)
plt.xkcd(scale=0.1, length=0.0)
plt.gcf().subplots_adjust(bottom=0.20, left = 0.16, right=0.86)
line, = ax.plot(x,yl, label=r'$Y=\frac{1}{1+e^{-(X+\beta_0) }}$')
plt.xlabel('X')
plt.ylabel('Y')
#plt.legend(loc=5)
def update(i):
b1=2*i
label = r'$\beta_1=$ {0}'.format(np.round(b1, decimals=2))
print(label)
# Update the line and the axes (with a new xlabel). Return a tuple of
# "artists" that have to be redrawn for this frame.
line.set_ydata( 1/(1+np.exp(-b1*x)))
ax.set_title(label)
return line, ax
# FuncAnimation will call the 'update' function for each frame; here
# animating over 10 frames, with an interval of 200ms between frames.
anim = FuncAnimation(fig, update, frames=np.arange(.2, -.2,-.03), interval=200, blit=False)
anim.save('fig/LogBeta1.gif', dpi=120, writer='imagemagick', savefig_kwargs={'transparent': True, 'facecolor': '#F9F9F9'})
#plt.savefig('fig/LogRBeta.png', dpi=300, transparent=True)
Likelihood function¶
In [ ]:
fig = plt.figure()
fig.patch.set_alpha(0.0)
plt.xkcd(scale=0.1, length=0.0)
plt.gcf().subplots_adjust(bottom=0.20, left = 0.16, right=0.86)
x = np.linspace(-10, 10, 100)
y = x
yl = 1/(1+np.exp(-0.5*y))
#plt.plot(x,yl, label=r'$P(Y=1)=\frac{1}{1+e^{-X\beta}}$')
plt.plot(x,yl)
plt.xlabel('X')
plt.ylabel('P(Y=1)')
plt.legend()
plt.ylim((-0.2, 1.1))
#plt.savefig('fig/Likelihood1.png', dpi=300, transparent=True)
### FRAME 2
plt.axvline(x=3, color='#A2A2A2', ls='-.')
#plt.savefig('fig/Likelihood2.png', dpi=300, transparent=True)
### FRAME 3
plt.annotate(r' $P(y=1|x=3)$', xy=(3, .83),\
xytext=(-8, .81),
arrowprops=dict(facecolor='black', shrink=0.05),
)
plt.savefig('fig/Likelihood3.png', dpi=300, transparent=True)
### FRAME 4
plt.annotate(r' $p=P(y=1|x=3)$', xy=(3, .83),\
xytext=(-10, .81),
arrowprops=dict(facecolor='black', shrink=0.05),
)
plt.savefig('fig/Likelihood4.png', dpi=300, transparent=True)
### FRAME 5
plt.clf()
plt.ylim((-0.2, 1.1))
plt.plot(x,yl)
plt.xlabel('X')
plt.ylabel('P(Y=1)')
plt.axvline(x=3, color='#A2A2A2', ls='-.')
coinf = np.random.binomial(1, 0.7, size=1)
plt.plot( [3], [0], 'ko')
plt.plot( [3], [1], 'ko')
plt.annotate(r' $p$', xy=(2.7, 1),\
xytext=(-3, 0.98),
arrowprops=dict(facecolor='white', shrink=0.0),
)
plt.annotate(r' $1-p$', xy=(2.8, .00),\
xytext=(-4, -.02),
arrowprops=dict(facecolor='white', shrink=0.05),
)
plt.savefig('fig/Likelihood5.png', dpi=300, transparent=True)
plt.show()
Plots for simple model¶
In [ ]:
from sklearn import linear_model
fig = plt.figure()
fig.patch.set_alpha(0.0)
plt.xkcd(scale=0.1, length=0.0)
plt.gcf().subplots_adjust(bottom=0.20, left = 0.16, right=0.86)
plt.ylim((-0.1, 1.1))
plt.xlabel("MaxHR")
plt.ylabel("Heart disease (AHD)")
data_x = df_heart['MaxHR']
data_y = df_heart['AHD']
plt.plot(data_x, data_y, 'o' ,alpha=0.4, label='Data')
plt.show()
#plt.legend(loc=3)
#plt.savefig('fig/FittingLogR1.png', dpi=300, transparent=True)
In [ ]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=100000, fit_intercept=True,solver='lbfgs')
logreg.fit(data_x.values.reshape(-1,1), data_y);
print('Estimated beta1: \n', logreg.coef_)
print('Estimated beta0: \n', logreg.intercept_)
In [ ]:
fig = plt.figure()
fig.patch.set_alpha(0.0)
plt.xkcd(scale=0.1, length=0.0)
plt.gcf().subplots_adjust(bottom=0.20, left = 0.16, right=0.86)
x=np.linspace(np.min(data_x),np.max(data_x))
yhat = logreg.predict_proba(x.reshape(-1,1))[:,1]
plt.plot(data_x, data_y, 'o' ,alpha=0.4, label='Data')
plt.plot(x,yhat, label='Model')
plt.legend()
plt.xlabel("MaxHR")
plt.ylabel("Heart disease (AHD)")
plt.savefig('fig/FittingLogR2.png', dpi=300, transparent=True)
Categorical predictors¶
In [ ]:
data_x = df_heart['Sex']
data_y = df_heart['AHD']
idx0 = np.where(data_x.values==0)
idx1 = np.where(data_x.values==1)
print("percentage of females with HD", data_y.values[idx0].sum()/idx0[0].shape)
print("percentage of males with HD", data_y.values[idx1].sum()/idx1[0].shape)
pd.crosstab(df_heart['Sex'],df_heart['AHD'])
In [ ]:
fig = plt.figure()
fig.patch.set_alpha(0.0)
plt.xkcd(scale=0.1, length=0.0)
plt.gcf().subplots_adjust(bottom=0.20, left = 0.16, right=0.86)
plt.ylim((-0.1, 1.1))
plt.xlabel("Sex")
plt.ylabel("Heart disease (AHD)")
data_x = df_heart['Sex']
data_y = df_heart['AHD']
logreg.fit(data_x.values.reshape(-1,1), data_y);
x=np.linspace(np.min(data_x),np.max(data_x))
yhat = logreg.predict_proba(x.reshape(-1,1))[:,1]
plt.plot(data_x, data_y, 'o' ,alpha=0.4, label='Data')
plt.plot(x,yhat, label='Model')
plt.show()
In [ ]:
print('Estimated beta1: \n', logreg.coef_)
print('Estimated beta0: \n', logreg.intercept_)
In [ ]:
Probit¶
In [ ]:
from scipy.stats import logistic
from scipy.stats import norm
fig = plt.figure()
fig.patch.set_alpha(0.0)
plt.xkcd(scale=0.1, length=0.0)
plt.gcf().subplots_adjust(bottom=0.20, left = 0.16, right=0.86)
x = np.linspace(logistic.ppf(0.01), logistic.ppf(0.99), 100)
plt.plot(x, logistic.pdf(x,loc=0, scale=1), 'r-', lw=5, alpha=0.6, label='logistic pdf')
plt.plot(x, norm.pdf(x,loc=0, scale=1), 'b-', lw=5, alpha=0.6, label='normal pdf')
plt.xlabel('X')
plt.ylabel('Probability')
plt.legend()
plt.savefig('fig/NormVsLog.png', dpi=300, transparent=True)
Multiple Logistic Regression¶
In [ ]:
data_x = df_heart[['MaxHR','Sex']]
data_y = df_heart['AHD']
logreg.fit(data_x, data_y);
print('Estimated beta1, beta2: \n', logreg.coef_)
print('Estimated beta0: \n', logreg.intercept_)
In [ ]:
df_heart['Interaction'] = df_heart.MaxHR * df_heart.Sex
data_x = df_heart[['MaxHR','Sex', 'Interaction']]
data_y = df_heart['AHD']
logreg = LogisticRegression(C=100000, fit_intercept=True, solver='lbfgs')
logreg.fit(data_x, data_y);
print('Estimated beta1, beta2, beta3: \n', logreg.coef_)
print('Estimated beta0: \n', logreg.intercept_)
Multi-Class (Multinomial)¶
In [ ]:
# Response for Multinomial Logistic Regression Example
print(df_heart.RestECG.values)
plt.hist(df_heart.RestECG.values)
plt.show()
In [ ]:
# Multinomial Logistic Regression Example
data_x = df_heart[['Sex']]
data_y = df_heart['RestECG']
logitm = LogisticRegression(C = 10000000,solver='lbfgs')
logitm.fit(data_x, data_y)
# The coefficients
print('Estimated beta1: \n', logitm.coef_)
print('Estimated beta0: \n', logitm.intercept_)
In [ ]:
logitm = LogisticRegression(C = 10000000,solver='lbfgs',multi_class='ovr')
logitm.fit (data_x, data_y)
# The coefficients
print('Estimated beta1: \n', logitm.coef_)
print('Estimated beta0: \n', logitm.intercept_)
In [ ]:
logitm = LogisticRegression(C = 10000000,solver='lbfgs',multi_class='multinomial')
logitm.fit (data_x, data_y)
# The coefficients
print('Estimated beta1: \n', logitm.coef_)
print('Estimated beta0: \n', logitm.intercept_)
In [ ]:
logitm = LogisticRegression(C = 10000000,solver='lbfgs')
logitm.fit (df_heart[['Sex','MaxHR','Interaction']], df_heart['RestECG'])
# The coefficients
print('Estimated beta1: \n', logitm.coef_)
print('Estimated beta0: \n', logitm.intercept_)
In [ ]: