CS-109A Introduction to Data Science
Lecture 11: Logistic Regression 2 Demo¶
Harvard University
Fall 2018
Instructors: Pavlos Protopapas and Kevin Rader
## RUN THIS CELL TO PROPERLY HIGHLIGHT THE EXERCISES
import requests
from IPython.core.display import HTML
styles = requests.get("https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css").text
HTML(styles)
import sys
import numpy as np
import pandas as pd
import statsmodels.api as sm
import sklearn.linear_model as sk
import matplotlib.pyplot as plt
%matplotlib inline
df_heart = pd.read_csv('../data/heart.csv')
df_heart.head()
plt.plot(df_heart.MaxHR, df_heart.AHD, 'o', alpha=0.1)
from sklearn import linear_model
fig = plt.figure()
fig.patch.set_alpha(0.0)
plt.xkcd(scale=0.1, length=0.0)
plt.gcf().subplots_adjust(bottom=0.20, left = 0.16, right=0.86)
host = fig.add_subplot(111)
par1 = host.twinx()
host.set_xlabel("MaxHR")
host.set_ylabel("Probability")
par1.set_ylabel("AHD")
color1 = plt.cm.viridis(0)
data_x = df_heart.MaxHR
data_y = df_heart.AHD.map(lambda x: 0 if x=='No' else 1)
regr = linear_model.LinearRegression(fit_intercept=True)
regr.fit(data_x.values.reshape(-1,1), data_y.values)  # sklearn expects a 2-D X
# Evaluate the fit over a grid spanning the observed MaxHR range
x=np.linspace(np.min(data_x),np.max(data_x))
y_ = regr.predict(x.reshape(-1,1))
host.plot(data_x, data_y, 'o' ,alpha=0.1, label='Data')
host.plot(x, y_, label='LinReg')
host.legend(loc=3)
labels = ['No', 'Yes']
# Label the secondary axis with the original 'No'/'Yes' categories
par1.set_yticks( [0.061, 0.83])
par1.set_yticklabels(labels)
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=100000, fit_intercept=True)
logreg.fit(data_x.values.reshape(-1,1), data_y);
print('Estimated beta1: \n', logreg.coef_)
print('Estimated beta0: \n', logreg.intercept_)
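Since the model is linear in the log-odds, $e^{\beta_1}$ is the multiplicative change in the odds of heart disease per unit of MaxHR; a minimal interpretation sketch using the fit above:
# A minimal sketch: exponentiate beta1 to read it as an odds ratio.
# Each additional unit of MaxHR multiplies the odds of AHD by this factor.
print('Odds ratio per unit MaxHR:', np.exp(logreg.coef_[0][0]))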
fig = plt.figure()
fig.patch.set_alpha(0.0)
plt.xkcd(scale=0.1, length=0.0)
plt.gcf().subplots_adjust(bottom=0.20, left = 0.16, right=0.86)
x=np.linspace(np.min(data_x),np.max(data_x))
y_ = logreg.predict_proba(x.reshape(-1,1))[:,1]  # column 1 is P(AHD = 1)
plt.plot(data_x, data_y, 'o' ,alpha=0.1, label='Data')
plt.plot(x,y_, label='Model')
plt.legend()
plt.xlabel("MaxHR")
plt.ylabel("Heart disease (AHD)")
plt.savefig('FittingLogR2.png', dpi=300, transparent=True)
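To connect the fitted coefficients back to the logistic formula, a minimal sketch that recomputes $P(\text{AHD}=1)$ by hand via the inverse logit and checks it against `predict_proba`; the MaxHR value of 150 is just an illustrative choice:
# A minimal sketch: recompute P(AHD = 1) from the fitted coefficients and
# compare with sklearn's predict_proba.
x0 = 150  # an illustrative MaxHR value
p_manual = 1.0/(1.0 + np.exp(-(logreg.intercept_[0] + logreg.coef_[0][0]*x0)))
p_sklearn = logreg.predict_proba(np.array([[x0]]))[0,1]
print(p_manual, p_sklearn)  # should agree up to floating point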
Multiple Logistic Regression¶
Just like in linear regression, the logistic regression model can be extended to incorporate multiple predictors/features. The logistic model (written in the log-odds format) can then be written as: $$ \ln\left(\frac{P(Y=1)}{1-P(Y=1)}\right) = \beta_0+\beta_1 X_1+\dots+\beta_p X_p .$$ Maximum likelihood methods can still be used to generate estimates for the $p+1$ parameters: $\beta_0,\beta_1,\dots,\beta_p $.
data_x = df_heart[['MaxHR','Sex']]
data_y = df_heart['AHD']
logreg = LogisticRegression(C=100000, fit_intercept=True)
logreg.fit(data_x, data_y);
print('Estimated beta1, beta2: \n', logreg.coef_)
print('Estimated beta0: \n', logreg.intercept_)
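For inference on the maximum likelihood estimates (standard errors, p-values), a minimal sketch of the same model via `statsmodels` (imported above as `sm`); note that `sm.Logit` needs a numeric 0/1 response and an explicit intercept column:
# A minimal sketch: the same model via statsmodels, which reports standard
# errors and p-values for the maximum likelihood estimates.
X_sm = sm.add_constant(df_heart[['MaxHR','Sex']])   # add the intercept column
y_sm = (df_heart['AHD'] == 'Yes').astype(int)       # encode Yes/No as 1/0
logit_sm = sm.Logit(y_sm, X_sm).fit()
print(logit_sm.summary())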
Regularization in Logistic Regression¶
As in linear regression, the logistic regression coefficients can be shrunk with an L1 or L2 penalty. In sklearn the tuning parameter is C, the inverse of the regularization strength $\lambda$: smaller C means more shrinkage, and the L1 penalty tends to force coefficients exactly to zero. Below we trace $\hat{\beta}_1$ for the MaxHR model over a grid of C values.
beta1_l1 = []
beta1_l2 = []
Cs = []
data_x = df_heart[['MaxHR']]
data_y = df_heart['AHD']
for i in range(1, 500):
    C = i/100
    # liblinear supports both the l1 and l2 penalties
    logitm_l1 = sk.LogisticRegression(C=C, penalty="l1", solver="liblinear")
    logitm_l1.fit(data_x, data_y)
    logitm_l2 = sk.LogisticRegression(C=C, penalty="l2", solver="liblinear")
    logitm_l2.fit(data_x, data_y)
    beta1_l1.append(logitm_l1.coef_[0])
    beta1_l2.append(logitm_l2.coef_[0])
    Cs.append(C)
plt.plot(np.array(Cs), beta1_l1, color='black', lw=3)
plt.plot(np.array(Cs), beta1_l2, color='blue', lw=3)
plt.xlabel ("lambda = C")
plt.ylabel("beta1")
plt.show()
print(1.0/np.array(Cs))  # the corresponding regularization strengths lambda = 1/C
Logistic Regression for $Y$ with more than 2 Categories¶
There are several extensions to standard logistic regression when the response variable $Y$ has more than 2 categories. The two most common are ordinal logistic regression and multinomial logistic regression. Ordinal logistic regression is used when the categories have a specific hierarchy (like class year: Freshman, Sophomore, Junior, Senior; or a 7-point rating scale from strongly disagree to strongly agree). Multinomial logistic regression is used when the categories have no inherent order (like eye color: blue, green, brown, hazel, etc.).
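For $K$ unordered categories, the multinomial model generalizes the log-odds form above by fitting $K-1$ sets of coefficients against a baseline category (written here as category $K$): $$ \ln\left(\frac{P(Y=k)}{P(Y=K)}\right) = \beta_{0k}+\beta_{1k} X_1+\dots+\beta_{pk} X_p, \qquad k=1,\dots,K-1 .$$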
#read the NFL play-by-play data
nfldata = pd.read_csv("NFLplaybyplay-2015.csv")
# shuffle the data
nfldata = nfldata.reindex(np.random.permutation(nfldata.index))
# For simplicity, we will select only N = 500 points from the (shuffled) dataset.
N = 500
nfldata = nfldata.iloc[:N]
X = nfldata[["YardLine"]]
# Encode the play type: 0 = other, 1 = pass, 2 = rush
nfldata["PlayType"] = nfldata["IsPass"] + 2*nfldata["IsRush"]
logitm = sk.LogisticRegression(C = 10000000)
logitm.fit(X, nfldata["PlayType"])
# The coefficients: one row of slopes and one intercept per play type
print('Estimated betas: \n', logitm.coef_)
print('Estimated beta0s: \n', logitm.intercept_)
# Predicted play-type probabilities along the field (yard lines 0-99)
yards = np.arange(100)
print(logitm.predict_proba(yards.reshape(-1,1))[0:10,:])
print(logitm.predict(yards.reshape(-1,1)))
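As a sanity check, a minimal sketch (using the `yards` grid above) confirming that each row of `predict_proba` is a probability distribution over the three play types, and that `predict` returns the most probable class:
# A minimal sketch: rows of predict_proba sum to 1, and predict picks the argmax.
probs = logitm.predict_proba(yards.reshape(-1,1))
print(probs.sum(axis=1)[0:10])  # each row sums to 1
print((logitm.classes_[probs.argmax(axis=1)] == logitm.predict(yards.reshape(-1,1))).all())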
#read in party ID data
party_df = pd.read_csv("gsspartyid.csv")
party_df.head()
#quick summaries of party ID data
print(party_df['politicalparty'].value_counts())
party_df.describe()
#estimating logistic regression for party ID classification
party_df['logincome'] = np.log(party_df['income'].values)  # .values is an attribute, not a method
train_x = party_df[['income','educ','abortion']]
train_y = party_df['republican']
logitm = sk.LogisticRegression(C = 10000000)
logitm.fit(train_x, train_y)
# The coefficients
print('Estimated beta1, beta2, beta3: \n', logitm.coef_)
print('Estimated beta0: \n', logitm.intercept_)
#calculating confusion matrices at several classification thresholds
import sklearn.metrics as met
yhat = logitm.predict_proba(train_x)[:,1]  # column 1 is P(republican = 1)
print(met.confusion_matrix(party_df['republican'], yhat>0.5))
print(met.confusion_matrix(party_df['republican'], yhat>0.48))
print(met.confusion_matrix(party_df['republican'], yhat>0.72))
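To read these matrices, a minimal sketch (at the 0.5 threshold) that unpacks sklearn's [[TN, FP], [FN, TP]] layout into the true and false positive rates that the ROC curve below traces out:
# A minimal sketch: derive TPR and FPR from the confusion matrix entries.
# For 0/1 labels, sklearn lays the matrix out as [[TN, FP], [FN, TP]].
tn, fp, fn, tp = met.confusion_matrix(party_df['republican'], yhat>0.5).ravel()
print('TPR (sensitivity):', tp/(tp + fn))
print('FPR (1 - specificity):', fp/(fp + tn))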
#ROC curve on the train set
fpr, tpr, thresholds = met.roc_curve(party_df['republican'], yhat)
x=np.arange(0,100)/100
plt.plot(x, x, '--', color="gray", alpha=0.3)  # chance line
plt.plot(fpr, tpr)  # x = false positive rate, y = true positive rate
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.title("ROC Curve for the Party ID Logistic Regression Model")
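A common one-number summary of the ROC curve is the area under it (AUC); a minimal sketch using `sklearn.metrics.roc_auc_score` on the same training-set predictions:
# A minimal sketch: area under the ROC curve (AUC) on the train set.
# 0.5 matches the gray chance line; 1.0 is perfect separation.
print(met.roc_auc_score(party_df['republican'], yhat))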