CS-109A Introduction to Data Science

Lecture 11: Logistic Regression 2 Demo

Harvard University
Fall 2018
Instructors: Pavlos Protopapas and Kevin Rader


In [2]:
## RUN THIS CELL TO PROPERLY HIGHLIGHT THE EXERCISES
import requests
from IPython.core.display import HTML
styles = requests.get("https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css").text
HTML(styles)
Out[2]:

In [3]:
import sys
import numpy as np
import pandas as pd
import statsmodels.api as sm
import sklearn.linear_model as sk
import matplotlib.pyplot as plt
%matplotlib inline
In [4]:
df_heart = pd.read_csv('../data/heart.csv')
df_heart.head()
Out[4]:
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No
1 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No
4 5 41 0 nontypical 130 204 0 2 172 0 1.4 1 0.0 normal No
In [5]:
plt.plot(df_heart.MaxHR, df_heart.AHD, 'o', alpha=0.1)
Out[5]:
[Figure: scatter of AHD ('No'/'Yes') vs. MaxHR]
In [4]:
from sklearn import linear_model

fig = plt.figure()
fig.patch.set_alpha(0.0)
plt.xkcd(scale=0.1, length=0.0)
plt.gcf().subplots_adjust(bottom=0.20, left = 0.16, right=0.86)

host = fig.add_subplot(111)

par1 = host.twinx()

host.set_xlabel("MaxHR")
host.set_ylabel("Probability")
par1.set_ylabel("AHD")

color1 = plt.cm.viridis(0)

data_x = df_heart.MaxHR
data_y = df_heart.AHD.map(lambda x: 0 if x=='No' else 1)

regr = linear_model.LinearRegression(fit_intercept=True)

regr.fit(data_x.values.reshape(-1,1), data_y.values.reshape(-1,1))

# Predict over a grid of MaxHR values
x = np.linspace(np.min(data_x), np.max(data_x))
y_ = regr.predict(x.reshape(-1,1))

host.plot(data_x, data_y, 'o' ,alpha=0.1, label='Data')
host.plot(x, y_, label='LinReg')
host.legend(loc=3)

labels = ['No', 'Yes']

# Label the right-hand axis with the original No/Yes categories
par1.set_yticks( [0.061, 0.83])
par1.set_yticklabels(labels)
Out[4]:
[Figure: linear regression fit to the 0/1 AHD outcomes vs. MaxHR]
In [5]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=100000, fit_intercept=True)  # large C => essentially no regularization
logreg.fit(data_x.values.reshape(-1,1), data_y);

print('Estimated beta1: \n', logreg.coef_)
print('Estimated beta0: \n', logreg.intercept_)
Estimated beta1: 
 [[-0.04326016]]
Estimated beta0: 
 [ 6.30193148]
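To connect these estimates back to probabilities, the fitted log-odds can be pushed through the logistic (sigmoid) function by hand. Here is a minimal sketch (not part of the original demo) using the logreg object fit above; the MaxHR value of 150 is just an illustrative choice.

# P(AHD = Yes | MaxHR) = 1 / (1 + exp(-(beta0 + beta1 * MaxHR)))
beta0 = logreg.intercept_[0]
beta1 = logreg.coef_[0][0]

maxhr = 150  # illustrative value, not from the original demo
p_manual = 1.0 / (1.0 + np.exp(-(beta0 + beta1*maxhr)))
p_sklearn = logreg.predict_proba(np.array([[maxhr]]))[0, 1]
print(p_manual, p_sklearn)  # the two should agree up to floating point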
In [12]:
fig = plt.figure()
fig.patch.set_alpha(0.0)
plt.xkcd(scale=0.1, length=0.0)
plt.gcf().subplots_adjust(bottom=0.20, left = 0.16, right=0.86)

x=np.linspace(np.min(data_x),np.max(data_x))
y_ = logreg.predict_proba(x.reshape(-1,1))[:,1]   # predicted probability of AHD = Yes (class 1)
plt.plot(data_x, data_y, 'o' ,alpha=0.1, label='Data')
plt.plot(x,y_, label='Model')
plt.legend()

plt.xlabel("MaxHR")
plt.ylabel("Heart disease (AHD)")

plt.savefig('FittingLogR2.png', dpi=300, transparent=True)

Multiple Logistic Regression

Just like in linear regression, the logistic regression model can be extended to incorporate multiple predictors/features. The logistic model (written in the log-odds format) can then be written as: $$ \ln\left(\frac{P(Y=1)}{1-P(Y=1)}\right) = \beta_0+\beta_1 X_1+\dots+\beta_p X_p .$$ Maximum likelihood methods can still be used to generate estimates for the $p+1$ parameters: $\beta_0,\beta_1,\dots,\beta_p $.

In [13]:
data_x = df_heart[['MaxHR','Sex']]
data_y = df_heart['AHD']

logreg = LogisticRegression(C=100000, fit_intercept=True)
logreg.fit(data_x, data_y);


print('Estimated beta1, beta2: \n', logreg.coef_)
print('Estimated beta0: \n', logreg.intercept_)
Estimated beta1, beta2: 
 [[-0.04496354  1.40079047]]
Estimated beta0: 
 [ 5.58662464]
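sklearn reports point estimates only. Since statsmodels is already imported as sm, the same multiple logistic regression can be refit with it to obtain standard errors and p-values for the estimates. This is a sketch added here for reference, assuming the same MaxHR and Sex predictors:

# Refit the two-predictor model with statsmodels to obtain inference on the betas
X_sm = sm.add_constant(df_heart[['MaxHR', 'Sex']])            # adds the intercept column
y_sm = df_heart['AHD'].map(lambda x: 0 if x == 'No' else 1)   # recode Yes/No as 1/0

logit_sm = sm.Logit(y_sm, X_sm).fit()
print(logit_sm.summary())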

Regularization in Logistic Regression

In [104]:
beta1_l1 = []
beta1_l2 = []
Cs = []
data_x = df_heart[['MaxHR']]
data_y = df_heart['AHD']

# In sklearn, C is the inverse of the regularization strength (C = 1/lambda),
# so small C means heavy shrinkage of beta1 toward 0
for i in range(1, 500):
    C = i/100
    logitm_l1 = sk.LogisticRegression(C = C, penalty = "l1", solver = "liblinear")
    logitm_l1.fit(data_x, data_y)
    logitm_l2 = sk.LogisticRegression(C = C, penalty = "l2")
    logitm_l2.fit(data_x, data_y)
    beta1_l1.append(logitm_l1.coef_[0])
    beta1_l2.append(logitm_l2.coef_[0])
    Cs.append(C)

plt.plot(np.array(Cs), beta1_l1, color='black', lw=3, label='L1 (lasso) penalty')
plt.plot(np.array(Cs), beta1_l2, color='blue', lw=3, label='L2 (ridge) penalty')
plt.legend()
plt.xlabel("C (inverse regularization strength)")
plt.ylabel("beta1")
plt.show()
In [93]:
# the corresponding regularization strengths lambda = 1/C
print(1.0/np.array(Cs))
[ 100.           50.           33.33333333   25.           20.
   16.66666667   14.28571429   12.5          11.11111111   10.
    ...
    0.2016129     0.20120724    0.20080321    0.2004008 ]
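The coefficient paths above show beta1 shrinking toward 0 as C decreases (i.e. as lambda grows). In practice C is usually chosen by cross-validation rather than by eye; here is a minimal added sketch with sklearn's LogisticRegressionCV, assuming the same MaxHR predictor:

# Pick the regularization strength by 5-fold cross-validation
from sklearn.linear_model import LogisticRegressionCV

logit_cv = LogisticRegressionCV(Cs=np.logspace(-3, 3, 20), cv=5, penalty='l2')
logit_cv.fit(df_heart[['MaxHR']], df_heart['AHD'])

print("Best C:", logit_cv.C_[0])
print("beta1 at best C:", logit_cv.coef_[0])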

Logistic Regression for $Y$ with more than 2 Categories

There are several extensions to standard logistic regression when the response variable $Y$ has more than 2 categories. The two most common are ordinal logistic regression and multinomial logistic regression. Ordinal logistic regression is used when the categories have a specific hierarchy (like class year: Freshman, Sophomore, Junior, Senior; or a 7-point rating scale from strongly disagree to strongly agree). Multinomial logistic regression is used when the categories have no inherent order (like eye color: blue, green, brown, hazel, etc.).
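Before the NFL example below, here is a small synthetic sketch (not part of the original demo) contrasting the two ways sklearn can fit an unordered 3-category response: the default one-vs-rest scheme versus a single multinomial (softmax) model. The multi_class argument may behave differently across sklearn versions.

from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
x_toy = rng.uniform(0, 100, size=(300, 1))                            # one quantitative predictor
y_toy = np.digitize(x_toy[:, 0] + rng.normal(0, 20, 300), [40, 70])   # three classes: 0, 1, 2

ovr = LogisticRegression(C=100000, multi_class='ovr', solver='lbfgs').fit(x_toy, y_toy)
softmax = LogisticRegression(C=100000, multi_class='multinomial', solver='lbfgs').fit(x_toy, y_toy)

# Both produce one row of coefficients per class, and each row of
# predicted probabilities sums to 1
print(ovr.coef_.shape, softmax.coef_.shape)
print(softmax.predict_proba(x_toy[:3]).sum(axis=1))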

In [21]:
#read the NFL play-by-play data
nfldata = pd.read_csv("NFLplaybyplay-2015.csv")

# shuffle the data
nfldata = nfldata.reindex(np.random.permutation(nfldata.index))

# N points could be subsampled for simplicity; here the full dataset is used
N = 500

X = nfldata[["YardLine"]]

# PlayType: 0 = other, 1 = pass, 2 = rush
nfldata["PlayType"] = nfldata["IsPass"] + 2*nfldata["IsRush"]

# sklearn fits one binary logistic regression per class (one-vs-rest by default),
# so there is one row of coefficients for each of the three play types
logitm = sk.LogisticRegression(C = 10000000)
logitm.fit(X, nfldata["PlayType"])

# The coefficients
print('Estimated beta1: \n', logitm.coef_)
print('Estimated beta0: \n', logitm.intercept_)
Estimated beta1: 
 [[-0.01460736]
 [ 0.00635893]
 [ 0.00652455]]
Estimated beta0: 
 [-0.26422696 -0.61186328 -1.20051275]
In [33]:
X = np.arange(100)  # a grid of yard lines, 0 to 99
print(logitm.predict_proba(X.reshape(-1,1))[0:10,:])   # columns: P(other), P(pass), P(rush)
print(logitm.predict(X.reshape(-1,1)))
[[ 0.42692074  0.34563977  0.22743948]
 [ 0.42380137  0.34739801  0.22880062]
 [ 0.42067735  0.34915746  0.23016519]
 [ 0.41754902  0.35091792  0.23153306]
 [ 0.41441671  0.35267918  0.23290411]
 [ 0.41128078  0.35444103  0.23427819]
 [ 0.40814155  0.35620326  0.23565519]
 [ 0.40499938  0.35796565  0.23703497]
 [ 0.40185462  0.35972799  0.23841739]
 [ 0.39870763  0.36149006  0.23980231]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
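To visualize the three-category model, the predicted play-type probabilities can be plotted along the field. This is a short added sketch reusing the logitm fit and the yard-line grid X from the cell above; the column order follows the sorted class labels 0 = other, 1 = pass, 2 = rush.

# Plot P(other), P(pass), P(rush) as a function of yard line
probs = logitm.predict_proba(X.reshape(-1, 1))
for k, name in enumerate(['other', 'pass', 'rush']):
    plt.plot(X, probs[:, k], label=name)
plt.xlabel("YardLine")
plt.ylabel("Predicted probability")
plt.legend()
plt.show()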
In [43]:
#read in party ID data
party_df = pd.read_csv("gsspartyid.csv")
party_df.head()
Out[43]:
politicalparty income educ abortion republican
0 Republican 2152 9 0 1
1 Republican 906 6 0 1
2 Democrat 1373 6 0 0
3 Democrat 1941 4 0 0
4 Democrat 355 7 0 0
In [46]:
#quick summaries of party ID data
print(party_df['politicalparty'].value_counts())
party_df.describe()
Democrat      775
Republican    532
Name: politicalparty, dtype: int64
Out[46]:
income educ abortion republican
count 1307.000000 1307.000000 1307.000000 1307.000000
mean 1088.351186 7.970926 0.456006 0.407039
std 714.286949 3.767038 0.498251 0.491470
min 1.000000 1.000000 0.000000 0.000000
25% 451.500000 5.000000 0.000000 0.000000
50% 1083.000000 7.000000 0.000000 0.000000
75% 1725.000000 9.000000 1.000000 1.000000
max 2315.000000 21.000000 1.000000 1.000000
In [49]:
#estimating logistic regression for party ID classification

party_df['logincome'] = np.log(party_df['income'].values)   # .values is an attribute, not a method

train_x = party_df[['income','educ','abortion']]
train_y = party_df[['republican']]

logitm = sk.LogisticRegression(C = 10000000)
logitm.fit (train_x,party_df['republican'])

# The coefficients
print('Estimated beta1: \n', logitm.coef_)
print('Estimated beta0: \n', logitm.intercept_)
Estimated beta1: 
 [[ -5.35180582e-05  -1.22975923e-02  -9.96214452e-01]]
Estimated beta0: 
 [ 0.20927611]
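The raw coefficients are on the log-odds scale; exponentiating them gives odds ratios per one-unit change in each predictor. A quick added sketch using the fit above:

# e.g. exp(beta) for 'abortion' is the multiplicative change in the odds of
# identifying as Republican associated with a one-unit change in that predictor
print('Odds ratios: \n', np.exp(logitm.coef_))
print('Baseline odds (at all predictors = 0): \n', np.exp(logitm.intercept_))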
In [61]:
#calculating confusion matrices
import sklearn.metrics as met

# predicted probability of the *first* class (republican = 0), i.e. P(not Republican);
# the thresholds below are applied to this probability
yhat = logitm.predict_proba(train_x)[:,0]

print(met.confusion_matrix(party_df['republican'], yhat>0.5))
print(met.confusion_matrix(party_df['republican'], yhat>0.48))
print(met.confusion_matrix(party_df['republican'], yhat>0.72))
[[288 487]
 [311 221]]
[[143 632]
 [138 394]]
[[539 236]
 [455  77]]
In [82]:
#ROC curve on the train set, using the predicted probability of republican = 1
p_repub = logitm.predict_proba(train_x)[:,1]
fpr, tpr, thresholds = met.roc_curve(party_df['republican'], p_repub)

x = np.arange(0, 100)/100
plt.plot(x, x, '--', color="gray", alpha=0.3)  # reference line for a no-skill classifier
plt.plot(fpr, tpr)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for the Party ID Logistic Regression Model")
Out[82]:
[Figure: ROC curve for the party ID model]
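A common single-number summary of the ROC curve is the area under it (AUC), where 0.5 is no better than random guessing and 1.0 is a perfect ranking. A short sketch (added here) with sklearn.metrics, using the predicted probability of republican = 1:

print("AUC:", met.roc_auc_score(party_df['republican'],
                                logitm.predict_proba(train_x)[:, 1]))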