CS-109A Introduction to Data Science

Lecture 14: Discriminant Analysis Demo

Harvard University
Fall 2018
Instructors: Pavlos Protopapas and Kevin Rader


In [1]:
## RUN THIS CELL TO PROPERLY HIGHLIGHT THE EXERCISES
import requests
from IPython.core.display import HTML
styles = requests.get("https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css").text
HTML(styles)
Out[1]:

In [ ]:
import sys
import numpy as np
import pylab as pl
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib
import sklearn as sk
import sklearn.discriminant_analysis as da
import sklearn.neighbors as knn
In [ ]:
#read in the GSS data
gssdata = pd.read_csv("gsspartyid.csv")
gssdata.head()
In [ ]:
LDA = da.LinearDiscriminantAnalysis()
X = gssdata[["income","educ","abortion"]]
model_LDA = LDA.fit(X,gssdata['republican'])
print("Specificity is",np.mean(model_LDA.predict(X[gssdata['republican']==0])))
print("Sensitivity is",1-np.mean(model_LDA.predict(X[gssdata['republican']==1])))
print("False positive rate is",np.mean(gssdata['republican'][model_LDA.predict(X)==1]))
print("False negative rate is",1-np.mean(gssdata['republican'][model_LDA.predict(X)==0]))
In [ ]:
#read in the first simulation data set
data1 = pd.read_csv("dataset1.csv")
X = data1[['x1','x2']]
data1['x1sq'] = data1['x1']**2
data1['x2sq'] = data1['x2']**2
X2 = data1[['x1','x2','x1sq','x2sq']]
y = data1['y']
n = data1.shape[0]
data1.head()
In [ ]:
lda = da.LinearDiscriminantAnalysis()
qda = da.QuadraticDiscriminantAnalysis()
lda.fit(X,y)
qda.fit(X,y)
logit1 = sk.linear_model.LogisticRegression(C = 1000000)
logit2 = sk.linear_model.LogisticRegression(C = 1000000)
logit1.fit(X,y)
logit2.fit(X2,y)

print("Overall misclassification rate of Logit1 is",(1-logit1.score(X,y)))
print("Overall misclassification rate of Logit2 is",(1-logit2.score(X2,y)))
print("Overall misclassification rate of LDA is",(1-lda.score(X,y)))
print("Overall misclassification rate of QDA is",(1-qda.score(X,y)))
In [ ]:
knn3=knn.KNeighborsClassifier(3)
knn25=knn.KNeighborsClassifier(25)
knn3.fit(X,y)
knn25.fit(X,y)

#note, the KNNs are not correct :(
print("Overall misclassification rate of kNN3 is",(1-knn3.score(X,y)))
print("Overall misclassification rate of kNN25 is",(1-knn25.score(X,y)))