In [7]:
import sys

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as pl
import sklearn as sk
import sklearn.discriminant_analysis as da
import sklearn.linear_model
import sklearn.neighbors as knn
import statsmodels.api as sm
In [2]:
# Load the General Social Survey party-identification data
gss_path = "../data/gsspartyid.csv"
gssdata = pd.read_csv(gss_path)
gssdata.head()
Out[2]:
In [3]:
# Fit LDA predicting Republican party ID from income, education, and
# abortion stance, then report training-set classification metrics.
LDA = da.LinearDiscriminantAnalysis()
X = gssdata[["income","educ","abortion"]]
model_LDA = LDA.fit(X,gssdata['republican'])

# With 0/1 labels, the mean of the predictions over a subgroup is the
# fraction of that subgroup predicted to be Republican.
pred_given_dem = model_LDA.predict(X[gssdata['republican']==0])
pred_given_rep = model_LDA.predict(X[gssdata['republican']==1])

# Specificity (true negative rate): P(predict 0 | actual 0).
# The original printed P(predict 1 | actual 0) here, which is the FPR.
print("Specificity is",1-np.mean(pred_given_dem))
# Sensitivity (true positive rate): P(predict 1 | actual 1).
# The original printed P(predict 0 | actual 1) here, which is the FNR.
print("Sensitivity is",np.mean(pred_given_rep))
# False positive rate = 1 - specificity: P(predict 1 | actual 0).
# The original printed precision (PPV) under this label.
print("False positive rate is",np.mean(pred_given_dem))
# False negative rate = 1 - sensitivity: P(predict 0 | actual 1).
# The original printed NPV under this label.
print("False negative rate is",1-np.mean(pred_given_rep))
In [4]:
# Load the first simulated data set and add squared terms for the
# quadratic-feature models fit below
data1 = pd.read_csv("../data/dataset1.csv")
data1 = data1.assign(x1sq=data1['x1']**2, x2sq=data1['x2']**2)

X = data1[['x1','x2']]                   # linear features only
X2 = data1[['x1','x2','x1sq','x2sq']]    # linear + squared features
y = data1['y']
n = data1.shape[0]                       # number of observations
data1.head()
Out[4]:
In [5]:
# Fit LDA, QDA, and two logistic regressions (linear terms vs. linear +
# squared terms) and compare their training-set misclassification rates.
lda = da.LinearDiscriminantAnalysis()
qda = da.QuadraticDiscriminantAnalysis()
lda.fit(X,y)
qda.fit(X,y)

# Fix: `import sklearn as sk` does not load the linear_model submodule, so
# sk.linear_model raised AttributeError on a fresh kernel; the submodule is
# now imported explicitly in the imports cell. A very large C makes the
# regularization penalty negligible (a near-unregularized logistic fit).
logit1 = sklearn.linear_model.LogisticRegression(C = 1000000)
logit2 = sklearn.linear_model.LogisticRegression(C = 1000000)
logit1.fit(X,y)    # linear terms only
logit2.fit(X2,y)   # linear + squared terms

# score() returns accuracy, so 1 - score is the misclassification rate
print("Overall misclassification rate of Logit1 is",(1-logit1.score(X,y)))
print("Overall misclassification rate of Logit2 is",(1-logit2.score(X2,y)))
print("Overall misclassification rate of LDA is",(1-lda.score(X,y)))
print("Overall misclassification rate of QDA is",(1-qda.score(X,y)))
In [6]:
# k-nearest-neighbor classifiers with k = 3 and k = 25
knn3 = knn.KNeighborsClassifier(n_neighbors=3)
knn25 = knn.KNeighborsClassifier(n_neighbors=25)
knn3.fit(X, y)
knn25.fit(X, y)
# NOTE (from original author): the KNNs are not correct :( — presumably
# because these are training-set scores, which flatter kNN (small k can
# nearly memorize the training data); verify with held-out data
print("Overall misclassification rate of kNN3 is",(1-knn3.score(X,y)))
print("Overall misclassification rate of kNN25 is",(1-knn25.score(X,y)))