In [7]:
import sys
import numpy as np
import pylab as pl
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib
import sklearn as sk
import sklearn.discriminant_analysis as da
import sklearn.neighbors as knn
In [2]:
#read in the GSS data
gssdata = pd.read_csv("../data/gsspartyid.csv")
gssdata.head()
Out[2]:
politicalparty income educ abortion republican
0 Republican 2152 9 0 1
1 Republican 906 6 0 1
2 Democrat 1373 6 0 0
3 Democrat 1941 4 0 0
4 Democrat 355 7 0 0
In [3]:
LDA = da.LinearDiscriminantAnalysis()
X = gssdata[["income","educ","abortion"]]
model_LDA = LDA.fit(X,gssdata['republican'])
print("Specificity is",np.mean(model_LDA.predict(X[gssdata['republican']==0])))
print("Sensitivity is",1-np.mean(model_LDA.predict(X[gssdata['republican']==1])))
print("False positive rate is",np.mean(gssdata['republican'][model_LDA.predict(X)==1]))
print("False negative rate is",1-np.mean(gssdata['republican'][model_LDA.predict(X)==0]))
Specificity is 0.38838709677419353
Sensitivity is 0.37406015037593987
False positive rate is 0.5252365930599369
False negative rate is 0.7043090638930163
In [4]:
#read in the first simulation data set
data1 = pd.read_csv("../data/dataset1.csv")
X = data1[['x1','x2']]
data1['x1sq'] = data1['x1']**2
data1['x2sq'] = data1['x2']**2
X2 = data1[['x1','x2','x1sq','x2sq']]
y = data1['y']
n = data1.shape[0]
data1.head()
Out[4]:
y x1 x2 x1sq x2sq
0 1 2.585529 2.501129 6.684959 6.255645
1 1 1.890697 1.600182 3.574734 2.560583
2 1 2.605887 0.084014 6.790649 0.007058
3 1 2.630099 1.547508 6.917418 2.394780
4 1 1.715840 1.208271 2.944108 1.459918
In [5]:
lda = da.LinearDiscriminantAnalysis()
qda = da.QuadraticDiscriminantAnalysis()
lda.fit(X,y)
qda.fit(X,y)
logit2 = sk.linear_model.LogisticRegression(C = 1000000)
logit1 = sk.linear_model.LogisticRegression(C = 1000000)
logit1.fit(X,y)
logit2.fit(X2,y)

print("Overall misclassification rate of Logit1 is",(1-logit1.score(X,y)))
print("Overall misclassification rate of Logit2 is",(1-logit2.score(X2,y)))
print("Overall misclassification rate of LDA is",(1-lda.score(X,y)))
print("Overall misclassification rate of QDA is",(1-qda.score(X,y)))
Overall misclassification rate of Logit1 is 0.04410000000000003
Overall misclassification rate of Logit2 is 0.04410000000000003
Overall misclassification rate of LDA is 0.04425000000000001
Overall misclassification rate of QDA is 0.04410000000000003
In [6]:
knn3=knn.KNeighborsClassifier(3)
knn25=knn.KNeighborsClassifier(25)
knn3.fit(X,y)
knn25.fit(X,y)

#note, the KNNs are not correct :(
print("Overall misclassification rate of kNN3 is",(1-knn3.score(X,y)))
print("Overall misclassification rate of kNN25 is",(1-knn25.score(X,y)))
Overall misclassification rate of kNN3 is 0.03310000000000002
Overall misclassification rate of kNN25 is 0.04235