Key Word(s): KNN imputation classification
CS109A Introduction to Data Science
Lecture 12 ($k$-NN Classification and Missingness)
Harvard University
Fall 2019
Instructors: Pavlos Protopapas, Kevin Rader, and Chris Tanner
In [ ]:
%matplotlib inline
import sys
import numpy as np
import pylab as pl
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib
import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics as met
In [ ]:
# Load the heart-disease data set from the working directory. Later cells
# read its MaxHR, RestBP, Sex and AHD columns.
df_heart = pd.read_csv('Heart.csv')
# Preview the first rows (shown as the cell output when run in a notebook).
df_heart.head()
In [ ]:
# Single-predictor k-NN: classify AHD from MaxHR alone for k = 1, 5, 10, 50,
# then overlay each model's predicted class on the raw observations.
data_x = df_heart.MaxHR
# Encode the response as 0 for 'No' and 1 for anything else.
data_y = df_heart.AHD.map(lambda label: 0 if label == 'No' else 1)

# sklearn expects a 2-D feature matrix, so the single column is reshaped once.
train_x = data_x.values.reshape(-1, 1)
# fit() returns the fitted estimator, so each model can be built and trained
# in a single expression.
knn1, knn5, knn10, knn50 = (
    KNeighborsClassifier(n_neighbors=k).fit(train_x, data_y)
    for k in (1, 5, 10, 50)
)

fig = plt.figure()
fig.patch.set_alpha(0.0)
# NOTE(review): plt.xkcd() is called outside a `with` block, so its style
# settings are presumably not scoped to this figure only -- confirm intent.
plt.xkcd(scale=0.1, length=0.0)
fig.subplots_adjust(bottom=0.20, left=0.16, right=0.86)

# Evenly spaced grid spanning the observed MaxHR range, used for prediction.
x = np.linspace(np.min(data_x), np.max(data_x))
grid = x.reshape(-1, 1)
yhat1, yhat5, yhat10, yhat50 = (
    model.predict(grid) for model in (knn1, knn5, knn10, knn50)
)

plt.plot(data_x, data_y, 'o', alpha=0.1, label='Data')
for curve_label, yhat in zip(
    ('knn1', 'knn5', 'knn10', 'knn50'),
    (yhat1, yhat5, yhat10, yhat50),
):
    plt.plot(x, yhat, label=curve_label)
plt.legend()
plt.xlabel("MaxHR")
plt.ylabel("Heart disease (AHD)")
plt.show()
In [ ]:
# Same four single-predictor models as the previous cell, but plot the
# estimated probability of the second class (column index 1 of predict_proba,
# i.e. the label encoded as 1) instead of the hard class prediction.
# Relies on x, data_x, data_y and knn1..knn50 from the previous cell.
grid = x.reshape(-1, 1)
phat1, phat5, phat10, phat50 = (
    model.predict_proba(grid)[:, 1] for model in (knn1, knn5, knn10, knn50)
)

plt.plot(data_x, data_y, 'o', alpha=0.1, label='Data')
for curve_label, phat in zip(
    ('knn1', 'knn5', 'knn10', 'knn50'),
    (phat1, phat5, phat10, phat50),
):
    plt.plot(x, phat, label=curve_label)
plt.legend()
plt.xlabel("MaxHR")
plt.ylabel("Heart disease (AHD)")
plt.show()
In [ ]:
# Two predictors: refit k-NN for k = 1, 5, 10, 50 using MaxHR and RestBP
# together, then report each model's accuracy.
data_x = df_heart[['MaxHR','RestBP']]
# Encode the response as 0 for 'No' and 1 for anything else.
data_y = df_heart.AHD.map(lambda label: 0 if label == 'No' else 1)

# fit() returns the fitted estimator, so construction and training are chained.
knn1, knn5, knn10, knn50 = (
    KNeighborsClassifier(n_neighbors=k).fit(data_x, data_y)
    for k in (1, 5, 10, 50)
)

# score() is evaluated on the same rows the models were fitted on, so these
# are in-sample (training) accuracies, printed in order of increasing k.
for model in (knn1, knn5, knn10, knn50):
    print(model.score(data_x, data_y))
In [ ]:
# Don't forget to split into train and test
# (or better yet, use cross-validation)
# to determine what k is actually best!
Dealing with Missingness
In [ ]:
# There are some missing values to begin with:
# compare the full shape with the shape left after dropping every row that
# contains at least one NaN.
print(df_heart.shape)
print(df_heart.dropna().shape)
In [ ]:
# NOTE: this alias makes `random` refer to numpy.random for the rest of the
# notebook; later cells call random.binomial through it.
import numpy.random as random
random.seed(109)
n = df_heart["MaxHR"].size
# Missing completely at random (MCAR): blank out MaxHR in exactly 20 distinct
# rows chosen uniformly, independent of any column's value.
# replace=False is required: by default choice() samples WITH replacement, so
# duplicate indices could leave fewer than 20 rows missing. (Needs n >= 20.)
miss = random.choice(n, 20, replace=False)
# Work on a fresh copy of the data so df_heart itself stays untouched.
heart_mcar = pd.read_csv('Heart.csv')
heart_mcar.loc[miss, "MaxHR"] = np.nan
# Sanity checks: the selected entries are now NaN, and dropna() loses them.
print(heart_mcar.loc[miss, "MaxHR"].head())
print(heart_mcar.dropna().shape)
In [ ]:
# Missing at random (MAR): each row's MaxHR is blanked with probability
# 0.1 + 0.2*Sex, so missingness depends on the observed Sex column rather than
# on MaxHR itself. The number of missing rows is a random draw, not a fixed 20.
# (Assumes Sex is coded 0/1, giving probabilities 0.1 and 0.3 -- TODO confirm.)
# `random` is numpy.random and `n` is the row count, both set in an earlier cell.
miss = random.binomial(1,0.1+0.2*df_heart["Sex"],n)
# Work on a fresh copy of the data so df_heart itself stays untouched.
heart_mar = pd.read_csv('Heart.csv')
heart_mar.loc[miss==1,"MaxHR"]=np.nan
# Sanity checks: the flagged entries are now NaN, and dropna() loses them.
print(heart_mar.loc[miss==1,"MaxHR"].head())
print(heart_mar.dropna().shape)
In [ ]:
# Missing not at random (MNAR): a row's MaxHR is blanked with probability 0.1
# when its own MaxHR is above the column mean, and probability 0 otherwise, so
# missingness depends on the very value that goes missing. The number of
# missing rows is a random draw, not a fixed 20.
# `random` is numpy.random and `n` is the row count, both set in an earlier cell.
miss = random.binomial(1,0.1*(df_heart["MaxHR"]>df_heart["MaxHR"].mean()),n)
# Work on a fresh copy of the data so df_heart itself stays untouched.
heart_mnar = pd.read_csv('Heart.csv')
heart_mnar.loc[miss==1,"MaxHR"]=np.nan
# Sanity checks: the flagged entries are now NaN, and dropna() loses them.
print(heart_mnar.loc[miss==1,"MaxHR"].head())
print(heart_mnar.dropna().shape)
In [ ]:
# sklearn is not happy when you give it missing values.
# This cell is a deliberate demonstration: the predictors come from heart_mcar,
# whose MaxHR column contains NaNs, so the fit below is expected to fail
# (presumably with a ValueError about NaN input -- depends on sklearn version).
knn50 = KNeighborsClassifier(n_neighbors=50)
data_x = heart_mcar[['MaxHR','RestBP']]
# The response is taken from df_heart; heart_mcar was read from the same CSV,
# so the rows line up.
data_y = df_heart.AHD.map(lambda x: 0 if x=='No' else 1)
knn50.fit(data_x, data_y);
In [ ]:
# Mean imputation: replace every missing entry with the mean of its own
# column (computed from the observed values) so that sklearn accepts the data.
# Relies on data_x, data_y and knn50 defined in the previous cell.
column_means = data_x.mean()
data_x = data_x.fillna(column_means)
knn50.fit(data_x, data_y);
In [ ]: