Key Word(s): Interpretation, Lime, SHAP
CS109A Introduction to Data Science
Lecture 35 (Interpreting Machine Learning Models)¶
Harvard University
Fall 2020
Instructors: Pavlos Protopapas, Kevin Rader, and Chris Tanner
Title¶
Exercise
Description¶
For this exercise you will be working on a notebook in Ed to interpret various machine learning (ML) models: mostly a random forest and neural net model. The steps of this notebook are:
- Read and wrangle the data (the
Heart.csv
data we have used before - Fit 5 different ML models
- Calculate the default variable importance measures for the tree-based models
- Use
eli5
to calculate permutation importance measures. - Interpret the models based on the plots of predictions (how does Age relate to AHD): both at the means of the other variables and for every observation in the training data set.
- Use
lime
to better understand how variables are related to the response for specific predictors.
Hints:¶
eli5.permutation_importance : To get permutation importance metrics
lime.lime_tabular : To get LIME explanations for a tabular data set like ours
Note: This exercise is auto-graded and you can try multiple attempts.
import pandas as pd
import sys
import numpy as np
import scipy as sp
import sklearn as sk
import matplotlib.pyplot as plt
#from sklearn.linear_model import LogisticRegression
#from sklearn.decomposition import PCA
from sklearn import tree
from sklearn import ensemble
# Here are the decision trees
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import tensorflow as tf
print(tf.__version__) # You should see a 2.0.0 here!
# sns.set(style="ticks")
# %matplotlib inline
Part 1: Data Wrangling¶
heart_df = pd.read_csv('data/Heart.csv')
print(heart_df.shape)
heart_df.head()
heart_df.describe()
X = heart_df[['Age','Sex','ChestPain','RestBP','Chol','Fbs','RestECG','MaxHR','ExAng','Oldpeak','Slope','Ca','Thal']]
y = 1*(heart_df['AHD']=='Yes')
#X['ChestPain']=X['ChestPain'].astype('category')
#X['ChestPain']=X['ChestPain'].cat.codes
#X['Thal']=X['Thal'].astype('category')
#X['Thal']=X['Thal'].cat.codes
X = X.assign(ChestPain=X['ChestPain'].astype('category').cat.codes)
X = X.assign(Thal=X['Thal'].astype('category').cat.codes)
X.describe()
X['Ca']=X['Ca'].fillna(0)
from sklearn.model_selection import train_test_split
itrain, itest = train_test_split(range(X.shape[0]), train_size=0.80)
X_train = X.iloc[itrain, :]
X_test = X.iloc[itest, :]
y_train = y.iloc[itrain]
y_test = y.iloc[itest]
Q1.1: How were the categorical variables handled? How were missing values treated? Were these wise choices?
*your answer here
Part 2: Fitting Five ML Models¶
# fit a possibly underfit (depth = 3) decision tree classifier
dt3 = tree.DecisionTreeClassifier(max_depth = 3)
dt3.fit(X_train,y_train)
# fit an overfit (depth = 10) decision tree classifier
dt10 = tree.DecisionTreeClassifier(max_depth = 10)
dt10.fit(X_train,y_train)
# Evaluate using AUC
print("AUC on train for dt3:",sk.metrics.roc_auc_score(y_train,dt3.predict_proba(X_train)[:,1]))
print("AUC on test for dt3:",sk.metrics.roc_auc_score(y_test,dt3.predict_proba(X_test)[:,1]))
print("AUC on train for dt10:",sk.metrics.roc_auc_score(y_train,dt10.predict_proba(X_train)[:,1]))
print("AUC on test for dt10:",sk.metrics.roc_auc_score(y_test,dt10.predict_proba(X_test)[:,1]))
# fit random forest and adaboost models
np.random.seed(109)
randomforest = RandomForestClassifier(n_estimators=100, max_features='sqrt', max_depth=10)
randomforest.fit(X_train,y_train);
adaboost = AdaBoostClassifier(
base_estimator=DecisionTreeClassifier(max_depth=4),
n_estimators=1000,
learning_rate=.8)
adaboost.fit(X_train,y_train);
# evaluate using AUC
print("AUC on train for randomforest:",sk.metrics.roc_auc_score(---,---)
print("AUC on test for randomforest:",sk.metrics.roc_auc_score(---,---)
print("AUC on train for adaboost:",sk.metrics.roc_auc_score(---,---)
print("AUC on test for adaboost:",sk.metrics.roc_auc_score(---,---)
# build a NN model
tf.random.set_seed(109)
NN_model = []
nodes_layers =[15,15]
#reset the model
NN_model = tf.keras.models.Sequential()
# input layers
NN_model.add(tf.keras.layers.Dense(nodes_layers[0], activation='tanh', input_shape=(X_train.shape[1],)))
# hidden layers
for s in nodes_layers[1:]:
print(s)
NN_model.add(tf.keras.layers.Dense(units = s, activation = 'tanh'))
# output layer
NN_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
# Summary
NN_model.summary()
# compile it and run it
# your code here
X_test_std = ((X_test-X_train.mean(axis=0))/(X_train.std(axis=0)+0.2))
X_train_std = ((X_train-X_train.mean(axis=0))/(X_train.std(axis=0)+0.2))
batch_size = 32
epochs = 100
#opt = tf.keras.optimizers.SGD(lr=0.002,clipvalue=0.7)
NN_model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['acc'])
# fit it
history_basic = NN_model.fit(X_train_std, y_train,
batch_size=---,
epochs=---,
validation_split=.3,
verbose=False)
print("AUC on train for NN_model:",sk.metrics.roc_auc_score(---,---)
print("AUC on test for NN_model:",sk.metrics.roc_auc_score(---,---)
Q2.1: Which model performs best? Which models are overfit? How do you know?
*your answer here
Part 3: Variable Importance¶
#Default Variable Importance
plt.figure(figsize=(24,6))
#plt.set_xticks()
#plt.set_xticklabels(X.columns)
num=10
plt.subplot(1, 4, 1)
dt3_importances = dt3.feature_importances_
order = np.flip(np.argsort(dt3_importances))[0:num]
plt.barh(range(num),dt3_importances[order],tick_label=X.columns[order]);
plt.title("Relative Variable Importance for dt4")
plt.subplot(1, 4, 2)
dt10_importances = dt10.feature_importances_
order = np.flip(np.argsort(dt10_importances))[0:num]
plt.barh(range(num),dt10_importances[order],tick_label=X.columns[order]);
plt.title("Relative Variable Importance for dt10")
plt.subplot(1, 4, 3)
rf_importances = ---
order = ---
plt.barh(---,---);
plt.title("Relative Variable Importance for rf")
plt.subplot(1, 4, 4)
adaboost_importances = adaboost.feature_importances_
adaboost_importances = pd.Series(adaboost_importances).fillna(0)
order = np.flip(np.argsort(adaboost_importances))[0:num]
plt.barh(range(num),adaboost_importances[order],tick_label=X.columns[order]);
plt.title("Relative Variable Importance for adaboost");
Q3.1: How do these variable importance measures compare for these 4 models? Which predictor is most important in general? How is it related to AHD
?
*your answer here
Part 4: Using Eli-5¶
import eli5
#permutation importance for the random forest
from eli5.sklearn import PermutationImportance
seed = 42
perm = PermutationImportance(randomforest,random_state=seed,n_iter=10).fit(X_test, y_test)
eli5.show_weights(perm,feature_names=X.columns.tolist())
#eli5.explain_weights(perm, feature_names = X_train.columns.tolist())
#permutation importance for the NN model: need to define a scoring metric since there is no tf.score() by default
def aucscore(model, X, y):
y_pred = model.predict(X)
return sk.metrics.roc_auc_score(y, y_pred)
aucscore(NN_model,X_train_std, y_train)
perm = PermutationImportance(NN_model, random_state=seed,n_iter=10, scoring=---).fit(---,---)
eli5.show_weights(---,---)
Q4.1: How do the permutation importance measures compare to the default variable importance in the random forest? How does the NN model compare to the random forest?
*your answer here
Part 5: Plotting Predictions¶
yhat_rf_train = randomforest.predict_proba(X_train)[:,1]
plt.scatter(X_train[['Age']],yhat_rf_train);
yhat_rf_test = randomforest.predict_proba(X_test)[:,1]
plt.scatter(X_test[['Age']],yhat_rf_test,marker='x');
plt.title("Predicted Probabilities vs. Age from the RF in train and test");
yhat_nn_train = NN_model.predict(---)
plt.scatter(---,---);
yhat_nn_test = NN_model.predict(---)
plt.scatter(---,---);
plt.title("Predicted Probabilities vs. Age from NN in train and test");
Q5.1 How do the random forest and NN model compare in the interpretation of Age with AHD? Which is more reliable?
*your answer here
# Create the data frame of means to do the prediction
means1 = X_train.mean(axis = 0)
means_df = (means1.to_frame()).transpose()
# Do the prediction at all observed ages
Ages = np.arange(np.min(X['Age']),np.max(X['Age']))
means_df = pd.concat([means_df]*Ages.size,ignore_index=True)
means_df['Age'] = Ages
#plots at means
yhat_nn = NN_model.predict(means_df)
plt.scatter(X_train['Age'],y_train)
plt.plot(means_df['Age'],yhat_nn,color="red")
plt.title("Predicted Probabilities vs. Age from NN in train");
#Plots for all observations. And then averaged
yhat_nns = []
for i in range(0,X_train.shape[0]):
obs = X_train.iloc[i,:].to_frame().transpose()
obs_df = pd.concat([obs]*Ages.size,ignore_index=True)
obs_df['Age'] = Ages
yhat_nn = NN_model.predict_proba(obs_df)
yhat_nns.append(yhat_nn.transpose())
plt.plot(obs_df['Age'],yhat_nn,color='blue',alpha=0.05)
plt.plot(obs_df['Age'],np.mean(yhat_nns,axis=0)[0],color='red',linewidth=2);
plt.ylim(0,1)
plt.title("Predicted Probabilities vs. Age from NN in train for all observations");
Q5.1 Interpret the two plots above. What is the difference in the interpretations? Is there anyu evidence of interaction effects between Age and the other predictors? How do you know?
*your answer here
Part 6: Using LIME¶
# pip install lime
import lime
from lime.lime_tabular import LimeTabularExplainer
#explainer = LimeTabularExplainer(X_train)#class_names = [0,1])
explainer = LimeTabularExplainer(X_train.values,
feature_names=X_train.columns,
class_names = [0,1],
mode='classification')
idx = 42
exp = explainer.explain_instance(X_train.values[idx],
randomforest.predict_proba,
num_features = 13)#X_train.values[idx].size)
print('Observation #: %d' % idx)
print('Probability(AHD) =', randomforest.predict_proba(X_train)[idx][1])
print('True class: %s' % y_train[idx])
### Plot the results
# exp.as_list()
exp.as_pyplot_figure();
# change the observation number and see what changes.
idx = ---
exp = explainer.explain_instance(X_train.values[idx],
randomforest.predict_proba,
num_features = 13)
print('Observation #: %d' % idx)
print('Probability(AHD) =', randomforest.predict_proba(X_train)[idx][1])
print('True class: %s' % y_train[idx])
### Plot the results
# exp.as_list()
exp.as_pyplot_figure();
Q6.1 Interpret the LIME results above. Do they agree with the other interpretations for the random forest model seen so far?
your answer here