CS-109A Introduction to Data Science
Lecture 21: Stacking Examples
Harvard University
Fall 2018
Instructors: Pavlos Protopapas and Kevin Rader
In [1]:
## RUN THIS CELL TO PROPERLY HIGHLIGHT THE EXERCISES
import requests
from IPython.core.display import HTML
styles = requests.get("https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css").text
HTML(styles)
Out[1]:
In [103]:
import pandas as pd
import sys
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression  # used by the stacking cells below
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn import tree
from sklearn import ensemble
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from io import StringIO
#import pydot
from IPython.display import display
from IPython.display import Image
sns.set(style="ticks")
sns.set_context('poster')
%matplotlib inline
pd.set_option('display.width', 1500)
pd.set_option('display.max_columns', 100)
In [2]:
#-------- fit_and_plot_dt
# Fit a decision tree of the given depth on the given data, and plot the data/model
# Input:
#      x (predictors)
#      y (labels)
#      depth (depth of tree)
#      title (title for plot)
#      ax (a set of axes to plot on)
# Returns:
#      ax (axes with data and decision boundaries)

def fit_and_plot_dt(x, y, depth, title, ax, plot_data=True, fill=True, color='Greens'):
    # FIT DECISION TREE MODEL
    dt = tree.DecisionTreeClassifier(max_depth=depth)
    dt.fit(x, y)

    # PLOT DECISION TREE BOUNDARY
    ax = plot_tree_boundary(x, y, dt, title, ax, plot_data, fill, color)

    return ax
In [3]:
#-------- plot_tree_boundary
# A function that visualizes the data and the decision boundaries
# Input:
#      x (predictors)
#      y (labels)
#      model (the classifier you want to visualize)
#      title (title for plot)
#      ax (a set of axes to plot on)
# Returns:
#      ax (axes with data and decision boundaries)

def plot_tree_boundary(x, y, model, title, ax, plot_data=True, fill=True, color='Greens', alpha=0.1):
    if plot_data:
        # PLOT DATA
        ax.scatter(x[y==1,0], x[y==1,1], c='green')
        ax.scatter(x[y==0,0], x[y==0,1], c='white')

    # CREATE MESH over the range of the predictors x
    # (the mesh must span x, the predictors, not y, the labels)
    interval = np.arange(x.min(), x.max(), 0.01)
    n = np.size(interval)
    x1, x2 = np.meshgrid(interval, interval)
    x1 = x1.reshape(-1, 1)
    x2 = x2.reshape(-1, 1)
    xx = np.concatenate((x1, x2), axis=1)

    # PREDICT ON MESH POINTS
    yy = model.predict(xx)
    yy = yy.reshape((n, n))

    # PLOT DECISION SURFACE
    x1 = x1.reshape(n, n)
    x2 = x2.reshape(n, n)
    if fill:
        ax.contourf(x1, x2, yy, alpha=alpha, cmap=color)
    else:
        ax.contour(x1, x2, yy, alpha=alpha, cmap=color)

    # LABEL AXES, TITLE (longitude on x, latitude on y, matching the data plots)
    ax.set_title(title)
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')

    return ax
In [4]:
# Generate a synthetic "satellite image": 1000 points from a 2D Gaussian,
# with points inside a circle of radius 3 labeled 1 (vegetation), else 0
data = np.random.multivariate_normal([0, 0], np.eye(2) * 5, size=1000)
data = np.hstack((data, np.zeros((1000, 1))))
data[data[:, 0]**2 + data[:, 1]**2 < 3**2, 2] = 1
In [5]:
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
x = data[:, :-1]
y = data[:, -1]
ax.scatter(x[y == 1, 0], x[y == 1, 1], c='green', label='vegetation', alpha=0.5)
ax.scatter(x[y == 0, 0], x[y == 0, 1], c='gray', label='non vegetation', alpha=0.1)
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')
ax.set_title('satellite image')
ax.legend()
plt.tight_layout()
plt.show()
In [6]:
# Variance comparison: single tree, bagging, RF, AdaBoost
# (refit each model on 8 fresh samples and overlay the resulting boundaries)
depth = None

fig, ax = plt.subplots(1, 4, figsize=(20, 5))

for i in range(8):
    new_data = np.random.multivariate_normal([0, 0], np.eye(2) * 5, size=100)
    new_data = np.hstack((new_data, np.zeros((100, 1))))
    new_data[new_data[:, 0]**2 + new_data[:, 1]**2 < 3**2, 2] = 1

    x = new_data[:, :-1]
    y = new_data[:, -1]

    ax[0] = fit_and_plot_dt(x, y, depth, 'Variance of Single Decision Tree', ax[0], plot_data=False, fill=False)

    bag = ensemble.BaggingClassifier(n_estimators=30)
    bag.fit(x, y)
    ax[1] = plot_tree_boundary(x, y, bag, 'Variance of Bagging', ax[1], plot_data=False, fill=False, color='Reds')

    rf = ensemble.RandomForestClassifier(n_estimators=30)
    rf.fit(x, y)
    ax[2] = plot_tree_boundary(x, y, rf, 'Variance of RF', ax[2], plot_data=False, fill=False, color='Blues')

    adaboost = ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=2), n_estimators=30)
    adaboost.fit(x, y)
    ax[3] = plot_tree_boundary(x, y, adaboost, 'Variance of AdaBoost', ax[3], plot_data=False, fill=False, color='Greys')

for a in ax:
    a.set_xlim(-3.2, 3.2)
    a.set_ylim(-3.2, 3.2)

plt.show()
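The overlaid boundaries above give a qualitative picture of model variance. As a rough numeric check (a sketch, not part of the lecture; the probe points and number of repeats are arbitrary choices), we can refit a single tree and a bagged ensemble on repeated fresh samples and compare the variance of their predictions at fixed probe points:
In [ ]:
# Sketch: quantify the variance reduction illustrated above
probe = np.random.multivariate_normal([0, 0], np.eye(2) * 5, size=200)
n_repeats = 20
tree_preds, bag_preds = [], []
for _ in range(n_repeats):
    d = np.random.multivariate_normal([0, 0], np.eye(2) * 5, size=100)
    labels = (d[:, 0]**2 + d[:, 1]**2 < 3**2).astype(int)
    tree_preds.append(tree.DecisionTreeClassifier().fit(d, labels).predict(probe))
    bag_preds.append(ensemble.BaggingClassifier(n_estimators=30).fit(d, labels).predict(probe))
# Mean per-probe-point variance across refits; bagging should come out lower
print(np.mean(np.var(tree_preds, axis=0)), np.mean(np.var(bag_preds, axis=0)))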
In [42]:
# Error comparison: noisy labels (80% vegetation inside the circle)
data = np.random.multivariate_normal([0, 0], np.eye(2) * 5, size=200)
data = np.hstack((data, np.zeros((200, 1))))
in_circle = data[:, 0]**2 + data[:, 1]**2 < 3**2
data[in_circle, 2] = np.random.choice([0, 1], in_circle.sum(), p=[0.2, 0.8])
x = data[:, :-1]
y = data[:, -1]

test_data = np.random.multivariate_normal([0, 0], np.eye(2) * 5, size=1000)
test_data = np.hstack((test_data, np.zeros((1000, 1))))
in_circle = test_data[:, 0]**2 + test_data[:, 1]**2 < 3**2
test_data[in_circle, 2] = np.random.choice([0, 1], in_circle.sum(), p=[0.2, 0.8])
x_test = test_data[:, :-1]
y_test = test_data[:, -1]
dt = tree.DecisionTreeClassifier()
dt.fit(x, y)
tree_score = np.array([dt.score(x_test, y_test)] * len(range(20, 320, 10)))
bag_score = []
bag_oob = []
rf_score = []
rf_oob = []
boost_score = []
for i in range(20, 320, 10):
    bag = ensemble.BaggingClassifier(n_estimators=i, oob_score=True)
    bag.fit(x, y)
    bag_score.append(bag.score(x_test, y_test))
    bag_oob.append(bag.oob_score_)

    rf = ensemble.RandomForestClassifier(n_estimators=i, oob_score=True)
    rf.fit(x, y)
    rf_score.append(rf.score(x_test, y_test))
    rf_oob.append(rf.oob_score_)

    adaboost = ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=2), n_estimators=i, learning_rate=0.5)
    adaboost.fit(x, y)
    boost_score.append(adaboost.score(x_test, y_test))
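Refitting each ensemble from scratch at every size makes this sweep quadratic in the total number of trees. As an aside (a sketch using scikit-learn's documented warm_start option, not something the lecture does), a forest can keep its existing trees when n_estimators is raised:
In [ ]:
# Sketch: sweep ensemble size without refitting from scratch
rf_ws = ensemble.RandomForestClassifier(n_estimators=20, warm_start=True)
rf_ws_scores = []
for n in range(20, 320, 10):
    rf_ws.set_params(n_estimators=n)
    rf_ws.fit(x, y)  # with warm_start=True, only the newly added trees are fit
    rf_ws_scores.append(rf_ws.score(x_test, y_test))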
In [8]:
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.plot(range(20, 320, 10), tree_score, color='black', linestyle='--', label='Single Tree')
ax.plot(range(20, 320, 10), bag_score, color='red', alpha=0.8, label='Bagging')
ax.plot(range(20, 320, 10), rf_score, color='green', alpha=0.8, label='Random Forest')
ax.plot(range(20, 320, 10), bag_oob, color='red', alpha=0.2, label='Bagging OOB')
ax.plot(range(20, 320, 10), rf_oob, color='green', alpha=0.2, label='RF OOB')
ax.plot(range(20, 320, 10), boost_score, color='grey', alpha=0.8, label='AdaBoost')
ax.set_title('Comparison of Test Accuracies')
ax.set_xlabel('Number of Trees in Ensemble')
ax.set_ylabel('Accuracy')
ax.legend(loc='best')
plt.show()
In [9]:
# Choosing the learning rate (note: these are training-set scores, and this
# range must match the x-values used in the plot below)
boost_score_small = []
boost_score = []
boost_score_large = []

for i in range(20, 220, 10):
    adaboost = ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=2), n_estimators=i, learning_rate=1e-1)
    adaboost.fit(x, y)
    boost_score_small.append(adaboost.score(x, y))

    adaboost = ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=2), n_estimators=i, learning_rate=0.5)
    adaboost.fit(x, y)
    boost_score.append(adaboost.score(x, y))

    adaboost = ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=2), n_estimators=i, learning_rate=5.0)
    adaboost.fit(x, y)
    boost_score_large.append(adaboost.score(x, y))
In [10]:
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.plot(range(20, 220, 10), boost_score_small, color='blue', label='lambda=1e-1')
ax.plot(range(20, 220, 10), boost_score, color='red', alpha=0.8, label='lambda=0.5')
ax.plot(range(20, 220, 10), boost_score_large, color='green', alpha=0.8, label='lambda=5.0')
ax.set_title('Comparison of Learning Rates')
ax.set_xlabel('Number of Trees in Ensemble')
ax.set_ylabel('Training Accuracy')
ax.legend(loc='best')
plt.show()
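Refitting AdaBoost once per ensemble size is also avoidable: AdaBoostClassifier.staged_score evaluates the ensemble after every boosting round from a single fit. A minimal sketch with the same base learner and learning_rate=0.5 (the name ada_staged is ours, chosen so the adaboost model used by the stacking cells below is left untouched):
In [ ]:
# Sketch: one fit, scored after each boosting round
ada_staged = ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=2),
                                         n_estimators=210, learning_rate=0.5)
ada_staged.fit(x, y)
staged = list(ada_staged.staged_score(x, y))
plt.plot(range(1, len(staged) + 1), staged)
plt.xlabel('Number of Trees in Ensemble')
plt.ylabel('Training Accuracy')
plt.show()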
In [56]:
# Stacking, step 1: use each base model's predicted P(vegetation) on the
# held-out set as a new feature
t1 = bag.predict_proba(x_test)[:, 1]
t2 = adaboost.predict_proba(x_test)[:, 1]
t3 = rf.predict_proba(x_test)[:, 1]

logistic = LogisticRegression(C=100).fit(x, y)
t4 = logistic.predict_proba(x_test)[:, 1]

T_test = np.column_stack((t1, t2, t3, t4))
In [57]:
print(t1.shape, t2.shape, t3.shape, T_test.shape)
In [71]:
# Stacking, step 2: fit the meta-model (the "stacker") on the base models'
# held-out predictions
logit = LogisticRegression(C=1000).fit(T_test, y_test)
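Note that the stacker above is fit on the same held-out set the base models were just scored on, which risks leakage. A more standard recipe builds the meta-features from out-of-fold predictions on the training set; a minimal sketch with cross_val_predict (not from the lecture; the fold count and hyperparameters are arbitrary choices):
In [ ]:
from sklearn.model_selection import cross_val_predict

# Sketch: out-of-fold meta-features, so the stacker never sees a base-model
# prediction made on a point that base model was trained on
base_models = [ensemble.BaggingClassifier(n_estimators=100),
               ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=2),
                                           n_estimators=100, learning_rate=0.5),
               ensemble.RandomForestClassifier(n_estimators=100),
               LogisticRegression(C=100)]
T_oof = np.column_stack([cross_val_predict(m, x, y, cv=5, method='predict_proba')[:, 1]
                         for m in base_models])
stacker_oof = LogisticRegression(C=1000).fit(T_oof, y)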
In [76]:
# A fresh test set to evaluate both the base models and the stacked model
newtest_data = np.random.multivariate_normal([0, 0], np.eye(2) * 5, size=5000)
newtest_data = np.hstack((newtest_data, np.zeros((5000, 1))))
in_circle = newtest_data[:, 0]**2 + newtest_data[:, 1]**2 < 3**2
newtest_data[in_circle, 2] = np.random.choice([0, 1], in_circle.sum(), p=[0.2, 0.8])
x_newtest = newtest_data[:, :-1]
y_newtest = newtest_data[:, -1]
In [77]:
# Accuracy of each base model on the fresh test set
print(bag.score(x_newtest, y_newtest),
      adaboost.score(x_newtest, y_newtest),
      rf.score(x_newtest, y_newtest),
      logistic.score(x_newtest, y_newtest))
In [78]:
# Base-model probabilities on the fresh test set, stacked as the meta-model's inputs
t1_new = bag.predict_proba(x_newtest)[:, 1]
t2_new = adaboost.predict_proba(x_newtest)[:, 1]
t3_new = rf.predict_proba(x_newtest)[:, 1]
t4_new = logistic.predict_proba(x_newtest)[:, 1]
T_new = np.column_stack((t1_new, t2_new, t3_new, t4_new))
In [79]:
# Accuracy of the stacked model on the fresh test set
logit.score(T_new, y_newtest)
Out[79]:
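For context, a simpler combiner is to average the four base probabilities and threshold at 0.5; comparing its accuracy with the stacker's score above shows what the learned weighting buys (a sketch, not from the lecture):
In [ ]:
# Sketch: unweighted probability averaging as a baseline for the stacker
avg_prob = T_new.mean(axis=1)
print('probability-averaging accuracy:', np.mean((avg_prob > 0.5) == y_newtest))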
In [95]:
def overlay_decision_boundary(ax, model, colors=None, nx=200, ny=200, desaturate=.5, xlim=None, ylim=None):
    """
    A function that visualizes the decision boundaries of a classifier.

    ax: Matplotlib Axes to plot on
    model: Classifier to use.
     - if `model` has a `.predict` method, like an sklearn classifier, we call `model.predict(X)`
     - otherwise, we simply call `model(X)`
    colors: list or dict of colors to use. Use color `colors[i]` for class i.
     - If colors is not provided, uses the current color cycle
    nx, ny: number of mesh points to evaluate the classifier on
    desaturate: how much to desaturate each of the colors (for better contrast with the sample points)
    xlim, ylim: range to plot on. (If the default, None, is passed, the limits will be taken from `ax`.)
    """
    # Create mesh.
    xmin, xmax = ax.get_xlim() if xlim is None else xlim
    ymin, ymax = ax.get_ylim() if ylim is None else ylim
    xx, yy = np.meshgrid(
        np.linspace(xmin, xmax, nx),
        np.linspace(ymin, ymax, ny))
    X = np.c_[xx.flatten(), yy.flatten()]

    # Predict on mesh of points.
    model = getattr(model, 'predict', model)
    y = model(X)
    y = y.astype(int)  # This may be necessary for 32-bit Python.
    y = y.reshape(xx.shape)  # note: meshgrid output has shape (ny, nx)

    # Generate colormap.
    if colors is None:
        # If colors not provided, use the current color cycle.
        # Shift the indices so that the lowest class actually predicted gets the first color.
        # ^ This is a bit magic, consider removing for next year.
        colors = (['white'] * np.min(y)) + sns.utils.get_color_cycle()

    if isinstance(colors, dict):
        missing_colors = [idx for idx in np.unique(y) if idx not in colors]
        assert len(missing_colors) == 0, f"Color not specified for predictions {missing_colors}."

        # Make a list of colors, filling in items from the dict.
        color_list = ['white'] * (np.max(y) + 1)
        for idx, val in colors.items():
            color_list[idx] = val
    else:
        assert len(colors) >= np.max(y) + 1, "Insufficient colors passed for all predictions."
        color_list = colors
    color_list = [sns.utils.desaturate(color, desaturate) for color in color_list]
    cmap = matplotlib.colors.ListedColormap(color_list)

    # Plot decision surface
    ax.pcolormesh(xx, yy, y, zorder=-2, cmap=cmap, norm=matplotlib.colors.NoNorm(), vmin=0, vmax=y.max() + 1)
    if len(np.unique(y)) > 1:
        ax.contour(xx, yy, y, colors="black", linewidths=1, zorder=-1)
    else:
        print("Warning: only one class predicted, so not plotting contour lines.")


def plot_decision_boundary(x, y, model, title, ax):
    # Assumes `plot_data` and `class_colors` are defined elsewhere in the
    # course materials; only `overlay_decision_boundary` is used below.
    plot_data(ax, x, y)
    overlay_decision_boundary(ax, model, colors=class_colors)
    ax.set_title(title)
In [111]:
# Overlay the bagging model's decision boundary on the noisy data from the
# error-comparison section
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
x = data[:, :-1]
y = data[:, -1]
ax.scatter(x[y == 1, 0], x[y == 1, 1], c='green', label='vegetation', alpha=0.5)
ax.scatter(x[y == 0, 0], x[y == 0, 1], c='gray', label='non vegetation', alpha=0.1)
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')
ax.set_title('satellite image')
ax.legend()
plt.tight_layout()
c = ['white','green']
overlay_decision_boundary(ax, bag, colors=c, desaturate=.3)
plt.show()
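Since overlay_decision_boundary also accepts a plain callable, we can visualize the stacked model itself by composing the base models with the meta-learner. The helper stacked_predict below is ours, not part of the lecture code:
In [ ]:
# Sketch: decision boundary of the stacked model
# (base models -> meta-features -> logistic meta-model)
def stacked_predict(X):
    T = np.column_stack([m.predict_proba(X)[:, 1]
                         for m in (bag, adaboost, rf, logistic)])
    return logit.predict(T)

fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.scatter(x[y == 1, 0], x[y == 1, 1], c='green', label='vegetation', alpha=0.5)
ax.scatter(x[y == 0, 0], x[y == 0, 1], c='gray', label='non vegetation', alpha=0.1)
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')
ax.set_title('stacked model')
ax.legend()
overlay_decision_boundary(ax, stacked_predict, colors=['white', 'green'], desaturate=.3)
plt.show()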