Harvard University
Fall 2018
Instructors: Pavlos Protopapas, Kevin Rader
Section Leaders: Mehul Smriti Raje, Ken Arnold, Karan Motwani, Cecilia Garraffo
The dataset can be downloaded from https://drive.google.com/open?id=0B28c493CP9GtRHFVM0U0SVI2Yms
Our goal, as in lecture, will be to predict taxi fares based on various other features of the ride.
For descriptions of what the columns mean, see http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml and data dictionaries there.
# Instead of converting to datetime after loading, you can tell Pandas the datatypes you want while loading.
nyc_cab_df = pd.read_csv(
'nyc_car_hire_data.csv',
dtype={'Store_and_fwd_flag': str, 'Base': str},
parse_dates=['lpep_pickup_datetime', 'Lpep_dropoff_datetime'])
nyc_cab_df['pickup_hour'] = nyc_cab_df.lpep_pickup_datetime.dt.hour
nyc_cab_df['dropoff_hour'] = nyc_cab_df.Lpep_dropoff_datetime.dt.hour
nyc_cab_df.head()
AWND | Base | Day | Dropoff_latitude | Dropoff_longitude | Ehail_fee | Extra | Fare_amount | Lpep_dropoff_datetime | MTA_tax | PRCP | Passenger_count | Payment_type | Pickup_latitude | Pickup_longitude | RateCodeID | SNOW | SNWD | Store_and_fwd_flag | TMAX | TMIN | Tip_amount | Tolls_amount | Total_amount | Trip_distance | Trip_type | Type | VendorID | lpep_pickup_datetime | Trip Length (min) | pickup_hour | dropoff_hour | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4.7 | B02512 | 1 | NaN | NaN | NaN | NaN | 33.863498 | 2014-04-01 00:24:00 | NaN | 0.0 | NaN | NaN | 40.7690 | -73.9549 | NaN | 0.0 | 0.0 | NaN | 60 | 39 | NaN | NaN | NaN | 4.083561 | NaN | 1 | NaN | 2014-04-01 00:11:00 | 13.0 | 0 | 0 |
1 | 4.7 | B02512 | 1 | NaN | NaN | NaN | NaN | 19.022892 | 2014-04-01 00:29:00 | NaN | 0.0 | NaN | NaN | 40.7267 | -74.0345 | NaN | 0.0 | 0.0 | NaN | 60 | 39 | NaN | NaN | NaN | 3.605694 | NaN | 1 | NaN | 2014-04-01 00:17:00 | 12.0 | 0 | 0 |
2 | 4.7 | B02512 | 1 | NaN | NaN | NaN | NaN | 25.498981 | 2014-04-01 00:34:00 | NaN | 0.0 | NaN | NaN | 40.7316 | -73.9873 | NaN | 0.0 | 0.0 | NaN | 60 | 39 | NaN | NaN | NaN | 4.221763 | NaN | 1 | NaN | 2014-04-01 00:21:00 | 13.0 | 0 | 0 |
3 | 4.7 | B02512 | 1 | NaN | NaN | NaN | NaN | 28.024628 | 2014-04-01 00:39:00 | NaN | 0.0 | NaN | NaN | 40.7588 | -73.9776 | NaN | 0.0 | 0.0 | NaN | 60 | 39 | NaN | NaN | NaN | 2.955510 | NaN | 1 | NaN | 2014-04-01 00:28:00 | 11.0 | 0 | 0 |
4 | 4.7 | B02512 | 1 | NaN | NaN | NaN | NaN | 12.083589 | 2014-04-01 00:40:00 | NaN | 0.0 | NaN | NaN | 40.7594 | -73.9722 | NaN | 0.0 | 0.0 | NaN | 60 | 39 | NaN | NaN | NaN | 1.922292 | NaN | 1 | NaN | 2014-04-01 00:33:00 | 7.0 | 0 | 0 |
Let's have a quick look at the data before we get started. (We really should do more than this, but skipping that for time.)
plt.boxplot(nyc_cab_df.Fare_amount);
# TODO: some useful predictors may be missing, but maybe because they have missing data.
# nyc_cab_df.columns.difference(all_predictors)
all_predictors = ['Trip Length (min)', 'Type', 'Trip_distance', 'TMAX', 'TMIN',
'pickup_hour', 'dropoff_hour', 'Pickup_longitude',
'Pickup_latitude', 'SNOW', 'SNWD', 'PRCP']
sample_frac = .01
shuffled = nyc_cab_df_clean.sample(frac=sample_frac, random_state=0)[all_predictors + ['Fare_amount']]
shuffled.shape
n_total = len(shuffled)
n_test = int(n_total * .2)
n_valid = int(n_total * .2)
n_train = n_total - n_valid - n_test
train = shuffled.iloc[:n_train].copy()
valid = shuffled.iloc[n_train:n_train + n_valid].copy()
test = shuffled.iloc[n_train + n_valid:].copy()
assert len(train) == n_train
assert len(valid) == n_valid
assert len(test) == n_test
print(f"{n_train} train, {n_valid} validation, {n_test} test.")
11183 train, 3727 validation, 3727 test.
train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 11183 entries, 959269 to 686097 Data columns (total 13 columns): Trip Length (min) 11183 non-null float64 Type 11183 non-null int64 Trip_distance 11183 non-null float64 TMAX 11183 non-null int64 TMIN 11183 non-null int64 pickup_hour 11183 non-null int64 dropoff_hour 11183 non-null int64 Pickup_longitude 11183 non-null float64 Pickup_latitude 11183 non-null float64 SNOW 11183 non-null float64 SNWD 11183 non-null float64 PRCP 11183 non-null float64 Fare_amount 11183 non-null float64 dtypes: float64(8), int64(5) memory usage: 1.2 MB
Warm-up exercise: for which of the following do the units of the predictors matter (e.g., trip length in minutes vs seconds; temperature in F or C)?
Remember that the mean and variance used to scale data are parameters that need to be learned from our training data.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(train[all_predictors])
pd.DataFrame({
'mean': scaler.mean_,
'variance': scaler.var_
}, index=all_predictors).T
Trip Length (min) | Type | Trip_distance | TMAX | TMIN | pickup_hour | dropoff_hour | Pickup_longitude | Pickup_latitude | SNOW | SNWD | PRCP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
mean | 11.330680 | 0.300009 | 2.875462 | 60.886167 | 43.269874 | 13.980148 | 13.978897 | -73.879007 | 40.718952 | 0.0 | 0.0 | 0.336174 |
variance | 61.431918 | 0.210004 | 5.458100 | 70.157211 | 36.335824 | 42.114960 | 43.055980 | 4.887413 | 1.486695 | 0.0 | 0.0 | 1.105074 |
# Scaling in place... not my favorite approach
for df in [train, valid, test]:
df[all_predictors] = scaler.transform(df[all_predictors])
train[all_predictors].describe()
Trip Length (min) | Type | Trip_distance | TMAX | TMIN | pickup_hour | dropoff_hour | Pickup_longitude | Pickup_latitude | SNOW | SNWD | PRCP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1.118300e+04 | 1.118300e+04 | 1.118300e+04 | 1.118300e+04 | 1.118300e+04 | 1.118300e+04 | 1.118300e+04 | 1.118300e+04 | 1.118300e+04 | 11183.0 | 11183.0 | 1.118300e+04 |
mean | -8.895286e-17 | -8.895286e-18 | 1.105557e-16 | -2.265121e-16 | -5.534139e-16 | -5.511900e-17 | 1.906133e-17 | -4.778039e-16 | 3.429768e-15 | 0.0 | 0.0 | -9.530664e-18 |
std | 1.000045e+00 | 1.000045e+00 | 1.000045e+00 | 1.000045e+00 | 1.000045e+00 | 1.000045e+00 | 1.000045e+00 | 1.000045e+00 | 1.000045e+00 | 0.0 | 0.0 | 1.000045e+00 |
min | -1.445636e+00 | -6.546676e-01 | -1.230798e+00 | -1.657854e+00 | -2.035507e+00 | -2.154238e+00 | -2.130375e+00 | -2.332211e-01 | -3.339532e+01 | 0.0 | 0.0 | -3.197923e-01 |
25% | -6.801201e-01 | -6.546676e-01 | -6.615118e-01 | -8.221316e-01 | -5.424547e-01 | -7.674041e-01 | -7.587809e-01 | -4.550192e-02 | -3.110732e-04 | 0.0 | 0.0 | -3.197923e-01 |
50% | -1.697762e-01 | -6.546676e-01 | -2.420375e-01 | 1.359045e-02 | -4.477065e-02 | 1.571516e-01 | 3.080150e-01 | -3.345387e-02 | 2.595593e-02 | 0.0 | 0.0 | -3.197923e-01 |
75% | 3.405678e-01 | 1.527493e+00 | 3.186884e-01 | 7.299237e-01 | 4.529134e-01 | 7.735220e-01 | 7.652132e-01 | -1.930459e-02 | 6.536592e-02 | 0.0 | 0.0 | -2.532033e-01 |
max | 5.954351e+00 | 1.527493e+00 | 8.113206e+00 | 1.923812e+00 | 2.609544e+00 | 1.389892e+00 | 1.374811e+00 | 3.341808e+01 | 3.855061e-01 | 0.0 | 0.0 | 4.408025e+00 |
train.Fare_amount.describe()
count 11183.000000 mean 14.650476 std 8.598972 min 0.010000 25% 7.500000 50% 13.000000 75% 20.014209 max 52.500000 Name: Fare_amount, dtype: float64
k_vals = [1, 5, 10, 15, 20, 25, 30, 35, 40, 50, 60]
knns = {
k: KNeighborsRegressor(n_neighbors=k).fit(
train[all_predictors], train.Fare_amount)
for k in k_vals}
train_r2s = [
metrics.r2_score(train.Fare_amount, model.predict(train[all_predictors]))
for k, model in knns.items()]
plt.plot(k_vals, train_r2s, '-+', label="Train")
plt.xlabel('n_neighbors')
plt.ylabel("$R^2$")
plt.legend();
Which n_neighbors should we choose?
Do best on unseen data.
So let's see how well our models do on it.
val_r2s = [
metrics.r2_score(valid.Fare_amount, model.predict(valid[all_predictors]))
for k, model in knns.items()]
plt.plot(k_vals, train_r2s, '-+', label="Train")
plt.plot(k_vals, val_r2s, '-*', label="Validation")
plt.xlabel('n_neighbors')
plt.ylabel("$R^2$")
plt.legend();
Now, which n_neighbors should we choose?
best_r2_idx = np.argmax(val_r2s)
best_r2 = val_r2s[best_r2_idx]
best_n_neighbors = k_vals[best_r2_idx]
print(f"Best n_neighbors is {best_n_neighbors}, which gives a validation R^2 of {best_r2:.3f}")
Best n_neighbors is 10, which gives a validation R^2 of 0.855
How confident are we about that choice? What if there was something unusual about our particular validation set?
How could we gain more confidence?
If we repeat an experiment and the results say the same thing, we get more confident in that result. How could we repeat this experiment?
Let's choose different subsets of our data as our training and validation sets.
from sklearn.model_selection import KFold
n_splits = 10
folds = np.zeros((n_splits, len(train)))
fake_dataset = np.arange(len(train))
for fold_idx, (train_indices, valid_indices) in enumerate(KFold(n_splits=n_splits).split(fake_dataset)):
folds[fold_idx, train_indices] = 2
folds[fold_idx, valid_indices] = 1
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(folds[::-1], ax=ax) # TODO: label this better!
ax.set(xlabel="Datum idx", ylabel="Fold idx");
Cross-Validation core
X = train[all_predictors]
y = train.Fare_amount
n_neighbors = 10 # for example
cv = KFold(n_splits=10)
cv_r2 = []
for fold_idx, (train_indices, valid_indices) in enumerate(cv.split(X)):
# Train on training set.
model = KNeighborsRegressor(n_neighbors=n_neighbors)
model.fit(X.iloc[train_indices], y.iloc[train_indices])
# Evaluate on validation set.
val_predictions = model.predict(X.iloc[valid_indices])
val_r2 = metrics.r2_score(y.iloc[valid_indices], val_predictions)
cv_r2.append(val_r2)
sns.boxplot(cv_r2); plt.xlabel("Validation $R^2$");
Now do this for each candidate n_neighbors!
def cross_validate_knn(X, y, n_splits, k_vals):
cv_results = []
cv = KFold(n_splits=n_splits)
# Outer loop: pick n_neighbors
for n_neighbors in k_vals:
# Inner loop: pick train and validation sets.
for fold_idx, (train_indices, valid_indices) in enumerate(cv.split(X)):
# Train on training set.
model = KNeighborsRegressor(n_neighbors=n_neighbors)
model.fit(X.iloc[train_indices], y.iloc[train_indices])
# Evaluate on validation set. I include MSE here just to show how.
val_predictions = model.predict(X.iloc[valid_indices])
val_r2 = metrics.r2_score(y.iloc[valid_indices], val_predictions)
val_mse = metrics.mean_squared_error(y.iloc[valid_indices], val_predictions)
cv_results.append(dict(
fold_idx=fold_idx,
n_neighbors=n_neighbors,
val_r2=val_r2,
val_mse=val_mse
))
return pd.DataFrame(cv_results)
cv_results = cross_validate_knn(X, y, n_splits=10, k_vals=k_vals)
cv_results.head(11)
fold_idx | n_neighbors | val_mse | val_r2 | |
---|---|---|---|---|
0 | 0 | 1 | 19.582487 | 0.751775 |
1 | 1 | 1 | 17.710063 | 0.743087 |
2 | 2 | 1 | 17.890290 | 0.739744 |
3 | 3 | 1 | 15.108882 | 0.790357 |
4 | 4 | 1 | 21.316752 | 0.713888 |
5 | 5 | 1 | 15.927632 | 0.773481 |
6 | 6 | 1 | 16.262528 | 0.774167 |
7 | 7 | 1 | 16.395153 | 0.789799 |
8 | 8 | 1 | 19.922931 | 0.728344 |
9 | 9 | 1 | 16.710313 | 0.795838 |
10 | 0 | 5 | 12.751734 | 0.838361 |
# Make a table of the validation R^2 data, where each row is a different fold.
val_r2_table = cv_results.pivot(index='fold_idx', columns='n_neighbors', values='val_r2')
# alternative: cv_results.set_index(['fold_idx', 'n_neighbors']).val_r2.unstack(level=1)
val_r2_table
n_neighbors | 1 | 5 | 10 | 15 | 20 | 25 | 30 | 35 | 40 | 50 | 60 |
---|---|---|---|---|---|---|---|---|---|---|---|
fold_idx | |||||||||||
0 | 0.751775 | 0.838361 | 0.846247 | 0.848543 | 0.846561 | 0.845755 | 0.843973 | 0.844033 | 0.840830 | 0.837155 | 0.834562 |
1 | 0.743087 | 0.829843 | 0.853299 | 0.858193 | 0.857334 | 0.856293 | 0.854176 | 0.853082 | 0.851862 | 0.849573 | 0.847336 |
2 | 0.739744 | 0.851182 | 0.859116 | 0.861349 | 0.863230 | 0.860448 | 0.860142 | 0.857826 | 0.855546 | 0.852625 | 0.849647 |
3 | 0.790357 | 0.855258 | 0.860757 | 0.863444 | 0.862979 | 0.862656 | 0.860542 | 0.858673 | 0.859154 | 0.855150 | 0.851734 |
4 | 0.713888 | 0.785308 | 0.791115 | 0.797459 | 0.797057 | 0.794462 | 0.792262 | 0.789779 | 0.788168 | 0.783295 | 0.780932 |
5 | 0.773481 | 0.842808 | 0.852990 | 0.859311 | 0.862505 | 0.858309 | 0.855681 | 0.854264 | 0.853136 | 0.849946 | 0.846897 |
6 | 0.774167 | 0.832477 | 0.844946 | 0.843760 | 0.844797 | 0.843010 | 0.841472 | 0.839870 | 0.838088 | 0.835430 | 0.832205 |
7 | 0.789799 | 0.855152 | 0.860794 | 0.860559 | 0.861657 | 0.859818 | 0.856437 | 0.854543 | 0.852385 | 0.850082 | 0.846750 |
8 | 0.728344 | 0.788996 | 0.800917 | 0.801670 | 0.800536 | 0.799309 | 0.798490 | 0.796204 | 0.794484 | 0.790631 | 0.789120 |
9 | 0.795838 | 0.871517 | 0.875833 | 0.874913 | 0.875388 | 0.874800 | 0.874535 | 0.873019 | 0.871999 | 0.869174 | 0.865932 |
Let's visualize that data, 3 different ways.
fig, ax = plt.subplots(figsize=(10, 5))
for fold_idx, row in val_r2_table.iterrows():
ax.plot(row.index, row.values, '-x', c='black', alpha=.3, label="single fold" if fold_idx is 0 else None)
# plt.plot(row.index, val_r2_table.mean(axis=0), c='white', linewidth=5)
ax.plot(row.index, val_r2_table.mean(axis=0), label='Mean')
ax.legend()
ax.set(xlabel="n_neighbors", ylabel="$R^2$ on validation fold", title="Performance vs hyperparameter over different CV folds");
fig, ax = plt.subplots(figsize=(10, 5))
ax.boxplot(val_r2_table.values, positions=k_vals, widths=4);
ax.set(xlabel="n_neighbors", ylabel="$R^2$ on validation fold");
cv_r2_mean = cv_results.groupby('n_neighbors').val_r2.mean()
cv_r2_std = cv_results.groupby('n_neighbors').val_r2.std()
plt.errorbar(k_vals, cv_r2_mean, yerr=cv_r2_std, capsize=5)
plt.gca().set(xlabel="n_neighbors", ylabel="Validation $R^2$ $\pm$ stdev");
Now, which n_neighbors should you choose?
For now, let's use the model with the best mean CV performance.
best_n_neighbors = cv_r2_mean.idxmax()
estimated_r2_best = cv_r2_mean[best_n_neighbors]
print("The best n_neighbors is {}, with estimated R^2 = {:.2f} +- {:.2f}".format(
best_n_neighbors, estimated_r2_best, cv_r2_std[best_n_neighbors]))
The best n_neighbors is 20, with estimated R^2 = 0.85 +- 0.03
all_my_data = pd.concat([train, valid])
my_best_model = (
KNeighborsRegressor(n_neighbors=best_n_neighbors)
.fit(all_my_data[all_predictors], all_my_data.Fare_amount))
assert len(train) + len(valid) == len(all_my_data)
print(f"Trained a new model on {len(train)} + {len(valid)} = {len(all_my_data)} samples.")
Trained a new model on 11183 + 3727 = 14910 samples.
We've already looked at train and validation. Only test remains completely unseen. The point of no return...
print(f"We expect our best model to get an R^2 of {estimated_r2_best:.3f}, or even better because we have more data now.")
print("Drumroll please...")
We expect our best model to get an R^2 of 0.847, or even better because we have more data now. Drumroll please...
test_predictions = my_best_model.predict(test[all_predictors])
test_r2 = metrics.r2_score(test.Fare_amount, test_predictions)
print(f"Our best model got an R^2 of {test_r2:.3f}.")
Our best model got an R^2 of 0.817.
btw, when I first ran this, the test R^2 was much worse, ~0.5. But I figured out why. Can you?
It turns out that the test set happened to have some enormous outliers.
def cross_validate_general(train_model, X, y, n_splits, params):
cv_results = []
cv = KFold(n_splits=n_splits)
# Outer loop: pick hyperparameter setting.
for param_setting in params:
# Inner loop: pick train and validation sets.
for fold_idx, (train_indices, valid_indices) in enumerate(cv.split(X, y)):
# Train on training set.
model = train_model(X[train_indices], y[train_indices], param_setting)
# Evaluate on validation set.
val_predictions = model.predict(X[valid_indices])
val_r2 = metrics.r2_score(y[valid_indices], val_predictions)
val_mse = metrics.mean_squared_error(y[valid_indices], val_predictions)
cv_results.append(dict(
param_setting,
fold_idx=fold_idx,
val_r2=val_r2,
val_mse=val_mse
))
return pd.DataFrame(cv_results)
def train_lasso(X, y, params):
return Lasso(alpha=params['alpha']).fit(X, y)
def train_KNN(X, y, params):
return KNeighborsRegressor(n_neighbors=params['n_neighbors']).fit(X, y)
X = train[all_predictors].values
y = train.Fare_amount.values
params = [{'alpha': alpha} for alpha in [.001, .01, .1, 1., 10.]]
cv_results = cross_validate_general(
train_model=train_lasso,
# train_model=train_KNN,
X=X, y=y,
n_splits=10, params=params)
agg_cv_results = cv_results.groupby('alpha').val_r2.agg(['mean', 'std'])
agg_cv_results
/Users/kcarnold/anaconda3/envs/py36/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems. ConvergenceWarning)
mean | std | |
---|---|---|
alpha | ||
0.001 | 0.844636 | 0.031926 |
0.010 | 0.844776 | 0.031824 |
0.100 | 0.844775 | 0.031353 |
1.000 | 0.813970 | 0.026487 |
10.000 | -0.001185 | 0.001649 |
best_alpha = agg_cv_results['mean'].idxmax()
estimated_r2 = agg_cv_results['mean'][best_alpha]
best_model = Lasso(alpha=best_alpha).fit(X, y)
test_predictions = best_model.predict(test[all_predictors].values)
test_r2 = metrics.r2_score(test.Fare_amount, test_predictions)
print(f"Best alpha was {best_alpha}, with estimated R^2 of {estimated_r2:.3f}.")
print(f"Training this on all data gives a test R^2 of {test_r2:.3f}")
Best alpha was 0.01, with estimated R^2 of 0.845. Training this on all data gives a test R^2 of 0.803
# We could also use a meta-estimator from sklearn.
# But make sure you understand all of what's going on in the above first!
from sklearn.model_selection import GridSearchCV
gridsearch_model = GridSearchCV(
estimator=Lasso(),
param_grid=[{'alpha': [.1, 1., 10.]}],
cv=KFold(n_splits=10),
scoring='r2',
return_train_score=False)
gridsearch_model.fit(X, y)
pd.DataFrame(gridsearch_model.cv_results_).set_index('param_alpha')
mean_fit_time | std_fit_time | mean_score_time | std_score_time | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | split5_test_score | split6_test_score | split7_test_score | split8_test_score | split9_test_score | mean_test_score | std_test_score | rank_test_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
param_alpha | ||||||||||||||||||
0.1 | 0.002758 | 0.000181 | 0.000329 | 0.000071 | {'alpha': 0.1} | 0.848971 | 0.852551 | 0.857965 | 0.864766 | 0.780191 | 0.863877 | 0.843819 | 0.856511 | 0.797919 | 0.881180 | 0.844777 | 0.029740 | 1 |
1.0 | 0.002498 | 0.000339 | 0.000279 | 0.000051 | {'alpha': 1.0} | 0.815950 | 0.819903 | 0.823129 | 0.838299 | 0.761552 | 0.825254 | 0.812359 | 0.825485 | 0.773266 | 0.844504 | 0.813972 | 0.025125 | 2 |
10.0 | 0.001751 | 0.000142 | 0.000266 | 0.000034 | {'alpha': 10.0} | -0.000373 | -0.000410 | -0.001153 | -0.000108 | -0.000075 | -0.005123 | -0.000034 | -0.002568 | -0.000001 | -0.002008 | -0.001185 | 0.001564 | 3 |
# It also refits the model on the full dataset, so we get the same test R^2.
gridsearch_model.score(test[all_predictors], test.Fare_amount)
0.803262478581998