CS109A Introduction to Data Science

Lecture 2: Example

Harvard University
Fall 2019

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from math import radians, cos, sin, asin, sqrt
import datetime
from sklearn.linear_model import LinearRegression
import seaborn as sns
sns.set(style="ticks")
%matplotlib inline


First Look At The Data

# Load the trip records. low_memory=False makes pandas read the file in one
# pass instead of in chunks, which avoids mixed-dtype columns.
hubway_data = pd.read_csv(
    filepath_or_buffer='hubway_trips.csv',
    low_memory=False,
)


# (rows, columns) of the trip table; shown as the cell's output in a notebook.
hubway_data.shape


A little data manipulation

def _year_from_start_date(stamp):
    """Return the 4-digit year sliced out of a start_date string."""
    # Assumes the string ends in "YYYY HH:MM:SS": the last 9 characters are
    # " HH:MM:SS" and the 4 characters before them are the year.
    return int(stamp[-13:-9])


# Year in which each trip was checked out.
check_out_year = hubway_data['start_date'].apply(_year_from_start_date)

# Rider age at checkout; presumably 'birth_date' holds a birth year (verify
# against the data). Rows without a birth_date come out as NaN.
year_to_age = check_out_year - hubway_data['birth_date']


Who

fig, ax = plt.subplots(1, 2, figsize=(15, 6))

# First plot: bar chart of riders by gender, with missing values counted as
# their own 'NaN' category.
gender_counts = np.unique(hubway_data['gender'].replace(np.nan, 'NaN', regex=True).values, return_counts=True)
gender_values, gender_totals = gender_counts
bar_positions = list(range(len(gender_values)))
ax[0].bar(bar_positions, gender_totals, align='center', color=['black', 'green', 'teal'], alpha=0.5)

# np.unique returns the categories in sorted order, so the tick labels are
# built from that same array. Hard-coding them risks putting a label on the
# wrong bar and passing more labels than ticks.
# Assumes the raw values are 'Male' / 'Female'; anything else is shown verbatim.
gender_display = {'NaN': 'none', 'Male': 'male', 'Female': 'female'}
ax[0].set_xticks(bar_positions)
ax[0].set_xticklabels([gender_display.get(str(g), str(g)) for g in gender_values])
ax[0].set_title('Users by Gender')

# Second plot: histogram of rider ages at checkout, with the mean marked.
ages = year_to_age.dropna()
ax[1].hist(ages, bins=30)
ax[1].set_title('Histogram of User Ages by Checkout')
ax[1].axvline(x=np.mean(ages), color='red', label='Average Age')
ax[1].legend()

plt.show()

# Third plot (its own figure): trip duration against rider age.
# The y axis is logarithmic; +1 keeps zero-second trips plottable on it.
plt.yscale('log')
plt.scatter(year_to_age, hubway_data.duration + 1)
plt.title('Scatter plot of Duration by User Ages')
plt.xlabel('Age in years')
plt.ylabel('Duration (in seconds)')
plt.show()


Where

# Load the station table and keep only the id and its coordinates.
station_data = pd.read_csv('hubway_stations.csv', low_memory=False)
station_data = station_data[['id', 'lat', 'lng']]

# Attach the start station's lat/lng to every trip. DataFrame.join is a left
# join by default, so trips whose 'strt_statn' has no match keep NaN coordinates.
station_lookup = station_data.set_index('id')
hubway_data_with_gps = hubway_data.join(station_lookup, on='strt_statn')


When

# Hour of day of each checkout, sliced straight out of the start_date string
# (assumes it ends in "HH:MM:SS"). Parsing the whole column with
# pd.to_datetime(hubway_data['start_date']) would be the alternative.
check_out_hours = hubway_data['start_date'].apply(lambda s: int(s[-8:-6]))

fig, ax = plt.subplots(1, 1, figsize=(10, 5))

# (distinct hours, number of checkouts in each hour)
check_out_counts = np.unique(check_out_hours, return_counts=True)
hours_seen, checkouts_per_hour = check_out_counts
ax.bar(hours_seen, checkouts_per_hour, align='center', width=0.4, alpha=0.6)

# Always show the full 0-23 range, even for hours without any checkout.
ax.set_xlim([-1, 24])
ax.set_xticks(range(24))
ax.set(
    xlabel='Hour of Day',
    ylabel='Number of Checkouts',
    title='Time of Day vs Checkouts',
)

plt.show()


How

def haversine(pt, lat2=42.355589, lon2=-71.060175, radius=3956):
    """
    Return the great-circle distance between two points on the earth.

    All coordinates are given in decimal degrees.

    Parameters
    ----------
    pt : sequence of two floats
        The first point as ``(longitude, latitude)`` -- note the order.
    lat2, lon2 : float
        Latitude and longitude of the second point. The defaults are the
        reference point this file plots as the "city center".
    radius : float
        Radius of the sphere; the result is in the same unit. Defaults to
        3956, the radius of the earth in miles.

    Returns
    -------
    float
        Distance along the surface, in the unit of ``radius``. NaN
        coordinates yield NaN.
    """
    lon1, lat1 = pt[0], pt[1]

    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, (lon1, lat1, lon2, lat2))

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make asin() raise ValueError.
    # A plain comparison (rather than min/max) leaves NaN untouched.
    if a > 1.0:
        a = 1.0

    c = 2 * asin(sqrt(a))
    return c * radius

# Number of checkouts per start station (trips without a start station are ignored).
station_counts = np.unique(hubway_data_with_gps['strt_statn'].dropna(), return_counts=True)
station_ids, checkouts_per_station = station_counts
counts_df = pd.DataFrame({'id': station_ids, 'checkouts': checkouts_per_station})

# Bring in each station's coordinates, then its distance to the city center.
# haversine expects (lng, lat) order, hence the column order below.
counts_df = counts_df.join(station_data.set_index('id'), on='id')
counts_df.loc[:, 'dist_to_center'] = [haversine(lng_lat) for lng_lat in counts_df[['lng', 'lat']].values]

fig, ax = plt.subplots(1, 1, figsize=(10, 5))

station_distance = counts_df['dist_to_center'].values
station_checkouts = counts_df['checkouts'].values
ax.scatter(station_distance, station_checkouts)

# Simple linear fit of checkouts on distance; scikit-learn wants a 2-D
# (n_samples, 1) feature matrix, hence the reshape.
reg_line = LinearRegression()
reg_line.fit(station_distance.reshape(-1, 1), station_checkouts)

# Evaluate the fitted line on an even grid spanning the observed distances.
distances = np.linspace(counts_df['dist_to_center'].min(), counts_df['dist_to_center'].max(), 50)
fitted_checkouts = reg_line.predict(distances.reshape(-1, 1))
ax.plot(distances, fitted_checkouts, color='red', label='Regression Line')

ax.set(
    xlabel='Distance to City Center (Miles)',
    ylabel='Number of Checkouts',
    title='Distance to City Center vs Checkouts',
)
ax.legend()

plt.savefig('How.png', dpi=300)

# Let's look at some subgroups: how many trips fall under each subscription type?
print(np.unique(hubway_data.subsc_type, return_counts=True))

# Trip durations split by subscription type.
duration_registered = hubway_data.duration[hubway_data.subsc_type == 'Registered']
duration_casual = hubway_data.duration[hubway_data.subsc_type == 'Casual']

# Mean duration (in seconds) of each group.
print(np.mean(duration_registered))
print(np.mean(duration_casual))

# Compare the groups on a log scale; +1 keeps zero-second trips finite.
logduration_registered = np.log(duration_registered + 1)
logduration_casual = np.log(duration_casual + 1)

# Overlay the two histograms. Each gets a label and the plot a legend, since
# otherwise the two semi-transparent distributions cannot be told apart.
plt.hist(logduration_registered.dropna(), alpha=.5, bins=30, label='Registered')
plt.hist(logduration_casual.dropna(), alpha=.5, bins=30, label='Casual')

plt.title('Histograms of duration split by Subscription Type')
plt.xlabel('Duration (in log(seconds))')
plt.ylabel('Number of Checkouts')
plt.legend()

plt.show()

