In [1]:
import pandas as pd
import sys
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from math import radians, cos, sin, asin, sqrt
import datetime
from sklearn.linear_model import LinearRegression
import seaborn as sns
sns.set(style="ticks")
%matplotlib inline
Download the data file (`hubway_trips.csv`) from
https://drive.google.com/open?id=0B28c493CP9GtMzN1emFoMkJNNlU
and save it in the same directory as this notebook.
First Look At The Data
In [2]:
# Load the Hubway trip log from the working directory. low_memory=False
# has pandas read the whole file before inferring column dtypes, which
# avoids mixed-type columns from chunked inference.
hubway_data = pd.read_csv(
    'hubway_trips.csv',
    low_memory=False,
)
# Preview the first five rows.
hubway_data.head(n=5)
Out[2]:
A little data manipulation
In [84]:
# Checkout year: the four characters at [-13:-9] of each start_date string,
# converted to an integer.
# NOTE(review): this assumes every start_date ends with a fixed-width
# " HH:MM:SS" time preceded by a 4-digit year -- confirm against the CSV.
check_out_year = hubway_data['start_date'].str.slice(-13, -9).astype('int64')
# Rider age at checkout = checkout year minus birth year.
# Presumably NaN where birth_date is missing (later cells call dropna()).
age_col = check_out_year.sub(hubway_data['birth_date'])
In [85]:
# First plot: distribution of rider age per checkout, with the mean age
# marked by a red vertical line.
known_ages = age_col.dropna()  # drop rows where no age could be computed
ax = plt.gca()
ax.hist(known_ages)
ax.axvline(x=known_ages.mean(), color='red', label='Average Age')
ax.set_title('Histogram of User Ages by Checkout')
ax.set_xlabel('Age in years')
ax.set_ylabel('Number of Checkouts')
ax.legend()
plt.show()
In [89]:
# Second plot: trip duration against rider age, one point per checkout.
ax = plt.gca()
ax.scatter(age_col, hubway_data['duration'])
ax.set_title('Scatter plot of Duration by User Ages')
ax.set_xlabel('Age in years')
ax.set_ylabel('Duration (in seconds)')
# Left disabled, as in the original cell: log-scaled y axis.
# ax.set_yscale('log')
plt.show()
In [64]:
# Subgroup check: the distinct subscription types and how many trips
# fall under each one.
subsc_labels, subsc_counts = np.unique(
    hubway_data['subsc_type'],
    return_counts=True,
)
print((subsc_labels, subsc_counts))
In [82]:
# Overlay the trip-duration histograms of the two subscription types so
# their distributions can be compared on one set of axes.
duration_registered = (hubway_data.duration[hubway_data.subsc_type == 'Registered'])
duration_casual = (hubway_data.duration[hubway_data.subsc_type == 'Casual'])
# Plot the two series defined just above. The names logduration_casual /
# logduration_registered are not assigned anywhere in the visible notebook,
# so referencing them raises NameError on a fresh kernel.
# The +1 offset is kept as-is; presumably it exists so that zero-second
# trips stay representable if the log x-scale below is switched on.
plt.hist(duration_casual.dropna() + 1, alpha=.5, label='Casual')
plt.hist(duration_registered.dropna() + 1, alpha=.5, label='Registered')
plt.title('Histograms of duration split by Checkouts')
plt.xlabel('Duration (in seconds)')
plt.ylabel('Number of Checkouts')
# Left disabled, as in the original cell: log-scaled x axis.
#plt.xscale('log')
# Label the two overlaid histograms so they can be told apart.
plt.legend()
plt.show()