CS109A Introduction to Data Science
Lecture 23: A/B Testing Demo
Harvard University
Fall 2018
Instructors: Pavlos Protopapas and Kevin Rader
In [1]:
## RUN THIS CELL TO PROPERLY HIGHLIGHT THE EXERCISES
# Downloads the course stylesheet and injects it into the notebook. The styling
# is purely cosmetic, so a network problem must neither hang nor break the notebook.
import requests
from IPython.display import HTML  # documented public import path for display classes

CSS_URL = "https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css"

try:
    # A timeout keeps the cell from blocking indefinitely on a stalled connection.
    response = requests.get(CSS_URL, timeout=10)
    # Only use the body on success; an HTTP error page would otherwise be rendered as HTML.
    styles = response.text if response.ok else ""
except requests.RequestException:
    # Offline or unreachable host: fall back to the default notebook styling.
    styles = ""
HTML(styles)
Out[1]:
In [2]:
import pandas as pd
import sys
import numpy as np
import scipy as sp
import statsmodels as sm
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.proportion import proportions_ztest
%matplotlib inline
Read in the data
In [3]:
# Read the bone-density data from the local comma-separated file
# and preview its first rows.
data_bone = pd.read_csv(
    'bone.csv',
    sep=',',
)
data_bone.head()
Out[3]:
In [4]:
# Extract the bone-density measurements for each of the three groups,
# selecting rows by group label and the measurement column in a single .loc call.
controlgroup = data_bone.loc[data_bone['group'] == 'Control', 'bone density']
trt1group = data_bone.loc[data_bone['group'] == 'Lowjump', 'bone density']
trt2group = data_bone.loc[data_bone['group'] == 'Highjump', 'bone density']

# Two-sample t-test: Control vs. Highjump
# (SciPy's default for ttest_ind assumes equal population variances).
sp.stats.ttest_ind(a=controlgroup, b=trt2group)
Out[4]:
In [5]:
# Two-sample t-test: Control vs. Highjump.
# NOTE(review): this runs the same comparison as the cell above; confirm whether
# a different comparison (e.g. Control vs. Lowjump) was intended here.
sp.stats.ttest_ind(a=controlgroup, b=trt2group)
Out[5]:
In [6]:
# One-way ANOVA F-test across all three groups (Control, Lowjump, Highjump)
anova_groups = (controlgroup, trt1group, trt2group)
sp.stats.f_oneway(*anova_groups)
Out[6]:
In [7]:
# Overall mean bone density, used as the cut-off for a binary "above the mean" outcome
ybar = data_bone['bone density'].mean()

# 0/1 indicators of an above-mean measurement within each group
controlgroup_ind = (data_bone.loc[data_bone['group'] == 'Control', 'bone density'] > ybar).astype(int)
trt1group_ind = (data_bone.loc[data_bone['group'] == 'Lowjump', 'bone density'] > ybar).astype(int)
trt2group_ind = (data_bone.loc[data_bone['group'] == 'Highjump', 'bone density'] > ybar).astype(int)

# Cross-tabulate Control-vs-other (rows) against the above-mean indicator (columns)
# to see whether the indicator differs between Control and the other groups.
mytable1 = pd.crosstab(
    index=data_bone['group'] == 'Control',
    columns=data_bone['bone density'] > ybar,
)
mytable1
Out[7]:
In [8]:
# Show the second column (position 1) of the Control-vs-other table;
# presumably the counts above the overall mean (the `True` column) — verify against the table.
mytable1.iloc[:,1]
Out[8]:
In [9]:
# Two-sample z-test for proportions, Control vs. the other groups.
# xs: per-row counts taken from the second column of the table
# ns: per-row totals (number of observations in each row of the table)
ns = mytable1.sum(axis='columns')
xs = mytable1.iloc[:, 1]
proportions_ztest(count=xs, nobs=ns)
Out[9]:
In [10]:
# Fisher exact test on the cross-tabulation of Control-vs-other (rows)
# against the above-mean bone-density indicator (columns).
# The table built in this cell is the one passed to the test, so the cell
# does not depend on `mytable1` left over from an earlier cell.
mytable = pd.crosstab(data_bone['group']=='Control',data_bone['bone density']>ybar)
sp.stats.fisher_exact(mytable)
Out[10]:
In [11]:
# Chi-square test of independence: group label (rows) against the
# above-mean bone-density indicator (columns).
above_mean = data_bone['bone density'] > ybar
mytable2 = pd.crosstab(index=data_bone['group'], columns=above_mean)
sp.stats.chi2_contingency(mytable2)
Out[11]:
In [12]:
# Display the group-by-indicator contingency table used in the chi-square test
mytable2
Out[12]:
In [28]:
# Chi-square goodness-of-fit test on the number of observations per group.
# With no expected frequencies supplied, SciPy tests against equal counts per category.
group_counts = data_bone['group'].value_counts()
sp.stats.chisquare(f_obs=group_counts)
Out[28]:
In [27]:
# Number of observations in each group (the observed counts for the goodness-of-fit test)
data_bone['group'].value_counts()
Out[27]: