CS109A Introduction to Data Science

Lecture 23: A/B Testing Demo

Harvard University
Fall 2018
Instructors: Pavlos Protopapas and Kevin Rader


In [1]:
## RUN THIS CELL TO PROPERLY HIGHLIGHT THE EXERCISES
import requests
from IPython.core.display import HTML
# Download the course stylesheet. The explicit timeout (seconds) keeps this cell
# from blocking the kernel indefinitely when the network stalls; without it,
# requests waits forever for a response.
styles = requests.get(
    "https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css",
    timeout=10,
).text
# Last expression of the cell, so the notebook displays the fetched markup.
HTML(styles)
Out[1]:

In [2]:
import pandas as pd
import sys
import numpy as np
import scipy as sp
import statsmodels as sm
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.proportion import proportions_ztest

%matplotlib inline

Read in the data

In [3]:
# Read the rat bone-density data from bone.csv (comma-separated) and preview
# the first rows to check the columns.
data_bone = pd.read_csv('bone.csv', sep=',')
data_bone.head()
Out[3]:
rat group treatment bone density lowjump highjump
0 1 Control 1 611 0 0
1 2 Control 1 621 0 0
2 3 Control 1 614 0 0
3 4 Control 1 593 0 0
4 5 Control 1 593 0 0
In [4]:
# Bone density measurements for each group, selected by the 'group' label.
controlgroup = data_bone.loc[data_bone['group'] == 'Control', 'bone density']
trt1group = data_bone.loc[data_bone['group'] == 'Lowjump', 'bone density']
trt2group = data_bone.loc[data_bone['group'] == 'Highjump', 'bone density']

# Two-sample t-test comparing Control against the Highjump group.
sp.stats.ttest_ind(controlgroup, trt2group)
Out[4]:
Ttest_indResult(statistic=-3.7154746787634116, pvalue=0.001583142292646354)
In [5]:
# Two-sample t-test: Control vs. Highjump.
# NOTE(review): this repeats the comparison from the previous cell; presumably
# Control vs. Lowjump (trt1group) was intended here -- confirm before changing,
# since the recorded output would change with it.
sp.stats.ttest_ind(a=controlgroup, b=trt2group)
Out[5]:
Ttest_indResult(statistic=-3.7154746787634116, pvalue=0.001583142292646354)
In [6]:
# One-way ANOVA F-test across all three groups (Control, Lowjump, Highjump).
sp.stats.f_oneway(
    controlgroup,
    trt1group,
    trt2group,
)
Out[6]:
F_onewayResult(statistic=7.977836956953773, pvalue=0.0018951062682877964)
In [7]:
# Overall mean bone density across all rats; used as the cut-off that turns
# the numeric response into a binary "above the mean" outcome.
ybar = data_bone['bone density'].mean()

# 0/1 indicators (1 = density above the overall mean), one Series per group.
controlgroup_ind = data_bone.loc[data_bone['group'] == 'Control', 'bone density'].gt(ybar) * 1
trt1group_ind = data_bone.loc[data_bone['group'] == 'Lowjump', 'bone density'].gt(ybar) * 1
trt2group_ind = data_bone.loc[data_bone['group'] == 'Highjump', 'bone density'].gt(ybar) * 1

# 2x2 table: rows = "is Control?" (False/True), columns = "density above mean?".
# Shows whether the indicator differs between Control and the other groups.
mytable1 = pd.crosstab(
    data_bone['group'].eq('Control'),
    data_bone['bone density'].gt(ybar),
)
mytable1
Out[7]:
bone density False True
group
False 6 14
True 8 2
In [8]:
# Second column of the 2x2 table: number of rats above the mean density in
# each row (non-Control, Control).
mytable1.iloc[:, 1]
Out[8]:
group
False    14
True      2
Name: True, dtype: int64
In [9]:
# Two-sample z-test for proportions: compares the share of rats above the mean
# density between the non-Control and Control rows of mytable1.
xs = mytable1.iloc[:, 1]     # "successes": count above the mean in each row
ns = mytable1.sum(axis=1)    # number of rats in each row
proportions_ztest(count=xs, nobs=ns)
Out[9]:
(2.587745847533828, 0.0096606229974849581)
In [10]:
# Fisher exact test on the 2x2 table of
# "is Control?" (rows) by "bone density above the overall mean?" (columns).
mytable = pd.crosstab(data_bone['group']=='Control',data_bone['bone density']>ybar)

# Test the table built in this cell. The original passed `mytable1` from an
# earlier cell and left `mytable` unused; both tables come from the same
# expression, so the result is unchanged, but the cell no longer depends on
# state left behind by another cell.
sp.stats.fisher_exact(mytable)
Out[10]:
(0.10714285714285714, 0.018690654672663655)
In [11]:
# 3x2 contingency table: group (Control / Highjump / Lowjump) by
# "bone density above the overall mean?".
mytable2 = pd.crosstab(
    data_bone['group'],
    data_bone['bone density'].gt(ybar),
)
# Chi-square test of independence on that table.
sp.stats.chi2_contingency(observed=mytable2)
Out[11]:
(13.928571428571431,
 0.00094503772154749571,
 2,
 array([[ 4.66666667,  5.33333333],
        [ 4.66666667,  5.33333333],
        [ 4.66666667,  5.33333333]]))
In [12]:
# Display the group-by-indicator contingency table that was passed to the
# chi-square test of independence.
mytable2
Out[12]:
bone density False True
group
Control 8 2
Highjump 0 10
Lowjump 6 4
In [28]:
# Chi-square goodness-of-fit test on the number of rats per group.
# No expected frequencies are supplied, so the observed group counts are the
# only input to the test.
sp.stats.chisquare(f_obs=data_bone['group'].value_counts())
Out[28]:
Power_divergenceResult(statistic=0.0, pvalue=1.0)
In [27]:
# Number of rats in each group; these are the observed counts used by the
# chi-square goodness-of-fit test.
data_bone['group'].value_counts()
Out[27]:
Lowjump     10
Control     10
Highjump    10
Name: group, dtype: int64