Key Word(s): ??
import pymc3 as pm
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import seaborn as sns
import matplotlib.pyplot as plt
n_theta = 10000
# generate 10,000 values from Beta(2,5)
theta = np.random.beta(2,5,n_theta)
print("First five values of theta:\n\t", theta[0:5])
print("Sample mean:\n\t", np.mean(theta))
print("The 2.5% and 97.5% of quantiles:\n\t", np.percentile(theta,[2.5,97.5]))
plt.hist(theta,50)
plt.xlabel("Value of Theta")
plt.ylabel("Count")
plt.show()
# simulate y from posterior predictive distribution
y = np.random.binomial(1, theta, n_theta) # generate a heads/tails value from each of the 10,000 thetas
print("First 5 heads/tails values (tails=0, heads=1)\n\t", y[0:10])
print("Overall frequency of Tails and Heads, accounting for uncertainty about theta itself\n\t", np.bincount(y)/10000)
plt.hist(y, density=True)
plt.xticks([.05,.95],["Tails","Heads"])
plt.show()
Rejection sampling and Weighted bootstrap¶
Example adapted from https://wiseodd.github.io/techblog/2015/10/21/rejection-sampling/
sns.set()
def h(x):
return st.norm.pdf(x, loc=30, scale=10) + st.norm.pdf(x, loc=80, scale=20)
def g(x):
return st.norm.pdf(x, loc=50, scale=30)
x = np.arange(-50, 151)
M = max(h(x) / g(x)) # for rejection sampling
h is a mixture of two normal distributions (unnormalized), and density h is a normal distribution with mean 50 and standard deviation 30.
plt.plot(x, h(x))
plt.show()
# Superimpose h and g on same plot
plt.plot(x,h(x))
plt.plot(x,g(x))
plt.show()
# Superimpose h and M*g on same plot - now M*g envelopes h
plt.plot(x,h(x))
plt.plot(x,M*g(x))
plt.show()
def rejection_sampling(maxiter=10000,sampsize=1000):
samples = []
sampcount = 0 # counter for accepted samples
maxcount = 0 # counter for proposal simulation
# sampcount/maxcount at any point in the iteration is the acceptance rate
while (sampcount < sampsize and maxcount < maxiter):
z = np.random.normal(50, 30)
u = np.random.uniform(0, 1)
maxcount += 1
if u <= h(z)/(M*g(z)):
samples.append(z)
sampcount += 1
print('Rejection rate is',100*(1-sampcount/maxcount))
if maxcount == maxiter: print('Maximum iterations achieved')
return np.array(samples)
s = rejection_sampling(maxiter=10000,sampsize=1000)
sns.displot(s)
# weighted bootstrap computation involving h and g
import random
def weighted_bootstrap(iter=1000,size=100):
w = []
y = []
for i in range(iter):
z = np.random.normal(50, 30)
y.append(z)
wz = h(z)/g(z)
w.append(wz)
v = random.choices(y,weights=w,k=size) # do not need to renormalize w
return np.array(v)
wb = weighted_bootstrap(iter=10000,size=1000)
sns.displot(wb)
Beetles¶
beetles_x = np.array([1.6907, 1.7242, 1.7552, 1.7842, 1.8113, 1.8369, 1.8610, 1.8839])
beetles_x_mean = beetles_x - np.mean(beetles_x)
beetles_n = np.array([59, 60, 62, 56, 63, 59, 62, 60])
beetles_y = np.array([6, 13, 18, 28, 52, 53, 61, 60])
beetles_N = np.array([8]*8)
from scipy.special import expit
expit(2)
with pm.Model() as beetle_model:
# The intercept (log probability of beetles dying when dose=0)
# is centered at zero, and wide-ranging (easily anywhere from 0 to 100%)
# If we wanted, we could choose something like Normal(-3,2) for a no-dose
# death rate roughly between .007 and .25
alpha_star = pm.Normal('alpha*', mu=0, sigma=100)
# the effect on the log-odds of each unit of the dose is wide-ranging:
# we're saying we've got little idea what the effect will be, and it could
# be strongly negative.
beta = pm.Normal('beta', mu=0, sigma=100)
# given alpha, beta, and the dosage, the probability of death is deterministic:
# it's the inverse logit of the intercept+slope*dosage
# Because beetles_x has 8 entries, we end up with 8 p_i values
p_i = pm.Deterministic('$P_i$', pm.math.invlogit(alpha_star + beta*beetles_x_mean))
# finally, the number of bettles we see killed is Binomial(n=number of beetles, p=probability of death)
deaths = pm.Binomial('obs_deaths', n=beetles_n, p=p_i, observed=beetles_y)
trace = pm.sample(2000, tune=2000, target_accept=0.9)
pm.traceplot(trace, compact=False);
def trace_summary(trace, var_names=None):
if var_names is None:
var_names = trace.varnames
quants = [0.025,0.25,0.5,0.75,0.975]
colnames = ['mean', 'sd', *["{}%".format(x*100) for x in quants]]
rownames = []
series = []
for cur_var in var_names:
var_trace = trace[cur_var]
if var_trace.ndim == 1:
vals = [np.mean(var_trace, axis=0), np.std(var_trace, axis=0), *np.quantile(var_trace, quants, axis=0)]
series.append(pd.Series(vals, colnames))
rownames.append(cur_var)
else:
for i in range(var_trace.shape[1]):
cur_col = var_trace[:,i]
vals = [np.mean(cur_col, axis=0), np.std(cur_col, axis=0), *np.quantile(cur_col, quants, axis=0)]
series.append(pd.Series(vals, colnames))
rownames.append("{}[{}]".format(cur_var,i))
return pd.DataFrame(series, index=rownames)
trace_summary(trace)
We can also plot the density each chain explored. Any major deviations between chains are signs of difficulty converging.
for x in trace.varnames:
pm.plot_forest(trace, var_names=[x], combined=True)
In addition to the above summaries of the distribution, pymc3 has statistics intended to summarize the quality of the samples. The most common of these is r_hat, which measures whether the different chains seem to be exploring the same space or if they're stuck in different spaces. R-hat above 1.3 is a strong sign the sample isn't good yet. Values close to 1 are ideal.
pm.summary(trace)
Sleep Study¶
import pandas as pd
sleepstudy = pd.read_csv("sleepstudy.csv")
sleepstudy
# adding a column that numbers the subjects from 0 to n
raw_ids = np.unique(sleepstudy['Subject'])
raw2newid = {x:np.where(raw_ids == x)[0][0] for x in raw_ids}
sleepstudy['SeqSubject'] = sleepstudy['Subject'].map(raw2newid)
sleepstudy
with pm.Model() as sleep_model:
# In this model, we're going to say the alphas (individuals' intercepts; their starting reaction time)
# and betas (individuals' slopes; how much worse they get with lack of sleep) are normally distributed.
# We'll specify that we're certain about the mean of those distribution [more on that later], but admit
# we're uncertain about how much spread there is (i.e. uncertain about the SD). Tau_alpha and Tau_beta
# will be the respective SD.
#
# Of course, the SDs must be positive (negative SD isn't mathematically possible), so we draw them from
# a Gamma, which cannot ever output negative numbers. Here, we use alpha and beta values that spread the
# distribution: "the SD could be anything!". If we had more intuition (e.g. "the starting reaction times can't
# have SD above 3,000") we would plot Gamma(a,b) and tune the parameters so that there was little mass
# above 3,000, then use those values below)
tau_alpha = pm.Gamma('tau_alpha', alpha=.001, beta=.001)
tau_beta = pm.Gamma('tau_beta', alpha=.001, beta=.001)
# Across the population of people, we suppose that
# the slopes are normally distributed, as are the intercepts,
# and the two are drawn independently
#
# (Here, we hard-code assumed means, but we don't have to.
# In general, these should be set from our pre-data intuition,
# rather than from plots/exploration of the data)
alpha = pm.Normal('alpha', mu=300, tau=tau_alpha, shape=len(raw_ids))
beta = pm.Normal('beta', mu=10, tau=tau_beta, shape=len(raw_ids))
# Remember: there's only one alpha/beta per person, but
# we have lots of observations per person. The below
# builds a vector with one entry per observation, recording
# the alpha/beta we want to use with that observation.
#
# That is, the length is 180, but it only has 17 unique values,
# matching the 17 unique patients' personal slopes or intercepts
intercepts = alpha[sleepstudy['SeqSubject']]
slopes = beta[sleepstudy['SeqSubject']]
# now we have the true/predicted response time for each observation (each row of original data)
# (Here we use pm.Deterministic to signal that this is something we'll care about)
mu_i = pm.Deterministic('mu_i', intercepts + slopes*sleepstudy['Days'])
# The _observed_ values are noisy versions of the hidden true values, however!
# Specifically, we model them as a normal at the true value and single unknown variance
# (one explanation: we're saying the measurement equipment adds normally-distributed noise tau_obs
# so noise doesn't vary from observation to observation or person to person: there's just one universal
# noise level)
tau_obs = pm.Gamma('tau_obs', 0.001, 0.001)
obs = pm.Normal('observed', mu=mu_i, tau=tau_obs, observed=sleepstudy['Reaction'])
trace = pm.sample(2000, tune=2000, target_accept=0.9)
# this command can take a few minutes to finish... or never :-/
#pm.traceplot(trace);
trace_summary(trace, var_names=['tau_alpha', 'tau_beta', 'alpha', 'beta', 'tau_obs'])
pm.summary(trace, var_names=['tau_alpha', 'tau_beta', 'alpha', 'beta', 'tau_obs'])
import statsmodels.formula.api as sm
import seaborn as sns
from matplotlib import gridspec
ymin,ymax = np.min(sleepstudy["Reaction"]),np.max(sleepstudy["Reaction"])
plt.figure(figsize=(11,8.5))
gs = gridspec.GridSpec(3, 6)
gs.update(wspace=0.5, hspace=0.5)
for i, subj in enumerate(np.unique(sleepstudy['Subject'])):
ss_extract = sleepstudy.loc[sleepstudy['Subject']==subj]
ss_extract_ols = sm.ols(formula="Reaction~Days",data=ss_extract).fit()
#new subplot
subplt = plt.subplot(gs[i])
#plot without confidence intervals
sns.regplot(x='Days', y='Reaction', ci=None, data=ss_extract).set_title('Subject '+str(subj))
if i not in [0,6,12]:
plt.ylabel("")
i+=1
subplt.set_ylim(ymin,ymax)
_ = plt.figlegend(['Estimated from each subject alone'],loc = 'lower center', ncol=6)
_ = plt.show()
plt.figure(figsize=(11,8.5))
for i, subj in enumerate(np.unique(sleepstudy['Subject'])):
ss_extract = sleepstudy.loc[sleepstudy['Subject']==subj]
#new subplot
subplt = plt.subplot(gs[i])
#plot without confidence intervals
sns.regplot(x='Days', y='Reaction', ci=None, data=ss_extract).set_title('Subject '+str(subj))
sns.regplot(x='Days', y='Reaction', ci=None, scatter=False, data=sleepstudy)
if i not in [0,6,12]:
plt.ylabel("")
i+=1
subplt.set_ylim(ymin,ymax)
_ = plt.figlegend(['Estimated from each subject alone','Pooling all subjects'],loc = 'lower center', ncol=6)
_ = plt.show()
plt.figure(figsize=(11,8.5))
subj_arr = np.unique(sleepstudy['Subject'])
for i, subj in enumerate(subj_arr):
ss_extract = sleepstudy.loc[sleepstudy['Subject']==subj]
#new subplot
subplt = plt.subplot(gs[i])
#plot without confidence intervals
sns.regplot(x='Days', y='Reaction', ci=None, data=ss_extract).set_title('Subject '+str(subj))
sns.regplot(x='Days', y='Reaction', ci=None, scatter=False, data=sleepstudy)
subj_num = int(np.where(subj_arr==subj)[0])
subjects_avg_intercept = np.mean(trace['alpha'][:,i])
subjects_avg_slope = np.mean(trace['beta'][:,i])
hmodel_fit = [subjects_avg_intercept + subjects_avg_slope*x for x in range(-1,11)]
sns.lineplot(x=range(-1,11),y=hmodel_fit)
if i not in [0,6,12]:
plt.ylabel("")
i+=1
subplt.set_ylim(ymin,ymax)
_ = plt.figlegend(['Estimated from each subject alone','Pooling all subjects','Hierarchical (partial pooling)'],loc = 'lower center', ncol=6)
_ = plt.show()
model_predictions = trace['mu_i'].mean(axis=0)
obs_reactions = sleepstudy['Reaction']
plt.figure(figsize=(11,8.5))
plt.scatter(sleepstudy['Reaction'], model_predictions)
plt.plot(plt.xlim(), plt.ylim(), c='black')
plt.xlabel("Observed Reaction Time (ms)")
plt.ylabel("Predicted Reaction Time [Mean of Posterior] (ms)")
plt.title("Observed and Fitted Reaction Times from . Bayesian Hierarchical Model")
plt.show()