This notebook is a short introduction to multilevel regression modeling using Stan and the CmdStanPy interface. It shows how to integrate CmdStanPy into the data analysis workflow: how to instantiate the Stan model, fit it to data, access and validate the inference engine outputs, and use the results for downstream analysis and prediction.
A secondary goal is to demonstrate best practices for Bayesian data analysis. Before coding up a model and trying to fit it to the data, it is critical to establish both the analysis goals and the sizes, shapes, and tendencies of the available data. Once the model is running, we can use posterior predictive checks to assess whether or not the model is properly specified. Both of these activities rely primarily on data visualization. This notebook uses the plotnine package, a Python implementation of a grammar of graphics based on ggplot2.
The data and models for this notebook are taken from chapter 12 of the book Data Analysis Using Regression and Multilevel/Hierarchical Models by Andrew Gelman and Jennifer Hill, Cambridge Press, 2007. In this chapter they use a multilevel regression model to analyze data taken from a national survey of home radon levels in the US done by the EPA in the early 1990s.
The goal of the radon study is to provide reasonable estimates of home radon levels in each of the approximately 3000 counties in the United States. Radon gas is a product of the slow decay of uranium into lead. Because of local differences in geology, the level of exposure to radon gas differs from place to place. A common source is uranium-containing minerals in the ground; radon seeps up from the soil and accumulates in subterranean areas such as basements. Image from https://www.health.state.mn.us/communities/environment/air/radon/index.html
In addition to CmdStanPy and plotnine, we will be using both numpy and pandas.
# import all libraries used in this notebook
import os
import numpy as np
import pandas as pd
from cmdstanpy import CmdStanModel
# plotting libs
import matplotlib.pyplot as plt
import plotnine as p9
# suppress plotnine warnings
import warnings
warnings.filterwarnings('ignore')
# setup plotnine look and feel
p9.theme_set(
p9.theme_grey() +
p9.theme(text=p9.element_text(size=10),
plot_title=p9.element_text(size=14),
axis_title_x=p9.element_text(size=12),
axis_title_y=p9.element_text(size=12),
axis_text_x=p9.element_text(size=8),
axis_text_y=p9.element_text(size=8)
)
)
xlabels_90 = p9.theme(axis_text_x = p9.element_text(angle=90, hjust=1))
# keep notebook outputs clean - demos only
import logging
logging.getLogger('cmdstanpy').setLevel(logging.CRITICAL)
The data comes from EPA surveys at the state and national level carried out in the 1990s. It is available from the Gelman and Hill ARM website, together with the R scripts used to produce the examples in the book.
Raw data
The study data is in two separate files: one for the radon home survey, one for county soil uranium levels. These are distributed as srrs2.dat and cty.dat respectively. For this notebook, we have downloaded and renamed them:
data/raw_radon.csv - home radon measurements, and the floor on which the measurement was taken, "0" for basement, "1" for ground floor.
data/raw_uranium.csv - county-level measurements of soil uranium levels in parts per million.
There are a total of 120K home radon measurements from 3000 US counties. The number of measurements per county follows the population density: sparsely populated rural counties have few or no measurements, while metropolitan counties have correspondingly more.
Processed data
Our analysis will use both the home radon data measurements and the county level data.
We need to extract and combine that subset of the information in these tables into the dataset required for this analysis. The essential pre-processing steps are
Cross-index the county-level data and the home-level data.
Put home radon and soil uranium on the log scale, following Gelman and Hill, chapter 4, section 4.
Restrict the dataset to Minnesota.
See Appendix B for the full set of pre-processing operations.
The results are in two files: data/mn_radon.csv and data/mn_uranium.csv.
mn_radon = pd.read_csv(os.path.join('data','mn_radon.csv'))
print(f'number of houses: {len(mn_radon)}')
mn_radon.head(7)
number of houses: 919
| | floor | county | log_radon | log_uranium | county_id |
|---|---|---|---|---|---|
| 0 | 1 | AITKIN | 0.788457 | -0.689048 | 1 |
| 1 | 0 | AITKIN | 0.788457 | -0.689048 | 1 |
| 2 | 0 | AITKIN | 1.064711 | -0.689048 | 1 |
| 3 | 0 | AITKIN | 0.000000 | -0.689048 | 1 |
| 4 | 0 | ANOKA | 1.131402 | -0.847313 | 2 |
| 5 | 0 | ANOKA | 0.916291 | -0.847313 | 2 |
| 6 | 0 | ANOKA | 0.405465 | -0.847313 | 2 |
mn_counties = pd.read_csv(os.path.join('data','mn_uranium.csv'))
print(f'number of counties: {len(mn_counties)}')
mn_counties.head(3)
number of counties: 85
| | county | log_uranium | county_id | homes |
|---|---|---|---|---|
| 0 | AITKIN | -0.689048 | 1 | 4 |
| 1 | ANOKA | -0.847313 | 2 | 52 |
| 2 | BECKER | -0.113459 | 3 | 3 |
Best Practice: avoid meaningless precision
The precision of an estimate is inversely proportional to the square root of the amount of data. For the Minnesota data, only 2 decimal places are warranted. To change the default print behavior for pandas DataFrames, we use the pandas global option display.precision.
pd.set_option('display.precision', 2)
For plotting we use the plotnine package, which is a Python implementation of a grammar of graphics based on the R ggplot2 package. The grammar allows users to compose plots from one or more layers. A good resource is Data Visualization with Plotnine, a Python translation of R for Data Science.
A grammar of graphics defines a plot in terms of its component parts: the data, the aesthetic mappings from data values to visual properties, geometric objects (geoms), statistical transformations (stats), scales, facets, and themes.
A plot layer is based on a geom, or a stat paired with a geom. As a first example, we create a plot containing a single layer, a plotnine geom_histogram. We provide the minimum plot specification: the geom, data, and mapping.
p9.ggplot(data=mn_radon, mapping=p9.aes(x='log_radon')) + p9.geom_histogram()
Each layer works off a dataset and mapping which can be specified either as arguments to the plotnine.ggplot object, as in the above example, or as arguments to the geom object. An alternative way to create the same plot is:
p9.ggplot() + p9.geom_histogram(data=mn_radon, mapping=p9.aes(x='log_radon'))
In this example, the only aesthetic is "x" - the position of data along the x-axis; the y-axis shows the counts in the histogram bins. Different geoms require different mappings; were this a scatterplot, a mapping for value "y" would be required as well. Other common mappings are color, fill, alpha, size, and shape.
All of these can be mapped to features of the data, so that a single plot layer can express many aspects of the dataset. To see how this works, we add information to the histogram by using aesthetics "color" and "fill" to factor radon measurements by floor.
# overlay histograms
(p9.ggplot(data=mn_radon, mapping=p9.aes(x='log_radon', color='factor(floor)', fill='factor(floor)'))
+ p9.geom_histogram(alpha=0.7, binwidth=0.2)
+ p9.scale_color_manual(['darkgreen','darkblue'])
+ p9.scale_fill_manual(['orange','violet'])
+ p9.theme(figure_size=(6,4))
)
In analyzing and plotting the data, the county name is a categorical value. We update the mn_radon dataframe accordingly.
# use home radon data county name as a categorical variable
mn_radon['county'] = (mn_radon['county'].astype('category', copy=False))
A sometimes overlooked issue when doing model criticism and model comparison is the fact that we are evaluating the fit of the model to this particular dataset. The size and shape of the data inform our choice of model, and the data collected is not always the data expected. Therefore we start with plots and summaries of the raw data.
First questions: amount of data, variable of interest
How much data is there for Minnesota?
print(f'number of houses: {len(mn_radon)}')
print(f'number of counties: {len(mn_counties)}')
number of houses: 919
number of counties: 85
The goal of our analysis is to estimate home radon levels; therefore the outcome variable of interest is log_radon. We use the pandas.DataFrame.describe function to get summary statistics over the observed outcome log_radon.
print(f'log_radon summary statistics\n{mn_radon["log_radon"].describe()}')
log_radon summary statistics
count    919.00
mean       1.22
std        0.85
min       -2.30
25%        0.64
50%        1.28
75%        1.79
max        3.88
Name: log_radon, dtype: float64
Relationship between radon and floor
As the radon pathways diagram shows, radon comes from the soil; therefore the floor on which the measurement was taken should be a good predictor of the observed radon level. The floor is coded "0" for basement and "1" for ground floor. Most of the observations in the survey were taken on the basement level.
pct_1 = round((mn_radon.floor.sum() / len(mn_radon) * 100))
pct_0 = round(100 - pct_1)
print(f'floor 0: {pct_0}%\nfloor 1: {pct_1}%')
floor 0: 83%
floor 1: 17%
Plotting the histogram of raw counts of observations by floor clearly shows the differing amount of per-floor data, but the trend in log radon levels is unclear.
# overlay histograms
(p9.ggplot(data=mn_radon, mapping=p9.aes(x='log_radon', color='factor(floor)', fill='factor(floor)'))
+ p9.geom_histogram(alpha=0.7, binwidth=0.2)
+ p9.scale_color_manual(['darkgreen','blue'], name='floor')
+ p9.scale_fill_manual(['orange','violet'], name='floor')
+ p9.xlab("log radon levels")
+ p9.theme(figure_size=(6,4))
)
The plotnine stat_density shows per-floor trends, but obscures the difference in amounts of observations. This is an example of a layer constructed from a plotnine stat paired with a geom.
(p9.ggplot(data=mn_radon, mapping=p9.aes(x='log_radon', color='factor(floor)'))
+ p9.stat_density(geom='line')
+ p9.scale_color_manual(['darkorange','purple'], name='floor')
+ p9.xlab("log radon levels")
+ p9.theme(figure_size=(6,4))
)
The plotnine geom_point plots two variables as (x, y) points. A scatterplot of points (floor, log_radon) will only have 2 distinct x-axis values: 0 and 1. The plotnine geom_jitter adds jitter to the (x, y) points, which reduces the amount of overplotting and therefore allows for a better visualization of the amount of data being plotted. Therefore we use the latter to visualize the differences between the radon measurements by floor. Because log_radon is the outcome variable of interest for this example, whenever possible, we plot it on the y-axis.
plot_radon_floor = (p9.ggplot(data=mn_radon, mapping=p9.aes(x='floor', y='log_radon'))
+ p9.geom_jitter(width=0.1, alpha=0.5, fill='orange', color='darkred')
+ p9.scale_x_continuous(breaks=range(0,2), minor_breaks=[])
+ p9.ggtitle("Radon measurements by floor")
+ p9.theme(figure_size=(4,4))
)
plot_radon_floor
County-level information
Because most home radon measurements were taken at the basement level, and because most counties have relatively few home measurements, there are many counties where all measurements are from the basement floor.
print(f'Number of counties: {mn_radon.county.nunique()}')
print(f'Counties with measurements from floor 0: {mn_radon[mn_radon["floor"]==0]["county"].nunique()}')
print(f'Counties with measurements from floor 1: {mn_radon[mn_radon["floor"]==1]["county"].nunique()}')
Number of counties: 85
Counties with measurements from floor 0: 85
Counties with measurements from floor 1: 60
At the county level we have many home radon measurements from the relatively few counties with metropolitan areas, and very few home radon measurements from the rest. A basic way to see this distribution of homes per county is a histogram.
(p9.ggplot()
+ p9.geom_histogram(data=mn_counties, mapping=p9.aes(x='homes'), bins=40)
+ p9.xlab("homes per county")
+ p9.ylab("counties per bin")
+ p9.theme(figure_size=(12,4))
)
Sort order: observations per county, ascending
The amount of data per county directly affects the precision of our estimates; we sort these counties accordingly.
obs_asc = mn_counties.sort_values(by='homes').reset_index(drop=True).county.values
mn_radon['county'] = mn_radon['county'].cat.reorder_categories(obs_asc)
Boxplot visualizations
Another way to visualize the amount and spread of data per county is by using a plotnine.geom_boxplot to generate a box-and-whiskers plot for each set of per-county radon measurements. The box encloses the central 25%-75% quantiles; this is the interquartile range (IQR). The whiskers extend to the most extreme values within 1.5 times the IQR of the box, and values beyond that are plotted as points - these are the outliers.
Setting the width of the box to be proportional to the square root of the number of observations shows the amount of data per county, as well as its spread. Because the counties are ordered by number of observations, the width of the boxes increases from left to right.
(p9.ggplot(data=mn_radon, mapping=p9.aes(x='county',y='log_radon'))
+ p9.geom_boxplot(width=2, varwidth=True, outlier_alpha=0.4)
+ p9.scale_x_discrete(expand=(0,3))
+ p9.ggtitle("Counties ordered by number of observations per county")
+ p9.ylab("range of home radon measurements")
+ xlabels_90
+ p9.theme(figure_size=(20,6))
)
Relationship between home radon and county-level soil uranium
At the county level, we have information on the soil uranium level. We plot soil uranium against the number of observations per county. The points on the x-axis line up with the histogram bars in the plot above, but instead of histogram bars, we have a series of points showing the different log_uranium levels.
(p9.ggplot(data=mn_counties, mapping=p9.aes(x='homes', y='log_uranium'))
+ p9.geom_point(fill='orange', color='darkred')
+ p9.geom_text(data=mn_counties[mn_counties['homes']>25],
mapping=p9.aes(label='county'),
size=8, nudge_x=4, nudge_y=0.1)
+ p9.xlab("observations per county") + p9.ylab("county soil log_uranium")
+ p9.theme(figure_size=(12,4))
)
We plot the relationship between soil uranium level and the home radon measurement. We use plotnine's facet_grid to get side-by-side per-floor plots. Because the soil uranium level measurement is the same for all homes in a county, for counties with many houses, i.e., metropolitan areas, the plot shows distinct vertical bands.
Comparing the information in two side-by-side plots is difficult. We have established that there are fewer measurements taken on the ground floor than on the basement level. But we can't see whether or not the home radon measurements are consistently lower when taken on the ground floor.
(p9.ggplot(data=mn_radon, mapping=p9.aes(x='log_uranium', y='log_radon'))
+ p9.geom_point(alpha=0.9, fill='orange', color='darkred')
+ p9.facet_grid(facets='~ floor', labeller='label_both')
+ p9.xlab("county-level soil log_uranium") + p9.ylab("home log_radon")
+ p9.theme(figure_size=(12,4))
)
Alternatively, we can use color to indicate floor, basement orange and ground floor purple, and add jitter to reduce overplotting. Taken together, it's not clear whether or not the purple dots are generally lower than the orange ones.
(p9.ggplot()
+ p9.geom_jitter(data=mn_radon[mn_radon['floor']==0],
mapping=p9.aes(x='log_uranium', y='log_radon'),
width=0.01, alpha=0.7, fill='orange', color='darkred')
+ p9.geom_jitter(data=mn_radon[mn_radon['floor']==1],
mapping=p9.aes(x='log_uranium', y='log_radon'),
width=0.01, alpha=0.7, fill='purple', color='darkblue')
+ p9.xlab("county-level soil log_uranium")
+ p9.ylab("home log_radon")
+ p9.theme(figure_size=(8,4))
)
To see whether or not the soil uranium level might be a good predictor of the home radon measurements, we can repeat the above boxplot, this time ordering the counties on the x-axis by per-county soil uranium level, descending.
Given the sparse data, the resulting plot is inconclusive.
uranium_desc = mn_counties.sort_values(by='log_uranium', ascending=False).reset_index()
(p9.ggplot(data=mn_radon, mapping=p9.aes(x='county',y='log_radon'))
+ p9.geom_boxplot(width=2, varwidth=True, outlier_alpha=0.4)
+ p9.scale_x_discrete(limits=uranium_desc['county'], expand=(0,1))
+ p9.ggtitle("Counties ordered by soil uranium high (left) to low (right)")
+ p9.ylab("range of home radon measurements")
+ xlabels_90
+ p9.theme(figure_size=(20,6))
)
Preliminary Data Analysis Findings
83% of the data are measurements taken on the basement level.
70% of the counties (60 out of 85) have observations from both floors 0 and 1, the remaining 30% only have observations from floor 0 (basement).
For most counties, there are fewer than 10 observations; 8 counties in metropolitan areas account for over half of the observations.
The counties with the highest soil uranium levels do not have a lot of observations per county.
Within each county, the range of radon measurements is very wide.
Starting from a simple model ensures that there is a good baseline against which to measure performance.
Two regular linear regression models
For this case study, the baseline model is a regular (non-multilevel) linear regression model, see Appendix A, where the outcome y is the home log radon level and the predictor x is the floor on which the measurement was taken. We consider two possible models: complete pooling and no-pooling.
The complete pooling model estimates a single intercept term for the regression.
$ \mathrm{log\_radon}_i = \alpha \, + {\beta}\,\mathrm{floor}_i + {\epsilon}_i $
The no pooling model estimates a per-county intercept term. The intercept term $\alpha$ is a vector of size $\mathrm{J}$, the number of counties.
$ \mathrm{log\_radon}_i = \alpha_{j[i]} \, + {\beta}\,\mathrm{floor}_i + {\epsilon}_i \ \ \ \ $ where $j = 1 \ldots 85$.
Gelman and Hill use the notation $\alpha_{j[i]}$ to denote the intercept for county $j[i]$, the county containing observation $i$, arguing that this notation better reflects the structure of the data.
Because the floors are coded $\mathrm{floor}_{\mathrm{basement}} = 0$, $\mathrm{floor}_{\mathrm{ground}} = 1$, for basement measurements, the observed outcome $\mathrm{log\_radon}$ is just the intercept term plus measurement error.
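Concretely, plugging the floor codes into the complete pooling model gives
$ \mathrm{floor}_i = 0\!: \ \mathrm{log\_radon}_i = \alpha + {\epsilon}_i, \ \ \ \ \mathrm{floor}_i = 1\!: \ \mathrm{log\_radon}_i = \alpha + \beta + {\epsilon}_i $
so $\alpha$ is the expected basement-level log radon and $\beta$ is the expected change when moving up to the ground floor.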
We can use plotnine's geom_smooth(method="lm") to visualize both the complete-pooling and no-pooling models. This adds a layer with the mean regression line and a grey band which indicates the uncertainty around it.
For the complete pooling model, we add this geom to the saved plot plot_radon_floor.
p1 = (plot_radon_floor
+ p9.geom_smooth(method='lm')
+ p9.ggtitle("regression log_radon on floor, all counties")
)
p1
For the no-pooling model, we facet the radon-by-floor plot. When combined with geom_smooth, every facet has its own regression model.
As noted above, most of the measurements are taken at the basement level, and there are 25 counties without any measurements taken on floor 1. For these counties, the geom_smooth trend line is absent from the faceted plot; i.e., we cannot estimate the slope of the regression line. For counties with measurements on both floors, in a few cases, e.g., Todd and Carlton counties, the regression line between floor 0 and 1 has a positive slope, which a) goes against the estimate from the complete pooling model, and b) goes against what we know about how radon enters the home.
(p9.ggplot(data=mn_radon, mapping=p9.aes('floor', 'log_radon'))
+ p9.geom_jitter(width=0.05)
+ p9.geom_smooth(method='lm')
+ p9.facet_wrap('county')
+ p9.ggtitle("per-county regression log_radon on floor, counties ordered by observations, ascending")
+ p9.scale_x_continuous(breaks=range(0,2), minor_breaks=[])
+ p9.scales.ylim(-3, 4) # same limits as complete pooling
+ p9.theme(figure_size=(18,20))
)
The complete-pooling model corresponds to the simplest linear regression model in the Stan User's Guide. This model is in file radon_cp.stan. It adds the following to the model in the Stan User's Guide:
All model parameters have weakly informative priors.
The generated quantities program block is used to generate a new sample y_rep, which is $\widetilde{y}$, data yet to be observed. By using the model parameter estimates as arguments to Stan's random number generator probability functions, we generate a new dataset which captures both the sampling variation and the parameter estimation uncertainty; see the User's Guide chapter on Posterior Predictive Sampling for details. In the next section we show how to use y_rep to test model correctness.
The complete-pooling model doesn't use county information. If there is no real difference between the radon levels observed across the different counties, this model will provide useful estimates.
data {
int<lower=1> N;
vector[N] x;
vector[N] y;
}
parameters {
real alpha;
real beta;
real<lower=0> sigma;
}
model {
y ~ normal(alpha + beta * x, sigma);
alpha ~ normal(0, 10);
beta ~ normal(0, 10);
sigma ~ normal(0, 10);
}
generated quantities {
array[N] real y_rep = normal_rng(alpha + beta * x, sigma);
}
The no-pooling model is in file radon_np.stan. It differs from the complete pooling model in two ways: the data block additionally declares J, the number of counties, and a vector of county ids for each observation, and alpha is coded as a vector of size J.
data {
int<lower=1> N; // observations
int<lower=1> J; // counties
array[N] int<lower=1, upper=J> county;
vector[N] x; // floor
vector[N] y; // radon
}
parameters {
vector[J] alpha;
real beta;
real<lower=0> sigma;
}
model {
y ~ normal(alpha[county] + beta * x, sigma);
alpha ~ normal(0, 10);
beta ~ normal(0, 10);
sigma ~ normal(0, 10);
}
generated quantities {
array[N] real y_rep = normal_rng(alpha[county] + beta * x, sigma);
}
Stan Programming Language: Multiple Indexes
The no-pooling model uses Stan's multiple indexing syntax to specify the likelihood of y and to compute y_rep:
y ~ normal(alpha[county] + beta * x, sigma);
alpha has size J.
county has size N.
The values of county are between 1 and J, inclusive, therefore county is a valid set of indexes into alpha.
alpha[county] is a vector of size N.
alpha[county] + beta * x is a vector of size N.
This satisfies the size constraints on the arguments to the vectorized normal distribution function.
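The same gather-style indexing can be mimicked in Python with NumPy fancy indexing; here is a minimal sketch with toy values (the names mirror the Stan program; the `- 1` converts Stan's 1-based county ids to Python's 0-based indexing):
import numpy as np
J, N = 3, 5
alpha = np.array([0.5, 1.0, 1.5])    # per-county intercepts, size J
county = np.array([1, 1, 2, 3, 3])   # 1-based county id per observation, size N
x = np.array([0., 1., 0., 0., 1.])   # floor indicator, size N
beta = -0.6
# like alpha[county] + beta * x in Stan: gather per-observation intercepts, size N
mu = alpha[county - 1] + beta * x
print(mu)  # [ 0.5 -0.1  1.   1.5  0.9]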
Best Practice: Use File Naming Conventions
In this case study, all Stan programs share the prefix radon_, followed by an abbreviation of the model type: radon_cp.stan (complete pooling), radon_np.stan (no pooling), and radon_pp_alpha.stan (partial pooling of the intercept alpha). Consistent file names make it easy to keep track of the set of candidate models.
We create CmdStanModel objects for both the complete-pooling and no-pooling models.
complete_pooling_model = CmdStanModel(stan_file=os.path.join('stan', 'radon_cp.stan'))
no_pooling_model = CmdStanModel(stan_file=os.path.join('stan', 'radon_np.stan'))
We assemble a Python dictionary which contains the definitions of the data block variables.
radon_data = {"N": len(mn_radon),
"x": mn_radon.floor.astype(float),
"y": mn_radon.log_radon,
"J":85,
"county" : mn_radon.county_id}
We call the model's sample method which runs Stan's NUTS-HMC sampler.
complete_pooling_fit = complete_pooling_model.sample(data=radon_data, show_progress=False)
16:19:05 - cmdstanpy - INFO - CmdStan start processing
16:19:05 - cmdstanpy - INFO - Chain [1] start processing
16:19:05 - cmdstanpy - INFO - Chain [2] start processing
16:19:05 - cmdstanpy - INFO - Chain [3] start processing
16:19:05 - cmdstanpy - INFO - Chain [4] start processing
16:19:06 - cmdstanpy - INFO - Chain [1] done processing
16:19:06 - cmdstanpy - INFO - Chain [2] done processing
16:19:06 - cmdstanpy - INFO - Chain [3] done processing
16:19:06 - cmdstanpy - INFO - Chain [4] done processing
16:19:06 - cmdstanpy - WARNING - Non-fatal error during sampling: Exception: normal_lpdf: Scale parameter is 0, but must be positive! (in '/Users/mitzi/github/stan-dev/example-models/jupyter/radon/stan/radon_cp.stan', line 12, column 2 to column 38)
Consider re-running with show_console=True if the above output is unclear!
no_pooling_fit = no_pooling_model.sample(data=radon_data, show_progress=False)
16:19:06 - cmdstanpy - INFO - CmdStan start processing
16:19:06 - cmdstanpy - INFO - Chain [1] start processing
16:19:06 - cmdstanpy - INFO - Chain [2] start processing
16:19:06 - cmdstanpy - INFO - Chain [3] start processing
16:19:06 - cmdstanpy - INFO - Chain [4] start processing
16:19:07 - cmdstanpy - INFO - Chain [2] done processing
16:19:07 - cmdstanpy - INFO - Chain [3] done processing
16:19:07 - cmdstanpy - INFO - Chain [1] done processing
16:19:07 - cmdstanpy - INFO - Chain [4] done processing
The sample method returns a CmdStanMCMC object which provides methods to summarize and diagnose the model fit, and accessor methods which allow the user to access the sample in whatever data format is needed for further analysis.
The sample can be treated as a collection of named, structured variables:
methods stan_variable and stan_variables return a numpy.ndarray and a Python dictionary of numpy.ndarray objects, respectively, whose structure corresponds to the Stan variable.
method draws_xr returns an xarray.Dataset of all Stan variables.
The sample can be extracted in tabular format:
method draws returns a numpy.ndarray over all columns in the output CSV file.
method draws_pd returns a pandas.DataFrame over all columns in the output CSV file. The argument vars can be used to restrict this to specified variables or columns.
Extracting model estimates as pandas.DataFrame
We use the draws_pd method to access the sample draws for variables alpha, beta, and sigma. The draws across all chains are flattened into a single dimension: in this example, the 3-D array of 4 chains of 1000 draws over 3 variables becomes a 2-D array of 4000 draws of 3 variables.
There are two reasons to use the draws_pd method here: the pandas describe method provides summary statistics directly, and plotnine works with pandas DataFrames.
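To see the flattening explicitly, the draws accessor exposes both layouts; a quick sketch (shapes assume the default 4 chains of 1000 sampling iterations; n_cols counts all CSV columns, including sampler diagnostics and y_rep):
# 3-D layout: (iterations, chains, columns)
print(complete_pooling_fit.draws().shape)                    # (1000, 4, n_cols)
# 2-D layout with chains concatenated: (iterations * chains, columns)
print(complete_pooling_fit.draws(concat_chains=True).shape)  # (4000, n_cols)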
pool_pd = complete_pooling_fit.draws_pd(vars=['alpha', 'beta', 'sigma'])
print(f'sample draws shape: {pool_pd.shape}')
pool_pd.head(3)
sample draws shape: (4000, 3)
| | alpha | beta | sigma |
|---|---|---|---|
| 0 | 1.32 | -0.58 | 0.80 |
| 1 | 1.35 | -0.61 | 0.84 |
| 2 | 1.32 | -0.68 | 0.81 |
pool_stats = pool_pd.describe()
pool_stats.round(2)
| | alpha | beta | sigma |
|---|---|---|---|
| count | 4000.00 | 4000.00 | 4000.00 |
| mean | 1.33 | -0.61 | 0.82 |
| std | 0.03 | 0.07 | 0.02 |
| min | 1.22 | -0.87 | 0.76 |
| 25% | 1.31 | -0.66 | 0.81 |
| 50% | 1.33 | -0.61 | 0.82 |
| 75% | 1.35 | -0.56 | 0.84 |
| max | 1.43 | -0.33 | 0.90 |
Another, more computationally expensive way to get these summary statistics is to call the CmdStanMCMC.summary method. This returns a pandas.DataFrame of summary statistics for total joint log probability lp__ and all model variables, plus diagnostic statistics on the effective sample size and R_hat, the potential scale reduction factor.
complete_pooling_fit.summary().round(2)[1:4]
| | Mean | MCSE | StdDev | 5% | 50% | 95% | N_Eff | N_Eff/s | R_hat |
|---|---|---|---|---|---|---|---|---|---|
| alpha | 1.33 | 0.0 | 0.03 | 1.28 | 1.33 | 1.38 | 2840.00 | 1750.93 | 1.0 |
| beta | -0.61 | 0.0 | 0.07 | -0.73 | -0.61 | -0.49 | 2874.22 | 1772.02 | 1.0 |
| sigma | 0.82 | 0.0 | 0.02 | 0.79 | 0.82 | 0.86 | 3778.41 | 2329.47 | 1.0 |
Extracting model estimates as structured variables
To see the difference in the estimates of the intercept term alpha for the complete-pooling (single intercept) and the no-pooling models, i.e. to compare the estimates for a single intercept term and a vector of per-county intercept terms, we use the accessor method stan_variable which returns these estimates as numpy.ndarray objects.
Like pandas, numpy provides statistics routines. We use these to plot the mean of the estimate of alpha from the complete-pooling model and the central 67% interval of the elements of vector alpha, the per-county estimates from the no-pooling model.
complete_pool_alpha_mean = complete_pooling_fit.stan_variable('alpha').mean()
no_pool_alpha = no_pooling_fit.stan_variable('alpha')
no_pool_alpha_mean = np.mean(no_pool_alpha, axis=0) # axis=0 uses all rows, i.e., per-column mean
no_pool_alpha_lower = np.quantile(no_pool_alpha, 0.16, axis=0)
no_pool_alpha_upper = np.quantile(no_pool_alpha, 0.84, axis=0)
no_pool_alpha_pd = pd.DataFrame(data={
"mean": no_pool_alpha_mean,
"upper": no_pool_alpha_upper,
"lower": no_pool_alpha_lower,
"county":mn_counties['county']
})
no_pool_alpha_pd.head(3)
| | mean | upper | lower | county |
|---|---|---|---|---|
| 0 | 0.85 | 1.24 | 0.47 | AITKIN |
| 1 | 0.88 | 0.98 | 0.77 | ANOKA |
| 2 | 1.51 | 1.94 | 1.09 | BECKER |
To check that the Stan model's estimates are in line with the geom_smooth(method='lm') results shown above, we create the same plot from the fitted model. We need to overlay the jittered plot of the data with a trend line showing the mean estimated log radon level for floors 0 and 1. To do this, we plug the estimates of alpha and beta into the equation $y = \alpha + \beta \, x$: at floor 0 the estimate is alpha, and at floor 1 it is alpha + beta. Then we add a trend line to the plot which connects this pair of (x, y) points. In plotnine there are two ways to do this: stat_function, which plots a function over a range of x values, and geom_line, which connects (x, y) points.
We use the stats.stat_function to draw the mean trend line and the geoms.geom_line function to plot a random sample of draws from the posterior.
f1 = pool_pd.alpha + pool_pd.beta # y coord at floor 1
f0 = pd.Series(pool_pd.alpha.values) # y coord at floor 0
#
sz = 100
ys = pd.concat([f0, f1], axis=1)
ys = ys.sample(sz).reset_index(drop=True)
# add sample regression lines to plot_radon_floor (from earlier section)
p2 = plot_radon_floor
for i in range(sz):
p2 = p2 + p9.geom_line(data=ys.T, mapping=p9.aes(x=[0,1], y=ys.loc[i]),
inherit_aes=False, color='grey', alpha=0.06)
# add central regression line
p2 = p2 + p9.stat_function(mapping=p9.aes(x=1),
fun=lambda x: pool_stats.alpha['mean'] + pool_stats.beta['mean']*x,
color='blue', size=1
)
p2
To compare the complete-pooling and no-pooling estimates of alpha, we use a geoms.geom_segment layer to visualize the central 67% interval of the per-county estimates of alpha. The x-axis shows the county labels, ordered by number of observations ascending, and the y-axis shows the values of alpha. We use a plotnine.geoms.geom_hline to add a horizontal line at the mean value of alpha from the complete-pooling model.
# get sort order
pop_asc = mn_counties.sort_values(by='homes').reset_index()
p_no_pool = (p9.ggplot(data=no_pool_alpha_pd)
# Range strip
+ p9.geom_segment(
mapping=p9.aes(x='county', xend='county', y='lower', yend='upper'),
size=1.4, color='darkblue', alpha=0.5,
)
+ p9.geom_point(mapping=p9.aes(x='county', y='mean'))
+ p9.geom_hline(yintercept=complete_pool_alpha_mean, color='darkorange', size=1.5)
+ p9.scale_x_discrete(limits=pop_asc['county'])
+ p9.ggtitle("No pooling model estimates for alpha (basement log_radon level)")
+ p9.xlab("observations per county") + p9.ylab("central 67% interval")
+ xlabels_90
+ p9.theme(figure_size=(20,6))
)
p_no_pool
This shows the problems with the complete-pooling and no-pooling models: the former ignores all variation across counties, while for the latter, counties with small numbers of observations have very wide interval estimates.
Posterior predictive checks are the unit tests of probabilistic programming
Ben Goodrich
Posterior predictive checks test how well the fitted model captures basic features of the data. In the generated quantities block we generate a new sample of "replicated data", by convention called y_rep, by using the estimated model parameters as arguments to Stan's PRNG distribution functions. If a model captures the data well, summary statistics such as the sample mean and standard deviation should have similar values in the original and replicated data sets. See the Stan User's Guide for further details.
For the no-pooling model, the likelihood statement is
y ~ normal(alpha[county] + beta * x, sigma);
The posterior predictive check is
array[N] real y_rep = normal_rng(alpha[county] + beta * x, sigma);
To see how this works, we again use the draws_pd method with argument vars='y_rep'.
y_rep_pp = no_pooling_fit.draws_pd(vars='y_rep')
y_rep_pp.shape
(4000, 919)
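Before drilling down to per-county statistics, a quick global check is to compare the observed mean and standard deviation of log_radon with their distributions across the 4000 replicated datasets; a minimal sketch (a Bayesian p-value near 0 or 1 would flag misfit for that statistic):
y_obs = mn_radon['log_radon'].to_numpy()
reps = y_rep_pp.to_numpy()   # (4000, 919): one replicated dataset per row
rep_means = reps.mean(axis=1)
rep_sds = reps.std(axis=1)
print(f'observed mean: {y_obs.mean():.2f}, '
      f'replicated means: {rep_means.mean():.2f} +/- {rep_means.std():.2f}')
print(f'observed sd: {y_obs.std():.2f}, '
      f'replicated sds: {rep_sds.mean():.2f} +/- {rep_sds.std():.2f}')
print(f'p(mean(y_rep) > mean(y)): {(rep_means > y_obs.mean()).mean():.2f}')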
Next, we estimate the per-county median value for each of the 85 counties.
# for each of the 85 counties, estimate the median y_rep
stat_median = []
for i in range(1,86):
idxs = mn_radon.index[mn_radon['county_id'] == i].tolist()
stat_median.append(np.median(y_rep_pp.iloc[:, idxs].to_numpy().flatten()))
We create a plot which combines a boxplot layer and a layer of per-county median point estimates. For counties with a large number of observations, the medians of y and y_rep are quite close.
np_ppc_median = (p9.ggplot()
+ p9.geom_boxplot(data=mn_radon,
mapping=p9.aes(x='county',y='log_radon'),
color='orange', fatten=3, alpha=0.7, outlier_alpha=0.3)
+ p9.geom_point(mapping=p9.aes(x=mn_counties.county, y=stat_median), color='purple')
+ p9.scale_x_discrete(limits=pop_asc['county'], expand=(0,1))
+ p9.ggtitle("No-pooling model, posterior predictive checks: median estimates for alpha")
+ p9.xlab("observations per county")
+ xlabels_90
+ p9.theme(figure_size=(16,6))
)
np_ppc_median
Another visualization is to plot the density of the actual data against a random sample of replicates. We plot 2% of the replicates: 80 out of 4000.
# get a random sample of the draws
sz = 80
y_rep = no_pooling_fit.draws_pd(vars='y_rep')
# each column is a replicate of the data, using estimates of alpha, beta
y_rep_sample = y_rep.sample(sz).reset_index(drop=True).T
# plot actual distribution of the data against predicted new data
np_ppc = p9.ggplot()
for i in range(sz):
np_ppc = np_ppc + p9.stat_density(mapping=p9.aes(x=y_rep_sample[i]), geom='line', color='lightblue', alpha=0.4)
np_ppc = (np_ppc
+ p9.stat_density(data=mn_radon, mapping=p9.aes(x='log_radon'), geom='line', color='darkblue', size=1.1)
+ p9.ggtitle("No-pooling model, posterior predictive checks: density of y, y_rep")
+ p9.xlab("log radon levels") + p9.ylab("density") + p9.scales.xlim(-3,6)
+ p9.theme(figure_size=(6,4))
)
np_ppc
The replicated densities are more diffuse than the data density: they are wider at the base and do not line up with its peak.
Multilevel regression models the dependency structures in the data in addition to the relation between outcome and predictors. If the data has a hierarchical structure, all levels of the hierarchy may be modeled and all of the data is used to estimate all parameters jointly.
For this dataset, houses are nested within counties. An ordinary regression can either model all counties as being identical, the complete pooling model, or all counties as being different, the no-pooling model. In the no-pooling model, the resulting estimates for counties with sparse data were too diffuse. A multilevel regression accounts for the observed variation across counties by modeling the counties as being drawn from a common distribution, whose parameters are estimated jointly by the model. This allows for partial pooling of information. When there is very little variation across counties, the multilevel model approaches the complete-pooling model; at the other extreme, when the amount of variation is very large, it approaches the no-pooling model.
A simple linear regression model with a single predictor estimates two parameters: the intercept and slope of the regression line. In a multilevel model, either or both of these parameters can be modeled. As the number of regression predictors increases, the modeling choices increase.
A first multilevel version of the radon model extends the no-pooling model by putting a distribution on the intercept term
$ \alpha_j \sim \mathrm{N}(\mu_\alpha,\, {\sigma_\alpha}^2),\ \ \ \ \mathrm{for}\ \ j\, = 1, \ldots, \mathrm{J} $
In combination with the likelihood, the partial-pooling model is
$ y_i = \alpha_{j[i]} \, + {\beta}\,x_i + {\epsilon}_i \\ \alpha_j \sim \mathrm{N}(\mu_\alpha,\, {\sigma_\alpha}^2),\ \ \ \ \mathrm{for}\ \ j\, = 1, \ldots, \mathrm{J} $
All parameters are estimated jointly by the model. This model provides partial pooling of information; it pulls the estimates of $\alpha_j$ towards the mean level $\mu_\alpha$, to a greater or lesser degree. Partial pooling is a soft constraint whose effect depends on the amount of group-level variance $\sigma_\alpha$. As the variance increases, the amount of pooling decreases, so that when $\sigma_\alpha \rightarrow \infty$ there is no pooling, and when $\sigma_\alpha \rightarrow 0$ there is complete pooling.
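Gelman and Hill (chapter 12) quantify the degree of pooling: ignoring the floor predictor for the moment, the multilevel estimate for county $j$ is approximately a precision-weighted average of the county sample mean and the state-wide mean,
$ \hat{\alpha}_j \approx \dfrac{\dfrac{n_j}{\sigma^2}\,\bar{y}_j + \dfrac{1}{\sigma_\alpha^2}\,\mu_\alpha}{\dfrac{n_j}{\sigma^2} + \dfrac{1}{\sigma_\alpha^2}} $
so counties with few observations (small $n_j$) are pulled strongly towards $\mu_\alpha$, while counties with many observations stay close to their own sample mean. The Stan program radon_pp_alpha.stan implements this model: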
data {
int<lower=1> N; // observations
int<lower=1> J; // counties
array[N] int<lower=1, upper=J> county;
vector[N] x;
vector[N] y;
}
parameters {
real mu_alpha;
real<lower=0> sigma_alpha;
vector<offset=mu_alpha, multiplier=sigma_alpha>[J] alpha; // non-centered parameterization
real beta;
real<lower=0> sigma;
}
model {
y ~ normal(alpha[county] + beta * x, sigma);
alpha ~ normal(mu_alpha, sigma_alpha); // partial-pooling
beta ~ normal(0, 10);
sigma ~ normal(0, 10);
mu_alpha ~ normal(0, 10);
sigma_alpha ~ normal(0, 10);
}
generated quantities {
array[N] real y_rep = normal_rng(alpha[county] + beta * x, sigma);
}
partial_pooling_alpha_model = CmdStanModel(stan_file=os.path.join('stan', 'radon_pp_alpha.stan'))
partial_pooling_alpha_fit = partial_pooling_alpha_model.sample(data=radon_data, show_progress=False)
partial_pooling_alpha_fit.summary().round(2).head(10)
16:19:28 - cmdstanpy - INFO - CmdStan start processing
16:19:28 - cmdstanpy - INFO - Chain [1] start processing
16:19:28 - cmdstanpy - INFO - Chain [2] start processing
16:19:28 - cmdstanpy - INFO - Chain [3] start processing
16:19:28 - cmdstanpy - INFO - Chain [4] start processing
16:19:29 - cmdstanpy - INFO - Chain [1] done processing
16:19:29 - cmdstanpy - INFO - Chain [2] done processing
16:19:29 - cmdstanpy - INFO - Chain [3] done processing
16:19:30 - cmdstanpy - INFO - Chain [4] done processing
16:19:30 - cmdstanpy - WARNING - Non-fatal error during sampling: Exception: offset_multiplier_constrain: multiplier is 0, but must be positive finite! (in '/Users/mitzi/github/stan-dev/example-models/jupyter/radon/stan/radon_pp_alpha.stan', line 11, column 2 to column 59)
Consider re-running with show_console=True if the above output is unclear!
| | Mean | MCSE | StdDev | 5% | 50% | 95% | N_Eff | N_Eff/s | R_hat |
|---|---|---|---|---|---|---|---|---|---|
| lp__ | -246.18 | 0.31 | 9.25 | -262.06 | -245.92 | -231.25 | 901.00 | 330.04 | 1.0 |
| mu_alpha | 1.46 | 0.00 | 0.05 | 1.38 | 1.46 | 1.55 | 2379.49 | 871.61 | 1.0 |
| sigma_alpha | 0.33 | 0.00 | 0.05 | 0.26 | 0.33 | 0.41 | 1500.24 | 549.54 | 1.0 |
| alpha[1] | 1.19 | 0.00 | 0.26 | 0.74 | 1.19 | 1.62 | 7863.33 | 2880.34 | 1.0 |
| alpha[2] | 0.93 | 0.00 | 0.10 | 0.77 | 0.93 | 1.09 | 6276.73 | 2299.17 | 1.0 |
| alpha[3] | 1.48 | 0.00 | 0.26 | 1.06 | 1.48 | 1.91 | 8023.51 | 2939.02 | 1.0 |
| alpha[4] | 1.51 | 0.00 | 0.22 | 1.14 | 1.50 | 1.87 | 8997.97 | 3295.96 | 1.0 |
| alpha[5] | 1.44 | 0.00 | 0.25 | 1.02 | 1.44 | 1.87 | 8071.05 | 2956.43 | 1.0 |
| alpha[6] | 1.48 | 0.00 | 0.26 | 1.05 | 1.48 | 1.92 | 8957.86 | 3281.27 | 1.0 |
| alpha[7] | 1.86 | 0.00 | 0.17 | 1.57 | 1.85 | 2.15 | 8237.52 | 3017.41 | 1.0 |
To visualize the results, we plot the central 67% interval of the estimates for alpha, as we did for the no-pooling model above. We use the stan_variable method to compute the mean and central 67% interval of the elements of vector alpha.
part_pool_mu_alpha = partial_pooling_alpha_fit.stan_variable('mu_alpha').mean()
part_pool_alpha = partial_pooling_alpha_fit.stan_variable('alpha')
part_pool_alpha_mean = np.mean(part_pool_alpha, axis=0)
part_pool_alpha_lower = np.quantile(part_pool_alpha, 0.16, axis=0)
part_pool_alpha_upper = np.quantile(part_pool_alpha, 0.84, axis=0)
part_pool_alpha_pd = pd.DataFrame(
data={
"mean": part_pool_alpha_mean,
"upper": part_pool_alpha_upper,
"lower": part_pool_alpha_lower,
"county":mn_counties['county']
}
)
We plot the per-county estimates of alpha just as we did above, and we keep the y-axis on the same scale as for the no-pooling model, which was roughly $(0, 3.5)$. This shows how the hierarchical model pools information among the intercept terms and helps shrink the variance of the estimates.
# visualize
p_partial_pool_intercept = (p9.ggplot(data=part_pool_alpha_pd)
# Range strip
+ p9.geom_segment(
mapping=p9.aes(x='county', xend='county', y='lower', yend='upper'),
size=1.7, color='blue', alpha=0.7,
)
+ p9.geom_point(mapping=p9.aes(x='county', y='mean'))
+ p9.geom_hline(yintercept=part_pool_mu_alpha, color='darkblue', size=1)
+ p9.scale_x_discrete(limits=pop_asc['county']) + p9.scales.ylim(0,3.5)
+ p9.ggtitle("multilevel varying intercept model estimates for alpha (basement log_radon level)")
+ p9.xlab("ordered by observations per county") + p9.ylab("central 67% interval")
+ xlabels_90
+ p9.theme(figure_size=(20,6))
)
p_partial_pool_intercept
To compare the no-pooling and partial-pooling model estimates of alpha directly, we overlay the above plot with the estimates from the no-pooling model in orange.
(p_partial_pool_intercept
+ p9.geom_segment(data=no_pool_alpha_pd,
mapping=p9.aes(x='county', xend='county', y='lower', yend='upper'),
size=1.4, color='orange', alpha=0.6,
)
+ p9.geom_hline(yintercept=complete_pool_alpha_mean, color='darkorange', size=1)
)
Another visualization is to plot the multilevel model estimates with the boxplots of the raw data.
(p_partial_pool_intercept
+ p9.geom_boxplot(data=mn_radon, mapping=p9.aes(x='county',y='log_radon'),
color='orange', alpha=0.4, outlier_alpha=0.3)
)
y_rep_pp = partial_pooling_alpha_fit.draws_pd(vars='y_rep')
# compute per-county medians, means
pp_stat_median = []
for i in range(1,86):
idxs = mn_radon.index[mn_radon['county_id'] == i].tolist()
pp_stat_median.append(np.median(y_rep_pp.iloc[:, idxs].to_numpy().flatten()))
# plot medians from sample against boxplot y
(p9.ggplot()
+ p9.geom_boxplot(data=mn_radon,
mapping=p9.aes(x='county',y='log_radon'),
color='orange', fatten=2, alpha=0.7, outlier_alpha=0.3)
+ p9.geom_point(mapping=p9.aes(x=mn_counties.county, y=pp_stat_median), color='purple')
+ p9.scale_x_discrete(limits=pop_asc['county'])
+ p9.ggtitle("Partial-pooling model, posterior predictive checks: median estimates for alpha")
+ xlabels_90
+ p9.theme(figure_size=(16,6))
)
There is no discernible difference between the results for this model and the no-pooling model; by this check, both models are adequately specified.
The PPC density plots are also similar.
# each column is a replicate of the data, using estimates of alpha, beta
y_rep_pp_sample = y_rep_pp.sample(sz).reset_index(drop=True).T
# plot actual distribution of the data against predicted new data
pp_ppc = p9.ggplot()
for i in range(sz):
pp_ppc = pp_ppc + p9.stat_density(mapping=p9.aes(x=y_rep_pp_sample[i]),
geom='line', color='lightblue', alpha=0.4)
pp_ppc = (pp_ppc
+ p9.stat_density(data=mn_radon, mapping=p9.aes(x='log_radon'),
geom='line', color='darkblue', size=1.1)
+ p9.ggtitle("Partial-pooling model, posterior predictive checks: density for y, y_rep")
+ p9.xlab("log radon levels") + p9.ylab("density") + p9.scales.xlim(-3, 6)
+ p9.theme(figure_size=(6,4))
)
pp_ppc
We can use the patchworklib package to display these plots side by side. For the partial pooling model, the densities of the replicates in light blue better approximate the density plot of the raw data.
# combine multiple plots
import patchworklib as pw # ignore warning about seaborn
from plotnine.data import *
g1 = pw.load_ggplot(pp_ppc)
g1.savefig(quick=True)
g2 = pw.load_ggplot(np_ppc)
g2.savefig(quick=True)
p12 = (g1 | g2)
p12.savefig(quick=True)
No module named 'seaborn'
In CmdStanPy, fitting a model to data is straightforward.
Instantiate a CmdStanModel object.
Assemble a data dictionary or JSON file which contains definitions for all data variables declared in the model's data block.
Run one of the available inference methods:
sample to do exact Bayesian estimation
variational to do approximate Bayesian estimation
optimize to do penalized maximum likelihood estimation
Extract the estimates for parameters and quantities of interest. CmdStanPy's accessor functions make it easy to get the outputs in whatever data format is appropriate for downstream analysis.
Fitting the model to the data is just one component of the analysis. The bulk of the code in this notebook is devoted to visualizing both the raw data and the model estimates; data visualization drives both the model specification process and the model testing process. With the plotnine package we can create multi-layer plots which overlay the raw data with the model estimates and predictions, or multi-layer plots which compare the behaviors of different models. These activities play a central role in developing trustworthy data analysis pipelines:
Plotting the raw or simulated data before modeling informs the model design process
Plotting prior and posterior predictive checks drives testing
Plotting model estimates and predictions drives documentation and dissemination of results.
In this notebook we work through the multilevel regression model from Gelman and Hill, chapter 12, restricting our analysis to just the data from the state of Minnesota. Even if we weren't writing this up as part of an introduction to multilevel modeling, that is, if we were tasked with this analysis qua analysis, we would proceed similarly, starting with a very simple model. There are many possible next steps: adding the county-level soil uranium measurement as a group-level predictor, allowing the slope on floor to vary by county as well, or extending the analysis beyond Minnesota to the full national dataset.
This notebook is based on Chris Fonnesbeck's excellent A Primer on Bayesian Multilevel Modeling using PyStan, which was developed as part of a Stan workshop for biomedical statisticians at Vanderbilt University.
Stan Tutorials YouTube Playlist Maggie Lieu - a series of introductory videos on Bayesian modeling with Stan
Visualization in Bayesian workflow Jonah Gabry, Daniel Simpson, Aki Vehtari, Michael Betancourt, and Andrew Gelman, 2019
Designing for interactive exploratory data analysis requires theories of graphical inference Jessica Hullman and Andrew Gelman, 2021
Data analysis using regression and multilevel/hierarchical models Andrew Gelman and Jennifer Hill, 2007
Statistical rethinking by Richard McElreath - an intro-stats/linear models course taught from a Bayesian perspective.
Making Plots With plotnine - plotnine tutorial notebook.
Papers by Price and Gelman on the radon data and multilevel models:
Centralized analysis of local data, with dollars and lives on the line: Lessons from the home radon experience. Phillip N. Price and Andrew Gelman, 2015
Analysis of local decisions using hierarchical modeling, applied to home radon measurement and remediation (with discussion) Phillip N. Price and Andrew Gelman, 1999
Linear regression models the relationship between a scalar response and one or more predictor (explanatory) variables.
A simple linear regression assumes the observed data (red) are the result of random deviations (green) from an underlying relationship (blue) between the dependent variable (y) and an independent variable (x). The model fits the data by finding the linear function (a non-vertical straight line) which minimizes the distance between $y_i$ and the value of the line at offset $x_i$.
$y_i = \alpha \, + \beta\,x_i + {\epsilon}_i$
In a simple linear regression, there is only a single predictor. In a multiple linear regression, there are many predictors, i.e., the model fits a vector of coefficients $\beta$ to the matrix of independent variables $X$.
Interpretation
In Chapter 3, Gelman and Hill write:
Linear regression is a method that summarizes how the average values of a numerical outcome variable vary over subpopulations defined by linear functions of predictors. ... Regression can be used to predict an outcome given a linear function of these predictors, and regression coefficients can be thought of as comparisons across predicted values or as comparisons among averages in the data.
Linear regression: two ways to write the model
The goal of inference is to learn from incomplete or imperfect data. In the simple linear regression model, the error term $\epsilon$ accounts for imperfect measurements of the data.
$ y_i = \alpha \, + {\beta}\,x_i + {\epsilon}_i $ where the errors ${\epsilon}_i$ have independent normal distributions with mean $0$ and standard deviation $\sigma$.
An equivalent representation is
$ y_i \sim \mathrm{N}(\alpha + \beta\,x_i,\, \sigma^2), \ \ \ \mathrm{for}\ i=1, \ldots, n $
The corresponding Stan statement is
y ~ normal(alpha + beta * x, sigma);
Stan provides vectorized versions of all univariate probability distributions. This statement is far more efficient than using a for loop over all $x_i$ and $y_i$ pairs.
Simple linear regression model in Stan
The simple linear regression with a single predictor and a slope and intercept coefficient and normally distributed noise is the first model discussed in the Stan User's Guide Regression Models chapter.
data {
int<lower=0> N;
vector[N] x;
vector[N] y;
}
parameters {
real alpha;
real beta;
real<lower=0> sigma;
}
model {
y ~ normal(alpha + beta * x, sigma);
}
This model is the minimal possible model. It consists of three named program blocks: the data and parameters are declared in the respectively named data and parameters blocks; the model block specifies the likelihood, i.e., the probability of the data given the model. Because no priors are specified on the model parameters, they are given the default prior distribution, which is uniform over each parameter's declared range, e.g. from $-\infty$ to $+\infty$ for the unconstrained parameters alpha and beta.
This appendix details the steps required to convert the CSV files from the Gelman and Hill ARM website into the data structures used in this analysis.
Pandas objects contain structured arrays which are labeled by Index objects. A Series object manages a 1-D array, and a DataFrame is a 2-D, size-mutable table which allows for heterogeneous columns. The index labels are used to perform database-like select, join, and group-by operations. However, sometimes we need to access just the data, not the index labels. Most data is backed by a NumPy ndarray, and the array property returns the underlying array of the Index or Series object.
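A minimal illustration of the distinction, with toy data (not part of the radon analysis):
s = pd.Series([1.2, 3.4, 5.6], index=['a', 'b', 'c'])
print(s['b'])         # label-based access via the Index
print(s.array)        # the backing array, without the index labels
print(s.to_numpy())   # the values as a plain numpy.ndarray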
Extract relevant columns from CSV as pandas DataFrame
We leverage the pandas.read_csv function to extract the information we need from the raw CSV files, with the following non-default arguments:
usecols allows us to extract just the relevant columns for this analysis.
skipinitialspace strips out initial whitespace from the data.
Once instantiated, we call the convert_dtypes method on the newly instantiated DataFrame so that we can do merge and join operations on all columns.
df_radon = pd.read_csv(os.path.join('data','raw_radon.csv'),
usecols=['state', 'stfips', 'floor', 'activity', 'county', 'cntyfips'],
skipinitialspace=True, # CSV file has spaces after delimiter, ignore them
).convert_dtypes()
print(f'Total records: {len(df_radon)}')
df_radon.head(3)
Total records: 12777
| | state | stfips | floor | activity | cntyfips | county |
|---|---|---|---|---|---|---|
| 0 | AZ | 4 | 1 | 0.3 | 1 | APACHE |
| 1 | AZ | 4 | 9 | 0.6 | 1 | APACHE |
| 2 | AZ | 4 | 1 | 0.5 | 1 | APACHE |
df_uranium = pd.read_csv(os.path.join('data','raw_uranium.csv'),
usecols=['stfips', 'ctfips', 'st', 'cty', 'Uppm'],
skipinitialspace=True,
).drop_duplicates().convert_dtypes()
df_uranium.head(3)
| | stfips | ctfips | st | cty | Uppm |
|---|---|---|---|---|---|
| 0 | 1 | 1 | AL | AUTAUGA | 1.78 |
| 1 | 1 | 3 | AL | BALDWIN | 1.38 |
| 2 | 1 | 5 | AL | BARBOUR | 2.1 |
Combine datasets
FIPS codes are numbers which uniquely identify geographic areas. Both datasets have codes for the state and county ids, but these need to be combined to get a national-level county FIPS code. In order to do a database-style join on the two tables, we construct a common fips key in each table, computed as the state FIPS code times 1000 plus the county FIPS code.
df_radon['fips'] = df_radon.stfips*1000 + df_radon.cntyfips
df_uranium['fips'] = df_uranium.stfips*1000 + df_uranium.ctfips
df_radon = df_radon.merge(df_uranium[['fips', 'Uppm']], on='fips')
df_radon.head(3)
| | state | stfips | floor | activity | cntyfips | county | fips | Uppm |
|---|---|---|---|---|---|---|---|---|
| 0 | AZ | 4 | 1 | 0.3 | 1 | APACHE | 4001 | 2.26 |
| 1 | AZ | 4 | 9 | 0.6 | 1 | APACHE | 4001 | 2.26 |
| 2 | AZ | 4 | 1 | 0.5 | 1 | APACHE | 4001 | 2.26 |
Put data on log scale
Following Gelman and Hill chapter 4, section 4, we work with data on the log scale, for two reasons: both quantities are strictly positive, and on the log scale the regression coefficients can be interpreted as multiplicative effects.
We know from geology that both radon measurements and soil uranium levels are always greater than zero; however, a few radon measurements in the EPA dataset are 0. In order to be able to work with these measurements on the log scale, we replace 0 with 0.1, which corresponds to a low radon level (following Gelman and Hill).
df_radon['radon'] = df_radon.activity.apply(lambda x: x if x > 0. else 0.1)
df_radon['log_radon'] = np.log(df_radon['radon'])
df_radon['uranium'] = df_radon.Uppm.apply(lambda x: x if x > 0. else 0.1)
df_radon['log_uranium'] = np.log(df_radon['uranium'])
df_radon.head(3)
| | state | stfips | floor | activity | cntyfips | county | fips | Uppm | radon | log_radon | uranium | log_uranium |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AZ | 4 | 1 | 0.3 | 1 | APACHE | 4001 | 2.26 | 0.3 | -1.20 | 2.26 | 0.82 |
| 1 | AZ | 4 | 9 | 0.6 | 1 | APACHE | 4001 | 2.26 | 0.6 | -0.51 | 2.26 | 0.82 |
| 2 | AZ | 4 | 1 | 0.5 | 1 | APACHE | 4001 | 2.26 | 0.5 | -0.69 | 2.26 | 0.82 |
Cleanup
Remove the columns which contain redundant information.
df_radon.drop(columns=['stfips', 'activity', 'cntyfips', 'Uppm', 'fips', 'radon', 'uranium'], inplace=True)
Restrict dataset to Minnesota
In order to work with just the data from Minnesota, we use a conditional expression to filter specific rows of the dataframe, combined with the operation reset_index(drop=True) so that the rows are indexed starting from 0.
mn_radon = df_radon[df_radon['state']=='MN'].reset_index(drop=True)
mn_radon.drop(columns=['state'], inplace=True)
mn_radon.head(3)
| | floor | county | log_radon | log_uranium |
|---|---|---|---|---|
| 0 | 1 | AITKIN | 0.79 | -0.69 |
| 1 | 0 | AITKIN | 0.79 | -0.69 |
| 2 | 0 | AITKIN | 1.06 | -0.69 |
Add 1-based index code for MN counties
The data inputs to a Stan model include a 1-based county index for each observation. In order to create this, we first convert the county names to a pandas categorical, whose categories are the sorted unique county names, and then add 1 to the category codes, because Stan indexes from 1.
mn_radon['county'] = (mn_radon['county'].astype('category', copy=False))
mn_radon['county_id'] = mn_radon.county.cat.codes + 1 ## Stan indexes from 1
mn_radon[:5]
| | floor | county | log_radon | log_uranium | county_id |
|---|---|---|---|---|---|
| 0 | 1 | AITKIN | 0.79 | -0.69 | 1 |
| 1 | 0 | AITKIN | 0.79 | -0.69 | 1 |
| 2 | 0 | AITKIN | 1.06 | -0.69 | 1 |
| 3 | 0 | AITKIN | 0.00 | -0.69 | 1 |
| 4 | 0 | ANOKA | 1.13 | -0.85 | 2 |
Create auxiliary dataset of per-county information
County-level information includes the number of observations taken in that county as well as the soil uranium level. In order to easily visualize this information using plotnine, we create a secondary pandas.DataFrame object with per-county level information.
The value_counts method returns a Series containing counts of unique values. We add these to the county-level dataframe.
mn_uranium = mn_radon.iloc[mn_radon.county.drop_duplicates().index].reset_index(drop=True)
mn_uranium['homes'] = mn_radon.value_counts(subset='county', sort=False).array
mn_uranium.drop(columns=['floor', 'log_radon'], inplace=True)
mn_uranium[:5]
| | county | log_uranium | county_id | homes |
|---|---|---|---|---|
| 0 | AITKIN | -0.69 | 1 | 4 |
| 1 | ANOKA | -0.85 | 2 | 52 |
| 2 | BECKER | -0.11 | 3 | 3 |
| 3 | BELTRAMI | -0.59 | 4 | 7 |
| 4 | BENTON | -0.14 | 5 | 4 |
Save as CSV files
These files are already part of this notebook, therefore calls to the pandas.to_csv method have been commented out.
# uncomment as needed
# mn_radon.to_csv(r'mn_radon.csv', index=False)
# mn_uranium.to_csv(r'mn_uranium.csv', index=False)