In [3]:

import pymc3 as pm
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

import theano as th
th.config.warn.round=False

%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option("display.width",110)
pd.set_option("display.max_columns",12)
np.set_printoptions(linewidth=110, edgeitems=6, suppress=True)

In [2]:

data = np.array([51.06, 55.12, 53.73, 50.24, 52.05, 56.40, 48.45, 52.34,
55.65, 51.49, 51.86, 63.43, 53.00, 56.09, 51.93, 52.31, 52.33, 57.48, 57.44,
55.14, 53.93, 54.62, 56.09, 68.58, 51.36, 55.47, 50.73, 51.94, 54.95, 50.39,
52.91, 51.5, 52.68, 47.72, 49.73, 51.82, 54.99, 52.84, 53.19, 54.52, 51.46,
53.73, 51.61, 49.81, 52.42, 54.3, 53.84, 53.16]).astype(np.float32)
data = data / 5
_ = sns.kdeplot(data)

/Users/graham/anaconda/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

General Experimenting¶

In [3]:

# general experimenting from PYMC3 API quickstart

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0, sd=50, testval=0.1)
    plus_2 = pm.Deterministic('mu plus 2', mu + 2)
    x = pm.Uniform('x', lower=0, upper=1)
    xx = pm.Normal('xx', mu=0, sd=1, shape=5)
    obs = pm.Normal('obs', mu=mu, sd=1, observed=data)

model.basic_RVs, model.observed_RVs, model.free_RVs, model.deterministics
model.named_vars
#model.logp({'mu': 0})
mu.random(size=3)
xx.random(size=3)

Out[3]:

([mu, x_interval__, xx, obs], [obs], [mu, x_interval__, xx], [mu plus 2, x])

Out[3]:

{'mu': mu,
 'mu plus 2': mu plus 2,
 'obs': obs,
 'x_interval__': x_interval__,
 'xx': xx}

Out[3]:

array([-57.16537171,   8.2865051 , -20.29619937])

Out[3]:

array([[-0.40803731, -1.26768239, -0.50392692, -0.55405493, -0.55611535],
       [-0.71291931,  1.02684142,  0.44666301, -0.3015636 , -0.14790859],
       [ 0.14269815,  1.14211246,  1.09377286,  0.10118101, -1.76370341]])

In [4]:

# live plotting of inference

%matplotlib notebook 

with model:
    trace = pm.sample(2000, njobs=4, n_init=80000, live_plot=True, refresh_every=500, roll_over=1000)
%matplotlib inline

trace['mu'].shape
trace.nchains
trace.get_values('mu', chains=1).shape

Auto-assigning NUTS sampler...
Initializing NUTS using ADVI...
Average Loss = 61.885:  44%|████▍     | 35248/80000 [00:03<00:04, 9548.72it/s]  
Convergence archived at 35700
Interrupted at 35,700 [44%]: Average Loss = 912.99
  0%|          | 0/2500 [00:00

100%|██████████| 2500/2500 [00:17<00:00, 146.56it/s]

Out[4]:

(8000,)

Out[4]:

(2000,)

In [5]:

_ = pm.traceplot(trace)
_ = plt.show()

_ = plt.figure(figsize=(15,3))
_ = pm.forestplot(trace)
_ = plt.show()

pm.gelman_rubin(trace)

_ = pm.plot_posterior(trace)

Out[5]:

{'mu': 0.99991282811644211,
 'mu plus 2': 0.99991287679554608,
 'x': 0.99996509758312413,
 'x_interval__': 1.0000341587255757,
 'xx': array([ 0.99978429,  0.99979776,  0.99983805,  0.99986452,  1.00025845], dtype=float32)}

In [12]:

with model:
    post_pred = pm.sample_ppc(trace, samples=1000)
    
post_pred['obs'].shape

ax = sns.distplot(post_pred['obs'], label='Posterior predictive mean')
_ = ax.axvline(data.mean(), color='r', ls='--', label='True mean')
_ = ax.legend()
_ = plt.show()

_ = sns.distplot(mu.random( size=10000));

100%|██████████| 1000/1000 [00:03<00:00, 250.62it/s]
/Users/graham/anaconda/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

Finding mean and standard deviation¶

In [7]:

%%time

np.random.seed(1)
with pm.Model() as m1:
    mu = pm.Uniform('mu', min(data), max(data))
    sigma = pm.HalfNormal('sigma', sd=10)
    y = pm.Normal('y', mu=mu, sd=sigma, observed=data) # a student T might be better here
    
    start = pm.find_MAP()
    step = pm.NUTS() # pm.Metropolis()
    trace_m1 = pm.sample(4000, step=step, start=start, njobs=4)
    
trace_m1b = trace_m1[500:]

Warning: Desired error not necessarily achieved due to precision loss.
         Current function value: 54.391045
         Iterations: 8
         Function evaluations: 99
         Gradient evaluations: 87

100%|██████████| 4500/4500 [00:06<00:00, 736.84it/s]

CPU times: user 10.2 s, sys: 1.02 s, total: 11.2 s
Wall time: 49.4 s

In [8]:

# show priors
_ = plt.figure(figsize=(20,7))

_ = plt.subplot(121)
_ = plt.title('mu')
_ = sns.distplot(mu.random(size=100000), label='prior')
_ = sns.distplot(trace_m1b.mu, label='posterior')
_ = plt.axvline(x=data.mean(), lw=1, color='r')
_ = plt.legend()

_ = plt.subplot(122)
_ = plt.title('sigma')
_ = sns.distplot(sigma.random(size=100000), label='prior')
_ = sns.distplot(trace_m1b.sigma, label='posterior')
_ = plt.axvline(x=data.std(), lw=1, color='r')
_ = plt.legend()
_ = plt.xlim(0,30)

#_ = plt.hist(sigma.random(size=100000), bins=100, normed=True, alpha=0.7, label='prior')
#_ = plt.hist(trace_m1b.sigma, bins=100, normed=True, alpha=0.7, label='posterior')
#_ = plt.hist(mu.random(size=100000), bins=100, normed=True, alpha=0.7, label='prior')
#_ = plt.hist(trace_m1b.mu, bins=100, normed=True, alpha=0.7, label='posterior')

/Users/graham/anaconda/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

In [10]:

# show all relevant plots

_ = pm.traceplot(trace_m1b, lines={'mu': data.mean(), 'sigma': data.std()})

_ = pm.plot_posterior(trace_m1b, varnames=['mu'], kde_plot=True, rope=[10.4,10.6], ref_val=11)
_ = pm.plot_posterior(trace_m1b, varnames=['sigma'], kde_plot=True, rope=[0.7, 0.8], ref_val=1)

_ = pm.autocorrplot(trace_m1b)

_ = plt.figure(figsize=(4,3))
_ = plt.plot(pm.geweke(trace_m1b['mu'])[:,1], 'o')
_ = plt.axhline(1, c='red'); _ = plt.axhline(-1, c='red')
_ = plt.gca().margins(0.05)

In [43]:

# show all relevant diagnostics
pm.diagnostics.gelman_rubin(trace_m1b)
pm.diagnostics.effective_n(trace_m1b)

pm.df_summary(trace_m1b)
pd.DataFrame(data).describe().T

Out[43]:

{'mu': 1.0000526506579144,
 'mu_interval_': 1.0000592552896899,
 'sigma': 1.0005980191434252,
 'sigma_log_': 1.0006139668936609}

Out[43]:

{'mu': 9253.0, 'mu_interval_': 9189.0, 'sigma': 6380.0, 'sigma_log_': 6459.0}

Out[43]:

	mean	sd	mc_error	hpd_2.5	hpd_97.5
mu	53.501238	0.496864	0.004870	52.527270	54.489587
sigma	3.541994	0.375928	0.004542	2.861996	4.312026

Out[43]:

	count	mean	std	min	25%	50%	75%	max
0	48.0	53.496458	3.456198	47.72	51.5825	52.875	54.96	68.58

In [44]:

# generate posterior samples
with m1:
    ppc_m1b = pm.sample_ppc(trace_m1b, samples = 2000)
    
samples = ppc_m1b['y'][:,0]

_ = sns.kdeplot(data)
_ = sns.kdeplot(samples)

100%|██████████| 2000/2000 [00:14<00:00, 141.76it/s]
/Users/graham/anaconda/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

In [ ]:

# demo of sampling from prior (omit the observed keyword)
with pm.Model() as prior_context:
    sigma = pm.Gamma('gamma', alpha=2.0, beta=1.0)
    mu = pm.Normal('mu', mu=0, sd=sigma)
    trace = pm.sample(5000, step=pm.Metropolis())
_ = pm.traceplot(trace)

Linear Regression¶

In [ ]:

ans = sns.load_dataset('anscombe') 
x_2 = ans[ans.dataset == 'II']['x'].values 
y_2 = ans[ans.dataset == 'II']['y'].values 
x_2 = x_2 -x_2.mean() 
y_2 = y_2 -y_2.mean() 
#plt.scatter(x_2, y_2) 
#plt.xlabel('$x$', fontsize=16) 
#plt.ylabel('$y$', fontsize=16, rotation=0)

In [ ]:

np.random.seed(1)
with pm.Model() as model_poly:
    alpha = pm.Normal('alpha', mu=0, sd=10)
    beta1 = pm.Normal('beta1', mu=0, sd=1)
    beta2 = pm.Normal('beta2', mu=0, sd=1)
    epsilon = pm.Uniform('epsilon', lower=0, upper=10)
    mu = alpha + beta1 * x_2 + beta2 * x_2**2
    y_pred = pm.Normal('y_pred', mu=mu, sd=epsilon, observed=y_2)
    start = pm.find_MAP()
    step = pm.NUTS(scaling=start)
    trace_poly = pm.sample(3000, step=step, start=start)

In [ ]:

_ = pm.traceplot(trace_poly)
_ = pm.plot_posterior(trace_poly, kde_plot=True)

In [ ]:

x_p = np.linspace(-6, 6)
y_p = trace_poly['alpha'].mean() + trace_poly['beta1'].mean() * x_p + trace_poly['beta2'].mean() * x_p**2
_ = plt.scatter(x_2, y_2)
_ = plt.xlabel('$x$', fontsize=16)
_ = plt.ylabel('$y$', fontsize=16, rotation=0)
_ = plt.plot(x_p, y_p, c='r')

German Tank Problem with PYMC3¶

In [4]:

tanks = np.array([10, 256, 202, 97]).astype(np.int32)
tanks

Out[4]:

array([ 10, 256, 202,  97], dtype=int32)

In [5]:

m = tanks.mean()
s = tanks.std()
m,s

m+2*s

Out[5]:

(141.25, 94.924641163398661)

Out[5]:

331.09928232679732

In [6]:

# statistically optimal formula
max(tanks) + max(tanks) / len(tanks) - 1

Out[6]:

319.0

In [7]:

import scipy.stats
stats.norm.ppf(0.96943389677160008)
stats.norm.cdf(1.8725380240735363)

Out[7]:

1.8725380240735361

Out[7]:

0.96943389677160008

In [24]:

#P(N∣tanks)∝ P(tanks∣N) * P(N)

with pm.Model() as tank_m:
    N = pm.DiscreteUniform("N", lower=tanks.max(), upper=10000) # define prior
    tank_obs = pm.DiscreteUniform("tank_r", lower=0, upper=N, observed=tanks) # define likelihood
    tank_r = pm.sample(10000, njobs=4)
    tank_r = tank_r[2000:]

Assigned Metropolis to N
INFO:pymc3:Assigned Metropolis to N
100%|██████████| 10500/10500 [00:01<00:00, 6708.26it/s]

In [25]:

_ = pm.traceplot(tank_r)
_ = pm.plot_posterior(tank_r, kde_plot=False)
pm.summary(tank_r)

N:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  389.196          200.428          5.123            [256.000, 718.000]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  258.000        285.000        327.000        415.000        905.000

German Tank Problem with Pystan¶

In [18]:

import pystan

ocode = """
data {
    int M;  // number of serial numbers observed
    real tanks[M];       // serial numbers
}
parameters {
    real N;
}
model {
    N ~ uniform(max(tanks), 10000);  // P(N)
    tanks ~ uniform(0, N);           // P(D|N)
}
"""

data = {'tanks': tanks, 'M': len(tanks)}
fit = pystan.stan(model_code=ocode, data=data, init='init_r', iter=100000, warmup=1000, chains=4)
fit

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_17dd69bef31927148438092177f52297 NOW.

Out[18]:

Inference for Stan model: anon_model_17dd69bef31927148438092177f52297.
4 chains, each with iter=100000; warmup=1000; thin=1; 
post-warmup draws per chain=99000, total post-warmup draws=396000.

       mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
N    384.06    0.39 211.49 258.18 281.73 322.49 406.33 877.64 287637    1.0
lp__ -19.47  2.2e-3   0.81 -21.77 -19.67 -19.16 -18.95 -18.89 137715    1.0

Samples were drawn using NUTS at Wed May 31 11:37:26 2017.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).

In [19]:

fig = fit.plot()
fig.set_size_inches((12, 2))

plt.tight_layout()
plt.show()

fit.extract()['N'].mean()

Out[19]:

384.06184373628361

Estimate Upper Limit of array of random numbers¶

In [20]:

# generate N random numbers

max_num = 246
num_range = np.arange(max_num) + 1

sample_np = np.random.choice(num_range, int(max_num * 0.05) ) # take a 5% sample from overall
sample_np

sample = th.shared(sample_np) # so can use same model with different data

Out[20]:

array([158, 236, 141,  25,  48, 189,  26,  75, 179,  70, 141,  65])

In [21]:

with pm.Model() as rand_m:
    N = pm.DiscreteUniform("N", lower=sample.max(), upper=max_num*5) # define prior
    tank_obs = pm.DiscreteUniform("tank_r", lower=0, upper=N, observed=sample) # define likelihood

In [22]:

with rand_m:
    start = {'N': max_num*2}
    rand_r = pm.sample(50000, start=start, njobs=4 )

Assigned Metropolis to N
INFO:pymc3:Assigned Metropolis to N
100%|██████████| 50500/50500 [00:10<00:00, 4800.98it/s]

In [26]:

rand_r = rand_r[5000:] 
_ = pm.traceplot(rand_r)
_ = pm.plot_posterior(rand_r, kde_plot=False,point_estimate='median')
pm.summary(rand_r)

N:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  258.941          25.416           0.267            [236.000, 309.000]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  236.000        242.000        251.000        267.000        328.000

In [ ]:

GCHESTER.COM

Experimenting with Bayesian Inference via PYMC3

General Experimenting¶

Finding mean and standard deviation¶

Linear Regression¶

German Tank Problem with PYMC3¶

German Tank Problem with Pystan¶

Estimate Upper Limit of array of random numbers¶

Comments