Note: the original gist can be found at https://gist.github.com/5250dd2f17daf60cbe582ceeeb2fd12f

mixture.ipynb
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("default")
plt.rcParams["savefig.dpi"] = 100
plt.rcParams["figure.dpi"] = 100
plt.rcParams["font.size"] = 16
plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.sans-serif"] = ["Liberation Sans"]
plt.rcParams["font.cursive"] = ["Liberation Sans"]
plt.rcParams["mathtext.fontset"] = "custom"
import numpy as np
import pandas as pd
import pymc3 as pm
import theano.tensor as tt
# cut & pasted directly from the fetch_hogg2010test() function
# identical to the original dataset as hardcoded in the Hogg 2010 paper
dfhogg = pd.DataFrame(
np.array([[1, 201, 592, 61, 9, -0.84],
[2, 244, 401, 25, 4, 0.31],
[3, 47, 583, 38, 11, 0.64],
[4, 287, 402, 15, 7, -0.27],
[5, 203, 495, 21, 5, -0.33],
[6, 58, 173, 15, 9, 0.67],
[7, 210, 479, 27, 4, -0.02],
[8, 202, 504, 14, 4, -0.05],
[9, 198, 510, 30, 11, -0.84],
[10, 158, 416, 16, 7, -0.69],
[11, 165, 393, 14, 5, 0.30],
[12, 201, 442, 25, 5, -0.46],
[13, 157, 317, 52, 5, -0.03],
[14, 131, 311, 16, 6, 0.50],
[15, 166, 400, 34, 6, 0.73],
[16, 160, 337, 31, 5, -0.52],
[17, 186, 423, 42, 9, 0.90],
[18, 125, 334, 26, 8, 0.40],
[19, 218, 533, 16, 6, -0.78],
[20, 146, 344, 22, 5, -0.56]]),
columns=['id','x','y','sigma_y','sigma_x','rho_xy'])
dfhogg['id'] = dfhogg['id'].apply(lambda x: 'p{}'.format(int(x)))
dfhogg.set_index('id', inplace=True)
dfhogg.head()
| id | x     | y     | sigma_y | sigma_x | rho_xy |
|----|-------|-------|---------|---------|--------|
| p1 | 201.0 | 592.0 | 61.0    | 9.0     | -0.84  |
| p2 | 244.0 | 401.0 | 25.0    | 4.0     | 0.31   |
| p3 | 47.0  | 583.0 | 38.0    | 11.0    | 0.64   |
| p4 | 287.0 | 402.0 | 15.0    | 7.0     | -0.27  |
| p5 | 203.0 | 495.0 | 21.0    | 5.0     | -0.33  |
# Standardize x and y: subtract the mean and divide by *two* standard
# deviations, then scale the measurement errors by the same factor
dfhoggs = ((dfhogg[['x', 'y']] - dfhogg[['x', 'y']].mean(0)) /
           (2 * dfhogg[['x', 'y']].std(0)))
dfhoggs['sigma_x'] = dfhogg['sigma_x'] / (2 * dfhogg['x'].std())
dfhoggs['sigma_y'] = dfhogg['sigma_y'] / (2 * dfhogg['y'].std())
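Dividing by two standard deviations (rather than one) follows the scaling convention suggested by Gelman (2008), so the standardized inputs end up with a standard deviation of 0.5. A quick sanity check (a minimal sketch, not in the original notebook):

# Sanity check: the standardized columns should have mean ~0 and sd ~0.5,
# and the scaled measurement errors are now in the same units as x and y
print(dfhoggs[['x', 'y']].mean(0).round(6))  # ~0
print(dfhoggs[['x', 'y']].std(0).round(6))   # ~0.5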
with pm.Model() as better_hogg:
    tsv_x = pm.Data('tsv_x', dfhoggs['x'])
    tsv_y = pm.Data('tsv_y', dfhoggs['y'])
    tsv_sigma_y = pm.Data('tsv_sigma_y', dfhoggs['sigma_y'])

    # Linear model for the inliers
    b0 = pm.Normal('b0_intercept', mu=0, sigma=10, testval=pm.floatX(0.1))
    b1 = pm.Normal('b1_slope', mu=0, sigma=10, testval=pm.floatX(1.))
    y_est_in = b0 + b1 * tsv_x

    # Outliers are drawn from a single wide Gaussian
    y_est_out = pm.Normal('y_est_out', mu=0, sigma=10, testval=pm.floatX(1.))
    sigma_y_out = pm.HalfNormal('sigma_y_out', sigma=1, testval=pm.floatX(1.))

    # Inlier/outlier likelihoods
    inlier_logl = pm.Normal.dist(mu=y_est_in,
                                 sigma=tsv_sigma_y).logp(tsv_y)
    outlier_logl = pm.Normal.dist(mu=y_est_out,
                                  sigma=tsv_sigma_y + sigma_y_out).logp(tsv_y)

    # This fraction is really the *prior* probability of having an outlier
    frac_outliers = pm.Uniform('frac_outliers', lower=0.0, upper=0.5)

    # Apply this prior to compute the joint log-probabilities
    inlier_logp = tt.log(1 - frac_outliers) + inlier_logl
    outlier_logp = tt.log(frac_outliers) + outlier_logl

    # Marginalize over the per-point inlier/outlier indicator
    logp_marg = pm.math.logaddexp(inlier_logp, outlier_logp)
    pm.Potential('obs', logp_marg)

    # Track the (log) posterior probability that each point is an outlier
    pm.Deterministic("logp_outlier", outlier_logp - logp_marg)
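To see why the logaddexp call marginalizes out the per-point indicator, note that log[(1 - f) * L_in + f * L_out] = logaddexp(log(1 - f) + log L_in, log f + log L_out), but the log-space form avoids underflow. A minimal numeric sketch with hypothetical values, independent of the model above:

import numpy as np

# Hypothetical values for one data point: the prior outlier fraction and
# the two component log-likelihoods
frac, logl_in, logl_out = 0.2, -1.0, -4.0

# Direct (numerically fragile) mixture likelihood...
direct = np.log((1 - frac) * np.exp(logl_in) + frac * np.exp(logl_out))
# ...and the log-space version used in the model
stable = np.logaddexp(np.log(1 - frac) + logl_in, np.log(frac) + logl_out)

assert np.allclose(direct, stable)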
with better_hogg:
    trc_better = pm.sample(tune=3000, draws=3000, target_accept=0.9)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [frac_outliers, sigma_y_out, y_est_out, b1_slope, b0_intercept]
Sampling 2 chains: 100%|██████████| 12000/12000 [00:09<00:00, 1219.40draws/s]
import arviz as az
rvs = ['b0_intercept', 'b1_slope', 'y_est_out', 'sigma_y_out', 'frac_outliers']
_ = az.plot_trace(trc_better, var_names=rvs, combined=False)
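As a quick convergence check (a sketch; this cell is not in the original gist), ArviZ can tabulate the posterior means alongside R-hat and effective sample size diagnostics:

# Posterior summary with convergence diagnostics for the tracked parameters
az.summary(trc_better, var_names=rvs)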
plt.errorbar(dfhoggs['x'], dfhoggs['y'], yerr=dfhoggs['sigma_y'], fmt=",k", zorder=-1)
p_outlier = np.exp(np.median(trc_better["logp_outlier"], axis=0))
plt.scatter(dfhoggs['x'], dfhoggs['y'], c=p_outlier)
plt.colorbar(label="outlier probability")
<matplotlib.colorbar.Colorbar at 0x11910f810>