Note: the original gist can be found at https://gist.github.com/5250dd2f17daf60cbe582ceeeb2fd12f

mixture.ipynb
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("default")
plt.rcParams["savefig.dpi"] = 100
plt.rcParams["figure.dpi"] = 100
plt.rcParams["font.size"] = 16
plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.sans-serif"] = ["Liberation Sans"]
plt.rcParams["font.cursive"] = ["Liberation Sans"]
plt.rcParams["mathtext.fontset"] = "custom"
import numpy as np
import pandas as pd
import pymc3 as pm
import theano.tensor as tt
# cut & pasted directly from the fetch_hogg2010test() function
# identical to the original dataset as hardcoded in the Hogg 2010 paper
dfhogg = pd.DataFrame(
np.array([[1, 201, 592, 61, 9, -0.84],
[2, 244, 401, 25, 4, 0.31],
[3, 47, 583, 38, 11, 0.64],
[4, 287, 402, 15, 7, -0.27],
[5, 203, 495, 21, 5, -0.33],
[6, 58, 173, 15, 9, 0.67],
[7, 210, 479, 27, 4, -0.02],
[8, 202, 504, 14, 4, -0.05],
[9, 198, 510, 30, 11, -0.84],
[10, 158, 416, 16, 7, -0.69],
[11, 165, 393, 14, 5, 0.30],
[12, 201, 442, 25, 5, -0.46],
[13, 157, 317, 52, 5, -0.03],
[14, 131, 311, 16, 6, 0.50],
[15, 166, 400, 34, 6, 0.73],
[16, 160, 337, 31, 5, -0.52],
[17, 186, 423, 42, 9, 0.90],
[18, 125, 334, 26, 8, 0.40],
[19, 218, 533, 16, 6, -0.78],
[20, 146, 344, 22, 5, -0.56]]),
columns=['id','x','y','sigma_y','sigma_x','rho_xy'])
dfhogg['id'] = dfhogg['id'].apply(lambda x: 'p{}'.format(int(x)))
dfhogg.set_index('id', inplace=True)
dfhogg.head()
| id | x     | y     | sigma_y | sigma_x | rho_xy |
|----|-------|-------|---------|---------|--------|
| p1 | 201.0 | 592.0 | 61.0    | 9.0     | -0.84  |
| p2 | 244.0 | 401.0 | 25.0    | 4.0     | 0.31   |
| p3 | 47.0  | 583.0 | 38.0    | 11.0    | 0.64   |
| p4 | 287.0 | 402.0 | 15.0    | 7.0     | -0.27  |
| p5 | 203.0 | 495.0 | 21.0    | 5.0     | -0.33  |
# Standardize x and y: subtract the mean and divide by *two* standard
# deviations, then scale the measurement errors by the same factor
dfhoggs = ((dfhogg[['x', 'y']] - dfhogg[['x', 'y']].mean(0)) /
           (2 * dfhogg[['x', 'y']].std(0)))
dfhoggs['sigma_x'] = dfhogg['sigma_x'] / (2 * dfhogg['x'].std())
dfhoggs['sigma_y'] = dfhogg['sigma_y'] / (2 * dfhogg['y'].std())
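Dividing by two standard deviations (rather than one) follows the scaling convention suggested by Gelman (2008), so the standardized inputs end up with a standard deviation of 0.5. A quick sanity check (a minimal sketch, not in the original notebook):

# Sanity check: the standardized columns should have mean ~0 and sd ~0.5,
# and the scaled measurement errors are now in the same units as x and y
print(dfhoggs[['x', 'y']].mean(0).round(6))  # ~0
print(dfhoggs[['x', 'y']].std(0).round(6))   # ~0.5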
with pm.Model() as better_hogg:
    tsv_x = pm.Data('tsv_x', dfhoggs['x'])
    tsv_y = pm.Data('tsv_y', dfhoggs['y'])
    tsv_sigma_y = pm.Data('tsv_sigma_y', dfhoggs['sigma_y'])

    # Linear model for the inliers
    b0 = pm.Normal('b0_intercept', mu=0, sigma=10, testval=pm.floatX(0.1))
    b1 = pm.Normal('b1_slope', mu=0, sigma=10, testval=pm.floatX(1.))
    y_est_in = b0 + b1 * tsv_x

    # Outliers are drawn from a single wide Gaussian
    y_est_out = pm.Normal('y_est_out', mu=0, sigma=10, testval=pm.floatX(1.))
    sigma_y_out = pm.HalfNormal('sigma_y_out', sigma=1, testval=pm.floatX(1.))

    # Inlier/outlier likelihoods
    inlier_logl = pm.Normal.dist(mu=y_est_in,
                                 sigma=tsv_sigma_y).logp(tsv_y)
    outlier_logl = pm.Normal.dist(mu=y_est_out,
                                  sigma=tsv_sigma_y + sigma_y_out).logp(tsv_y)

    # This fraction is really the *prior* probability of having an outlier
    frac_outliers = pm.Uniform('frac_outliers', lower=0.0, upper=0.5)

    # Apply this prior to compute the joint log-probabilities
    inlier_logp = tt.log(1 - frac_outliers) + inlier_logl
    outlier_logp = tt.log(frac_outliers) + outlier_logl

    # Marginalize over the per-point inlier/outlier indicator
    logp_marg = pm.math.logaddexp(inlier_logp, outlier_logp)
    pm.Potential('obs', logp_marg)

    # Track the (log) posterior probability that each point is an outlier
    pm.Deterministic("logp_outlier", outlier_logp - logp_marg)
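To see why the logaddexp call marginalizes out the per-point indicator, note that log[(1 - f) * L_in + f * L_out] = logaddexp(log(1 - f) + log L_in, log f + log L_out), but the log-space form avoids underflow. A minimal numeric sketch with hypothetical values, independent of the model above:

import numpy as np

# Hypothetical values for one data point: the prior outlier fraction and
# the two component log-likelihoods
frac, logl_in, logl_out = 0.2, -1.0, -4.0

# Direct (numerically fragile) mixture likelihood...
direct = np.log((1 - frac) * np.exp(logl_in) + frac * np.exp(logl_out))
# ...and the log-space version used in the model
stable = np.logaddexp(np.log(1 - frac) + logl_in, np.log(frac) + logl_out)

assert np.allclose(direct, stable)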
with better_hogg:
    trc_better = pm.sample(tune=3000, draws=3000, target_accept=0.9)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [frac_outliers, sigma_y_out, y_est_out, b1_slope, b0_intercept]
Sampling 2 chains: 100%|██████████| 12000/12000 [00:09<00:00, 1219.40draws/s]
import arviz as az
rvs = ['b0_intercept', 'b1_slope', 'y_est_out', 'sigma_y_out', 'frac_outliers']
_ = az.plot_trace(trc_better, var_names=rvs, combined=False)
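As a quick convergence check (a sketch; this cell is not in the original gist), ArviZ can tabulate the posterior means alongside R-hat and effective sample size diagnostics:

# Posterior summary with convergence diagnostics for the tracked parameters
az.summary(trc_better, var_names=rvs)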
plt.errorbar(dfhoggs['x'], dfhoggs['y'], yerr=dfhoggs['sigma_y'], fmt=",k", zorder=-1)
p_outlier = np.exp(np.median(trc_better["logp_outlier"], axis=0))
plt.scatter(dfhoggs['x'], dfhoggs['y'], c=p_outlier)
plt.colorbar(label="outlier probability")
<matplotlib.colorbar.Colorbar at 0x11910f810>