-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
/
Copy pathcensored_data.py
148 lines (122 loc) · 4.71 KB
/
censored_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import numpy as np
import matplotlib.pyplot as plt
import pymc3 as pm
import theano.tensor as tt
'''
Data can be left, right, or interval censored.
In this example we take interval censoring as it is the most general. In order
to deal with data censored only on one side one can simply remove one of the
sides.
Many modeling problems are of this nature, two common examples are:
- Survival analysis: at the end of a clinical trial to study the impact of a
new drug on lifespan, it is almost never possible to follow through with the
study until all subjects have died. At the end of the study, the only
information known for many subjects is that a subject was still alive.
- Sensor saturation: a sensor might have a limited dynamic range and the upper
and lower limits would simply be the lowest and highest values a sensor can
report. An 8-bit pixel value can only hold values from 0 to 255.
This example presents two different ways of dealing with censored data in
PyMC3:
- An imputed censored model, which represents censored data as parameters and
makes up plausible values for all censored values. This produces as a
byproduct a plausible set of made up values that would have been censored.
Each censored element introduces a random variable.
- An unimputed censored model, where the censored data are integrated out and
accounted for only through the log-likelihood. This method deals more
adequately with large amounts of censored data and converges more quickly.
To establish a baseline they compare to an uncensored model of uncensored data.
'''
# Helper functions
def normal_lcdf(mu, sigma, x):
z = (x - mu) / sigma
return tt.switch(
tt.lt(z, -1.0),
tt.log(tt.erfcx(-z / tt.sqrt(2.)) / 2.) - tt.sqr(z) / 2.,
tt.log1p(-tt.erfc(z / tt.sqrt(2.)) / 2.)
)
def normal_lccdf(mu, sigma, x):
z = (x - mu) / sigma
return tt.switch(
tt.gt(z, 1.0),
tt.log(tt.erfcx(z / tt.sqrt(2.)) / 2.) - tt.sqr(z) / 2.,
tt.log1p(-tt.erfc(-z / tt.sqrt(2.)) / 2.)
)
# Produce normally distributed samples
np.random.seed(123)
size = 500
sigma = 5.
mu = 13.
samples = np.random.normal(mu, sigma, size)
# Set censoring limits.
high = 16.
low = -1.
# Truncate samples
truncated = samples[(samples > low) & (samples < high)]
# Omniscient model
with pm.Model() as omniscient_model:
mu = pm.Normal('mu', mu=0., sd=(high - low) / 2.)
sigma = pm.HalfNormal('sigma', sd=(high - low) / 2.)
observed = pm.Normal('observed', mu=mu, sd=sigma, observed=samples)
# Imputed censored model
# Keep tabs on left/right censoring
n_right_censored = len(samples[samples >= high])
n_left_censored = len(samples[samples <= low])
n_observed = len(samples) - n_right_censored - n_left_censored
with pm.Model() as imputed_censored_model:
mu = pm.Normal('mu', mu=0., sd=(high - low) / 2.)
sigma = pm.HalfNormal('sigma', sd=(high - low) / 2.)
right_censored = pm.Bound(pm.Normal, lower=high)(
'right_censored', mu=mu, sd=sigma, shape=n_right_censored
)
left_censored = pm.Bound(pm.Normal, upper=low)(
'left_censored', mu=mu, sd=sigma, shape=n_left_censored
)
observed = pm.Normal(
'observed',
mu=mu,
sd=sigma,
observed=truncated,
shape=n_observed
)
# Unimputed censored model
def censored_right_likelihood(mu, sigma, n_right_censored, upper_bound):
return n_right_censored * normal_lccdf(mu, sigma, upper_bound)
def censored_left_likelihood(mu, sigma, n_left_censored, lower_bound):
return n_left_censored * normal_lcdf(mu, sigma, lower_bound)
with pm.Model() as unimputed_censored_model:
mu = pm.Normal('mu', mu=0., sd=(high - low) / 2.)
sigma = pm.HalfNormal('sigma', sd=(high - low) / 2.)
observed = pm.Normal(
'observed',
mu=mu,
sd=sigma,
observed=truncated,
)
right_censored = pm.Potential(
'right_censored',
censored_right_likelihood(mu, sigma, n_right_censored, high)
)
left_censored = pm.Potential(
'left_censored',
censored_left_likelihood(mu, sigma, n_left_censored, low)
)
def run(n=1500):
if n == 'short':
n = 50
print('Model with no censored data (omniscient)')
with omniscient_model:
trace = pm.sample(n)
pm.plot_posterior(trace[-1000:], varnames=['mu', 'sigma'])
plt.show()
print('Imputed censored model')
with imputed_censored_model:
trace = pm.sample(n)
pm.plot_posterior(trace[-1000:], varnames=['mu', 'sigma'])
plt.show()
print('Unimputed censored model')
with unimputed_censored_model:
trace = pm.sample(n)
pm.plot_posterior(trace[-1000:], varnames=['mu', 'sigma'])
plt.show()
if __name__ == '__main__':
run()