From 05462f2feb15381bff0427b72a9aa3fc11532cfc Mon Sep 17 00:00:00 2001 From: hottwaj Date: Wed, 26 Feb 2020 10:25:20 +0000 Subject: [PATCH 01/15] Initial changes to allow pymc3.Data() to support both int and float input data (previously all input data was coerced to float) WIP for #3813 --- pymc3/data.py | 14 ++++++++++++-- pymc3/model.py | 8 +++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/pymc3/data.py b/pymc3/data.py index c638478b08..e39809123e 100644 --- a/pymc3/data.py +++ b/pymc3/data.py @@ -478,10 +478,20 @@ class Data: For more information, take a look at this example notebook https://docs.pymc.io/notebooks/data_container.html """ - def __new__(self, name, value): + def __new__(self, name, value, dtype = None): + if dtype is None: + if hasattr(value, 'dtype'): + # if no dtype given, but available as attr of value, use that as dtype + dtype = value.dtype + elif isinstance(value, int): + dtype = int + else: + # otherwise, assume float + dtype = float + # `pm.model.pandas_to_array` takes care of parameter `value` and # transforms it to something digestible for pymc3 - shared_object = theano.shared(pm.model.pandas_to_array(value), name) + shared_object = theano.shared(pm.model.pandas_to_array(value, dtype = dtype), name) # To draw the node for this variable in the graphviz Digraph we need # its shape. diff --git a/pymc3/model.py b/pymc3/model.py index 3de6e4f380..a4616fd9f3 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -1473,7 +1473,7 @@ def init_value(self): return self.tag.test_value -def pandas_to_array(data): +def pandas_to_array(data, dtype = float): if hasattr(data, 'values'): # pandas if data.isnull().any().any(): # missing values ret = np.ma.MaskedArray(data.values, data.isnull().values) @@ -1492,8 +1492,10 @@ def pandas_to_array(data): ret = generator(data) else: ret = np.asarray(data) - return pm.floatX(ret) - + if dtype in [float, np.float32, np.float64]: + return pm.floatX(ret) + elif dtype in [int, np.int32, np.int64]: + return pm.intX(ret) def as_tensor(data, name, model, distribution): dtype = distribution.dtype From 0041e9f36abb0223d33366ac82f34b559d229c29 Mon Sep 17 00:00:00 2001 From: hottwaj Date: Wed, 26 Feb 2020 10:34:47 +0000 Subject: [PATCH 02/15] added exception for invalid dtype input to pandas_to_array --- pymc3/model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pymc3/model.py b/pymc3/model.py index a4616fd9f3..62123267da 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -1496,6 +1496,8 @@ def pandas_to_array(data, dtype = float): return pm.floatX(ret) elif dtype in [int, np.int32, np.int64]: return pm.intX(ret) + else: + raise ValueError('Unsupported type for pandas_to_array: %s' % str(dtype)) def as_tensor(data, name, model, distribution): dtype = distribution.dtype From 38536aaf2d993ee21a020b7d64801be90d9645a8 Mon Sep 17 00:00:00 2001 From: AlexAndorra Date: Sun, 17 May 2020 11:19:43 +0200 Subject: [PATCH 03/15] Refined implementation --- pymc3/data.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/pymc3/data.py b/pymc3/data.py index a80d62ec4d..fa569dd332 100644 --- a/pymc3/data.py +++ b/pymc3/data.py @@ -479,10 +479,10 @@ class Data: https://docs.pymc.io/notebooks/data_container.html """ - def __new__(self, name, value, dtype = None): - if dtype is None: - if hasattr(value, 'dtype'): - # if no dtype given, but available as attr of value, use that as dtype + def __new__(self, name, value, dtype=None): + if not dtype: + if hasattr(value, "dtype"): + # if no dtype given but available as attr of value, use that as dtype dtype = value.dtype elif isinstance(value, int): dtype = int @@ -490,33 +490,24 @@ def __new__(self, name, value, dtype = None): # otherwise, assume float dtype = float - # `pm.model.pandas_to_array` takes care of parameter `value` and - # transforms it to something digestible for pymc3 - shared_object = theano.shared(pm.model.pandas_to_array(value, dtype = dtype), name) - - # To draw the node for this variable in the graphviz Digraph we need - # its shape. - shared_object.dshape = tuple(shared_object.shape.eval()) - # Add data container to the named variables of the model. try: model = pm.Model.get_context() except TypeError: - raise TypeError("No model on context stack, which is needed to " - "instantiate a data container. Add variable " - "inside a 'with model:' block.") - + raise TypeError( + "No model on context stack, which is needed to instantiate a data container. " + "Add variable inside a 'with model:' block." + ) name = model.name_for(name) # `pm.model.pandas_to_array` takes care of parameter `value` and # transforms it to something digestible for pymc3 - shared_object = theano.shared(pm.model.pandas_to_array(value), name) + shared_object = theano.shared(pm.model.pandas_to_array(value, dtype=dtype), name) # To draw the node for this variable in the graphviz Digraph we need # its shape. shared_object.dshape = tuple(shared_object.shape.eval()) - model.add_random_variable(shared_object) return shared_object From 08b3ba46c290b2d0b711786e57310392652f319e Mon Sep 17 00:00:00 2001 From: AlexAndorra Date: Sun, 17 May 2020 15:27:37 +0200 Subject: [PATCH 04/15] Finished dtype conversion handling --- pymc3/model.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pymc3/model.py b/pymc3/model.py index d572dd3829..da21d53811 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -1482,7 +1482,7 @@ def init_value(self): return self.tag.test_value -def pandas_to_array(data, dtype = float): +def pandas_to_array(data, dtype=float): if hasattr(data, 'values'): # pandas if data.isnull().any().any(): # missing values ret = np.ma.MaskedArray(data.values, data.isnull().values) @@ -1501,13 +1501,15 @@ def pandas_to_array(data, dtype = float): ret = generator(data) else: ret = np.asarray(data) - if dtype in [float, np.float32, np.float64]: - return pm.floatX(ret) - elif dtype in [int, np.int32, np.int64]: + + if dtype in [int, np.int8, np.int16, np.int32, np.int64]: return pm.intX(ret) + elif dtype in [float, np.float16, np.float32, np.float64]: + return pm.floatX(ret) else: raise ValueError('Unsupported type for pandas_to_array: %s' % str(dtype)) + def as_tensor(data, name, model, distribution): dtype = distribution.dtype data = pandas_to_array(data).astype(dtype) From 1fd0e8b0e2377b98015f76c077989113d358da8c Mon Sep 17 00:00:00 2001 From: AlexAndorra Date: Sun, 17 May 2020 18:08:30 +0200 Subject: [PATCH 05/15] Added SharedVariable option to getattr_value --- pymc3/distributions/distribution.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py index 8aaa171d14..437001880b 100644 --- a/pymc3/distributions/distribution.py +++ b/pymc3/distributions/distribution.py @@ -111,6 +111,9 @@ def getattr_value(self, val): if isinstance(val, tt.TensorVariable): return val.tag.test_value + if isinstance(val, tt.sharedvar.TensorSharedVariable): + return val.get_value() + if isinstance(val, theano_constant): return val.value From 766285c60958bec25b170198bfc0007ceb9b9c21 Mon Sep 17 00:00:00 2001 From: AlexAndorra Date: Sun, 17 May 2020 19:33:16 +0200 Subject: [PATCH 06/15] Added dtype handling to set_data function --- pymc3/model.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pymc3/model.py b/pymc3/model.py index da21d53811..45d1629794 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -1244,7 +1244,7 @@ def set_data(new_data, model=None): ---------- new_data: dict New values for the data containers. The keys of the dictionary are - the variables names in the model and the values are the objects + the variables' names in the model and the values are the objects with which to update. model: Model (optional if in `with` context) @@ -1266,7 +1266,7 @@ def set_data(new_data, model=None): .. code:: ipython >>> with model: - ... pm.set_data({'x': [5,6,9]}) + ... pm.set_data({'x': [5., 6., 9.]}) ... y_test = pm.sample_posterior_predictive(trace) >>> y_test['obs'].mean(axis=0) array([4.6088569 , 5.54128318, 8.32953844]) @@ -1275,7 +1275,15 @@ def set_data(new_data, model=None): for variable_name, new_value in new_data.items(): if isinstance(model[variable_name], SharedVariable): - model[variable_name].set_value(pandas_to_array(new_value)) + if hasattr(new_value, "dtype"): + # if no dtype given but available as attr of value, use that as dtype + dtype = new_value.dtype + elif isinstance(new_value, int): + dtype = int + else: + # otherwise, assume float + dtype = float + model[variable_name].set_value(pandas_to_array(new_value, dtype=dtype)) else: message = 'The variable `{}` must be defined as `pymc3.' \ 'Data` inside the model to allow updating. The ' \ From 63132e74942228a266336db15b351253510a1000 Mon Sep 17 00:00:00 2001 From: AlexAndorra Date: Sun, 17 May 2020 19:33:57 +0200 Subject: [PATCH 07/15] Added tests for pm.Data used for index variables --- pymc3/tests/test_data_container.py | 36 +++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/pymc3/tests/test_data_container.py b/pymc3/tests/test_data_container.py index e49cab457a..46e0531ae2 100644 --- a/pymc3/tests/test_data_container.py +++ b/pymc3/tests/test_data_container.py @@ -36,16 +36,16 @@ def test_sample(self): x_shared = pm.Data('x_shared', x) b = pm.Normal('b', 0., 10.) pm.Normal('obs', b * x_shared, np.sqrt(1e-2), observed=y) - prior_trace0 = pm.sample_prior_predictive(1000) + prior_trace0 = pm.sample_prior_predictive(1000) trace = pm.sample(1000, init=None, tune=1000, chains=1) pp_trace0 = pm.sample_posterior_predictive(trace, 1000) pp_trace01 = pm.fast_sample_posterior_predictive(trace, 1000) x_shared.set_value(x_pred) + prior_trace1 = pm.sample_prior_predictive(1000) pp_trace1 = pm.sample_posterior_predictive(trace, samples=1000) pp_trace11 = pm.fast_sample_posterior_predictive(trace, samples=1000) - prior_trace1 = pm.sample_prior_predictive(1000) assert prior_trace0['b'].shape == (1000,) assert prior_trace0['obs'].shape == (1000, 100) @@ -109,6 +109,36 @@ def test_sample_after_set_data(self): np.testing.assert_allclose(new_y, pp_tracef['obs'].mean(axis=0), atol=1e-1) + def test_shared_data_as_index(self): + """ + Allow pm.Data to be used for index variables, i.e with integers as well as floats. + See https://github.com/pymc-devs/pymc3/issues/3813 + """ + with pm.Model() as model: + index = pm.Data('index', [2, 0, 1, 0, 2], dtype=int) + y = pm.Data('y', [1., 2., 3., 2., 1.]) + alpha = pm.Normal('alpha', 0, 1.5, shape=3) + pm.Normal('obs', alpha[index], np.sqrt(1e-2), observed=y) + + prior_trace = pm.sample_prior_predictive(1000, var_names=["alpha"]) + trace = pm.sample(1000, init=None, tune=1000, chains=1) + + # Predict on new data + new_index = np.array([0, 1, 2]) + new_y = [5., 6., 9.] + with model: + pm.set_data(new_data={'index': new_index, 'y': new_y}) + pp_trace = pm.sample_posterior_predictive(trace, 1000, var_names=["alpha", "obs"]) + pp_tracef = pm.fast_sample_posterior_predictive(trace, 1000, var_names=["alpha", "obs"]) + + assert prior_trace['alpha'].shape == (1000, 3) + assert trace['alpha'].shape == (1000, 3) + assert pp_trace['alpha'].shape == (1000, 3) + assert pp_trace['obs'].shape == (1000, 3) + assert pp_tracef['alpha'].shape == (1000, 3) + assert pp_tracef['obs'].shape == (1000, 3) + + def test_creation_of_data_outside_model_context(self): with pytest.raises((IndexError, TypeError)) as error: pm.Data('data', [1.1, 2.2, 3.3]) @@ -147,7 +177,7 @@ def test_model_to_graphviz_for_model_with_data_container(self): def test_data_naming(): """ - This is a test for issue #3793 -- `Data` objects in named models are + This is a test for issue #3793 -- `Data` objects in named models are not given model-relative names. """ with pm.Model("named_model") as model: From 1bd864282fd03c212bab3a7037cc96aab3f64207 Mon Sep 17 00:00:00 2001 From: AlexAndorra Date: Sun, 17 May 2020 19:55:06 +0200 Subject: [PATCH 08/15] Added tests for using pm.data as RV input --- pymc3/tests/test_data_container.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pymc3/tests/test_data_container.py b/pymc3/tests/test_data_container.py index 46e0531ae2..fa38900751 100644 --- a/pymc3/tests/test_data_container.py +++ b/pymc3/tests/test_data_container.py @@ -139,6 +139,31 @@ def test_shared_data_as_index(self): assert pp_tracef['obs'].shape == (1000, 3) + def test_shared_data_as_rv_input(self): + """ + Allow pm.Data to be used as input for other RVs. + See https://github.com/pymc-devs/pymc3/issues/3842 + """ + with pm.Model() as m: + x = pm.Data("x", [1.0, 2.0, 3.0]) + _ = pm.Normal("y", mu=x, shape=3) + trace = pm.sample(chains=1) + + np.testing.assert_allclose(np.array([1.0, 2.0, 3.0]), x.get_value(), + atol=1e-1) + np.testing.assert_allclose(np.array([1.0, 2.0, 3.0]), trace["y"].mean(0), + atol=1e-1) + + with m: + pm.set_data({"x": np.array([2.0, 4.0, 6.0])}) + trace = pm.sample(chains=1) + + np.testing.assert_allclose(np.array([2.0, 4.0, 6.0]), x.get_value(), + atol=1e-1) + np.testing.assert_allclose(np.array([2.0, 4.0, 6.0]), trace["y"].mean(0), + atol=1e-1) + + def test_creation_of_data_outside_model_context(self): with pytest.raises((IndexError, TypeError)) as error: pm.Data('data', [1.1, 2.2, 3.3]) From 6597f28116393511f70bc6bef0b0e81950ac13b3 Mon Sep 17 00:00:00 2001 From: AlexAndorra Date: Sun, 17 May 2020 20:50:52 +0200 Subject: [PATCH 09/15] Ran Black on data tests files --- pymc3/tests/test_data_container.py | 158 ++++++++++++++--------------- 1 file changed, 77 insertions(+), 81 deletions(-) diff --git a/pymc3/tests/test_data_container.py b/pymc3/tests/test_data_container.py index fa38900751..38b640ac5a 100644 --- a/pymc3/tests/test_data_container.py +++ b/pymc3/tests/test_data_container.py @@ -20,22 +20,22 @@ class TestData(SeededTest): def test_deterministic(self): - data_values = np.array([.5, .4, 5, 2]) + data_values = np.array([0.5, 0.4, 5, 2]) with pm.Model() as model: - X = pm.Data('X', data_values) - pm.Normal('y', 0, 1, observed=X) + X = pm.Data("X", data_values) + pm.Normal("y", 0, 1, observed=X) model.logp(model.test_point) def test_sample(self): x = np.random.normal(size=100) y = x + np.random.normal(scale=1e-2, size=100) - x_pred = np.linspace(-3, 3, 200, dtype='float32') + x_pred = np.linspace(-3, 3, 200, dtype="float32") with pm.Model(): - x_shared = pm.Data('x_shared', x) - b = pm.Normal('b', 0., 10.) - pm.Normal('obs', b * x_shared, np.sqrt(1e-2), observed=y) + x_shared = pm.Data("x_shared", x) + b = pm.Normal("b", 0.0, 10.0) + pm.Normal("obs", b * x_shared, np.sqrt(1e-2), observed=y) prior_trace0 = pm.sample_prior_predictive(1000) trace = pm.sample(1000, init=None, tune=1000, chains=1) @@ -47,67 +47,61 @@ def test_sample(self): pp_trace1 = pm.sample_posterior_predictive(trace, samples=1000) pp_trace11 = pm.fast_sample_posterior_predictive(trace, samples=1000) - assert prior_trace0['b'].shape == (1000,) - assert prior_trace0['obs'].shape == (1000, 100) - assert prior_trace1['obs'].shape == (1000, 200) + assert prior_trace0["b"].shape == (1000,) + assert prior_trace0["obs"].shape == (1000, 100) + assert prior_trace1["obs"].shape == (1000, 200) - assert pp_trace0['obs'].shape == (1000, 100) - assert pp_trace01['obs'].shape == (1000, 100) + assert pp_trace0["obs"].shape == (1000, 100) + assert pp_trace01["obs"].shape == (1000, 100) - np.testing.assert_allclose(x, pp_trace0['obs'].mean(axis=0), atol=1e-1) - np.testing.assert_allclose(x, pp_trace01['obs'].mean(axis=0), atol=1e-1) + np.testing.assert_allclose(x, pp_trace0["obs"].mean(axis=0), atol=1e-1) + np.testing.assert_allclose(x, pp_trace01["obs"].mean(axis=0), atol=1e-1) - assert pp_trace1['obs'].shape == (1000, 200) - assert pp_trace11['obs'].shape == (1000, 200) + assert pp_trace1["obs"].shape == (1000, 200) + assert pp_trace11["obs"].shape == (1000, 200) - np.testing.assert_allclose(x_pred, pp_trace1['obs'].mean(axis=0), - atol=1e-1) - np.testing.assert_allclose(x_pred, pp_trace11['obs'].mean(axis=0), - atol=1e-1) + np.testing.assert_allclose(x_pred, pp_trace1["obs"].mean(axis=0), atol=1e-1) + np.testing.assert_allclose(x_pred, pp_trace11["obs"].mean(axis=0), atol=1e-1) def test_sample_posterior_predictive_after_set_data(self): with pm.Model() as model: - x = pm.Data('x', [1., 2., 3.]) - y = pm.Data('y', [1., 2., 3.]) - beta = pm.Normal('beta', 0, 10.) - pm.Normal('obs', beta * x, np.sqrt(1e-2), observed=y) + x = pm.Data("x", [1.0, 2.0, 3.0]) + y = pm.Data("y", [1.0, 2.0, 3.0]) + beta = pm.Normal("beta", 0, 10.0) + pm.Normal("obs", beta * x, np.sqrt(1e-2), observed=y) trace = pm.sample(1000, tune=1000, chains=1) # Predict on new data. with model: x_test = [5, 6, 9] - pm.set_data(new_data={'x': x_test}) + pm.set_data(new_data={"x": x_test}) y_test = pm.sample_posterior_predictive(trace) y_test1 = pm.fast_sample_posterior_predictive(trace) - assert y_test['obs'].shape == (1000, 3) - assert y_test1['obs'].shape == (1000, 3) - np.testing.assert_allclose(x_test, y_test['obs'].mean(axis=0), - atol=1e-1) - np.testing.assert_allclose(x_test, y_test1['obs'].mean(axis=0), - atol=1e-1) + assert y_test["obs"].shape == (1000, 3) + assert y_test1["obs"].shape == (1000, 3) + np.testing.assert_allclose(x_test, y_test["obs"].mean(axis=0), atol=1e-1) + np.testing.assert_allclose(x_test, y_test1["obs"].mean(axis=0), atol=1e-1) def test_sample_after_set_data(self): with pm.Model() as model: - x = pm.Data('x', [1., 2., 3.]) - y = pm.Data('y', [1., 2., 3.]) - beta = pm.Normal('beta', 0, 10.) - pm.Normal('obs', beta * x, np.sqrt(1e-2), observed=y) + x = pm.Data("x", [1.0, 2.0, 3.0]) + y = pm.Data("y", [1.0, 2.0, 3.0]) + beta = pm.Normal("beta", 0, 10.0) + pm.Normal("obs", beta * x, np.sqrt(1e-2), observed=y) pm.sample(1000, init=None, tune=1000, chains=1) # Predict on new data. - new_x = [5., 6., 9.] - new_y = [5., 6., 9.] + new_x = [5.0, 6.0, 9.0] + new_y = [5.0, 6.0, 9.0] with model: - pm.set_data(new_data={'x': new_x, 'y': new_y}) + pm.set_data(new_data={"x": new_x, "y": new_y}) new_trace = pm.sample(1000, init=None, tune=1000, chains=1) pp_trace = pm.sample_posterior_predictive(new_trace, 1000) pp_tracef = pm.fast_sample_posterior_predictive(new_trace, 1000) - assert pp_trace['obs'].shape == (1000, 3) - assert pp_tracef['obs'].shape == (1000, 3) - np.testing.assert_allclose(new_y, pp_trace['obs'].mean(axis=0), - atol=1e-1) - np.testing.assert_allclose(new_y, pp_tracef['obs'].mean(axis=0), - atol=1e-1) + assert pp_trace["obs"].shape == (1000, 3) + assert pp_tracef["obs"].shape == (1000, 3) + np.testing.assert_allclose(new_y, pp_trace["obs"].mean(axis=0), atol=1e-1) + np.testing.assert_allclose(new_y, pp_tracef["obs"].mean(axis=0), atol=1e-1) def test_shared_data_as_index(self): """ @@ -115,29 +109,32 @@ def test_shared_data_as_index(self): See https://github.com/pymc-devs/pymc3/issues/3813 """ with pm.Model() as model: - index = pm.Data('index', [2, 0, 1, 0, 2], dtype=int) - y = pm.Data('y', [1., 2., 3., 2., 1.]) - alpha = pm.Normal('alpha', 0, 1.5, shape=3) - pm.Normal('obs', alpha[index], np.sqrt(1e-2), observed=y) + index = pm.Data("index", [2, 0, 1, 0, 2], dtype=int) + y = pm.Data("y", [1.0, 2.0, 3.0, 2.0, 1.0]) + alpha = pm.Normal("alpha", 0, 1.5, shape=3) + pm.Normal("obs", alpha[index], np.sqrt(1e-2), observed=y) prior_trace = pm.sample_prior_predictive(1000, var_names=["alpha"]) trace = pm.sample(1000, init=None, tune=1000, chains=1) # Predict on new data new_index = np.array([0, 1, 2]) - new_y = [5., 6., 9.] + new_y = [5.0, 6.0, 9.0] with model: - pm.set_data(new_data={'index': new_index, 'y': new_y}) - pp_trace = pm.sample_posterior_predictive(trace, 1000, var_names=["alpha", "obs"]) - pp_tracef = pm.fast_sample_posterior_predictive(trace, 1000, var_names=["alpha", "obs"]) - - assert prior_trace['alpha'].shape == (1000, 3) - assert trace['alpha'].shape == (1000, 3) - assert pp_trace['alpha'].shape == (1000, 3) - assert pp_trace['obs'].shape == (1000, 3) - assert pp_tracef['alpha'].shape == (1000, 3) - assert pp_tracef['obs'].shape == (1000, 3) - + pm.set_data(new_data={"index": new_index, "y": new_y}) + pp_trace = pm.sample_posterior_predictive( + trace, 1000, var_names=["alpha", "obs"] + ) + pp_tracef = pm.fast_sample_posterior_predictive( + trace, 1000, var_names=["alpha", "obs"] + ) + + assert prior_trace["alpha"].shape == (1000, 3) + assert trace["alpha"].shape == (1000, 3) + assert pp_trace["alpha"].shape == (1000, 3) + assert pp_trace["obs"].shape == (1000, 3) + assert pp_tracef["alpha"].shape == (1000, 3) + assert pp_tracef["obs"].shape == (1000, 3) def test_shared_data_as_rv_input(self): """ @@ -149,43 +146,42 @@ def test_shared_data_as_rv_input(self): _ = pm.Normal("y", mu=x, shape=3) trace = pm.sample(chains=1) - np.testing.assert_allclose(np.array([1.0, 2.0, 3.0]), x.get_value(), - atol=1e-1) - np.testing.assert_allclose(np.array([1.0, 2.0, 3.0]), trace["y"].mean(0), - atol=1e-1) + np.testing.assert_allclose(np.array([1.0, 2.0, 3.0]), x.get_value(), atol=1e-1) + np.testing.assert_allclose( + np.array([1.0, 2.0, 3.0]), trace["y"].mean(0), atol=1e-1 + ) with m: pm.set_data({"x": np.array([2.0, 4.0, 6.0])}) trace = pm.sample(chains=1) - np.testing.assert_allclose(np.array([2.0, 4.0, 6.0]), x.get_value(), - atol=1e-1) - np.testing.assert_allclose(np.array([2.0, 4.0, 6.0]), trace["y"].mean(0), - atol=1e-1) - + np.testing.assert_allclose(np.array([2.0, 4.0, 6.0]), x.get_value(), atol=1e-1) + np.testing.assert_allclose( + np.array([2.0, 4.0, 6.0]), trace["y"].mean(0), atol=1e-1 + ) def test_creation_of_data_outside_model_context(self): with pytest.raises((IndexError, TypeError)) as error: - pm.Data('data', [1.1, 2.2, 3.3]) - error.match('No model on context stack') + pm.Data("data", [1.1, 2.2, 3.3]) + error.match("No model on context stack") def test_set_data_to_non_data_container_variables(self): with pm.Model() as model: - x = np.array([1., 2., 3.]) - y = np.array([1., 2., 3.]) - beta = pm.Normal('beta', 0, 10.) - pm.Normal('obs', beta * x, np.sqrt(1e-2), observed=y) + x = np.array([1.0, 2.0, 3.0]) + y = np.array([1.0, 2.0, 3.0]) + beta = pm.Normal("beta", 0, 10.0) + pm.Normal("obs", beta * x, np.sqrt(1e-2), observed=y) pm.sample(1000, init=None, tune=1000, chains=1) with pytest.raises(TypeError) as error: - pm.set_data({'beta': [1.1, 2.2, 3.3]}, model=model) - error.match('defined as `pymc3.Data` inside the model') + pm.set_data({"beta": [1.1, 2.2, 3.3]}, model=model) + error.match("defined as `pymc3.Data` inside the model") def test_model_to_graphviz_for_model_with_data_container(self): with pm.Model() as model: - x = pm.Data('x', [1., 2., 3.]) - y = pm.Data('y', [1., 2., 3.]) - beta = pm.Normal('beta', 0, 10.) - pm.Normal('obs', beta * x, np.sqrt(1e-2), observed=y) + x = pm.Data("x", [1.0, 2.0, 3.0]) + y = pm.Data("y", [1.0, 2.0, 3.0]) + beta = pm.Normal("beta", 0, 10.0) + pm.Normal("obs", beta * x, np.sqrt(1e-2), observed=y) pm.sample(1000, init=None, tune=1000, chains=1) g = pm.model_to_graphviz(model) From b3a9dee069e35244f148b7e737cffb4f66c57f50 Mon Sep 17 00:00:00 2001 From: AlexAndorra Date: Sun, 17 May 2020 22:02:04 +0200 Subject: [PATCH 10/15] Added release note --- RELEASE-NOTES.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index 6ebf36917c..e42893d16b 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -3,7 +3,7 @@ ## PyMC3 3.9 (On deck) ### New features -- use [fastprogress](https://github.com/fastai/fastprogress) instead of tqdm [#3693](https://github.com/pymc-devs/pymc3/pull/3693) +- Use [fastprogress](https://github.com/fastai/fastprogress) instead of tqdm [#3693](https://github.com/pymc-devs/pymc3/pull/3693). - `DEMetropolis` can now tune both `lambda` and `scaling` parameters, but by default neither of them are tuned. See [#3743](https://github.com/pymc-devs/pymc3/pull/3743) for more info. - `DEMetropolisZ`, an improved variant of `DEMetropolis` brings better parallelization and higher efficiency with fewer chains with a slower initial convergence. This implementation is experimental. See [#3784](https://github.com/pymc-devs/pymc3/pull/3784) for more info. - Notebooks that give insight into `DEMetropolis`, `DEMetropolisZ` and the `DifferentialEquation` interface are now located in the [Tutorials/Deep Dive](https://docs.pymc.io/nb_tutorials/index.html) section. @@ -14,6 +14,8 @@ - `pm.sample` now has support for adapting dense mass matrix using `QuadPotentialFullAdapt` (see [#3596](https://github.com/pymc-devs/pymc3/pull/3596), [#3705](https://github.com/pymc-devs/pymc3/pull/3705), [#3858](https://github.com/pymc-devs/pymc3/pull/3858), and [#3893](https://github.com/pymc-devs/pymc3/pull/3893)). Use `init="adapt_full"` or `init="jitter+adapt_full"` to use. - `Moyal` distribution added (see [#3870](https://github.com/pymc-devs/pymc3/pull/3870)). - `pm.LKJCholeskyCov` now automatically computes and returns the unpacked Cholesky decomposition, the correlations and the standard deviations of the covariance matrix (see [#3881](https://github.com/pymc-devs/pymc3/pull/3881)). +- `pm.Data` container can now be used for index variables, i.e with integer data and not only floats (see [#3925](https://github.com/pymc-devs/pymc3/pull/3925)). +- `pm.Data` container can now be used as input for other random variables (see [#3925](https://github.com/pymc-devs/pymc3/pull/3925)). ### Maintenance - Tuning results no longer leak into sequentially sampled `Metropolis` chains (see #3733 and #3796). From 5e5440c1e0b41630b9cc82c9d6de90f73c8254f6 Mon Sep 17 00:00:00 2001 From: AlexAndorra Date: Mon, 18 May 2020 10:11:04 +0200 Subject: [PATCH 11/15] Updated release notes --- RELEASE-NOTES.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index e42893d16b..490f905dea 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -14,8 +14,8 @@ - `pm.sample` now has support for adapting dense mass matrix using `QuadPotentialFullAdapt` (see [#3596](https://github.com/pymc-devs/pymc3/pull/3596), [#3705](https://github.com/pymc-devs/pymc3/pull/3705), [#3858](https://github.com/pymc-devs/pymc3/pull/3858), and [#3893](https://github.com/pymc-devs/pymc3/pull/3893)). Use `init="adapt_full"` or `init="jitter+adapt_full"` to use. - `Moyal` distribution added (see [#3870](https://github.com/pymc-devs/pymc3/pull/3870)). - `pm.LKJCholeskyCov` now automatically computes and returns the unpacked Cholesky decomposition, the correlations and the standard deviations of the covariance matrix (see [#3881](https://github.com/pymc-devs/pymc3/pull/3881)). -- `pm.Data` container can now be used for index variables, i.e with integer data and not only floats (see [#3925](https://github.com/pymc-devs/pymc3/pull/3925)). -- `pm.Data` container can now be used as input for other random variables (see [#3925](https://github.com/pymc-devs/pymc3/pull/3925)). +- `pm.Data` container can now be used for index variables, i.e with integer data and not only floats (issue [#3813](https://github.com/pymc-devs/pymc3/issues/3813), fixed by [#3925](https://github.com/pymc-devs/pymc3/pull/3925)). +- `pm.Data` container can now be used as input for other random variables (issue [#3842](https://github.com/pymc-devs/pymc3/issues/3842), fixed by [#3925](https://github.com/pymc-devs/pymc3/pull/3925)). ### Maintenance - Tuning results no longer leak into sequentially sampled `Metropolis` chains (see #3733 and #3796). From 0d07347b7a9a6edbfbe93dac2471ab6f35fb829a Mon Sep 17 00:00:00 2001 From: AlexAndorra Date: Mon, 18 May 2020 12:21:54 +0200 Subject: [PATCH 12/15] Updated code in light of Luciano's comments --- pymc3/data.py | 14 ++------------ pymc3/model.py | 19 +++---------------- pymc3/tests/test_data_container.py | 2 +- 3 files changed, 6 insertions(+), 29 deletions(-) diff --git a/pymc3/data.py b/pymc3/data.py index fa569dd332..35f797a576 100644 --- a/pymc3/data.py +++ b/pymc3/data.py @@ -479,17 +479,7 @@ class Data: https://docs.pymc.io/notebooks/data_container.html """ - def __new__(self, name, value, dtype=None): - if not dtype: - if hasattr(value, "dtype"): - # if no dtype given but available as attr of value, use that as dtype - dtype = value.dtype - elif isinstance(value, int): - dtype = int - else: - # otherwise, assume float - dtype = float - + def __new__(self, name, value): # Add data container to the named variables of the model. try: model = pm.Model.get_context() @@ -502,7 +492,7 @@ def __new__(self, name, value, dtype=None): # `pm.model.pandas_to_array` takes care of parameter `value` and # transforms it to something digestible for pymc3 - shared_object = theano.shared(pm.model.pandas_to_array(value, dtype=dtype), name) + shared_object = theano.shared(pm.model.pandas_to_array(value), name) # To draw the node for this variable in the graphviz Digraph we need # its shape. diff --git a/pymc3/model.py b/pymc3/model.py index 45d1629794..d891e909fc 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -1275,15 +1275,7 @@ def set_data(new_data, model=None): for variable_name, new_value in new_data.items(): if isinstance(model[variable_name], SharedVariable): - if hasattr(new_value, "dtype"): - # if no dtype given but available as attr of value, use that as dtype - dtype = new_value.dtype - elif isinstance(new_value, int): - dtype = int - else: - # otherwise, assume float - dtype = float - model[variable_name].set_value(pandas_to_array(new_value, dtype=dtype)) + model[variable_name].set_value(pandas_to_array(new_value)) else: message = 'The variable `{}` must be defined as `pymc3.' \ 'Data` inside the model to allow updating. The ' \ @@ -1490,7 +1482,7 @@ def init_value(self): return self.tag.test_value -def pandas_to_array(data, dtype=float): +def pandas_to_array(data): if hasattr(data, 'values'): # pandas if data.isnull().any().any(): # missing values ret = np.ma.MaskedArray(data.values, data.isnull().values) @@ -1510,12 +1502,7 @@ def pandas_to_array(data, dtype=float): else: ret = np.asarray(data) - if dtype in [int, np.int8, np.int16, np.int32, np.int64]: - return pm.intX(ret) - elif dtype in [float, np.float16, np.float32, np.float64]: - return pm.floatX(ret) - else: - raise ValueError('Unsupported type for pandas_to_array: %s' % str(dtype)) + return ret def as_tensor(data, name, model, distribution): diff --git a/pymc3/tests/test_data_container.py b/pymc3/tests/test_data_container.py index 38b640ac5a..9bdee14f8b 100644 --- a/pymc3/tests/test_data_container.py +++ b/pymc3/tests/test_data_container.py @@ -109,7 +109,7 @@ def test_shared_data_as_index(self): See https://github.com/pymc-devs/pymc3/issues/3813 """ with pm.Model() as model: - index = pm.Data("index", [2, 0, 1, 0, 2], dtype=int) + index = pm.Data("index", [2, 0, 1, 0, 2]) y = pm.Data("y", [1.0, 2.0, 3.0, 2.0, 1.0]) alpha = pm.Normal("alpha", 0, 1.5, shape=3) pm.Normal("obs", alpha[index], np.sqrt(1e-2), observed=y) From 41200ef7bb67607827c114ab05b3845cce5c84f0 Mon Sep 17 00:00:00 2001 From: AlexAndorra Date: Mon, 18 May 2020 15:34:54 +0200 Subject: [PATCH 13/15] Fixed implementation of integer checking --- pymc3/data.py | 17 ++++++++++++++++- pymc3/model.py | 24 +++++++++++++++++++++--- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/pymc3/data.py b/pymc3/data.py index 35f797a576..67e8fb1d89 100644 --- a/pymc3/data.py +++ b/pymc3/data.py @@ -480,6 +480,21 @@ class Data: """ def __new__(self, name, value): + if isinstance(value, list): + print(value, type(value)) + value = np.array(value) + print("Converted list to np: ", value, type(value), value.dtype) + + # Type handling to enable index variables + # set int type when appropriate: + if "int" in str(value.dtype): + dtype = pm.intX(value).dtype + print(value, ": ", dtype) + # otherwise, assume float + else: + dtype = theano.config.floatX + print(value, ": ", dtype) + # Add data container to the named variables of the model. try: model = pm.Model.get_context() @@ -492,7 +507,7 @@ def __new__(self, name, value): # `pm.model.pandas_to_array` takes care of parameter `value` and # transforms it to something digestible for pymc3 - shared_object = theano.shared(pm.model.pandas_to_array(value), name) + shared_object = theano.shared(pm.model.pandas_to_array(value, dtype=dtype), name) # To draw the node for this variable in the graphviz Digraph we need # its shape. diff --git a/pymc3/model.py b/pymc3/model.py index d891e909fc..65a6191a9f 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -1275,7 +1275,20 @@ def set_data(new_data, model=None): for variable_name, new_value in new_data.items(): if isinstance(model[variable_name], SharedVariable): - model[variable_name].set_value(pandas_to_array(new_value)) + if isinstance(new_value, list): + print(new_value, type(new_value)) + new_value = np.array(new_value) + print("Converted list to np: ", new_value, type(new_value), new_value.dtype) + # Type handling to enable index variables + # set int type when appropriate: + if "int" in str(new_value.dtype): + dtype = pm.intX(new_value).dtype + print(new_value, ": ", dtype) + # otherwise, assume float + else: + dtype = theano.config.floatX + print(new_value, ": ", dtype) + model[variable_name].set_value(pandas_to_array(new_value, dtype=dtype)) else: message = 'The variable `{}` must be defined as `pymc3.' \ 'Data` inside the model to allow updating. The ' \ @@ -1482,7 +1495,7 @@ def init_value(self): return self.tag.test_value -def pandas_to_array(data): +def pandas_to_array(data, dtype=theano.config.floatX): if hasattr(data, 'values'): # pandas if data.isnull().any().any(): # missing values ret = np.ma.MaskedArray(data.values, data.isnull().values) @@ -1502,7 +1515,12 @@ def pandas_to_array(data): else: ret = np.asarray(data) - return ret + if "int" in str(dtype): + print("in pandas function, int boucle: ", ret, str(dtype)) + return pm.intX(ret) + else: + print("in pandas function, float boucle: ", ret, str(dtype)) + return pm.floatX(ret) def as_tensor(data, name, model, distribution): From 7c359d81bb0ff29439698730b0ab179b7036fd90 Mon Sep 17 00:00:00 2001 From: AlexAndorra Date: Mon, 18 May 2020 15:59:19 +0200 Subject: [PATCH 14/15] Simplified implementation of type checking --- pymc3/data.py | 14 +------------- pymc3/model.py | 21 +++++---------------- 2 files changed, 6 insertions(+), 29 deletions(-) diff --git a/pymc3/data.py b/pymc3/data.py index 67e8fb1d89..04d8901fae 100644 --- a/pymc3/data.py +++ b/pymc3/data.py @@ -481,19 +481,7 @@ class Data: def __new__(self, name, value): if isinstance(value, list): - print(value, type(value)) value = np.array(value) - print("Converted list to np: ", value, type(value), value.dtype) - - # Type handling to enable index variables - # set int type when appropriate: - if "int" in str(value.dtype): - dtype = pm.intX(value).dtype - print(value, ": ", dtype) - # otherwise, assume float - else: - dtype = theano.config.floatX - print(value, ": ", dtype) # Add data container to the named variables of the model. try: @@ -507,7 +495,7 @@ def __new__(self, name, value): # `pm.model.pandas_to_array` takes care of parameter `value` and # transforms it to something digestible for pymc3 - shared_object = theano.shared(pm.model.pandas_to_array(value, dtype=dtype), name) + shared_object = theano.shared(pm.model.pandas_to_array(value), name) # To draw the node for this variable in the graphviz Digraph we need # its shape. diff --git a/pymc3/model.py b/pymc3/model.py index 65a6191a9f..8d1cb2aacf 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -1276,19 +1276,8 @@ def set_data(new_data, model=None): for variable_name, new_value in new_data.items(): if isinstance(model[variable_name], SharedVariable): if isinstance(new_value, list): - print(new_value, type(new_value)) new_value = np.array(new_value) - print("Converted list to np: ", new_value, type(new_value), new_value.dtype) - # Type handling to enable index variables - # set int type when appropriate: - if "int" in str(new_value.dtype): - dtype = pm.intX(new_value).dtype - print(new_value, ": ", dtype) - # otherwise, assume float - else: - dtype = theano.config.floatX - print(new_value, ": ", dtype) - model[variable_name].set_value(pandas_to_array(new_value, dtype=dtype)) + model[variable_name].set_value(pandas_to_array(new_value)) else: message = 'The variable `{}` must be defined as `pymc3.' \ 'Data` inside the model to allow updating. The ' \ @@ -1495,7 +1484,7 @@ def init_value(self): return self.tag.test_value -def pandas_to_array(data, dtype=theano.config.floatX): +def pandas_to_array(data): if hasattr(data, 'values'): # pandas if data.isnull().any().any(): # missing values ret = np.ma.MaskedArray(data.values, data.isnull().values) @@ -1515,11 +1504,11 @@ def pandas_to_array(data, dtype=theano.config.floatX): else: ret = np.asarray(data) - if "int" in str(dtype): - print("in pandas function, int boucle: ", ret, str(dtype)) + # type handling to enable index variables when data is int: + if "int" in str(data.dtype): return pm.intX(ret) + # otherwise, assume float: else: - print("in pandas function, float boucle: ", ret, str(dtype)) return pm.floatX(ret) From f7bf6dbf4f919d2436782fd074419303985a06ef Mon Sep 17 00:00:00 2001 From: AlexAndorra Date: Mon, 18 May 2020 16:16:10 +0200 Subject: [PATCH 15/15] Corrected implementation for other uses of pandas_to_array --- pymc3/model.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pymc3/model.py b/pymc3/model.py index 8d1cb2aacf..8d54a45878 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -1505,9 +1505,13 @@ def pandas_to_array(data): ret = np.asarray(data) # type handling to enable index variables when data is int: - if "int" in str(data.dtype): - return pm.intX(ret) - # otherwise, assume float: + if hasattr(data, "dtype"): + if "int" in str(data.dtype): + return pm.intX(ret) + # otherwise, assume float: + else: + return pm.floatX(ret) + # needed for uses of this function other than with pm.Data: else: return pm.floatX(ret)