pymc-devs · jmloyola · Aug 30, 2019 · Aug 30, 2019 · Aug 30, 2019 · Aug 30, 2019
diff --git a/docs/source/notebooks/data_container.ipynb b/docs/source/notebooks/data_container.ipynb
diff --git a/pymc3/model.py b/pymc3/model.py
@@ -13,7 +13,7 @@
 from theano.tensor.var import TensorVariable
 from theano.compile import SharedVariable
 
-from pymc3.theanof import set_theano_conf, floatX
+from pymc3.theanof import set_theano_conf, floatX, smartfloatX
 import pymc3 as pm
 from pymc3.math import flatten_list
 from .memoize import memoize, WithMemoization
@@ -1302,6 +1302,11 @@ def init_value(self):
 
 
 def pandas_to_array(data):
+    '''
+    If the data already has dtype theano.config.floatX, or has a
+    non-floating-point dtype, the function does not cast the output.
+    Otherwise, the function casts the output to theano.config.floatX.
+    '''
     if hasattr(data, 'values'):  # pandas
         if data.isnull().any().any():  # missing values
             ret = np.ma.MaskedArray(data.values, data.isnull().values)
@@ -1320,7 +1325,7 @@ def pandas_to_array(data):
         ret = generator(data)
     else:
         ret = np.asarray(data)
-    return pm.floatX(ret)
+    return smartfloatX(ret)
 
 
 def as_tensor(data, name, model, distribution):

diff --git a/pymc3/tests/test_data_container.py b/pymc3/tests/test_data_container.py
@@ -49,7 +49,7 @@ def test_sample_posterior_predictive_after_set_data(self):
             trace = pm.sample(1000, tune=1000, chains=1)
         # Predict on new data.
         with model:
-            x_test = [5, 6, 9]
+            x_test = [5., 6., 9.]
             pm.set_data(new_data={'x': x_test})
             y_test = pm.sample_posterior_predictive(trace)
 
@@ -65,8 +65,8 @@ def test_sample_after_set_data(self):
             pm.Normal('obs', beta * x, np.sqrt(1e-2), observed=y)
             pm.sample(1000, init=None, tune=1000, chains=1)
         # Predict on new data.
-        new_x = [5, 6, 9]
-        new_y = [5, 6, 9]
+        new_x = [5., 6., 9.]
+        new_y = [5., 6., 9.]
         with model:
             pm.set_data(new_data={'x': new_x, 'y': new_y})
             new_trace = pm.sample()

diff --git a/pymc3/tests/test_model_helpers.py b/pymc3/tests/test_model_helpers.py
@@ -11,14 +11,19 @@
 
 
 class TestHelperFunc:
-    def test_pandas_to_array(self):
+    def test_pandas_to_array_casting(self):
         """
         Ensure that pandas_to_array returns the dense array, masked array,
-        graph variable, TensorVariable, or sparse matrix as appropriate.
+        graph variable, TensorVariable, or sparse matrix as appropriate
+        when it has to cast the variable.
         """
+        # Use the float dtype that differs from floatX so a cast is forced
+        input_type = 'float32' if theano.config.floatX == 'float64' else 'float64'
+
         # Create the various inputs to the function
-        sparse_input = sps.csr_matrix(np.eye(3))
-        dense_input = np.arange(9).reshape((3, 3))
+        sparse_input = sps.csr_matrix(np.eye(3), dtype=input_type)
+
+        dense_input = np.arange(9, dtype=input_type).reshape((3, 3))
 
         input_name = 'input_variable'
         theano_graph_input = tt.as_tensor(dense_input, name=input_name)
@@ -28,13 +33,13 @@ def test_pandas_to_array(self):
         # All the even numbers are replaced with NaN
         missing_pandas_input = pd.DataFrame(np.array([[np.nan, 1, np.nan],
                                                       [3, np.nan, 5],
-                                                      [np.nan, 7, np.nan]]))
+                                                      [np.nan, 7, np.nan]], dtype=input_type))
         masked_array_input = ma.array(dense_input,
                                       mask=(np.mod(dense_input, 2) == 0))
 
         # Create a generator object. Apparently the generator object needs to
         # yield numpy arrays.
-        square_generator = (np.array([i**2], dtype=int) for i in range(100))
+        square_generator = (np.array([i**2], dtype=input_type) for i in range(100))
 
         # Alias the function to be tested
         func = pm.model.pandas_to_array
@@ -73,13 +78,85 @@ def test_pandas_to_array(self):
         # Check function behavior with generator data
         generator_output = func(square_generator)
 
-        # Output is wrapped with `pm.floatX`, and this unwraps
-        wrapped = generator_output.owner.inputs[0]
         # Make sure the returned object has .set_gen and .set_default methods
-        assert hasattr(wrapped, "set_gen")
-        assert hasattr(wrapped, "set_default")
+        assert hasattr(generator_output, "set_gen")
+        assert hasattr(generator_output, "set_default")
+        # Make sure the returned object is a Theano TensorVariable
+        assert isinstance(generator_output, tt.TensorVariable)
+
+    def test_pandas_to_array_not_casting(self):
+        """
+        Ensure that pandas_to_array returns the dense array, masked array,
+        graph variable, TensorVariable, or sparse matrix as appropriate
+        when it does not have to cast the variable.
+        """
+        # Input type to force the function not to cast.
+        # int64 would also avoid a cast, but it cannot represent the NaNs in the missing-values input
+        input_type = theano.config.floatX
+
+        # Create the various inputs to the function
+        sparse_input = sps.csr_matrix(np.eye(3), dtype=input_type)
+
+        dense_input = np.arange(9, dtype=input_type).reshape((3, 3))
+
+        input_name = 'input_variable'
+        theano_graph_input = tt.as_tensor(dense_input, name=input_name)
+
+        pandas_input = pd.DataFrame(dense_input)
+
+        # All the even numbers are replaced with NaN
+        missing_pandas_input = pd.DataFrame(np.array([[np.nan, 1, np.nan],
+                                                      [3, np.nan, 5],
+                                                      [np.nan, 7, np.nan]], dtype=input_type))
+        masked_array_input = ma.array(dense_input,
+                                      mask=(np.mod(dense_input, 2) == 0))
+
+        # Create a generator object. Apparently the generator object needs to
+        # yield numpy arrays.
+        square_generator = (np.array([i**2], dtype=input_type) for i in range(100))
+
+        # Alias the function to be tested
+        func = pm.model.pandas_to_array
+
+        #####
+        # Perform the various tests
+        #####
+        # Check function behavior with dense arrays and pandas dataframes
+        # without missing values
+        for input_value in [dense_input, pandas_input]:
+            func_output = func(input_value)
+            assert isinstance(func_output, np.ndarray)
+            assert func_output.shape == input_value.shape
+            npt.assert_allclose(func_output, dense_input)
+
+        # Check function behavior with sparse matrix inputs
+        sparse_output = func(sparse_input)
+        assert sps.issparse(sparse_output)
+        assert sparse_output.shape == sparse_input.shape
+        npt.assert_allclose(sparse_output.toarray(),
+                            sparse_input.toarray())
+
+        # Check function behavior when using masked array inputs and pandas
+        # objects with missing data
+        for input_value in [masked_array_input, missing_pandas_input]:
+            func_output = func(input_value)
+            assert isinstance(func_output, ma.core.MaskedArray)
+            assert func_output.shape == input_value.shape
+            npt.assert_allclose(func_output, masked_array_input)
+
+        # Check function behavior with Theano graph variable
+        theano_output = func(theano_graph_input)
+        assert isinstance(theano_output, theano.gof.graph.Variable)
+        assert theano_output.name == input_name
+
+        # Check function behavior with generator data
+        generator_output = func(square_generator)
+
+        # Make sure the returned object has .set_gen and .set_default methods
+        assert hasattr(generator_output, "set_gen")
+        assert hasattr(generator_output, "set_default")
         # Make sure the returned object is a Theano TensorVariable
-        assert isinstance(wrapped, tt.TensorVariable)
+        assert isinstance(generator_output, tt.TensorVariable)
 
     def test_as_tensor(self):
         """