
Commit bf9fc19

make release-tag: Merge branch 'master' into stable
2 parents: 3bdfe03 + 977db57 · commit bf9fc19

14 files changed: +259 -12 lines changed

.gitattributes

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+*.ipynb linguist-vendored

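This attribute appears to address the "Fix repository language" issue (#464) listed in the release notes below: marking *.ipynb files as linguist-vendored tells GitHub's Linguist to exclude the bundled notebooks from the repository's language statistics.
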
.github/ISSUE_TEMPLATE/question.md

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 ---
 name: Question
-about: Doubts about SDV usage
+about: Ask a general question about SDV usage
 title: ''
 labels: question, pending review
 assignees: ''

HISTORY.md

Lines changed: 20 additions & 0 deletions
@@ -1,5 +1,25 @@
 # Release Notes
 
+## 0.12.1 - 2021-10-12
+
+This release fixes bugs in constraints, metadata behavior, and SDV documentation. Specifically, we added
+proper handling of data containing null values for constraints and timeseries data, and updated the
+default metadata detection behavior.
+
+### Bugs Fixed
+* ValueError: The parameter loc has invalid values - Issue [#353](https://github.com/sdv-dev/SDV/issues/353) by @fealho
+* Gaussian Copula is generating different data with metadata and without metadata - Issue [#576](https://github.com/sdv-dev/SDV/issues/576) by @katxiao
+* Make pomegranate an optional dependency - Issue [#567](https://github.com/sdv-dev/SDV/issues/567) by @katxiao
+* Small wording change for Question Issue Template - Issue [#571](https://github.com/sdv-dev/SDV/issues/571) by @katxiao
+* ConstraintsNotMetError when using GreaterThan constraint with datetime - Issue [#590](https://github.com/sdv-dev/SDV/issues/590) by @katxiao
+* GreaterThan constraint crashing with NaN values - Issue [#592](https://github.com/sdv-dev/SDV/issues/592) by @katxiao
+* Null values in GreaterThan constraint raises error - Issue [#589](https://github.com/sdv-dev/SDV/issues/589) by @katxiao
+* ColumnFormula raises ConstraintsNotMetError when checking NaN values - Issue [#593](https://github.com/sdv-dev/SDV/issues/593) by @katxiao
+* GreaterThan constraint raises TypeError when using datetime - Issue [#596](https://github.com/sdv-dev/SDV/issues/596) by @katxiao
+* Fix repository language - Issue [#464](https://github.com/sdv-dev/SDV/issues/464) by @fealho
+* Update __init__.py - Issue [#578](https://github.com/sdv-dev/SDV/issues/578) by @dyuliu
+* IndexingError: Unalignable boolean - Issue [#446](https://github.com/sdv-dev/SDV/issues/446) by @fealho
+
 ## 0.12.0 - 2021-08-17
 
 This release focuses on improving and expanding upon the existing constraints. More specifically, the users can now

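For context, a minimal sketch of the headline NaN fix, mirroring the `GreaterThan` unit tests added later in this commit; it assumes the public `from sdv.constraints import GreaterThan` import available in SDV 0.12.x:

import pandas as pd

from sdv.constraints import GreaterThan

# Row 1 compares a NaN against a non-NaN value; with 0.12.1 such rows are
# reported as valid instead of raising an error (expected output per the
# test added below: [True, True, False]).
table_data = pd.DataFrame({
    'a': [1, None, 3],
    'b': [4, 5, 2],
})

constraint = GreaterThan(low='a', high='b', strict=True)
print(constraint.is_valid(table_data).tolist())
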
conda/meta.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 {% set name = 'sdv' %}
-{% set version = '0.12.0' %}
+{% set version = '0.12.1.dev1' %}
 
 package:
   name: "{{ name|lower }}"

sdv/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1,12 +1,12 @@
 # -*- coding: utf-8 -*-
 # configure logging for the library with a null handler (nothing is printed by default). See
-# http://docs.pthon-guide.org/en/latest/writing/logging/
+# http://docs.python-guide.org/en/latest/writing/logging/
 
 """Top-level package for SDV."""
 
 __author__ = """MIT Data To AI Lab"""
 __email__ = '[email protected]'
-__version__ = '0.12.0'
+__version__ = '0.12.1.dev1'
 
 from sdv import constraints, evaluation, metadata, relational, tabular
 from sdv.demo import get_available_demos, load_demo

sdv/constraints/base.py

Lines changed: 3 additions & 0 deletions
@@ -342,6 +342,9 @@ def filter_valid(self, table_data):
         LOGGER.debug('%s: %s invalid rows out of %s.',
                      self.__class__.__name__, sum(~valid), len(valid))
 
+        if isinstance(valid, pd.Series):
+            return table_data[valid.values]
+
         return table_data[valid]
 
     @classmethod

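A hypothetical standalone illustration (not part of the commit) of the pandas behavior this guard works around, using plain pandas only:

import pandas as pd

table_data = pd.DataFrame({'a': [1, 2, 3]})

# A boolean mask whose index does not line up with the frame, as can happen
# when ``is_valid`` was computed on a subset of the original rows.
valid = pd.Series([True, True, False], index=[0, 1, 5])

# table_data[valid]            # raises pandas.errors.IndexingError ("Unalignable boolean Series")
print(table_data[valid.values])  # positional masking keeps rows 0 and 1, as the new code path does
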
sdv/constraints/tabular.py

Lines changed: 13 additions & 2 deletions
@@ -320,10 +320,16 @@ def _validate_inputs(cls, low, high, scalar, drop):
             cls._validate_drop(scalar, drop)
             high = cls._validate_scalar(scalar_column=low, column_names=high, scalar=scalar)
             constraint_columns = tuple(high)
+            if isinstance(low, pd.Timestamp):
+                low = low.to_datetime64()
+
         elif scalar == 'high':
             cls._validate_drop(scalar, drop)
             low = cls._validate_scalar(scalar_column=high, column_names=low, scalar=scalar)
             constraint_columns = tuple(low)
+            if isinstance(high, pd.Timestamp):
+                high = high.to_datetime64()
+
         else:
             raise ValueError(f"Invalad `scalar` value: `{scalar}`. "
                              "Use either: 'high', 'low', or None.")
@@ -436,8 +442,11 @@ def is_valid(self, table_data):
         """
         low = self._get_value(table_data, 'low')
         high = self._get_value(table_data, 'high')
+        isnull = np.logical_or(np.isnan(low), np.isnan(high))
 
-        return self.operator(high, low).all(axis=1)
+        valid = np.logical_or(self.operator(high, low), isnull)
+
+        return valid.all(axis=1)
 
     def _transform(self, table_data):
         """Transform the table data.
@@ -630,7 +639,9 @@ def is_valid(self, table_data):
                Whether each row is valid.
         """
         computed = self._formula(table_data)
-        return table_data[self._column] == computed
+        isnan = table_data[self._column].isna() & computed.isna()
+
+        return table_data[self._column].eq(computed) | isnan
 
     def _transform(self, table_data):
         """Transform the table data.

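A minimal sketch of the datetime fix, mirroring `test_is_valid_high_is_datetime` added below: a `pandas.Timestamp` passed as the scalar bound is now converted to `datetime64`, so comparing it against a datetime column no longer raises `TypeError`. The import path is assumed to be the public `sdv.constraints` one:

from datetime import datetime

import pandas as pd

from sdv.constraints import GreaterThan

high_dt = pd.to_datetime('8/31/2021')
constraint = GreaterThan(low='a', high=high_dt, strict=False, scalar='high')

table_data = pd.DataFrame({
    'a': [datetime(2020, 5, 17), datetime(2020, 2, 1), datetime(2021, 9, 1)],
    'b': [4, 2, 2],
})

# Expected output per the test added below: [True, True, False]
print(constraint.is_valid(table_data).tolist())
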
sdv/metadata/table.py

Lines changed: 3 additions & 0 deletions
@@ -769,6 +769,9 @@ def from_dict(cls, metadata_dict, dtype_transformers=None):
             entity_columns=metadata_dict.get('entity_columns') or [],
             context_columns=metadata_dict.get('context_columns') or [],
             dtype_transformers=dtype_transformers,
+            min_value=metadata_dict.get('min_value', 'auto'),
+            max_value=metadata_dict.get('max_value', 'auto'),
+            rounding=metadata_dict.get('rounding', 'auto'),
         )
         instance._fields_metadata = fields
         return instance

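A hedged sketch of the round-trip this change fixes: numerical bounds and rounding settings stored in a metadata dict are now forwarded to the `Table` constructor by `from_dict` instead of silently falling back to defaults. The field specification shown is illustrative; only the three lowest keys come from this diff:

from sdv.metadata import Table

metadata_dict = {
    'fields': {
        'amount': {'type': 'numerical', 'subtype': 'float'},
    },
    # Previously ignored by ``Table.from_dict``; now passed through to ``Table(...)``.
    'min_value': 'auto',
    'max_value': 'auto',
    'rounding': 'auto',
}

table_metadata = Table.from_dict(metadata_dict)
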
setup.cfg

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.12.0
+current_version = 0.12.1.dev1
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?

setup.py

Lines changed: 7 additions & 3 deletions
@@ -20,11 +20,14 @@
     'tqdm>=4.14,<5',
     'copulas>=0.5.0,<0.6',
     'ctgan>=0.4.3,<0.5',
-    'deepecho>=0.2.0,<0.3',
-    'rdt>=0.5.0,<0.6',
+    'deepecho>=0.2.1,<0.3',
+    'rdt>=0.5.3,<0.6',
     'sdmetrics>=0.3.1,<0.4',
     'torchvision>=0.5.0,<1',
     'sktime>=0.4,<0.6',
+]
+
+pomegranate_requires = [
     'pomegranate>=0.13.4,<0.14.2',
 ]
 
@@ -93,6 +96,7 @@
     extras_require={
         'test': tests_require,
         'dev': development_requires + tests_require,
+        'pomegranate': pomegranate_requires,
     },
     include_package_data=True,
     install_requires=install_requires,
@@ -107,6 +111,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/sdv-dev/SDV',
-    version='0.12.0',
+    version='0.12.1.dev1',
     zip_safe=False,
 )

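With `pomegranate` moved out of `install_requires` and into its own extras group, a plain `pip install sdv` presumably no longer pulls it in; users who need the pomegranate-dependent functionality would request the extra explicitly, e.g. `pip install sdv[pomegranate]` (standard setuptools extras syntax).
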
tasks.py

Lines changed: 4 additions & 2 deletions
@@ -23,15 +23,17 @@ def install_minimum(c):
     for line in lines:
         if started:
             if line == ']':
-                break
+                started = False
+                continue
 
             line = line.strip()
             line = re.sub(r',?<=?[\d.]*,?', '', line)
             line = re.sub(r'>=?', '==', line)
             line = re.sub(r"""['",]""", '', line)
             versions.append(line)
 
-        elif line.startswith('install_requires = ['):
+        elif (line.startswith('install_requires = [') or
+              line.startswith('pomegranate_requires = [')):
             started = True
 
     c.run(f'python -m pip install {" ".join(versions)}')

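An illustrative standalone sketch (the `to_minimum` helper is hypothetical, not part of the repo) of what the regex substitutions above do to a single requirement line: the upper bound is dropped and `>=` is turned into `==`, pinning the minimum supported version:

import re


def to_minimum(line):
    """Reduce one setup.py requirement line to its minimum version pin."""
    line = line.strip()
    line = re.sub(r',?<=?[\d.]*,?', '', line)   # drop the '<' / '<=' upper bound
    line = re.sub(r'>=?', '==', line)           # pin '>=' (or '>') to '=='
    line = re.sub(r"""['",]""", '', line)       # strip quotes and commas
    return line


print(to_minimum("    'rdt>=0.5.3,<0.6',"))              # rdt==0.5.3
print(to_minimum("    'pomegranate>=0.13.4,<0.14.2',"))  # pomegranate==0.13.4
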
tests/unit/constraints/test_base.py

Lines changed: 32 additions & 0 deletions
@@ -639,6 +639,38 @@ def test_filter_valid(self):
         })
         pd.testing.assert_frame_equal(expected_out, out)
 
+    def test_filter_valid_with_invalid_index(self):
+        """Test the ``Constraint.filter_valid`` method.
+
+        Tests when the is_valid method returns a Series with an invalid index.
+
+        Note: `is_valid.index` can be [0, 1, 5] if, for example, the Series is a subset
+        of an original table with 10 rows, but only rows 0/1/5 were selected.
+
+        Input:
+        - Table data (pandas.DataFrame)
+        Output:
+        - Table data, with only the valid rows (pandas.DataFrame)
+        """
+        # Setup
+        table_data = pd.DataFrame({
+            'a': [1, 2, 3]
+        })
+
+        constraint_mock = Mock()
+        is_valid = pd.Series([True, True, False])
+        is_valid.index = [0, 1, 5]
+        constraint_mock.is_valid.return_value = is_valid
+
+        # Run
+        out = Constraint.filter_valid(constraint_mock, table_data)
+
+        # Assert
+        expected_out = pd.DataFrame({
+            'a': [1, 2]
+        })
+        pd.testing.assert_frame_equal(expected_out, out)
+
     def test_from_dict_fqn(self):
         """Test the ``Constraint.from_dict`` method passing a FQN.

tests/unit/constraints/test_tabular.py

Lines changed: 140 additions & 0 deletions
@@ -1,6 +1,7 @@
 """Tests for the sdv.constraints.tabular module."""
 
 import uuid
+from datetime import datetime
 from unittest.mock import Mock
 
 import numpy as np
@@ -2109,12 +2110,121 @@ def test_is_valid_scalar_is_none_multi_column(self):
             'b': [4, 2, 2],
             'c': [7, 8, 9]
         })
+
+        # Run
         out = instance.is_valid(table_data)
 
         # Assert
         expected_out = [False, True, True]
         np.testing.assert_array_equal(expected_out, out)
 
+    def test_is_valid_high_is_datetime(self):
+        """Test the ``GreaterThan.is_valid`` method.
+
+        If high is a datetime and low is a column,
+        the values in that column should all be lower than
+        ``instance._high``.
+
+        Input:
+        - Table with values above and below `high`.
+        Output:
+        - True should be returned for the rows where the low
+        column is below `high`.
+        """
+        # Setup
+        high_dt = pd.to_datetime('8/31/2021')
+        instance = GreaterThan(low='a', high=high_dt, strict=False, scalar='high')
+        table_data = pd.DataFrame({
+            'a': [datetime(2020, 5, 17), datetime(2020, 2, 1), datetime(2021, 9, 1)],
+            'b': [4, 2, 2],
+        })
+
+        # Run
+        out = instance.is_valid(table_data)
+
+        # Assert
+        expected_out = [True, True, False]
+        np.testing.assert_array_equal(expected_out, out)
+
+    def test_is_valid_low_is_datetime(self):
+        """Test the ``GreaterThan.is_valid`` method.
+
+        If low is a datetime and high is a column,
+        the values in that column should all be higher than
+        ``instance._low``.
+
+        Input:
+        - Table with values above and below `low`.
+        Output:
+        - True should be returned for the rows where the high
+        column is above `low`.
+        """
+        # Setup
+        low_dt = pd.to_datetime('8/31/2021')
+        instance = GreaterThan(low=low_dt, high='a', strict=False, scalar='low')
+        table_data = pd.DataFrame({
+            'a': [datetime(2021, 9, 17), datetime(2021, 7, 1), datetime(2021, 9, 1)],
+            'b': [4, 2, 2],
+        })
+
+        # Run
+        out = instance.is_valid(table_data)
+
+        # Assert
+        expected_out = [True, False, True]
+        np.testing.assert_array_equal(expected_out, out)
+
+    def test_is_valid_two_cols_with_nans(self):
+        """Test the ``GreaterThan.is_valid`` method with nan values.
+
+        If there is a NaN row, expect that `is_valid` returns True.
+
+        Input:
+        - Table with a NaN row
+        Output:
+        - True should be returned for the NaN row.
+        """
+        # Setup
+        instance = GreaterThan(low='a', high='b', strict=True)
+
+        # Run
+        table_data = pd.DataFrame({
+            'a': [1, None, 3],
+            'b': [4, None, 2],
+            'c': [7, 8, 9]
+        })
+        out = instance.is_valid(table_data)
+
+        # Assert
+        expected_out = [True, True, False]
+        np.testing.assert_array_equal(expected_out, out)
+
+    def test_is_valid_two_cols_with_one_nan(self):
+        """Test the ``GreaterThan.is_valid`` method with nan values.
+
+        If there is a row in which we compare one NaN value with one
+        non-NaN value, expect that `is_valid` returns True.
+
+        Input:
+        - Table with a row that contains only one NaN value.
+        Output:
+        - True should be returned for the row with the NaN value.
+        """
+        # Setup
+        instance = GreaterThan(low='a', high='b', strict=True)
+
+        # Run
+        table_data = pd.DataFrame({
+            'a': [1, None, 3],
+            'b': [4, 5, 2],
+            'c': [7, 8, 9]
+        })
+        out = instance.is_valid(table_data)
+
+        # Assert
+        expected_out = [True, True, False]
+        np.testing.assert_array_equal(expected_out, out)
+
     def test__transform_int_drop_none(self):
         """Test the ``GreaterThan._transform`` method passing a high column of type int.
 
@@ -3205,6 +3315,9 @@ def test__init__drop_true(self):
 
 def new_column(data):
     """Formula to be used for the ``TestColumnFormula`` class."""
+    if data['a'] is None or data['b'] is None:
+        return None
+
     return data['a'] + data['b']
 
 
@@ -3322,6 +3435,33 @@ def test_is_valid_non_valid(self):
         expected_out = pd.Series([False, False, False])
         pd.testing.assert_series_equal(expected_out, out)
 
+    def test_is_valid_with_nans(self):
+        """Test the ``ColumnFormula.is_valid`` method for with a formula that produces nans.
+
+        If the data fulfills the formula, result is a series of ``True`` values.
+
+        Input:
+        - Table data fulfilling the formula (pandas.DataFrame)
+        Output:
+        - Series of ``True`` values (pandas.Series)
+        """
+        # Setup
+        column = 'c'
+        instance = ColumnFormula(column=column, formula=new_column)
+
+        # Run
+        table_data = pd.DataFrame({
+            'a': [1, 2, 3],
+            'b': [4, 5, None],
+            'c': [5, 7, None]
+        })
+        instance = ColumnFormula(column=column, formula=new_column)
+        out = instance.is_valid(table_data)
+
+        # Assert
+        expected_out = pd.Series([True, True, True])
+        pd.testing.assert_series_equal(expected_out, out)
+
     def test__transform(self):
         """Test the ``ColumnFormula._transform`` method.
 
