
Commit bf9fc19

make release-tag: Merge branch 'master' into stable
2 parents: 3bdfe03 + 977db57 · commit bf9fc19

14 files changed: +259 -12 lines changed

.gitattributes

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+*.ipynb linguist-vendored

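This attribute appears to address the "Fix repository language" issue (#464) listed in the release notes below: marking *.ipynb files as linguist-vendored tells GitHub's Linguist to exclude the bundled notebooks from the repository's language statistics.
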
.github/ISSUE_TEMPLATE/question.md

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 ---
 name: Question
-about: Doubts about SDV usage
+about: Ask a general question about SDV usage
 title: ''
 labels: question, pending review
 assignees: ''

HISTORY.md

Lines changed: 20 additions & 0 deletions
@@ -1,5 +1,25 @@
 # Release Notes
 
+## 0.12.1 - 2021-10-12
+
+This release fixes bugs in constraints, metadata behavior, and SDV documentation. Specifically, we added
+proper handling of data containing null values for constraints and timeseries data, and updated the
+default metadata detection behavior.
+
+### Bugs Fixed
+* ValueError: The parameter loc has invalid values - Issue [#353](https://github.com/sdv-dev/SDV/issues/353) by @fealho
+* Gaussian Copula is generating different data with metadata and without metadata - Issue [#576](https://github.com/sdv-dev/SDV/issues/576) by @katxiao
+* Make pomegranate an optional dependency - Issue [#567](https://github.com/sdv-dev/SDV/issues/567) by @katxiao
+* Small wording change for Question Issue Template - Issue [#571](https://github.com/sdv-dev/SDV/issues/571) by @katxiao
+* ConstraintsNotMetError when using GreaterThan constraint with datetime - Issue [#590](https://github.com/sdv-dev/SDV/issues/590) by @katxiao
+* GreaterThan constraint crashing with NaN values - Issue [#592](https://github.com/sdv-dev/SDV/issues/592) by @katxiao
+* Null values in GreaterThan constraint raises error - Issue [#589](https://github.com/sdv-dev/SDV/issues/589) by @katxiao
+* ColumnFormula raises ConstraintsNotMetError when checking NaN values - Issue [#593](https://github.com/sdv-dev/SDV/issues/593) by @katxiao
+* GreaterThan constraint raises TypeError when using datetime - Issue [#596](https://github.com/sdv-dev/SDV/issues/596) by @katxiao
+* Fix repository language - Issue [#464](https://github.com/sdv-dev/SDV/issues/464) by @fealho
+* Update __init__.py - Issue [#578](https://github.com/sdv-dev/SDV/issues/578) by @dyuliu
+* IndexingError: Unalignable boolean - Issue [#446](https://github.com/sdv-dev/SDV/issues/446) by @fealho
+
 ## 0.12.0 - 2021-08-17
 
 This release focuses on improving and expanding upon the existing constraints. More specifically, the users can now

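For context, a minimal sketch of the headline NaN fix, mirroring the `GreaterThan` unit tests added later in this commit; it assumes the public `from sdv.constraints import GreaterThan` import available in SDV 0.12.x:

import pandas as pd

from sdv.constraints import GreaterThan

# Row 1 compares a NaN against a non-NaN value; with 0.12.1 such rows are
# reported as valid instead of raising an error (expected output per the
# test added below: [True, True, False]).
table_data = pd.DataFrame({
    'a': [1, None, 3],
    'b': [4, 5, 2],
})

constraint = GreaterThan(low='a', high='b', strict=True)
print(constraint.is_valid(table_data).tolist())
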
conda/meta.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 {% set name = 'sdv' %}
-{% set version = '0.12.0' %}
+{% set version = '0.12.1.dev1' %}
 
 package:
   name: "{{ name|lower }}"

sdv/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1,12 +1,12 @@
 # -*- coding: utf-8 -*-
 # configure logging for the library with a null handler (nothing is printed by default). See
-# http://docs.pthon-guide.org/en/latest/writing/logging/
+# http://docs.python-guide.org/en/latest/writing/logging/
 
 """Top-level package for SDV."""
 
 __author__ = """MIT Data To AI Lab"""
 __email__ = '[email protected]'
-__version__ = '0.12.0'
+__version__ = '0.12.1.dev1'
 
 from sdv import constraints, evaluation, metadata, relational, tabular
 from sdv.demo import get_available_demos, load_demo

sdv/constraints/base.py

Lines changed: 3 additions & 0 deletions
@@ -342,6 +342,9 @@ def filter_valid(self, table_data):
         LOGGER.debug('%s: %s invalid rows out of %s.',
                      self.__class__.__name__, sum(~valid), len(valid))
 
+        if isinstance(valid, pd.Series):
+            return table_data[valid.values]
+
         return table_data[valid]
 
     @classmethod

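A hypothetical standalone illustration (not part of the commit) of the pandas behavior this guard works around, using plain pandas only:

import pandas as pd

table_data = pd.DataFrame({'a': [1, 2, 3]})

# A boolean mask whose index does not line up with the frame, as can happen
# when ``is_valid`` was computed on a subset of the original rows.
valid = pd.Series([True, True, False], index=[0, 1, 5])

# table_data[valid]            # raises pandas.errors.IndexingError ("Unalignable boolean Series")
print(table_data[valid.values])  # positional masking keeps rows 0 and 1, as the new code path does
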
sdv/constraints/tabular.py

Lines changed: 13 additions & 2 deletions
@@ -320,10 +320,16 @@ def _validate_inputs(cls, low, high, scalar, drop):
             cls._validate_drop(scalar, drop)
             high = cls._validate_scalar(scalar_column=low, column_names=high, scalar=scalar)
             constraint_columns = tuple(high)
+            if isinstance(low, pd.Timestamp):
+                low = low.to_datetime64()
+
         elif scalar == 'high':
             cls._validate_drop(scalar, drop)
             low = cls._validate_scalar(scalar_column=high, column_names=low, scalar=scalar)
             constraint_columns = tuple(low)
+            if isinstance(high, pd.Timestamp):
+                high = high.to_datetime64()
+
         else:
             raise ValueError(f"Invalad `scalar` value: `{scalar}`. "
                              "Use either: 'high', 'low', or None.")
@@ -436,8 +442,11 @@ def is_valid(self, table_data):
         """
         low = self._get_value(table_data, 'low')
         high = self._get_value(table_data, 'high')
+        isnull = np.logical_or(np.isnan(low), np.isnan(high))
 
-        return self.operator(high, low).all(axis=1)
+        valid = np.logical_or(self.operator(high, low), isnull)
+
+        return valid.all(axis=1)
 
     def _transform(self, table_data):
         """Transform the table data.
@@ -630,7 +639,9 @@ def is_valid(self, table_data):
                Whether each row is valid.
         """
         computed = self._formula(table_data)
-        return table_data[self._column] == computed
+        isnan = table_data[self._column].isna() & computed.isna()
+
+        return table_data[self._column].eq(computed) | isnan
 
     def _transform(self, table_data):
         """Transform the table data.

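A minimal sketch of the datetime fix, mirroring `test_is_valid_high_is_datetime` added below: a `pandas.Timestamp` passed as the scalar bound is now converted to `datetime64`, so comparing it against a datetime column no longer raises `TypeError`. The import path is assumed to be the public `sdv.constraints` one:

from datetime import datetime

import pandas as pd

from sdv.constraints import GreaterThan

high_dt = pd.to_datetime('8/31/2021')
constraint = GreaterThan(low='a', high=high_dt, strict=False, scalar='high')

table_data = pd.DataFrame({
    'a': [datetime(2020, 5, 17), datetime(2020, 2, 1), datetime(2021, 9, 1)],
    'b': [4, 2, 2],
})

# Expected output per the test added below: [True, True, False]
print(constraint.is_valid(table_data).tolist())
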
sdv/metadata/table.py

Lines changed: 3 additions & 0 deletions
@@ -769,6 +769,9 @@ def from_dict(cls, metadata_dict, dtype_transformers=None):
             entity_columns=metadata_dict.get('entity_columns') or [],
             context_columns=metadata_dict.get('context_columns') or [],
             dtype_transformers=dtype_transformers,
+            min_value=metadata_dict.get('min_value', 'auto'),
+            max_value=metadata_dict.get('max_value', 'auto'),
+            rounding=metadata_dict.get('rounding', 'auto'),
         )
         instance._fields_metadata = fields
         return instance

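A hedged sketch of the round-trip this change fixes: numerical bounds and rounding settings stored in a metadata dict are now forwarded to the `Table` constructor by `from_dict` instead of silently falling back to defaults. The field specification shown is illustrative; only the three lowest keys come from this diff:

from sdv.metadata import Table

metadata_dict = {
    'fields': {
        'amount': {'type': 'numerical', 'subtype': 'float'},
    },
    # Previously ignored by ``Table.from_dict``; now passed through to ``Table(...)``.
    'min_value': 'auto',
    'max_value': 'auto',
    'rounding': 'auto',
}

table_metadata = Table.from_dict(metadata_dict)
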
setup.cfg

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.12.0
+current_version = 0.12.1.dev1
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?

setup.py

Lines changed: 7 additions & 3 deletions
@@ -20,11 +20,14 @@
     'tqdm>=4.14,<5',
     'copulas>=0.5.0,<0.6',
     'ctgan>=0.4.3,<0.5',
-    'deepecho>=0.2.0,<0.3',
-    'rdt>=0.5.0,<0.6',
+    'deepecho>=0.2.1,<0.3',
+    'rdt>=0.5.3,<0.6',
     'sdmetrics>=0.3.1,<0.4',
     'torchvision>=0.5.0,<1',
     'sktime>=0.4,<0.6',
+]
+
+pomegranate_requires = [
     'pomegranate>=0.13.4,<0.14.2',
 ]
 
@@ -93,6 +96,7 @@
     extras_require={
         'test': tests_require,
         'dev': development_requires + tests_require,
+        'pomegranate': pomegranate_requires,
     },
     include_package_data=True,
     install_requires=install_requires,
@@ -107,6 +111,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/sdv-dev/SDV',
-    version='0.12.0',
+    version='0.12.1.dev1',
     zip_safe=False,
 )

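With `pomegranate` moved out of `install_requires` and into its own extras group, a plain `pip install sdv` presumably no longer pulls it in; users who need the pomegranate-dependent functionality would request the extra explicitly, e.g. `pip install sdv[pomegranate]` (standard setuptools extras syntax).
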
tasks.py

Lines changed: 4 additions & 2 deletions
@@ -23,15 +23,17 @@ def install_minimum(c):
     for line in lines:
         if started:
             if line == ']':
-                break
+                started = False
+                continue
 
             line = line.strip()
             line = re.sub(r',?<=?[\d.]*,?', '', line)
             line = re.sub(r'>=?', '==', line)
             line = re.sub(r"""['",]""", '', line)
             versions.append(line)
 
-        elif line.startswith('install_requires = ['):
+        elif (line.startswith('install_requires = [') or
+              line.startswith('pomegranate_requires = [')):
             started = True
 
     c.run(f'python -m pip install {" ".join(versions)}')

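An illustrative standalone sketch (the `to_minimum` helper is hypothetical, not part of the repo) of what the regex substitutions above do to a single requirement line: the upper bound is dropped and `>=` is turned into `==`, pinning the minimum supported version:

import re


def to_minimum(line):
    """Reduce one setup.py requirement line to its minimum version pin."""
    line = line.strip()
    line = re.sub(r',?<=?[\d.]*,?', '', line)   # drop the '<' / '<=' upper bound
    line = re.sub(r'>=?', '==', line)           # pin '>=' (or '>') to '=='
    line = re.sub(r"""['",]""", '', line)       # strip quotes and commas
    return line


print(to_minimum("    'rdt>=0.5.3,<0.6',"))              # rdt==0.5.3
print(to_minimum("    'pomegranate>=0.13.4,<0.14.2',"))  # pomegranate==0.13.4
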
tests/unit/constraints/test_base.py

Lines changed: 32 additions & 0 deletions
@@ -639,6 +639,38 @@ def test_filter_valid(self):
         })
         pd.testing.assert_frame_equal(expected_out, out)
 
+    def test_filter_valid_with_invalid_index(self):
+        """Test the ``Constraint.filter_valid`` method.
+
+        Tests when the is_valid method returns a Series with an invalid index.
+
+        Note: `is_valid.index` can be [0, 1, 5] if, for example, the Series is a subset
+        of an original table with 10 rows, but only rows 0/1/5 were selected.
+
+        Input:
+        - Table data (pandas.DataFrame)
+        Output:
+        - Table data, with only the valid rows (pandas.DataFrame)
+        """
+        # Setup
+        table_data = pd.DataFrame({
+            'a': [1, 2, 3]
+        })
+
+        constraint_mock = Mock()
+        is_valid = pd.Series([True, True, False])
+        is_valid.index = [0, 1, 5]
+        constraint_mock.is_valid.return_value = is_valid
+
+        # Run
+        out = Constraint.filter_valid(constraint_mock, table_data)
+
+        # Assert
+        expected_out = pd.DataFrame({
+            'a': [1, 2]
+        })
+        pd.testing.assert_frame_equal(expected_out, out)
+
     def test_from_dict_fqn(self):
         """Test the ``Constraint.from_dict`` method passing a FQN.

tests/unit/constraints/test_tabular.py

Lines changed: 140 additions & 0 deletions
@@ -1,6 +1,7 @@
 """Tests for the sdv.constraints.tabular module."""
 
 import uuid
+from datetime import datetime
 from unittest.mock import Mock
 
 import numpy as np
@@ -2109,12 +2110,121 @@ def test_is_valid_scalar_is_none_multi_column(self):
             'b': [4, 2, 2],
             'c': [7, 8, 9]
         })
+
+        # Run
         out = instance.is_valid(table_data)
 
         # Assert
         expected_out = [False, True, True]
         np.testing.assert_array_equal(expected_out, out)
 
+    def test_is_valid_high_is_datetime(self):
+        """Test the ``GreaterThan.is_valid`` method.
+
+        If high is a datetime and low is a column,
+        the values in that column should all be lower than
+        ``instance._high``.
+
+        Input:
+        - Table with values above and below `high`.
+        Output:
+        - True should be returned for the rows where the low
+        column is below `high`.
+        """
+        # Setup
+        high_dt = pd.to_datetime('8/31/2021')
+        instance = GreaterThan(low='a', high=high_dt, strict=False, scalar='high')
+        table_data = pd.DataFrame({
+            'a': [datetime(2020, 5, 17), datetime(2020, 2, 1), datetime(2021, 9, 1)],
+            'b': [4, 2, 2],
+        })
+
+        # Run
+        out = instance.is_valid(table_data)
+
+        # Assert
+        expected_out = [True, True, False]
+        np.testing.assert_array_equal(expected_out, out)
+
+    def test_is_valid_low_is_datetime(self):
+        """Test the ``GreaterThan.is_valid`` method.
+
+        If low is a datetime and high is a column,
+        the values in that column should all be higher than
+        ``instance._low``.
+
+        Input:
+        - Table with values above and below `low`.
+        Output:
+        - True should be returned for the rows where the high
+        column is above `low`.
+        """
+        # Setup
+        low_dt = pd.to_datetime('8/31/2021')
+        instance = GreaterThan(low=low_dt, high='a', strict=False, scalar='low')
+        table_data = pd.DataFrame({
+            'a': [datetime(2021, 9, 17), datetime(2021, 7, 1), datetime(2021, 9, 1)],
+            'b': [4, 2, 2],
+        })
+
+        # Run
+        out = instance.is_valid(table_data)
+
+        # Assert
+        expected_out = [True, False, True]
+        np.testing.assert_array_equal(expected_out, out)
+
+    def test_is_valid_two_cols_with_nans(self):
+        """Test the ``GreaterThan.is_valid`` method with nan values.
+
+        If there is a NaN row, expect that `is_valid` returns True.
+
+        Input:
+        - Table with a NaN row
+        Output:
+        - True should be returned for the NaN row.
+        """
+        # Setup
+        instance = GreaterThan(low='a', high='b', strict=True)
+
+        # Run
+        table_data = pd.DataFrame({
+            'a': [1, None, 3],
+            'b': [4, None, 2],
+            'c': [7, 8, 9]
+        })
+        out = instance.is_valid(table_data)
+
+        # Assert
+        expected_out = [True, True, False]
+        np.testing.assert_array_equal(expected_out, out)
+
+    def test_is_valid_two_cols_with_one_nan(self):
+        """Test the ``GreaterThan.is_valid`` method with nan values.
+
+        If there is a row in which we compare one NaN value with one
+        non-NaN value, expect that `is_valid` returns True.
+
+        Input:
+        - Table with a row that contains only one NaN value.
+        Output:
+        - True should be returned for the row with the NaN value.
+        """
+        # Setup
+        instance = GreaterThan(low='a', high='b', strict=True)
+
+        # Run
+        table_data = pd.DataFrame({
+            'a': [1, None, 3],
+            'b': [4, 5, 2],
+            'c': [7, 8, 9]
+        })
+        out = instance.is_valid(table_data)
+
+        # Assert
+        expected_out = [True, True, False]
+        np.testing.assert_array_equal(expected_out, out)
+
     def test__transform_int_drop_none(self):
         """Test the ``GreaterThan._transform`` method passing a high column of type int.
 
@@ -3205,6 +3315,9 @@ def test__init__drop_true(self):
 
 def new_column(data):
     """Formula to be used for the ``TestColumnFormula`` class."""
+    if data['a'] is None or data['b'] is None:
+        return None
+
     return data['a'] + data['b']
 
 
@@ -3322,6 +3435,33 @@ def test_is_valid_non_valid(self):
         expected_out = pd.Series([False, False, False])
         pd.testing.assert_series_equal(expected_out, out)
 
+    def test_is_valid_with_nans(self):
+        """Test the ``ColumnFormula.is_valid`` method for with a formula that produces nans.
+
+        If the data fulfills the formula, result is a series of ``True`` values.
+
+        Input:
+        - Table data fulfilling the formula (pandas.DataFrame)
+        Output:
+        - Series of ``True`` values (pandas.Series)
+        """
+        # Setup
+        column = 'c'
+        instance = ColumnFormula(column=column, formula=new_column)
+
+        # Run
+        table_data = pd.DataFrame({
+            'a': [1, 2, 3],
+            'b': [4, 5, None],
+            'c': [5, 7, None]
+        })
+        instance = ColumnFormula(column=column, formula=new_column)
+        out = instance.is_valid(table_data)
+
+        # Assert
+        expected_out = pd.Series([True, True, True])
+        pd.testing.assert_series_equal(expected_out, out)
+
     def test__transform(self):
         """Test the ``ColumnFormula._transform`` method.
 
