Skip to content

Commit 7d989b0

Browse files
authored
For RegexGenerator, update the enforce_uniqueness parameter to cardinality_rule (#957)
1 parent bc6a9f3 commit 7d989b0

File tree

7 files changed

+142
-67
lines changed

7 files changed

+142
-67
lines changed

Diff for: rdt/transformers/id.py

+28-11
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,10 @@
77
import pandas as pd
88

99
from rdt.transformers.base import BaseTransformer
10-
from rdt.transformers.utils import strings_from_regex
10+
from rdt.transformers.utils import (
11+
_handle_enforce_uniqueness_and_cardinality_rule,
12+
strings_from_regex,
13+
)
1114

1215
LOGGER = logging.getLogger(__name__)
1316

@@ -82,9 +85,13 @@ class RegexGenerator(BaseTransformer):
8285
regex (str):
8386
String representing the regex function.
8487
enforce_uniqueness (bool):
85-
Whether or not to ensure that the new generated data is all unique. If it isn't
86-
possible to create the requested number of rows, then an error will be raised.
87-
Defaults to ``False``.
88+
**DEPRECATED** Whether or not to ensure that the new generated data is all unique.
89+
If it isn't possible to create the requested number of rows, then an error will
90+
be raised. Defaults to ``None``.
91+
cardinality_rule (str):
92+
Rule that the generated data must follow. If set to ``unique``, the generated
93+
data must be unique. If set to ``None``, then the generated data may contain
94+
duplicates. Defaults to ``None``.
8895
generation_order (str):
8996
String defining how to generate the output. If set to ``alphanumeric``, it will
9097
generate the output in alphanumeric order (ie. 'aaa', 'aab' or '1', '2'...). If
@@ -122,13 +129,16 @@ def __setstate__(self, state):
122129
def __init__(
123130
self,
124131
regex_format='[A-Za-z]{5}',
125-
enforce_uniqueness=False,
132+
cardinality_rule=None,
126133
generation_order='alphanumeric',
134+
enforce_uniqueness=None,
127135
):
128136
super().__init__()
129137
self.output_properties = {None: {'next_transformer': None}}
130-
self.enforce_uniqueness = enforce_uniqueness
131138
self.regex_format = regex_format
139+
self.cardinality_rule = _handle_enforce_uniqueness_and_cardinality_rule(
140+
enforce_uniqueness, cardinality_rule
141+
)
132142
self.data_length = None
133143
self.generator = None
134144
self.generator_size = None
@@ -158,16 +168,18 @@ def _transform(self, _data):
158168
"""Drop the input column by returning ``None``."""
159169
return None
160170

161-
def _warn_not_enough_unique_values(self, sample_size):
171+
def _warn_not_enough_unique_values(self, sample_size, unique_condition):
162172
"""Warn the user that the regex cannot generate enough unique values.
163173
164174
Args:
165175
sample_size (int):
166176
Number of samples to be generated.
177+
unique_condition (bool):
178+
Whether or not to enforce uniqueness.
167179
"""
168180
warned = False
169181
if sample_size > self.generator_size:
170-
if self.enforce_uniqueness:
182+
if unique_condition:
171183
warnings.warn(
172184
f"The regex for '{self.get_input_column()}' can only generate "
173185
f'{self.generator_size} unique values. Additional values may not exactly '
@@ -185,7 +197,7 @@ def _warn_not_enough_unique_values(self, sample_size):
185197
)
186198

187199
remaining = self.generator_size - self.generated
188-
if sample_size > remaining and self.enforce_uniqueness and not warned:
200+
if sample_size > remaining and unique_condition and not warned:
189201
warnings.warn(
190202
f'The regex generator is not able to generate {sample_size} new unique '
191203
f'values (only {max(remaining, 0)} unique values left).'
@@ -201,12 +213,17 @@ def _reverse_transform(self, data):
201213
Returns:
202214
pandas.Series
203215
"""
216+
if hasattr(self, 'cardinality_rule'):
217+
unique_condition = self.cardinality_rule == 'unique'
218+
else:
219+
unique_condition = self.enforce_uniqueness
220+
204221
if data is not None and len(data):
205222
sample_size = len(data)
206223
else:
207224
sample_size = self.data_length
208225

209-
self._warn_not_enough_unique_values(sample_size)
226+
self._warn_not_enough_unique_values(sample_size, unique_condition)
210227

211228
remaining = self.generator_size - self.generated
212229
if sample_size > remaining:
@@ -225,7 +242,7 @@ def _reverse_transform(self, data):
225242
reverse_transformed = generated_values[:]
226243

227244
if len(reverse_transformed) < sample_size:
228-
if self.enforce_uniqueness:
245+
if unique_condition:
229246
try:
230247
remaining_samples = sample_size - len(reverse_transformed)
231248
start = int(generated_values[-1]) + 1

Diff for: rdt/transformers/pii/anonymizer.py

+5-9
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from rdt.errors import TransformerInputError, TransformerProcessingError
1616
from rdt.transformers.base import BaseTransformer
1717
from rdt.transformers.categorical import LabelEncoder
18+
from rdt.transformers.utils import _handle_enforce_uniqueness_and_cardinality_rule
1819

1920

2021
class AnonymizedFaker(BaseTransformer):
@@ -113,22 +114,17 @@ def __init__(
113114
function_kwargs=None,
114115
locales=None,
115116
cardinality_rule=None,
116-
enforce_uniqueness=False,
117+
enforce_uniqueness=None,
117118
missing_value_generation='random',
118119
):
119120
super().__init__()
120121
self._data_cardinality = None
121122
self.data_length = None
122123
self.enforce_uniqueness = enforce_uniqueness
123124
self.cardinality_rule = cardinality_rule.lower() if cardinality_rule else None
124-
if enforce_uniqueness:
125-
warnings.warn(
126-
"The 'enforce_uniqueness' parameter is no longer supported. "
127-
"Please use the 'cardinality_rule' parameter instead.",
128-
FutureWarning,
129-
)
130-
if not self.cardinality_rule:
131-
self.cardinality_rule = 'unique'
125+
self.cardinality_rule = _handle_enforce_uniqueness_and_cardinality_rule(
126+
enforce_uniqueness, cardinality_rule
127+
)
132128

133129
self.provider_name = provider_name if provider_name else 'BaseProvider'
134130
if self.provider_name != 'BaseProvider' and function_name is None:

Diff for: rdt/transformers/utils.py

+14
Original file line numberDiff line numberDiff line change
@@ -367,3 +367,17 @@ def __getitem__(self, sdtype):
367367
phased out.
368368
"""
369369
return self.get(sdtype)
370+
371+
372+
def _handle_enforce_uniqueness_and_cardinality_rule(enforce_uniqueness, cardinality_rule):
    """Resolve the deprecated ``enforce_uniqueness`` flag into a ``cardinality_rule``.

    A ``FutureWarning`` is emitted whenever ``enforce_uniqueness`` is passed explicitly
    (anything other than ``None``), including an explicit ``False``.

    Args:
        enforce_uniqueness (bool or None):
            Deprecated flag. When truthy and no ``cardinality_rule`` is given, the
            resulting rule is ``'unique'``.
        cardinality_rule (str or None):
            Rule requested by the caller. An explicit rule always takes precedence over
            ``enforce_uniqueness``.

    Returns:
        str or None:
            The rule to store on the transformer. String rules are lower-cased so that
            comparisons such as ``cardinality_rule == 'unique'`` are not defeated by
            the caller's capitalisation.
    """
    # Normalise the case here because callers assign the returned value directly to
    # ``self.cardinality_rule``, replacing any normalisation they did beforehand.
    if isinstance(cardinality_rule, str):
        result = cardinality_rule.lower()
    else:
        result = cardinality_rule

    if enforce_uniqueness is not None:
        warnings.warn(
            "The 'enforce_uniqueness' parameter is no longer supported. "
            "Please use the 'cardinality_rule' parameter instead.",
            FutureWarning,
        )
        # Only fall back to the legacy flag when no rule was requested explicitly.
        if enforce_uniqueness and cardinality_rule is None:
            result = 'unique'

    return result

Diff for: tests/integration/transformers/test_id.py

+10-10
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ def test_input_data_bigger_than_data_length(self):
173173
def test_called_multiple_times(self):
174174
"""Test the ``RegexGenerator`` with short regex and called multiple times.
175175
176-
This test ensures that when ``enforce_uniqueness`` is ``False`` this generator will
176+
This test ensures that when ``cardinality_rule`` is ``None`` this generator will
177177
continue to work.
178178
"""
179179
# Setup
@@ -223,11 +223,11 @@ def test_called_multiple_times(self):
223223
})
224224
pd.testing.assert_frame_equal(third_reverse_transform, expected_reverse_transformed)
225225

226-
def test_called_multiple_times_enforce_uniqueness(self):
227-
"""Test that calling multiple times with ``enforce_uniqueness`` returns unique values."""
226+
def test_called_multiple_times_cardinality_rule_unique(self):
227+
"""Test calling multiple times when ``cardinality_rule`` is ``unique``."""
228228
# Setup
229229
data = pd.DataFrame({'my_column': np.arange(10)})
230-
generator = RegexGenerator(enforce_uniqueness=True)
230+
generator = RegexGenerator(cardinality_rule='unique')
231231

232232
# Run
233233
transformed_data = generator.fit_transform(data, 'my_column')
@@ -303,13 +303,13 @@ def test_with_many_possibilities(self):
303303
pd.testing.assert_frame_equal(transformed, expected_transformed)
304304
pd.testing.assert_frame_equal(reverse_transform, expected_reverse_transformed)
305305

306-
def test_enforce_uniqueness_not_enough_values_categorical(self):
307-
"""Test with enforce_uniqueness=True but insufficient regex values."""
306+
def test_cardinality_rule_unique_not_enough_values_categorical(self):
307+
"""Test with cardinality_rule='unique' but insufficient regex values."""
308308
# Setup
309309
data = pd.DataFrame({
310310
'id': [1, 2, 3, 4, 5],
311311
})
312-
instance = RegexGenerator('id_[a-b]{1}', enforce_uniqueness=True)
312+
instance = RegexGenerator('id_[a-b]{1}', cardinality_rule='unique')
313313

314314
# Run
315315
transformed = instance.fit_transform(data, 'id')
@@ -319,13 +319,13 @@ def test_enforce_uniqueness_not_enough_values_categorical(self):
319319
expected = pd.DataFrame({'id': ['id_a', 'id_b', 'id_a(0)', 'id_b(0)', 'id_a(1)']})
320320
pd.testing.assert_frame_equal(reverse_transform, expected)
321321

322-
def test_enforce_uniqueness_not_enough_values_numerical(self):
323-
"""Test with enforce_uniqueness=True but insufficient regex values."""
322+
def test_cardinality_rule_not_enough_values_numerical(self):
323+
"""Test with cardinality_rule='unique' but insufficient regex values."""
324324
# Setup
325325
data = pd.DataFrame({
326326
'id': [1, 2, 3, 4, 5],
327327
})
328-
instance = RegexGenerator('[2-3]{1}', enforce_uniqueness=True)
328+
instance = RegexGenerator('[2-3]{1}', cardinality_rule='unique')
329329

330330
# Run
331331
transformed = instance.fit_transform(data, 'id')

Diff for: tests/unit/transformers/pii/test_anonymizer.py

+7-10
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def test___init___enforce_uniqueness_exists(self):
126126
instance = AnonymizedFaker()
127127

128128
# Assert
129-
assert instance.enforce_uniqueness is False
129+
assert instance.enforce_uniqueness is None
130130

131131
def test__function_cardinality_rule_unique(self):
132132
"""Test that ``_function`` uses the ``faker.unique``.
@@ -367,8 +367,8 @@ def test___init__error_missing_value_generation(self):
367367

368368
@patch('rdt.transformers.pii.anonymizer.faker')
369369
@patch('rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function')
370-
@patch('rdt.transformers.pii.anonymizer.warnings')
371-
def test___init__custom(self, mock_warnings, mock_check_provider_function, mock_faker):
370+
@patch('rdt.transformers.pii.anonymizer._handle_enforce_uniqueness_and_cardinality_rule')
371+
def test___init__custom(self, mock__handle, mock_check_provider_function, mock_faker):
372372
"""Test the instantiation of the transformer with custom parameters.
373373
374374
Test that the transformer can be instantiated with a custom provider and function, and
@@ -390,6 +390,9 @@ def test___init__custom(self, mock_warnings, mock_check_provider_function, mock_
390390
``credit_card_full``.
391391
- the ``instance._function`` is ``instance.faker.credit_card_full``.
392392
"""
393+
# Setup
394+
mock__handle.return_value = 'unique'
395+
393396
# Run
394397
instance = AnonymizedFaker(
395398
provider_name='credit_card',
@@ -407,13 +410,7 @@ def test___init__custom(self, mock_warnings, mock_check_provider_function, mock_
407410
assert instance.locales == ['en_US', 'fr_FR']
408411
mock_faker.Faker.assert_called_once_with(['en_US', 'fr_FR'])
409412
assert instance.cardinality_rule == 'unique'
410-
mock_warnings.warn.assert_has_calls([
411-
call(
412-
"The 'enforce_uniqueness' parameter is no longer supported. "
413-
"Please use the 'cardinality_rule' parameter instead.",
414-
FutureWarning,
415-
)
416-
])
413+
mock__handle.assert_called_once_with(True, None)
417414

418415
def test___init__no_function_name(self):
419416
"""Test the instantiation of the transformer with custom parameters.

0 commit comments

Comments
 (0)