Skip to content

Commit 7384de0

Browse files
authored
Metadata anonymize doesn't produce the right METADATA_SPEC_VERSION (#2336)
1 parent 10d40fc commit 7384de0

File tree

6 files changed

+177
-7
lines changed

6 files changed

+177
-7
lines changed

sdv/metadata/metadata.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,3 +288,14 @@ def get_table_metadata(self, table_name=None):
288288
table_name = self._handle_table_name(table_name)
289289
table_metadata = super().get_table_metadata(table_name)
290290
return Metadata.load_from_dict(table_metadata.to_dict(), single_table_name=table_name)
291+
292+
def anonymize(self):
293+
"""Anonymize metadata by obfuscating column names.
294+
295+
Returns:
296+
Metadata:
297+
An anonymized Metadata instance.
298+
"""
299+
anonymized_metadata = self._get_anonymized_dict()
300+
301+
return Metadata.load_from_dict(anonymized_metadata)

sdv/metadata/multi_table.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -914,13 +914,7 @@ def get_table_metadata(self, table_name):
914914
self._validate_table_exists(table_name)
915915
return deepcopy(self.tables[table_name])
916916

917-
def anonymize(self):
918-
"""Anonymize metadata by obfuscating column names.
919-
920-
Returns:
921-
MultiTableMetadata:
922-
An anonymized MultiTableMetadata instance.
923-
"""
917+
def _get_anonymized_dict(self):
924918
anonymized_metadata = {'tables': {}, 'relationships': []}
925919
anonymized_table_map = {}
926920
counter = 1
@@ -953,6 +947,17 @@ def anonymize(self):
953947
'parent_primary_key': anonymized_primary_key,
954948
})
955949

950+
return anonymized_metadata
951+
952+
def anonymize(self):
953+
"""Anonymize metadata by obfuscating column names.
954+
955+
Returns:
956+
MultiTableMetadata:
957+
An anonymized MultiTableMetadata instance.
958+
"""
959+
anonymized_metadata = self._get_anonymized_dict()
960+
956961
return MultiTableMetadata.load_from_dict(anonymized_metadata)
957962

958963
def visualize(

tests/integration/metadata/test_metadata.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,3 +462,61 @@ def test_any_metadata_update_multi_table(method, args, kwargs):
462462
assert expected_dict != metadata_before
463463
else:
464464
assert result == ['checkin_date', 'checkout_date']
465+
466+
467+
def test_anonymize():
468+
"""Test the ``anonymize`` method."""
469+
# Setup
470+
metadata_dict = {
471+
'tables': {
472+
'real_table1': {
473+
'columns': {
474+
'table1_primary_key': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
475+
'table1_column2': {'sdtype': 'categorical'},
476+
},
477+
'primary_key': 'table1_primary_key',
478+
},
479+
'real_table2': {
480+
'columns': {
481+
'table2_primary_key': {'sdtype': 'email'},
482+
'table2_foreign_key': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
483+
},
484+
'primary_key': 'table2_primary_key',
485+
},
486+
},
487+
'relationships': [
488+
{
489+
'parent_table_name': 'real_table1',
490+
'parent_primary_key': 'table1_primary_key',
491+
'child_table_name': 'real_table2',
492+
'child_foreign_key': 'table2_foreign_key',
493+
}
494+
],
495+
}
496+
metadata = Metadata.load_from_dict(metadata_dict)
497+
table1_metadata = metadata.tables['real_table1']
498+
table2_metadata = metadata.tables['real_table2']
499+
metadata.validate()
500+
501+
# Run
502+
anonymized = metadata.anonymize()
503+
504+
# Assert
505+
anonymized.validate()
506+
507+
assert anonymized.METADATA_SPEC_VERSION == 'V1'
508+
assert anonymized.tables.keys() == {'table1', 'table2'}
509+
assert len(anonymized.relationships) == len(metadata.relationships)
510+
assert anonymized.relationships[0]['parent_table_name'] == 'table1'
511+
assert anonymized.relationships[0]['child_table_name'] == 'table2'
512+
assert anonymized.relationships[0]['parent_primary_key'] == 'col1'
513+
assert anonymized.relationships[0]['child_foreign_key'] == 'col2'
514+
515+
anon_primary_key_metadata = anonymized.tables['table1'].columns['col1']
516+
assert anon_primary_key_metadata == table1_metadata.columns['table1_primary_key']
517+
518+
anon_foreign_key_metadata = anonymized.tables['table2'].columns['col2']
519+
assert anon_foreign_key_metadata == table2_metadata.columns['table2_foreign_key']
520+
521+
assert anonymized.tables['table1'].to_dict() == table1_metadata.anonymize().to_dict()
522+
assert anonymized.tables['table2'].to_dict() == table2_metadata.anonymize().to_dict()

tests/integration/metadata/test_multi_table.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,7 @@ def test_anonymize():
374374
# Assert
375375
anonymized.validate()
376376

377+
assert anonymized.METADATA_SPEC_VERSION == 'MULTI_TABLE_V1'
377378
assert anonymized.tables.keys() == {'table1', 'table2'}
378379
assert len(anonymized.relationships) == len(metadata.relationships)
379380
assert anonymized.relationships[0]['parent_table_name'] == 'table1'

tests/unit/metadata/test_metadata.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -717,3 +717,18 @@ def test__handle_table_name_with_empty_tables(self):
717717
error_msg = 'Metadata does not contain any tables. No columns can be added.'
718718
with pytest.raises(ValueError, match=error_msg):
719719
instance._handle_table_name(None)
720+
721+
@patch('sdv.metadata.metadata.Metadata.load_from_dict')
722+
def test_anonymize(self, mock_load_from_dict):
723+
"""Test the ``anonymize`` method."""
724+
# Setup
725+
metadata = Metadata()
726+
metadata._get_anonymized_dict = Mock(return_value={})
727+
metadata.load_from_dict = Mock()
728+
729+
# Run
730+
metadata.anonymize()
731+
732+
# Assert
733+
metadata._get_anonymized_dict.assert_called_once()
734+
mock_load_from_dict.assert_called_once_with({})

tests/unit/metadata/test_multi_table.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3160,6 +3160,86 @@ def test_anonymize(self, mock_load):
31603160
'child_foreign_key': 'col2',
31613161
}
31623162

3163+
def test__get_anonymized_dict(self):
3164+
"""Test the ``_get_anonymized_dict`` method."""
3165+
# Setup
3166+
metadata_dict = {
3167+
'tables': {
3168+
'real_table1': {
3169+
'columns': {
3170+
'table1_primary_key': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
3171+
'table1_column2': {'sdtype': 'categorical'},
3172+
},
3173+
'primary_key': 'table1_primary_key',
3174+
},
3175+
'real_table2': {
3176+
'columns': {
3177+
'table2_primary_key': {'sdtype': 'email'},
3178+
'table2_foreign_key': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
3179+
},
3180+
'primary_key': 'table2_primary_key',
3181+
},
3182+
},
3183+
'relationships': [
3184+
{
3185+
'parent_table_name': 'real_table1',
3186+
'parent_primary_key': 'table1_primary_key',
3187+
'child_table_name': 'real_table2',
3188+
'child_foreign_key': 'table2_foreign_key',
3189+
}
3190+
],
3191+
}
3192+
metadata = MultiTableMetadata.load_from_dict(metadata_dict)
3193+
3194+
# Run
3195+
anonymized_dict = metadata._get_anonymized_dict()
3196+
3197+
# Assert
3198+
expected_anonymized_dict = {
3199+
'tables': {
3200+
'table1': {
3201+
'columns': {
3202+
'col1': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
3203+
'col2': {'sdtype': 'categorical'},
3204+
},
3205+
'primary_key': 'col1',
3206+
'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
3207+
},
3208+
'table2': {
3209+
'columns': {
3210+
'col1': {'sdtype': 'email'},
3211+
'col2': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
3212+
},
3213+
'primary_key': 'col1',
3214+
'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
3215+
},
3216+
},
3217+
'relationships': [
3218+
{
3219+
'parent_table_name': 'table1',
3220+
'child_table_name': 'table2',
3221+
'child_foreign_key': 'col2',
3222+
'parent_primary_key': 'col1',
3223+
}
3224+
],
3225+
}
3226+
assert anonymized_dict == expected_anonymized_dict
3227+
3228+
@patch('sdv.metadata.metadata.MultiTableMetadata.load_from_dict')
3229+
def test_anonymize_mock(self, mock_load_from_dict):
3230+
"""Test the ``anonymize`` method."""
3231+
# Setup
3232+
metadata = MultiTableMetadata()
3233+
metadata._get_anonymized_dict = Mock(return_value={})
3234+
metadata.load_from_dict = Mock()
3235+
3236+
# Run
3237+
metadata.anonymize()
3238+
3239+
# Assert
3240+
metadata._get_anonymized_dict.assert_called_once()
3241+
mock_load_from_dict.assert_called_once_with({})
3242+
31633243
def test_update_columns_no_list_error(self):
31643244
"""Test that ``update_columns`` only takes in list and that an error is thrown."""
31653245
# Setup

0 commit comments

Comments
 (0)