From ba4c2ae50d32ab2ebd100940764fca05cbd82bee Mon Sep 17 00:00:00 2001 From: Marcelle <53578688+m-goggins@users.noreply.github.com> Date: Mon, 10 Feb 2025 14:24:13 -0800 Subject: [PATCH] Reverse identifiers for comparisons (#211) ## Description Since Jaro-Winkler is the default similarity measure for string comparisons, more weight is given to differences at the start of a string. As such, this PR modifies the way that we yield and store Identifiers for comparison. Previously, the Identifier format was `type:authority:value` but now it is `value:authority:type` to give more weight to the value. For example, when comparing two MRNs with values, `1234` and `2345`, we will now give more weight to `1234` vs `2345` than the type, e.g., `MR` vs `MR`. This should yield better differentiation between strings. ## Related Issues Fixes #200 ## Additional Notes I updated the language in the RFC to reflect the new format and added a note at the bottom to indicate that the RFC has been changed. --- src/recordlinker/schemas/pii.py | 45 ++++++--- tests/unit/schemas/test_pii.py | 169 +++++++++++++++++++++++--------- 2 files changed, 152 insertions(+), 62 deletions(-) diff --git a/src/recordlinker/schemas/pii.py b/src/recordlinker/schemas/pii.py index d27ff50b..040d5c95 100644 --- a/src/recordlinker/schemas/pii.py +++ b/src/recordlinker/schemas/pii.py @@ -39,8 +39,9 @@ def __str__(self): """ Return the value of the enum as a string. """ - return self.value - + return self.value + + class Feature(pydantic.BaseModel): """ The schema for a feature. @@ -89,6 +90,7 @@ def all_options(cls) -> list[typing.Any]: options.append(f"{feature}:{identifier}") return options + class Sex(enum.Enum): """ Enum for the Patient.sex field. @@ -190,6 +192,7 @@ def email(self) -> str | None: return None return self.value + class PIIRecord(pydantic.BaseModel): """ The schema for a PII record. @@ -224,7 +227,7 @@ def model_construct( obj.name = [Name.model_construct(**n) for n in values.get("name", [])] obj.telecom = [Telecom.model_construct(**t) for t in values.get("telecom", [])] obj.identifiers = [Identifier.model_construct(**i) for i in values.get("identifiers", [])] - + return obj @pydantic.field_validator("external_id", mode="before") @@ -302,7 +305,7 @@ def feature_iter(self, feature: Feature) -> typing.Iterator[str]: if not isinstance(feature, Feature): raise ValueError(f"Invalid feature: {feature}") - + attribute = feature.attribute identifier_suffix = feature.suffix @@ -375,7 +378,7 @@ def feature_iter(self, feature: Feature) -> typing.Iterator[str]: elif attribute == FeatureAttribute.IDENTIFIER: for identifier in self.identifiers: if identifier_suffix is None or identifier_suffix == identifier.type: - yield f"{identifier.type}:{identifier.authority or ''}:{identifier.value}" + yield f"{identifier.value}:{identifier.authority or ''}:{identifier.type}" def blocking_keys(self, key: models.BlockingKey) -> set[str]: """ @@ -392,25 +395,37 @@ def blocking_keys(self, key: models.BlockingKey) -> set[str]: # NOTE: we could optimize here and remove the dashes from the date vals.update(self.feature_iter(Feature(attribute=FeatureAttribute.BIRTHDATE))) elif key == models.BlockingKey.IDENTIFIER: - vals.update({ - f"{type_part}:{authority_part[:2]}:{value_part[-4:]}" - for x in self.feature_iter(Feature(attribute=FeatureAttribute.IDENTIFIER)) - for type_part, authority_part, value_part in [x.split(":", 2)] - }) + vals.update( + { + f"{value_part[-4:]}:{authority_part[:2]}:{type_part}" + for x in self.feature_iter(Feature(attribute=FeatureAttribute.IDENTIFIER)) + for value_part, authority_part, type_part in [x.split(":", 2)] + } + ) elif key == models.BlockingKey.SEX: vals.update(self.feature_iter(Feature(attribute=FeatureAttribute.SEX))) elif key == models.BlockingKey.ZIP: vals.update(self.feature_iter(Feature(attribute=FeatureAttribute.ZIP))) elif key == models.BlockingKey.FIRST_NAME: - vals.update({x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.FIRST_NAME))}) + vals.update( + {x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.FIRST_NAME))} + ) elif key == models.BlockingKey.LAST_NAME: - vals.update({x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.LAST_NAME))}) + vals.update( + {x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.LAST_NAME))} + ) elif key == models.BlockingKey.ADDRESS: - vals.update({x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.ADDRESS))}) + vals.update( + {x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.ADDRESS))} + ) elif key == models.BlockingKey.PHONE: - vals.update({x[-4:] for x in self.feature_iter(Feature(attribute=FeatureAttribute.PHONE))}) + vals.update( + {x[-4:] for x in self.feature_iter(Feature(attribute=FeatureAttribute.PHONE))} + ) elif key == models.BlockingKey.EMAIL: - vals.update({x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.EMAIL))}) + vals.update( + {x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.EMAIL))} + ) # if any vals are longer than the BLOCKING_KEY_MAX_LENGTH, raise an error if any(len(x) > models.BLOCKING_VALUE_MAX_LENGTH for x in vals): diff --git a/tests/unit/schemas/test_pii.py b/tests/unit/schemas/test_pii.py index 36525db0..41db8699 100644 --- a/tests/unit/schemas/test_pii.py +++ b/tests/unit/schemas/test_pii.py @@ -52,8 +52,8 @@ def test_model_construct(self): "type": "DL", "value": "D1234567", "authority": "VA", - } - ] + }, + ], } record = pii.PIIRecord.model_construct(**data) assert record.birth_date == "1980-2-1" @@ -72,7 +72,7 @@ def test_model_construct(self): assert record.address[1].postal_code == "98765-4321" assert record.address[1].county == "county2" - #identifiers + # identifiers assert str(record.identifiers[0].type) == "MR" assert record.identifiers[0].value == "99" @@ -129,14 +129,14 @@ def test_parse_sex(self): def test_parse_ssn(self): record = pii.PIIRecord(identifiers=[pii.Identifier(type="SS", value="123-45-6789")]) assert record.identifiers[0].value == "123-45-6789" - #testing extra spaces + # testing extra spaces record = pii.PIIRecord(identifiers=[pii.Identifier(type="SS", value=" 123-45-6789 ")]) assert record.identifiers[0].value == "123-45-6789" - #testing no dashes + # testing no dashes record = pii.PIIRecord(identifiers=[pii.Identifier(type="SS", value="123456789")]) assert record.identifiers[0].value == "123-45-6789" record = pii.PIIRecord(identifiers=[pii.Identifier(type="SS", value="1-2-3")]) - assert record.identifiers[0].value == '' + assert record.identifiers[0].value == "" record = pii.PIIRecord() assert record.identifiers == [] @@ -228,37 +228,73 @@ def test_feature_iter(self): "type": "DL", "value": "D1234567", "authority": "VA", - } + }, ], ) with pytest.raises(ValueError): list(record.feature_iter("external_id")) - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.BIRTHDATE))) == ["1980-02-01"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.BIRTHDATE))) == [ + "1980-02-01" + ] assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.SEX))) == ["M"] - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.ADDRESS))) == ["123 Main St", "456 Elm St"] - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.CITY))) == ["Anytown", "Somecity"] - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.STATE))) == ["NY", "CA"] - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.ZIP))) == ["12345", "98765"] - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.GIVEN_NAME))) == ["John", "L", "Jane"] - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.FIRST_NAME))) == ["John", "Jane"] - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.LAST_NAME))) == ["Doe", "Smith"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.ADDRESS))) == [ + "123 Main St", + "456 Elm St", + ] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.CITY))) == [ + "Anytown", + "Somecity", + ] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.STATE))) == [ + "NY", + "CA", + ] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.ZIP))) == [ + "12345", + "98765", + ] + assert list( + record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.GIVEN_NAME)) + ) == ["John", "L", "Jane"] + assert list( + record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.FIRST_NAME)) + ) == ["John", "Jane"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.LAST_NAME))) == [ + "Doe", + "Smith", + ] assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == [] assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.TELECOM))) == [ "555-123-4567", "(555) 987-6543", "test@email.com", ] - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.PHONE))) == ["5559876543"] - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.EMAIL))) == ["test@email.com"] - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.SUFFIX))) == ["suffix", "suffix2"] - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.COUNTY))) == ["county"] - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER))) == ["MR::123456", "SS::123-45-6789", "DL:VA:D1234567"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.PHONE))) == [ + "5559876543" + ] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.EMAIL))) == [ + "test@email.com" + ] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.SUFFIX))) == [ + "suffix", + "suffix2", + ] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.COUNTY))) == [ + "county" + ] + assert list( + record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER)) + ) == ["123456::MR", "123-45-6789::SS", "D1234567:VA:DL"] # IDENTIFIER with suffix - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER, suffix="MR"))) == ["MR::123456"] - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER, suffix="SS"))) == ["SS::123-45-6789"] + assert list( + record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER, suffix="MR")) + ) == ["123456::MR"] + assert list( + record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER, suffix="SS")) + ) == ["123-45-6789::SS"] # Other fields work okay, few more checks on difference race yield values record = pii.PIIRecord(race="asked unknown") @@ -266,12 +302,17 @@ def test_feature_iter(self): record = pii.PIIRecord(race="asked but unknown") assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == [] record = pii.PIIRecord(race="asian") - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == ["ASIAN"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == [ + "ASIAN" + ] record = pii.PIIRecord(race="african american") - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == ["BLACK"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == [ + "BLACK" + ] record = pii.PIIRecord(race="white") - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == ["WHITE"] - + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == [ + "WHITE" + ] def test_blocking_keys_invalid(self): rec = pii.PIIRecord() @@ -301,17 +342,19 @@ def test_blocking_keys_mrn_last_four(self): assert rec.blocking_keys(BlockingKey.IDENTIFIER) == set() rec = pii.PIIRecord(**{"identifiers": []}) assert rec.blocking_keys(BlockingKey.IDENTIFIER) == set() - rec = pii.PIIRecord(**{"identifiers": [{ "type": "MR", "value": "123456789" }]}) - assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"MR::6789"} - rec = pii.PIIRecord(**{"identifiers": [{ "type": "MR", "value": "89" }]}) - assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"MR::89"} - - #test multiple identifiers return correctly - rec = pii.PIIRecord(identifiers=[ - pii.Identifier(type="MR", value="123456789"), - pii.Identifier(type="SS", value="123456789"), - ]) - assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"MR::6789", "SS::6789"} + rec = pii.PIIRecord(**{"identifiers": [{"type": "MR", "value": "123456789"}]}) + assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"6789::MR"} + rec = pii.PIIRecord(**{"identifiers": [{"type": "MR", "value": "89"}]}) + assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"89::MR"} + + # test multiple identifiers return correctly + rec = pii.PIIRecord( + identifiers=[ + pii.Identifier(type="MR", value="123456789"), + pii.Identifier(type="SS", value="123456789"), + ] + ) + assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"6789::MR", "6789::SS"} def test_blocking_keys_sex(self): rec = pii.PIIRecord(**{"gender": "M"}) @@ -387,9 +430,23 @@ def test_blocking_keys_phone_last_four(self): assert rec.blocking_keys(BlockingKey.PHONE) == set() rec = pii.PIIRecord(**{"telecom": [{"value": "(555) 123-4567", "system": "phone"}]}) assert rec.blocking_keys(BlockingKey.PHONE) == {"4567"} - rec = pii.PIIRecord(**{"telecom": [{"value": "555.123.4567", "system": "phone"}, {"value": "555-987-6543 ext 123", "system": "phone"}]}) + rec = pii.PIIRecord( + **{ + "telecom": [ + {"value": "555.123.4567", "system": "phone"}, + {"value": "555-987-6543 ext 123", "system": "phone"}, + ] + } + ) assert rec.blocking_keys(BlockingKey.PHONE) == {"4567", "6543"} - rec = pii.PIIRecord(**{"telecom": [{"value": "555.123.4567", "system": "phone"}, {"value": "555-987-6543", "system": "fax"}]}) + rec = pii.PIIRecord( + **{ + "telecom": [ + {"value": "555.123.4567", "system": "phone"}, + {"value": "555-987-6543", "system": "fax"}, + ] + } + ) assert rec.blocking_keys(BlockingKey.PHONE) == {"4567"} def test_blocking_keys_email_first_four(self): @@ -397,20 +454,38 @@ def test_blocking_keys_email_first_four(self): assert rec.blocking_keys(BlockingKey.EMAIL) == set() rec = pii.PIIRecord(**{"telecom": [{"value": "test123@email.com", "system": "email"}]}) assert rec.blocking_keys(BlockingKey.EMAIL) == {"test"} - rec = pii.PIIRecord(**{"telecom": [{"value": "test@email.com", "system": "email"}, {"value": "bob@email.com", "system": "email"}]}) + rec = pii.PIIRecord( + **{ + "telecom": [ + {"value": "test@email.com", "system": "email"}, + {"value": "bob@email.com", "system": "email"}, + ] + } + ) assert rec.blocking_keys(BlockingKey.EMAIL) == {"test", "bob@"} - rec = pii.PIIRecord(**{"telecom": [{"value": "t@gmail.com", "system": "email"}, {"value": "bob@gmail.com", "system": "other"}]}) + rec = pii.PIIRecord( + **{ + "telecom": [ + {"value": "t@gmail.com", "system": "email"}, + {"value": "bob@gmail.com", "system": "other"}, + ] + } + ) assert rec.blocking_keys(BlockingKey.EMAIL) == {"t@gm"} def test_blocking_keys_identifier(self): rec = pii.PIIRecord(**{"identifiers": []}) assert rec.blocking_keys(BlockingKey.IDENTIFIER) == set() - rec = pii.PIIRecord(**{"identifiers": [{"type": "MR", "value": "123456789", "authority": "NY"}]}) - assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"MR:NY:6789"} + rec = pii.PIIRecord( + **{"identifiers": [{"type": "MR", "value": "123456789", "authority": "NY"}]} + ) + assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"6789:NY:MR"} - #test only get first 2 characters of authority for blocking - rec = pii.PIIRecord(**{"identifiers": [{"type": "MR", "value": "123456789", "authority": "DMV"}]}) - assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"MR:DM:6789"} + # test only get first 2 characters of authority for blocking + rec = pii.PIIRecord( + **{"identifiers": [{"type": "MR", "value": "123456789", "authority": "DMV"}]} + ) + assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"6789:DM:MR"} def test_blocking_values(self): rec = pii.PIIRecord( @@ -425,7 +500,7 @@ def test_blocking_values(self): if key == BlockingKey.BIRTHDATE: assert val == "1980-01-01" elif key == BlockingKey.IDENTIFIER: - assert val == "MR::3456" + assert val == "3456::MR" elif key == BlockingKey.FIRST_NAME: assert val == "John" elif key == BlockingKey.LAST_NAME: