From 1507beaf3aa8ef93dcb09e518da4c0963ceeb2e1 Mon Sep 17 00:00:00 2001 From: bamader <49412165+bamader@users.noreply.github.com> Date: Fri, 7 Feb 2025 16:09:42 -0500 Subject: [PATCH] Appropriately handle race unknown (#196) ## Description This PR removes the `feature_iter` yield for the `RACE` field whenever an incoming record has a value for that field of `UNKNOWN` or `ASKED_UNKNOWN`. This ensures that, downstream, we don't perform fuzzy string comparisons between known race values and `UNKNOWN`, which would award log-odds points where none should be awarded. ## Related Issues #193 ## Additional Notes n/a <--------------------- REMOVE THE LINES BELOW BEFORE MERGING ---------------------> ## Checklist Please review and complete the following checklist before submitting your pull request: - [x] I have ensured that the pull request is of a manageable size, allowing it to be reviewed within a single session. - [x] I have reviewed my changes to ensure they are clear, concise, and well-documented. - [x] I have updated the documentation, if applicable. - [x] I have added or updated test cases to cover my changes, if applicable. - [x] I have minimized the number of reviewers to include only those essential for the review. ## Checklist for Reviewers Please review and complete the following checklist during the review process: - [ ] The code follows best practices and conventions. - [ ] The changes implement the desired functionality or fix the reported issue. - [ ] The tests cover the new changes and pass successfully. - [ ] Any potential edge cases or error scenarios have been considered.
--- src/recordlinker/schemas/pii.py | 2 +- tests/unit/schemas/test_pii.py | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/recordlinker/schemas/pii.py b/src/recordlinker/schemas/pii.py index c2ff16a9..d27ff50b 100644 --- a/src/recordlinker/schemas/pii.py +++ b/src/recordlinker/schemas/pii.py @@ -347,7 +347,7 @@ def feature_iter(self, feature: Feature) -> typing.Iterator[str]: if name.family: yield name.family elif attribute == FeatureAttribute.RACE: - if self.race: + if self.race and self.race not in [Race.UNKNOWN, Race.ASKED_UNKNOWN]: yield str(self.race) elif attribute == FeatureAttribute.TELECOM: for telecom in self.telecom: diff --git a/tests/unit/schemas/test_pii.py b/tests/unit/schemas/test_pii.py index cd61be2a..36525db0 100644 --- a/tests/unit/schemas/test_pii.py +++ b/tests/unit/schemas/test_pii.py @@ -148,6 +148,8 @@ def test_parse_race(self): assert record.race == pii.Race.BLACK record = pii.PIIRecord(race="native hawaiian or other pacific islander") assert record.race == pii.Race.HAWAIIAN + record = pii.PIIRecord(race="asked unknown") + assert record.race == pii.Race.ASKED_UNKNOWN record = pii.PIIRecord(race="asked but unknown") assert record.race == pii.Race.ASKED_UNKNOWN record = pii.PIIRecord(race="unknown") @@ -242,7 +244,7 @@ def test_feature_iter(self): assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.GIVEN_NAME))) == ["John", "L", "Jane"] assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.FIRST_NAME))) == ["John", "Jane"] assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.LAST_NAME))) == ["Doe", "Smith"] - assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == ["UNKNOWN"] + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == [] assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.TELECOM))) == [ "555-123-4567", "(555) 987-6543", @@ -257,6 +259,18 @@ 
def test_feature_iter(self): # IDENTIFIER with suffix assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER, suffix="MR"))) == ["MR::123456"] assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER, suffix="SS"))) == ["SS::123-45-6789"] + + # Other fields work okay; a few more checks on different race yield values + record = pii.PIIRecord(race="asked unknown") + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == [] + record = pii.PIIRecord(race="asked but unknown") + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == [] + record = pii.PIIRecord(race="asian") + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == ["ASIAN"] + record = pii.PIIRecord(race="african american") + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == ["BLACK"] + record = pii.PIIRecord(race="white") + assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == ["WHITE"] def test_blocking_keys_invalid(self):