Skip to content

Commit

Permalink
Appropriately handle race unknown (#196)
Browse files Browse the repository at this point in the history
## Description
This PR stops `feature_iter` from yielding a value for the `RACE` field
whenever an incoming record's value for that field is `UNKNOWN` or
`ASKED_UNKNOWN`. This ensures that, downstream, we don't perform fuzzy
string comparisons between known race values and `UNKNOWN`, and thereby
award log-odds points where none should be awarded.

## Related Issues
#193 

## Additional Notes
n/a
<!-- REMOVE THE LINES BELOW BEFORE MERGING -->

## Checklist
Please review and complete the following checklist before submitting
your pull request:

- [x] I have ensured that the pull request is of a manageable size,
allowing it to be reviewed within a single session.
- [x] I have reviewed my changes to ensure they are clear, concise, and
well-documented.
- [x] I have updated the documentation, if applicable.
- [x] I have added or updated test cases to cover my changes, if
applicable.
- [x] I have minimized the number of reviewers to include only those
essential for the review.

## Checklist for Reviewers
Please review and complete the following checklist during the review
process:

- [ ] The code follows best practices and conventions.
- [ ] The changes implement the desired functionality or fix the
reported issue.
- [ ] The tests cover the new changes and pass successfully.
- [ ] Any potential edge cases or error scenarios have been considered.
  • Loading branch information
bamader authored Feb 7, 2025
1 parent 079bab7 commit 1507bea
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/recordlinker/schemas/pii.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ def feature_iter(self, feature: Feature) -> typing.Iterator[str]:
if name.family:
yield name.family
elif attribute == FeatureAttribute.RACE:
if self.race:
if self.race and self.race not in [Race.UNKNOWN, Race.ASKED_UNKNOWN]:
yield str(self.race)
elif attribute == FeatureAttribute.TELECOM:
for telecom in self.telecom:
Expand Down
16 changes: 15 additions & 1 deletion tests/unit/schemas/test_pii.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ def test_parse_race(self):
assert record.race == pii.Race.BLACK
record = pii.PIIRecord(race="native hawaiian or other pacific islander")
assert record.race == pii.Race.HAWAIIAN
record = pii.PIIRecord(race="asked unknown")
assert record.race == pii.Race.ASKED_UNKNOWN
record = pii.PIIRecord(race="asked but unknown")
assert record.race == pii.Race.ASKED_UNKNOWN
record = pii.PIIRecord(race="unknown")
Expand Down Expand Up @@ -242,7 +244,7 @@ def test_feature_iter(self):
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.GIVEN_NAME))) == ["John", "L", "Jane"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.FIRST_NAME))) == ["John", "Jane"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.LAST_NAME))) == ["Doe", "Smith"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == ["UNKNOWN"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == []
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.TELECOM))) == [
"555-123-4567",
"(555) 987-6543",
Expand All @@ -257,6 +259,18 @@ def test_feature_iter(self):
# IDENTIFIER with suffix
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER, suffix="MR"))) == ["MR::123456"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER, suffix="SS"))) == ["SS::123-45-6789"]

# Other fields work okay; a few more checks on different race yield values
record = pii.PIIRecord(race="asked unknown")
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == []
record = pii.PIIRecord(race="asked but unknown")
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == []
record = pii.PIIRecord(race="asian")
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == ["ASIAN"]
record = pii.PIIRecord(race="african american")
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == ["BLACK"]
record = pii.PIIRecord(race="white")
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == ["WHITE"]


def test_blocking_keys_invalid(self):
Expand Down

0 comments on commit 1507bea

Please sign in to comment.