Skip to content

Commit

Permalink
Merge branch 'main' into fix/199-concate-given-name
Browse files Browse the repository at this point in the history
  • Loading branch information
ericbuckley committed Feb 11, 2025
2 parents 8e662cd + ba4c2ae commit 24617cb
Show file tree
Hide file tree
Showing 2 changed files with 158 additions and 64 deletions.
45 changes: 30 additions & 15 deletions src/recordlinker/schemas/pii.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ def __str__(self):
"""
Return the value of the enum as a string.
"""
return self.value

return self.value


class Feature(pydantic.BaseModel):
"""
The schema for a feature.
Expand Down Expand Up @@ -89,6 +90,7 @@ def all_options(cls) -> list[typing.Any]:
options.append(f"{feature}:{identifier}")
return options


class Sex(enum.Enum):
"""
Enum for the Patient.sex field.
Expand Down Expand Up @@ -190,6 +192,7 @@ def email(self) -> str | None:
return None
return self.value


class PIIRecord(pydantic.BaseModel):
"""
The schema for a PII record.
Expand Down Expand Up @@ -224,7 +227,7 @@ def model_construct(
obj.name = [Name.model_construct(**n) for n in values.get("name", [])]
obj.telecom = [Telecom.model_construct(**t) for t in values.get("telecom", [])]
obj.identifiers = [Identifier.model_construct(**i) for i in values.get("identifiers", [])]

return obj

@pydantic.field_validator("external_id", mode="before")
Expand Down Expand Up @@ -302,7 +305,7 @@ def feature_iter(self, feature: Feature) -> typing.Iterator[str]:

if not isinstance(feature, Feature):
raise ValueError(f"Invalid feature: {feature}")

attribute = feature.attribute
identifier_suffix = feature.suffix

Expand Down Expand Up @@ -374,7 +377,7 @@ def feature_iter(self, feature: Feature) -> typing.Iterator[str]:
elif attribute == FeatureAttribute.IDENTIFIER:
for identifier in self.identifiers:
if identifier_suffix is None or identifier_suffix == identifier.type:
yield f"{identifier.type}:{identifier.authority or ''}:{identifier.value}"
yield f"{identifier.value}:{identifier.authority or ''}:{identifier.type}"

def blocking_keys(self, key: models.BlockingKey) -> set[str]:
"""
Expand All @@ -391,25 +394,37 @@ def blocking_keys(self, key: models.BlockingKey) -> set[str]:
# NOTE: we could optimize here and remove the dashes from the date
vals.update(self.feature_iter(Feature(attribute=FeatureAttribute.BIRTHDATE)))
elif key == models.BlockingKey.IDENTIFIER:
vals.update({
f"{type_part}:{authority_part[:2]}:{value_part[-4:]}"
for x in self.feature_iter(Feature(attribute=FeatureAttribute.IDENTIFIER))
for type_part, authority_part, value_part in [x.split(":", 2)]
})
vals.update(
{
f"{value_part[-4:]}:{authority_part[:2]}:{type_part}"
for x in self.feature_iter(Feature(attribute=FeatureAttribute.IDENTIFIER))
for value_part, authority_part, type_part in [x.split(":", 2)]
}
)
elif key == models.BlockingKey.SEX:
vals.update(self.feature_iter(Feature(attribute=FeatureAttribute.SEX)))
elif key == models.BlockingKey.ZIP:
vals.update(self.feature_iter(Feature(attribute=FeatureAttribute.ZIP)))
elif key == models.BlockingKey.FIRST_NAME:
vals.update({x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.FIRST_NAME))})
vals.update(
{x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.FIRST_NAME))}
)
elif key == models.BlockingKey.LAST_NAME:
vals.update({x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.LAST_NAME))})
vals.update(
{x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.LAST_NAME))}
)
elif key == models.BlockingKey.ADDRESS:
vals.update({x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.ADDRESS))})
vals.update(
{x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.ADDRESS))}
)
elif key == models.BlockingKey.PHONE:
vals.update({x[-4:] for x in self.feature_iter(Feature(attribute=FeatureAttribute.PHONE))})
vals.update(
{x[-4:] for x in self.feature_iter(Feature(attribute=FeatureAttribute.PHONE))}
)
elif key == models.BlockingKey.EMAIL:
vals.update({x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.EMAIL))})
vals.update(
{x[:4] for x in self.feature_iter(Feature(attribute=FeatureAttribute.EMAIL))}
)

# if any vals are longer than the BLOCKING_KEY_MAX_LENGTH, raise an error
if any(len(x) > models.BLOCKING_VALUE_MAX_LENGTH for x in vals):
Expand Down
177 changes: 128 additions & 49 deletions tests/unit/schemas/test_pii.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ def test_model_construct(self):
"type": "DL",
"value": "D1234567",
"authority": "VA",
}
]
},
],
}
record = pii.PIIRecord.model_construct(**data)
assert record.birth_date == "1980-2-1"
Expand All @@ -72,7 +72,7 @@ def test_model_construct(self):
assert record.address[1].postal_code == "98765-4321"
assert record.address[1].county == "county2"

#identifiers
# identifiers
assert str(record.identifiers[0].type) == "MR"
assert record.identifiers[0].value == "99"

Expand Down Expand Up @@ -129,14 +129,14 @@ def test_parse_sex(self):
def test_parse_ssn(self):
record = pii.PIIRecord(identifiers=[pii.Identifier(type="SS", value="123-45-6789")])
assert record.identifiers[0].value == "123-45-6789"
#testing extra spaces
# testing extra spaces
record = pii.PIIRecord(identifiers=[pii.Identifier(type="SS", value=" 123-45-6789 ")])
assert record.identifiers[0].value == "123-45-6789"
#testing no dashes
# testing no dashes
record = pii.PIIRecord(identifiers=[pii.Identifier(type="SS", value="123456789")])
assert record.identifiers[0].value == "123-45-6789"
record = pii.PIIRecord(identifiers=[pii.Identifier(type="SS", value="1-2-3")])
assert record.identifiers[0].value == ''
assert record.identifiers[0].value == ""
record = pii.PIIRecord()
assert record.identifiers == []

Expand Down Expand Up @@ -228,49 +228,91 @@ def test_feature_iter(self):
"type": "DL",
"value": "D1234567",
"authority": "VA",
}
},
],
)

with pytest.raises(ValueError):
list(record.feature_iter("external_id"))

assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.BIRTHDATE))) == ["1980-02-01"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.BIRTHDATE))) == [
"1980-02-01"
]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.SEX))) == ["M"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.ADDRESS))) == ["123 Main St", "456 Elm St"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.CITY))) == ["Anytown", "Somecity"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.STATE))) == ["NY", "CA"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.ZIP))) == ["12345", "98765"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.GIVEN_NAME))) == ["John L", "Jane"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.FIRST_NAME))) == ["John", "Jane"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.LAST_NAME))) == ["Doe", "Smith"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.ADDRESS))) == [
"123 Main St",
"456 Elm St",
]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.CITY))) == [
"Anytown",
"Somecity",
]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.STATE))) == [
"NY",
"CA",
]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.ZIP))) == [
"12345",
"98765",
]
assert list(
record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.GIVEN_NAME))
) == ["John L", "Jane"]
assert list(
record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.FIRST_NAME))
) == ["John", "Jane"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.LAST_NAME))) == [
"Doe",
"Smith",
]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == []
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.TELECOM))) == [
"555-123-4567",
"(555) 987-6543",
"[email protected]",
]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.PHONE))) == ["5559876543"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.EMAIL))) == ["[email protected]"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.SUFFIX))) == ["suffix", "suffix2"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.COUNTY))) == ["county"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER))) == ["MR::123456", "SS::123-45-6789", "DL:VA:D1234567"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.PHONE))) == [
"5559876543"
]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.EMAIL))) == [
"[email protected]"
]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.SUFFIX))) == [
"suffix",
"suffix2",
]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.COUNTY))) == [
"county"
]
assert list(
record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER))
) == ["123456::MR", "123-45-6789::SS", "D1234567:VA:DL"]

# IDENTIFIER with suffix
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER, suffix="MR"))) == ["MR::123456"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER, suffix="SS"))) == ["SS::123-45-6789"]
assert list(
record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER, suffix="MR"))
) == ["123456::MR"]
assert list(
record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.IDENTIFIER, suffix="SS"))
) == ["123-45-6789::SS"]

# Other fields work okay, few more checks on difference race yield values
record = pii.PIIRecord(race="asked unknown")
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == []
record = pii.PIIRecord(race="asked but unknown")
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == []
record = pii.PIIRecord(race="asian")
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == ["ASIAN"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == [
"ASIAN"
]
record = pii.PIIRecord(race="african american")
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == ["BLACK"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == [
"BLACK"
]
record = pii.PIIRecord(race="white")
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == ["WHITE"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.RACE))) == [
"WHITE"
]

def test_feature_iter_given_name(self):
record = pii.PIIRecord(
Expand All @@ -280,9 +322,12 @@ def test_feature_iter_given_name(self):
],
)

assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.GIVEN_NAME))) == ["John L", "Jon Lewis Doe"]
assert list(record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.FIRST_NAME))) == ["John", "Jon"]

assert list(
record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.GIVEN_NAME))
) == ["John L", "Jon Lewis Doe"]
assert list(
record.feature_iter(pii.Feature(attribute=pii.FeatureAttribute.FIRST_NAME))
) == ["John", "Jon"]

def test_blocking_keys_invalid(self):
rec = pii.PIIRecord()
Expand Down Expand Up @@ -312,17 +357,19 @@ def test_blocking_keys_mrn_last_four(self):
assert rec.blocking_keys(BlockingKey.IDENTIFIER) == set()
rec = pii.PIIRecord(**{"identifiers": []})
assert rec.blocking_keys(BlockingKey.IDENTIFIER) == set()
rec = pii.PIIRecord(**{"identifiers": [{ "type": "MR", "value": "123456789" }]})
assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"MR::6789"}
rec = pii.PIIRecord(**{"identifiers": [{ "type": "MR", "value": "89" }]})
assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"MR::89"}

#test multiple identifiers return correctly
rec = pii.PIIRecord(identifiers=[
pii.Identifier(type="MR", value="123456789"),
pii.Identifier(type="SS", value="123456789"),
])
assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"MR::6789", "SS::6789"}
rec = pii.PIIRecord(**{"identifiers": [{"type": "MR", "value": "123456789"}]})
assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"6789::MR"}
rec = pii.PIIRecord(**{"identifiers": [{"type": "MR", "value": "89"}]})
assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"89::MR"}

# test multiple identifiers return correctly
rec = pii.PIIRecord(
identifiers=[
pii.Identifier(type="MR", value="123456789"),
pii.Identifier(type="SS", value="123456789"),
]
)
assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"6789::MR", "6789::SS"}

def test_blocking_keys_sex(self):
rec = pii.PIIRecord(**{"gender": "M"})
Expand Down Expand Up @@ -398,30 +445,62 @@ def test_blocking_keys_phone_last_four(self):
assert rec.blocking_keys(BlockingKey.PHONE) == set()
rec = pii.PIIRecord(**{"telecom": [{"value": "(555) 123-4567", "system": "phone"}]})
assert rec.blocking_keys(BlockingKey.PHONE) == {"4567"}
rec = pii.PIIRecord(**{"telecom": [{"value": "555.123.4567", "system": "phone"}, {"value": "555-987-6543 ext 123", "system": "phone"}]})
rec = pii.PIIRecord(
**{
"telecom": [
{"value": "555.123.4567", "system": "phone"},
{"value": "555-987-6543 ext 123", "system": "phone"},
]
}
)
assert rec.blocking_keys(BlockingKey.PHONE) == {"4567", "6543"}
rec = pii.PIIRecord(**{"telecom": [{"value": "555.123.4567", "system": "phone"}, {"value": "555-987-6543", "system": "fax"}]})
rec = pii.PIIRecord(
**{
"telecom": [
{"value": "555.123.4567", "system": "phone"},
{"value": "555-987-6543", "system": "fax"},
]
}
)
assert rec.blocking_keys(BlockingKey.PHONE) == {"4567"}

def test_blocking_keys_email_first_four(self):
rec = pii.PIIRecord(**{"email": "[email protected]"})
assert rec.blocking_keys(BlockingKey.EMAIL) == set()
rec = pii.PIIRecord(**{"telecom": [{"value": "[email protected]", "system": "email"}]})
assert rec.blocking_keys(BlockingKey.EMAIL) == {"test"}
rec = pii.PIIRecord(**{"telecom": [{"value": "[email protected]", "system": "email"}, {"value": "[email protected]", "system": "email"}]})
rec = pii.PIIRecord(
**{
"telecom": [
{"value": "[email protected]", "system": "email"},
{"value": "[email protected]", "system": "email"},
]
}
)
assert rec.blocking_keys(BlockingKey.EMAIL) == {"test", "bob@"}
rec = pii.PIIRecord(**{"telecom": [{"value": "[email protected]", "system": "email"}, {"value": "[email protected]", "system": "other"}]})
rec = pii.PIIRecord(
**{
"telecom": [
{"value": "[email protected]", "system": "email"},
{"value": "[email protected]", "system": "other"},
]
}
)
assert rec.blocking_keys(BlockingKey.EMAIL) == {"t@gm"}

def test_blocking_keys_identifier(self):
rec = pii.PIIRecord(**{"identifiers": []})
assert rec.blocking_keys(BlockingKey.IDENTIFIER) == set()
rec = pii.PIIRecord(**{"identifiers": [{"type": "MR", "value": "123456789", "authority": "NY"}]})
assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"MR:NY:6789"}
rec = pii.PIIRecord(
**{"identifiers": [{"type": "MR", "value": "123456789", "authority": "NY"}]}
)
assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"6789:NY:MR"}

#test only get first 2 characters of authority for blocking
rec = pii.PIIRecord(**{"identifiers": [{"type": "MR", "value": "123456789", "authority": "DMV"}]})
assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"MR:DM:6789"}
# test only get first 2 characters of authority for blocking
rec = pii.PIIRecord(
**{"identifiers": [{"type": "MR", "value": "123456789", "authority": "DMV"}]}
)
assert rec.blocking_keys(BlockingKey.IDENTIFIER) == {"6789:DM:MR"}

def test_blocking_values(self):
rec = pii.PIIRecord(
Expand All @@ -436,7 +515,7 @@ def test_blocking_values(self):
if key == BlockingKey.BIRTHDATE:
assert val == "1980-01-01"
elif key == BlockingKey.IDENTIFIER:
assert val == "MR::3456"
assert val == "3456::MR"
elif key == BlockingKey.FIRST_NAME:
assert val == "John"
elif key == BlockingKey.LAST_NAME:
Expand Down

0 comments on commit 24617cb

Please sign in to comment.