Skip to content

Commit

Permalink
Create function for probabilistic exact match (#209)
Browse files Browse the repository at this point in the history
## Description
This PR adds a matcher function for evaluating a probabilistic exact
match. For a given feature field in two patient records, the matcher
awards maximum log odds points for that field if the two fields are
exactly the same and 0 points otherwise.

## Related Issues
#201
  • Loading branch information
bamader authored Feb 11, 2025
1 parent ba4c2ae commit 2e228e2
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 1 deletion.
10 changes: 10 additions & 0 deletions docs/site/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,16 @@ existing Patient with the GIVEN_NAME of ["John", "D"].
Use the `kwargs` parameter to specify the desired algorithm and thresholds.
Example: `{"kwargs": {"similarity_measure": "levenshtein", "thresholds": {"FIRST_NAME": 0.8}}}`

`func:recordlinker.linking.matchers.compare_probabilistic_exact_match`

: Determines if a Feature Field has the same value in two different patient records. If the two fields agree
exactly (i.e. are exactly the same), then the function returns the full extent of the log-odds weights for
the particular field with which it was called. If the two fields do not exactly agree, the function returns
0.0. This is useful when performing probabilistic comparisons (which score a possible match's strength by
accumulating a sum of link weights) on fields for which fuzzy similarity doesn't make sense, such as fields
defined by an enum (e.g. Sex). Use the `kwargs` parameter to specify the log-odds weight of each field, as derived from training.
Example: `{"kwargs": {"log_odds": {"SEX": 6.8}}}`

`func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match`

: Similar to `compare_fuzzy_match`, but uses a log-odds ratio to determine if the features are a match
Expand Down
41 changes: 40 additions & 1 deletion src/recordlinker/linking/matchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ class FeatureFunc(enum.Enum):
COMPARE_MATCH_ANY = "func:recordlinker.linking.matchers.compare_match_any"
COMPARE_MATCH_ALL = "func:recordlinker.linking.matchers.compare_match_all"
COMPARE_FUZZY_MATCH = "func:recordlinker.linking.matchers.compare_fuzzy_match"
COMPARE_PROBABILISTIC_EXACT_MATCH = (
"func:recordlinker.linking.matchers.compare_probabilistic_exact_match"
)
COMPARE_PROBABILISTIC_FUZZY_MATCH = (
"func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match"
)
Expand Down Expand Up @@ -189,11 +192,47 @@ def compare_fuzzy_match(
return 0


def compare_probabilistic_exact_match(
    record: PIIRecord, patient: Patient, key: Feature, **kwargs: typing.Any
) -> float:
    """
    Compare the same Feature Field in two patient records, one incoming and one
    previously seen, to determine whether the fields fully agree.
    If any value of the field in the existing patient equals any value of the
    field in the incoming record, the full log-odds weight-points for this
    field are returned. Otherwise, 0.0 is returned.

    :param record: The incoming record to evaluate.
    :param patient: The patient record to compare against.
    :param key: The Feature being evaluated (e.g. "city").
    :param **kwargs: Must include a "log_odds" dictionary keyed by the string
      form of the feature attribute, giving the log-odds weight of each field
      (e.g. {"log_odds": {"SEX": 6.8}}).
    :raises ValueError: If no log-odds weight is supplied for the feature.
    :return: A float of the score the feature comparison earned.
    """
    # `or {}` makes an explicit log_odds=None behave like a missing kwarg, so
    # the caller gets the ValueError below rather than an AttributeError.
    log_odds = (kwargs.get("log_odds") or {}).get(str(key.attribute))
    if log_odds is None:
        raise ValueError(f"Log odds not found for feature {key}")

    # any() stops at the first agreeing pair; a bare `break` would only leave
    # the inner loop and keep scanning the remaining patient values.
    matched = any(
        x == y
        for x in patient.record.feature_iter(key)
        for y in record.feature_iter(key)
    )
    # Multiply by a float agreement flag so the result is a float even when the
    # configured weight is an int.
    agree = 1.0 if matched else 0.0
    return agree * log_odds


def compare_probabilistic_fuzzy_match(
record: PIIRecord, patient: Patient, key: Feature, **kwargs: typing.Any
) -> float:
"""
...
Compare the same Feature Field in two patient records, one incoming and one
previously seen, to determine the extent to which the fields agree.
If their string similarity score (agreement) is above a minimum threshold
specified as a kwarg, that proportion of the Field's maximum log-odds
weight points are added to the record match strength. Otherwise, no points
are added.
:param record: The incoming record to evaluate.
:param patient: The patient record to compare against.
Expand Down
47 changes: 47 additions & 0 deletions tests/unit/linking/test_matchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,53 @@ def test_compare_fuzzy_match():
matchers.compare_fuzzy_match(record, pat1, schemas.Feature(attribute="first_name"))


def test_compare_probabilistic_exact_match():
    """
    Check that the exact-match scorer raises without log-odds and otherwise
    returns either the field's full weight or 0.0.
    """
    # No log_odds kwarg is supplied, so the call is expected to raise.
    with pytest.raises(ValueError):
        matchers.compare_probabilistic_exact_match(
            schemas.PIIRecord(),
            models.Patient(),
            schemas.Feature(attribute=schemas.FeatureAttribute.SEX),
        )

    incoming = schemas.PIIRecord(
        name=[{"given": ["John", "T"], "family": "Shepard"}],
        birthDate="1980-11-7",
    )
    existing = models.Patient(
        data={
            "name": [{"given": ["John"], "family": "Shepard"}],
            "birthDate": "1970-06-07",
        }
    )
    attrs = schemas.FeatureAttribute
    weights = {
        attrs.FIRST_NAME.value: 4.0,
        attrs.LAST_NAME.value: 6.5,
        attrs.BIRTHDATE.value: 9.8,
        attrs.ADDRESS.value: 3.7,
    }

    # (attribute, expected score): both records share the given name "John"
    # and the family name "Shepard", while their birth dates differ.
    expected_scores = (
        (attrs.FIRST_NAME, 4.0),
        (attrs.LAST_NAME, 6.5),
        (attrs.BIRTHDATE, 0.0),
    )
    for attribute, expected in expected_scores:
        score = matchers.compare_probabilistic_exact_match(
            incoming, existing, schemas.Feature(attribute=attribute), log_odds=weights
        )
        assert score == expected


def test_compare_probabilistic_fuzzy_match():
with pytest.raises(ValueError):
matchers.compare_probabilistic_fuzzy_match(
Expand Down

0 comments on commit 2e228e2

Please sign in to comment.