From 2e228e2511132fe696be490b52456abb40cf3d57 Mon Sep 17 00:00:00 2001 From: bamader <49412165+bamader@users.noreply.github.com> Date: Tue, 11 Feb 2025 08:20:46 -0500 Subject: [PATCH] Create function for probabilistic exact match (#209) ## Description This PR adds a matcher function for evaluating a probabilistic exact match. For a given feature field in two patient records, the matcher awards maximum log odds points for that field if the two fields are exactly the same and 0 points otherwise. ## Related Issues #201 --- docs/site/reference.md | 10 ++++++ src/recordlinker/linking/matchers.py | 41 +++++++++++++++++++++++- tests/unit/linking/test_matchers.py | 47 ++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 1 deletion(-) diff --git a/docs/site/reference.md b/docs/site/reference.md index b45ce7b7..6f24e185 100644 --- a/docs/site/reference.md +++ b/docs/site/reference.md @@ -166,6 +166,16 @@ existing Patient with the GIVEN_NAME of ["John", "D"]. Use the `kwargs` parameter to specify the desired algorithm and thresholds. Example: `{"kwargs": {"similarity_measure": "levenshtein", "thresholds": {"FIRST_NAME": 0.8}}}` +`func:recordlinker.linking.matchers.compare_probabilistic_exact_match` + +: Determines if a Feature Field has the same value in two different patient records. If the two fields agree + exactly (i.e. are exactly the same), then the function returns the full extent of the log-odds weights for + the particular field with which it was called. If the two fields do not exactly agree, the function returns + 0.0. This is useful when performing probabilistic comparisons (which score a possible match's strength by + accumulating a sum of link weights) on fields for which fuzzy similarity doesn't make sense, such as fields + defined by an enum (e.g. Sex). Use the kwargs parameter to specify the log-odds ratios based on training. 
+ Example: `{"kwargs": {"log_odds": {"SEX": 6.8}}}` + `func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match` : Similar to the above function, but uses a log-odds ratio to determine if the features are a match diff --git a/src/recordlinker/linking/matchers.py b/src/recordlinker/linking/matchers.py index 1fc6b945..5daff2e7 100644 --- a/src/recordlinker/linking/matchers.py +++ b/src/recordlinker/linking/matchers.py @@ -47,6 +47,9 @@ class FeatureFunc(enum.Enum): COMPARE_MATCH_ANY = "func:recordlinker.linking.matchers.compare_match_any" COMPARE_MATCH_ALL = "func:recordlinker.linking.matchers.compare_match_all" COMPARE_FUZZY_MATCH = "func:recordlinker.linking.matchers.compare_fuzzy_match" + COMPARE_PROBABILISTIC_EXACT_MATCH = ( + "func:recordlinker.linking.matchers.compare_probabilistic_exact_match" + ) COMPARE_PROBABILISTIC_FUZZY_MATCH = ( "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match" ) @@ -189,11 +192,47 @@ def compare_fuzzy_match( return 0 +def compare_probabilistic_exact_match( + record: PIIRecord, patient: Patient, key: Feature, **kwargs: typing.Any +) -> float: + """ + Compare the same Feature Field in two patient records, one incoming and one + previously seen, to determine whether the fields fully agree. + If they do, the full log-odds weight-points for this field are added to the + record pair's match strength. Otherwise, no points are added. + + :param record: The incoming record to evaluate. + :param patient: The patient record to compare against. + :param key: The name of the column being evaluated (e.g. "city"). + :param **kwargs: Must include a `log_odds` dictionary mapping each feature + attribute name to its log-odds weight; a ValueError is raised if no + weight is found for the feature being evaluated. + :return: A float of the score the feature comparison earned. 
+ """ + log_odds = kwargs.get("log_odds", {}).get(str(key.attribute)) + if log_odds is None: + raise ValueError(f"Log odds not found for feature {key}") + + agree = 0.0 + for x in patient.record.feature_iter(key): + for y in record.feature_iter(key): + # for each pair of values (one from each record), check whether the values agree + if (x == y): + agree = 1.0 + break + return agree * log_odds + + def compare_probabilistic_fuzzy_match( record: PIIRecord, patient: Patient, key: Feature, **kwargs: typing.Any ) -> float: """ - ... + Compare the same Feature Field in two patient records, one incoming and one + previously seen, to determine the extent to which the fields agree. + If their string similarity score (agreement) is above a minimum threshold + specified as a kwarg, that proportion of the Field's maximum log-odds + weight points are added to the record match strength. Otherwise, no points + are added. :param record: The incoming record to evaluate. :param patient: The patient record to compare against. 
diff --git a/tests/unit/linking/test_matchers.py b/tests/unit/linking/test_matchers.py index d421f391..ba1ada4c 100644 --- a/tests/unit/linking/test_matchers.py +++ b/tests/unit/linking/test_matchers.py @@ -148,6 +148,53 @@ def test_compare_fuzzy_match(): matchers.compare_fuzzy_match(record, pat1, schemas.Feature(attribute="first_name")) +def test_compare_probabilistic_exact_match(): + with pytest.raises(ValueError): + matchers.compare_probabilistic_exact_match( + schemas.PIIRecord(), + models.Patient(), + schemas.Feature(attribute=schemas.FeatureAttribute.SEX), + ) + + rec = schemas.PIIRecord( + name=[{"given": ["John", "T"], "family": "Shepard"}], + birthDate="1980-11-7", + ) + pat = models.Patient( + data={ + "name": [{"given": ["John"], "family": "Shepard"}], + "birthDate": "1970-06-07", + } + ) + log_odds = { + schemas.FeatureAttribute.FIRST_NAME.value: 4.0, + schemas.FeatureAttribute.LAST_NAME.value: 6.5, + schemas.FeatureAttribute.BIRTHDATE.value: 9.8, + schemas.FeatureAttribute.ADDRESS.value: 3.7, + } + + assert ( + matchers.compare_probabilistic_exact_match( + rec, pat, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME), log_odds=log_odds + ) + == 4.0 + ) + + assert ( + matchers.compare_probabilistic_exact_match( + rec, pat, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME), log_odds=log_odds + ) + == 6.5 + ) + + assert ( + matchers.compare_probabilistic_exact_match( + rec, pat, schemas.Feature(attribute=schemas.FeatureAttribute.BIRTHDATE), log_odds=log_odds + ) + == 0.0 + ) + + def test_compare_probabilistic_fuzzy_match(): with pytest.raises(ValueError): matchers.compare_probabilistic_fuzzy_match(