From 2e228e2511132fe696be490b52456abb40cf3d57 Mon Sep 17 00:00:00 2001
From: bamader <49412165+bamader@users.noreply.github.com>
Date: Tue, 11 Feb 2025 08:20:46 -0500
Subject: [PATCH] Create function for probabilistic exact match (#209)

## Description
This PR adds a matcher function for evaluating a probabilistic exact
match. For a given feature field in two patient records, the matcher
awards maximum log odds points for that field if the two fields are
exactly the same and 0 points otherwise.

## Related Issues
#201
---
 docs/site/reference.md               | 10 ++++++
 src/recordlinker/linking/matchers.py | 41 +++++++++++++++++++++++-
 tests/unit/linking/test_matchers.py  | 47 ++++++++++++++++++++++++++++
 3 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/docs/site/reference.md b/docs/site/reference.md
index b45ce7b7..6f24e185 100644
--- a/docs/site/reference.md
+++ b/docs/site/reference.md
@@ -166,6 +166,16 @@ existing Patient with the GIVEN_NAME of ["John", "D"].
     Use the `kwargs` parameter to specify the desired algorithm and thresholds.
     Example: `{"kwargs": {"similarity_measure": "levenshtein", "thresholds": {"FIRST_NAME": 0.8}}}`
 
+`func:recordlinker.linking.matchers.compare_probabilistic_exact_match`
+
+:   Determines if a Feature Field has the same value in two different patient records. If the two fields agree
+    exactly (i.e. are exactly the same), then the function returns the full extent of the log-odds weights for 
+    the particular field with which it was called. If the two fields do not exactly agree, the function returns
+    0.0. This is useful when performing probabilistic comparisons (which score a possible match's strength by
+    accumulating a sum of link weights) on fields for which fuzzy similarity doesn't make sense, such as fields
+    defined by an enum (e.g. Sex). Use the `kwargs` parameter to specify the log-odds weights obtained from training.
+    Example: `{"kwargs": {"log_odds": {"SEX": 6.8}}}`
+
 `func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match`
 
 :   Similar to the above function, but uses a log-odds ratio to determine if the features are a match 
diff --git a/src/recordlinker/linking/matchers.py b/src/recordlinker/linking/matchers.py
index 1fc6b945..5daff2e7 100644
--- a/src/recordlinker/linking/matchers.py
+++ b/src/recordlinker/linking/matchers.py
@@ -47,6 +47,9 @@ class FeatureFunc(enum.Enum):
     COMPARE_MATCH_ANY = "func:recordlinker.linking.matchers.compare_match_any"
     COMPARE_MATCH_ALL = "func:recordlinker.linking.matchers.compare_match_all"
     COMPARE_FUZZY_MATCH = "func:recordlinker.linking.matchers.compare_fuzzy_match"
+    COMPARE_PROBABILISTIC_EXACT_MATCH = (
+        "func:recordlinker.linking.matchers.compare_probabilistic_exact_match"
+    )
     COMPARE_PROBABILISTIC_FUZZY_MATCH = (
         "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match"
     )
@@ -189,11 +192,47 @@ def compare_fuzzy_match(
     return 0
 
 
+def compare_probabilistic_exact_match(
+    record: PIIRecord, patient: Patient, key: Feature, **kwargs: typing.Any
+) -> float:
+    """
+    Compare the same Feature Field in two patient records, one incoming and one
+    previously seen, to determine whether the fields fully agree.
+    If they do, the full log-odds weight-points for this field are added to the
+    record pair's match strength. Otherwise, no points are added.
+
+    :param record: The incoming record to evaluate.
+    :param patient: The patient record to compare against.
+    :param key: The name of the column being evaluated (e.g. "city").
+    :param **kwargs: Must include a `log_odds` dictionary mapping the
+      string form of each feature attribute to its log-odds weight; a
+      ValueError is raised if no weight is found for `key`.
+    :return: A float of the score the feature comparison earned.
+    """
+    log_odds = kwargs.get("log_odds", {}).get(str(key.attribute))
+    if log_odds is None:
+        raise ValueError(f"Log odds not found for feature {key}")
+
+    agree = 0.0
+    for x in patient.record.feature_iter(key):
+        for y in record.feature_iter(key):
+            # for each pair of values across the two records, check whether they agree
+            if (x == y):
+                agree = 1.0
+                break
+    return agree * log_odds
+
+
 def compare_probabilistic_fuzzy_match(
     record: PIIRecord, patient: Patient, key: Feature, **kwargs: typing.Any
 ) -> float:
     """
-    ...
+    Compare the same Feature Field in two patient records, one incoming and one
+    previously seen, to determine the extent to which the fields agree.
+    If their string similarity score (agreement) is above a minimum threshold
+    specified as a kwarg, that proportion of the Field's maximum log-odds
+    weight points are added to the record match strength. Otherwise, no points
+    are added.
 
     :param record: The incoming record to evaluate.
     :param patient: The patient record to compare against.
diff --git a/tests/unit/linking/test_matchers.py b/tests/unit/linking/test_matchers.py
index d421f391..ba1ada4c 100644
--- a/tests/unit/linking/test_matchers.py
+++ b/tests/unit/linking/test_matchers.py
@@ -148,6 +148,53 @@ def test_compare_fuzzy_match():
         matchers.compare_fuzzy_match(record, pat1, schemas.Feature(attribute="first_name"))
 
 
+def test_compare_probabilistic_exact_match():
+    with pytest.raises(ValueError):
+        matchers.compare_probabilistic_exact_match(
+            schemas.PIIRecord(),
+            models.Patient(),
+            schemas.Feature(attribute=schemas.FeatureAttribute.SEX),
+        )
+    
+    rec = schemas.PIIRecord(
+        name=[{"given": ["John", "T"], "family": "Shepard"}],
+        birthDate="1980-11-7",
+    )
+    pat = models.Patient(
+        data={
+            "name": [{"given": ["John"], "family": "Shepard"}],
+            "birthDate": "1970-06-07",
+        }
+    )
+    log_odds = {
+        schemas.FeatureAttribute.FIRST_NAME.value: 4.0,
+        schemas.FeatureAttribute.LAST_NAME.value: 6.5,
+        schemas.FeatureAttribute.BIRTHDATE.value: 9.8,
+        schemas.FeatureAttribute.ADDRESS.value: 3.7,
+    }
+
+    assert (
+        matchers.compare_probabilistic_exact_match(
+            rec, pat, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME), log_odds=log_odds
+        )
+        == 4.0
+    )
+
+    assert (
+        matchers.compare_probabilistic_exact_match(
+            rec, pat, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME), log_odds=log_odds
+        )
+        == 6.5
+    )
+
+    assert (
+        matchers.compare_probabilistic_exact_match(
+            rec, pat, schemas.Feature(attribute=schemas.FeatureAttribute.BIRTHDATE), log_odds=log_odds
+        )
+        == 0.0
+    )
+
+
 def test_compare_probabilistic_fuzzy_match():
     with pytest.raises(ValueError):
         matchers.compare_probabilistic_fuzzy_match(