Merge branch 'main' into feature/195-a11y-doc-workflow

CDCgov · Feb 13, 2025 · 79836ec · 79836ec
2 parents 55e7cac + c7aa14d
commit 79836ec
Show file tree

Hide file tree

Showing 21 changed files with 308 additions and 902 deletions.
diff --git a/.github/workflows/check_smoke_tests.yml b/.github/workflows/check_smoke_tests.yml
@@ -81,7 +81,7 @@ jobs:
       
         # Run smoke tests and print the response
         JSON_BODY_1='{"record": {"birth_date": "2053-11-07", "sex": "M", "identifiers":[{"value": "123456789", "type": "MR"}], "name":[{"family":"Shepard", "given":["John"]}]}}'
-        JSON_BODY_2='{"algorithm": "dibbs-enhanced", "record": {"birth_date": "2000-12-06", "sex": "M", "identifiers":[{"value": "9876543210", "type": "MR"}], "name":[{"family":"Smith", "given":["William"]}]}}'
+        JSON_BODY_2='{"algorithm": "dibbs-default", "record": {"birth_date": "2000-12-06", "sex": "M", "identifiers":[{"value": "9876543210", "type": "MR"}], "name":[{"family":"Smith", "given":["William"]}]}}'
 
         #basic tests
         RESPONSE_1=$(curl -s -X POST http://localhost:8080/link \

diff --git a/README.md b/README.md
@@ -39,6 +39,14 @@ To run the API locally, use the following command:
 
 The API will be available at `http://localhost:8000`. Visit `http://localhost:8000/redoc` to view the API documentation.
 
+## Testing
+
+The RecordLinker system comes with a number of built-in tests spread across several different types. Some of these tests are run automatically (e.g. by GitHub), while others must be manually executed by a developer.
+
+- `tests/unit`: These comprise basic unit (and in some cases integration) tests providing code coverage to RecordLinker. These tests demonstrate the functionality of different parts of the code base under different logical conditions and with different inputs and outputs. They are automatically executed by a GitHub Actions workflow as part of a PR.
+- `tests/algorithm`: This is a set of scripts developed to test an algorithm configuration with a known set of particular edge cases. In response to frequent questions of how the DIBBs algorithm handles case X, this mini-project was created to help answer those questions by giving developers some persistent evaluation tools. These tests are _not_ automated, and developers will need to go through the steps in the README in the relevant directory in order to run them.
+- `tests/performance`: Another set of scripts developed to see how fast the API can process linkage requests using synthetic data. This is useful for verifying refactors are still performant and helping developers identify bottlenecks along the way. These tests are _not_ automated, and developers need to go through the steps in the README of the relevant directory in order to run them.
+
 ### Running unit tests
 
 To run all the unit tests, use the following command:

diff --git a/docs/api_template.hbs b/docs/api_template.hbs
@@ -0,0 +1,56 @@
+{{!-- This is the default template for the ReDoc API documentation. --}}
+{{!-- You can customize it by following the instructions at https://redocly.com/docs/cli/commands/build-docs --}}
+{{!-- The template uses Handlebars.js syntax: https://handlebarsjs.com/ --}}
+{{!-- Styling and attribute changes have been made to account for a11y issues --}}
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+  <meta charset="utf-8" />
+  <title>{{title}}</title>
+  <!-- needed for adaptive design -->
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <style>
+    body {
+      padding: 0;
+      margin: 0;
+    }
+    /* unselected status code tabs with dark backgrounds should have a lighter color */
+    .cJteCP > ul > li.tab-success:not(.react-tabs__tab--selected),
+    .cJteCP > ul > li.tab-error:not(.react-tabs__tab--selected) {
+        color: rgb(204, 204, 204) !important;
+    }
+    /* body/parameter titles should have dark font color */
+    .hNlDMA {
+        color: rgb(51, 51, 51) !important;
+    }
+    /* deprecated warnings should have dark font color */
+    .cMhXxZ {
+        color: rgb(51, 51, 51) !important;
+    }
+    /* description code blocks should have dark font color */
+    .hzUya code {
+        color: rgb(51, 51, 51) !important;
+    }
+  </style>
+  {{{redocHead}}}
+  {{#unless disableGoogleFont}}<link href="https://fonts.googleapis.com/css?family=Montserrat:300,400,700|Roboto:300,400,700" rel="stylesheet">{{/unless}}
+</head>
+
+<body>
+  {{{redocHTML}}}
+</body>
+
+<script>
+    window.onload = () => {
+        document.querySelectorAll('.sc-eeDRCY.gvJSKt.sc-fXSgeo.jJRLaa').forEach(element => {
+            /* Add a tabindex to scrollable divs */
+            element.setAttribute('tabindex', '0');
+        });
+        document.querySelectorAll('.sc-kYxDKI.eoKbCJ').forEach(element => {
+            /* Add role="menu" to the ul element with child menuitems */
+            element.setAttribute('role', 'menu');
+        });
+    };
+</script>
+</html>
diff --git a/docs/site/reference.md b/docs/site/reference.md
@@ -127,11 +127,6 @@ patient data and used during query retrieval. The following blocking key types a
 These are the functions that can be used to evaluate the matching results as a collection, thus
 determining if the incoming payload is a match or not to an existing Patient record.
 
-`func:recordlinker.linking.matchers.rule_match`
-
-:   Determines whether a given set of feature comparisons represent a 'perfect' match
-    (i.e. all features that were compared match in whatever criteria was specified).
-
 `func:recordlinker.linking.matchers.rule_probabilistic_match`
 
 :   Determines whether a given set of feature comparisons matches enough to be the
@@ -154,21 +149,6 @@ that to an existing Patient with the ADDRESS of
 [{"address": ["123 Main Street"], "city": "Springfield", "state": "IL"}, {"address": ["456 Elm St"], "state": "IL"}].
 In that case we'd want to evaluate "123 Main St" against both "123 Main Street" and "456 Elm St".
 
-`func:recordlinker.linking.matchers.compare_match_any`
-
-:   Determines if any of the features are a direct match.
-
-`func:recordlinker.linking.matchers.compare_match_all`
-
-:   Determines if all of the features are a direct match.
-
-`func:recordlinker.linking.matchers.compare_fuzzy_match`
-
-:   Determines if the features are a fuzzy match based on a string comparison.
-    JaroWinkler, Levenshtein and Damerau-Levenshtein are supported, with JaroWinkler as the default.
-    Use the `kwargs` parameter to specify the desired algorithm and thresholds.
-    Example: `{"kwargs": {"similarity_measure": "levenshtein", "thresholds": {"FIRST_NAME": 0.8}}}`
-
 `func:recordlinker.linking.matchers.compare_probabilistic_exact_match`
 
 :   Determines if a Feature Field has the same value in two different patient records. If the two fields agree

diff --git a/scripts/build_docs.sh b/scripts/build_docs.sh
@@ -14,5 +14,5 @@ VERSION=${VERSION:-$(python -c "from recordlinker._version import __version__; p
 SITE_NAME="RecordLinker Documentation (${VERSION})"
 
 SITE_NAME=${SITE_NAME} mkdocs build --config-file docs/mkdocs.yml -d "../${OUT}"
-python -m recordlinker.utils.openapi_schema > ${OUT}/openapi.json
-npx @redocly/cli build-docs -o "${OUT}/api-docs.html" "${OUT}/openapi.json"
+python -m recordlinker.utils.openapi_schema > "${OUT}/openapi.json"
+npx @redocly/cli build-docs -t docs/api_template.hbs -o "${OUT}/api-docs.html" "${OUT}/openapi.json"
diff --git a/src/recordlinker/assets/initial_algorithms.json b/src/recordlinker/assets/initial_algorithms.json
@@ -1,76 +1,13 @@
 [
     {
-        "label": "dibbs-basic",
-        "description": "The DIBBs Default Algorithm. Based on field experimentation and statistical analysis, this deterministic two-pass algorithm combines geographical and personal information to maximize linkage quality while minimizing false positives",
+        "label": "dibbs-default",
+        "description": "The core DIBBs Log-Odds Algorithm. This default, recommended algorithm uses statistical correction to adjust the links between incoming records and previously processed patients (it does so by taking advantage of the fact that some fields are more informative than others—e.g., two records matching on MRN is stronger evidence that they should be linked than if the records matched on zip code). It can be used if additional granularity in matching links is desired. However, while the DIBBs Log-Odds Algorithm can create higher-quality links, it is dependent on statistical updating and pre-calculated population analysis, which requires some work on the part of the user. For those cases where additional precision or stronger matching criteria are required, the Log-Odds algorithm is detailed below.",
         "is_default": true,
         "include_multiple_matches": true,
-        "belongingness_ratio": [0.75, 0.9],
-        "passes": [
-            {
-                "blocking_keys": [
-                    "BIRTHDATE",
-                    "IDENTIFIER",  
-                    "SEX"
-                ],
-                "evaluators": [
-                    {
-                        "feature": "FIRST_NAME",
-                        "func": "func:recordlinker.linking.matchers.compare_fuzzy_match"
-                    },
-                    {
-                        "feature": "LAST_NAME",
-                        "func": "func:recordlinker.linking.matchers.compare_match_all"
-                    }
-                ],
-                "rule": "func:recordlinker.linking.matchers.rule_match",
-                "kwargs": {
-                    "thresholds": {
-                        "FIRST_NAME": 0.9,
-                        "LAST_NAME": 0.9,
-                        "BIRTHDATE": 0.95,
-                        "ADDRESS": 0.9,
-                        "CITY": 0.92,
-                        "ZIP": 0.95
-                    }
-                }
-            },
-            {
-                "blocking_keys": [
-                    "ZIP",
-                    "FIRST_NAME",
-                    "LAST_NAME",
-                    "SEX"
-                ],
-                "evaluators": [
-                    {
-                        "feature": "ADDRESS",
-                        "func": "func:recordlinker.linking.matchers.compare_fuzzy_match"
-                    },
-                    {
-                        "feature": "BIRTHDATE",
-                        "func": "func:recordlinker.linking.matchers.compare_match_all"
-                    }
-                ],
-                "rule": "func:recordlinker.linking.matchers.rule_match",
-                "kwargs": {
-                    "thresholds": {
-                        "FIRST_NAME": 0.9,
-                        "LAST_NAME": 0.9,
-                        "BIRTHDATE": 0.95,
-                        "ADDRESS": 0.9,
-                        "CITY": 0.92,
-                        "ZIP": 0.95
-                    }
-                }
-            }
-        ]
-    },
-    {
-        "label": "dibbs-enhanced",
-        "description": "The DIBBs Log-Odds Algorithm. This optional algorithm uses statistical correction to adjust the links between incoming records and previously processed patients (it does so by taking advantage of the fact that some fields are more informative than others—e.g., two records matching on MRN is stronger evidence that they should be linked than if the records matched on zip code). It can be used if additional granularity in matching links is desired. However, while the DIBBs Log-Odds Algorithm can create higher-quality links, it is dependent on statistical updating and pre-calculated population analysis, which requires some work on the part of the user. For those cases where additional precision or stronger matching criteria are required, the Log-Odds algorithm is detailed below.",
-        "is_default": false,
-        "include_multiple_matches": true,
-        "belongingness_ratio": [0.75, 0.9],
+        "belongingness_ratio": [
+            0.75,
+            0.9
+        ],
         "passes": [
             {
                 "blocking_keys": [
@@ -157,4 +94,4 @@
             }
         ]
     }
-]
+]
diff --git a/src/recordlinker/linking/matchers.py b/src/recordlinker/linking/matchers.py
@@ -30,7 +30,6 @@ class RuleFunc(enum.Enum):
     the algorithm.
     """
 
-    RULE_MATCH = "func:recordlinker.linking.matchers.rule_match"
     RULE_PROBABILISTIC_MATCH = "func:recordlinker.linking.matchers.rule_probabilistic_match"
 
 
@@ -44,9 +43,6 @@ class FeatureFunc(enum.Enum):
     matching, based on the configuration of the algorithm.
     """
 
-    COMPARE_MATCH_ANY = "func:recordlinker.linking.matchers.compare_match_any"
-    COMPARE_MATCH_ALL = "func:recordlinker.linking.matchers.compare_match_all"
-    COMPARE_FUZZY_MATCH = "func:recordlinker.linking.matchers.compare_fuzzy_match"
     COMPARE_PROBABILISTIC_EXACT_MATCH = (
         "func:recordlinker.linking.matchers.compare_probabilistic_exact_match"
     )
@@ -102,19 +98,6 @@ def _get_fuzzy_params(col: str, **kwargs) -> tuple[SIMILARITY_MEASURES, float]:
     return (similarity_measure, threshold)
 
 
-def rule_match(feature_comparisons: list[float], **kwargs: typing.Any) -> bool:
-    """
-    Determines whether a given set of feature comparisons represent a
-    'perfect' match (i.e. whether all features that were compared match
-    in whatever criteria was specified for them).
-
-    :param feature_comparisons: A list of 1s and 0s, one for each feature
-      that was compared during the match algorithm.
-    :return: The evaluation of whether the given features all match.
-    """
-    return sum(feature_comparisons) == len(feature_comparisons)
-
-
 def rule_probabilistic_match(feature_comparisons: list[float], **kwargs: typing.Any) -> bool:
     """
     Determines whether a given set of feature comparisons matches enough
@@ -132,66 +115,6 @@ def rule_probabilistic_match(feature_comparisons: list[float], **kwargs: typing.
     return sum(feature_comparisons) >= float(threshold)
 
 
-def compare_match_any(
-    record: PIIRecord, patient: Patient, key: Feature, **kwargs: typing.Any
-) -> float:
-    """
-    ...
-
-    :param record: The incoming record to evaluate.
-    :param patient: The patient record to compare against.
-    :param key: The name of the column being evaluated (e.g. "city").
-    :return: A float indicating whether any of the features are an exact match.
-    """
-    rec_values = set(record.feature_iter(key))
-    if not rec_values:
-        return 0
-    pat_values = set(patient.record.feature_iter(key))
-    return float(bool(rec_values & pat_values))
-
-
-def compare_match_all(
-    record: PIIRecord, patient: Patient, key: Feature, **kwargs: typing.Any
-) -> float:
-    """
-    ...
-
-    :param record: The incoming record to evaluate.
-    :param patient: The patient record to compare against.
-    :param key: The name of the column being evaluated (e.g. "city").
-    :return: A float indicating whether all of the features are an exact match.
-    """
-    rec_values = set(record.feature_iter(key))
-    if not rec_values:
-        return 0
-    pat_values = set(patient.record.feature_iter(key))
-    return float(rec_values == pat_values)
-
-
-def compare_fuzzy_match(
-    record: PIIRecord, patient: Patient, key: Feature, **kwargs: typing.Any
-) -> float:
-    """
-    ...
-
-    :param record: The incoming record to evaluate.
-    :param patient: The patient record to compare against.
-    :param key: The name of the column being evaluated (e.g. "city").
-    :param **kwargs: Optionally, a dictionary including specifications for
-      the string comparison metric to use, as well as the cutoff score
-      beyond which to classify the strings as a partial match.
-    :return: A float indicating whether the features are a fuzzy match.
-    """
-    similarity_measure, threshold = _get_fuzzy_params(str(key.attribute), **kwargs)
-    comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity
-    for x in record.feature_iter(key):
-        for y in patient.record.feature_iter(key):
-            score = comp_func(x, y)
-            if score >= threshold:
-                return 1
-    return 0
-
-
 def compare_probabilistic_exact_match(
     record: PIIRecord, patient: Patient, key: Feature, **kwargs: typing.Any
 ) -> float:

diff --git a/tests/algorithm/README.md b/tests/algorithm/README.md
@@ -92,7 +92,7 @@ The following environment variables can be tuned in the `algo-test.env` file:
 - `SEED_FILE`: The file containing person data to seed the mpi with
 - `TEST_FILE`: The file containing patient data to test the algorithm with
 - `ALGORITHM_CONFIGURATION`: The file containing the algorithm configuration json
-- `ALGORITHM_NAME`: The name of the algorithm to use (either the name of your `ALGORITHM_CONFIGURATION` or can be the built in `dibbs-basic` or `dibbs-enhanced` algorithms)
+- `ALGORITHM_NAME`: The name of the algorithm to use (either the name of your `ALGORITHM_CONFIGURATION` or the built-in `dibbs-default` algorithm)
 
 
 ## Cleanup